summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata/src/hybrid/id.rs
blob: a6fcde52e63e90f6db6be55c21678b277d1cddb6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
/// A state identifier especially tailored for lazy DFAs.
///
/// A lazy state ID logically represents a pointer to a DFA state. In practice,
/// by limiting the number of DFA states it can address, it reserves some
/// bits of its representation to encode some additional information. That
/// additional information is called a "tag." That tag is used to record
/// whether the state it points to is an unknown, dead, quit, start or match
/// state.
///
/// When implementing a low level search routine with a lazy DFA, it is
/// necessary to query the type of the current state to know what to do:
///
/// * **Unknown** - The state has not yet been computed. The
/// parameters used to get this state ID must be re-passed to
/// [`DFA::next_state`](crate::hybrid::dfa::DFA), which will never return an
/// unknown state ID.
/// * **Dead** - A dead state only has transitions to itself. It indicates that
/// the search cannot do anything else and should stop with whatever result it
/// has.
/// * **Quit** - A quit state indicates that the automaton could not answer
/// whether a match exists or not. Correct search implementations must return a
/// [`MatchError::Quit`](crate::MatchError::Quit).
/// * **Start** - A start state indicates that the automaton will begin
/// searching at a starting state. Branching on this isn't required for
/// correctness, but a common optimization is to use this to more quickly look
/// for a prefix.
/// * **Match** - A match state indicates that a match has been found.
/// Depending on the semantics of your search implementation, it may either
/// continue until the end of the haystack or a dead state, or it might quit
/// and return the match immediately.
///
/// As an optimization, the [`is_tagged`](LazyStateID::is_tagged) predicate
/// can be used to determine if a tag exists at all. This is useful to avoid
/// branching on all of the above types for every byte searched.
///
/// # Example
///
/// This example shows how `LazyStateID` can be used to implement a correct
/// search routine with minimal branching. In particular, this search routine
/// implements "leftmost" matching, which means that it doesn't immediately
/// stop once a match is found. Instead, it continues until it reaches a dead
/// state.
///
/// Notice also how a correct search implementation deals with
/// [`CacheError`](crate::hybrid::CacheError)s returned by some of
/// the lazy DFA routines. When a `CacheError` occurs, it returns
/// [`MatchError::GaveUp`](crate::MatchError::GaveUp).
///
/// ```
/// use regex_automata::{
///     hybrid::dfa::{Cache, DFA},
///     HalfMatch, MatchError, PatternID,
/// };
///
/// fn find_leftmost_first(
///     dfa: &DFA,
///     cache: &mut Cache,
///     haystack: &[u8],
/// ) -> Result<Option<HalfMatch>, MatchError> {
///     // The start state is determined by inspecting the position and the
///     // initial bytes of the haystack. Note that start states can never
///     // be match states (since DFAs in this crate delay matches by 1
///     // byte), so we don't need to check if the start state is a match.
///     let mut sid = dfa.start_state_forward(
///         cache, None, haystack, 0, haystack.len(),
///     ).map_err(|_| MatchError::GaveUp { offset: 0 })?;
///     let mut last_match = None;
///     // Walk all the bytes in the haystack. We can quit early if we see
///     // a dead or a quit state. The former means the automaton will
///     // never transition to any other state. The latter means that the
///     // automaton entered a condition in which its search failed.
///     for (i, &b) in haystack.iter().enumerate() {
///         sid = dfa
///             .next_state(cache, sid, b)
///             .map_err(|_| MatchError::GaveUp { offset: i })?;
///         if sid.is_tagged() {
///             if sid.is_match() {
///                 last_match = Some(HalfMatch::new(
///                     dfa.match_pattern(cache, sid, 0),
///                     i,
///                 ));
///             } else if sid.is_dead() {
///                 return Ok(last_match);
///             } else if sid.is_quit() {
///                 // It is possible to enter into a quit state after
///                 // observing a match has occurred. In that case, we
///                 // should return the match instead of an error.
///                 if last_match.is_some() {
///                     return Ok(last_match);
///                 }
///                 return Err(MatchError::Quit { byte: b, offset: i });
///             }
///             // Implementors may also want to check for start states and
///             // handle them differently for performance reasons. But it is
///             // not necessary for correctness.
///         }
///     }
///     // Matches are always delayed by 1 byte, so we must explicitly walk
///     // the special "EOI" transition at the end of the search.
///     sid = dfa
///         .next_eoi_state(cache, sid)
///         .map_err(|_| MatchError::GaveUp { offset: haystack.len() })?;
///     if sid.is_match() {
///         last_match = Some(HalfMatch::new(
///             dfa.match_pattern(cache, sid, 0),
///             haystack.len(),
///         ));
///     }
///     Ok(last_match)
/// }
///
/// // We use a greedy '+' operator to show how the search doesn't just stop
/// // once a match is detected. It continues extending the match. Using
/// // '[a-z]+?' would also work as expected and stop the search early.
/// // Greediness is built into the automaton.
/// let dfa = DFA::new(r"[a-z]+")?;
/// let mut cache = dfa.create_cache();
/// let haystack = "123 foobar 4567".as_bytes();
/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 10);
///
/// // Here's another example that tests our handling of the special
/// // EOI transition. This will fail to find a match if we don't call
/// // 'next_eoi_state' at the end of the search since the match isn't found
/// // until the final byte in the haystack.
/// let dfa = DFA::new(r"[0-9]{4}")?;
/// let mut cache = dfa.create_cache();
/// let haystack = "123 foobar 4567".as_bytes();
/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 15);
///
/// // And note that our search implementation above automatically works
/// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects
/// // the appropriate pattern ID for us.
/// let dfa = DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
/// let mut cache = dfa.create_cache();
/// let haystack = "123 foobar 4567".as_bytes();
/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 1);
/// assert_eq!(mat.offset(), 3);
/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[3..])?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 7);
/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[10..])?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 1);
/// assert_eq!(mat.offset(), 5);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(
    Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
pub struct LazyStateID(u32);

impl LazyStateID {
    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
    const MAX_BIT: usize = 31;

    #[cfg(target_pointer_width = "16")]
    const MAX_BIT: usize = 15;

    const MASK_UNKNOWN: usize = 1 << (LazyStateID::MAX_BIT);
    const MASK_DEAD: usize = 1 << (LazyStateID::MAX_BIT - 1);
    const MASK_QUIT: usize = 1 << (LazyStateID::MAX_BIT - 2);
    const MASK_START: usize = 1 << (LazyStateID::MAX_BIT - 3);
    const MASK_MATCH: usize = 1 << (LazyStateID::MAX_BIT - 4);
    const MAX: usize = LazyStateID::MASK_MATCH - 1;

    /// Create a new lazy state ID.
    ///
    /// If the given identifier exceeds [`LazyStateID::MAX`], then this returns
    /// an error.
    #[inline]
    pub(crate) fn new(id: usize) -> Result<LazyStateID, LazyStateIDError> {
        if id > LazyStateID::MAX {
            return Err(LazyStateIDError { attempted: id as u64 });
        }
        Ok(LazyStateID::new_unchecked(id))
    }

    /// Create a new lazy state ID without checking whether the given value
    /// exceeds [`LazyStateID::MAX`].
    ///
    /// While this is unchecked, providing an incorrect value must never
    /// sacrifice memory safety.
    #[inline]
    const fn new_unchecked(id: usize) -> LazyStateID {
        LazyStateID(id as u32)
    }

    /// Return this lazy state ID as its raw value if and only if it is not
    /// tagged (and thus not an unknown, dead, quit, start or match state ID).
    #[inline]
    pub(crate) fn as_usize(&self) -> Option<usize> {
        if self.is_tagged() {
            None
        } else {
            Some(self.as_usize_unchecked())
        }
    }

    /// Return this lazy state ID as an untagged `usize`.
    ///
    /// If this lazy state ID is tagged, then the usize returned is the state
    /// ID without the tag. If the ID was not tagged, then the usize returned
    /// is equivalent to the state ID.
    #[inline]
    pub(crate) fn as_usize_untagged(&self) -> usize {
        self.as_usize_unchecked() & LazyStateID::MAX
    }

    /// Return this lazy state ID as its raw internal `usize` value, which may
    /// be tagged (and thus greater than LazyStateID::MAX).
    #[inline]
    pub(crate) const fn as_usize_unchecked(&self) -> usize {
        self.0 as usize
    }

    #[inline]
    pub(crate) const fn to_unknown(&self) -> LazyStateID {
        LazyStateID::new_unchecked(
            self.as_usize_unchecked() | LazyStateID::MASK_UNKNOWN,
        )
    }

    #[inline]
    pub(crate) const fn to_dead(&self) -> LazyStateID {
        LazyStateID::new_unchecked(
            self.as_usize_unchecked() | LazyStateID::MASK_DEAD,
        )
    }

    #[inline]
    pub(crate) const fn to_quit(&self) -> LazyStateID {
        LazyStateID::new_unchecked(
            self.as_usize_unchecked() | LazyStateID::MASK_QUIT,
        )
    }

    /// Return this lazy state ID as a state ID that is tagged as a start
    /// state.
    #[inline]
    pub(crate) const fn to_start(&self) -> LazyStateID {
        LazyStateID::new_unchecked(
            self.as_usize_unchecked() | LazyStateID::MASK_START,
        )
    }

    /// Return this lazy state ID as a lazy state ID that is tagged as a match
    /// state.
    #[inline]
    pub(crate) const fn to_match(&self) -> LazyStateID {
        LazyStateID::new_unchecked(
            self.as_usize_unchecked() | LazyStateID::MASK_MATCH,
        )
    }

    /// Return true if and only if this lazy state ID is tagged.
    ///
    /// When a lazy state ID is tagged, then one can conclude that it is one
    /// of a match, start, dead, quit or unknown state.
    #[inline]
    pub const fn is_tagged(&self) -> bool {
        self.as_usize_unchecked() > LazyStateID::MAX
    }

    /// Return true if and only if this represents a lazy state ID that is
    /// "unknown." That is, the state has not yet been created. When a caller
    /// sees this state ID, it generally means that a state has to be computed
    /// in order to proceed.
    #[inline]
    pub const fn is_unknown(&self) -> bool {
        self.as_usize_unchecked() & LazyStateID::MASK_UNKNOWN > 0
    }

    /// Return true if and only if this represents a dead state. A dead state
    /// is a state that can never transition to any other state except the
    /// dead state. When a dead state is seen, it generally indicates that a
    /// search should stop.
    #[inline]
    pub const fn is_dead(&self) -> bool {
        self.as_usize_unchecked() & LazyStateID::MASK_DEAD > 0
    }

    /// Return true if and only if this represents a quit state. A quit state
    /// is a state that is representationally equivalent to a dead state,
    /// except it indicates the automaton has reached a point at which it can
    /// no longer determine whether a match exists or not. In general, this
    /// indicates an error during search and the caller must either pass this
    /// error up or use a different search technique.
    #[inline]
    pub const fn is_quit(&self) -> bool {
        self.as_usize_unchecked() & LazyStateID::MASK_QUIT > 0
    }

    /// Return true if and only if this lazy state ID has been tagged as a
    /// start state.
    #[inline]
    pub const fn is_start(&self) -> bool {
        self.as_usize_unchecked() & LazyStateID::MASK_START > 0
    }

    /// Return true if and only if this lazy state ID has been tagged as a
    /// match state.
    #[inline]
    pub const fn is_match(&self) -> bool {
        self.as_usize_unchecked() & LazyStateID::MASK_MATCH > 0
    }
}

/// This error occurs when a lazy state ID could not be constructed.
///
/// This occurs when given an integer exceeding the maximum lazy state ID
/// value.
///
/// When the `std` feature is enabled, this implements the `Error` trait.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct LazyStateIDError {
    attempted: u64,
}

impl LazyStateIDError {
    /// Returns the value that failed to constructed a lazy state ID.
    pub(crate) fn attempted(&self) -> u64 {
        self.attempted
    }
}

#[cfg(feature = "std")]
impl std::error::Error for LazyStateIDError {}

impl core::fmt::Display for LazyStateIDError {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(
            f,
            "failed to create LazyStateID from {:?}, which exceeds {:?}",
            self.attempted(),
            LazyStateID::MAX,
        )
    }
}

/// Represents the current state of an overlapping search.
///
/// This is used for overlapping searches since they need to know something
/// about the previous search. For example, when multiple patterns match at the
/// same position, this state tracks the last reported pattern so that the next
/// search knows whether to report another matching pattern or continue with
/// the search at the next position. Additionally, it also tracks which state
/// the last search call terminated in.
///
/// This type provides no introspection capabilities. The only thing a caller
/// can do is construct it and pass it around to permit search routines to use
/// it to track state.
///
/// Callers should always provide a fresh state constructed via
/// [`OverlappingState::start`] when starting a new search. Reusing state from
/// a previous search may result in incorrect results.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct OverlappingState {
    /// The state ID of the state at which the search was in when the call
    /// terminated. When this is a match state, `last_match` must be set to a
    /// non-None value.
    ///
    /// A `None` value indicates the start state of the corresponding
    /// automaton. We cannot use the actual ID, since any one automaton may
    /// have many start states, and which one is in use depends on several
    /// search-time factors.
    id: Option<LazyStateID>,
    /// Information associated with a match when `id` corresponds to a match
    /// state.
    last_match: Option<StateMatch>,
}

/// Internal state about the last match that occurred. This records both the
/// offset of the match and the match index.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) struct StateMatch {
    /// The index into the matching patterns for the current match state.
    pub(crate) match_index: usize,
    /// The offset in the haystack at which the match occurred. This is used
    /// when reporting multiple matches at the same offset. That is, when
    /// an overlapping search runs, the first thing it checks is whether it's
    /// already in a match state, and if so, whether there are more patterns
    /// to report as matches in that state. If so, it increments `match_index`
    /// and returns the pattern and this offset. Once `match_index` exceeds the
    /// number of matching patterns in the current state, the search continues.
    pub(crate) offset: usize,
}

impl OverlappingState {
    /// Create a new overlapping state that begins at the start state of any
    /// automaton.
    pub fn start() -> OverlappingState {
        OverlappingState { id: None, last_match: None }
    }

    pub(crate) fn id(&self) -> Option<LazyStateID> {
        self.id
    }

    pub(crate) fn set_id(&mut self, id: LazyStateID) {
        self.id = Some(id);
    }

    pub(crate) fn last_match(&mut self) -> Option<&mut StateMatch> {
        self.last_match.as_mut()
    }

    pub(crate) fn set_last_match(&mut self, last_match: StateMatch) {
        self.last_match = Some(last_match);
    }
}