summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata/src/dfa/search_unsafe.rs
blob: ea1c29ff72c7f9eb48dfd13b27491dd01713681c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
use crate::dfa::automaton::{Automaton, State};
use crate::MatchError;

/// This is marked as `inline(always)` specifically because it supports
/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined
/// permits eliminating a few crucial branches.
#[inline(always)]
pub fn find_fwd<A: Automaton + ?Sized>(
    dfa: &A,
    bytes: &[u8],
    start: usize,
    end: usize,
    earliest: bool,
) -> Result<Option<usize>, MatchError> {
    assert!(start <= end);
    assert!(start <= bytes.len());
    assert!(end <= bytes.len());

    let (mut state, mut last_match) = init_fwd(dfa, bytes, start, end)?;
    if earliest && last_match.is_some() {
        return Ok(last_match);
    }

    let mut at = start;
    while at < end {
        let byte = bytes[at];
        state = dfa.next_state(state, byte);
        at += 1;
        if dfa.is_special_state(state) {
            if dfa.is_dead_state(state) {
                return Ok(last_match);
            } else if dfa.is_quit_state(state) {
                return Err(MatchError::Quit { byte, offset: at - 1 });
            }
            last_match = Some(at - dfa.match_offset());
            if earliest {
                return Ok(last_match);
            }
        }
    }
    /*
    unsafe {
        let mut p = bytes.as_ptr().add(start);
        while p < bytes[end..].as_ptr() {
            let byte = *p;
            state = dfa.next_state_unchecked(state, byte);
            p = p.add(1);
            if dfa.is_special_state(state) {
                if dfa.is_dead_state(state) {
                    return Ok(last_match);
                } else if dfa.is_quit_state(state) {
                    return Err(MatchError::Quit {
                        byte,
                        offset: offset(bytes, p) - 1,
                    });
                }
                last_match = Some(offset(bytes, p) - dfa.match_offset());
                if earliest {
                    return Ok(last_match);
                }
            }
        }
    }
    */
    Ok(eof_fwd(dfa, bytes, end, &mut state)?.or(last_match))
}

/// This is marked as `inline(always)` specifically because it supports
/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined
/// permits eliminating a few crucial branches.
#[inline(always)]
pub fn find_rev<A: Automaton + ?Sized>(
    dfa: &A,
    bytes: &[u8],
    start: usize,
    end: usize,
    earliest: bool,
) -> Result<Option<usize>, MatchError> {
    assert!(start <= end);
    assert!(start <= bytes.len());
    assert!(end <= bytes.len());

    let (mut state, mut last_match) = init_rev(dfa, bytes, start, end)?;
    if earliest && last_match.is_some() {
        return Ok(last_match);
    }

    let mut at = end;
    while at > start {
        at -= 1;
        let byte = bytes[at];
        state = dfa.next_state(state, byte);
        if dfa.is_special_state(state) {
            if dfa.is_dead_state(state) {
                return Ok(last_match);
            } else if dfa.is_quit_state(state) {
                return Err(MatchError::Quit { byte, offset: at });
            }
            last_match = Some(at + dfa.match_offset());
            if earliest {
                return Ok(last_match);
            }
        }
    }
    /*
    unsafe {
        let mut p = bytes.as_ptr().add(end);
        while p > bytes[start..].as_ptr() {
            p = p.sub(1);
            let byte = *p;
            state = dfa.next_state_unchecked(state, byte);
            if dfa.is_special_state(state) {
                if dfa.is_dead_state(state) {
                    return Ok(last_match);
                } else if dfa.is_quit_state(state) {
                    return Err(MatchError::Quit {
                        byte,
                        offset: offset(bytes, p),
                    });
                }
                last_match = Some(offset(bytes, p) + dfa.match_offset());
                if earliest {
                    return Ok(last_match);
                }
            }
        }
    }
    */
    Ok(eof_rev(dfa, state, bytes, start)?.or(last_match))
}

pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
    dfa: &A,
    bytes: &[u8],
    mut start: usize,
    end: usize,
    caller_state: &mut State<A::ID>,
) -> Result<Option<usize>, MatchError> {
    assert!(start <= end);
    assert!(start <= bytes.len());
    assert!(end <= bytes.len());

    let (mut state, mut last_match) = match caller_state.as_option() {
        None => init_fwd(dfa, bytes, start, end)?,
        Some(id) => {
            // This is a subtle but critical detail. If the caller provides a
            // non-None state ID, then it must be the case that the state ID
            // corresponds to one set by this function. The state ID therefore
            // corresponds to a match state, a dead state or some other state.
            // However, "some other" state _only_ occurs when the input has
            // been exhausted because the only way to stop before then is to
            // see a match or a dead/quit state.
            //
            // If the input is exhausted or if it's a dead state, then
            // incrementing the starting position has no relevance on
            // correctness, since the loop below will either not execute
            // at all or will immediately stop due to being in a dead state.
            // (Once in a dead state it is impossible to leave it.)
            //
            // Therefore, the only case we need to consider is when
            // caller_state is a match state. In this case, since our machines
            // support the ability to delay a match by a certain number of
            // bytes (to support look-around), it follows that we actually
            // consumed that many additional bytes on our previous search. When
            // the caller resumes their search to find subsequent matches, they
            // will use the ending location from the previous match as the next
            // starting point, which is `match_offset` bytes PRIOR to where
            // we scanned to on the previous search. Therefore, we need to
            // compensate by bumping `start` up by `match_offset` bytes.
            start += dfa.match_offset();
            // Since match_offset could be any arbitrary value and we use
            // `start` in pointer arithmetic below, we check that we are still
            // in bounds. Otherwise, we could materialize a pointer that is
            // more than one past the end point of `bytes`, which is UB.
            if start > end {
                return Ok(None);
            }
            (id, None)
        }
    };
    if last_match.is_some() {
        caller_state.set(state);
        return Ok(last_match);
    }

    let mut at = start;
    while at < end {
        let byte = bytes[at];
        state = dfa.next_state(state, byte);
        at += 1;
        if dfa.is_special_state(state) {
            caller_state.set(state);
            if dfa.is_dead_state(state) {
                return Ok(None);
            } else if dfa.is_quit_state(state) {
                return Err(MatchError::Quit { byte, offset: at - 1 });
            } else {
                return Ok(Some(at - dfa.match_offset()));
            }
        }
    }
    /*
    // SAFETY: Other than the normal pointer arithmetic happening here, a
    // unique aspect of safety for this function is the fact that the caller
    // can provide the state that the search routine will start with. If this
    // state were invalid, it would be possible to incorrectly index the
    // transition table. We however prevent this from happening by guaranteeing
    // that State is valid. Namely, callers cannot mutate a State. All they can
    // do is create a "start" state or otherwise reuse a previously set state.
    // Since callers can't mutate a state, it follows that a previously set
    // state can only be retrieved by crate internal functions. Therefore, our
    // use of it is safe since this code will only ever set the provided state
    // to a valid state.
    unsafe {
        let mut p = bytes.as_ptr().add(start);
        while p < bytes[end..].as_ptr() {
            let byte = *p;
            state = dfa.next_state_unchecked(state, byte);
            p = p.add(1);
            if dfa.is_special_state(state) {
                caller_state.set(state);
                return if dfa.is_dead_state(state) {
                    Ok(None)
                } else if dfa.is_quit_state(state) {
                    Err(MatchError::Quit { byte, offset: offset(bytes, p) - 1 })
                } else {
                    Ok(Some(offset(bytes, p) - dfa.match_offset()))
                };
            }
        }
    }
    */

    let result = eof_fwd(dfa, bytes, end, &mut state);
    caller_state.set(state);
    result
}

fn init_fwd<A: Automaton + ?Sized>(
    dfa: &A,
    bytes: &[u8],
    start: usize,
    end: usize,
) -> Result<(A::ID, Option<usize>), MatchError> {
    let state = dfa.start_state_forward(bytes, start, end);
    if dfa.is_match_state(state) {
        Ok((state, Some(start - dfa.match_offset())))
    } else {
        Ok((state, None))
    }
}

fn init_rev<A: Automaton + ?Sized>(
    dfa: &A,
    bytes: &[u8],
    start: usize,
    end: usize,
) -> Result<(A::ID, Option<usize>), MatchError> {
    let state = dfa.start_state_reverse(bytes, start, end);
    if dfa.is_match_state(state) {
        Ok((state, Some(end + dfa.match_offset())))
    } else {
        Ok((state, None))
    }
}

fn eof_fwd<A: Automaton + ?Sized>(
    dfa: &A,
    bytes: &[u8],
    end: usize,
    state: &mut A::ID,
) -> Result<Option<usize>, MatchError> {
    match bytes.get(end) {
        Some(&b) => {
            *state = dfa.next_state(*state, b);
            if dfa.is_match_state(*state) {
                Ok(Some(end))
            } else {
                Ok(None)
            }
        }
        None => {
            *state = dfa.next_eof_state(*state);
            if dfa.is_match_state(*state) {
                Ok(Some(bytes.len()))
            } else {
                Ok(None)
            }
        }
    }
}

fn eof_rev<A: Automaton + ?Sized>(
    dfa: &A,
    state: A::ID,
    bytes: &[u8],
    start: usize,
) -> Result<Option<usize>, MatchError> {
    if start > 0 {
        if dfa.is_match_state(dfa.next_state(state, bytes[start - 1])) {
            Ok(Some(start))
        } else {
            Ok(None)
        }
    } else {
        if dfa.is_match_state(dfa.next_eof_state(state)) {
            Ok(Some(0))
        } else {
            Ok(None)
        }
    }
}

/// Returns the distance between the given pointer and the start of `bytes`.
/// This assumes that the given pointer points to somewhere in the `bytes`
/// slice given.
fn offset(bytes: &[u8], p: *const u8) -> usize {
    debug_assert!(bytes.as_ptr() <= p);
    debug_assert!(bytes[bytes.len()..].as_ptr() >= p);
    ((p as isize) - (bytes.as_ptr() as isize)) as usize
}