summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata/src/util/mod.rs
blob: 798507da215d00fd24453a1c1b31144fc7295211 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
/*!
TODO
*/

use core::{ascii, fmt, str};

#[cfg(feature = "alloc")]
use alloc::vec::Vec;

pub mod alphabet;
pub(crate) mod bytes;
#[cfg(feature = "alloc")]
pub(crate) mod determinize;
pub mod id;
#[cfg(feature = "alloc")]
pub(crate) mod lazy;
pub(crate) mod matchtypes;
pub mod prefilter;
#[cfg(feature = "alloc")]
pub(crate) mod sparse_set;
pub(crate) mod start;
#[cfg(feature = "alloc")]
pub(crate) mod syntax;

/// The offset, in bytes, that a match is delayed by in the DFAs generated by
/// this crate. (This includes lazy DFAs.)
///
/// The purpose of this delay is to support look-ahead such as \b (ASCII-only)
/// and $. In particular, both of these operators may require the
/// identification of the end of input in order to confirm a match. Not only
/// does this mean that all matches must therefore be delayed by a single byte,
/// but that a special EOI value is added to the alphabet of all DFAs. (Which
/// means that even though the alphabet of a DFA is typically all byte values,
/// the actual maximum alphabet size is 257 due to the extra EOI value.)
///
/// Since we delay matches by only 1 byte, this can't fully support a
/// Unicode-aware \b operator, which requires multi-byte look-ahead. Indeed,
/// DFAs in this crate do not support it. (It's not as simple as just
/// increasing the match offset to do it---otherwise we would---but building
/// the full Unicode-aware word boundary detection into an automaton is quite
/// tricky.)
pub(crate) const MATCH_OFFSET: usize = 1;

/// A type that wraps a single byte with a convenient fmt::Debug impl that
/// escapes the byte.
pub(crate) struct DebugByte(pub u8);

impl fmt::Debug for DebugByte {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // 10 bytes is enough to cover any output from ascii::escape_default.
        let mut bytes = [0u8; 10];
        let mut len = 0;
        for (i, mut b) in ascii::escape_default(self.0).enumerate() {
            // capitalize \xab to \xAB
            if i >= 2 && b'a' <= b && b <= b'f' {
                b -= 32;
            }
            bytes[len] = b;
            len += 1;
        }
        write!(f, "{}", str::from_utf8(&bytes[..len]).unwrap())
    }
}

/// Returns the smallest possible index of the next valid UTF-8 sequence
/// starting after `i`.
///
/// For all inputs, including invalid UTF-8 and any value of `i`, the return
/// value is guaranteed to be greater than `i`.
///
/// Generally speaking, this should only be called on `text` when it is
/// permitted to assume that it is valid UTF-8 and where either `i >=
/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
#[inline(always)]
pub(crate) fn next_utf8(text: &[u8], i: usize) -> usize {
    let b = match text.get(i) {
        None => return i.checked_add(1).unwrap(),
        Some(&b) => b,
    };
    // For cases where we see an invalid UTF-8 byte, there isn't much we can do
    // other than just start at the next byte.
    let inc = utf8_len(b).unwrap_or(1);
    i.checked_add(inc).unwrap()
}

/// Returns true if and only if the given byte is considered a word character.
/// This only applies to ASCII.
///
/// This was copied from regex-syntax so that we can use it to determine the
/// starting DFA state while searching without depending on regex-syntax. The
/// definition is never going to change, so there's no maintenance/bit-rot
/// hazard here.
#[inline(always)]
pub(crate) fn is_word_byte(b: u8) -> bool {
    match b {
        b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true,
        _ => false,
    }
}

/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the beginning of the given
/// byte slice, then the first byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
#[inline(always)]
pub(crate) fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
    if bytes.is_empty() {
        return None;
    }
    let len = match utf8_len(bytes[0]) {
        None => return Some(Err(bytes[0])),
        Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
        Some(1) => return Some(Ok(bytes[0] as char)),
        Some(len) => len,
    };
    match str::from_utf8(&bytes[..len]) {
        Ok(s) => Some(Ok(s.chars().next().unwrap())),
        Err(_) => Some(Err(bytes[0])),
    }
}

/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the end of the given byte
/// slice, then the last byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
#[inline(always)]
pub(crate) fn decode_last_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
    if bytes.is_empty() {
        return None;
    }
    let mut start = bytes.len() - 1;
    let limit = bytes.len().saturating_sub(4);
    while start > limit && !is_leading_or_invalid_utf8_byte(bytes[start]) {
        start -= 1;
    }
    match decode_utf8(&bytes[start..]) {
        None => None,
        Some(Ok(ch)) => Some(Ok(ch)),
        Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])),
    }
}

/// Given a UTF-8 leading byte, this returns the total number of code units
/// in the following encoded codepoint.
///
/// If the given byte is not a valid UTF-8 leading byte, then this returns
/// `None`.
#[inline(always)]
fn utf8_len(byte: u8) -> Option<usize> {
    if byte <= 0x7F {
        return Some(1);
    } else if byte & 0b1100_0000 == 0b1000_0000 {
        return None;
    } else if byte <= 0b1101_1111 {
        Some(2)
    } else if byte <= 0b1110_1111 {
        Some(3)
    } else if byte <= 0b1111_0111 {
        Some(4)
    } else {
        None
    }
}

/// Returns true if and only if the given byte is either a valid leading UTF-8
/// byte, or is otherwise an invalid byte that can never appear anywhere in a
/// valid UTF-8 sequence.
#[inline(always)]
fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
    // In the ASCII case, the most significant bit is never set. The leading
    // byte of a 2/3/4-byte sequence always has the top two most significant
    // bits set. For bytes that can never appear anywhere in valid UTF-8, this
    // also returns true, since every such byte has its two most significant
    // bits set:
    //
    //     \xC0 :: 11000000
    //     \xC1 :: 11000001
    //     \xF5 :: 11110101
    //     \xF6 :: 11110110
    //     \xF7 :: 11110111
    //     \xF8 :: 11111000
    //     \xF9 :: 11111001
    //     \xFA :: 11111010
    //     \xFB :: 11111011
    //     \xFC :: 11111100
    //     \xFD :: 11111101
    //     \xFE :: 11111110
    //     \xFF :: 11111111
    (b & 0b1100_0000) != 0b1000_0000
}

#[cfg(feature = "alloc")]
#[inline(always)]
pub(crate) fn is_word_char_fwd(bytes: &[u8], mut at: usize) -> bool {
    use core::{ptr, sync::atomic::AtomicPtr};

    use crate::{
        dfa::{
            dense::{self, DFA},
            Automaton,
        },
        util::lazy,
    };

    static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());

    let dfa = lazy::get_or_init(&WORD, || {
        // TODO: Should we use a lazy DFA here instead? It does complicate
        // things somewhat, since we then need a mutable cache, which probably
        // means a thread local.
        dense::Builder::new()
            .configure(dense::Config::new().anchored(true))
            .build(r"\w")
            .unwrap()
    });
    // This is OK since '\w' contains no look-around.
    let mut sid = dfa.universal_start_state();
    while at < bytes.len() {
        let byte = bytes[at];
        sid = dfa.next_state(sid, byte);
        at += 1;
        if dfa.is_special_state(sid) {
            if dfa.is_match_state(sid) {
                return true;
            } else if dfa.is_dead_state(sid) {
                return false;
            }
        }
    }
    dfa.is_match_state(dfa.next_eoi_state(sid))
}

#[cfg(feature = "alloc")]
#[inline(always)]
pub(crate) fn is_word_char_rev(bytes: &[u8], mut at: usize) -> bool {
    use core::{ptr, sync::atomic::AtomicPtr};

    use crate::{
        dfa::{
            dense::{self, DFA},
            Automaton,
        },
        nfa::thompson::NFA,
    };

    static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());

    let dfa = lazy::get_or_init(&WORD, || {
        dense::Builder::new()
            .configure(dense::Config::new().anchored(true))
            .thompson(NFA::config().reverse(true).shrink(true))
            .build(r"\w")
            .unwrap()
    });

    // This is OK since '\w' contains no look-around.
    let mut sid = dfa.universal_start_state();
    while at > 0 {
        at -= 1;
        let byte = bytes[at];
        sid = dfa.next_state(sid, byte);
        if dfa.is_special_state(sid) {
            if dfa.is_match_state(sid) {
                return true;
            } else if dfa.is_dead_state(sid) {
                return false;
            }
        }
    }
    dfa.is_match_state(dfa.next_eoi_state(sid))
}