diff options
Diffstat (limited to 'vendor/regex-automata/src/util/mod.rs')
-rw-r--r-- | vendor/regex-automata/src/util/mod.rs | 312 |
1 files changed, 47 insertions, 265 deletions
diff --git a/vendor/regex-automata/src/util/mod.rs b/vendor/regex-automata/src/util/mod.rs index 798507da2..bb739df1d 100644 --- a/vendor/regex-automata/src/util/mod.rs +++ b/vendor/regex-automata/src/util/mod.rs @@ -1,275 +1,57 @@ /*! -TODO +A collection of modules that provide APIs that are useful across many regex +engines. + +While one should explore the sub-modules directly to get a sense of what's +there, here are some highlights that tie the sub-modules to higher level +use cases: + +* `alphabet` contains APIs that are useful if you're doing low level things +with the DFAs in this crate. For example, implementing determinization or +walking its state graph directly. +* `captures` contains APIs for dealing with capture group matches and their +mapping to "slots" used inside an NFA graph. This is also where you can find +iterators over capture group names. +* `escape` contains types for pretty-printing raw byte slices as strings. +* `iter` contains API helpers for writing regex iterators. +* `lazy` contains a no-std and no-alloc variant of `lazy_static!` and +`once_cell`. +* `look` contains APIs for matching and configuring look-around assertions. +* `pool` provides a way to reuse mutable memory allocated in a thread safe +manner. +* `prefilter` provides APIs for building prefilters and using them in searches. +* `primitives` are what you might use if you're doing lower level work on +automata, such as walking an NFA state graph. +* `syntax` provides some higher level convenience functions for interacting +with the `regex-syntax` crate. +* `wire` is useful if you're working with DFA serialization. */ -use core::{ascii, fmt, str}; - -#[cfg(feature = "alloc")] -use alloc::vec::Vec; - pub mod alphabet; -pub(crate) mod bytes; #[cfg(feature = "alloc")] -pub(crate) mod determinize; -pub mod id; +pub mod captures; +pub mod escape; #[cfg(feature = "alloc")] -pub(crate) mod lazy; -pub(crate) mod matchtypes; +pub mod interpolate; +pub mod iter; +pub mod lazy; +pub mod look; +#[cfg(feature = "alloc")] +pub mod pool; pub mod prefilter; +pub mod primitives; +#[cfg(feature = "syntax")] +pub mod syntax; +pub mod wire; + +#[cfg(any(feature = "dfa-build", feature = "hybrid"))] +pub(crate) mod determinize; +pub(crate) mod empty; +pub(crate) mod int; +pub(crate) mod memchr; +pub(crate) mod search; #[cfg(feature = "alloc")] pub(crate) mod sparse_set; pub(crate) mod start; -#[cfg(feature = "alloc")] -pub(crate) mod syntax; - -/// The offset, in bytes, that a match is delayed by in the DFAs generated by -/// this crate. (This includes lazy DFAs.) -/// -/// The purpose of this delay is to support look-ahead such as \b (ASCII-only) -/// and $. In particular, both of these operators may require the -/// identification of the end of input in order to confirm a match. Not only -/// does this mean that all matches must therefore be delayed by a single byte, -/// but that a special EOI value is added to the alphabet of all DFAs. (Which -/// means that even though the alphabet of a DFA is typically all byte values, -/// the actual maximum alphabet size is 257 due to the extra EOI value.) -/// -/// Since we delay matches by only 1 byte, this can't fully support a -/// Unicode-aware \b operator, which requires multi-byte look-ahead. Indeed, -/// DFAs in this crate do not support it. (It's not as simple as just -/// increasing the match offset to do it---otherwise we would---but building -/// the full Unicode-aware word boundary detection into an automaton is quite -/// tricky.) -pub(crate) const MATCH_OFFSET: usize = 1; - -/// A type that wraps a single byte with a convenient fmt::Debug impl that -/// escapes the byte. -pub(crate) struct DebugByte(pub u8); - -impl fmt::Debug for DebugByte { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - // 10 bytes is enough to cover any output from ascii::escape_default. - let mut bytes = [0u8; 10]; - let mut len = 0; - for (i, mut b) in ascii::escape_default(self.0).enumerate() { - // capitalize \xab to \xAB - if i >= 2 && b'a' <= b && b <= b'f' { - b -= 32; - } - bytes[len] = b; - len += 1; - } - write!(f, "{}", str::from_utf8(&bytes[..len]).unwrap()) - } -} - -/// Returns the smallest possible index of the next valid UTF-8 sequence -/// starting after `i`. -/// -/// For all inputs, including invalid UTF-8 and any value of `i`, the return -/// value is guaranteed to be greater than `i`. -/// -/// Generally speaking, this should only be called on `text` when it is -/// permitted to assume that it is valid UTF-8 and where either `i >= -/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence. -#[inline(always)] -pub(crate) fn next_utf8(text: &[u8], i: usize) -> usize { - let b = match text.get(i) { - None => return i.checked_add(1).unwrap(), - Some(&b) => b, - }; - // For cases where we see an invalid UTF-8 byte, there isn't much we can do - // other than just start at the next byte. - let inc = utf8_len(b).unwrap_or(1); - i.checked_add(inc).unwrap() -} - -/// Returns true if and only if the given byte is considered a word character. -/// This only applies to ASCII. -/// -/// This was copied from regex-syntax so that we can use it to determine the -/// starting DFA state while searching without depending on regex-syntax. The -/// definition is never going to change, so there's no maintenance/bit-rot -/// hazard here. -#[inline(always)] -pub(crate) fn is_word_byte(b: u8) -> bool { - match b { - b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true, - _ => false, - } -} - -/// Decodes the next UTF-8 encoded codepoint from the given byte slice. -/// -/// If no valid encoding of a codepoint exists at the beginning of the given -/// byte slice, then the first byte is returned instead. -/// -/// This returns `None` if and only if `bytes` is empty. -#[inline(always)] -pub(crate) fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> { - if bytes.is_empty() { - return None; - } - let len = match utf8_len(bytes[0]) { - None => return Some(Err(bytes[0])), - Some(len) if len > bytes.len() => return Some(Err(bytes[0])), - Some(1) => return Some(Ok(bytes[0] as char)), - Some(len) => len, - }; - match str::from_utf8(&bytes[..len]) { - Ok(s) => Some(Ok(s.chars().next().unwrap())), - Err(_) => Some(Err(bytes[0])), - } -} - -/// Decodes the last UTF-8 encoded codepoint from the given byte slice. -/// -/// If no valid encoding of a codepoint exists at the end of the given byte -/// slice, then the last byte is returned instead. -/// -/// This returns `None` if and only if `bytes` is empty. -#[inline(always)] -pub(crate) fn decode_last_utf8(bytes: &[u8]) -> Option<Result<char, u8>> { - if bytes.is_empty() { - return None; - } - let mut start = bytes.len() - 1; - let limit = bytes.len().saturating_sub(4); - while start > limit && !is_leading_or_invalid_utf8_byte(bytes[start]) { - start -= 1; - } - match decode_utf8(&bytes[start..]) { - None => None, - Some(Ok(ch)) => Some(Ok(ch)), - Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])), - } -} - -/// Given a UTF-8 leading byte, this returns the total number of code units -/// in the following encoded codepoint. -/// -/// If the given byte is not a valid UTF-8 leading byte, then this returns -/// `None`. -#[inline(always)] -fn utf8_len(byte: u8) -> Option<usize> { - if byte <= 0x7F { - return Some(1); - } else if byte & 0b1100_0000 == 0b1000_0000 { - return None; - } else if byte <= 0b1101_1111 { - Some(2) - } else if byte <= 0b1110_1111 { - Some(3) - } else if byte <= 0b1111_0111 { - Some(4) - } else { - None - } -} - -/// Returns true if and only if the given byte is either a valid leading UTF-8 -/// byte, or is otherwise an invalid byte that can never appear anywhere in a -/// valid UTF-8 sequence. -#[inline(always)] -fn is_leading_or_invalid_utf8_byte(b: u8) -> bool { - // In the ASCII case, the most significant bit is never set. The leading - // byte of a 2/3/4-byte sequence always has the top two most significant - // bits set. For bytes that can never appear anywhere in valid UTF-8, this - // also returns true, since every such byte has its two most significant - // bits set: - // - // \xC0 :: 11000000 - // \xC1 :: 11000001 - // \xF5 :: 11110101 - // \xF6 :: 11110110 - // \xF7 :: 11110111 - // \xF8 :: 11111000 - // \xF9 :: 11111001 - // \xFA :: 11111010 - // \xFB :: 11111011 - // \xFC :: 11111100 - // \xFD :: 11111101 - // \xFE :: 11111110 - // \xFF :: 11111111 - (b & 0b1100_0000) != 0b1000_0000 -} - -#[cfg(feature = "alloc")] -#[inline(always)] -pub(crate) fn is_word_char_fwd(bytes: &[u8], mut at: usize) -> bool { - use core::{ptr, sync::atomic::AtomicPtr}; - - use crate::{ - dfa::{ - dense::{self, DFA}, - Automaton, - }, - util::lazy, - }; - - static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut()); - - let dfa = lazy::get_or_init(&WORD, || { - // TODO: Should we use a lazy DFA here instead? It does complicate - // things somewhat, since we then need a mutable cache, which probably - // means a thread local. - dense::Builder::new() - .configure(dense::Config::new().anchored(true)) - .build(r"\w") - .unwrap() - }); - // This is OK since '\w' contains no look-around. - let mut sid = dfa.universal_start_state(); - while at < bytes.len() { - let byte = bytes[at]; - sid = dfa.next_state(sid, byte); - at += 1; - if dfa.is_special_state(sid) { - if dfa.is_match_state(sid) { - return true; - } else if dfa.is_dead_state(sid) { - return false; - } - } - } - dfa.is_match_state(dfa.next_eoi_state(sid)) -} - -#[cfg(feature = "alloc")] -#[inline(always)] -pub(crate) fn is_word_char_rev(bytes: &[u8], mut at: usize) -> bool { - use core::{ptr, sync::atomic::AtomicPtr}; - - use crate::{ - dfa::{ - dense::{self, DFA}, - Automaton, - }, - nfa::thompson::NFA, - }; - - static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut()); - - let dfa = lazy::get_or_init(&WORD, || { - dense::Builder::new() - .configure(dense::Config::new().anchored(true)) - .thompson(NFA::config().reverse(true).shrink(true)) - .build(r"\w") - .unwrap() - }); - - // This is OK since '\w' contains no look-around. - let mut sid = dfa.universal_start_state(); - while at > 0 { - at -= 1; - let byte = bytes[at]; - sid = dfa.next_state(sid, byte); - if dfa.is_special_state(sid) { - if dfa.is_match_state(sid) { - return true; - } else if dfa.is_dead_state(sid) { - return false; - } - } - } - dfa.is_match_state(dfa.next_eoi_state(sid)) -} +pub(crate) mod unicode_data; +pub(crate) mod utf8; |