1 files changed, 47 insertions, 265 deletions
diff --git a/vendor/regex-automata/src/util/mod.rs b/vendor/regex-automata/src/util/mod.rs
index 798507da2..bb739df1d 100644
--- a/vendor/regex-automata/src/util/mod.rs
+++ b/vendor/regex-automata/src/util/mod.rs
@@ -1,275 +1,57 @@
 /*!
-TODO
+A collection of modules that provide APIs that are useful across many regex
+engines.
+
+While one should explore the sub-modules directly to get a sense of what's
+there, here are some highlights that tie the sub-modules to higher level
+use cases:
+
+* `alphabet` contains APIs that are useful if you're doing low level things
+with the DFAs in this crate. For example, implementing determinization or
+walking its state graph directly.
+* `captures` contains APIs for dealing with capture group matches and their
+mapping to "slots" used inside an NFA graph. This is also where you can find
+iterators over capture group names.
+* `escape` contains types for pretty-printing raw byte slices as strings.
+* `iter` contains API helpers for writing regex iterators.
+* `lazy` contains a no-std and no-alloc variant of `lazy_static!` and
+`once_cell`.
+* `look` contains APIs for matching and configuring look-around assertions.
+* `pool` provides a way to reuse mutable memory allocated in a thread safe
+manner.
+* `prefilter` provides APIs for building prefilters and using them in searches.
+* `primitives` are what you might use if you're doing lower level work on
+automata, such as walking an NFA state graph.
+* `syntax` provides some higher level convenience functions for interacting
+with the `regex-syntax` crate.
+* `wire` is useful if you're working with DFA serialization.
 */
 
-use core::{ascii, fmt, str};
-
-#[cfg(feature = "alloc")]
-use alloc::vec::Vec;
-
 pub mod alphabet;
-pub(crate) mod bytes;
 #[cfg(feature = "alloc")]
-pub(crate) mod determinize;
-pub mod id;
+pub mod captures;
+pub mod escape;
 #[cfg(feature = "alloc")]
-pub(crate) mod lazy;
-pub(crate) mod matchtypes;
+pub mod interpolate;
+pub mod iter;
+pub mod lazy;
+pub mod look;
+#[cfg(feature = "alloc")]
+pub mod pool;
 pub mod prefilter;
+pub mod primitives;
+#[cfg(feature = "syntax")]
+pub mod syntax;
+pub mod wire;
+
+#[cfg(any(feature = "dfa-build", feature = "hybrid"))]
+pub(crate) mod determinize;
+pub(crate) mod empty;
+pub(crate) mod int;
+pub(crate) mod memchr;
+pub(crate) mod search;
 #[cfg(feature = "alloc")]
 pub(crate) mod sparse_set;
 pub(crate) mod start;
-#[cfg(feature = "alloc")]
-pub(crate) mod syntax;
-
-/// The offset, in bytes, that a match is delayed by in the DFAs generated by
-/// this crate. (This includes lazy DFAs.)
-///
-/// The purpose of this delay is to support look-ahead such as \b (ASCII-only)
-/// and $. In particular, both of these operators may require the
-/// identification of the end of input in order to confirm a match. Not only
-/// does this mean that all matches must therefore be delayed by a single byte,
-/// but that a special EOI value is added to the alphabet of all DFAs. (Which
-/// means that even though the alphabet of a DFA is typically all byte values,
-/// the actual maximum alphabet size is 257 due to the extra EOI value.)
-///
-/// Since we delay matches by only 1 byte, this can't fully support a
-/// Unicode-aware \b operator, which requires multi-byte look-ahead. Indeed,
-/// DFAs in this crate do not support it. (It's not as simple as just
-/// increasing the match offset to do it---otherwise we would---but building
-/// the full Unicode-aware word boundary detection into an automaton is quite
-/// tricky.)
-pub(crate) const MATCH_OFFSET: usize = 1;
-
-/// A type that wraps a single byte with a convenient fmt::Debug impl that
-/// escapes the byte.
-pub(crate) struct DebugByte(pub u8);
-
-impl fmt::Debug for DebugByte {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        // 10 bytes is enough to cover any output from ascii::escape_default.
-        let mut bytes = [0u8; 10];
-        let mut len = 0;
-        for (i, mut b) in ascii::escape_default(self.0).enumerate() {
-            // capitalize \xab to \xAB
-            if i >= 2 && b'a' <= b && b <= b'f' {
-                b -= 32;
-            }
-            bytes[len] = b;
-            len += 1;
-        }
-        write!(f, "{}", str::from_utf8(&bytes[..len]).unwrap())
-    }
-}
-
-/// Returns the smallest possible index of the next valid UTF-8 sequence
-/// starting after `i`.
-///
-/// For all inputs, including invalid UTF-8 and any value of `i`, the return
-/// value is guaranteed to be greater than `i`.
-///
-/// Generally speaking, this should only be called on `text` when it is
-/// permitted to assume that it is valid UTF-8 and where either `i >=
-/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
-#[inline(always)]
-pub(crate) fn next_utf8(text: &[u8], i: usize) -> usize {
-    let b = match text.get(i) {
-        None => return i.checked_add(1).unwrap(),
-        Some(&b) => b,
-    };
-    // For cases where we see an invalid UTF-8 byte, there isn't much we can do
-    // other than just start at the next byte.
-    let inc = utf8_len(b).unwrap_or(1);
-    i.checked_add(inc).unwrap()
-}
-
-/// Returns true if and only if the given byte is considered a word character.
-/// This only applies to ASCII.
-///
-/// This was copied from regex-syntax so that we can use it to determine the
-/// starting DFA state while searching without depending on regex-syntax. The
-/// definition is never going to change, so there's no maintenance/bit-rot
-/// hazard here.
-#[inline(always)]
-pub(crate) fn is_word_byte(b: u8) -> bool {
-    match b {
-        b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true,
-        _ => false,
-    }
-}
-
-/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
-///
-/// If no valid encoding of a codepoint exists at the beginning of the given
-/// byte slice, then the first byte is returned instead.
-///
-/// This returns `None` if and only if `bytes` is empty.
-#[inline(always)]
-pub(crate) fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
-    if bytes.is_empty() {
-        return None;
-    }
-    let len = match utf8_len(bytes[0]) {
-        None => return Some(Err(bytes[0])),
-        Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
-        Some(1) => return Some(Ok(bytes[0] as char)),
-        Some(len) => len,
-    };
-    match str::from_utf8(&bytes[..len]) {
-        Ok(s) => Some(Ok(s.chars().next().unwrap())),
-        Err(_) => Some(Err(bytes[0])),
-    }
-}
-
-/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
-///
-/// If no valid encoding of a codepoint exists at the end of the given byte
-/// slice, then the last byte is returned instead.
-///
-/// This returns `None` if and only if `bytes` is empty.
-#[inline(always)]
-pub(crate) fn decode_last_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
-    if bytes.is_empty() {
-        return None;
-    }
-    let mut start = bytes.len() - 1;
-    let limit = bytes.len().saturating_sub(4);
-    while start > limit && !is_leading_or_invalid_utf8_byte(bytes[start]) {
-        start -= 1;
-    }
-    match decode_utf8(&bytes[start..]) {
-        None => None,
-        Some(Ok(ch)) => Some(Ok(ch)),
-        Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])),
-    }
-}
-
-/// Given a UTF-8 leading byte, this returns the total number of code units
-/// in the following encoded codepoint.
-///
-/// If the given byte is not a valid UTF-8 leading byte, then this returns
-/// `None`.
-#[inline(always)]
-fn utf8_len(byte: u8) -> Option<usize> {
-    if byte <= 0x7F {
-        return Some(1);
-    } else if byte & 0b1100_0000 == 0b1000_0000 {
-        return None;
-    } else if byte <= 0b1101_1111 {
-        Some(2)
-    } else if byte <= 0b1110_1111 {
-        Some(3)
-    } else if byte <= 0b1111_0111 {
-        Some(4)
-    } else {
-        None
-    }
-}
-
-/// Returns true if and only if the given byte is either a valid leading UTF-8
-/// byte, or is otherwise an invalid byte that can never appear anywhere in a
-/// valid UTF-8 sequence.
-#[inline(always)]
-fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
-    // In the ASCII case, the most significant bit is never set. The leading
-    // byte of a 2/3/4-byte sequence always has the top two most significant
-    // bits set. For bytes that can never appear anywhere in valid UTF-8, this
-    // also returns true, since every such byte has its two most significant
-    // bits set:
-    //
-    //     \xC0 :: 11000000
-    //     \xC1 :: 11000001
-    //     \xF5 :: 11110101
-    //     \xF6 :: 11110110
-    //     \xF7 :: 11110111
-    //     \xF8 :: 11111000
-    //     \xF9 :: 11111001
-    //     \xFA :: 11111010
-    //     \xFB :: 11111011
-    //     \xFC :: 11111100
-    //     \xFD :: 11111101
-    //     \xFE :: 11111110
-    //     \xFF :: 11111111
-    (b & 0b1100_0000) != 0b1000_0000
-}
-
-#[cfg(feature = "alloc")]
-#[inline(always)]
-pub(crate) fn is_word_char_fwd(bytes: &[u8], mut at: usize) -> bool {
-    use core::{ptr, sync::atomic::AtomicPtr};
-
-    use crate::{
-        dfa::{
-            dense::{self, DFA},
-            Automaton,
-        },
-        util::lazy,
-    };
-
-    static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
-
-    let dfa = lazy::get_or_init(&WORD, || {
-        // TODO: Should we use a lazy DFA here instead? It does complicate
-        // things somewhat, since we then need a mutable cache, which probably
-        // means a thread local.
-        dense::Builder::new()
-            .configure(dense::Config::new().anchored(true))
-            .build(r"\w")
-            .unwrap()
-    });
-    // This is OK since '\w' contains no look-around.
-    let mut sid = dfa.universal_start_state();
-    while at < bytes.len() {
-        let byte = bytes[at];
-        sid = dfa.next_state(sid, byte);
-        at += 1;
-        if dfa.is_special_state(sid) {
-            if dfa.is_match_state(sid) {
-                return true;
-            } else if dfa.is_dead_state(sid) {
-                return false;
-            }
-        }
-    }
-    dfa.is_match_state(dfa.next_eoi_state(sid))
-}
-
-#[cfg(feature = "alloc")]
-#[inline(always)]
-pub(crate) fn is_word_char_rev(bytes: &[u8], mut at: usize) -> bool {
-    use core::{ptr, sync::atomic::AtomicPtr};
-
-    use crate::{
-        dfa::{
-            dense::{self, DFA},
-            Automaton,
-        },
-        nfa::thompson::NFA,
-    };
-
-    static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
-
-    let dfa = lazy::get_or_init(&WORD, || {
-        dense::Builder::new()
-            .configure(dense::Config::new().anchored(true))
-            .thompson(NFA::config().reverse(true).shrink(true))
-            .build(r"\w")
-            .unwrap()
-    });
-
-    // This is OK since '\w' contains no look-around.
-    let mut sid = dfa.universal_start_state();
-    while at > 0 {
-        at -= 1;
-        let byte = bytes[at];
-        sid = dfa.next_state(sid, byte);
-        if dfa.is_special_state(sid) {
-            if dfa.is_match_state(sid) {
-                return true;
-            } else if dfa.is_dead_state(sid) {
-                return false;
-            }
-        }
-    }
-    dfa.is_match_state(dfa.next_eoi_state(sid))
-}
+pub(crate) mod unicode_data;
+pub(crate) mod utf8;