summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata/src/util
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:18:32 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:18:32 +0000
commit4547b622d8d29df964fa2914213088b148c498fc (patch)
tree9fc6b25f3c3add6b745be9a2400a6e96140046e9 /vendor/regex-automata/src/util
parentReleasing progress-linux version 1.66.0+dfsg1-1~progress7.99u1. (diff)
downloadrustc-4547b622d8d29df964fa2914213088b148c498fc.tar.xz
rustc-4547b622d8d29df964fa2914213088b148c498fc.zip
Merging upstream version 1.67.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata/src/util')
-rw-r--r--vendor/regex-automata/src/util/alphabet.rs790
-rw-r--r--vendor/regex-automata/src/util/bytes.rs950
-rw-r--r--vendor/regex-automata/src/util/determinize/mod.rs493
-rw-r--r--vendor/regex-automata/src/util/determinize/state.rs873
-rw-r--r--vendor/regex-automata/src/util/id.rs608
-rw-r--r--vendor/regex-automata/src/util/lazy.rs31
-rw-r--r--vendor/regex-automata/src/util/matchtypes.rs356
-rw-r--r--vendor/regex-automata/src/util/mod.rs275
-rw-r--r--vendor/regex-automata/src/util/prefilter.rs281
-rw-r--r--vendor/regex-automata/src/util/sparse_set.rs229
-rw-r--r--vendor/regex-automata/src/util/start.rs109
-rw-r--r--vendor/regex-automata/src/util/syntax.rs272
12 files changed, 5267 insertions, 0 deletions
diff --git a/vendor/regex-automata/src/util/alphabet.rs b/vendor/regex-automata/src/util/alphabet.rs
new file mode 100644
index 000000000..0bc1ece58
--- /dev/null
+++ b/vendor/regex-automata/src/util/alphabet.rs
@@ -0,0 +1,790 @@
+use core::convert::TryFrom;
+
+use crate::util::{
+ bytes::{DeserializeError, SerializeError},
+ DebugByte,
+};
+
+/// Unit represents a single unit of input for DFA based regex engines.
+///
+/// **NOTE:** It is not expected for consumers of this crate to need to use
+/// this type unless they are implementing their own DFA. And even then, it's
+/// not required: implementors may use other techniques to handle input.
+///
+/// Typically, a single unit of input for a DFA would be a single byte.
+/// However, for the DFAs in this crate, matches are delayed by a single byte
+/// in order to handle look-ahead assertions (`\b`, `$` and `\z`). Thus, once
+/// we have consumed the haystack, we must run the DFA through one additional
+/// transition using an input that indicates the haystack has ended.
+///
+/// Since there is no way to represent a sentinel with a `u8` since all
+/// possible values *may* be valid inputs to a DFA, this type explicitly adds
+/// room for a sentinel value.
+///
+/// The sentinel EOI value is always its own equivalence class and is
+/// ultimately represented by adding 1 to the maximum equivalence class value.
+/// So for example, the regex `^[a-z]+$` might be split into the following
+/// equivalence classes:
+///
+/// ```text
+/// 0 => [\x00-`]
+/// 1 => [a-z]
+/// 2 => [{-\xFF]
+/// 3 => [EOI]
+/// ```
+///
+/// Where EOI is the special sentinel value that is always in its own
+/// singleton equivalence class.
+#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
+pub enum Unit {
+ U8(u8),
+ EOI(u16),
+}
+
+impl Unit {
+ /// Create a new input unit from a byte value.
+ ///
+ /// All possible byte values are legal. However, when creating an input
+ /// unit for a specific DFA, one should be careful to only construct input
+ /// units that are in that DFA's alphabet. Namely, one way to compact a
+ /// DFA's in-memory representation is to collapse its transitions to a set
+ /// of equivalence classes into a set of all possible byte values. If a
+ /// DFA uses equivalence classes instead of byte values, then the byte
+ /// given here should be the equivalence class.
+ pub fn u8(byte: u8) -> Unit {
+ Unit::U8(byte)
+ }
+
+ pub fn eoi(num_byte_equiv_classes: usize) -> Unit {
+ assert!(
+ num_byte_equiv_classes <= 256,
+ "max number of byte-based equivalent classes is 256, but got {}",
+ num_byte_equiv_classes,
+ );
+ Unit::EOI(u16::try_from(num_byte_equiv_classes).unwrap())
+ }
+
+ pub fn as_u8(self) -> Option<u8> {
+ match self {
+ Unit::U8(b) => Some(b),
+ Unit::EOI(_) => None,
+ }
+ }
+
+ #[cfg(feature = "alloc")]
+ pub fn as_eoi(self) -> Option<usize> {
+ match self {
+ Unit::U8(_) => None,
+ Unit::EOI(eoi) => Some(eoi as usize),
+ }
+ }
+
+ pub fn as_usize(self) -> usize {
+ match self {
+ Unit::U8(b) => b as usize,
+ Unit::EOI(eoi) => eoi as usize,
+ }
+ }
+
+ pub fn is_eoi(&self) -> bool {
+ match *self {
+ Unit::EOI(_) => true,
+ _ => false,
+ }
+ }
+
+ #[cfg(feature = "alloc")]
+ pub fn is_word_byte(&self) -> bool {
+ self.as_u8().map_or(false, crate::util::is_word_byte)
+ }
+}
+
+impl core::fmt::Debug for Unit {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ match *self {
+ Unit::U8(b) => write!(f, "{:?}", DebugByte(b)),
+ Unit::EOI(_) => write!(f, "EOI"),
+ }
+ }
+}
+
+/// A representation of byte oriented equivalence classes.
+///
+/// This is used in a DFA to reduce the size of the transition table. This can
+/// have a particularly large impact not only on the total size of a dense DFA,
+/// but also on compile times.
+#[derive(Clone, Copy)]
+pub struct ByteClasses([u8; 256]);
+
+impl ByteClasses {
+ /// Creates a new set of equivalence classes where all bytes are mapped to
+ /// the same class.
+ pub fn empty() -> ByteClasses {
+ ByteClasses([0; 256])
+ }
+
+ /// Creates a new set of equivalence classes where each byte belongs to
+ /// its own equivalence class.
+ #[cfg(feature = "alloc")]
+ pub fn singletons() -> ByteClasses {
+ let mut classes = ByteClasses::empty();
+ for i in 0..256 {
+ classes.set(i as u8, i as u8);
+ }
+ classes
+ }
+
+ /// Deserializes a byte class map from the given slice. If the slice is of
+ /// insufficient length or otherwise contains an impossible mapping, then
+ /// an error is returned. Upon success, the number of bytes read along with
+ /// the map are returned. The number of bytes read is always a multiple of
+ /// 8.
+ pub fn from_bytes(
+ slice: &[u8],
+ ) -> Result<(ByteClasses, usize), DeserializeError> {
+ if slice.len() < 256 {
+ return Err(DeserializeError::buffer_too_small("byte class map"));
+ }
+ let mut classes = ByteClasses::empty();
+ for (b, &class) in slice[..256].iter().enumerate() {
+ classes.set(b as u8, class);
+ }
+ for b in classes.iter() {
+ if b.as_usize() >= classes.alphabet_len() {
+ return Err(DeserializeError::generic(
+ "found equivalence class greater than alphabet len",
+ ));
+ }
+ }
+ Ok((classes, 256))
+ }
+
+ /// Writes this byte class map to the given byte buffer. if the given
+ /// buffer is too small, then an error is returned. Upon success, the total
+ /// number of bytes written is returned. The number of bytes written is
+ /// guaranteed to be a multiple of 8.
+ pub fn write_to(
+ &self,
+ mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("byte class map"));
+ }
+ for b in 0..=255 {
+ dst[0] = self.get(b);
+ dst = &mut dst[1..];
+ }
+ Ok(nwrite)
+ }
+
+ /// Returns the total number of bytes written by `write_to`.
+ pub fn write_to_len(&self) -> usize {
+ 256
+ }
+
+ /// Set the equivalence class for the given byte.
+ #[inline]
+ pub fn set(&mut self, byte: u8, class: u8) {
+ self.0[byte as usize] = class;
+ }
+
+ /// Get the equivalence class for the given byte.
+ #[inline]
+ pub fn get(&self, byte: u8) -> u8 {
+ self.0[byte as usize]
+ }
+
+ /// Get the equivalence class for the given byte while forcefully
+ /// eliding bounds checks.
+ #[inline]
+ pub unsafe fn get_unchecked(&self, byte: u8) -> u8 {
+ *self.0.get_unchecked(byte as usize)
+ }
+
+ /// Get the equivalence class for the given input unit and return the
+ /// class as a `usize`.
+ #[inline]
+ pub fn get_by_unit(&self, unit: Unit) -> usize {
+ match unit {
+ Unit::U8(b) => usize::try_from(self.get(b)).unwrap(),
+ Unit::EOI(b) => usize::try_from(b).unwrap(),
+ }
+ }
+
+ #[inline]
+ pub fn eoi(&self) -> Unit {
+ Unit::eoi(self.alphabet_len().checked_sub(1).unwrap())
+ }
+
+ /// Return the total number of elements in the alphabet represented by
+ /// these equivalence classes. Equivalently, this returns the total number
+ /// of equivalence classes.
+ #[inline]
+ pub fn alphabet_len(&self) -> usize {
+ // Add one since the number of equivalence classes is one bigger than
+ // the last one. But add another to account for the final EOI class
+ // that isn't explicitly represented.
+ self.0[255] as usize + 1 + 1
+ }
+
+ /// Returns the stride, as a base-2 exponent, required for these
+ /// equivalence classes.
+ ///
+ /// The stride is always the smallest power of 2 that is greater than or
+ /// equal to the alphabet length. This is done so that converting between
+ /// state IDs and indices can be done with shifts alone, which is much
+ /// faster than integer division.
+ #[cfg(feature = "alloc")]
+ pub fn stride2(&self) -> usize {
+ self.alphabet_len().next_power_of_two().trailing_zeros() as usize
+ }
+
+ /// Returns true if and only if every byte in this class maps to its own
+ /// equivalence class. Equivalently, there are 257 equivalence classes
+ /// and each class contains exactly one byte (plus the special EOI class).
+ #[inline]
+ pub fn is_singleton(&self) -> bool {
+ self.alphabet_len() == 257
+ }
+
+ /// Returns an iterator over all equivalence classes in this set.
+ pub fn iter(&self) -> ByteClassIter<'_> {
+ ByteClassIter { classes: self, i: 0 }
+ }
+
+ /// Returns an iterator over a sequence of representative bytes from each
+ /// equivalence class. Namely, this yields exactly N items, where N is
+ /// equivalent to the number of equivalence classes. Each item is an
+ /// arbitrary byte drawn from each equivalence class.
+ ///
+ /// This is useful when one is determinizing an NFA and the NFA's alphabet
+ /// hasn't been converted to equivalence classes yet. Picking an arbitrary
+ /// byte from each equivalence class then permits a full exploration of
+ /// the NFA instead of using every possible byte value.
+ #[cfg(feature = "alloc")]
+ pub fn representatives(&self) -> ByteClassRepresentatives<'_> {
+ ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
+ }
+
+ /// Returns an iterator of the bytes in the given equivalence class.
+ pub fn elements(&self, class: Unit) -> ByteClassElements {
+ ByteClassElements { classes: self, class, byte: 0 }
+ }
+
+ /// Returns an iterator of byte ranges in the given equivalence class.
+ ///
+ /// That is, a sequence of contiguous ranges are returned. Typically, every
+ /// class maps to a single contiguous range.
+ fn element_ranges(&self, class: Unit) -> ByteClassElementRanges {
+ ByteClassElementRanges { elements: self.elements(class), range: None }
+ }
+}
+
+impl core::fmt::Debug for ByteClasses {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ if self.is_singleton() {
+ write!(f, "ByteClasses({{singletons}})")
+ } else {
+ write!(f, "ByteClasses(")?;
+ for (i, class) in self.iter().enumerate() {
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ write!(f, "{:?} => [", class.as_usize())?;
+ for (start, end) in self.element_ranges(class) {
+ if start == end {
+ write!(f, "{:?}", start)?;
+ } else {
+ write!(f, "{:?}-{:?}", start, end)?;
+ }
+ }
+ write!(f, "]")?;
+ }
+ write!(f, ")")
+ }
+ }
+}
+
+/// An iterator over each equivalence class.
+#[derive(Debug)]
+pub struct ByteClassIter<'a> {
+ classes: &'a ByteClasses,
+ i: usize,
+}
+
+impl<'a> Iterator for ByteClassIter<'a> {
+ type Item = Unit;
+
+ fn next(&mut self) -> Option<Unit> {
+ if self.i + 1 == self.classes.alphabet_len() {
+ self.i += 1;
+ Some(self.classes.eoi())
+ } else if self.i < self.classes.alphabet_len() {
+ let class = self.i as u8;
+ self.i += 1;
+ Some(Unit::u8(class))
+ } else {
+ None
+ }
+ }
+}
+
+/// An iterator over representative bytes from each equivalence class.
+#[cfg(feature = "alloc")]
+#[derive(Debug)]
+pub struct ByteClassRepresentatives<'a> {
+ classes: &'a ByteClasses,
+ byte: usize,
+ last_class: Option<u8>,
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> Iterator for ByteClassRepresentatives<'a> {
+ type Item = Unit;
+
+ fn next(&mut self) -> Option<Unit> {
+ while self.byte < 256 {
+ let byte = self.byte as u8;
+ let class = self.classes.get(byte);
+ self.byte += 1;
+
+ if self.last_class != Some(class) {
+ self.last_class = Some(class);
+ return Some(Unit::u8(byte));
+ }
+ }
+ if self.byte == 256 {
+ self.byte += 1;
+ return Some(self.classes.eoi());
+ }
+ None
+ }
+}
+
+/// An iterator over all elements in an equivalence class.
+#[derive(Debug)]
+pub struct ByteClassElements<'a> {
+ classes: &'a ByteClasses,
+ class: Unit,
+ byte: usize,
+}
+
+impl<'a> Iterator for ByteClassElements<'a> {
+ type Item = Unit;
+
+ fn next(&mut self) -> Option<Unit> {
+ while self.byte < 256 {
+ let byte = self.byte as u8;
+ self.byte += 1;
+ if self.class.as_u8() == Some(self.classes.get(byte)) {
+ return Some(Unit::u8(byte));
+ }
+ }
+ if self.byte < 257 {
+ self.byte += 1;
+ if self.class.is_eoi() {
+ return Some(Unit::eoi(256));
+ }
+ }
+ None
+ }
+}
+
+/// An iterator over all elements in an equivalence class expressed as a
+/// sequence of contiguous ranges.
+#[derive(Debug)]
+pub struct ByteClassElementRanges<'a> {
+ elements: ByteClassElements<'a>,
+ range: Option<(Unit, Unit)>,
+}
+
+impl<'a> Iterator for ByteClassElementRanges<'a> {
+ type Item = (Unit, Unit);
+
+ fn next(&mut self) -> Option<(Unit, Unit)> {
+ loop {
+ let element = match self.elements.next() {
+ None => return self.range.take(),
+ Some(element) => element,
+ };
+ match self.range.take() {
+ None => {
+ self.range = Some((element, element));
+ }
+ Some((start, end)) => {
+ if end.as_usize() + 1 != element.as_usize()
+ || element.is_eoi()
+ {
+ self.range = Some((element, element));
+ return Some((start, end));
+ }
+ self.range = Some((start, element));
+ }
+ }
+ }
+ }
+}
+
+/// A byte class set keeps track of an *approximation* of equivalence classes
+/// of bytes during NFA construction. That is, every byte in an equivalence
+/// class cannot discriminate between a match and a non-match.
+///
+/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the
+/// same equivalence class because it never matters whether an `a` or a `b` is
+/// seen, and no combination of `a`s and `b`s in the text can discriminate a
+/// match.
+///
+/// Note though that this does not compute the minimal set of equivalence
+/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the
+/// same equivalence class for the same reason that `a` and `b` are in the
+/// same equivalence class in the aforementioned regex. However, in this
+/// implementation, `a` and `c` are put into distinct equivalence classes. The
+/// reason for this is implementation complexity. In the future, we should
+/// endeavor to compute the minimal equivalence classes since they can have a
+/// rather large impact on the size of the DFA. (Doing this will likely require
+/// rethinking how equivalence classes are computed, including changing the
+/// representation here, which is only able to group contiguous bytes into the
+/// same equivalence class.)
+#[derive(Clone, Debug)]
+pub struct ByteClassSet(ByteSet);
+
+impl ByteClassSet {
+ /// Create a new set of byte classes where all bytes are part of the same
+ /// equivalence class.
+ #[cfg(feature = "alloc")]
+ pub fn empty() -> Self {
+ ByteClassSet(ByteSet::empty())
+ }
+
+ /// Indicate the the range of byte given (inclusive) can discriminate a
+ /// match between it and all other bytes outside of the range.
+ #[cfg(feature = "alloc")]
+ pub fn set_range(&mut self, start: u8, end: u8) {
+ debug_assert!(start <= end);
+ if start > 0 {
+ self.0.add(start - 1);
+ }
+ self.0.add(end);
+ }
+
+ /// Add the contiguous ranges in the set given to this byte class set.
+ #[cfg(feature = "alloc")]
+ pub fn add_set(&mut self, set: &ByteSet) {
+ for (start, end) in set.iter_ranges() {
+ self.set_range(start, end);
+ }
+ }
+
+ /// Convert this boolean set to a map that maps all byte values to their
+ /// corresponding equivalence class. The last mapping indicates the largest
+ /// equivalence class identifier (which is never bigger than 255).
+ #[cfg(feature = "alloc")]
+ pub fn byte_classes(&self) -> ByteClasses {
+ let mut classes = ByteClasses::empty();
+ let mut class = 0u8;
+ let mut b = 0u8;
+ loop {
+ classes.set(b, class);
+ if b == 255 {
+ break;
+ }
+ if self.0.contains(b) {
+ class = class.checked_add(1).unwrap();
+ }
+ b = b.checked_add(1).unwrap();
+ }
+ classes
+ }
+}
+
+/// A simple set of bytes that is reasonably cheap to copy and allocation free.
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
+pub struct ByteSet {
+ bits: BitSet,
+}
+
+/// The representation of a byte set. Split out so that we can define a
+/// convenient Debug impl for it while keeping "ByteSet" in the output.
+#[derive(Clone, Copy, Default, Eq, PartialEq)]
+struct BitSet([u128; 2]);
+
+impl ByteSet {
+ /// Create an empty set of bytes.
+ #[cfg(feature = "alloc")]
+ pub fn empty() -> ByteSet {
+ ByteSet { bits: BitSet([0; 2]) }
+ }
+
+ /// Add a byte to this set.
+ ///
+ /// If the given byte already belongs to this set, then this is a no-op.
+ #[cfg(feature = "alloc")]
+ pub fn add(&mut self, byte: u8) {
+ let bucket = byte / 128;
+ let bit = byte % 128;
+ self.bits.0[bucket as usize] |= 1 << bit;
+ }
+
+ /// Add an inclusive range of bytes.
+ #[cfg(feature = "alloc")]
+ pub fn add_all(&mut self, start: u8, end: u8) {
+ for b in start..=end {
+ self.add(b);
+ }
+ }
+
+ /// Remove a byte from this set.
+ ///
+ /// If the given byte is not in this set, then this is a no-op.
+ #[cfg(feature = "alloc")]
+ pub fn remove(&mut self, byte: u8) {
+ let bucket = byte / 128;
+ let bit = byte % 128;
+ self.bits.0[bucket as usize] &= !(1 << bit);
+ }
+
+ /// Remove an inclusive range of bytes.
+ #[cfg(feature = "alloc")]
+ pub fn remove_all(&mut self, start: u8, end: u8) {
+ for b in start..=end {
+ self.remove(b);
+ }
+ }
+
+ /// Return true if and only if the given byte is in this set.
+ pub fn contains(&self, byte: u8) -> bool {
+ let bucket = byte / 128;
+ let bit = byte % 128;
+ self.bits.0[bucket as usize] & (1 << bit) > 0
+ }
+
+ /// Return true if and only if the given inclusive range of bytes is in
+ /// this set.
+ #[cfg(feature = "alloc")]
+ pub fn contains_range(&self, start: u8, end: u8) -> bool {
+ (start..=end).all(|b| self.contains(b))
+ }
+
+ /// Returns an iterator over all bytes in this set.
+ #[cfg(feature = "alloc")]
+ pub fn iter(&self) -> ByteSetIter {
+ ByteSetIter { set: self, b: 0 }
+ }
+
+ /// Returns an iterator over all contiguous ranges of bytes in this set.
+ #[cfg(feature = "alloc")]
+ pub fn iter_ranges(&self) -> ByteSetRangeIter {
+ ByteSetRangeIter { set: self, b: 0 }
+ }
+
+ /// Return the number of bytes in this set.
+ #[cfg(feature = "alloc")]
+ pub fn len(&self) -> usize {
+ (self.bits.0[0].count_ones() + self.bits.0[1].count_ones()) as usize
+ }
+
+ /// Return true if and only if this set is empty.
+ #[cfg(feature = "alloc")]
+ pub fn is_empty(&self) -> bool {
+ self.bits.0 == [0, 0]
+ }
+}
+
+impl core::fmt::Debug for BitSet {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut fmtd = f.debug_set();
+ for b in (0..256).map(|b| b as u8) {
+ if (ByteSet { bits: *self }).contains(b) {
+ fmtd.entry(&b);
+ }
+ }
+ fmtd.finish()
+ }
+}
+
+#[derive(Debug)]
+pub struct ByteSetIter<'a> {
+ set: &'a ByteSet,
+ b: usize,
+}
+
+impl<'a> Iterator for ByteSetIter<'a> {
+ type Item = u8;
+
+ fn next(&mut self) -> Option<u8> {
+ while self.b <= 255 {
+ let b = self.b as u8;
+ self.b += 1;
+ if self.set.contains(b) {
+ return Some(b);
+ }
+ }
+ None
+ }
+}
+
+#[derive(Debug)]
+pub struct ByteSetRangeIter<'a> {
+ set: &'a ByteSet,
+ b: usize,
+}
+
+impl<'a> Iterator for ByteSetRangeIter<'a> {
+ type Item = (u8, u8);
+
+ fn next(&mut self) -> Option<(u8, u8)> {
+ while self.b <= 255 {
+ let start = self.b as u8;
+ self.b += 1;
+ if !self.set.contains(start) {
+ continue;
+ }
+
+ let mut end = start;
+ while self.b <= 255 && self.set.contains(self.b as u8) {
+ end = self.b as u8;
+ self.b += 1;
+ }
+ return Some((start, end));
+ }
+ None
+ }
+}
+
+#[cfg(test)]
+#[cfg(feature = "alloc")]
+mod tests {
+ use alloc::{vec, vec::Vec};
+
+ use super::*;
+
+ #[test]
+ fn byte_classes() {
+ let mut set = ByteClassSet::empty();
+ set.set_range(b'a', b'z');
+
+ let classes = set.byte_classes();
+ assert_eq!(classes.get(0), 0);
+ assert_eq!(classes.get(1), 0);
+ assert_eq!(classes.get(2), 0);
+ assert_eq!(classes.get(b'a' - 1), 0);
+ assert_eq!(classes.get(b'a'), 1);
+ assert_eq!(classes.get(b'm'), 1);
+ assert_eq!(classes.get(b'z'), 1);
+ assert_eq!(classes.get(b'z' + 1), 2);
+ assert_eq!(classes.get(254), 2);
+ assert_eq!(classes.get(255), 2);
+
+ let mut set = ByteClassSet::empty();
+ set.set_range(0, 2);
+ set.set_range(4, 6);
+ let classes = set.byte_classes();
+ assert_eq!(classes.get(0), 0);
+ assert_eq!(classes.get(1), 0);
+ assert_eq!(classes.get(2), 0);
+ assert_eq!(classes.get(3), 1);
+ assert_eq!(classes.get(4), 2);
+ assert_eq!(classes.get(5), 2);
+ assert_eq!(classes.get(6), 2);
+ assert_eq!(classes.get(7), 3);
+ assert_eq!(classes.get(255), 3);
+ }
+
+ #[test]
+ fn full_byte_classes() {
+ let mut set = ByteClassSet::empty();
+ for i in 0..256u16 {
+ set.set_range(i as u8, i as u8);
+ }
+ assert_eq!(set.byte_classes().alphabet_len(), 257);
+ }
+
+ #[test]
+ fn elements_typical() {
+ let mut set = ByteClassSet::empty();
+ set.set_range(b'b', b'd');
+ set.set_range(b'g', b'm');
+ set.set_range(b'z', b'z');
+ let classes = set.byte_classes();
+ // class 0: \x00-a
+ // class 1: b-d
+ // class 2: e-f
+ // class 3: g-m
+ // class 4: n-y
+ // class 5: z-z
+ // class 6: \x7B-\xFF
+ // class 7: EOI
+ assert_eq!(classes.alphabet_len(), 8);
+
+ let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>();
+ assert_eq!(elements.len(), 98);
+ assert_eq!(elements[0], Unit::u8(b'\x00'));
+ assert_eq!(elements[97], Unit::u8(b'a'));
+
+ let elements = classes.elements(Unit::u8(1)).collect::<Vec<_>>();
+ assert_eq!(
+ elements,
+ vec![Unit::u8(b'b'), Unit::u8(b'c'), Unit::u8(b'd')],
+ );
+
+ let elements = classes.elements(Unit::u8(2)).collect::<Vec<_>>();
+ assert_eq!(elements, vec![Unit::u8(b'e'), Unit::u8(b'f')],);
+
+ let elements = classes.elements(Unit::u8(3)).collect::<Vec<_>>();
+ assert_eq!(
+ elements,
+ vec![
+ Unit::u8(b'g'),
+ Unit::u8(b'h'),
+ Unit::u8(b'i'),
+ Unit::u8(b'j'),
+ Unit::u8(b'k'),
+ Unit::u8(b'l'),
+ Unit::u8(b'm'),
+ ],
+ );
+
+ let elements = classes.elements(Unit::u8(4)).collect::<Vec<_>>();
+ assert_eq!(elements.len(), 12);
+ assert_eq!(elements[0], Unit::u8(b'n'));
+ assert_eq!(elements[11], Unit::u8(b'y'));
+
+ let elements = classes.elements(Unit::u8(5)).collect::<Vec<_>>();
+ assert_eq!(elements, vec![Unit::u8(b'z')]);
+
+ let elements = classes.elements(Unit::u8(6)).collect::<Vec<_>>();
+ assert_eq!(elements.len(), 133);
+ assert_eq!(elements[0], Unit::u8(b'\x7B'));
+ assert_eq!(elements[132], Unit::u8(b'\xFF'));
+
+ let elements = classes.elements(Unit::eoi(7)).collect::<Vec<_>>();
+ assert_eq!(elements, vec![Unit::eoi(256)]);
+ }
+
+ #[test]
+ fn elements_singletons() {
+ let classes = ByteClasses::singletons();
+ assert_eq!(classes.alphabet_len(), 257);
+
+ let elements = classes.elements(Unit::u8(b'a')).collect::<Vec<_>>();
+ assert_eq!(elements, vec![Unit::u8(b'a')]);
+
+ let elements = classes.elements(Unit::eoi(5)).collect::<Vec<_>>();
+ assert_eq!(elements, vec![Unit::eoi(256)]);
+ }
+
+ #[test]
+ fn elements_empty() {
+ let classes = ByteClasses::empty();
+ assert_eq!(classes.alphabet_len(), 2);
+
+ let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>();
+ assert_eq!(elements.len(), 256);
+ assert_eq!(elements[0], Unit::u8(b'\x00'));
+ assert_eq!(elements[255], Unit::u8(b'\xFF'));
+
+ let elements = classes.elements(Unit::eoi(1)).collect::<Vec<_>>();
+ assert_eq!(elements, vec![Unit::eoi(256)]);
+ }
+}
diff --git a/vendor/regex-automata/src/util/bytes.rs b/vendor/regex-automata/src/util/bytes.rs
new file mode 100644
index 000000000..5877bb149
--- /dev/null
+++ b/vendor/regex-automata/src/util/bytes.rs
@@ -0,0 +1,950 @@
+/*
+A collection of helper functions, types and traits for serializing automata.
+
+This crate defines its own bespoke serialization mechanism for some structures
+provided in the public API, namely, DFAs. A bespoke mechanism was developed
+primarily because structures like automata demand a specific binary format.
+Attempting to encode their rich structure in an existing serialization
+format is just not feasible. Moreover, the format for each structure is
+generally designed such that deserialization is cheap. More specifically, that
+deserialization can be done in constant time. (The idea being that you can
+embed it into your binary or mmap it, and then use it immediately.)
+
+In order to achieve this, most of the structures in this crate use an in-memory
+representation that very closely corresponds to its binary serialized form.
+This pervades and complicates everything, and in some cases, requires dealing
+with alignment and reasoning about safety.
+
+This technique does have major advantages. In particular, it permits doing
+the potentially costly work of compiling a finite state machine in an offline
+manner, and then loading it at runtime not only without having to re-compile
+the regex, but even without the code required to do the compilation. This, for
+example, permits one to use a pre-compiled DFA not only in environments without
+Rust's standard library, but also in environments without a heap.
+
+In the code below, whenever we insert some kind of padding, it's to enforce a
+4-byte alignment, unless otherwise noted. Namely, u32 is the only state ID type
+supported. (In a previous version of this library, DFAs were generic over the
+state ID representation.)
+
+Also, serialization generally requires the caller to specify endianness,
+where as deserialization always assumes native endianness (otherwise cheap
+deserialization would be impossible). This implies that serializing a structure
+generally requires serializing both its big-endian and little-endian variants,
+and then loading the correct one based on the target's endianness.
+*/
+
+use core::{
+ cmp,
+ convert::{TryFrom, TryInto},
+ mem::size_of,
+};
+
+#[cfg(feature = "alloc")]
+use alloc::{vec, vec::Vec};
+
+use crate::util::id::{PatternID, PatternIDError, StateID, StateIDError};
+
+/// An error that occurs when serializing an object from this crate.
+///
+/// Serialization, as used in this crate, universally refers to the process
+/// of transforming a structure (like a DFA) into a custom binary format
+/// represented by `&[u8]`. To this end, serialization is generally infallible.
+/// However, it can fail when caller provided buffer sizes are too small. When
+/// that occurs, a serialization error is reported.
+///
+/// A `SerializeError` provides no introspection capabilities. Its only
+/// supported operation is conversion to a human readable error message.
+///
+/// This error type implements the `std::error::Error` trait only when the
+/// `std` feature is enabled. Otherwise, this type is defined in all
+/// configurations.
+#[derive(Debug)]
+pub struct SerializeError {
+ /// The name of the thing that a buffer is too small for.
+ ///
+ /// Currently, the only kind of serialization error is one that is
+ /// committed by a caller: providing a destination buffer that is too
+ /// small to fit the serialized object. This makes sense conceptually,
+ /// since every valid inhabitant of a type should be serializable.
+ ///
+ /// This is somewhat exposed in the public API of this crate. For example,
+ /// the `to_bytes_{big,little}_endian` APIs return a `Vec<u8>` and are
+ /// guaranteed to never panic or error. This is only possible because the
+ /// implementation guarantees that it will allocate a `Vec<u8>` that is
+ /// big enough.
+ ///
+ /// In summary, if a new serialization error kind needs to be added, then
+ /// it will need careful consideration.
+ what: &'static str,
+}
+
+impl SerializeError {
+ pub(crate) fn buffer_too_small(what: &'static str) -> SerializeError {
+ SerializeError { what }
+ }
+}
+
+impl core::fmt::Display for SerializeError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "destination buffer is too small to write {}", self.what)
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for SerializeError {}
+
+/// An error that occurs when deserializing an object defined in this crate.
+///
+/// Serialization, as used in this crate, universally refers to the process
+/// of transforming a structure (like a DFA) into a custom binary format
+/// represented by `&[u8]`. Deserialization, then, refers to the process of
+/// cheaply converting this binary format back to the object's in-memory
+/// representation as defined in this crate. To the extent possible,
+/// deserialization will report this error whenever this process fails.
+///
+/// A `DeserializeError` provides no introspection capabilities. Its only
+/// supported operation is conversion to a human readable error message.
+///
+/// This error type implements the `std::error::Error` trait only when the
+/// `std` feature is enabled. Otherwise, this type is defined in all
+/// configurations.
+#[derive(Debug)]
+pub struct DeserializeError(DeserializeErrorKind);
+
+#[derive(Debug)]
+enum DeserializeErrorKind {
+ Generic { msg: &'static str },
+ BufferTooSmall { what: &'static str },
+ InvalidUsize { what: &'static str },
+ InvalidVarint { what: &'static str },
+ VersionMismatch { expected: u32, found: u32 },
+ EndianMismatch { expected: u32, found: u32 },
+ AlignmentMismatch { alignment: usize, address: usize },
+ LabelMismatch { expected: &'static str },
+ ArithmeticOverflow { what: &'static str },
+ PatternID { err: PatternIDError, what: &'static str },
+ StateID { err: StateIDError, what: &'static str },
+}
+
+impl DeserializeError {
+ pub(crate) fn generic(msg: &'static str) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::Generic { msg })
+ }
+
+ pub(crate) fn buffer_too_small(what: &'static str) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::BufferTooSmall { what })
+ }
+
+ pub(crate) fn invalid_usize(what: &'static str) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::InvalidUsize { what })
+ }
+
+ fn invalid_varint(what: &'static str) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::InvalidVarint { what })
+ }
+
+ fn version_mismatch(expected: u32, found: u32) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::VersionMismatch {
+ expected,
+ found,
+ })
+ }
+
+ fn endian_mismatch(expected: u32, found: u32) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::EndianMismatch {
+ expected,
+ found,
+ })
+ }
+
+ fn alignment_mismatch(
+ alignment: usize,
+ address: usize,
+ ) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::AlignmentMismatch {
+ alignment,
+ address,
+ })
+ }
+
+ fn label_mismatch(expected: &'static str) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::LabelMismatch { expected })
+ }
+
+ fn arithmetic_overflow(what: &'static str) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::ArithmeticOverflow { what })
+ }
+
+ pub(crate) fn pattern_id_error(
+ err: PatternIDError,
+ what: &'static str,
+ ) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::PatternID { err, what })
+ }
+
+ pub(crate) fn state_id_error(
+ err: StateIDError,
+ what: &'static str,
+ ) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::StateID { err, what })
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for DeserializeError {}
+
+impl core::fmt::Display for DeserializeError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ use self::DeserializeErrorKind::*;
+
+ match self.0 {
+ Generic { msg } => write!(f, "{}", msg),
+ BufferTooSmall { what } => {
+ write!(f, "buffer is too small to read {}", what)
+ }
+ InvalidUsize { what } => {
+ write!(f, "{} is too big to fit in a usize", what)
+ }
+ InvalidVarint { what } => {
+ write!(f, "could not decode valid varint for {}", what)
+ }
+ VersionMismatch { expected, found } => write!(
+ f,
+ "unsupported version: \
+ expected version {} but found version {}",
+ expected, found,
+ ),
+ EndianMismatch { expected, found } => write!(
+ f,
+ "endianness mismatch: expected 0x{:X} but got 0x{:X}. \
+ (Are you trying to load an object serialized with a \
+ different endianness?)",
+ expected, found,
+ ),
+ AlignmentMismatch { alignment, address } => write!(
+ f,
+ "alignment mismatch: slice starts at address \
+ 0x{:X}, which is not aligned to a {} byte boundary",
+ address, alignment,
+ ),
+ LabelMismatch { expected } => write!(
+ f,
+ "label mismatch: start of serialized object should \
+ contain a NUL terminated {:?} label, but a different \
+ label was found",
+ expected,
+ ),
+ ArithmeticOverflow { what } => {
+ write!(f, "arithmetic overflow for {}", what)
+ }
+ PatternID { ref err, what } => {
+ write!(f, "failed to read pattern ID for {}: {}", what, err)
+ }
+ StateID { ref err, what } => {
+ write!(f, "failed to read state ID for {}: {}", what, err)
+ }
+ }
+ }
+}
+
+/// Checks that the given slice has an alignment that matches `T`.
+///
+/// This is useful for checking that a slice has an appropriate alignment
+/// before casting it to a &[T]. Note though that alignment is not itself
+/// sufficient to perform the cast for any `T`.
+pub fn check_alignment<T>(slice: &[u8]) -> Result<(), DeserializeError> {
+ let alignment = core::mem::align_of::<T>();
+ let address = slice.as_ptr() as usize;
+ if address % alignment == 0 {
+ return Ok(());
+ }
+ Err(DeserializeError::alignment_mismatch(alignment, address))
+}
+
+/// Reads a possibly empty amount of padding, up to 7 bytes, from the beginning
+/// of the given slice. All padding bytes must be NUL bytes.
+///
+/// This is useful because it can be theoretically necessary to pad the
+/// beginning of a serialized object with NUL bytes to ensure that it starts
+/// at a correctly aligned address. These padding bytes should come immediately
+/// before the label.
+///
+/// This returns the number of bytes read from the given slice.
+pub fn skip_initial_padding(slice: &[u8]) -> usize {
+ let mut nread = 0;
+ while nread < 7 && nread < slice.len() && slice[nread] == 0 {
+ nread += 1;
+ }
+ nread
+}
+
+/// Allocate a byte buffer of the given size, along with some initial padding
+/// such that `buf[padding..]` has the same alignment as `T`, where the
+/// alignment of `T` must be at most `8`. In particular, callers should treat
+/// the first N bytes (second return value) as padding bytes that must not be
+/// overwritten. In all cases, the following identity holds:
+///
+/// ```ignore
+/// let (buf, padding) = alloc_aligned_buffer::<StateID>(SIZE);
+/// assert_eq!(SIZE, buf[padding..].len());
+/// ```
+///
+/// In practice, padding is often zero.
+///
+/// The requirement for `8` as a maximum here is somewhat arbitrary. In
+/// practice, we never need anything bigger in this crate, and so this function
+/// does some sanity asserts under the assumption of a max alignment of `8`.
+#[cfg(feature = "alloc")]
+pub fn alloc_aligned_buffer<T>(size: usize) -> (Vec<u8>, usize) {
+ // FIXME: This is a kludge because there's no easy way to allocate a
+ // Vec<u8> with an alignment guaranteed to be greater than 1. We could
+ // create a Vec<u32>, but this cannot be safely transmuted to a Vec<u8>
+ // without concern, since reallocing or dropping the Vec<u8> is UB
+ // (different alignment than the initial allocation). We could define a
+ // wrapper type to manage this for us, but it seems like more machinery
+ // than it's worth.
+ let mut buf = vec![0; size];
+ let align = core::mem::align_of::<T>();
+ let address = buf.as_ptr() as usize;
+ if address % align == 0 {
+ return (buf, 0);
+ }
+ // It's not quite clear how to robustly test this code, since the allocator
+ // in my environment appears to always return addresses aligned to at
+ // least 8 bytes, even when the alignment requirement is smaller. A feeble
+ // attempt at ensuring correctness is provided with asserts.
+ let padding = ((address & !0b111).checked_add(8).unwrap())
+ .checked_sub(address)
+ .unwrap();
+ assert!(padding <= 7, "padding of {} is bigger than 7", padding);
+ buf.extend(core::iter::repeat(0).take(padding));
+ assert_eq!(size + padding, buf.len());
+ assert_eq!(
+ 0,
+ buf[padding..].as_ptr() as usize % align,
+ "expected end of initial padding to be aligned to {}",
+ align,
+ );
+ (buf, padding)
+}
+
+/// Reads a NUL terminated label starting at the beginning of the given slice.
+///
+/// If a NUL terminated label could not be found, then an error is returned.
+/// Similary, if a label is found but doesn't match the expected label, then
+/// an error is returned.
+///
+/// Upon success, the total number of bytes read (including padding bytes) is
+/// returned.
+pub fn read_label(
+ slice: &[u8],
+ expected_label: &'static str,
+) -> Result<usize, DeserializeError> {
+ // Set an upper bound on how many bytes we scan for a NUL. Since no label
+ // in this crate is longer than 256 bytes, if we can't find one within that
+ // range, then we have corrupted data.
+ let first_nul =
+ slice[..cmp::min(slice.len(), 256)].iter().position(|&b| b == 0);
+ let first_nul = match first_nul {
+ Some(first_nul) => first_nul,
+ None => {
+ return Err(DeserializeError::generic(
+ "could not find NUL terminated label \
+ at start of serialized object",
+ ));
+ }
+ };
+ let len = first_nul + padding_len(first_nul);
+ if slice.len() < len {
+ return Err(DeserializeError::generic(
+ "could not find properly sized label at start of serialized object"
+ ));
+ }
+ if expected_label.as_bytes() != &slice[..first_nul] {
+ return Err(DeserializeError::label_mismatch(expected_label));
+ }
+ Ok(len)
+}
+
+/// Writes the given label to the buffer as a NUL terminated string. The label
+/// given must not contain NUL, otherwise this will panic. Similarly, the label
+/// must not be longer than 255 bytes, otherwise this will panic.
+///
+/// Additional NUL bytes are written as necessary to ensure that the number of
+/// bytes written is always a multiple of 4.
+///
+/// Upon success, the total number of bytes written (including padding) is
+/// returned.
+pub fn write_label(
+ label: &str,
+ dst: &mut [u8],
+) -> Result<usize, SerializeError> {
+ let nwrite = write_label_len(label);
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("label"));
+ }
+ dst[..label.len()].copy_from_slice(label.as_bytes());
+ for i in 0..(nwrite - label.len()) {
+ dst[label.len() + i] = 0;
+ }
+ assert_eq!(nwrite % 4, 0);
+ Ok(nwrite)
+}
+
+/// Returns the total number of bytes (including padding) that would be written
+/// for the given label. This panics if the given label contains a NUL byte or
+/// is longer than 255 bytes. (The size restriction exists so that searching
+/// for a label during deserialization can be done in small bounded space.)
+pub fn write_label_len(label: &str) -> usize {
+ if label.len() > 255 {
+ panic!("label must not be longer than 255 bytes");
+ }
+ if label.as_bytes().iter().position(|&b| b == 0).is_some() {
+ panic!("label must not contain NUL bytes");
+ }
+ let label_len = label.len() + 1; // +1 for the NUL terminator
+ label_len + padding_len(label_len)
+}
+
+/// Reads the endianness check from the beginning of the given slice and
+/// confirms that the endianness of the serialized object matches the expected
+/// endianness. If the slice is too small or if the endianness check fails,
+/// this returns an error.
+///
+/// Upon success, the total number of bytes read is returned.
+pub fn read_endianness_check(slice: &[u8]) -> Result<usize, DeserializeError> {
+ let (n, nr) = try_read_u32(slice, "endianness check")?;
+ assert_eq!(nr, write_endianness_check_len());
+ if n != 0xFEFF {
+ return Err(DeserializeError::endian_mismatch(0xFEFF, n));
+ }
+ Ok(nr)
+}
+
+/// Writes 0xFEFF as an integer using the given endianness.
+///
+/// This is useful for writing into the header of a serialized object. It can
+/// be read during deserialization as a sanity check to ensure the proper
+/// endianness is used.
+///
+/// Upon success, the total number of bytes written is returned.
+pub fn write_endianness_check<E: Endian>(
+ dst: &mut [u8],
+) -> Result<usize, SerializeError> {
+ let nwrite = write_endianness_check_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("endianness check"));
+ }
+ E::write_u32(0xFEFF, dst);
+ Ok(nwrite)
+}
+
+/// Returns the number of bytes written by the endianness check.
+pub fn write_endianness_check_len() -> usize {
+ size_of::<u32>()
+}
+
+/// Reads a version number from the beginning of the given slice and confirms
+/// that is matches the expected version number given. If the slice is too
+/// small or if the version numbers aren't equivalent, this returns an error.
+///
+/// Upon success, the total number of bytes read is returned.
+///
+/// N.B. Currently, we require that the version number is exactly equivalent.
+/// In the future, if we bump the version number without a semver bump, then
+/// we'll need to relax this a bit and support older versions.
+pub fn read_version(
+ slice: &[u8],
+ expected_version: u32,
+) -> Result<usize, DeserializeError> {
+ let (n, nr) = try_read_u32(slice, "version")?;
+ assert_eq!(nr, write_version_len());
+ if n != expected_version {
+ return Err(DeserializeError::version_mismatch(expected_version, n));
+ }
+ Ok(nr)
+}
+
+/// Writes the given version number to the beginning of the given slice.
+///
+/// This is useful for writing into the header of a serialized object. It can
+/// be read during deserialization as a sanity check to ensure that the library
+/// code supports the format of the serialized object.
+///
+/// Upon success, the total number of bytes written is returned.
+pub fn write_version<E: Endian>(
+ version: u32,
+ dst: &mut [u8],
+) -> Result<usize, SerializeError> {
+ let nwrite = write_version_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("version number"));
+ }
+ E::write_u32(version, dst);
+ Ok(nwrite)
+}
+
+/// Returns the number of bytes written by writing the version number.
+pub fn write_version_len() -> usize {
+ size_of::<u32>()
+}
+
+/// Reads a pattern ID from the given slice. If the slice has insufficient
+/// length, then this panics. If the deserialized integer exceeds the pattern
+/// ID limit for the current target, then this returns an error.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn read_pattern_id(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(PatternID, usize), DeserializeError> {
+ let bytes: [u8; PatternID::SIZE] =
+ slice[..PatternID::SIZE].try_into().unwrap();
+ let pid = PatternID::from_ne_bytes(bytes)
+ .map_err(|err| DeserializeError::pattern_id_error(err, what))?;
+ Ok((pid, PatternID::SIZE))
+}
+
+/// Reads a pattern ID from the given slice. If the slice has insufficient
+/// length, then this panics. Otherwise, the deserialized integer is assumed
+/// to be a valid pattern ID.
+///
+/// This also returns the number of bytes read.
+pub fn read_pattern_id_unchecked(slice: &[u8]) -> (PatternID, usize) {
+ let pid = PatternID::from_ne_bytes_unchecked(
+ slice[..PatternID::SIZE].try_into().unwrap(),
+ );
+ (pid, PatternID::SIZE)
+}
+
+/// Write the given pattern ID to the beginning of the given slice of bytes
+/// using the specified endianness. The given slice must have length at least
+/// `PatternID::SIZE`, or else this panics. Upon success, the total number of
+/// bytes written is returned.
+pub fn write_pattern_id<E: Endian>(pid: PatternID, dst: &mut [u8]) -> usize {
+ E::write_u32(pid.as_u32(), dst);
+ PatternID::SIZE
+}
+
+/// Attempts to read a state ID from the given slice. If the slice has an
+/// insufficient number of bytes or if the state ID exceeds the limit for
+/// the current target, then this returns an error.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn try_read_state_id(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(StateID, usize), DeserializeError> {
+ if slice.len() < StateID::SIZE {
+ return Err(DeserializeError::buffer_too_small(what));
+ }
+ read_state_id(slice, what)
+}
+
+/// Reads a state ID from the given slice. If the slice has insufficient
+/// length, then this panics. If the deserialized integer exceeds the state ID
+/// limit for the current target, then this returns an error.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn read_state_id(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(StateID, usize), DeserializeError> {
+ let bytes: [u8; StateID::SIZE] =
+ slice[..StateID::SIZE].try_into().unwrap();
+ let sid = StateID::from_ne_bytes(bytes)
+ .map_err(|err| DeserializeError::state_id_error(err, what))?;
+ Ok((sid, StateID::SIZE))
+}
+
+/// Reads a state ID from the given slice. If the slice has insufficient
+/// length, then this panics. Otherwise, the deserialized integer is assumed
+/// to be a valid state ID.
+///
+/// This also returns the number of bytes read.
+pub fn read_state_id_unchecked(slice: &[u8]) -> (StateID, usize) {
+ let sid = StateID::from_ne_bytes_unchecked(
+ slice[..StateID::SIZE].try_into().unwrap(),
+ );
+ (sid, StateID::SIZE)
+}
+
+/// Write the given state ID to the beginning of the given slice of bytes
+/// using the specified endianness. The given slice must have length at least
+/// `StateID::SIZE`, or else this panics. Upon success, the total number of
+/// bytes written is returned.
+pub fn write_state_id<E: Endian>(sid: StateID, dst: &mut [u8]) -> usize {
+ E::write_u32(sid.as_u32(), dst);
+ StateID::SIZE
+}
+
+/// Try to read a u16 as a usize from the beginning of the given slice in
+/// native endian format. If the slice has fewer than 2 bytes or if the
+/// deserialized number cannot be represented by usize, then this returns an
+/// error. The error message will include the `what` description of what is
+/// being deserialized, for better error messages. `what` should be a noun in
+/// singular form.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn try_read_u16_as_usize(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(usize, usize), DeserializeError> {
+ try_read_u16(slice, what).and_then(|(n, nr)| {
+ usize::try_from(n)
+ .map(|n| (n, nr))
+ .map_err(|_| DeserializeError::invalid_usize(what))
+ })
+}
+
+/// Try to read a u32 as a usize from the beginning of the given slice in
+/// native endian format. If the slice has fewer than 4 bytes or if the
+/// deserialized number cannot be represented by usize, then this returns an
+/// error. The error message will include the `what` description of what is
+/// being deserialized, for better error messages. `what` should be a noun in
+/// singular form.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn try_read_u32_as_usize(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(usize, usize), DeserializeError> {
+ try_read_u32(slice, what).and_then(|(n, nr)| {
+ usize::try_from(n)
+ .map(|n| (n, nr))
+ .map_err(|_| DeserializeError::invalid_usize(what))
+ })
+}
+
+/// Try to read a u16 from the beginning of the given slice in native endian
+/// format. If the slice has fewer than 2 bytes, then this returns an error.
+/// The error message will include the `what` description of what is being
+/// deserialized, for better error messages. `what` should be a noun in
+/// singular form.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn try_read_u16(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(u16, usize), DeserializeError> {
+ if slice.len() < size_of::<u16>() {
+ return Err(DeserializeError::buffer_too_small(what));
+ }
+ Ok((read_u16(slice), size_of::<u16>()))
+}
+
+/// Try to read a u32 from the beginning of the given slice in native endian
+/// format. If the slice has fewer than 4 bytes, then this returns an error.
+/// The error message will include the `what` description of what is being
+/// deserialized, for better error messages. `what` should be a noun in
+/// singular form.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn try_read_u32(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(u32, usize), DeserializeError> {
+ if slice.len() < size_of::<u32>() {
+ return Err(DeserializeError::buffer_too_small(what));
+ }
+ Ok((read_u32(slice), size_of::<u32>()))
+}
+
+/// Read a u16 from the beginning of the given slice in native endian format.
+/// If the slice has fewer than 2 bytes, then this panics.
+///
+/// Marked as inline to speed up sparse searching which decodes integers from
+/// its automaton at search time.
+#[inline(always)]
+pub fn read_u16(slice: &[u8]) -> u16 {
+ let bytes: [u8; 2] = slice[..size_of::<u16>()].try_into().unwrap();
+ u16::from_ne_bytes(bytes)
+}
+
+/// Read a u32 from the beginning of the given slice in native endian format.
+/// If the slice has fewer than 4 bytes, then this panics.
+///
+/// Marked as inline to speed up sparse searching which decodes integers from
+/// its automaton at search time.
+#[inline(always)]
+pub fn read_u32(slice: &[u8]) -> u32 {
+ let bytes: [u8; 4] = slice[..size_of::<u32>()].try_into().unwrap();
+ u32::from_ne_bytes(bytes)
+}
+
+/// Read a u64 from the beginning of the given slice in native endian format.
+/// If the slice has fewer than 8 bytes, then this panics.
+///
+/// Marked as inline to speed up sparse searching which decodes integers from
+/// its automaton at search time.
+#[inline(always)]
+pub fn read_u64(slice: &[u8]) -> u64 {
+ let bytes: [u8; 8] = slice[..size_of::<u64>()].try_into().unwrap();
+ u64::from_ne_bytes(bytes)
+}
+
+/// Write a variable sized integer and return the total number of bytes
+/// written. If the slice was not big enough to contain the bytes, then this
+/// returns an error including the "what" description in it. This does no
+/// padding.
+///
+/// See: https://developers.google.com/protocol-buffers/docs/encoding#varints
+#[allow(dead_code)]
+pub fn write_varu64(
+ mut n: u64,
+ what: &'static str,
+ dst: &mut [u8],
+) -> Result<usize, SerializeError> {
+ let mut i = 0;
+ while n >= 0b1000_0000 {
+ if i >= dst.len() {
+ return Err(SerializeError::buffer_too_small(what));
+ }
+ dst[i] = (n as u8) | 0b1000_0000;
+ n >>= 7;
+ i += 1;
+ }
+ if i >= dst.len() {
+ return Err(SerializeError::buffer_too_small(what));
+ }
+ dst[i] = n as u8;
+ Ok(i + 1)
+}
+
+/// Returns the total number of bytes that would be writen to encode n as a
+/// variable sized integer.
+///
+/// See: https://developers.google.com/protocol-buffers/docs/encoding#varints
+#[allow(dead_code)]
+pub fn write_varu64_len(mut n: u64) -> usize {
+ let mut i = 0;
+ while n >= 0b1000_0000 {
+ n >>= 7;
+ i += 1;
+ }
+ i + 1
+}
+
+/// Like read_varu64, but attempts to cast the result to usize. If the integer
+/// cannot fit into a usize, then an error is returned.
+#[allow(dead_code)]
+pub fn read_varu64_as_usize(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(usize, usize), DeserializeError> {
+ let (n, nread) = read_varu64(slice, what)?;
+ let n = usize::try_from(n)
+ .map_err(|_| DeserializeError::invalid_usize(what))?;
+ Ok((n, nread))
+}
+
+/// Reads a variable sized integer from the beginning of slice, and returns the
+/// integer along with the total number of bytes read. If a valid variable
+/// sized integer could not be found, then an error is returned that includes
+/// the "what" description in it.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+#[allow(dead_code)]
+pub fn read_varu64(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(u64, usize), DeserializeError> {
+ let mut n: u64 = 0;
+ let mut shift: u32 = 0;
+ // The biggest possible value is u64::MAX, which needs all 64 bits which
+ // requires 10 bytes (because 7 * 9 < 64). We use a limit to avoid reading
+ // an unnecessary number of bytes.
+ let limit = cmp::min(slice.len(), 10);
+ for (i, &b) in slice[..limit].iter().enumerate() {
+ if b < 0b1000_0000 {
+ return match (b as u64).checked_shl(shift) {
+ None => Err(DeserializeError::invalid_varint(what)),
+ Some(b) => Ok((n | b, i + 1)),
+ };
+ }
+ match ((b as u64) & 0b0111_1111).checked_shl(shift) {
+ None => return Err(DeserializeError::invalid_varint(what)),
+ Some(b) => n |= b,
+ }
+ shift += 7;
+ }
+ Err(DeserializeError::invalid_varint(what))
+}
+
+/// Checks that the given slice has some minimal length. If it's smaller than
+/// the bound given, then a "buffer too small" error is returned with `what`
+/// describing what the buffer represents.
+pub fn check_slice_len<T>(
+ slice: &[T],
+ at_least_len: usize,
+ what: &'static str,
+) -> Result<(), DeserializeError> {
+ if slice.len() < at_least_len {
+ return Err(DeserializeError::buffer_too_small(what));
+ }
+ Ok(())
+}
+
+/// Multiply the given numbers, and on overflow, return an error that includes
+/// 'what' in the error message.
+///
+/// This is useful when doing arithmetic with untrusted data.
+pub fn mul(
+ a: usize,
+ b: usize,
+ what: &'static str,
+) -> Result<usize, DeserializeError> {
+ match a.checked_mul(b) {
+ Some(c) => Ok(c),
+ None => Err(DeserializeError::arithmetic_overflow(what)),
+ }
+}
+
+/// Add the given numbers, and on overflow, return an error that includes
+/// 'what' in the error message.
+///
+/// This is useful when doing arithmetic with untrusted data.
+pub fn add(
+ a: usize,
+ b: usize,
+ what: &'static str,
+) -> Result<usize, DeserializeError> {
+ match a.checked_add(b) {
+ Some(c) => Ok(c),
+ None => Err(DeserializeError::arithmetic_overflow(what)),
+ }
+}
+
+/// Shift `a` left by `b`, and on overflow, return an error that includes
+/// 'what' in the error message.
+///
+/// This is useful when doing arithmetic with untrusted data.
+pub fn shl(
+ a: usize,
+ b: usize,
+ what: &'static str,
+) -> Result<usize, DeserializeError> {
+ let amount = u32::try_from(b)
+ .map_err(|_| DeserializeError::arithmetic_overflow(what))?;
+ match a.checked_shl(amount) {
+ Some(c) => Ok(c),
+ None => Err(DeserializeError::arithmetic_overflow(what)),
+ }
+}
+
+/// A simple trait for writing code generic over endianness.
+///
+/// This is similar to what byteorder provides, but we only need a very small
+/// subset.
+pub trait Endian {
+ /// Writes a u16 to the given destination buffer in a particular
+ /// endianness. If the destination buffer has a length smaller than 2, then
+ /// this panics.
+ fn write_u16(n: u16, dst: &mut [u8]);
+
+ /// Writes a u32 to the given destination buffer in a particular
+ /// endianness. If the destination buffer has a length smaller than 4, then
+ /// this panics.
+ fn write_u32(n: u32, dst: &mut [u8]);
+
+ /// Writes a u64 to the given destination buffer in a particular
+ /// endianness. If the destination buffer has a length smaller than 8, then
+ /// this panics.
+ fn write_u64(n: u64, dst: &mut [u8]);
+}
+
+/// Little endian writing.
+pub enum LE {}
+/// Big endian writing.
+pub enum BE {}
+
+#[cfg(target_endian = "little")]
+pub type NE = LE;
+#[cfg(target_endian = "big")]
+pub type NE = BE;
+
+impl Endian for LE {
+ fn write_u16(n: u16, dst: &mut [u8]) {
+ dst[..2].copy_from_slice(&n.to_le_bytes());
+ }
+
+ fn write_u32(n: u32, dst: &mut [u8]) {
+ dst[..4].copy_from_slice(&n.to_le_bytes());
+ }
+
+ fn write_u64(n: u64, dst: &mut [u8]) {
+ dst[..8].copy_from_slice(&n.to_le_bytes());
+ }
+}
+
+impl Endian for BE {
+ fn write_u16(n: u16, dst: &mut [u8]) {
+ dst[..2].copy_from_slice(&n.to_be_bytes());
+ }
+
+ fn write_u32(n: u32, dst: &mut [u8]) {
+ dst[..4].copy_from_slice(&n.to_be_bytes());
+ }
+
+ fn write_u64(n: u64, dst: &mut [u8]) {
+ dst[..8].copy_from_slice(&n.to_be_bytes());
+ }
+}
+
+/// Returns the number of additional bytes required to add to the given length
+/// in order to make the total length a multiple of 4. The return value is
+/// always less than 4.
+pub fn padding_len(non_padding_len: usize) -> usize {
+ (4 - (non_padding_len & 0b11)) & 0b11
+}
+
+#[cfg(all(test, feature = "alloc"))]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn labels() {
+ let mut buf = [0; 1024];
+
+ let nwrite = write_label("fooba", &mut buf).unwrap();
+ assert_eq!(nwrite, 8);
+ assert_eq!(&buf[..nwrite], b"fooba\x00\x00\x00");
+
+ let nread = read_label(&buf, "fooba").unwrap();
+ assert_eq!(nread, 8);
+ }
+
+ #[test]
+ #[should_panic]
+ fn bad_label_interior_nul() {
+ // interior NULs are not allowed
+ write_label("foo\x00bar", &mut [0; 1024]).unwrap();
+ }
+
+ #[test]
+ fn bad_label_almost_too_long() {
+ // ok
+ write_label(&"z".repeat(255), &mut [0; 1024]).unwrap();
+ }
+
+ #[test]
+ #[should_panic]
+ fn bad_label_too_long() {
+ // labels longer than 255 bytes are banned
+ write_label(&"z".repeat(256), &mut [0; 1024]).unwrap();
+ }
+
+ #[test]
+ fn padding() {
+ assert_eq!(0, padding_len(8));
+ assert_eq!(3, padding_len(9));
+ assert_eq!(2, padding_len(10));
+ assert_eq!(1, padding_len(11));
+ assert_eq!(0, padding_len(12));
+ assert_eq!(3, padding_len(13));
+ assert_eq!(2, padding_len(14));
+ assert_eq!(1, padding_len(15));
+ assert_eq!(0, padding_len(16));
+ }
+}
diff --git a/vendor/regex-automata/src/util/determinize/mod.rs b/vendor/regex-automata/src/util/determinize/mod.rs
new file mode 100644
index 000000000..b384de8e1
--- /dev/null
+++ b/vendor/regex-automata/src/util/determinize/mod.rs
@@ -0,0 +1,493 @@
+/*!
+This module contains types and routines for implementing determinization.
+
+In this crate, there are at least two places where we implement
+determinization: fully ahead-of-time compiled DFAs in the `dfa` module and
+lazily compiled DFAs in the `hybrid` module. The stuff in this module
+corresponds to the things that are in common between these implementations.
+
+There are three broad things that our implementations of determinization have
+in common, as defined by this module:
+
+* The classification of start states. That is, whether we're dealing with
+word boundaries, line boundaries, etc., is all the same. This also includes
+the look-behind assertions that are satisfied by each starting state
+classification.
+
+* The representation of DFA states as sets of NFA states, including
+convenience types for building these DFA states that are amenable to reusing
+allocations.
+
+* Routines for the "classical" parts of determinization: computing the
+epsilon closure, tracking match states (with corresponding pattern IDs, since
+we support multi-pattern finite automata) and, of course, computing the
+transition function between states for units of input.
+
+I did consider a couple of alternatives to this particular form of code reuse:
+
+1. Don't do any code reuse. The problem here is that we *really* want both
+forms of determinization to do exactly identical things when it comes to
+their handling of NFA states. While our tests generally ensure this, the code
+is tricky and large enough where not reusing code is a pretty big bummer.
+
+2. Implement all of determinization once and make it generic over fully
+compiled DFAs and lazily compiled DFAs. While I didn't actually try this
+approach, my instinct is that it would be more complex than is needed here.
+And the interface required would be pretty hairy. Instead, I think splitting
+it into logical sub-components works better.
+*/
+
+use alloc::vec::Vec;
+
+pub(crate) use self::state::{
+ State, StateBuilderEmpty, StateBuilderMatches, StateBuilderNFA,
+};
+
+use crate::{
+ nfa::thompson::{self, Look, LookSet},
+ util::{
+ alphabet,
+ id::StateID,
+ matchtypes::MatchKind,
+ sparse_set::{SparseSet, SparseSets},
+ start::Start,
+ },
+};
+
+mod state;
+
+/// Compute the set of all eachable NFA states, including the full epsilon
+/// closure, from a DFA state for a single unit of input. The set of reachable
+/// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned
+/// also includes any look-behind assertions satisfied by `unit`, in addition
+/// to whether it is a match state. For multi-pattern DFAs, the builder will
+/// also include the pattern IDs that match (in the order seen).
+///
+/// `nfa` must be able to resolve any NFA state in `state` and any NFA state
+/// reachable via the epsilon closure of any NFA state in `state`. `sparses`
+/// must have capacity equivalent to `nfa.len()`.
+///
+/// `match_kind` should correspond to the match semantics implemented by the
+/// DFA being built. Generally speaking, for leftmost-first match semantics,
+/// states that appear after the first NFA match state will not be included in
+/// the `StateBuilderNFA` returned since they are impossible to visit.
+///
+/// `sparses` is used as scratch space for NFA traversal. Other than their
+/// capacity requirements (detailed above), there are no requirements on what's
+/// contained within them (if anything). Similarly, what's inside of them once
+/// this routine returns is unspecified.
+///
+/// `stack` must have length 0. It is used as scratch space for depth first
+/// traversal. After returning, it is guaranteed that `stack` will have length
+/// 0.
+///
+/// `state` corresponds to the current DFA state on which one wants to compute
+/// the transition for the input `unit`.
+///
+/// `empty_builder` corresponds to the builder allocation to use to produce a
+/// complete `StateBuilderNFA` state. If the state is not needed (or is already
+/// cached), then it can be cleared and reused without needing to create a new
+/// `State`. The `StateBuilderNFA` state returned is final and ready to be
+/// turned into a `State` if necessary.
+pub(crate) fn next(
+ nfa: &thompson::NFA,
+ match_kind: MatchKind,
+ sparses: &mut SparseSets,
+ stack: &mut Vec<StateID>,
+ state: &State,
+ unit: alphabet::Unit,
+ empty_builder: StateBuilderEmpty,
+) -> StateBuilderNFA {
+ sparses.clear();
+
+ // Put the NFA state IDs into a sparse set in case we need to
+ // re-compute their epsilon closure.
+ //
+ // Doing this state shuffling is technically not necessary unless some
+ // kind of look-around is used in the DFA. Some ad hoc experiments
+ // suggested that avoiding this didn't lead to much of an improvement,
+ // but perhaps more rigorous experimentation should be done. And in
+ // particular, avoiding this check requires some light refactoring of
+ // the code below.
+ state.iter_nfa_state_ids(|nfa_id| {
+ sparses.set1.insert(nfa_id);
+ });
+
+ // Compute look-ahead assertions originating from the current state.
+ // Based on the input unit we're transitioning over, some additional
+ // set of assertions may be true. Thus, we re-compute this state's
+ // epsilon closure (but only if necessary).
+ if !state.look_need().is_empty() {
+ // Add look-ahead assertions that are now true based on the current
+ // input unit.
+ let mut look_have = state.look_have().clone();
+ match unit.as_u8() {
+ Some(b'\n') => {
+ look_have.insert(Look::EndLine);
+ }
+ Some(_) => {}
+ None => {
+ look_have.insert(Look::EndText);
+ look_have.insert(Look::EndLine);
+ }
+ }
+ if state.is_from_word() == unit.is_word_byte() {
+ look_have.insert(Look::WordBoundaryUnicodeNegate);
+ look_have.insert(Look::WordBoundaryAsciiNegate);
+ } else {
+ look_have.insert(Look::WordBoundaryUnicode);
+ look_have.insert(Look::WordBoundaryAscii);
+ }
+ // If we have new assertions satisfied that are among the set of
+ // assertions that exist in this state (that is, just because
+ // we added an EndLine assertion above doesn't mean there is an
+ // EndLine conditional epsilon transition in this state), then we
+ // re-compute this state's epsilon closure using the updated set of
+ // assertions.
+ if !look_have
+ .subtract(state.look_have())
+ .intersect(state.look_need())
+ .is_empty()
+ {
+ for nfa_id in &sparses.set1 {
+ epsilon_closure(
+ nfa,
+ nfa_id,
+ look_have,
+ stack,
+ &mut sparses.set2,
+ );
+ }
+ sparses.swap();
+ sparses.set2.clear();
+ }
+ }
+
+ // Convert our empty builder into one that can record assertions and match
+ // pattern IDs.
+ let mut builder = empty_builder.into_matches();
+ // Set whether the StartLine look-behind assertion is true for this
+ // transition or not. The look-behind assertion for ASCII word boundaries
+ // is handled below.
+ if nfa.has_any_anchor() {
+ if unit.as_u8().map_or(false, |b| b == b'\n') {
+ // Why only handle StartLine here and not StartText? That's
+ // because StartText can only impact the starting state, which
+ // is speical cased in start state handling.
+ builder.look_have().insert(Look::StartLine);
+ }
+ }
+ for nfa_id in &sparses.set1 {
+ match *nfa.state(nfa_id) {
+ thompson::State::Union { .. }
+ | thompson::State::Fail
+ | thompson::State::Look { .. }
+ | thompson::State::Capture { .. } => {}
+ thompson::State::Match { id } => {
+ // Notice here that we are calling the NEW state a match
+ // state if the OLD state we are transitioning from
+ // contains an NFA match state. This is precisely how we
+ // delay all matches by one byte and also what therefore
+ // guarantees that starting states cannot be match states.
+ //
+ // If we didn't delay matches by one byte, then whether
+ // a DFA is a matching state or not would be determined
+ // by whether one of its own constituent NFA states
+ // was a match state. (And that would be done in
+ // 'add_nfa_states'.)
+ //
+ // Also, 'add_match_pattern_id' requires that callers never
+ // pass duplicative pattern IDs. We do in fact uphold that
+ // guarantee here, but it's subtle. In particular, a Thompson
+ // NFA guarantees that each pattern has exactly one match
+ // state. Moreover, since we're iterating over the NFA state
+ // IDs in a set, we are guarateed not to have any duplicative
+ // match states. Thus, it is impossible to add the same pattern
+ // ID more than once.
+ builder.add_match_pattern_id(id);
+ if !match_kind.continue_past_first_match() {
+ break;
+ }
+ }
+ thompson::State::Range { range: ref r } => {
+ if r.matches_unit(unit) {
+ epsilon_closure(
+ nfa,
+ r.next,
+ *builder.look_have(),
+ stack,
+ &mut sparses.set2,
+ );
+ }
+ }
+ thompson::State::Sparse(ref sparse) => {
+ if let Some(next) = sparse.matches_unit(unit) {
+ epsilon_closure(
+ nfa,
+ next,
+ *builder.look_have(),
+ stack,
+ &mut sparses.set2,
+ );
+ }
+ }
+ }
+ }
+ // We only set the word byte if there's a word boundary look-around
+ // anywhere in this regex. Otherwise, there's no point in bloating the
+ // number of states if we don't have one.
+ //
+ // We also only set it when the state has a non-zero number of NFA states.
+ // Otherwise, we could wind up with states that *should* be DEAD states
+ // but are otherwise distinct from DEAD states because of this look-behind
+ // assertion being set. While this can't technically impact correctness *in
+ // theory*, it can create pathological DFAs that consume input until EOI or
+ // a quit byte is seen. Consuming until EOI isn't a correctness problem,
+ // but a (serious) perf problem. Hitting a quit byte, however, could be a
+ // correctness problem since it could cause search routines to report an
+ // error instead of a detected match once the quit state is entered. (The
+ // search routine could be made to be a bit smarter by reporting a match
+ // if one was detected once it enters a quit state (and indeed, the search
+ // routines in this crate do just that), but it seems better to prevent
+ // these things by construction if possible.)
+ if nfa.has_word_boundary()
+ && unit.is_word_byte()
+ && !sparses.set2.is_empty()
+ {
+ builder.set_is_from_word();
+ }
+ let mut builder_nfa = builder.into_nfa();
+ add_nfa_states(nfa, &sparses.set2, &mut builder_nfa);
+ builder_nfa
+}
+
+/// Compute the epsilon closure for the given NFA state. The epsilon closure
+/// consists of all NFA state IDs, including `start_nfa_id`, that can be
+/// reached from `start_nfa_id` without consuming any input. These state IDs
+/// are written to `set` in the order they are visited, but only if they are
+/// not already in `set`. `start_nfa_id` must be a valid state ID for the NFA
+/// given.
+///
+/// `look_have` consists of the satisfied assertions at the current
+/// position. For conditional look-around epsilon transitions, these are
+/// only followed if they are satisfied by `look_have`.
+///
+/// `stack` must have length 0. It is used as scratch space for depth first
+/// traversal. After returning, it is guaranteed that `stack` will have length
+/// 0.
+pub(crate) fn epsilon_closure(
+ nfa: &thompson::NFA,
+ start_nfa_id: StateID,
+ look_have: LookSet,
+ stack: &mut Vec<StateID>,
+ set: &mut SparseSet,
+) {
+ assert!(stack.is_empty());
+ // If this isn't an epsilon state, then the epsilon closure is always just
+ // itself, so there's no need to spin up the machinery below to handle it.
+ if !nfa.state(start_nfa_id).is_epsilon() {
+ set.insert(start_nfa_id);
+ return;
+ }
+
+ stack.push(start_nfa_id);
+ while let Some(mut id) = stack.pop() {
+ // In many cases, we can avoid stack operations when an NFA state only
+ // adds one new state to visit. In that case, we just set our ID to
+ // that state and mush on. We only use the stack when an NFA state
+ // introduces multiple new states to visit.
+ loop {
+ // Insert this NFA state, and if it's already in the set and thus
+ // already visited, then we can move on to the next one.
+ if !set.insert(id) {
+ break;
+ }
+ match *nfa.state(id) {
+ thompson::State::Range { .. }
+ | thompson::State::Sparse { .. }
+ | thompson::State::Fail
+ | thompson::State::Match { .. } => break,
+ thompson::State::Look { look, next } => {
+ if !look_have.contains(look) {
+ break;
+ }
+ id = next;
+ }
+ thompson::State::Union { ref alternates } => {
+ id = match alternates.get(0) {
+ None => break,
+ Some(&id) => id,
+ };
+ // We need to process our alternates in order to preserve
+ // match preferences, so put the earliest alternates closer
+ // to the top of the stack.
+ stack.extend(alternates[1..].iter().rev());
+ }
+ thompson::State::Capture { next, .. } => {
+ id = next;
+ }
+ }
+ }
+ }
+}
+
+/// Add the NFA state IDs in the given `set` to the given DFA builder state.
+/// The order in which states are added corresponds to the order in which they
+/// were added to `set`.
+///
+/// The DFA builder state given should already have its complete set of match
+/// pattern IDs added (if any) and any look-behind assertions (StartLine,
+/// StartText and whether this state is being generated for a transition over a
+/// word byte when applicable) that are true immediately prior to transitioning
+/// into this state (via `builder.look_have()`). The match pattern IDs should
+/// correspond to matches that occured on the previous transition, since all
+/// matches are delayed by one byte. The things that should _not_ be set are
+/// look-ahead assertions (EndLine, EndText and whether the next byte is a
+/// word byte or not). The builder state should also not have anything in
+/// `look_need` set, as this routine will compute that for you.
+///
+/// The given NFA should be able to resolve all identifiers in `set` to a
+/// particular NFA state. Additionally, `set` must have capacity equivalent
+/// to `nfa.len()`.
+pub(crate) fn add_nfa_states(
+ nfa: &thompson::NFA,
+ set: &SparseSet,
+ builder: &mut StateBuilderNFA,
+) {
+ for nfa_id in set {
+ match *nfa.state(nfa_id) {
+ thompson::State::Range { .. } => {
+ builder.add_nfa_state_id(nfa_id);
+ }
+ thompson::State::Sparse { .. } => {
+ builder.add_nfa_state_id(nfa_id);
+ }
+ thompson::State::Look { look, .. } => {
+ builder.add_nfa_state_id(nfa_id);
+ builder.look_need().insert(look);
+ }
+ thompson::State::Union { .. }
+ | thompson::State::Capture { .. } => {
+ // Pure epsilon transitions don't need to be tracked
+ // as part of the DFA state. Tracking them is actually
+ // superfluous; they won't cause any harm other than making
+ // determinization slower.
+ //
+ // Why aren't these needed? Well, in an NFA, epsilon
+ // transitions are really just jumping points to other
+ // states. So once you hit an epsilon transition, the same
+ // set of resulting states always appears. Therefore,
+ // putting them in a DFA's set of ordered NFA states is
+ // strictly redundant.
+ //
+ // Look-around states are also epsilon transitions, but
+ // they are *conditional*. So their presence could be
+ // discriminatory, and thus, they are tracked above.
+ //
+ // But wait... why are epsilon states in our `set` in the
+ // first place? Why not just leave them out? They're in
+ // our `set` because it was generated by computing an
+ // epsilon closure, and we want to keep track of all states
+ // we visited to avoid re-visiting them. In exchange, we
+ // have to do this second iteration over our collected
+ // states to finalize our DFA state.
+ //
+ // Note that this optimization requires that we re-compute
+ // the epsilon closure to account for look-ahead in 'next'
+ // *only when necessary*. Namely, only when the set of
+ // look-around assertions changes and only when those
+ // changes are within the set of assertions that are
+ // needed in order to step through the closure correctly.
+ // Otherwise, if we re-do the epsilon closure needlessly,
+ // it could change based on the fact that we are omitting
+ // epsilon states here.
+ }
+ thompson::State::Fail => {
+ break;
+ }
+ thompson::State::Match { .. } => {
+ // Normally, the NFA match state doesn't actually need to
+ // be inside the DFA state. But since we delay matches by
+ // one byte, the matching DFA state corresponds to states
+ // that transition from the one we're building here. And
+ // the way we detect those cases is by looking for an NFA
+ // match state. See 'next' for how this is handled.
+ builder.add_nfa_state_id(nfa_id);
+ }
+ }
+ }
+ // If we know this state contains no look-around assertions, then
+ // there's no reason to track which look-around assertions were
+ // satisfied when this state was created.
+ if builder.look_need().is_empty() {
+ builder.look_have().clear();
+ }
+}
+
+/// Sets the appropriate look-behind assertions on the given state based on
+/// this starting configuration.
+pub(crate) fn set_lookbehind_from_start(
+ start: &Start,
+ builder: &mut StateBuilderMatches,
+) {
+ match *start {
+ Start::NonWordByte => {}
+ Start::WordByte => {
+ builder.set_is_from_word();
+ }
+ Start::Text => {
+ builder.look_have().insert(Look::StartText);
+ builder.look_have().insert(Look::StartLine);
+ }
+ Start::Line => {
+ builder.look_have().insert(Look::StartLine);
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::Start;
+
+ #[test]
+ #[should_panic]
+ fn start_fwd_bad_range() {
+ Start::from_position_fwd(&[], 0, 1);
+ }
+
+ #[test]
+ #[should_panic]
+ fn start_rev_bad_range() {
+ Start::from_position_rev(&[], 0, 1);
+ }
+
+ #[test]
+ fn start_fwd() {
+ let f = Start::from_position_fwd;
+
+ assert_eq!(Start::Text, f(&[], 0, 0));
+ assert_eq!(Start::Text, f(b"abc", 0, 3));
+ assert_eq!(Start::Text, f(b"\nabc", 0, 3));
+
+ assert_eq!(Start::Line, f(b"\nabc", 1, 3));
+
+ assert_eq!(Start::WordByte, f(b"abc", 1, 3));
+
+ assert_eq!(Start::NonWordByte, f(b" abc", 1, 3));
+ }
+
+ #[test]
+ fn start_rev() {
+ let f = Start::from_position_rev;
+
+ assert_eq!(Start::Text, f(&[], 0, 0));
+ assert_eq!(Start::Text, f(b"abc", 0, 3));
+ assert_eq!(Start::Text, f(b"abc\n", 0, 4));
+
+ assert_eq!(Start::Line, f(b"abc\nz", 0, 3));
+
+ assert_eq!(Start::WordByte, f(b"abc", 0, 2));
+
+ assert_eq!(Start::NonWordByte, f(b"abc ", 0, 3));
+ }
+}
diff --git a/vendor/regex-automata/src/util/determinize/state.rs b/vendor/regex-automata/src/util/determinize/state.rs
new file mode 100644
index 000000000..567e600d6
--- /dev/null
+++ b/vendor/regex-automata/src/util/determinize/state.rs
@@ -0,0 +1,873 @@
+/*!
+This module defines a DFA state representation and builders for constructing
+DFA states.
+
+This representation is specifically for use in implementations of NFA-to-DFA
+conversion via powerset construction. (Also called "determinization" in this
+crate.)
+
+The term "DFA state" is somewhat overloaded in this crate. In some cases, it
+refers to the set of transitions over an alphabet for a particular state. In
+other cases, it refers to a set of NFA states. The former is really about the
+final representation of a state in a DFA's transition table, where as the
+latter---what this module is focusedon---is closer to an intermediate form that
+is used to help eventually build the transition table.
+
+This module exports four types. All four types represent the same idea: an
+ordered set of NFA states. This ordered set represents the epsilon closure of a
+particular NFA state, where the "epsilon closure" is the set of NFA states that
+can be transitioned to without consuming any input. i.e., Follow all of theNFA
+state's epsilon transitions. In addition, this implementation of DFA states
+cares about two other things: the ordered set of pattern IDs corresponding
+to the patterns that match if the state is a match state, and the set of
+look-behind assertions that were true when the state was created.
+
+The first, `State`, is a frozen representation of a state that cannot be
+modified. It may be cheaply cloned without copying the state itself and can be
+accessed safely from multiple threads simultaneously. This type is useful for
+when one knows that the DFA state being constructed is distinct from any other
+previously constructed states. Namely, powerset construction, in practice,
+requires one to keep a cache of previously created DFA states. Otherwise,
+the number of DFA states created in memory balloons to an impractically
+large number. For this reason, equivalent states should endeavor to have an
+equivalent byte-level representation. (In general, "equivalency" here means,
+"equivalent assertions, pattern IDs and NFA state IDs." We do not require that
+full DFA minimization be implemented here. This form of equivalency is only
+surface deep and is more-or-less a practical necessity.)
+
+The other three types represent different phases in the construction of a
+DFA state. Internally, these three types (and `State`) all use the same
+byte-oriented representation. That means one can use any of the builder types
+to check whether the state it represents already exists or not. If it does,
+then there is no need to freeze it into a `State` (which requires an alloc and
+a copy). Here are the three types described succinctly:
+
+* `StateBuilderEmpty` represents a state with no pattern IDs, no assertions
+and no NFA states. Creating a `StateBuilderEmpty` performs no allocs. A
+`StateBuilderEmpty` can only be used to query its underlying memory capacity,
+or to convert into a builder for recording pattern IDs and/or assertions.
+* `StateBuilderMatches` represents a state with zero or more pattern IDs, zero
+or more satisfied assertions and zero NFA state IDs. A `StateBuilderMatches`
+can only be used for adding pattern IDs and recording assertions.
+* `StateBuilderNFA` represents a state with zero or more pattern IDs, zero or
+more satisfied assertions and zero or more NFA state IDs. A `StateBuilderNFA`
+can only be used for adding NFA state IDs and recording some assertions.
+
+The expected flow here is to use the above builders to construct a candidate
+DFA state to check if it already exists. If it does, then there's no need to
+freeze it into a `State`. It it doesn't exist, then `StateBuilderNFA::to_state`
+can be called to freeze the builder into an immutable `State`. In either
+case, `clear` should be called on the builder to turn it back into a
+`StateBuilderEmpty` that reuses the underyling memory.
+
+The main purpose for splitting the builder into these distinct types is to
+make it impossible to do things like adding a pattern ID after adding an NFA
+state ID. Namely, this makes it simpler to use a space-and-time efficient
+binary representation for the state. (The format is documented on the `Repr`
+type below.) If we just used one type for everything, it would be possible for
+callers to use an incorrect interleaving of calls and thus result in a corrupt
+representation. I chose to use more type machinery to make this impossible to
+do because 1) determinization is itself pretty complex and it wouldn't be too
+hard to foul this up and 2) there isn't too much machinery involve and it's
+well contained.
+
+As an optimization, sometimes states won't have certain things set. For
+example, if the underlying NFA has no word boundary assertions, then there is
+no reason to set a state's look-behind assertion as to whether it was generated
+from a word byte or not. Similarly, if a state has no NFA states corresponding
+to look-around assertions, then there is no reason to set `look_have` to a
+non-empty set. Finally, callers usually omit unconditional epsilon transitions
+when adding NFA state IDs since they aren't discriminatory.
+
+Finally, the binary representation used by these states is, thankfully, not
+serialized anywhere. So any kind of change can be made with reckless abandon,
+as long as everything in this module agrees.
+*/
+
+use core::{convert::TryFrom, mem};
+
+use alloc::{sync::Arc, vec::Vec};
+
+use crate::{
+ nfa::thompson::LookSet,
+ util::{
+ bytes::{self, Endian},
+ id::{PatternID, StateID},
+ },
+};
+
+/// A DFA state that, at its core, is represented by an ordered set of NFA
+/// states.
+///
+/// This type is intended to be used only in NFA-to-DFA conversion via powerset
+/// construction.
+///
+/// It may be cheaply cloned and accessed safely from mulitple threads
+/// simultaneously.
+#[derive(Clone, Eq, Hash, PartialEq, PartialOrd, Ord)]
+pub(crate) struct State(Arc<[u8]>);
+
+/// This Borrow impl permits us to lookup any state in a map by its byte
+/// representation. This is particularly convenient when one has a StateBuilder
+/// and we want to see if a correspondingly equivalent state already exists. If
+/// one does exist, then we can reuse the allocation required by StateBuilder
+/// without having to convert it into a State first.
+impl core::borrow::Borrow<[u8]> for State {
+ fn borrow(&self) -> &[u8] {
+ &*self.0
+ }
+}
+
+impl core::fmt::Debug for State {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_tuple("State").field(&self.repr()).finish()
+ }
+}
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl State {
+ pub(crate) fn dead() -> State {
+ StateBuilderEmpty::new().into_matches().into_nfa().to_state()
+ }
+
+ pub(crate) fn is_match(&self) -> bool {
+ self.repr().is_match()
+ }
+
+ pub(crate) fn is_from_word(&self) -> bool {
+ self.repr().is_from_word()
+ }
+
+ pub(crate) fn look_have(&self) -> LookSet {
+ self.repr().look_have()
+ }
+
+ pub(crate) fn look_need(&self) -> LookSet {
+ self.repr().look_need()
+ }
+
+ pub(crate) fn match_count(&self) -> usize {
+ self.repr().match_count()
+ }
+
+ pub(crate) fn match_pattern(&self, index: usize) -> PatternID {
+ self.repr().match_pattern(index)
+ }
+
+ pub(crate) fn match_pattern_ids(&self) -> Option<Vec<PatternID>> {
+ self.repr().match_pattern_ids()
+ }
+
+ pub(crate) fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, f: F) {
+ self.repr().iter_match_pattern_ids(f)
+ }
+
+ pub(crate) fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, f: F) {
+ self.repr().iter_nfa_state_ids(f)
+ }
+
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.0.len()
+ }
+
+ fn repr(&self) -> Repr<'_> {
+ Repr(&*self.0)
+ }
+}
+
+/// A state builder that represents an empty state.
+///
+/// This is a useful "initial condition" for state construction. It has no
+/// NFA state IDs, no assertions set and no pattern IDs. No allocations are
+/// made when new() is called. Its main use is for being converted into a
+/// builder that can capture assertions and pattern IDs.
+#[derive(Clone, Debug)]
+pub(crate) struct StateBuilderEmpty(Vec<u8>);
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl StateBuilderEmpty {
+ pub(crate) fn new() -> StateBuilderEmpty {
+ StateBuilderEmpty(alloc::vec![])
+ }
+
+ pub(crate) fn into_matches(mut self) -> StateBuilderMatches {
+ self.0.extend_from_slice(&[0, 0, 0]);
+ StateBuilderMatches(self.0)
+ }
+
+ fn clear(&mut self) {
+ self.0.clear();
+ }
+
+ pub(crate) fn capacity(&self) -> usize {
+ self.0.capacity()
+ }
+}
+
+/// A state builder that collects assertions and pattern IDs.
+///
+/// When collecting pattern IDs is finished, this can be converted into a
+/// builder that collects NFA state IDs.
+#[derive(Clone)]
+pub(crate) struct StateBuilderMatches(Vec<u8>);
+
+impl core::fmt::Debug for StateBuilderMatches {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_tuple("StateBuilderMatches").field(&self.repr()).finish()
+ }
+}
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl StateBuilderMatches {
+ pub(crate) fn into_nfa(mut self) -> StateBuilderNFA {
+ self.repr_vec().close_match_pattern_ids();
+ StateBuilderNFA { repr: self.0, prev_nfa_state_id: StateID::ZERO }
+ }
+
+ pub(crate) fn clear(self) -> StateBuilderEmpty {
+ let mut builder = StateBuilderEmpty(self.0);
+ builder.clear();
+ builder
+ }
+
+ pub(crate) fn is_match(&self) -> bool {
+ self.repr().is_match()
+ }
+
+ pub(crate) fn is_from_word(&self) -> bool {
+ self.repr().is_from_word()
+ }
+
+ pub(crate) fn set_is_from_word(&mut self) {
+ self.repr_vec().set_is_from_word()
+ }
+
+ pub(crate) fn look_have(&mut self) -> &mut LookSet {
+ LookSet::from_repr_mut(&mut self.0[1])
+ }
+
+ pub(crate) fn look_need(&mut self) -> &mut LookSet {
+ LookSet::from_repr_mut(&mut self.0[2])
+ }
+
+ pub(crate) fn add_match_pattern_id(&mut self, pid: PatternID) {
+ self.repr_vec().add_match_pattern_id(pid)
+ }
+
+ fn repr(&self) -> Repr<'_> {
+ Repr(&self.0)
+ }
+
+ fn repr_vec(&mut self) -> ReprVec<'_> {
+ ReprVec(&mut self.0)
+ }
+}
+
+/// A state builder that collects some assertions and NFA state IDs.
+///
+/// When collecting NFA state IDs is finished, this can be used to build a
+/// `State` if necessary.
+///
+/// When dont with building a state (regardless of whether it got kept or not),
+/// it's usually a good idea to call `clear` to get an empty builder back so
+/// that it can be reused to build the next state.
+#[derive(Clone)]
+pub(crate) struct StateBuilderNFA {
+ repr: Vec<u8>,
+ prev_nfa_state_id: StateID,
+}
+
+impl core::fmt::Debug for StateBuilderNFA {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_tuple("StateBuilderNFA").field(&self.repr()).finish()
+ }
+}
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl StateBuilderNFA {
+ pub(crate) fn to_state(&self) -> State {
+ State(Arc::from(&*self.repr))
+ }
+
+ pub(crate) fn clear(self) -> StateBuilderEmpty {
+ let mut builder = StateBuilderEmpty(self.repr);
+ builder.clear();
+ builder
+ }
+
+ pub(crate) fn is_match(&self) -> bool {
+ self.repr().is_match()
+ }
+
+ pub(crate) fn is_from_word(&self) -> bool {
+ self.repr().is_from_word()
+ }
+
+ pub(crate) fn look_have(&mut self) -> &mut LookSet {
+ LookSet::from_repr_mut(&mut self.repr[1])
+ }
+
+ pub(crate) fn look_need(&mut self) -> &mut LookSet {
+ LookSet::from_repr_mut(&mut self.repr[2])
+ }
+
+ pub(crate) fn add_nfa_state_id(&mut self, sid: StateID) {
+ ReprVec(&mut self.repr)
+ .add_nfa_state_id(&mut self.prev_nfa_state_id, sid)
+ }
+
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.repr.len()
+ }
+
+ pub(crate) fn as_bytes(&self) -> &[u8] {
+ &self.repr
+ }
+
+ fn repr(&self) -> Repr<'_> {
+ Repr(&self.repr)
+ }
+
+ fn repr_vec(&mut self) -> ReprVec<'_> {
+ ReprVec(&mut self.repr)
+ }
+}
+
+/// Repr is a read-only view into the representation of a DFA state.
+///
+/// Primarily, a Repr is how we achieve DRY: we implement decoding the format
+/// in one place, and then use a Repr to implement the various methods on the
+/// public state types.
+///
+/// The format is as follows:
+///
+/// The first three bytes correspond to bitsets.
+///
+/// Byte 0 is a bitset corresponding to miscellaneous flags associated with the
+/// state. Bit 0 is set to 1 if the state is a match state. Bit 1 is set to 1
+/// if the state has pattern IDs explicitly written to it. (This is a flag that
+/// is not meant to be set by determinization, but rather, is used as part of
+/// an internal space-saving optimization.) Bit 2 is set to 1 if the state was
+/// generated by a transition over a "word" byte. (Callers may not always set
+/// this. For example, if the NFA has no word boundary assertion, then needing
+/// to track whether a state came from a word byte or not is superfluous and
+/// wasteful.)
+///
+/// Byte 1 corresponds to the look-behind assertions that were satisfied by
+/// the transition that created this state. This generally only includes the
+/// StartLine and StartText assertions. (Look-ahead assertions are not tracked
+/// as part of states. Instead, these are applied by re-computing the epsilon
+/// closure of a state when computing the transition function. See `next` in
+/// the parent module.)
+///
+/// Byte 2 corresponds to the set of look-around assertions (including both
+/// look-behind and look-ahead) that appear somewhere in this state's set of
+/// NFA state IDs. This is used to determine whether this state's epsilon
+/// closure should be re-computed when computing the transition function.
+/// Namely, look-around assertions are "just" conditional epsilon transitions,
+/// so if there are new assertions available when computing the transition
+/// function, we should only re-compute the epsilon closure if those new
+/// assertions are relevant to this particular state.
+///
+/// Bytes 3..7 correspond to a 32-bit native-endian encoded integer
+/// corresponding to the number of patterns encoded in this state. If the state
+/// is not a match state (byte 0 bit 0 is 0) or if it's only pattern ID is
+/// PatternID::ZERO, then no integer is encoded at this position. Instead, byte
+/// offset 3 is the position at which the first NFA state ID is encoded.
+///
+/// For a match state with at least one non-ZERO pattern ID, the next bytes
+/// correspond to a sequence of 32-bit native endian encoded integers that
+/// represent each pattern ID, in order, that this match state represents.
+///
+/// After the pattern IDs (if any), NFA state IDs are delta encoded as
+/// varints.[1] The first NFA state ID is encoded as itself, and each
+/// subsequent NFA state ID is encoded as the difference between itself and the
+/// previous NFA state ID.
+///
+/// [1] - https://developers.google.com/protocol-buffers/docs/encoding#varints
+struct Repr<'a>(&'a [u8]);
+
+impl<'a> Repr<'a> {
+ /// Returns true if and only if this is a match state.
+ ///
+ /// If callers have added pattern IDs to this state, then callers MUST set
+ /// this state as a match state explicitly. However, as a special case,
+ /// states that are marked as match states but with no pattern IDs, then
+ /// the state is treated as if it had a single pattern ID equivalent to
+ /// PatternID::ZERO.
+ fn is_match(&self) -> bool {
+ self.0[0] & (1 << 0) > 0
+ }
+
+ /// Returns true if and only if this state has had at least one pattern
+ /// ID added to it.
+ ///
+ /// This is an internal-only flag that permits the representation to save
+ /// space in the common case of an NFA with one pattern in it. In that
+ /// case, a match state can only ever have exactly one pattern ID:
+ /// PatternID::ZERO. So there's no need to represent it.
+ fn has_pattern_ids(&self) -> bool {
+ self.0[0] & (1 << 1) > 0
+ }
+
+ /// Returns true if and only if this state is marked as having been created
+ /// from a transition over a word byte. This is useful for checking whether
+ /// a word boundary assertion is true or not, which requires look-behind
+ /// (whether the current state came from a word byte or not) and look-ahead
+ /// (whether the transition byte is a word byte or not).
+ ///
+ /// Since states with this set are distinct from states that don't have
+ /// this set (even if they are otherwise equivalent), callers should not
+ /// set this assertion unless the underlying NFA has at least one word
+ /// boundary assertion somewhere. Otherwise, a superfluous number of states
+ /// may be created.
+ fn is_from_word(&self) -> bool {
+ self.0[0] & (1 << 2) > 0
+ }
+
+ /// The set of look-behind assertions that were true in the transition that
+ /// created this state.
+ ///
+ /// Generally, this should be empty if 'look_need' is empty, since there is
+ /// no reason to track which look-behind assertions are true if the state
+ /// has no conditional epsilon transitions.
+ ///
+ /// Satisfied look-ahead assertions are not tracked in states. Instead,
+ /// these are re-computed on demand via epsilon closure when computing the
+ /// transition function.
+ fn look_have(&self) -> LookSet {
+ LookSet::from_repr(self.0[1])
+ }
+
+ /// The set of look-around (both behind and ahead) assertions that appear
+ /// at least once in this state's set of NFA states.
+ ///
+ /// This is used to determine whether the epsilon closure needs to be
+ /// re-computed when computing the transition function. Namely, if the
+ /// state has no conditional epsilon transitions, then there is no need
+ /// to re-compute the epsilon closure.
+ fn look_need(&self) -> LookSet {
+ LookSet::from_repr(self.0[2])
+ }
+
+ /// Returns the total number of match pattern IDs in this state.
+ ///
+ /// If this state is not a match state, then this always returns 0.
+ fn match_count(&self) -> usize {
+ if !self.is_match() {
+ return 0;
+ } else if !self.has_pattern_ids() {
+ 1
+ } else {
+ self.encoded_pattern_count()
+ }
+ }
+
+ /// Returns the pattern ID for this match state at the given index.
+ ///
+ /// If the given index is greater than or equal to `match_count()` for this
+ /// state, then this could panic or return incorrect results.
+ fn match_pattern(&self, index: usize) -> PatternID {
+ if !self.has_pattern_ids() {
+ PatternID::ZERO
+ } else {
+ let offset = 7 + index * PatternID::SIZE;
+ // This is OK since we only ever serialize valid PatternIDs to
+ // states.
+ bytes::read_pattern_id_unchecked(&self.0[offset..]).0
+ }
+ }
+
+ /// Returns a copy of all match pattern IDs in this state. If this state
+ /// is not a match state, then this returns None.
+ fn match_pattern_ids(&self) -> Option<Vec<PatternID>> {
+ if !self.is_match() {
+ return None;
+ }
+ let mut pids = alloc::vec![];
+ self.iter_match_pattern_ids(|pid| pids.push(pid));
+ Some(pids)
+ }
+
+ /// Calls the given function on every pattern ID in this state.
+ fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, mut f: F) {
+ if !self.is_match() {
+ return;
+ }
+ // As an optimization for a very common case, when this is a match
+ // state for an NFA with only one pattern, we don't actually write the
+ // pattern ID to the state representation. Instead, we know it must
+ // be there since it is the only possible choice.
+ if !self.has_pattern_ids() {
+ f(PatternID::ZERO);
+ return;
+ }
+ let mut pids = &self.0[7..self.pattern_offset_end()];
+ while !pids.is_empty() {
+ let pid = bytes::read_u32(pids);
+ pids = &pids[PatternID::SIZE..];
+ // This is OK since we only ever serialize valid PatternIDs to
+ // states. And since pattern IDs can never exceed a usize, the
+ // unwrap is OK.
+ f(PatternID::new_unchecked(usize::try_from(pid).unwrap()));
+ }
+ }
+
+ /// Calls the given function on every NFA state ID in this state.
+ fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, mut f: F) {
+ let mut sids = &self.0[self.pattern_offset_end()..];
+ let mut prev = 0i32;
+ while !sids.is_empty() {
+ let (delta, nr) = read_vari32(sids);
+ sids = &sids[nr..];
+ let sid = prev + delta;
+ prev = sid;
+ // This is OK since we only ever serialize valid StateIDs to
+ // states. And since state IDs can never exceed an isize, they must
+ // always be able to fit into a usize, and thus cast is OK.
+ f(StateID::new_unchecked(sid as usize))
+ }
+ }
+
+ /// Returns the offset into this state's representation where the pattern
+ /// IDs end and the NFA state IDs begin.
+ fn pattern_offset_end(&self) -> usize {
+ let encoded = self.encoded_pattern_count();
+ if encoded == 0 {
+ return 3;
+ }
+ // This arithmetic is OK since we were able to address this many bytes
+ // when writing to the state, thus, it must fit into a usize.
+ encoded.checked_mul(4).unwrap().checked_add(7).unwrap()
+ }
+
+ /// Returns the total number of *encoded* pattern IDs in this state.
+ ///
+ /// This may return 0 even when this is a match state, since the pattern
+ /// ID `PatternID::ZERO` is not encoded when it's the only pattern ID in
+ /// the match state (the overwhelming common case).
+ fn encoded_pattern_count(&self) -> usize {
+ if !self.has_pattern_ids() {
+ return 0;
+ }
+ // This unwrap is OK since the total number of patterns is always
+ // guaranteed to fit into a usize.
+ usize::try_from(bytes::read_u32(&self.0[3..7])).unwrap()
+ }
+}
+
+impl<'a> core::fmt::Debug for Repr<'a> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut nfa_ids = alloc::vec![];
+ self.iter_nfa_state_ids(|sid| nfa_ids.push(sid));
+ f.debug_struct("Repr")
+ .field("is_match", &self.is_match())
+ .field("is_from_word", &self.is_from_word())
+ .field("look_have", &self.look_have())
+ .field("look_need", &self.look_need())
+ .field("match_pattern_ids", &self.match_pattern_ids())
+ .field("nfa_state_ids", &nfa_ids)
+ .finish()
+ }
+}
+
+/// ReprVec is a write-only view into the representation of a DFA state.
+///
+/// See Repr for more details on the purpose of this type and also the format.
+///
+/// Note that not all possible combinations of methods may be called. This is
+/// precisely what the various StateBuilder types encapsulate: they only
+/// permit valid combinations via Rust's linear typing.
+struct ReprVec<'a>(&'a mut Vec<u8>);
+
+impl<'a> ReprVec<'a> {
+ /// Set this state as a match state.
+ ///
+ /// This should not be exposed explicitly outside of this module. It is
+ /// set automatically when a pattern ID is added.
+ fn set_is_match(&mut self) {
+ self.0[0] |= 1 << 0;
+ }
+
+ /// Set that this state has pattern IDs explicitly written to it.
+ ///
+ /// This should not be exposed explicitly outside of this module. This is
+ /// used internally as a space saving optimization. Namely, if the state
+ /// is a match state but does not have any pattern IDs written to it,
+ /// then it is automatically inferred to have a pattern ID of ZERO.
+ fn set_has_pattern_ids(&mut self) {
+ self.0[0] |= 1 << 1;
+ }
+
+ /// Set this state as being built from a transition over a word byte.
+ ///
+ /// Setting this is only necessary when one needs to deal with word
+ /// boundary assertions. Therefore, if the underlying NFA has no word
+ /// boundary assertions, callers should not set this.
+ fn set_is_from_word(&mut self) {
+ self.0[0] |= 1 << 2;
+ }
+
+ /// Return a mutable reference to the 'look_have' assertion set.
+ fn look_have_mut(&mut self) -> &mut LookSet {
+ LookSet::from_repr_mut(&mut self.0[1])
+ }
+
+ /// Return a mutable reference to the 'look_need' assertion set.
+ fn look_need_mut(&mut self) -> &mut LookSet {
+ LookSet::from_repr_mut(&mut self.0[2])
+ }
+
+ /// Add a pattern ID to this state. All match states must have at least
+ /// one pattern ID associated with it.
+ ///
+ /// Callers must never add duplicative pattern IDs.
+ ///
+ /// The order in which patterns are added must correspond to the order
+ /// in which patterns are reported as matches.
+ fn add_match_pattern_id(&mut self, pid: PatternID) {
+ // As a (somewhat small) space saving optimization, in the case where
+ // a matching state has exactly one pattern ID, PatternID::ZERO, we do
+ // not write either the pattern ID or the number of patterns encoded.
+ // Instead, all we do is set the 'is_match' bit on this state. Overall,
+ // this saves 8 bytes per match state for the overwhelming majority of
+ // match states.
+ //
+ // In order to know whether pattern IDs need to be explicitly read or
+ // not, we use another internal-only bit, 'has_pattern_ids', to
+ // indicate whether they have been explicitly written or not.
+ if !self.repr().has_pattern_ids() {
+ if pid == PatternID::ZERO {
+ self.set_is_match();
+ return;
+ }
+ // Make room for 'close_match_pattern_ids' to write the total
+ // number of pattern IDs written.
+ self.0.extend(core::iter::repeat(0).take(PatternID::SIZE));
+ self.set_has_pattern_ids();
+ // If this was already a match state, then the only way that's
+ // possible when the state doesn't have pattern IDs is if
+ // PatternID::ZERO was added by the caller previously. In this
+ // case, we are now adding a non-ZERO pattern ID after it, in
+ // which case, we want to make sure to represent ZERO explicitly
+ // now.
+ if self.repr().is_match() {
+ write_u32(self.0, 0)
+ } else {
+ // Otherwise, just make sure the 'is_match' bit is set.
+ self.set_is_match();
+ }
+ }
+ write_u32(self.0, pid.as_u32());
+ }
+
+ /// Indicate that no more pattern IDs will be added to this state.
+ ///
+ /// Once this is called, callers must not call it or 'add_match_pattern_id'
+ /// again.
+ ///
+ /// This should not be exposed explicitly outside of this module. It
+ /// should be called only when converting a StateBuilderMatches into a
+ /// StateBuilderNFA.
+ fn close_match_pattern_ids(&mut self) {
+ // If we never wrote any pattern IDs, then there's nothing to do here.
+ if !self.repr().has_pattern_ids() {
+ return;
+ }
+ let patsize = PatternID::SIZE;
+ let pattern_bytes = self.0.len() - 7;
+ // Every pattern ID uses 4 bytes, so number of bytes should be
+ // divisible by 4.
+ assert_eq!(pattern_bytes % patsize, 0);
+ // This unwrap is OK since we are guaranteed that the maximum number
+ // of possible patterns fits into a u32.
+ let count32 = u32::try_from(pattern_bytes / patsize).unwrap();
+ bytes::NE::write_u32(count32, &mut self.0[3..7]);
+ }
+
+ /// Add an NFA state ID to this state. The order in which NFA states are
+ /// added matters. It is the caller's responsibility to ensure that
+ /// duplicate NFA state IDs are not added.
+ fn add_nfa_state_id(&mut self, prev: &mut StateID, sid: StateID) {
+ let delta = sid.as_i32() - prev.as_i32();
+ write_vari32(self.0, delta);
+ *prev = sid;
+ }
+
+ /// Return a read-only view of this state's representation.
+ fn repr(&self) -> Repr<'_> {
+ Repr(self.0.as_slice())
+ }
+}
+
+/// Write a signed 32-bit integer using zig-zag encoding.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn write_vari32(data: &mut Vec<u8>, n: i32) {
+ let mut un = (n as u32) << 1;
+ if n < 0 {
+ un = !un;
+ }
+ write_varu32(data, un)
+}
+
+/// Read a signed 32-bit integer using zig-zag encoding. Also, return the
+/// number of bytes read.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_vari32(data: &[u8]) -> (i32, usize) {
+ let (un, i) = read_varu32(data);
+ let mut n = (un >> 1) as i32;
+ if un & 1 != 0 {
+ n = !n;
+ }
+ (n, i)
+}
+
+/// Write an unsigned 32-bit integer as a varint. In essence, `n` is written
+/// as a sequence of bytes where all bytes except for the last one have the
+/// most significant bit set. The least significant 7 bits correspond to the
+/// actual bits of `n`. So in the worst case, a varint uses 5 bytes, but in
+/// very common cases, it uses fewer than 4.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
+ while n >= 0b1000_0000 {
+ data.push((n as u8) | 0b1000_0000);
+ n >>= 7;
+ }
+ data.push(n as u8);
+}
+
+/// Read an unsigned 32-bit varint. Also, return the number of bytes read.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_varu32(data: &[u8]) -> (u32, usize) {
+ // N.B. We can assume correctness here since we know that all varuints are
+ // written with write_varu32. Hence, the 'as' uses and unchecked arithmetic
+ // is all okay.
+ let mut n: u32 = 0;
+ let mut shift: u32 = 0;
+ for (i, &b) in data.iter().enumerate() {
+ if b < 0b1000_0000 {
+ return (n | ((b as u32) << shift), i + 1);
+ }
+ n |= ((b as u32) & 0b0111_1111) << shift;
+ shift += 7;
+ }
+ (0, 0)
+}
+
+/// Push a native-endian encoded `n` on to `dst`.
+fn write_u32(dst: &mut Vec<u8>, n: u32) {
+ use crate::util::bytes::{Endian, NE};
+
+ let start = dst.len();
+ dst.extend(core::iter::repeat(0).take(mem::size_of::<u32>()));
+ NE::write_u32(n, &mut dst[start..]);
+}
+
+#[cfg(test)]
+mod tests {
+ use alloc::vec;
+
+ use quickcheck::quickcheck;
+
+ use super::*;
+
+ quickcheck! {
+ fn prop_state_read_write_nfa_state_ids(sids: Vec<StateID>) -> bool {
+ // Builders states do not permit duplicate IDs.
+ let sids = dedup_state_ids(sids);
+
+ let mut b = StateBuilderEmpty::new().into_matches().into_nfa();
+ for &sid in &sids {
+ b.add_nfa_state_id(sid);
+ }
+ let s = b.to_state();
+ let mut got = vec![];
+ s.iter_nfa_state_ids(|sid| got.push(sid));
+ got == sids
+ }
+
+ fn prop_state_read_write_pattern_ids(pids: Vec<PatternID>) -> bool {
+ // Builders states do not permit duplicate IDs.
+ let pids = dedup_pattern_ids(pids);
+
+ let mut b = StateBuilderEmpty::new().into_matches();
+ for &pid in &pids {
+ b.add_match_pattern_id(pid);
+ }
+ let s = b.into_nfa().to_state();
+ let mut got = vec![];
+ s.iter_match_pattern_ids(|pid| got.push(pid));
+ got == pids
+ }
+
+ fn prop_state_read_write_nfa_state_and_pattern_ids(
+ sids: Vec<StateID>,
+ pids: Vec<PatternID>
+ ) -> bool {
+ // Builders states do not permit duplicate IDs.
+ let sids = dedup_state_ids(sids);
+ let pids = dedup_pattern_ids(pids);
+
+ let mut b = StateBuilderEmpty::new().into_matches();
+ for &pid in &pids {
+ b.add_match_pattern_id(pid);
+ }
+
+ let mut b = b.into_nfa();
+ for &sid in &sids {
+ b.add_nfa_state_id(sid);
+ }
+
+ let s = b.to_state();
+ let mut got_pids = vec![];
+ s.iter_match_pattern_ids(|pid| got_pids.push(pid));
+ let mut got_sids = vec![];
+ s.iter_nfa_state_ids(|sid| got_sids.push(sid));
+ got_pids == pids && got_sids == sids
+ }
+
+ fn prop_read_write_varu32(n: u32) -> bool {
+ let mut buf = vec![];
+ write_varu32(&mut buf, n);
+ let (got, nread) = read_varu32(&buf);
+ nread == buf.len() && got == n
+ }
+
+ fn prop_read_write_vari32(n: i32) -> bool {
+ let mut buf = vec![];
+ write_vari32(&mut buf, n);
+ let (got, nread) = read_vari32(&buf);
+ nread == buf.len() && got == n
+ }
+ }
+
+ fn dedup_state_ids(sids: Vec<StateID>) -> Vec<StateID> {
+ let mut set = alloc::collections::BTreeSet::new();
+ let mut deduped = vec![];
+ for sid in sids {
+ if set.contains(&sid) {
+ continue;
+ }
+ set.insert(sid);
+ deduped.push(sid);
+ }
+ deduped
+ }
+
+ fn dedup_pattern_ids(pids: Vec<PatternID>) -> Vec<PatternID> {
+ let mut set = alloc::collections::BTreeSet::new();
+ let mut deduped = vec![];
+ for pid in pids {
+ if set.contains(&pid) {
+ continue;
+ }
+ set.insert(pid);
+ deduped.push(pid);
+ }
+ deduped
+ }
+}
diff --git a/vendor/regex-automata/src/util/id.rs b/vendor/regex-automata/src/util/id.rs
new file mode 100644
index 000000000..70bf0a93b
--- /dev/null
+++ b/vendor/regex-automata/src/util/id.rs
@@ -0,0 +1,608 @@
+/*!
+Type definitions for identifier types.
+
+A [`StateID`] represents the possible set of identifiers used in regex engine
+implementations in this crate. For example, they are used to identify both NFA
+and DFA states.
+
+A [`PatternID`] represents the possible set of identifiers for patterns. All
+regex engine implementations in this crate support searching for multiple
+patterns simultaneously. A `PatternID` is how each pattern is uniquely
+identified for a particular instance of a regex engine. Namely, a pattern is
+assigned an auto-incrementing integer, starting at `0`, based on the order of
+patterns supplied during the construction of the regex engine.
+
+These identifier types represent a way for this crate to make correctness
+guarantees around the possible set of values that a `StateID` or a `PatternID`
+might represent. Similarly, they also provide a way of constraining the size of
+these identifiers to reduce space usage while still guaranteeing that all such
+identifiers are repsentable by a `usize` for the current target.
+
+Moreover, the identifier types clamp the range of permissible values to a range
+that is typically smaller than its internal representation. (With the maximum
+value being, e.g., `StateID::MAX`.) Users of these types may not rely this
+clamping for the purpose of memory safety. Users may, however, rely on these
+invariants to avoid panics or other types of logic bugs.
+*/
+
+// Continuing from the above comment about correctness guarantees, an example
+// of a way in which we use the guarantees on these types is delta encoding.
+// Namely, we require that IDs can be at most 2^31 - 2, which means the
+// difference between any two IDs is always representable as an i32.
+
+use core::{
+ convert::{Infallible, TryFrom},
+ mem, ops,
+};
+
+#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
+/// An identifier for a regex pattern.
+///
+/// The identifier for a pattern corresponds to its relative position among
+/// other patterns in a single finite state machine. Namely, when building
+/// a multi-pattern regex engine, one must supply a sequence of patterns to
+/// match. The position (starting at 0) of each pattern in that sequence
+/// represents its identifier. This identifier is in turn used to identify and
+/// report matches of that pattern in various APIs.
+///
+/// A pattern ID is guaranteed to be representable by a `usize`. Similarly,
+/// the number of patterns in any regex engine in this crate is guaranteed to
+/// be representable by a `usize`. This applies to regex engines that have
+/// been deserialized; a deserialization error will be returned if it contains
+/// pattern IDs that violate these requirements in your current environment.
+///
+/// For extra convenience in some cases, this type also guarantees that all
+/// IDs can fit into an `i32` and an `isize` without overflowing.
+///
+/// # Representation
+///
+/// This type is always represented internally by a `u32` and is marked as
+/// `repr(transparent)`. Thus, this type always has the same representation as
+/// a `u32`.
+///
+/// # Indexing
+///
+/// For convenience, callers may use a `PatternID` to index slices.
+///
+/// # Safety
+///
+/// While a `PatternID` is meant to guarantee that its value fits into `usize`
+/// (while using a possibly smaller representation than `usize` on some
+/// targets), callers must not rely on this property for safety. Callers may
+/// choose to rely on this property for correctness however.
+#[repr(transparent)]
+#[derive(
+ Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
+)]
+pub struct PatternID(u32);
+
+impl PatternID {
+ /// The maximum pattern ID value, represented as a `usize`.
+ #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+ pub const MAX: PatternID =
+ PatternID::new_unchecked(core::i32::MAX as usize - 1);
+
+ /// The maximum pattern ID value, represented as a `usize`.
+ #[cfg(target_pointer_width = "16")]
+ pub const MAX: PatternID = PatternID::new_unchecked(core::isize::MAX - 1);
+
+ /// The total number of patterns that are allowed in any single regex
+ /// engine.
+ pub const LIMIT: usize = PatternID::MAX.as_usize() + 1;
+
+ /// The zero pattern ID value.
+ pub const ZERO: PatternID = PatternID::new_unchecked(0);
+
+ /// The number of bytes that a single `PatternID` uses in memory.
+ pub const SIZE: usize = core::mem::size_of::<PatternID>();
+
+ /// Create a new pattern ID.
+ ///
+ /// If the given identifier exceeds [`PatternID::MAX`], then this returns
+ /// an error.
+ #[inline]
+ pub fn new(id: usize) -> Result<PatternID, PatternIDError> {
+ PatternID::try_from(id)
+ }
+
+ /// Create a new pattern ID without checking whether the given value
+ /// exceeds [`PatternID::MAX`].
+ ///
+ /// While this is unchecked, providing an incorrect value must never
+ /// sacrifice memory safety, as documented above.
+ #[inline]
+ pub const fn new_unchecked(id: usize) -> PatternID {
+ PatternID(id as u32)
+ }
+
+ /// Like [`PatternID::new`], but panics if the given ID is not valid.
+ #[inline]
+ pub fn must(id: usize) -> PatternID {
+ PatternID::new(id).unwrap()
+ }
+
+ /// Return this pattern ID as a `usize`.
+ #[inline]
+ pub const fn as_usize(&self) -> usize {
+ self.0 as usize
+ }
+
+ /// Return the internal u32 of this pattern ID.
+ #[inline]
+ pub const fn as_u32(&self) -> u32 {
+ self.0
+ }
+
+ /// Return the internal u32 of this pattern ID represented as an i32.
+ ///
+ /// This is guaranteed to never overflow an `i32`.
+ #[inline]
+ pub const fn as_i32(&self) -> i32 {
+ self.0 as i32
+ }
+
+ /// Returns one more than this pattern ID as a usize.
+ ///
+ /// Since a pattern ID has constraints on its maximum value, adding `1` to
+ /// it will always fit in a `usize` (and a `u32`).
+ #[inline]
+ pub fn one_more(&self) -> usize {
+ self.as_usize().checked_add(1).unwrap()
+ }
+
+ /// Decode this pattern ID from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+ /// If the decoded integer is not representable as a pattern ID for the
+ /// current target, then this returns an error.
+ #[inline]
+ pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<PatternID, PatternIDError> {
+ let id = u32::from_ne_bytes(bytes);
+ if id > PatternID::MAX.as_u32() {
+ return Err(PatternIDError { attempted: id as u64 });
+ }
+ Ok(PatternID::new_unchecked(id as usize))
+ }
+
+ /// Decode this pattern ID from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+ /// This is analogous to [`PatternID::new_unchecked`] in that is does not
+ /// check whether the decoded integer is representable as a pattern ID.
+ #[inline]
+ pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> PatternID {
+ PatternID::new_unchecked(u32::from_ne_bytes(bytes) as usize)
+ }
+
+ /// Return the underlying pattern ID integer as raw bytes in native endian
+ /// format.
+ #[inline]
+ pub fn to_ne_bytes(&self) -> [u8; 4] {
+ self.0.to_ne_bytes()
+ }
+
+ /// Returns an iterator over all pattern IDs from 0 up to and not including
+ /// the given length.
+ ///
+ /// If the given length exceeds [`PatternID::LIMIT`], then this panics.
+ #[cfg(feature = "alloc")]
+ pub(crate) fn iter(len: usize) -> PatternIDIter {
+ PatternIDIter::new(len)
+ }
+}
+
+/// This error occurs when a pattern ID could not be constructed.
+///
+/// This occurs when given an integer exceeding the maximum pattern ID value.
+///
+/// When the `std` feature is enabled, this implements the `Error` trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct PatternIDError {
+ attempted: u64,
+}
+
+impl PatternIDError {
+ /// Returns the value that failed to constructed a pattern ID.
+ pub fn attempted(&self) -> u64 {
+ self.attempted
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for PatternIDError {}
+
+impl core::fmt::Display for PatternIDError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "failed to create PatternID from {:?}, which exceeds {:?}",
+ self.attempted(),
+ PatternID::MAX,
+ )
+ }
+}
+
+/// An identifier for a state in a regex engine.
+///
+/// A state ID is guaranteed to be representable by a `usize`. Similarly, the
+/// number of states in any regex engine in this crate is guaranteed to be
+/// representable by a `usize`. This applies to regex engines that have been
+/// deserialized; a deserialization error will be returned if it contains state
+/// IDs that violate these requirements in your current environment.
+///
+/// For extra convenience in some cases, this type also guarantees that all
+/// IDs can fit into an `i32` and an `isize` without overflowing.
+///
+/// # Representation
+///
+/// This type is always represented internally by a `u32` and is marked as
+/// `repr(transparent)`. Thus, this type always has the same representation as
+/// a `u32`.
+///
+/// # Indexing
+///
+/// For convenience, callers may use a `StateID` to index slices.
+///
+/// # Safety
+///
+/// While a `StateID` is meant to guarantee that its value fits into `usize`
+/// (while using a possibly smaller representation than `usize` on some
+/// targets), callers must not rely on this property for safety. Callers may
+/// choose to rely on this property for correctness however.
+#[repr(transparent)]
+#[derive(
+ Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
+)]
+pub struct StateID(u32);
+
+impl StateID {
+ /// The maximum state ID value.
+ #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+ pub const MAX: StateID =
+ StateID::new_unchecked(core::i32::MAX as usize - 1);
+
+ /// The maximum state ID value.
+ #[cfg(target_pointer_width = "16")]
+ pub const MAX: StateID = StateID::new_unchecked(core::isize::MAX - 1);
+
+ /// The total number of states that are allowed in any single regex
+ /// engine, represented as a `usize`.
+ pub const LIMIT: usize = StateID::MAX.as_usize() + 1;
+
+ /// The zero state ID value.
+ pub const ZERO: StateID = StateID::new_unchecked(0);
+
+ /// The number of bytes that a single `StateID` uses in memory.
+ pub const SIZE: usize = core::mem::size_of::<StateID>();
+
+ /// Create a new state ID.
+ ///
+ /// If the given identifier exceeds [`StateID::MAX`], then this returns
+ /// an error.
+ #[inline]
+ pub fn new(id: usize) -> Result<StateID, StateIDError> {
+ StateID::try_from(id)
+ }
+
+ /// Create a new state ID without checking whether the given value
+ /// exceeds [`StateID::MAX`].
+ ///
+ /// While this is unchecked, providing an incorrect value must never
+ /// sacrifice memory safety, as documented above.
+ #[inline]
+ pub const fn new_unchecked(id: usize) -> StateID {
+ StateID(id as u32)
+ }
+
+ /// Like [`StateID::new`], but panics if the given ID is not valid.
+ #[inline]
+ pub fn must(id: usize) -> StateID {
+ StateID::new(id).unwrap()
+ }
+
+ /// Return this state ID as a `usize`.
+ #[inline]
+ pub const fn as_usize(&self) -> usize {
+ self.0 as usize
+ }
+
+ /// Return the internal u32 of this state ID.
+ #[inline]
+ pub const fn as_u32(&self) -> u32 {
+ self.0
+ }
+
+ /// Return the internal u32 of this pattern ID represented as an i32.
+ ///
+ /// This is guaranteed to never overflow an `i32`.
+ #[inline]
+ pub const fn as_i32(&self) -> i32 {
+ self.0 as i32
+ }
+
+ /// Returns one more than this state ID as a usize.
+ ///
+ /// Since a state ID has constraints on its maximum value, adding `1` to
+ /// it will always fit in a `usize` (and a `u32`).
+ #[inline]
+ pub fn one_more(&self) -> usize {
+ self.as_usize().checked_add(1).unwrap()
+ }
+
+ /// Decode this state ID from the bytes given using the native endian byte
+ /// order for the current target.
+ ///
+ /// If the decoded integer is not representable as a state ID for the
+ /// current target, then this returns an error.
+ #[inline]
+ pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<StateID, StateIDError> {
+ let id = u32::from_ne_bytes(bytes);
+ if id > StateID::MAX.as_u32() {
+ return Err(StateIDError { attempted: id as u64 });
+ }
+ Ok(StateID::new_unchecked(id as usize))
+ }
+
+ /// Decode this state ID from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+ /// This is analogous to [`StateID::new_unchecked`] in that is does not
+ /// check whether the decoded integer is representable as a state ID.
+ #[inline]
+ pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> StateID {
+ StateID::new_unchecked(u32::from_ne_bytes(bytes) as usize)
+ }
+
+ /// Return the underlying state ID integer as raw bytes in native endian
+ /// format.
+ #[inline]
+ pub fn to_ne_bytes(&self) -> [u8; 4] {
+ self.0.to_ne_bytes()
+ }
+
+ /// Returns an iterator over all state IDs from 0 up to and not including
+ /// the given length.
+ ///
+ /// If the given length exceeds [`StateID::LIMIT`], then this panics.
+ #[cfg(feature = "alloc")]
+ pub(crate) fn iter(len: usize) -> StateIDIter {
+ StateIDIter::new(len)
+ }
+}
+
+/// This error occurs when a state ID could not be constructed.
+///
+/// This occurs when given an integer exceeding the maximum state ID value.
+///
+/// When the `std` feature is enabled, this implements the `Error` trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct StateIDError {
+ attempted: u64,
+}
+
+impl StateIDError {
+ /// Returns the value that failed to constructed a state ID.
+ pub fn attempted(&self) -> u64 {
+ self.attempted
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for StateIDError {}
+
+impl core::fmt::Display for StateIDError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "failed to create StateID from {:?}, which exceeds {:?}",
+ self.attempted(),
+ StateID::MAX,
+ )
+ }
+}
+
+/// A macro for defining exactly identical (modulo names) impls for ID types.
+macro_rules! impls {
+ ($ty:ident, $tyerr:ident, $tyiter:ident) => {
+ #[derive(Clone, Debug)]
+ pub(crate) struct $tyiter {
+ rng: ops::Range<usize>,
+ }
+
+ impl $tyiter {
+ #[cfg(feature = "alloc")]
+ fn new(len: usize) -> $tyiter {
+ assert!(
+ len <= $ty::LIMIT,
+ "cannot create iterator with IDs when number of \
+ elements exceed {:?}",
+ $ty::LIMIT,
+ );
+ $tyiter { rng: 0..len }
+ }
+ }
+
+ impl Iterator for $tyiter {
+ type Item = $ty;
+
+ fn next(&mut self) -> Option<$ty> {
+ if self.rng.start >= self.rng.end {
+ return None;
+ }
+ let next_id = self.rng.start + 1;
+ let id = mem::replace(&mut self.rng.start, next_id);
+ // new_unchecked is OK since we asserted that the number of
+ // elements in this iterator will fit in an ID at construction.
+ Some($ty::new_unchecked(id))
+ }
+ }
+
+ impl<T> core::ops::Index<$ty> for [T] {
+ type Output = T;
+
+ #[inline]
+ fn index(&self, index: $ty) -> &T {
+ &self[index.as_usize()]
+ }
+ }
+
+ impl<T> core::ops::IndexMut<$ty> for [T] {
+ #[inline]
+ fn index_mut(&mut self, index: $ty) -> &mut T {
+ &mut self[index.as_usize()]
+ }
+ }
+
+ #[cfg(feature = "alloc")]
+ impl<T> core::ops::Index<$ty> for Vec<T> {
+ type Output = T;
+
+ #[inline]
+ fn index(&self, index: $ty) -> &T {
+ &self[index.as_usize()]
+ }
+ }
+
+ #[cfg(feature = "alloc")]
+ impl<T> core::ops::IndexMut<$ty> for Vec<T> {
+ #[inline]
+ fn index_mut(&mut self, index: $ty) -> &mut T {
+ &mut self[index.as_usize()]
+ }
+ }
+
+ impl TryFrom<usize> for $ty {
+ type Error = $tyerr;
+
+ fn try_from(id: usize) -> Result<$ty, $tyerr> {
+ if id > $ty::MAX.as_usize() {
+ return Err($tyerr { attempted: id as u64 });
+ }
+ Ok($ty::new_unchecked(id))
+ }
+ }
+
+ impl TryFrom<u8> for $ty {
+ type Error = Infallible;
+
+ fn try_from(id: u8) -> Result<$ty, Infallible> {
+ Ok($ty::new_unchecked(id as usize))
+ }
+ }
+
+ impl TryFrom<u16> for $ty {
+ type Error = $tyerr;
+
+ fn try_from(id: u16) -> Result<$ty, $tyerr> {
+ if id as u32 > $ty::MAX.as_u32() {
+ return Err($tyerr { attempted: id as u64 });
+ }
+ Ok($ty::new_unchecked(id as usize))
+ }
+ }
+
+ impl TryFrom<u32> for $ty {
+ type Error = $tyerr;
+
+ fn try_from(id: u32) -> Result<$ty, $tyerr> {
+ if id > $ty::MAX.as_u32() {
+ return Err($tyerr { attempted: id as u64 });
+ }
+ Ok($ty::new_unchecked(id as usize))
+ }
+ }
+
+ impl TryFrom<u64> for $ty {
+ type Error = $tyerr;
+
+ fn try_from(id: u64) -> Result<$ty, $tyerr> {
+ if id > $ty::MAX.as_u32() as u64 {
+ return Err($tyerr { attempted: id });
+ }
+ Ok($ty::new_unchecked(id as usize))
+ }
+ }
+
+ #[cfg(test)]
+ impl quickcheck::Arbitrary for $ty {
+ fn arbitrary(gen: &mut quickcheck::Gen) -> $ty {
+ use core::cmp::max;
+
+ let id = max(i32::MIN + 1, i32::arbitrary(gen)).abs();
+ if id > $ty::MAX.as_i32() {
+ $ty::MAX
+ } else {
+ $ty::new(usize::try_from(id).unwrap()).unwrap()
+ }
+ }
+ }
+ };
+}
+
+impls!(PatternID, PatternIDError, PatternIDIter);
+impls!(StateID, StateIDError, StateIDIter);
+
+/// A utility trait that defines a couple of adapters for making it convenient
+/// to access indices as ID types. We require ExactSizeIterator so that
+/// iterator construction can do a single check to make sure the index of each
+/// element is representable by its ID type.
+#[cfg(feature = "alloc")]
+pub(crate) trait IteratorIDExt: Iterator {
+ fn with_pattern_ids(self) -> WithPatternIDIter<Self>
+ where
+ Self: Sized + ExactSizeIterator,
+ {
+ WithPatternIDIter::new(self)
+ }
+
+ fn with_state_ids(self) -> WithStateIDIter<Self>
+ where
+ Self: Sized + ExactSizeIterator,
+ {
+ WithStateIDIter::new(self)
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<I: Iterator> IteratorIDExt for I {}
+
+#[cfg(feature = "alloc")]
+macro_rules! iditer {
+ ($ty:ident, $iterty:ident, $withiterty:ident) => {
+ /// An iterator adapter that is like std::iter::Enumerate, but attaches
+ /// IDs. It requires ExactSizeIterator. At construction, it ensures
+ /// that the index of each element in the iterator is representable in
+ /// the corresponding ID type.
+ #[derive(Clone, Debug)]
+ pub(crate) struct $withiterty<I> {
+ it: I,
+ ids: $iterty,
+ }
+
+ impl<I: Iterator + ExactSizeIterator> $withiterty<I> {
+ fn new(it: I) -> $withiterty<I> {
+ let ids = $ty::iter(it.len());
+ $withiterty { it, ids }
+ }
+ }
+
+ impl<I: Iterator + ExactSizeIterator> Iterator for $withiterty<I> {
+ type Item = ($ty, I::Item);
+
+ fn next(&mut self) -> Option<($ty, I::Item)> {
+ let item = self.it.next()?;
+ // Number of elements in this iterator must match, according
+ // to contract of ExactSizeIterator.
+ let id = self.ids.next().unwrap();
+ Some((id, item))
+ }
+ }
+ };
+}
+
+#[cfg(feature = "alloc")]
+iditer!(PatternID, PatternIDIter, WithPatternIDIter);
+#[cfg(feature = "alloc")]
+iditer!(StateID, StateIDIter, WithStateIDIter);
diff --git a/vendor/regex-automata/src/util/lazy.rs b/vendor/regex-automata/src/util/lazy.rs
new file mode 100644
index 000000000..d8cac6ef4
--- /dev/null
+++ b/vendor/regex-automata/src/util/lazy.rs
@@ -0,0 +1,31 @@
+use core::{
+ cell::Cell,
+ ptr,
+ sync::atomic::{AtomicPtr, Ordering},
+};
+
+use alloc::{boxed::Box, vec::Vec};
+
+#[inline(always)]
+pub(crate) fn get_or_init<T: Send + Sync + 'static>(
+ location: &'static AtomicPtr<T>,
+ init: impl FnOnce() -> T,
+) -> &'static T {
+ let mut ptr = location.load(Ordering::Acquire);
+ if ptr.is_null() {
+ let new_dfa = Box::new(init());
+ ptr = Box::into_raw(new_dfa);
+ let result = location.compare_exchange(
+ ptr::null_mut(),
+ ptr,
+ Ordering::AcqRel,
+ Ordering::Acquire,
+ );
+ if let Err(old) = result {
+ let redundant = unsafe { Box::from_raw(ptr) };
+ drop(redundant);
+ ptr = old;
+ }
+ }
+ unsafe { &*ptr }
+}
diff --git a/vendor/regex-automata/src/util/matchtypes.rs b/vendor/regex-automata/src/util/matchtypes.rs
new file mode 100644
index 000000000..de0fa65bf
--- /dev/null
+++ b/vendor/regex-automata/src/util/matchtypes.rs
@@ -0,0 +1,356 @@
+use crate::util::id::PatternID;
+
+/// The kind of match semantics to use for a DFA.
+///
+/// The default match kind is `LeftmostFirst`.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum MatchKind {
+ /// Report all possible matches.
+ All,
+ /// Report only the leftmost matches. When multiple leftmost matches exist,
+ /// report the match corresponding to the part of the regex that appears
+ /// first in the syntax.
+ LeftmostFirst,
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ #[doc(hidden)]
+ __Nonexhaustive,
+ // There is prior art in RE2 that shows that we should be able to add
+ // LeftmostLongest too. The tricky part of it is supporting ungreedy
+ // repetitions. Instead of treating all NFA states as having equivalent
+ // priority (as in 'All') or treating all NFA states as having distinct
+ // priority based on order (as in 'LeftmostFirst'), we instead group NFA
+ // states into sets, and treat members of each set as having equivalent
+ // priority, but having greater priority than all following members
+ // of different sets.
+ //
+ // However, it's not clear whether it's really worth adding this. After
+ // all, leftmost-longest can be emulated when using literals by using
+ // leftmost-first and sorting the literals by length in descending order.
+ // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will
+ // always match `a` in `ab` when using leftmost-first, but leftmost-longest
+ // would match `ab`.
+}
+
+impl MatchKind {
+ #[cfg(feature = "alloc")]
+ pub(crate) fn continue_past_first_match(&self) -> bool {
+ *self == MatchKind::All
+ }
+}
+
+impl Default for MatchKind {
+ fn default() -> MatchKind {
+ MatchKind::LeftmostFirst
+ }
+}
+
+/// A representation of a match reported by a regex engine.
+///
+/// A match records the start and end offsets of the match in the haystack.
+///
+/// Every match guarantees that `start <= end`.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct Match {
+ /// The start offset of the match, inclusive.
+ start: usize,
+ /// The end offset of the match, exclusive.
+ end: usize,
+}
+
+impl Match {
+ /// Create a new match from a byte offset span.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `end < start`.
+ #[inline]
+ pub fn new(start: usize, end: usize) -> Match {
+ assert!(start <= end);
+ Match { start, end }
+ }
+
+ /// The starting position of the match.
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.start
+ }
+
+ /// The ending position of the match.
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.end
+ }
+
+ /// Returns the match location as a range.
+ #[inline]
+ pub fn range(&self) -> core::ops::Range<usize> {
+ self.start..self.end
+ }
+
+ /// Returns true if and only if this match is empty. That is, when
+ /// `start() == end()`.
+ ///
+ /// An empty match can only be returned when the empty string was among
+ /// the patterns used to build the Aho-Corasick automaton.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.start == self.end
+ }
+}
+
+/// A representation of a match reported by a DFA.
+///
+/// This is called a "half" match because it only includes the end location
+/// (or start location for a reverse match) of a match. This corresponds to the
+/// information that a single DFA scan can report. Getting the other half of
+/// the match requires a second scan with a reversed DFA.
+///
+/// A half match also includes the pattern that matched. The pattern is
+/// identified by an ID, which corresponds to its position (starting from `0`)
+/// relative to other patterns used to construct the corresponding DFA. If only
+/// a single pattern is provided to the DFA, then all matches are guaranteed to
+/// have a pattern ID of `0`.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub struct HalfMatch {
+ /// The pattern ID.
+ pub(crate) pattern: PatternID,
+ /// The offset of the match.
+ ///
+ /// For forward searches, the offset is exclusive. For reverse searches,
+ /// the offset is inclusive.
+ pub(crate) offset: usize,
+}
+
+impl HalfMatch {
+ /// Create a new half match from a pattern ID and a byte offset.
+ #[inline]
+ pub fn new(pattern: PatternID, offset: usize) -> HalfMatch {
+ HalfMatch { pattern, offset }
+ }
+
+ /// Create a new half match from a pattern ID and a byte offset.
+ ///
+ /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a
+ /// [`PatternID`]. This panics if the given `usize` is not representable
+ /// as a `PatternID`.
+ #[inline]
+ pub fn must(pattern: usize, offset: usize) -> HalfMatch {
+ HalfMatch::new(PatternID::new(pattern).unwrap(), offset)
+ }
+
+ /// Returns the ID of the pattern that matched.
+ ///
+ /// The ID of a pattern is derived from the position in which it was
+ /// originally inserted into the corresponding DFA. The first pattern has
+ /// identifier `0`, and each subsequent pattern is `1`, `2` and so on.
+ #[inline]
+ pub fn pattern(&self) -> PatternID {
+ self.pattern
+ }
+
+ /// The position of the match.
+ ///
+ /// If this match was produced by a forward search, then the offset is
+ /// exclusive. If this match was produced by a reverse search, then the
+ /// offset is inclusive.
+ #[inline]
+ pub fn offset(&self) -> usize {
+ self.offset
+ }
+}
+
+/// A representation of a multi match reported by a regex engine.
+///
+/// A multi match has two essential pieces of information: the identifier of
+/// the pattern that matched, along with the start and end offsets of the match
+/// in the haystack.
+///
+/// The pattern is identified by an ID, which corresponds to its position
+/// (starting from `0`) relative to other patterns used to construct the
+/// corresponding regex engine. If only a single pattern is provided, then all
+/// multi matches are guaranteed to have a pattern ID of `0`.
+///
+/// Every multi match guarantees that `start <= end`.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct MultiMatch {
+ /// The pattern ID.
+ pattern: PatternID,
+ /// The start offset of the match, inclusive.
+ start: usize,
+ /// The end offset of the match, exclusive.
+ end: usize,
+}
+
+impl MultiMatch {
+ /// Create a new match from a pattern ID and a byte offset span.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `end < start`.
+ #[inline]
+ pub fn new(pattern: PatternID, start: usize, end: usize) -> MultiMatch {
+ assert!(start <= end);
+ MultiMatch { pattern, start, end }
+ }
+
+ /// Create a new match from a pattern ID and a byte offset span.
+ ///
+ /// This is like [`MultiMatch::new`], but accepts a `usize` instead of a
+ /// [`PatternID`]. This panics if the given `usize` is not representable
+ /// as a `PatternID`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `end < start` or if `pattern > PatternID::MAX`.
+ #[inline]
+ pub fn must(pattern: usize, start: usize, end: usize) -> MultiMatch {
+ MultiMatch::new(PatternID::new(pattern).unwrap(), start, end)
+ }
+
+ /// Returns the ID of the pattern that matched.
+ ///
+ /// The ID of a pattern is derived from the position in which it was
+ /// originally inserted into the corresponding regex engine. The first
+ /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and
+ /// so on.
+ #[inline]
+ pub fn pattern(&self) -> PatternID {
+ self.pattern
+ }
+
+ /// The starting position of the match.
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.start
+ }
+
+ /// The ending position of the match.
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.end
+ }
+
+ /// Returns the match location as a range.
+ #[inline]
+ pub fn range(&self) -> core::ops::Range<usize> {
+ self.start..self.end
+ }
+
+ /// Returns true if and only if this match is empty. That is, when
+ /// `start() == end()`.
+ ///
+ /// An empty match can only be returned when the empty string was among
+ /// the patterns used to build the Aho-Corasick automaton.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.start == self.end
+ }
+}
+
+/// An error type indicating that a search stopped prematurely without finding
+/// a match.
+///
+/// This error type implies that one cannot assume that no matches occur, since
+/// the search stopped before completing.
+///
+/// Normally, when one searches for something, the response is either an
+/// affirmative "it was found at this location" or a negative "not found at
+/// all." However, in some cases, a regex engine can be configured to stop its
+/// search before concluding whether a match exists or not. When this happens,
+/// it may be important for the caller to know why the regex engine gave up and
+/// where in the input it gave up at. This error type exposes the 'why' and the
+/// 'where.'
+///
+/// For example, the DFAs provided by this library generally cannot correctly
+/// implement Unicode word boundaries. Instead, they provide an option to
+/// eagerly support them on ASCII text (since Unicode word boundaries are
+/// equivalent to ASCII word boundaries when searching ASCII text), but will
+/// "give up" if a non-ASCII byte is seen. In such cases, one is usually
+/// required to either report the failure to the caller (unergonomic) or
+/// otherwise fall back to some other regex engine (ergonomic, but potentially
+/// costly).
+///
+/// More generally, some regex engines offer the ability for callers to specify
+/// certain bytes that will trigger the regex engine to automatically quit if
+/// they are seen.
+///
+/// Still yet, there may be other reasons for a failed match. For example,
+/// the hybrid DFA provided by this crate can be configured to give up if it
+/// believes that it is not efficient. This in turn permits callers to choose a
+/// different regex engine.
+///
+/// # Advice
+///
+/// While this form of error reporting adds complexity, it is generally
+/// possible for callers to configure regex engines to never give up a search,
+/// and thus never return an error. Indeed, the default configuration for every
+/// regex engine in this crate is such that they will never stop searching
+/// early. Therefore, the only way to get a match error is if the regex engine
+/// is explicitly configured to do so. Options that enable this behavior
+/// document the new error conditions they imply.
+///
+/// Regex engines for which no errors are possible for any configuration will
+/// return the normal `Option<Match>` and not use this error type at all.
+///
+/// For example, regex engines in the `dfa` sub-module will only report
+/// `MatchError::Quit` if instructed by either
+/// [enabling Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary)
+/// or by
+/// [explicitly specifying one or more quit bytes](crate::dfa::dense::Config::quit).
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub enum MatchError {
+ // Note that the first version of this type was called `SearchError` and it
+ // included a third `None` variant to indicate that the search completed
+ // and no match was found. However, this was problematic for iterator
+ // APIs where the `None` sentinel for stopping iteration corresponds
+ // precisely to the "match not found" case. The fact that the `None`
+ // variant was buried inside this type was in turn quite awkward. So
+ // instead, I removed the `None` variant, renamed the type and used
+ // `Result<Option<Match>, MatchError>` in non-iterator APIs instead of the
+ // conceptually simpler `Result<Match, MatchError>`. However, we "regain"
+ // ergonomics by only putting the more complex API in the `try_` variants
+ // ("fallible") of search methods. The infallible APIs will instead just
+ // return `Option<Match>` and panic on error.
+ /// The search saw a "quit" byte at which it was instructed to stop
+ /// searching.
+ Quit {
+ /// The "quit" byte that was observed that caused the search to stop.
+ byte: u8,
+ /// The offset at which the quit byte was observed.
+ offset: usize,
+ },
+ /// The search, based on heuristics, determined that it would be better
+ /// to stop, typically to provide the caller an opportunity to use an
+ /// alternative regex engine.
+ ///
+ /// Currently, the only way for this to occur is via the lazy DFA and
+ /// only when it is configured to do so (it will not return this error by
+ /// default).
+ GaveUp {
+ /// The offset at which the search stopped. This corresponds to the
+ /// position immediately following the last byte scanned.
+ offset: usize,
+ },
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for MatchError {}
+
+impl core::fmt::Display for MatchError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ match *self {
+ MatchError::Quit { byte, offset } => write!(
+ f,
+ "quit search after observing byte \\x{:02X} at offset {}",
+ byte, offset,
+ ),
+ MatchError::GaveUp { offset } => {
+ write!(f, "gave up searching at offset {}", offset)
+ }
+ }
+ }
+}
diff --git a/vendor/regex-automata/src/util/mod.rs b/vendor/regex-automata/src/util/mod.rs
new file mode 100644
index 000000000..798507da2
--- /dev/null
+++ b/vendor/regex-automata/src/util/mod.rs
@@ -0,0 +1,275 @@
+/*!
+TODO
+*/
+
+use core::{ascii, fmt, str};
+
+#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
+pub mod alphabet;
+pub(crate) mod bytes;
+#[cfg(feature = "alloc")]
+pub(crate) mod determinize;
+pub mod id;
+#[cfg(feature = "alloc")]
+pub(crate) mod lazy;
+pub(crate) mod matchtypes;
+pub mod prefilter;
+#[cfg(feature = "alloc")]
+pub(crate) mod sparse_set;
+pub(crate) mod start;
+#[cfg(feature = "alloc")]
+pub(crate) mod syntax;
+
+/// The offset, in bytes, that a match is delayed by in the DFAs generated by
+/// this crate. (This includes lazy DFAs.)
+///
+/// The purpose of this delay is to support look-ahead such as \b (ASCII-only)
+/// and $. In particular, both of these operators may require the
+/// identification of the end of input in order to confirm a match. Not only
+/// does this mean that all matches must therefore be delayed by a single byte,
+/// but that a special EOI value is added to the alphabet of all DFAs. (Which
+/// means that even though the alphabet of a DFA is typically all byte values,
+/// the actual maximum alphabet size is 257 due to the extra EOI value.)
+///
+/// Since we delay matches by only 1 byte, this can't fully support a
+/// Unicode-aware \b operator, which requires multi-byte look-ahead. Indeed,
+/// DFAs in this crate do not support it. (It's not as simple as just
+/// increasing the match offset to do it---otherwise we would---but building
+/// the full Unicode-aware word boundary detection into an automaton is quite
+/// tricky.)
+pub(crate) const MATCH_OFFSET: usize = 1;
+
+/// A type that wraps a single byte with a convenient fmt::Debug impl that
+/// escapes the byte.
+pub(crate) struct DebugByte(pub u8);
+
+impl fmt::Debug for DebugByte {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ // 10 bytes is enough to cover any output from ascii::escape_default.
+ let mut bytes = [0u8; 10];
+ let mut len = 0;
+ for (i, mut b) in ascii::escape_default(self.0).enumerate() {
+ // capitalize \xab to \xAB
+ if i >= 2 && b'a' <= b && b <= b'f' {
+ b -= 32;
+ }
+ bytes[len] = b;
+ len += 1;
+ }
+ write!(f, "{}", str::from_utf8(&bytes[..len]).unwrap())
+ }
+}
+
+/// Returns the smallest possible index of the next valid UTF-8 sequence
+/// starting after `i`.
+///
+/// For all inputs, including invalid UTF-8 and any value of `i`, the return
+/// value is guaranteed to be greater than `i`.
+///
+/// Generally speaking, this should only be called on `text` when it is
+/// permitted to assume that it is valid UTF-8 and where either `i >=
+/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
+#[inline(always)]
+pub(crate) fn next_utf8(text: &[u8], i: usize) -> usize {
+ let b = match text.get(i) {
+ None => return i.checked_add(1).unwrap(),
+ Some(&b) => b,
+ };
+ // For cases where we see an invalid UTF-8 byte, there isn't much we can do
+ // other than just start at the next byte.
+ let inc = utf8_len(b).unwrap_or(1);
+ i.checked_add(inc).unwrap()
+}
+
+/// Returns true if and only if the given byte is considered a word character.
+/// This only applies to ASCII.
+///
+/// This was copied from regex-syntax so that we can use it to determine the
+/// starting DFA state while searching without depending on regex-syntax. The
+/// definition is never going to change, so there's no maintenance/bit-rot
+/// hazard here.
+#[inline(always)]
+pub(crate) fn is_word_byte(b: u8) -> bool {
+ match b {
+ b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true,
+ _ => false,
+ }
+}
+
+/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
+///
+/// If no valid encoding of a codepoint exists at the beginning of the given
+/// byte slice, then the first byte is returned instead.
+///
+/// This returns `None` if and only if `bytes` is empty.
+#[inline(always)]
+pub(crate) fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
+ if bytes.is_empty() {
+ return None;
+ }
+ let len = match utf8_len(bytes[0]) {
+ None => return Some(Err(bytes[0])),
+ Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
+ Some(1) => return Some(Ok(bytes[0] as char)),
+ Some(len) => len,
+ };
+ match str::from_utf8(&bytes[..len]) {
+ Ok(s) => Some(Ok(s.chars().next().unwrap())),
+ Err(_) => Some(Err(bytes[0])),
+ }
+}
+
+/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
+///
+/// If no valid encoding of a codepoint exists at the end of the given byte
+/// slice, then the last byte is returned instead.
+///
+/// This returns `None` if and only if `bytes` is empty.
+#[inline(always)]
+pub(crate) fn decode_last_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
+ if bytes.is_empty() {
+ return None;
+ }
+ let mut start = bytes.len() - 1;
+ let limit = bytes.len().saturating_sub(4);
+ while start > limit && !is_leading_or_invalid_utf8_byte(bytes[start]) {
+ start -= 1;
+ }
+ match decode_utf8(&bytes[start..]) {
+ None => None,
+ Some(Ok(ch)) => Some(Ok(ch)),
+ Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])),
+ }
+}
+
+/// Given a UTF-8 leading byte, this returns the total number of code units
+/// in the following encoded codepoint.
+///
+/// If the given byte is not a valid UTF-8 leading byte, then this returns
+/// `None`.
+#[inline(always)]
+fn utf8_len(byte: u8) -> Option<usize> {
+ if byte <= 0x7F {
+ return Some(1);
+ } else if byte & 0b1100_0000 == 0b1000_0000 {
+ return None;
+ } else if byte <= 0b1101_1111 {
+ Some(2)
+ } else if byte <= 0b1110_1111 {
+ Some(3)
+ } else if byte <= 0b1111_0111 {
+ Some(4)
+ } else {
+ None
+ }
+}
+
+/// Returns true if and only if the given byte is either a valid leading UTF-8
+/// byte, or is otherwise an invalid byte that can never appear anywhere in a
+/// valid UTF-8 sequence.
+#[inline(always)]
+fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
+ // In the ASCII case, the most significant bit is never set. The leading
+ // byte of a 2/3/4-byte sequence always has the top two most significant
+ // bits set. For bytes that can never appear anywhere in valid UTF-8, this
+ // also returns true, since every such byte has its two most significant
+ // bits set:
+ //
+ // \xC0 :: 11000000
+ // \xC1 :: 11000001
+ // \xF5 :: 11110101
+ // \xF6 :: 11110110
+ // \xF7 :: 11110111
+ // \xF8 :: 11111000
+ // \xF9 :: 11111001
+ // \xFA :: 11111010
+ // \xFB :: 11111011
+ // \xFC :: 11111100
+ // \xFD :: 11111101
+ // \xFE :: 11111110
+ // \xFF :: 11111111
+ (b & 0b1100_0000) != 0b1000_0000
+}
+
+#[cfg(feature = "alloc")]
+#[inline(always)]
+pub(crate) fn is_word_char_fwd(bytes: &[u8], mut at: usize) -> bool {
+ use core::{ptr, sync::atomic::AtomicPtr};
+
+ use crate::{
+ dfa::{
+ dense::{self, DFA},
+ Automaton,
+ },
+ util::lazy,
+ };
+
+ static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
+
+ let dfa = lazy::get_or_init(&WORD, || {
+ // TODO: Should we use a lazy DFA here instead? It does complicate
+ // things somewhat, since we then need a mutable cache, which probably
+ // means a thread local.
+ dense::Builder::new()
+ .configure(dense::Config::new().anchored(true))
+ .build(r"\w")
+ .unwrap()
+ });
+ // This is OK since '\w' contains no look-around.
+ let mut sid = dfa.universal_start_state();
+ while at < bytes.len() {
+ let byte = bytes[at];
+ sid = dfa.next_state(sid, byte);
+ at += 1;
+ if dfa.is_special_state(sid) {
+ if dfa.is_match_state(sid) {
+ return true;
+ } else if dfa.is_dead_state(sid) {
+ return false;
+ }
+ }
+ }
+ dfa.is_match_state(dfa.next_eoi_state(sid))
+}
+
+#[cfg(feature = "alloc")]
+#[inline(always)]
+pub(crate) fn is_word_char_rev(bytes: &[u8], mut at: usize) -> bool {
+ use core::{ptr, sync::atomic::AtomicPtr};
+
+ use crate::{
+ dfa::{
+ dense::{self, DFA},
+ Automaton,
+ },
+ nfa::thompson::NFA,
+ };
+
+ static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
+
+ let dfa = lazy::get_or_init(&WORD, || {
+ dense::Builder::new()
+ .configure(dense::Config::new().anchored(true))
+ .thompson(NFA::config().reverse(true).shrink(true))
+ .build(r"\w")
+ .unwrap()
+ });
+
+ // This is OK since '\w' contains no look-around.
+ let mut sid = dfa.universal_start_state();
+ while at > 0 {
+ at -= 1;
+ let byte = bytes[at];
+ sid = dfa.next_state(sid, byte);
+ if dfa.is_special_state(sid) {
+ if dfa.is_match_state(sid) {
+ return true;
+ } else if dfa.is_dead_state(sid) {
+ return false;
+ }
+ }
+ }
+ dfa.is_match_state(dfa.next_eoi_state(sid))
+}
diff --git a/vendor/regex-automata/src/util/prefilter.rs b/vendor/regex-automata/src/util/prefilter.rs
new file mode 100644
index 000000000..5fe151524
--- /dev/null
+++ b/vendor/regex-automata/src/util/prefilter.rs
@@ -0,0 +1,281 @@
+use crate::Match;
+
+/// A candidate is the result of running a prefilter on a haystack at a
+/// particular position. The result is one of no match, a confirmed match or
+/// a possible match.
+///
+/// When no match is returned, the prefilter is guaranteeing that no possible
+/// match can be found in the haystack, and the caller may trust this. That is,
+/// all correct prefilters must never report false negatives.
+///
+/// In some cases, a prefilter can confirm a match very quickly, in which case,
+/// the caller may use this to stop what it's doing and report the match. In
+/// this case, prefilter implementations must never report a false positive.
+/// In other cases, the prefilter can only report a potential match, in which
+/// case the callers must attempt to confirm the match. In this case, prefilter
+/// implementations are permitted to return false positives.
+#[derive(Clone, Debug)]
+pub enum Candidate {
+ /// The prefilter reports that no match is possible. Prefilter
+ /// implementations will never report false negatives.
+ None,
+ /// The prefilter reports that a match has been confirmed at the provided
+ /// byte offsets. When this variant is reported, the prefilter is
+ /// guaranteeing a match. No false positives are permitted.
+ Match(Match),
+ /// The prefilter reports that a match *may* start at the given position.
+ /// When this variant is reported, it may correspond to a false positive.
+ PossibleStartOfMatch(usize),
+}
+
+impl Candidate {
+ /// Convert this candidate into an option. This is useful when callers do
+ /// not distinguish between true positives and false positives (i.e., the
+ /// caller must always confirm the match in order to update some other
+ /// state).
+ ///
+ /// The byte offset in the option returned corresponds to the starting
+ /// position of the possible match.
+ pub fn into_option(self) -> Option<usize> {
+ match self {
+ Candidate::None => None,
+ Candidate::Match(ref m) => Some(m.start()),
+ Candidate::PossibleStartOfMatch(start) => Some(start),
+ }
+ }
+}
+
+/// A prefilter describes the behavior of fast literal scanners for quickly
+/// skipping past bytes in the haystack that we know cannot possibly
+/// participate in a match.
+pub trait Prefilter: core::fmt::Debug {
+ /// Returns the next possible match candidate. This may yield false
+ /// positives, so callers must confirm a match starting at the position
+ /// returned. This, however, must never produce false negatives. That is,
+ /// this must, at minimum, return the starting position of the next match
+ /// in the given haystack after or at the given position.
+ fn next_candidate(
+ &self,
+ state: &mut State,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate;
+
+ /// Returns the approximate total amount of heap used by this prefilter, in
+ /// units of bytes.
+ fn heap_bytes(&self) -> usize;
+
+ /// Returns true if and only if this prefilter may return false positives
+ /// via the `Candidate::PossibleStartOfMatch` variant. This is most useful
+ /// when false positives are not posssible (in which case, implementations
+ /// should return false), which may allow completely avoiding heavier regex
+ /// machinery when the prefilter can quickly confirm its own matches.
+ ///
+ /// By default, this returns true, which is conservative; it is always
+ /// correct to return `true`. Returning `false` here and reporting a false
+ /// positive will result in incorrect searches.
+ fn reports_false_positives(&self) -> bool {
+ true
+ }
+}
+
+impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
+ #[inline]
+ fn next_candidate(
+ &self,
+ state: &mut State,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ (**self).next_candidate(state, haystack, at)
+ }
+
+ fn heap_bytes(&self) -> usize {
+ (**self).heap_bytes()
+ }
+
+ fn reports_false_positives(&self) -> bool {
+ (**self).reports_false_positives()
+ }
+}
+
+#[derive(Clone)]
+pub struct Scanner<'p> {
+ prefilter: &'p dyn Prefilter,
+ state: State,
+}
+
+impl<'p> Scanner<'p> {
+ pub fn new(prefilter: &'p dyn Prefilter) -> Scanner<'p> {
+ Scanner { prefilter, state: State::new() }
+ }
+
+ pub(crate) fn is_effective(&mut self, at: usize) -> bool {
+ self.state.is_effective(at)
+ }
+
+ pub(crate) fn reports_false_positives(&self) -> bool {
+ self.prefilter.reports_false_positives()
+ }
+
+ pub(crate) fn next_candidate(
+ &mut self,
+ bytes: &[u8],
+ at: usize,
+ ) -> Candidate {
+ let cand = self.prefilter.next_candidate(&mut self.state, bytes, at);
+ match cand {
+ Candidate::None => {
+ self.state.update_skipped_bytes(bytes.len() - at);
+ }
+ Candidate::Match(ref m) => {
+ self.state.update_skipped_bytes(m.start() - at);
+ }
+ Candidate::PossibleStartOfMatch(i) => {
+ self.state.update_skipped_bytes(i - at);
+ }
+ }
+ cand
+ }
+}
+
+impl<'p> core::fmt::Debug for Scanner<'p> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_struct("Scanner").field("state", &self.state).finish()
+ }
+}
+
+/// State tracks state associated with the effectiveness of a
+/// prefilter. It is used to track how many bytes, on average, are skipped by
+/// the prefilter. If this average dips below a certain threshold over time,
+/// then the state renders the prefilter inert and stops using it.
+///
+/// A prefilter state should be created for each search. (Where creating an
+/// iterator via, e.g., `find_iter`, is treated as a single search.)
+#[derive(Clone, Debug)]
+pub struct State {
+ /// The number of skips that has been executed.
+ skips: usize,
+ /// The total number of bytes that have been skipped.
+ skipped: usize,
+ /// Once this heuristic has been deemed permanently ineffective, it will be
+ /// inert throughout the rest of its lifetime. This serves as a cheap way
+ /// to check inertness.
+ inert: bool,
+ /// The last (absolute) position at which a prefilter scanned to.
+ /// Prefilters can use this position to determine whether to re-scan or
+ /// not.
+ ///
+ /// Unlike other things that impact effectiveness, this is a fleeting
+ /// condition. That is, a prefilter can be considered ineffective if it is
+ /// at a position before `last_scan_at`, but can become effective again
+ /// once the search moves past `last_scan_at`.
+ ///
+ /// The utility of this is to both avoid additional overhead from calling
+ /// the prefilter and to avoid quadratic behavior. This ensures that a
+ /// prefilter will scan any particular byte at most once. (Note that some
+ /// prefilters, like the start-byte prefilter, do not need to use this
+ /// field at all, since it only looks for starting bytes.)
+ last_scan_at: usize,
+}
+
+impl State {
+ /// The minimum number of skip attempts to try before considering whether
+ /// a prefilter is effective or not.
+ const MIN_SKIPS: usize = 40;
+
+ /// The minimum amount of bytes that skipping must average.
+ ///
+ /// That is, after MIN_SKIPS have occurred, if the average number of bytes
+ /// skipped ever falls below MIN_AVG_SKIP, then the prefilter will be
+ /// rendered inert.
+ const MIN_AVG_SKIP: usize = 16;
+
+ /// Create a fresh prefilter state.
+ pub fn new() -> State {
+ State { skips: 0, skipped: 0, inert: false, last_scan_at: 0 }
+ }
+
+ /// Updates the position at which the last scan stopped. This may be
+ /// greater than the position of the last candidate reported. For example,
+ /// searching for the byte `z` in `abczdef` for the pattern `abcz` will
+ /// report a candidate at position `0`, but the end of its last scan will
+ /// be at position `3`.
+ ///
+ /// This position factors into the effectiveness of this prefilter. If the
+ /// current position is less than the last position at which a scan ended,
+ /// then the prefilter should not be re-run until the search moves past
+ /// that position.
+ ///
+ /// It is always correct to never update the last scan position. In fact,
+ /// it is also always correct to set the last scan position to an arbitrary
+ /// value. The key is setting it to a position in the future at which it
+ /// makes sense to restart the prefilter.
+ pub fn update_last_scan(&mut self, at: usize) {
+ if at > self.last_scan_at {
+ self.last_scan_at = at;
+ }
+ }
+
+ /// Return true if and only if this state indicates that a prefilter is
+ /// still effective. If the prefilter is not effective, then this state
+ /// is rendered "inert." At which point, all subsequent calls to
+ /// `is_effective` on this state will return `false`.
+ ///
+ /// `at` should correspond to the current starting position of the search.
+ ///
+ /// Callers typically do not need to use this, as it represents the
+ /// default implementation of
+ /// [`Prefilter::is_effective`](trait.Prefilter.html#tymethod.is_effective).
+ fn is_effective(&mut self, at: usize) -> bool {
+ if self.inert {
+ return false;
+ }
+ if at < self.last_scan_at {
+ return false;
+ }
+ if self.skips < State::MIN_SKIPS {
+ return true;
+ }
+
+ if self.skipped >= State::MIN_AVG_SKIP * self.skips {
+ return true;
+ }
+
+ // We're inert.
+ self.inert = true;
+ false
+ }
+
+ /// Update this state with the number of bytes skipped on the last
+ /// invocation of the prefilter.
+ fn update_skipped_bytes(&mut self, skipped: usize) {
+ self.skips += 1;
+ self.skipped += skipped;
+ }
+}
+
+/// A `Prefilter` implementation that reports a possible match at every
+/// position.
+///
+/// This should generally not be used as an actual prefilter. It is only
+/// useful when one needs to represent the absence of a prefilter in a generic
+/// context. For example, a [`dfa::regex::Regex`](crate::dfa::regex::Regex)
+/// uses this prefilter by default to indicate that no prefilter should be
+/// used.
+///
+/// A `None` prefilter value cannot be constructed.
+#[derive(Clone, Debug)]
+pub struct None {
+ _priv: (),
+}
+
+impl Prefilter for None {
+ fn next_candidate(&self, _: &mut State, _: &[u8], at: usize) -> Candidate {
+ Candidate::PossibleStartOfMatch(at)
+ }
+
+ fn heap_bytes(&self) -> usize {
+ 0
+ }
+}
diff --git a/vendor/regex-automata/src/util/sparse_set.rs b/vendor/regex-automata/src/util/sparse_set.rs
new file mode 100644
index 000000000..bf59e4469
--- /dev/null
+++ b/vendor/regex-automata/src/util/sparse_set.rs
@@ -0,0 +1,229 @@
+use alloc::{boxed::Box, vec, vec::Vec};
+
+use crate::util::id::StateID;
+
+/// A pairse of sparse sets.
+///
+/// This is useful when one needs to compute NFA epsilon closures from a
+/// previous set of states derived from an epsilon closure. One set can be the
+/// starting states where as the other set can be the destination states after
+/// following the transitions for a particular byte of input.
+///
+/// There is no significance to 'set1' or 'set2'. They are both sparse sets of
+/// the same size.
+///
+/// The members of this struct are exposed so that callers may borrow 'set1'
+/// and 'set2' individually without being force to borrow both at the same
+/// time.
+#[derive(Clone, Debug)]
+pub(crate) struct SparseSets {
+ pub(crate) set1: SparseSet,
+ pub(crate) set2: SparseSet,
+}
+
+impl SparseSets {
+ /// Create a new pair of sparse sets where each set has the given capacity.
+ ///
+ /// This panics if the capacity given is bigger than `StateID::LIMIT`.
+ pub(crate) fn new(capacity: usize) -> SparseSets {
+ SparseSets {
+ set1: SparseSet::new(capacity),
+ set2: SparseSet::new(capacity),
+ }
+ }
+
+ /// Resizes these sparse sets to have the new capacity given.
+ ///
+ /// The sets are automatically cleared.
+ ///
+ /// This panics if the capacity given is bigger than `StateID::LIMIT`.
+ #[inline]
+ pub(crate) fn resize(&mut self, new_capacity: usize) {
+ self.set1.resize(new_capacity);
+ self.set2.resize(new_capacity);
+ }
+
+ /// Clear both sparse sets.
+ pub(crate) fn clear(&mut self) {
+ self.set1.clear();
+ self.set2.clear();
+ }
+
+ /// Swap set1 with set2.
+ pub(crate) fn swap(&mut self) {
+ core::mem::swap(&mut self.set1, &mut self.set2);
+ }
+
+ /// Returns the memory usage, in bytes, used by this pair of sparse sets.
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.set1.memory_usage() + self.set2.memory_usage()
+ }
+}
+
+/// A sparse set used for representing ordered NFA states.
+///
+/// This supports constant time addition and membership testing. Clearing an
+/// entire set can also be done in constant time. Iteration yields elements
+/// in the order in which they were inserted.
+///
+/// The data structure is based on: https://research.swtch.com/sparse
+/// Note though that we don't actually use uninitialized memory. We generally
+/// reuse sparse sets, so the initial allocation cost is bareable. However, its
+/// other properties listed above are extremely useful.
+#[derive(Clone)]
+pub(crate) struct SparseSet {
+ /// The number of elements currently in this set.
+ len: usize,
+ /// Dense contains the ids in the order in which they were inserted.
+ dense: Vec<StateID>,
+ /// Sparse maps ids to their location in dense.
+ ///
+ /// A state ID is in the set if and only if
+ /// sparse[id] < dense.len() && id == dense[sparse[id]].
+ sparse: Vec<StateID>,
+}
+
+impl SparseSet {
+ /// Create a new sparse set with the given capacity.
+ ///
+ /// Sparse sets have a fixed size and they cannot grow. Attempting to
+ /// insert more distinct elements than the total capacity of the set will
+ /// result in a panic.
+ ///
+ /// This panics if the capacity given is bigger than `StateID::LIMIT`.
+ #[inline]
+ pub(crate) fn new(capacity: usize) -> SparseSet {
+ let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] };
+ set.resize(capacity);
+ set
+ }
+
+ /// Resizes this sparse set to have the new capacity given.
+ ///
+ /// This set is automatically cleared.
+ ///
+ /// This panics if the capacity given is bigger than `StateID::LIMIT`.
+ #[inline]
+ pub(crate) fn resize(&mut self, new_capacity: usize) {
+ assert!(
+ new_capacity <= StateID::LIMIT,
+ "sparse set capacity cannot excced {:?}",
+ StateID::LIMIT
+ );
+ self.clear();
+ self.dense.resize(new_capacity, StateID::ZERO);
+ self.sparse.resize(new_capacity, StateID::ZERO);
+ }
+
+ /// Returns the capacity of this set.
+ ///
+ /// The capacity represents a fixed limit on the number of distinct
+ /// elements that are allowed in this set. The capacity cannot be changed.
+ #[inline]
+ pub(crate) fn capacity(&self) -> usize {
+ self.dense.len()
+ }
+
+ /// Returns the number of elements in this set.
+ #[inline]
+ pub(crate) fn len(&self) -> usize {
+ self.len
+ }
+
+ /// Returns true if and only if this set is empty.
+ #[inline]
+ pub(crate) fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Insert the state ID value into this set and return true if the given
+ /// state ID was not previously in this set.
+ ///
+ /// This operation is idempotent. If the given value is already in this
+ /// set, then this is a no-op.
+ ///
+ /// If more than `capacity` ids are inserted, then this panics.
+ ///
+ /// This is marked as inline(always) since the compiler won't inline it
+ /// otherwise, and it's a fairly hot piece of code in DFA determinization.
+ #[inline(always)]
+ pub(crate) fn insert(&mut self, value: StateID) -> bool {
+ if self.contains(value) {
+ return false;
+ }
+
+ let i = self.len();
+ assert!(
+ i < self.capacity(),
+ "{:?} exceeds capacity of {:?} when inserting {:?}",
+ i,
+ self.capacity(),
+ value,
+ );
+ // OK since i < self.capacity() and self.capacity() is guaranteed to
+ // be <= StateID::LIMIT.
+ let id = StateID::new_unchecked(i);
+ self.dense[id] = value;
+ self.sparse[value] = id;
+ self.len += 1;
+ true
+ }
+
+ /// Returns true if and only if this set contains the given value.
+ #[inline]
+ pub(crate) fn contains(&self, value: StateID) -> bool {
+ let i = self.sparse[value];
+ i.as_usize() < self.len() && self.dense[i] == value
+ }
+
+ /// Returns the ith inserted element from this set.
+ ///
+ /// Panics when i >= self.len().
+ #[inline]
+ pub(crate) fn get(&self, i: usize) -> StateID {
+ self.dense[i]
+ }
+
+ /// Clear this set such that it has no members.
+ #[inline]
+ pub(crate) fn clear(&mut self) {
+ self.len = 0;
+ }
+
+ /// Returns the heap memory usage, in bytes, used by this sparse set.
+ #[inline]
+ pub(crate) fn memory_usage(&self) -> usize {
+ 2 * self.dense.len() * StateID::SIZE
+ }
+}
+
+impl core::fmt::Debug for SparseSet {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let elements: Vec<StateID> = self.into_iter().collect();
+ f.debug_tuple("SparseSet").field(&elements).finish()
+ }
+}
+
+/// An iterator over all elements in a sparse set.
+///
+/// The lifetime `'a` refers to the lifetime of the set being iterated over.
+#[derive(Debug)]
+pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>);
+
+impl<'a> IntoIterator for &'a SparseSet {
+ type Item = StateID;
+ type IntoIter = SparseSetIter<'a>;
+
+ fn into_iter(self) -> Self::IntoIter {
+ SparseSetIter(self.dense[..self.len()].iter())
+ }
+}
+
+impl<'a> Iterator for SparseSetIter<'a> {
+ type Item = StateID;
+
+ #[inline(always)]
+ fn next(&mut self) -> Option<StateID> {
+ self.0.next().map(|value| *value)
+ }
+}
diff --git a/vendor/regex-automata/src/util/start.rs b/vendor/regex-automata/src/util/start.rs
new file mode 100644
index 000000000..3c756fc26
--- /dev/null
+++ b/vendor/regex-automata/src/util/start.rs
@@ -0,0 +1,109 @@
+/// Represents the four possible starting configurations of a DFA search.
+///
+/// The starting configuration is determined by inspecting the the beginning of
+/// the haystack (up to 1 byte). Ultimately, this along with a pattern ID (if
+/// specified) is what selects the start state to use in a DFA.
+///
+/// In a DFA that doesn't have starting states for each pattern, then it will
+/// have a maximum of four DFA start states. If the DFA was compiled with start
+/// states for each pattern, then it will have a maximum of four DFA start
+/// states for searching for any pattern, and then another maximum of four DFA
+/// start states for executing an anchored search for each pattern.
+///
+/// This ends up being represented as a table in the DFA (whether lazy or fully
+/// built) where the stride of that table is 4, and each entry is an index into
+/// the state transition table. Note though that multiple entries in the table
+/// might point to the same state if the states would otherwise be equivalent.
+/// (This is guaranteed by DFA minimization and may even be accomplished by
+/// normal determinization, since it attempts to reuse equivalent states too.)
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(crate) enum Start {
+ /// This occurs when the starting position is not any of the ones below.
+ NonWordByte = 0,
+ /// This occurs when the byte immediately preceding the start of the search
+ /// is an ASCII word byte.
+ WordByte = 1,
+ /// This occurs when the starting position of the search corresponds to the
+ /// beginning of the haystack.
+ Text = 2,
+ /// This occurs when the byte immediately preceding the start of the search
+ /// is a line terminator. Specifically, `\n`.
+ Line = 3,
+}
+
+impl Start {
+ /// Return the starting state corresponding to the given integer. If no
+ /// starting state exists for the given integer, then None is returned.
+ pub(crate) fn from_usize(n: usize) -> Option<Start> {
+ match n {
+ 0 => Some(Start::NonWordByte),
+ 1 => Some(Start::WordByte),
+ 2 => Some(Start::Text),
+ 3 => Some(Start::Line),
+ _ => None,
+ }
+ }
+
+ /// Returns the total number of starting state configurations.
+ pub(crate) fn count() -> usize {
+ 4
+ }
+
+ /// Returns the starting state configuration for the given search
+ /// parameters. If the given offset range is not valid, then this panics.
+ #[inline(always)]
+ pub(crate) fn from_position_fwd(
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Start {
+ assert!(
+ bytes.get(start..end).is_some(),
+ "{}..{} is invalid",
+ start,
+ end
+ );
+ if start == 0 {
+ Start::Text
+ } else if bytes[start - 1] == b'\n' {
+ Start::Line
+ } else if crate::util::is_word_byte(bytes[start - 1]) {
+ Start::WordByte
+ } else {
+ Start::NonWordByte
+ }
+ }
+
+ /// Returns the starting state configuration for a reverse search with the
+ /// given search parameters. If the given offset range is not valid, then
+ /// this panics.
+ #[inline(always)]
+ pub(crate) fn from_position_rev(
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Start {
+ assert!(
+ bytes.get(start..end).is_some(),
+ "{}..{} is invalid",
+ start,
+ end
+ );
+ if end == bytes.len() {
+ Start::Text
+ } else if bytes[end] == b'\n' {
+ Start::Line
+ } else if crate::util::is_word_byte(bytes[end]) {
+ Start::WordByte
+ } else {
+ Start::NonWordByte
+ }
+ }
+
+ /// Return this starting configuration as an integer. It is guaranteed to
+ /// be less than `Start::count()`.
+ #[inline(always)]
+ pub(crate) fn as_usize(&self) -> usize {
+ *self as usize
+ }
+}
diff --git a/vendor/regex-automata/src/util/syntax.rs b/vendor/regex-automata/src/util/syntax.rs
new file mode 100644
index 000000000..88beeee75
--- /dev/null
+++ b/vendor/regex-automata/src/util/syntax.rs
@@ -0,0 +1,272 @@
+use regex_syntax::ParserBuilder;
+
+/// A common set of configuration options that apply to the syntax of a regex.
+///
+/// This represents a group of configuration options that specifically apply
+/// to how the concrete syntax of a regular expression is interpreted. In
+/// particular, they are generally forwarded to the
+/// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html)
+/// in the
+/// [`regex-syntax`](https://docs.rs/regex-syntax)
+/// crate when building a regex from its concrete syntax directly.
+///
+/// These options are defined as a group since they apply to every regex engine
+/// in this crate. Instead of re-defining them on every engine's builder, they
+/// are instead provided here as one cohesive unit.
+#[derive(Clone, Copy, Debug)]
+pub struct SyntaxConfig {
+ case_insensitive: bool,
+ multi_line: bool,
+ dot_matches_new_line: bool,
+ swap_greed: bool,
+ ignore_whitespace: bool,
+ unicode: bool,
+ utf8: bool,
+ nest_limit: u32,
+ octal: bool,
+}
+
+impl SyntaxConfig {
+ /// Return a new default syntax configuration.
+ pub fn new() -> SyntaxConfig {
+ // These defaults match the ones used in regex-syntax.
+ SyntaxConfig {
+ case_insensitive: false,
+ multi_line: false,
+ dot_matches_new_line: false,
+ swap_greed: false,
+ ignore_whitespace: false,
+ unicode: true,
+ utf8: true,
+ nest_limit: 250,
+ octal: false,
+ }
+ }
+
+ /// Enable or disable the case insensitive flag by default.
+ ///
+ /// When Unicode mode is enabled, case insensitivity is Unicode-aware.
+ /// Specifically, it will apply the "simple" case folding rules as
+ /// specified by Unicode.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `i` flag.
+ pub fn case_insensitive(mut self, yes: bool) -> SyntaxConfig {
+ self.case_insensitive = yes;
+ self
+ }
+
+ /// Enable or disable the multi-line matching flag by default.
+ ///
+ /// When this is enabled, the `^` and `$` look-around assertions will
+ /// match immediately after and immediately before a new line character,
+ /// respectively. Note that the `\A` and `\z` look-around assertions are
+ /// unaffected by this setting and always correspond to matching at the
+ /// beginning and end of the input.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `m` flag.
+ pub fn multi_line(mut self, yes: bool) -> SyntaxConfig {
+ self.multi_line = yes;
+ self
+ }
+
+ /// Enable or disable the "dot matches any character" flag by default.
+ ///
+ /// When this is enabled, `.` will match any character. When it's disabled,
+ /// then `.` will match any character except for a new line character.
+ ///
+ /// Note that `.` is impacted by whether the "unicode" setting is enabled
+ /// or not. When Unicode is enabled (the defualt), `.` will match any UTF-8
+ /// encoding of any Unicode scalar value (sans a new line, depending on
+ /// whether this "dot matches new line" option is enabled). When Unicode
+ /// mode is disabled, `.` will match any byte instead. Because of this,
+ /// when Unicode mode is disabled, `.` can only be used when the "allow
+ /// invalid UTF-8" option is enabled, since `.` could otherwise match
+ /// invalid UTF-8.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `s` flag.
+ pub fn dot_matches_new_line(mut self, yes: bool) -> SyntaxConfig {
+ self.dot_matches_new_line = yes;
+ self
+ }
+
+ /// Enable or disable the "swap greed" flag by default.
+ ///
+ /// When this is enabled, `.*` (for example) will become ungreedy and `.*?`
+ /// will become greedy.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `U` flag.
+ pub fn swap_greed(mut self, yes: bool) -> SyntaxConfig {
+ self.swap_greed = yes;
+ self
+ }
+
+ /// Enable verbose mode in the regular expression.
+ ///
+ /// When enabled, verbose mode permits insigificant whitespace in many
+ /// places in the regular expression, as well as comments. Comments are
+ /// started using `#` and continue until the end of the line.
+ ///
+ /// By default, this is disabled. It may be selectively enabled in the
+ /// regular expression by using the `x` flag regardless of this setting.
+ pub fn ignore_whitespace(mut self, yes: bool) -> SyntaxConfig {
+ self.ignore_whitespace = yes;
+ self
+ }
+
+ /// Enable or disable the Unicode flag (`u`) by default.
+ ///
+ /// By default this is **enabled**. It may alternatively be selectively
+ /// disabled in the regular expression itself via the `u` flag.
+ ///
+ /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by
+ /// default), a regular expression will fail to parse if Unicode mode is
+ /// disabled and a sub-expression could possibly match invalid UTF-8.
+ ///
+ /// **WARNING**: Unicode mode can greatly increase the size of the compiled
+ /// DFA, which can noticeably impact both memory usage and compilation
+ /// time. This is especially noticeable if your regex contains character
+ /// classes like `\w` that are impacted by whether Unicode is enabled or
+ /// not. If Unicode is not necessary, you are encouraged to disable it.
+ pub fn unicode(mut self, yes: bool) -> SyntaxConfig {
+ self.unicode = yes;
+ self
+ }
+
+ /// When disabled, the builder will permit the construction of a regular
+ /// expression that may match invalid UTF-8.
+ ///
+ /// For example, when [`SyntaxConfig::unicode`] is disabled, then
+ /// expressions like `[^a]` may match invalid UTF-8 since they can match
+ /// any single byte that is not `a`. By default, these sub-expressions
+ /// are disallowed to avoid returning offsets that split a UTF-8
+ /// encoded codepoint. However, in cases where matching at arbitrary
+ /// locations is desired, this option can be disabled to permit all such
+ /// sub-expressions.
+ ///
+ /// When enabled (the default), the builder is guaranteed to produce a
+ /// regex that will only ever match valid UTF-8 (otherwise, the builder
+ /// will return an error).
+ pub fn utf8(mut self, yes: bool) -> SyntaxConfig {
+ self.utf8 = yes;
+ self
+ }
+
+ /// Set the nesting limit used for the regular expression parser.
+ ///
+ /// The nesting limit controls how deep the abstract syntax tree is allowed
+ /// to be. If the AST exceeds the given limit (e.g., with too many nested
+ /// groups), then an error is returned by the parser.
+ ///
+ /// The purpose of this limit is to act as a heuristic to prevent stack
+ /// overflow when building a finite automaton from a regular expression's
+ /// abstract syntax tree. In particular, construction currently uses
+ /// recursion. In the future, the implementation may stop using recursion
+ /// and this option will no longer be necessary.
+ ///
+ /// This limit is not checked until the entire AST is parsed. Therefore,
+ /// if callers want to put a limit on the amount of heap space used, then
+ /// they should impose a limit on the length, in bytes, of the concrete
+ /// pattern string. In particular, this is viable since the parser will
+ /// limit itself to heap space proportional to the lenth of the pattern
+ /// string.
+ ///
+ /// Note that a nest limit of `0` will return a nest limit error for most
+ /// patterns but not all. For example, a nest limit of `0` permits `a` but
+ /// not `ab`, since `ab` requires a concatenation AST item, which results
+ /// in a nest depth of `1`. In general, a nest limit is not something that
+ /// manifests in an obvious way in the concrete syntax, therefore, it
+ /// should not be used in a granular way.
+ pub fn nest_limit(mut self, limit: u32) -> SyntaxConfig {
+ self.nest_limit = limit;
+ self
+ }
+
+ /// Whether to support octal syntax or not.
+ ///
+ /// Octal syntax is a little-known way of uttering Unicode codepoints in
+ /// a regular expression. For example, `a`, `\x61`, `\u0061` and
+ /// `\141` are all equivalent regular expressions, where the last example
+ /// shows octal syntax.
+ ///
+ /// While supporting octal syntax isn't in and of itself a problem, it does
+ /// make good error messages harder. That is, in PCRE based regex engines,
+ /// syntax like `\1` invokes a backreference, which is explicitly
+ /// unsupported in Rust's regex engine. However, many users expect it to
+ /// be supported. Therefore, when octal support is disabled, the error
+ /// message will explicitly mention that backreferences aren't supported.
+ ///
+ /// Octal syntax is disabled by default.
+ pub fn octal(mut self, yes: bool) -> SyntaxConfig {
+ self.octal = yes;
+ self
+ }
+
+ /// Returns whether "unicode" mode is enabled.
+ pub fn get_unicode(&self) -> bool {
+ self.unicode
+ }
+
+ /// Returns whether "case insensitive" mode is enabled.
+ pub fn get_case_insensitive(&self) -> bool {
+ self.case_insensitive
+ }
+
+ /// Returns whether "multi line" mode is enabled.
+ pub fn get_multi_line(&self) -> bool {
+ self.multi_line
+ }
+
+ /// Returns whether "dot matches new line" mode is enabled.
+ pub fn get_dot_matches_new_line(&self) -> bool {
+ self.dot_matches_new_line
+ }
+
+ /// Returns whether "swap greed" mode is enabled.
+ pub fn get_swap_greed(&self) -> bool {
+ self.swap_greed
+ }
+
+ /// Returns whether "ignore whitespace" mode is enabled.
+ pub fn get_ignore_whitespace(&self) -> bool {
+ self.ignore_whitespace
+ }
+
+ /// Returns whether UTF-8 mode is enabled.
+ pub fn get_utf8(&self) -> bool {
+ self.utf8
+ }
+
+ /// Returns the "nest limit" setting.
+ pub fn get_nest_limit(&self) -> u32 {
+ self.nest_limit
+ }
+
+ /// Returns whether "octal" mode is enabled.
+ pub fn get_octal(&self) -> bool {
+ self.octal
+ }
+
+ /// Applies this configuration to the given parser.
+ pub(crate) fn apply(&self, builder: &mut ParserBuilder) {
+ builder
+ .unicode(self.unicode)
+ .case_insensitive(self.case_insensitive)
+ .multi_line(self.multi_line)
+ .dot_matches_new_line(self.dot_matches_new_line)
+ .swap_greed(self.swap_greed)
+ .ignore_whitespace(self.ignore_whitespace)
+ .allow_invalid_utf8(!self.utf8)
+ .nest_limit(self.nest_limit)
+ .octal(self.octal);
+ }
+}
+
+impl Default for SyntaxConfig {
+ fn default() -> SyntaxConfig {
+ SyntaxConfig::new()
+ }
+}