Diffstat (limited to 'vendor/regex-automata/src/util')
-rw-r--r-- vendor/regex-automata/src/util/alphabet.rs | 790
-rw-r--r-- vendor/regex-automata/src/util/bytes.rs | 950
-rw-r--r-- vendor/regex-automata/src/util/determinize/mod.rs | 493
-rw-r--r-- vendor/regex-automata/src/util/determinize/state.rs | 873
-rw-r--r-- vendor/regex-automata/src/util/id.rs | 608
-rw-r--r-- vendor/regex-automata/src/util/lazy.rs | 31
-rw-r--r-- vendor/regex-automata/src/util/matchtypes.rs | 356
-rw-r--r-- vendor/regex-automata/src/util/mod.rs | 275
-rw-r--r-- vendor/regex-automata/src/util/prefilter.rs | 281
-rw-r--r-- vendor/regex-automata/src/util/sparse_set.rs | 229
-rw-r--r-- vendor/regex-automata/src/util/start.rs | 109
-rw-r--r-- vendor/regex-automata/src/util/syntax.rs | 272
12 files changed, 5267 insertions, 0 deletions
diff --git a/vendor/regex-automata/src/util/alphabet.rs b/vendor/regex-automata/src/util/alphabet.rs new file mode 100644 index 000000000..0bc1ece58 --- /dev/null +++ b/vendor/regex-automata/src/util/alphabet.rs @@ -0,0 +1,790 @@ +use core::convert::TryFrom; + +use crate::util::{ + bytes::{DeserializeError, SerializeError}, + DebugByte, +}; + +/// Unit represents a single unit of input for DFA based regex engines. +/// +/// **NOTE:** It is not expected for consumers of this crate to need to use +/// this type unless they are implementing their own DFA. And even then, it's +/// not required: implementors may use other techniques to handle input. +/// +/// Typically, a single unit of input for a DFA would be a single byte. +/// However, for the DFAs in this crate, matches are delayed by a single byte +/// in order to handle look-ahead assertions (`\b`, `$` and `\z`). Thus, once +/// we have consumed the haystack, we must run the DFA through one additional +/// transition using an input that indicates the haystack has ended. +/// +/// Since there is no way to represent a sentinel with a `u8` (all possible +/// `u8` values *may* be valid inputs to a DFA), this type explicitly adds +/// room for a sentinel value. +/// +/// The sentinel EOI value is always its own equivalence class and is +/// ultimately represented by adding 1 to the maximum equivalence class value. +/// So for example, the regex `^[a-z]+$` might be split into the following +/// equivalence classes: +/// +/// ```text +/// 0 => [\x00-`] +/// 1 => [a-z] +/// 2 => [{-\xFF] +/// 3 => [EOI] +/// ``` +/// +/// Where EOI is the special sentinel value that is always in its own +/// singleton equivalence class. +#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] +pub enum Unit { + U8(u8), + EOI(u16), +} + +impl Unit { + /// Create a new input unit from a byte value. + /// + /// All possible byte values are legal. However, when creating an input + /// unit for a specific DFA, one should be careful to only construct input + /// units that are in that DFA's alphabet. Namely, one way to compact a + /// DFA's in-memory representation is to collapse its transitions over all + /// possible byte values into a set of equivalence classes. If a DFA uses + /// equivalence classes instead of byte values, then the byte given here + /// should be the equivalence class.
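To make the sentinel scheme concrete, here is a minimal, self-contained sketch (not this crate's API; `class_of` is a hypothetical stand-in for a computed `ByteClasses` table) of the `^[a-z]+$` example above, where bytes map to classes 0 through 2 and class 3 is reserved for EOI:

```rust
// Hypothetical mirror of the Unit/EOI idea; illustrative only.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Unit {
    U8(u8),
    EOI(u16),
}

/// Map a byte to its equivalence class for `^[a-z]+$`.
fn class_of(b: u8) -> u8 {
    match b {
        b'a'..=b'z' => 1, // [a-z]
        0x00..=0x60 => 0, // [\x00-`]
        _ => 2,           // [{-\xFF]
    }
}

fn main() {
    // Feeding "ab" to the DFA means three transitions: two byte units plus
    // the trailing EOI unit, whose class is one past the maximum byte class.
    let units = [Unit::U8(b'a'), Unit::U8(b'b'), Unit::EOI(3)];
    let classes: Vec<u16> = units
        .iter()
        .map(|u| match *u {
            Unit::U8(b) => class_of(b) as u16,
            Unit::EOI(class) => class,
        })
        .collect();
    assert_eq!(classes, vec![1, 1, 3]);
}
```

Since the transition table is indexed by class, the EOI sentinel costs exactly one extra column per state.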
+ pub fn u8(byte: u8) -> Unit { + Unit::U8(byte) + } + + pub fn eoi(num_byte_equiv_classes: usize) -> Unit { + assert!( + num_byte_equiv_classes <= 256, + "max number of byte-based equivalent classes is 256, but got {}", + num_byte_equiv_classes, + ); + Unit::EOI(u16::try_from(num_byte_equiv_classes).unwrap()) + } + + pub fn as_u8(self) -> Option<u8> { + match self { + Unit::U8(b) => Some(b), + Unit::EOI(_) => None, + } + } + + #[cfg(feature = "alloc")] + pub fn as_eoi(self) -> Option<usize> { + match self { + Unit::U8(_) => None, + Unit::EOI(eoi) => Some(eoi as usize), + } + } + + pub fn as_usize(self) -> usize { + match self { + Unit::U8(b) => b as usize, + Unit::EOI(eoi) => eoi as usize, + } + } + + pub fn is_eoi(&self) -> bool { + match *self { + Unit::EOI(_) => true, + _ => false, + } + } + + #[cfg(feature = "alloc")] + pub fn is_word_byte(&self) -> bool { + self.as_u8().map_or(false, crate::util::is_word_byte) + } +} + +impl core::fmt::Debug for Unit { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + match *self { + Unit::U8(b) => write!(f, "{:?}", DebugByte(b)), + Unit::EOI(_) => write!(f, "EOI"), + } + } +} + +/// A representation of byte oriented equivalence classes. +/// +/// This is used in a DFA to reduce the size of the transition table. This can +/// have a particularly large impact not only on the total size of a dense DFA, +/// but also on compile times. +#[derive(Clone, Copy)] +pub struct ByteClasses([u8; 256]); + +impl ByteClasses { + /// Creates a new set of equivalence classes where all bytes are mapped to + /// the same class. + pub fn empty() -> ByteClasses { + ByteClasses([0; 256]) + } + + /// Creates a new set of equivalence classes where each byte belongs to + /// its own equivalence class. + #[cfg(feature = "alloc")] + pub fn singletons() -> ByteClasses { + let mut classes = ByteClasses::empty(); + for i in 0..256 { + classes.set(i as u8, i as u8); + } + classes + } + + /// Deserializes a byte class map from the given slice. If the slice is of + /// insufficient length or otherwise contains an impossible mapping, then + /// an error is returned. Upon success, the number of bytes read along with + /// the map are returned. The number of bytes read is always a multiple of + /// 8. + pub fn from_bytes( + slice: &[u8], + ) -> Result<(ByteClasses, usize), DeserializeError> { + if slice.len() < 256 { + return Err(DeserializeError::buffer_too_small("byte class map")); + } + let mut classes = ByteClasses::empty(); + for (b, &class) in slice[..256].iter().enumerate() { + classes.set(b as u8, class); + } + for b in classes.iter() { + if b.as_usize() >= classes.alphabet_len() { + return Err(DeserializeError::generic( + "found equivalence class greater than alphabet len", + )); + } + } + Ok((classes, 256)) + } + + /// Writes this byte class map to the given byte buffer. if the given + /// buffer is too small, then an error is returned. Upon success, the total + /// number of bytes written is returned. The number of bytes written is + /// guaranteed to be a multiple of 8. + pub fn write_to( + &self, + mut dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("byte class map")); + } + for b in 0..=255 { + dst[0] = self.get(b); + dst = &mut dst[1..]; + } + Ok(nwrite) + } + + /// Returns the total number of bytes written by `write_to`. + pub fn write_to_len(&self) -> usize { + 256 + } + + /// Set the equivalence class for the given byte. 
+ #[inline] + pub fn set(&mut self, byte: u8, class: u8) { + self.0[byte as usize] = class; + } + + /// Get the equivalence class for the given byte. + #[inline] + pub fn get(&self, byte: u8) -> u8 { + self.0[byte as usize] + } + + /// Get the equivalence class for the given byte while forcefully + /// eliding bounds checks. + #[inline] + pub unsafe fn get_unchecked(&self, byte: u8) -> u8 { + *self.0.get_unchecked(byte as usize) + } + + /// Get the equivalence class for the given input unit and return the + /// class as a `usize`. + #[inline] + pub fn get_by_unit(&self, unit: Unit) -> usize { + match unit { + Unit::U8(b) => usize::try_from(self.get(b)).unwrap(), + Unit::EOI(b) => usize::try_from(b).unwrap(), + } + } + + #[inline] + pub fn eoi(&self) -> Unit { + Unit::eoi(self.alphabet_len().checked_sub(1).unwrap()) + } + + /// Return the total number of elements in the alphabet represented by + /// these equivalence classes. Equivalently, this returns the total number + /// of equivalence classes. + #[inline] + pub fn alphabet_len(&self) -> usize { + // Add one since the number of equivalence classes is one bigger than + // the last one. But add another to account for the final EOI class + // that isn't explicitly represented. + self.0[255] as usize + 1 + 1 + } + + /// Returns the stride, as a base-2 exponent, required for these + /// equivalence classes. + /// + /// The stride is always the smallest power of 2 that is greater than or + /// equal to the alphabet length. This is done so that converting between + /// state IDs and indices can be done with shifts alone, which is much + /// faster than integer division. + #[cfg(feature = "alloc")] + pub fn stride2(&self) -> usize { + self.alphabet_len().next_power_of_two().trailing_zeros() as usize + } + + /// Returns true if and only if every byte maps to its own equivalence + /// class. Equivalently, there are 257 equivalence classes and each class + /// contains exactly one byte (plus the special EOI class). + #[inline] + pub fn is_singleton(&self) -> bool { + self.alphabet_len() == 257 + } + + /// Returns an iterator over all equivalence classes in this set. + pub fn iter(&self) -> ByteClassIter<'_> { + ByteClassIter { classes: self, i: 0 } + } + + /// Returns an iterator over a sequence of representative bytes from each + /// equivalence class. Namely, this yields exactly N items, where N is + /// equivalent to the number of equivalence classes. Each item is an + /// arbitrary byte drawn from each equivalence class. + /// + /// This is useful when one is determinizing an NFA and the NFA's alphabet + /// hasn't been converted to equivalence classes yet. Picking an arbitrary + /// byte from each equivalence class then permits a full exploration of + /// the NFA instead of using every possible byte value. + #[cfg(feature = "alloc")] + pub fn representatives(&self) -> ByteClassRepresentatives<'_> { + ByteClassRepresentatives { classes: self, byte: 0, last_class: None } + } + + /// Returns an iterator of the bytes in the given equivalence class. + pub fn elements(&self, class: Unit) -> ByteClassElements { + ByteClassElements { classes: self, class, byte: 0 } + } + + /// Returns an iterator of byte ranges in the given equivalence class. + /// + /// That is, a sequence of contiguous ranges is returned. Typically, every + /// class maps to a single contiguous range.
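The shift-based indexing that `stride2` enables deserves a worked example. The sketch below assumes the dense layout implied above (one row of `1 << stride2` transitions per state); the names are illustrative, not the crate's internals:

```rust
// Why the stride is rounded up to a power of two: a row lookup in the
// transition table becomes a shift plus an add instead of a multiply.
fn stride2(alphabet_len: usize) -> usize {
    alphabet_len.next_power_of_two().trailing_zeros() as usize
}

fn main() {
    let alphabet_len = 5; // e.g. four byte classes plus EOI
    let s2 = stride2(alphabet_len);
    assert_eq!(s2, 3); // next_power_of_two(5) == 8 == 1 << 3

    // Transition for equivalence class `class` out of state `sid`:
    let (sid, class) = (7usize, 2usize);
    let index = (sid << s2) + class; // instead of sid * alphabet_len + class
    assert_eq!(index, 58);
}
```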
+ fn element_ranges(&self, class: Unit) -> ByteClassElementRanges { + ByteClassElementRanges { elements: self.elements(class), range: None } + } +} + +impl core::fmt::Debug for ByteClasses { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + if self.is_singleton() { + write!(f, "ByteClasses({{singletons}})") + } else { + write!(f, "ByteClasses(")?; + for (i, class) in self.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{:?} => [", class.as_usize())?; + for (start, end) in self.element_ranges(class) { + if start == end { + write!(f, "{:?}", start)?; + } else { + write!(f, "{:?}-{:?}", start, end)?; + } + } + write!(f, "]")?; + } + write!(f, ")") + } + } +} + +/// An iterator over each equivalence class. +#[derive(Debug)] +pub struct ByteClassIter<'a> { + classes: &'a ByteClasses, + i: usize, +} + +impl<'a> Iterator for ByteClassIter<'a> { + type Item = Unit; + + fn next(&mut self) -> Option<Unit> { + if self.i + 1 == self.classes.alphabet_len() { + self.i += 1; + Some(self.classes.eoi()) + } else if self.i < self.classes.alphabet_len() { + let class = self.i as u8; + self.i += 1; + Some(Unit::u8(class)) + } else { + None + } + } +} + +/// An iterator over representative bytes from each equivalence class. +#[cfg(feature = "alloc")] +#[derive(Debug)] +pub struct ByteClassRepresentatives<'a> { + classes: &'a ByteClasses, + byte: usize, + last_class: Option<u8>, +} + +#[cfg(feature = "alloc")] +impl<'a> Iterator for ByteClassRepresentatives<'a> { + type Item = Unit; + + fn next(&mut self) -> Option<Unit> { + while self.byte < 256 { + let byte = self.byte as u8; + let class = self.classes.get(byte); + self.byte += 1; + + if self.last_class != Some(class) { + self.last_class = Some(class); + return Some(Unit::u8(byte)); + } + } + if self.byte == 256 { + self.byte += 1; + return Some(self.classes.eoi()); + } + None + } +} + +/// An iterator over all elements in an equivalence class. +#[derive(Debug)] +pub struct ByteClassElements<'a> { + classes: &'a ByteClasses, + class: Unit, + byte: usize, +} + +impl<'a> Iterator for ByteClassElements<'a> { + type Item = Unit; + + fn next(&mut self) -> Option<Unit> { + while self.byte < 256 { + let byte = self.byte as u8; + self.byte += 1; + if self.class.as_u8() == Some(self.classes.get(byte)) { + return Some(Unit::u8(byte)); + } + } + if self.byte < 257 { + self.byte += 1; + if self.class.is_eoi() { + return Some(Unit::eoi(256)); + } + } + None + } +} + +/// An iterator over all elements in an equivalence class expressed as a +/// sequence of contiguous ranges. +#[derive(Debug)] +pub struct ByteClassElementRanges<'a> { + elements: ByteClassElements<'a>, + range: Option<(Unit, Unit)>, +} + +impl<'a> Iterator for ByteClassElementRanges<'a> { + type Item = (Unit, Unit); + + fn next(&mut self) -> Option<(Unit, Unit)> { + loop { + let element = match self.elements.next() { + None => return self.range.take(), + Some(element) => element, + }; + match self.range.take() { + None => { + self.range = Some((element, element)); + } + Some((start, end)) => { + if end.as_usize() + 1 != element.as_usize() + || element.is_eoi() + { + self.range = Some((element, element)); + return Some((start, end)); + } + self.range = Some((start, element)); + } + } + } + } +} + +/// A byte class set keeps track of an *approximation* of equivalence classes +/// of bytes during NFA construction. That is, every byte in an equivalence +/// class cannot discriminate between a match and a non-match. 
+/// +/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the +/// same equivalence class because it never matters whether an `a` or a `b` is +/// seen, and no combination of `a`s and `b`s in the text can discriminate a +/// match. +/// +/// Note though that this does not compute the minimal set of equivalence +/// classes. For example, in the regex `[ac]+`, both `a` and `c` could be in +/// the same equivalence class for the same reason that `a` and `b` are in the +/// same equivalence class in the aforementioned regex. However, in this +/// implementation, `a` and `c` are put into distinct equivalence classes. The +/// reason for this is implementation complexity. In the future, we should +/// endeavor to compute the minimal equivalence classes since they can have a +/// rather large impact on the size of the DFA. (Doing this will likely require +/// rethinking how equivalence classes are computed, including changing the +/// representation here, which is only able to group contiguous bytes into the +/// same equivalence class.) +#[derive(Clone, Debug)] +pub struct ByteClassSet(ByteSet); + +impl ByteClassSet { + /// Create a new set of byte classes where all bytes are part of the same + /// equivalence class. + #[cfg(feature = "alloc")] + pub fn empty() -> Self { + ByteClassSet(ByteSet::empty()) + } + + /// Indicate that the given inclusive range of bytes can discriminate a + /// match between it and all other bytes outside of the range. + #[cfg(feature = "alloc")] + pub fn set_range(&mut self, start: u8, end: u8) { + debug_assert!(start <= end); + if start > 0 { + self.0.add(start - 1); + } + self.0.add(end); + } + + /// Add the contiguous ranges in the set given to this byte class set. + #[cfg(feature = "alloc")] + pub fn add_set(&mut self, set: &ByteSet) { + for (start, end) in set.iter_ranges() { + self.set_range(start, end); + } + } + + /// Convert this boolean set to a map that maps all byte values to their + /// corresponding equivalence class. The last mapping indicates the largest + /// equivalence class identifier (which is never bigger than 255). + #[cfg(feature = "alloc")] + pub fn byte_classes(&self) -> ByteClasses { + let mut classes = ByteClasses::empty(); + let mut class = 0u8; + let mut b = 0u8; + loop { + classes.set(b, class); + if b == 255 { + break; + } + if self.0.contains(b) { + class = class.checked_add(1).unwrap(); + } + b = b.checked_add(1).unwrap(); + } + classes + } +} + +/// A simple set of bytes that is reasonably cheap to copy and allocation free. +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub struct ByteSet { + bits: BitSet, +} + +/// The representation of a byte set. Split out so that we can define a +/// convenient Debug impl for it while keeping "ByteSet" in the output. +#[derive(Clone, Copy, Default, Eq, PartialEq)] +struct BitSet([u128; 2]); + +impl ByteSet { + /// Create an empty set of bytes. + #[cfg(feature = "alloc")] + pub fn empty() -> ByteSet { + ByteSet { bits: BitSet([0; 2]) } + } + + /// Add a byte to this set. + /// + /// If the given byte already belongs to this set, then this is a no-op. + #[cfg(feature = "alloc")] + pub fn add(&mut self, byte: u8) { + let bucket = byte / 128; + let bit = byte % 128; + self.bits.0[bucket as usize] |= 1 << bit; + } + + /// Add an inclusive range of bytes. + #[cfg(feature = "alloc")] + pub fn add_all(&mut self, start: u8, end: u8) { + for b in start..=end { + self.add(b); + } + } + + /// Remove a byte from this set.
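The `set_range`/`byte_classes` pair above amounts to a boundary-marking trick: marking byte `b` means "a new class starts at `b + 1`". Here is a standalone sketch of that scan, with a plain `bool` array standing in for the crate's 256-bit `ByteSet`:

```rust
// Sketch of the boundary scan performed by `byte_classes` above.
fn byte_classes(boundaries: &[bool; 256]) -> [u8; 256] {
    let mut classes = [0u8; 256];
    let mut class = 0u8;
    for b in 0..=255usize {
        classes[b] = class;
        if boundaries[b] {
            class += 1; // a new equivalence class begins at b + 1
        }
    }
    classes
}

fn main() {
    let mut boundaries = [false; 256];
    // set_range(b'a', b'z') marks `a - 1` and `z` as boundaries.
    boundaries[(b'a' - 1) as usize] = true;
    boundaries[b'z' as usize] = true;

    let classes = byte_classes(&boundaries);
    assert_eq!(classes[0x60], 0);           // [\x00-`]
    assert_eq!(classes[b'm' as usize], 1);  // [a-z]
    assert_eq!(classes[b'{' as usize], 2);  // [{-\xFF]
}
```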
+ /// + /// If the given byte is not in this set, then this is a no-op. + #[cfg(feature = "alloc")] + pub fn remove(&mut self, byte: u8) { + let bucket = byte / 128; + let bit = byte % 128; + self.bits.0[bucket as usize] &= !(1 << bit); + } + + /// Remove an inclusive range of bytes. + #[cfg(feature = "alloc")] + pub fn remove_all(&mut self, start: u8, end: u8) { + for b in start..=end { + self.remove(b); + } + } + + /// Return true if and only if the given byte is in this set. + pub fn contains(&self, byte: u8) -> bool { + let bucket = byte / 128; + let bit = byte % 128; + self.bits.0[bucket as usize] & (1 << bit) > 0 + } + + /// Return true if and only if the given inclusive range of bytes is in + /// this set. + #[cfg(feature = "alloc")] + pub fn contains_range(&self, start: u8, end: u8) -> bool { + (start..=end).all(|b| self.contains(b)) + } + + /// Returns an iterator over all bytes in this set. + #[cfg(feature = "alloc")] + pub fn iter(&self) -> ByteSetIter { + ByteSetIter { set: self, b: 0 } + } + + /// Returns an iterator over all contiguous ranges of bytes in this set. + #[cfg(feature = "alloc")] + pub fn iter_ranges(&self) -> ByteSetRangeIter { + ByteSetRangeIter { set: self, b: 0 } + } + + /// Return the number of bytes in this set. + #[cfg(feature = "alloc")] + pub fn len(&self) -> usize { + (self.bits.0[0].count_ones() + self.bits.0[1].count_ones()) as usize + } + + /// Return true if and only if this set is empty. + #[cfg(feature = "alloc")] + pub fn is_empty(&self) -> bool { + self.bits.0 == [0, 0] + } +} + +impl core::fmt::Debug for BitSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut fmtd = f.debug_set(); + for b in (0..256).map(|b| b as u8) { + if (ByteSet { bits: *self }).contains(b) { + fmtd.entry(&b); + } + } + fmtd.finish() + } +} + +#[derive(Debug)] +pub struct ByteSetIter<'a> { + set: &'a ByteSet, + b: usize, +} + +impl<'a> Iterator for ByteSetIter<'a> { + type Item = u8; + + fn next(&mut self) -> Option<u8> { + while self.b <= 255 { + let b = self.b as u8; + self.b += 1; + if self.set.contains(b) { + return Some(b); + } + } + None + } +} + +#[derive(Debug)] +pub struct ByteSetRangeIter<'a> { + set: &'a ByteSet, + b: usize, +} + +impl<'a> Iterator for ByteSetRangeIter<'a> { + type Item = (u8, u8); + + fn next(&mut self) -> Option<(u8, u8)> { + while self.b <= 255 { + let start = self.b as u8; + self.b += 1; + if !self.set.contains(start) { + continue; + } + + let mut end = start; + while self.b <= 255 && self.set.contains(self.b as u8) { + end = self.b as u8; + self.b += 1; + } + return Some((start, end)); + } + None + } +} + +#[cfg(test)] +#[cfg(feature = "alloc")] +mod tests { + use alloc::{vec, vec::Vec}; + + use super::*; + + #[test] + fn byte_classes() { + let mut set = ByteClassSet::empty(); + set.set_range(b'a', b'z'); + + let classes = set.byte_classes(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(b'a' - 1), 0); + assert_eq!(classes.get(b'a'), 1); + assert_eq!(classes.get(b'm'), 1); + assert_eq!(classes.get(b'z'), 1); + assert_eq!(classes.get(b'z' + 1), 2); + assert_eq!(classes.get(254), 2); + assert_eq!(classes.get(255), 2); + + let mut set = ByteClassSet::empty(); + set.set_range(0, 2); + set.set_range(4, 6); + let classes = set.byte_classes(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(3), 1); + assert_eq!(classes.get(4), 2); + assert_eq!(classes.get(5), 2); + 
assert_eq!(classes.get(6), 2); + assert_eq!(classes.get(7), 3); + assert_eq!(classes.get(255), 3); + } + + #[test] + fn full_byte_classes() { + let mut set = ByteClassSet::empty(); + for i in 0..256u16 { + set.set_range(i as u8, i as u8); + } + assert_eq!(set.byte_classes().alphabet_len(), 257); + } + + #[test] + fn elements_typical() { + let mut set = ByteClassSet::empty(); + set.set_range(b'b', b'd'); + set.set_range(b'g', b'm'); + set.set_range(b'z', b'z'); + let classes = set.byte_classes(); + // class 0: \x00-a + // class 1: b-d + // class 2: e-f + // class 3: g-m + // class 4: n-y + // class 5: z-z + // class 6: \x7B-\xFF + // class 7: EOI + assert_eq!(classes.alphabet_len(), 8); + + let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>(); + assert_eq!(elements.len(), 98); + assert_eq!(elements[0], Unit::u8(b'\x00')); + assert_eq!(elements[97], Unit::u8(b'a')); + + let elements = classes.elements(Unit::u8(1)).collect::<Vec<_>>(); + assert_eq!( + elements, + vec![Unit::u8(b'b'), Unit::u8(b'c'), Unit::u8(b'd')], + ); + + let elements = classes.elements(Unit::u8(2)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::u8(b'e'), Unit::u8(b'f')],); + + let elements = classes.elements(Unit::u8(3)).collect::<Vec<_>>(); + assert_eq!( + elements, + vec![ + Unit::u8(b'g'), + Unit::u8(b'h'), + Unit::u8(b'i'), + Unit::u8(b'j'), + Unit::u8(b'k'), + Unit::u8(b'l'), + Unit::u8(b'm'), + ], + ); + + let elements = classes.elements(Unit::u8(4)).collect::<Vec<_>>(); + assert_eq!(elements.len(), 12); + assert_eq!(elements[0], Unit::u8(b'n')); + assert_eq!(elements[11], Unit::u8(b'y')); + + let elements = classes.elements(Unit::u8(5)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::u8(b'z')]); + + let elements = classes.elements(Unit::u8(6)).collect::<Vec<_>>(); + assert_eq!(elements.len(), 133); + assert_eq!(elements[0], Unit::u8(b'\x7B')); + assert_eq!(elements[132], Unit::u8(b'\xFF')); + + let elements = classes.elements(Unit::eoi(7)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::eoi(256)]); + } + + #[test] + fn elements_singletons() { + let classes = ByteClasses::singletons(); + assert_eq!(classes.alphabet_len(), 257); + + let elements = classes.elements(Unit::u8(b'a')).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::u8(b'a')]); + + let elements = classes.elements(Unit::eoi(5)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::eoi(256)]); + } + + #[test] + fn elements_empty() { + let classes = ByteClasses::empty(); + assert_eq!(classes.alphabet_len(), 2); + + let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>(); + assert_eq!(elements.len(), 256); + assert_eq!(elements[0], Unit::u8(b'\x00')); + assert_eq!(elements[255], Unit::u8(b'\xFF')); + + let elements = classes.elements(Unit::eoi(1)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::eoi(256)]); + } +} diff --git a/vendor/regex-automata/src/util/bytes.rs b/vendor/regex-automata/src/util/bytes.rs new file mode 100644 index 000000000..5877bb149 --- /dev/null +++ b/vendor/regex-automata/src/util/bytes.rs @@ -0,0 +1,950 @@ +/* +A collection of helper functions, types and traits for serializing automata. + +This crate defines its own bespoke serialization mechanism for some structures +provided in the public API, namely, DFAs. A bespoke mechanism was developed +primarily because structures like automata demand a specific binary format. +Attempting to encode their rich structure in an existing serialization +format is just not feasible. 
Moreover, the format for each structure is generally designed such that deserialization is cheap. More specifically, that deserialization can be done in constant time. (The idea being that you can embed it into your binary or mmap it, and then use it immediately.) + +In order to achieve this, most of the structures in this crate use an in-memory representation that very closely corresponds to their binary serialized form. This pervades and complicates everything, and in some cases, requires dealing with alignment and reasoning about safety. + +This technique does have major advantages. In particular, it permits doing the potentially costly work of compiling a finite state machine in an offline manner, and then loading it at runtime not only without having to re-compile the regex, but even without the code required to do the compilation. This, for example, permits one to use a pre-compiled DFA not only in environments without Rust's standard library, but also in environments without a heap. + +In the code below, whenever we insert some kind of padding, it's to enforce a 4-byte alignment, unless otherwise noted. This is because u32 is the only state ID type supported. (In a previous version of this library, DFAs were generic over the state ID representation.) + +Also, serialization generally requires the caller to specify endianness, whereas deserialization always assumes native endianness (otherwise cheap deserialization would be impossible). This implies that serializing a structure generally requires serializing both its big-endian and little-endian variants, and then loading the correct one based on the target's endianness. +*/ + +use core::{ + cmp, + convert::{TryFrom, TryInto}, + mem::size_of, +}; + +#[cfg(feature = "alloc")] +use alloc::{vec, vec::Vec}; + +use crate::util::id::{PatternID, PatternIDError, StateID, StateIDError}; + +/// An error that occurs when serializing an object from this crate. +/// +/// Serialization, as used in this crate, universally refers to the process +/// of transforming a structure (like a DFA) into a custom binary format +/// represented by `&[u8]`. To this end, serialization is generally infallible. +/// However, it can fail when caller provided buffer sizes are too small. When +/// that occurs, a serialization error is reported. +/// +/// A `SerializeError` provides no introspection capabilities. Its only +/// supported operation is conversion to a human readable error message. +/// +/// This error type implements the `std::error::Error` trait only when the +/// `std` feature is enabled. Otherwise, this type is defined in all +/// configurations. +#[derive(Debug)] +pub struct SerializeError { + /// The name of the thing that a buffer is too small for. + /// + /// Currently, the only kind of serialization error is one that is + /// committed by a caller: providing a destination buffer that is too + /// small to fit the serialized object. This makes sense conceptually, + /// since every valid inhabitant of a type should be serializable. + /// + /// This is somewhat exposed in the public API of this crate. For example, + /// the `to_bytes_{big,little}_endian` APIs return a `Vec<u8>` and are + /// guaranteed to never panic or error. This is only possible because the + /// implementation guarantees that it will allocate a `Vec<u8>` that is + /// big enough. + /// + /// In summary, if a new serialization error kind needs to be added, then + /// it will need careful consideration.
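Since deserialization always assumes native endianness, a typical workflow is the one the module doc describes: serialize both variants offline and choose at load time. Below is a sketch of that pattern with hypothetical `save`/`load` helpers (not crate APIs), using a plain `u32` slice as the stand-in payload:

```rust
// Serialize both endianness variants; the loader picks the native one.
fn save(object: &[u32]) -> (Vec<u8>, Vec<u8>) {
    let le = object.iter().flat_map(|n| n.to_le_bytes()).collect();
    let be = object.iter().flat_map(|n| n.to_be_bytes()).collect();
    (le, be)
}

fn load<'a>(le: &'a [u8], be: &'a [u8]) -> &'a [u8] {
    if cfg!(target_endian = "little") {
        le
    } else {
        be
    }
}

fn main() {
    // 0xFEFF doubles as an endianness check, as in `read_endianness_check`
    // further below.
    let (le, be) = save(&[0xFEFF]);
    let native = load(&le, &be);
    let n = u32::from_ne_bytes(native[..4].try_into().unwrap());
    assert_eq!(n, 0xFEFF);
}
```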
+ what: &'static str, +} + +impl SerializeError { + pub(crate) fn buffer_too_small(what: &'static str) -> SerializeError { + SerializeError { what } + } +} + +impl core::fmt::Display for SerializeError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "destination buffer is too small to write {}", self.what) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for SerializeError {} + +/// An error that occurs when deserializing an object defined in this crate. +/// +/// Serialization, as used in this crate, universally refers to the process +/// of transforming a structure (like a DFA) into a custom binary format +/// represented by `&[u8]`. Deserialization, then, refers to the process of +/// cheaply converting this binary format back to the object's in-memory +/// representation as defined in this crate. To the extent possible, +/// deserialization will report this error whenever this process fails. +/// +/// A `DeserializeError` provides no introspection capabilities. Its only +/// supported operation is conversion to a human readable error message. +/// +/// This error type implements the `std::error::Error` trait only when the +/// `std` feature is enabled. Otherwise, this type is defined in all +/// configurations. +#[derive(Debug)] +pub struct DeserializeError(DeserializeErrorKind); + +#[derive(Debug)] +enum DeserializeErrorKind { + Generic { msg: &'static str }, + BufferTooSmall { what: &'static str }, + InvalidUsize { what: &'static str }, + InvalidVarint { what: &'static str }, + VersionMismatch { expected: u32, found: u32 }, + EndianMismatch { expected: u32, found: u32 }, + AlignmentMismatch { alignment: usize, address: usize }, + LabelMismatch { expected: &'static str }, + ArithmeticOverflow { what: &'static str }, + PatternID { err: PatternIDError, what: &'static str }, + StateID { err: StateIDError, what: &'static str }, +} + +impl DeserializeError { + pub(crate) fn generic(msg: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::Generic { msg }) + } + + pub(crate) fn buffer_too_small(what: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::BufferTooSmall { what }) + } + + pub(crate) fn invalid_usize(what: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::InvalidUsize { what }) + } + + fn invalid_varint(what: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::InvalidVarint { what }) + } + + fn version_mismatch(expected: u32, found: u32) -> DeserializeError { + DeserializeError(DeserializeErrorKind::VersionMismatch { + expected, + found, + }) + } + + fn endian_mismatch(expected: u32, found: u32) -> DeserializeError { + DeserializeError(DeserializeErrorKind::EndianMismatch { + expected, + found, + }) + } + + fn alignment_mismatch( + alignment: usize, + address: usize, + ) -> DeserializeError { + DeserializeError(DeserializeErrorKind::AlignmentMismatch { + alignment, + address, + }) + } + + fn label_mismatch(expected: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::LabelMismatch { expected }) + } + + fn arithmetic_overflow(what: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::ArithmeticOverflow { what }) + } + + pub(crate) fn pattern_id_error( + err: PatternIDError, + what: &'static str, + ) -> DeserializeError { + DeserializeError(DeserializeErrorKind::PatternID { err, what }) + } + + pub(crate) fn state_id_error( + err: StateIDError, + what: &'static str, + ) -> DeserializeError { 
+ DeserializeError(DeserializeErrorKind::StateID { err, what }) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for DeserializeError {} + +impl core::fmt::Display for DeserializeError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use self::DeserializeErrorKind::*; + + match self.0 { + Generic { msg } => write!(f, "{}", msg), + BufferTooSmall { what } => { + write!(f, "buffer is too small to read {}", what) + } + InvalidUsize { what } => { + write!(f, "{} is too big to fit in a usize", what) + } + InvalidVarint { what } => { + write!(f, "could not decode valid varint for {}", what) + } + VersionMismatch { expected, found } => write!( + f, + "unsupported version: \ + expected version {} but found version {}", + expected, found, + ), + EndianMismatch { expected, found } => write!( + f, + "endianness mismatch: expected 0x{:X} but got 0x{:X}. \ + (Are you trying to load an object serialized with a \ + different endianness?)", + expected, found, + ), + AlignmentMismatch { alignment, address } => write!( + f, + "alignment mismatch: slice starts at address \ + 0x{:X}, which is not aligned to a {} byte boundary", + address, alignment, + ), + LabelMismatch { expected } => write!( + f, + "label mismatch: start of serialized object should \ + contain a NUL terminated {:?} label, but a different \ + label was found", + expected, + ), + ArithmeticOverflow { what } => { + write!(f, "arithmetic overflow for {}", what) + } + PatternID { ref err, what } => { + write!(f, "failed to read pattern ID for {}: {}", what, err) + } + StateID { ref err, what } => { + write!(f, "failed to read state ID for {}: {}", what, err) + } + } + } +} + +/// Checks that the given slice has an alignment that matches `T`. +/// +/// This is useful for checking that a slice has an appropriate alignment +/// before casting it to a &[T]. Note though that alignment is not itself +/// sufficient to perform the cast for any `T`. +pub fn check_alignment<T>(slice: &[u8]) -> Result<(), DeserializeError> { + let alignment = core::mem::align_of::<T>(); + let address = slice.as_ptr() as usize; + if address % alignment == 0 { + return Ok(()); + } + Err(DeserializeError::alignment_mismatch(alignment, address)) +} + +/// Reads a possibly empty amount of padding, up to 7 bytes, from the beginning +/// of the given slice. All padding bytes must be NUL bytes. +/// +/// This is useful because it can be theoretically necessary to pad the +/// beginning of a serialized object with NUL bytes to ensure that it starts +/// at a correctly aligned address. These padding bytes should come immediately +/// before the label. +/// +/// This returns the number of bytes read from the given slice. +pub fn skip_initial_padding(slice: &[u8]) -> usize { + let mut nread = 0; + while nread < 7 && nread < slice.len() && slice[nread] == 0 { + nread += 1; + } + nread +} + +/// Allocate a byte buffer of the given size, along with some initial padding +/// such that `buf[padding..]` has the same alignment as `T`, where the +/// alignment of `T` must be at most `8`. In particular, callers should treat +/// the first N bytes (second return value) as padding bytes that must not be +/// overwritten. In all cases, the following identity holds: +/// +/// ```ignore +/// let (buf, padding) = alloc_aligned_buffer::<StateID>(SIZE); +/// assert_eq!(SIZE, buf[padding..].len()); +/// ``` +/// +/// In practice, padding is often zero. +/// +/// The requirement for `8` as a maximum here is somewhat arbitrary. 
In +/// practice, we never need anything bigger in this crate, and so this function +/// does some sanity asserts under the assumption of a max alignment of `8`. +#[cfg(feature = "alloc")] +pub fn alloc_aligned_buffer<T>(size: usize) -> (Vec<u8>, usize) { + // FIXME: This is a kludge because there's no easy way to allocate a + // Vec<u8> with an alignment guaranteed to be greater than 1. We could + // create a Vec<u32>, but this cannot be safely transmuted to a Vec<u8> + // without concern, since reallocing or dropping the Vec<u8> is UB + // (different alignment than the initial allocation). We could define a + // wrapper type to manage this for us, but it seems like more machinery + // than it's worth. + let mut buf = vec![0; size]; + let align = core::mem::align_of::<T>(); + let address = buf.as_ptr() as usize; + if address % align == 0 { + return (buf, 0); + } + // It's not quite clear how to robustly test this code, since the allocator + // in my environment appears to always return addresses aligned to at + // least 8 bytes, even when the alignment requirement is smaller. A feeble + // attempt at ensuring correctness is provided with asserts. + let padding = ((address & !0b111).checked_add(8).unwrap()) + .checked_sub(address) + .unwrap(); + assert!(padding <= 7, "padding of {} is bigger than 7", padding); + buf.extend(core::iter::repeat(0).take(padding)); + assert_eq!(size + padding, buf.len()); + assert_eq!( + 0, + buf[padding..].as_ptr() as usize % align, + "expected end of initial padding to be aligned to {}", + align, + ); + (buf, padding) +} + +/// Reads a NUL terminated label starting at the beginning of the given slice. +/// +/// If a NUL terminated label could not be found, then an error is returned. +/// Similarly, if a label is found but doesn't match the expected label, then +/// an error is returned. +/// +/// Upon success, the total number of bytes read (including padding bytes) is +/// returned. +pub fn read_label( + slice: &[u8], + expected_label: &'static str, +) -> Result<usize, DeserializeError> { + // Set an upper bound on how many bytes we scan for a NUL. Since no label + // in this crate is longer than 256 bytes, if we can't find one within that + // range, then we have corrupted data. + let first_nul = + slice[..cmp::min(slice.len(), 256)].iter().position(|&b| b == 0); + let first_nul = match first_nul { + Some(first_nul) => first_nul, + None => { + return Err(DeserializeError::generic( + "could not find NUL terminated label \ + at start of serialized object", + )); + } + }; + let len = first_nul + padding_len(first_nul); + if slice.len() < len { + return Err(DeserializeError::generic( + "could not find properly sized label at start of serialized object" + )); + } + if expected_label.as_bytes() != &slice[..first_nul] { + return Err(DeserializeError::label_mismatch(expected_label)); + } + Ok(len) +} + +/// Writes the given label to the buffer as a NUL terminated string. The label +/// given must not contain NUL, otherwise this will panic. Similarly, the label +/// must not be longer than 255 bytes, otherwise this will panic. +/// +/// Additional NUL bytes are written as necessary to ensure that the number of +/// bytes written is always a multiple of 4. +/// +/// Upon success, the total number of bytes written (including padding) is +/// returned.
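As a worked example of the label format just described (standalone, mirroring `write_label`/`read_label` rather than calling them): a five-byte label gets one NUL terminator plus two padding NULs, bringing the total to a multiple of 4.

```rust
fn padding_len(non_padding_len: usize) -> usize {
    (4 - (non_padding_len & 0b11)) & 0b11
}

fn main() {
    let label = "fooba";
    let len_with_nul = label.len() + 1; // 6
    let total = len_with_nul + padding_len(len_with_nul); // 6 + 2 == 8

    let mut buf = vec![0u8; total];
    buf[..label.len()].copy_from_slice(label.as_bytes());
    // Bytes 5..8 stay NUL: the terminator plus two padding bytes.
    assert_eq!(&buf[..], b"fooba\x00\x00\x00");
    assert_eq!(total % 4, 0);
}
```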
+pub fn write_label( + label: &str, + dst: &mut [u8], +) -> Result<usize, SerializeError> { + let nwrite = write_label_len(label); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("label")); + } + dst[..label.len()].copy_from_slice(label.as_bytes()); + for i in 0..(nwrite - label.len()) { + dst[label.len() + i] = 0; + } + assert_eq!(nwrite % 4, 0); + Ok(nwrite) +} + +/// Returns the total number of bytes (including padding) that would be written +/// for the given label. This panics if the given label contains a NUL byte or +/// is longer than 255 bytes. (The size restriction exists so that searching +/// for a label during deserialization can be done in small bounded space.) +pub fn write_label_len(label: &str) -> usize { + if label.len() > 255 { + panic!("label must not be longer than 255 bytes"); + } + if label.as_bytes().iter().position(|&b| b == 0).is_some() { + panic!("label must not contain NUL bytes"); + } + let label_len = label.len() + 1; // +1 for the NUL terminator + label_len + padding_len(label_len) +} + +/// Reads the endianness check from the beginning of the given slice and +/// confirms that the endianness of the serialized object matches the expected +/// endianness. If the slice is too small or if the endianness check fails, +/// this returns an error. +/// +/// Upon success, the total number of bytes read is returned. +pub fn read_endianness_check(slice: &[u8]) -> Result<usize, DeserializeError> { + let (n, nr) = try_read_u32(slice, "endianness check")?; + assert_eq!(nr, write_endianness_check_len()); + if n != 0xFEFF { + return Err(DeserializeError::endian_mismatch(0xFEFF, n)); + } + Ok(nr) +} + +/// Writes 0xFEFF as an integer using the given endianness. +/// +/// This is useful for writing into the header of a serialized object. It can +/// be read during deserialization as a sanity check to ensure the proper +/// endianness is used. +/// +/// Upon success, the total number of bytes written is returned. +pub fn write_endianness_check<E: Endian>( + dst: &mut [u8], +) -> Result<usize, SerializeError> { + let nwrite = write_endianness_check_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("endianness check")); + } + E::write_u32(0xFEFF, dst); + Ok(nwrite) +} + +/// Returns the number of bytes written by the endianness check. +pub fn write_endianness_check_len() -> usize { + size_of::<u32>() +} + +/// Reads a version number from the beginning of the given slice and confirms +/// that it matches the expected version number given. If the slice is too +/// small or if the version numbers aren't equivalent, this returns an error. +/// +/// Upon success, the total number of bytes read is returned. +/// +/// N.B. Currently, we require that the version number is exactly equivalent. +/// In the future, if we bump the version number without a semver bump, then +/// we'll need to relax this a bit and support older versions. +pub fn read_version( + slice: &[u8], + expected_version: u32, +) -> Result<usize, DeserializeError> { + let (n, nr) = try_read_u32(slice, "version")?; + assert_eq!(nr, write_version_len()); + if n != expected_version { + return Err(DeserializeError::version_mismatch(expected_version, n)); + } + Ok(nr) +} + +/// Writes the given version number to the beginning of the given slice. +/// +/// This is useful for writing into the header of a serialized object. It can +/// be read during deserialization as a sanity check to ensure that the library +/// code supports the format of the serialized object.
+/// +/// Upon success, the total number of bytes written is returned. +pub fn write_version<E: Endian>( + version: u32, + dst: &mut [u8], +) -> Result<usize, SerializeError> { + let nwrite = write_version_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("version number")); + } + E::write_u32(version, dst); + Ok(nwrite) +} + +/// Returns the number of bytes written by writing the version number. +pub fn write_version_len() -> usize { + size_of::<u32>() +} + +/// Reads a pattern ID from the given slice. If the slice has insufficient +/// length, then this panics. If the deserialized integer exceeds the pattern +/// ID limit for the current target, then this returns an error. +/// +/// Upon success, this also returns the number of bytes read. +pub fn read_pattern_id( + slice: &[u8], + what: &'static str, +) -> Result<(PatternID, usize), DeserializeError> { + let bytes: [u8; PatternID::SIZE] = + slice[..PatternID::SIZE].try_into().unwrap(); + let pid = PatternID::from_ne_bytes(bytes) + .map_err(|err| DeserializeError::pattern_id_error(err, what))?; + Ok((pid, PatternID::SIZE)) +} + +/// Reads a pattern ID from the given slice. If the slice has insufficient +/// length, then this panics. Otherwise, the deserialized integer is assumed +/// to be a valid pattern ID. +/// +/// This also returns the number of bytes read. +pub fn read_pattern_id_unchecked(slice: &[u8]) -> (PatternID, usize) { + let pid = PatternID::from_ne_bytes_unchecked( + slice[..PatternID::SIZE].try_into().unwrap(), + ); + (pid, PatternID::SIZE) +} + +/// Write the given pattern ID to the beginning of the given slice of bytes +/// using the specified endianness. The given slice must have length at least +/// `PatternID::SIZE`, or else this panics. Upon success, the total number of +/// bytes written is returned. +pub fn write_pattern_id<E: Endian>(pid: PatternID, dst: &mut [u8]) -> usize { + E::write_u32(pid.as_u32(), dst); + PatternID::SIZE +} + +/// Attempts to read a state ID from the given slice. If the slice has an +/// insufficient number of bytes or if the state ID exceeds the limit for +/// the current target, then this returns an error. +/// +/// Upon success, this also returns the number of bytes read. +pub fn try_read_state_id( + slice: &[u8], + what: &'static str, +) -> Result<(StateID, usize), DeserializeError> { + if slice.len() < StateID::SIZE { + return Err(DeserializeError::buffer_too_small(what)); + } + read_state_id(slice, what) +} + +/// Reads a state ID from the given slice. If the slice has insufficient +/// length, then this panics. If the deserialized integer exceeds the state ID +/// limit for the current target, then this returns an error. +/// +/// Upon success, this also returns the number of bytes read. +pub fn read_state_id( + slice: &[u8], + what: &'static str, +) -> Result<(StateID, usize), DeserializeError> { + let bytes: [u8; StateID::SIZE] = + slice[..StateID::SIZE].try_into().unwrap(); + let sid = StateID::from_ne_bytes(bytes) + .map_err(|err| DeserializeError::state_id_error(err, what))?; + Ok((sid, StateID::SIZE)) +} + +/// Reads a state ID from the given slice. If the slice has insufficient +/// length, then this panics. Otherwise, the deserialized integer is assumed +/// to be a valid state ID. +/// +/// This also returns the number of bytes read. 
+pub fn read_state_id_unchecked(slice: &[u8]) -> (StateID, usize) { + let sid = StateID::from_ne_bytes_unchecked( + slice[..StateID::SIZE].try_into().unwrap(), + ); + (sid, StateID::SIZE) +} + +/// Write the given state ID to the beginning of the given slice of bytes +/// using the specified endianness. The given slice must have length at least +/// `StateID::SIZE`, or else this panics. Upon success, the total number of +/// bytes written is returned. +pub fn write_state_id<E: Endian>(sid: StateID, dst: &mut [u8]) -> usize { + E::write_u32(sid.as_u32(), dst); + StateID::SIZE +} + +/// Try to read a u16 as a usize from the beginning of the given slice in +/// native endian format. If the slice has fewer than 2 bytes or if the +/// deserialized number cannot be represented by usize, then this returns an +/// error. The error message will include the `what` description of what is +/// being deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub fn try_read_u16_as_usize( + slice: &[u8], + what: &'static str, +) -> Result<(usize, usize), DeserializeError> { + try_read_u16(slice, what).and_then(|(n, nr)| { + usize::try_from(n) + .map(|n| (n, nr)) + .map_err(|_| DeserializeError::invalid_usize(what)) + }) +} + +/// Try to read a u32 as a usize from the beginning of the given slice in +/// native endian format. If the slice has fewer than 4 bytes or if the +/// deserialized number cannot be represented by usize, then this returns an +/// error. The error message will include the `what` description of what is +/// being deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub fn try_read_u32_as_usize( + slice: &[u8], + what: &'static str, +) -> Result<(usize, usize), DeserializeError> { + try_read_u32(slice, what).and_then(|(n, nr)| { + usize::try_from(n) + .map(|n| (n, nr)) + .map_err(|_| DeserializeError::invalid_usize(what)) + }) +} + +/// Try to read a u16 from the beginning of the given slice in native endian +/// format. If the slice has fewer than 2 bytes, then this returns an error. +/// The error message will include the `what` description of what is being +/// deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub fn try_read_u16( + slice: &[u8], + what: &'static str, +) -> Result<(u16, usize), DeserializeError> { + if slice.len() < size_of::<u16>() { + return Err(DeserializeError::buffer_too_small(what)); + } + Ok((read_u16(slice), size_of::<u16>())) +} + +/// Try to read a u32 from the beginning of the given slice in native endian +/// format. If the slice has fewer than 4 bytes, then this returns an error. +/// The error message will include the `what` description of what is being +/// deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub fn try_read_u32( + slice: &[u8], + what: &'static str, +) -> Result<(u32, usize), DeserializeError> { + if slice.len() < size_of::<u32>() { + return Err(DeserializeError::buffer_too_small(what)); + } + Ok((read_u32(slice), size_of::<u32>())) +} + +/// Read a u16 from the beginning of the given slice in native endian format. +/// If the slice has fewer than 2 bytes, then this panics. 
+/// +/// Marked as inline to speed up sparse searching which decodes integers from +/// its automaton at search time. +#[inline(always)] +pub fn read_u16(slice: &[u8]) -> u16 { + let bytes: [u8; 2] = slice[..size_of::<u16>()].try_into().unwrap(); + u16::from_ne_bytes(bytes) +} + +/// Read a u32 from the beginning of the given slice in native endian format. +/// If the slice has fewer than 4 bytes, then this panics. +/// +/// Marked as inline to speed up sparse searching which decodes integers from +/// its automaton at search time. +#[inline(always)] +pub fn read_u32(slice: &[u8]) -> u32 { + let bytes: [u8; 4] = slice[..size_of::<u32>()].try_into().unwrap(); + u32::from_ne_bytes(bytes) +} + +/// Read a u64 from the beginning of the given slice in native endian format. +/// If the slice has fewer than 8 bytes, then this panics. +/// +/// Marked as inline to speed up sparse searching which decodes integers from +/// its automaton at search time. +#[inline(always)] +pub fn read_u64(slice: &[u8]) -> u64 { + let bytes: [u8; 8] = slice[..size_of::<u64>()].try_into().unwrap(); + u64::from_ne_bytes(bytes) +} + +/// Write a variable sized integer and return the total number of bytes +/// written. If the slice was not big enough to contain the bytes, then this +/// returns an error including the "what" description in it. This does no +/// padding. +/// +/// See: https://developers.google.com/protocol-buffers/docs/encoding#varints +#[allow(dead_code)] +pub fn write_varu64( + mut n: u64, + what: &'static str, + dst: &mut [u8], +) -> Result<usize, SerializeError> { + let mut i = 0; + while n >= 0b1000_0000 { + if i >= dst.len() { + return Err(SerializeError::buffer_too_small(what)); + } + dst[i] = (n as u8) | 0b1000_0000; + n >>= 7; + i += 1; + } + if i >= dst.len() { + return Err(SerializeError::buffer_too_small(what)); + } + dst[i] = n as u8; + Ok(i + 1) +} + +/// Returns the total number of bytes that would be written to encode n as a +/// variable sized integer. +/// +/// See: https://developers.google.com/protocol-buffers/docs/encoding#varints +#[allow(dead_code)] +pub fn write_varu64_len(mut n: u64) -> usize { + let mut i = 0; + while n >= 0b1000_0000 { + n >>= 7; + i += 1; + } + i + 1 +} + +/// Like read_varu64, but attempts to cast the result to usize. If the integer +/// cannot fit into a usize, then an error is returned. +#[allow(dead_code)] +pub fn read_varu64_as_usize( + slice: &[u8], + what: &'static str, +) -> Result<(usize, usize), DeserializeError> { + let (n, nread) = read_varu64(slice, what)?; + let n = usize::try_from(n) + .map_err(|_| DeserializeError::invalid_usize(what))?; + Ok((n, nread)) +} + +/// Reads a variable sized integer from the beginning of slice, and returns the +/// integer along with the total number of bytes read. If a valid variable +/// sized integer could not be found, then an error is returned that includes +/// the "what" description in it. +/// +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +#[allow(dead_code)] +pub fn read_varu64( + slice: &[u8], + what: &'static str, +) -> Result<(u64, usize), DeserializeError> { + let mut n: u64 = 0; + let mut shift: u32 = 0; + // The biggest possible value is u64::MAX, which needs all 64 bits which + // requires 10 bytes (because 7 * 9 < 64). We use a limit to avoid reading + // an unnecessary number of bytes.
+ let limit = cmp::min(slice.len(), 10); + for (i, &b) in slice[..limit].iter().enumerate() { + if b < 0b1000_0000 { + return match (b as u64).checked_shl(shift) { + None => Err(DeserializeError::invalid_varint(what)), + Some(b) => Ok((n | b, i + 1)), + }; + } + match ((b as u64) & 0b0111_1111).checked_shl(shift) { + None => return Err(DeserializeError::invalid_varint(what)), + Some(b) => n |= b, + } + shift += 7; + } + Err(DeserializeError::invalid_varint(what)) +} + +/// Checks that the given slice has some minimal length. If it's smaller than +/// the bound given, then a "buffer too small" error is returned with `what` +/// describing what the buffer represents. +pub fn check_slice_len<T>( + slice: &[T], + at_least_len: usize, + what: &'static str, +) -> Result<(), DeserializeError> { + if slice.len() < at_least_len { + return Err(DeserializeError::buffer_too_small(what)); + } + Ok(()) +} + +/// Multiply the given numbers, and on overflow, return an error that includes +/// 'what' in the error message. +/// +/// This is useful when doing arithmetic with untrusted data. +pub fn mul( + a: usize, + b: usize, + what: &'static str, +) -> Result<usize, DeserializeError> { + match a.checked_mul(b) { + Some(c) => Ok(c), + None => Err(DeserializeError::arithmetic_overflow(what)), + } +} + +/// Add the given numbers, and on overflow, return an error that includes +/// 'what' in the error message. +/// +/// This is useful when doing arithmetic with untrusted data. +pub fn add( + a: usize, + b: usize, + what: &'static str, +) -> Result<usize, DeserializeError> { + match a.checked_add(b) { + Some(c) => Ok(c), + None => Err(DeserializeError::arithmetic_overflow(what)), + } +} + +/// Shift `a` left by `b`, and on overflow, return an error that includes +/// 'what' in the error message. +/// +/// This is useful when doing arithmetic with untrusted data. +pub fn shl( + a: usize, + b: usize, + what: &'static str, +) -> Result<usize, DeserializeError> { + let amount = u32::try_from(b) + .map_err(|_| DeserializeError::arithmetic_overflow(what))?; + match a.checked_shl(amount) { + Some(c) => Ok(c), + None => Err(DeserializeError::arithmetic_overflow(what)), + } +} + +/// A simple trait for writing code generic over endianness. +/// +/// This is similar to what byteorder provides, but we only need a very small +/// subset. +pub trait Endian { + /// Writes a u16 to the given destination buffer in a particular + /// endianness. If the destination buffer has a length smaller than 2, then + /// this panics. + fn write_u16(n: u16, dst: &mut [u8]); + + /// Writes a u32 to the given destination buffer in a particular + /// endianness. If the destination buffer has a length smaller than 4, then + /// this panics. + fn write_u32(n: u32, dst: &mut [u8]); + + /// Writes a u64 to the given destination buffer in a particular + /// endianness. If the destination buffer has a length smaller than 8, then + /// this panics. + fn write_u64(n: u64, dst: &mut [u8]); +} + +/// Little endian writing. +pub enum LE {} +/// Big endian writing. 
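The varint format used by `write_varu64`/`read_varu64` above is the LEB128-style encoding from the linked protobuf docs: seven data bits per byte, with the high bit set on every byte except the last. A standalone roundtrip sketch:

```rust
fn encode(mut n: u64, dst: &mut Vec<u8>) {
    while n >= 0b1000_0000 {
        dst.push((n as u8) | 0b1000_0000); // high bit: more bytes follow
        n >>= 7;
    }
    dst.push(n as u8);
}

fn decode(src: &[u8]) -> Option<(u64, usize)> {
    let mut n = 0u64;
    // At most 10 bytes, since 7 * 9 < 64 <= 7 * 10.
    for (i, &b) in src.iter().take(10).enumerate() {
        n |= ((b & 0b0111_1111) as u64).checked_shl(7 * i as u32)?;
        if b < 0b1000_0000 {
            return Some((n, i + 1));
        }
    }
    None
}

fn main() {
    let mut buf = Vec::new();
    encode(300, &mut buf);
    // The classic protobuf example: 300 == 0b10_0101100.
    assert_eq!(buf, vec![0b1010_1100, 0b0000_0010]);
    assert_eq!(decode(&buf), Some((300, 2)));
}
```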
+pub enum BE {} + +#[cfg(target_endian = "little")] +pub type NE = LE; +#[cfg(target_endian = "big")] +pub type NE = BE; + +impl Endian for LE { + fn write_u16(n: u16, dst: &mut [u8]) { + dst[..2].copy_from_slice(&n.to_le_bytes()); + } + + fn write_u32(n: u32, dst: &mut [u8]) { + dst[..4].copy_from_slice(&n.to_le_bytes()); + } + + fn write_u64(n: u64, dst: &mut [u8]) { + dst[..8].copy_from_slice(&n.to_le_bytes()); + } +} + +impl Endian for BE { + fn write_u16(n: u16, dst: &mut [u8]) { + dst[..2].copy_from_slice(&n.to_be_bytes()); + } + + fn write_u32(n: u32, dst: &mut [u8]) { + dst[..4].copy_from_slice(&n.to_be_bytes()); + } + + fn write_u64(n: u64, dst: &mut [u8]) { + dst[..8].copy_from_slice(&n.to_be_bytes()); + } +} + +/// Returns the number of additional bytes required to add to the given length +/// in order to make the total length a multiple of 4. The return value is +/// always less than 4. +pub fn padding_len(non_padding_len: usize) -> usize { + (4 - (non_padding_len & 0b11)) & 0b11 +} + +#[cfg(all(test, feature = "alloc"))] +mod tests { + use super::*; + + #[test] + fn labels() { + let mut buf = [0; 1024]; + + let nwrite = write_label("fooba", &mut buf).unwrap(); + assert_eq!(nwrite, 8); + assert_eq!(&buf[..nwrite], b"fooba\x00\x00\x00"); + + let nread = read_label(&buf, "fooba").unwrap(); + assert_eq!(nread, 8); + } + + #[test] + #[should_panic] + fn bad_label_interior_nul() { + // interior NULs are not allowed + write_label("foo\x00bar", &mut [0; 1024]).unwrap(); + } + + #[test] + fn bad_label_almost_too_long() { + // ok + write_label(&"z".repeat(255), &mut [0; 1024]).unwrap(); + } + + #[test] + #[should_panic] + fn bad_label_too_long() { + // labels longer than 255 bytes are banned + write_label(&"z".repeat(256), &mut [0; 1024]).unwrap(); + } + + #[test] + fn padding() { + assert_eq!(0, padding_len(8)); + assert_eq!(3, padding_len(9)); + assert_eq!(2, padding_len(10)); + assert_eq!(1, padding_len(11)); + assert_eq!(0, padding_len(12)); + assert_eq!(3, padding_len(13)); + assert_eq!(2, padding_len(14)); + assert_eq!(1, padding_len(15)); + assert_eq!(0, padding_len(16)); + } +} diff --git a/vendor/regex-automata/src/util/determinize/mod.rs b/vendor/regex-automata/src/util/determinize/mod.rs new file mode 100644 index 000000000..b384de8e1 --- /dev/null +++ b/vendor/regex-automata/src/util/determinize/mod.rs @@ -0,0 +1,493 @@ +/*! +This module contains types and routines for implementing determinization. + +In this crate, there are at least two places where we implement +determinization: fully ahead-of-time compiled DFAs in the `dfa` module and +lazily compiled DFAs in the `hybrid` module. The stuff in this module +corresponds to the things that are in common between these implementations. + +There are three broad things that our implementations of determinization have +in common, as defined by this module: + +* The classification of start states. That is, whether we're dealing with +word boundaries, line boundaries, etc., is all the same. This also includes +the look-behind assertions that are satisfied by each starting state +classification. + +* The representation of DFA states as sets of NFA states, including +convenience types for building these DFA states that are amenable to reusing +allocations. + +* Routines for the "classical" parts of determinization: computing the +epsilon closure, tracking match states (with corresponding pattern IDs, since +we support multi-pattern finite automata) and, of course, computing the +transition function between states for units of input. 
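Before reading the `next` function below, it may help to see the classical epsilon closure in isolation. This is a toy NFA with a hypothetical three-variant state type, not the crate's `thompson::State` (which also has look-around, capture, and other variants that the real closure must handle):

```rust
// Toy epsilon closure: from a starting NFA state, follow epsilon edges to a
// fixed point, using an explicit stack for depth-first traversal.
#[allow(dead_code)]
enum State {
    Byte { target: usize },             // consumes one unit of input
    Split { alt1: usize, alt2: usize }, // two epsilon edges
    Match,
}

fn epsilon_closure(
    nfa: &[State],
    start: usize,
    seen: &mut [bool],
    stack: &mut Vec<usize>,
) {
    stack.push(start);
    while let Some(id) = stack.pop() {
        if seen[id] {
            continue;
        }
        seen[id] = true;
        if let State::Split { alt1, alt2 } = &nfa[id] {
            stack.push(*alt1);
            stack.push(*alt2);
        }
    }
}

fn main() {
    // 0 -ε-> {1, 2}; 1 -'a'-> 3; 2 and 3 are match states.
    let nfa = [
        State::Split { alt1: 1, alt2: 2 },
        State::Byte { target: 3 },
        State::Match,
        State::Match,
    ];
    let mut seen = vec![false; nfa.len()];
    let mut stack = Vec::new();
    epsilon_closure(&nfa, 0, &mut seen, &mut stack);
    // The DFA state is the set {0, 1, 2}; state 3 still needs input 'a'.
    assert_eq!(seen, [true, true, true, false]);
}
```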
+
+I did consider a couple of alternatives to this particular form of code reuse:
+
+1. Don't do any code reuse. The problem here is that we *really* want both
+forms of determinization to do exactly identical things when it comes to
+their handling of NFA states. While our tests generally ensure this, the code
+is tricky and large enough where not reusing code is a pretty big bummer.
+
+2. Implement all of determinization once and make it generic over fully
+compiled DFAs and lazily compiled DFAs. While I didn't actually try this
+approach, my instinct is that it would be more complex than is needed here.
+And the interface required would be pretty hairy. Instead, I think splitting
+it into logical sub-components works better.
+*/
+
+use alloc::vec::Vec;
+
+pub(crate) use self::state::{
+    State, StateBuilderEmpty, StateBuilderMatches, StateBuilderNFA,
+};
+
+use crate::{
+    nfa::thompson::{self, Look, LookSet},
+    util::{
+        alphabet,
+        id::StateID,
+        matchtypes::MatchKind,
+        sparse_set::{SparseSet, SparseSets},
+        start::Start,
+    },
+};
+
+mod state;
+
+/// Compute the set of all reachable NFA states, including the full epsilon
+/// closure, from a DFA state for a single unit of input. The set of reachable
+/// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned
+/// also includes any look-behind assertions satisfied by `unit`, in addition
+/// to whether it is a match state. For multi-pattern DFAs, the builder will
+/// also include the pattern IDs that match (in the order seen).
+///
+/// `nfa` must be able to resolve any NFA state in `state` and any NFA state
+/// reachable via the epsilon closure of any NFA state in `state`. `sparses`
+/// must have capacity equivalent to `nfa.len()`.
+///
+/// `match_kind` should correspond to the match semantics implemented by the
+/// DFA being built. Generally speaking, for leftmost-first match semantics,
+/// states that appear after the first NFA match state will not be included in
+/// the `StateBuilderNFA` returned since they are impossible to visit.
+///
+/// `sparses` is used as scratch space for NFA traversal. Other than their
+/// capacity requirements (detailed above), there are no requirements on what's
+/// contained within them (if anything). Similarly, what's inside of them once
+/// this routine returns is unspecified.
+///
+/// `stack` must have length 0. It is used as scratch space for depth first
+/// traversal. After returning, it is guaranteed that `stack` will have length
+/// 0.
+///
+/// `state` corresponds to the current DFA state on which one wants to compute
+/// the transition for the input `unit`.
+///
+/// `empty_builder` corresponds to the builder allocation to use to produce a
+/// complete `StateBuilderNFA` state. If the state is not needed (or is already
+/// cached), then it can be cleared and reused without needing to create a new
+/// `State`. The `StateBuilderNFA` state returned is final and ready to be
+/// turned into a `State` if necessary.
+pub(crate) fn next(
+    nfa: &thompson::NFA,
+    match_kind: MatchKind,
+    sparses: &mut SparseSets,
+    stack: &mut Vec<StateID>,
+    state: &State,
+    unit: alphabet::Unit,
+    empty_builder: StateBuilderEmpty,
+) -> StateBuilderNFA {
+    sparses.clear();
+
+    // Put the NFA state IDs into a sparse set in case we need to
+    // re-compute their epsilon closure.
+    //
+    // Doing this state shuffling is technically not necessary unless some
+    // kind of look-around is used in the DFA. 
Some ad hoc experiments
+    // suggested that avoiding this didn't lead to much of an improvement,
+    // but perhaps more rigorous experimentation should be done. And in
+    // particular, avoiding this check requires some light refactoring of
+    // the code below.
+    state.iter_nfa_state_ids(|nfa_id| {
+        sparses.set1.insert(nfa_id);
+    });
+
+    // Compute look-ahead assertions originating from the current state.
+    // Based on the input unit we're transitioning over, some additional
+    // set of assertions may be true. Thus, we re-compute this state's
+    // epsilon closure (but only if necessary).
+    if !state.look_need().is_empty() {
+        // Add look-ahead assertions that are now true based on the current
+        // input unit.
+        let mut look_have = state.look_have().clone();
+        match unit.as_u8() {
+            Some(b'\n') => {
+                look_have.insert(Look::EndLine);
+            }
+            Some(_) => {}
+            None => {
+                look_have.insert(Look::EndText);
+                look_have.insert(Look::EndLine);
+            }
+        }
+        if state.is_from_word() == unit.is_word_byte() {
+            look_have.insert(Look::WordBoundaryUnicodeNegate);
+            look_have.insert(Look::WordBoundaryAsciiNegate);
+        } else {
+            look_have.insert(Look::WordBoundaryUnicode);
+            look_have.insert(Look::WordBoundaryAscii);
+        }
+        // If we have new assertions satisfied that are among the set of
+        // assertions that exist in this state (that is, just because
+        // we added an EndLine assertion above doesn't mean there is an
+        // EndLine conditional epsilon transition in this state), then we
+        // re-compute this state's epsilon closure using the updated set of
+        // assertions.
+        if !look_have
+            .subtract(state.look_have())
+            .intersect(state.look_need())
+            .is_empty()
+        {
+            for nfa_id in &sparses.set1 {
+                epsilon_closure(
+                    nfa,
+                    nfa_id,
+                    look_have,
+                    stack,
+                    &mut sparses.set2,
+                );
+            }
+            sparses.swap();
+            sparses.set2.clear();
+        }
+    }
+
+    // Convert our empty builder into one that can record assertions and match
+    // pattern IDs.
+    let mut builder = empty_builder.into_matches();
+    // Set whether the StartLine look-behind assertion is true for this
+    // transition or not. The look-behind assertion for ASCII word boundaries
+    // is handled below.
+    if nfa.has_any_anchor() {
+        if unit.as_u8().map_or(false, |b| b == b'\n') {
+            // Why only handle StartLine here and not StartText? That's
+            // because StartText can only impact the starting state, which
+            // is special cased in start state handling.
+            builder.look_have().insert(Look::StartLine);
+        }
+    }
+    for nfa_id in &sparses.set1 {
+        match *nfa.state(nfa_id) {
+            thompson::State::Union { .. }
+            | thompson::State::Fail
+            | thompson::State::Look { .. }
+            | thompson::State::Capture { .. } => {}
+            thompson::State::Match { id } => {
+                // Notice here that we are calling the NEW state a match
+                // state if the OLD state we are transitioning from
+                // contains an NFA match state. This is precisely how we
+                // delay all matches by one byte and also what therefore
+                // guarantees that starting states cannot be match states.
+                //
+                // If we didn't delay matches by one byte, then whether
+                // a DFA is a matching state or not would be determined
+                // by whether one of its own constituent NFA states
+                // was a match state. (And that would be done in
+                // 'add_nfa_states'.)
+                //
+                // Also, 'add_match_pattern_id' requires that callers never
+                // pass duplicative pattern IDs. We do in fact uphold that
+                // guarantee here, but it's subtle. In particular, a Thompson
+                // NFA guarantees that each pattern has exactly one match
+                // state. 
Moreover, since we're iterating over the NFA state
+                // IDs in a set, we are guaranteed not to have any duplicative
+                // match states. Thus, it is impossible to add the same pattern
+                // ID more than once.
+                builder.add_match_pattern_id(id);
+                if !match_kind.continue_past_first_match() {
+                    break;
+                }
+            }
+            thompson::State::Range { range: ref r } => {
+                if r.matches_unit(unit) {
+                    epsilon_closure(
+                        nfa,
+                        r.next,
+                        *builder.look_have(),
+                        stack,
+                        &mut sparses.set2,
+                    );
+                }
+            }
+            thompson::State::Sparse(ref sparse) => {
+                if let Some(next) = sparse.matches_unit(unit) {
+                    epsilon_closure(
+                        nfa,
+                        next,
+                        *builder.look_have(),
+                        stack,
+                        &mut sparses.set2,
+                    );
+                }
+            }
+        }
+    }
+    // We only set the word byte if there's a word boundary look-around
+    // anywhere in this regex. Otherwise, there's no point in bloating the
+    // number of states if we don't have one.
+    //
+    // We also only set it when the state has a non-zero number of NFA states.
+    // Otherwise, we could wind up with states that *should* be DEAD states
+    // but are otherwise distinct from DEAD states because of this look-behind
+    // assertion being set. While this can't technically impact correctness *in
+    // theory*, it can create pathological DFAs that consume input until EOI or
+    // a quit byte is seen. Consuming until EOI isn't a correctness problem,
+    // but a (serious) perf problem. Hitting a quit byte, however, could be a
+    // correctness problem since it could cause search routines to report an
+    // error instead of a detected match once the quit state is entered. (The
+    // search routine could be made to be a bit smarter by reporting a match
+    // if one was detected once it enters a quit state (and indeed, the search
+    // routines in this crate do just that), but it seems better to prevent
+    // these things by construction if possible.)
+    if nfa.has_word_boundary()
+        && unit.is_word_byte()
+        && !sparses.set2.is_empty()
+    {
+        builder.set_is_from_word();
+    }
+    let mut builder_nfa = builder.into_nfa();
+    add_nfa_states(nfa, &sparses.set2, &mut builder_nfa);
+    builder_nfa
+}
+
+/// Compute the epsilon closure for the given NFA state. The epsilon closure
+/// consists of all NFA state IDs, including `start_nfa_id`, that can be
+/// reached from `start_nfa_id` without consuming any input. These state IDs
+/// are written to `set` in the order they are visited, but only if they are
+/// not already in `set`. `start_nfa_id` must be a valid state ID for the NFA
+/// given.
+///
+/// `look_have` consists of the satisfied assertions at the current
+/// position. For conditional look-around epsilon transitions, these are
+/// only followed if they are satisfied by `look_have`.
+///
+/// `stack` must have length 0. It is used as scratch space for depth first
+/// traversal. After returning, it is guaranteed that `stack` will have length
+/// 0.
+pub(crate) fn epsilon_closure(
+    nfa: &thompson::NFA,
+    start_nfa_id: StateID,
+    look_have: LookSet,
+    stack: &mut Vec<StateID>,
+    set: &mut SparseSet,
+) {
+    assert!(stack.is_empty());
+    // If this isn't an epsilon state, then the epsilon closure is always just
+    // itself, so there's no need to spin up the machinery below to handle it.
+    if !nfa.state(start_nfa_id).is_epsilon() {
+        set.insert(start_nfa_id);
+        return;
+    }
+
+    stack.push(start_nfa_id);
+    while let Some(mut id) = stack.pop() {
+        // In many cases, we can avoid stack operations when an NFA state only
+        // adds one new state to visit. In that case, we just set our ID to
+        // that state and mush on. 
We only use the stack when an NFA state
+        // introduces multiple new states to visit.
+        loop {
+            // Insert this NFA state, and if it's already in the set and thus
+            // already visited, then we can move on to the next one.
+            if !set.insert(id) {
+                break;
+            }
+            match *nfa.state(id) {
+                thompson::State::Range { .. }
+                | thompson::State::Sparse { .. }
+                | thompson::State::Fail
+                | thompson::State::Match { .. } => break,
+                thompson::State::Look { look, next } => {
+                    if !look_have.contains(look) {
+                        break;
+                    }
+                    id = next;
+                }
+                thompson::State::Union { ref alternates } => {
+                    id = match alternates.get(0) {
+                        None => break,
+                        Some(&id) => id,
+                    };
+                    // We need to process our alternates in order to preserve
+                    // match preferences, so put the earliest alternates closer
+                    // to the top of the stack.
+                    stack.extend(alternates[1..].iter().rev());
+                }
+                thompson::State::Capture { next, .. } => {
+                    id = next;
+                }
+            }
+        }
+    }
+}
+
+/// Add the NFA state IDs in the given `set` to the given DFA builder state.
+/// The order in which states are added corresponds to the order in which they
+/// were added to `set`.
+///
+/// The DFA builder state given should already have its complete set of match
+/// pattern IDs added (if any) and any look-behind assertions (StartLine,
+/// StartText and whether this state is being generated for a transition over a
+/// word byte when applicable) that are true immediately prior to transitioning
+/// into this state (via `builder.look_have()`). The match pattern IDs should
+/// correspond to matches that occurred on the previous transition, since all
+/// matches are delayed by one byte. The things that should _not_ be set are
+/// look-ahead assertions (EndLine, EndText and whether the next byte is a
+/// word byte or not). The builder state should also not have anything in
+/// `look_need` set, as this routine will compute that for you.
+///
+/// The given NFA should be able to resolve all identifiers in `set` to a
+/// particular NFA state. Additionally, `set` must have capacity equivalent
+/// to `nfa.len()`.
+pub(crate) fn add_nfa_states(
+    nfa: &thompson::NFA,
+    set: &SparseSet,
+    builder: &mut StateBuilderNFA,
+) {
+    for nfa_id in set {
+        match *nfa.state(nfa_id) {
+            thompson::State::Range { .. } => {
+                builder.add_nfa_state_id(nfa_id);
+            }
+            thompson::State::Sparse { .. } => {
+                builder.add_nfa_state_id(nfa_id);
+            }
+            thompson::State::Look { look, .. } => {
+                builder.add_nfa_state_id(nfa_id);
+                builder.look_need().insert(look);
+            }
+            thompson::State::Union { .. }
+            | thompson::State::Capture { .. } => {
+                // Pure epsilon transitions don't need to be tracked
+                // as part of the DFA state. Tracking them is actually
+                // superfluous; they won't cause any harm other than making
+                // determinization slower.
+                //
+                // Why aren't these needed? Well, in an NFA, epsilon
+                // transitions are really just jumping points to other
+                // states. So once you hit an epsilon transition, the same
+                // set of resulting states always appears. Therefore,
+                // putting them in a DFA's set of ordered NFA states is
+                // strictly redundant.
+                //
+                // Look-around states are also epsilon transitions, but
+                // they are *conditional*. So their presence could be
+                // discriminatory, and thus, they are tracked above.
+                //
+                // But wait... why are epsilon states in our `set` in the
+                // first place? Why not just leave them out? They're in
+                // our `set` because it was generated by computing an
+                // epsilon closure, and we want to keep track of all states
+                // we visited to avoid re-visiting them. 
In exchange, we + // have to do this second iteration over our collected + // states to finalize our DFA state. + // + // Note that this optimization requires that we re-compute + // the epsilon closure to account for look-ahead in 'next' + // *only when necessary*. Namely, only when the set of + // look-around assertions changes and only when those + // changes are within the set of assertions that are + // needed in order to step through the closure correctly. + // Otherwise, if we re-do the epsilon closure needlessly, + // it could change based on the fact that we are omitting + // epsilon states here. + } + thompson::State::Fail => { + break; + } + thompson::State::Match { .. } => { + // Normally, the NFA match state doesn't actually need to + // be inside the DFA state. But since we delay matches by + // one byte, the matching DFA state corresponds to states + // that transition from the one we're building here. And + // the way we detect those cases is by looking for an NFA + // match state. See 'next' for how this is handled. + builder.add_nfa_state_id(nfa_id); + } + } + } + // If we know this state contains no look-around assertions, then + // there's no reason to track which look-around assertions were + // satisfied when this state was created. + if builder.look_need().is_empty() { + builder.look_have().clear(); + } +} + +/// Sets the appropriate look-behind assertions on the given state based on +/// this starting configuration. +pub(crate) fn set_lookbehind_from_start( + start: &Start, + builder: &mut StateBuilderMatches, +) { + match *start { + Start::NonWordByte => {} + Start::WordByte => { + builder.set_is_from_word(); + } + Start::Text => { + builder.look_have().insert(Look::StartText); + builder.look_have().insert(Look::StartLine); + } + Start::Line => { + builder.look_have().insert(Look::StartLine); + } + } +} + +#[cfg(test)] +mod tests { + use super::Start; + + #[test] + #[should_panic] + fn start_fwd_bad_range() { + Start::from_position_fwd(&[], 0, 1); + } + + #[test] + #[should_panic] + fn start_rev_bad_range() { + Start::from_position_rev(&[], 0, 1); + } + + #[test] + fn start_fwd() { + let f = Start::from_position_fwd; + + assert_eq!(Start::Text, f(&[], 0, 0)); + assert_eq!(Start::Text, f(b"abc", 0, 3)); + assert_eq!(Start::Text, f(b"\nabc", 0, 3)); + + assert_eq!(Start::Line, f(b"\nabc", 1, 3)); + + assert_eq!(Start::WordByte, f(b"abc", 1, 3)); + + assert_eq!(Start::NonWordByte, f(b" abc", 1, 3)); + } + + #[test] + fn start_rev() { + let f = Start::from_position_rev; + + assert_eq!(Start::Text, f(&[], 0, 0)); + assert_eq!(Start::Text, f(b"abc", 0, 3)); + assert_eq!(Start::Text, f(b"abc\n", 0, 4)); + + assert_eq!(Start::Line, f(b"abc\nz", 0, 3)); + + assert_eq!(Start::WordByte, f(b"abc", 0, 2)); + + assert_eq!(Start::NonWordByte, f(b"abc ", 0, 3)); + } +} diff --git a/vendor/regex-automata/src/util/determinize/state.rs b/vendor/regex-automata/src/util/determinize/state.rs new file mode 100644 index 000000000..567e600d6 --- /dev/null +++ b/vendor/regex-automata/src/util/determinize/state.rs @@ -0,0 +1,873 @@ +/*! +This module defines a DFA state representation and builders for constructing +DFA states. + +This representation is specifically for use in implementations of NFA-to-DFA +conversion via powerset construction. (Also called "determinization" in this +crate.) + +The term "DFA state" is somewhat overloaded in this crate. In some cases, it +refers to the set of transitions over an alphabet for a particular state. In +other cases, it refers to a set of NFA states. 
The former is really about the
+final representation of a state in a DFA's transition table, whereas the
+latter---what this module is focused on---is closer to an intermediate form
+that is used to help eventually build the transition table.
+
+This module exports four types. All four types represent the same idea: an
+ordered set of NFA states. This ordered set represents the epsilon closure of a
+particular NFA state, where the "epsilon closure" is the set of NFA states that
+can be transitioned to without consuming any input, i.e., by following all of
+the NFA state's epsilon transitions. In addition, this implementation of DFA
+states cares about two other things: the ordered set of pattern IDs
+corresponding to the patterns that match if the state is a match state, and the
+set of look-behind assertions that were true when the state was created.
+
+The first, `State`, is a frozen representation of a state that cannot be
+modified. It may be cheaply cloned without copying the state itself and can be
+accessed safely from multiple threads simultaneously. This type is useful for
+when one knows that the DFA state being constructed is distinct from any other
+previously constructed states. Namely, powerset construction, in practice,
+requires one to keep a cache of previously created DFA states. Otherwise,
+the number of DFA states created in memory balloons to an impractically
+large number. For this reason, equivalent states should endeavor to have an
+equivalent byte-level representation. (In general, "equivalency" here means
+"equivalent assertions, pattern IDs and NFA state IDs." We do not require that
+full DFA minimization be implemented here. This form of equivalency is only
+surface deep and is more-or-less a practical necessity.)
+
+The other three types represent different phases in the construction of a
+DFA state. Internally, these three types (and `State`) all use the same
+byte-oriented representation. That means one can use any of the builder types
+to check whether the state it represents already exists or not. If it does,
+then there is no need to freeze it into a `State` (which requires an alloc and
+a copy). Here are the three types described succinctly:
+
+* `StateBuilderEmpty` represents a state with no pattern IDs, no assertions
+and no NFA states. Creating a `StateBuilderEmpty` performs no allocs. A
+`StateBuilderEmpty` can only be used to query its underlying memory capacity,
+or to convert into a builder for recording pattern IDs and/or assertions.
+* `StateBuilderMatches` represents a state with zero or more pattern IDs, zero
+or more satisfied assertions and zero NFA state IDs. A `StateBuilderMatches`
+can only be used for adding pattern IDs and recording assertions.
+* `StateBuilderNFA` represents a state with zero or more pattern IDs, zero or
+more satisfied assertions and zero or more NFA state IDs. A `StateBuilderNFA`
+can only be used for adding NFA state IDs and recording some assertions.
+
+The expected flow here is to use the above builders to construct a candidate
+DFA state to check if it already exists. If it does, then there's no need to
+freeze it into a `State`. If it doesn't exist, then `StateBuilderNFA::to_state`
+can be called to freeze the builder into an immutable `State`. In either
+case, `clear` should be called on the builder to turn it back into a
+`StateBuilderEmpty` that reuses the underlying memory. 
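+
+As a rough sketch of that expected flow (illustrative only; `cache` stands in
+for whatever map of previously built states the determinizer maintains, keyed
+by the byte representation):
+
+```text
+let builder = StateBuilderEmpty::new().into_matches();
+// ... record look-behind assertions and match pattern IDs ...
+let builder = builder.into_nfa();
+// ... add NFA state IDs from the epsilon closure ...
+let state = match cache.get(builder.as_bytes()) {
+    Some(existing) => existing.clone(), // reuse; no alloc or copy needed
+    None => builder.to_state(),         // freeze into an immutable State
+};
+let empty = builder.clear();            // recover the allocation for reuse
+```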
+
+The main purpose for splitting the builder into these distinct types is to
+make it impossible to do things like adding a pattern ID after adding an NFA
+state ID. Namely, this makes it simpler to use a space-and-time efficient
+binary representation for the state. (The format is documented on the `Repr`
+type below.) If we just used one type for everything, it would be possible for
+callers to use an incorrect interleaving of calls and thus result in a corrupt
+representation. I chose to use more type machinery to make this impossible to
+do because 1) determinization is itself pretty complex and it wouldn't be too
+hard to foul this up and 2) there isn't too much machinery involved and it's
+well contained.
+
+As an optimization, sometimes states won't have certain things set. For
+example, if the underlying NFA has no word boundary assertions, then there is
+no reason to set a state's look-behind assertion as to whether it was generated
+from a word byte or not. Similarly, if a state has no NFA states corresponding
+to look-around assertions, then there is no reason to set `look_have` to a
+non-empty set. Finally, callers usually omit unconditional epsilon transitions
+when adding NFA state IDs since they aren't discriminatory.
+
+Finally, the binary representation used by these states is, thankfully, not
+serialized anywhere. So any kind of change can be made with reckless abandon,
+as long as everything in this module agrees.
+*/
+
+use core::{convert::TryFrom, mem};
+
+use alloc::{sync::Arc, vec::Vec};
+
+use crate::{
+    nfa::thompson::LookSet,
+    util::{
+        bytes::{self, Endian},
+        id::{PatternID, StateID},
+    },
+};
+
+/// A DFA state that, at its core, is represented by an ordered set of NFA
+/// states.
+///
+/// This type is intended to be used only in NFA-to-DFA conversion via powerset
+/// construction.
+///
+/// It may be cheaply cloned and accessed safely from multiple threads
+/// simultaneously.
+#[derive(Clone, Eq, Hash, PartialEq, PartialOrd, Ord)]
+pub(crate) struct State(Arc<[u8]>);
+
+/// This Borrow impl permits us to look up any state in a map by its byte
+/// representation. This is particularly convenient when one has a StateBuilder
+/// and we want to see if a correspondingly equivalent state already exists. If
+/// one does exist, then we can reuse the allocation required by StateBuilder
+/// without having to convert it into a State first.
+impl core::borrow::Borrow<[u8]> for State {
+    fn borrow(&self) -> &[u8] {
+        &*self.0
+    }
+}
+
+impl core::fmt::Debug for State {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        f.debug_tuple("State").field(&self.repr()).finish()
+    }
+}
+
+/// For docs on these routines, see the internal Repr and ReprVec types below. 
+impl State { + pub(crate) fn dead() -> State { + StateBuilderEmpty::new().into_matches().into_nfa().to_state() + } + + pub(crate) fn is_match(&self) -> bool { + self.repr().is_match() + } + + pub(crate) fn is_from_word(&self) -> bool { + self.repr().is_from_word() + } + + pub(crate) fn look_have(&self) -> LookSet { + self.repr().look_have() + } + + pub(crate) fn look_need(&self) -> LookSet { + self.repr().look_need() + } + + pub(crate) fn match_count(&self) -> usize { + self.repr().match_count() + } + + pub(crate) fn match_pattern(&self, index: usize) -> PatternID { + self.repr().match_pattern(index) + } + + pub(crate) fn match_pattern_ids(&self) -> Option<Vec<PatternID>> { + self.repr().match_pattern_ids() + } + + pub(crate) fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, f: F) { + self.repr().iter_match_pattern_ids(f) + } + + pub(crate) fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, f: F) { + self.repr().iter_nfa_state_ids(f) + } + + pub(crate) fn memory_usage(&self) -> usize { + self.0.len() + } + + fn repr(&self) -> Repr<'_> { + Repr(&*self.0) + } +} + +/// A state builder that represents an empty state. +/// +/// This is a useful "initial condition" for state construction. It has no +/// NFA state IDs, no assertions set and no pattern IDs. No allocations are +/// made when new() is called. Its main use is for being converted into a +/// builder that can capture assertions and pattern IDs. +#[derive(Clone, Debug)] +pub(crate) struct StateBuilderEmpty(Vec<u8>); + +/// For docs on these routines, see the internal Repr and ReprVec types below. +impl StateBuilderEmpty { + pub(crate) fn new() -> StateBuilderEmpty { + StateBuilderEmpty(alloc::vec![]) + } + + pub(crate) fn into_matches(mut self) -> StateBuilderMatches { + self.0.extend_from_slice(&[0, 0, 0]); + StateBuilderMatches(self.0) + } + + fn clear(&mut self) { + self.0.clear(); + } + + pub(crate) fn capacity(&self) -> usize { + self.0.capacity() + } +} + +/// A state builder that collects assertions and pattern IDs. +/// +/// When collecting pattern IDs is finished, this can be converted into a +/// builder that collects NFA state IDs. +#[derive(Clone)] +pub(crate) struct StateBuilderMatches(Vec<u8>); + +impl core::fmt::Debug for StateBuilderMatches { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple("StateBuilderMatches").field(&self.repr()).finish() + } +} + +/// For docs on these routines, see the internal Repr and ReprVec types below. 
+impl StateBuilderMatches {
+    pub(crate) fn into_nfa(mut self) -> StateBuilderNFA {
+        self.repr_vec().close_match_pattern_ids();
+        StateBuilderNFA { repr: self.0, prev_nfa_state_id: StateID::ZERO }
+    }
+
+    pub(crate) fn clear(self) -> StateBuilderEmpty {
+        let mut builder = StateBuilderEmpty(self.0);
+        builder.clear();
+        builder
+    }
+
+    pub(crate) fn is_match(&self) -> bool {
+        self.repr().is_match()
+    }
+
+    pub(crate) fn is_from_word(&self) -> bool {
+        self.repr().is_from_word()
+    }
+
+    pub(crate) fn set_is_from_word(&mut self) {
+        self.repr_vec().set_is_from_word()
+    }
+
+    pub(crate) fn look_have(&mut self) -> &mut LookSet {
+        LookSet::from_repr_mut(&mut self.0[1])
+    }
+
+    pub(crate) fn look_need(&mut self) -> &mut LookSet {
+        LookSet::from_repr_mut(&mut self.0[2])
+    }
+
+    pub(crate) fn add_match_pattern_id(&mut self, pid: PatternID) {
+        self.repr_vec().add_match_pattern_id(pid)
+    }
+
+    fn repr(&self) -> Repr<'_> {
+        Repr(&self.0)
+    }
+
+    fn repr_vec(&mut self) -> ReprVec<'_> {
+        ReprVec(&mut self.0)
+    }
+}
+
+/// A state builder that collects some assertions and NFA state IDs.
+///
+/// When collecting NFA state IDs is finished, this can be used to build a
+/// `State` if necessary.
+///
+/// When done building a state (regardless of whether it got kept or not),
+/// it's usually a good idea to call `clear` to get an empty builder back so
+/// that it can be reused to build the next state.
+#[derive(Clone)]
+pub(crate) struct StateBuilderNFA {
+    repr: Vec<u8>,
+    prev_nfa_state_id: StateID,
+}
+
+impl core::fmt::Debug for StateBuilderNFA {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        f.debug_tuple("StateBuilderNFA").field(&self.repr()).finish()
+    }
+}
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl StateBuilderNFA {
+    pub(crate) fn to_state(&self) -> State {
+        State(Arc::from(&*self.repr))
+    }
+
+    pub(crate) fn clear(self) -> StateBuilderEmpty {
+        let mut builder = StateBuilderEmpty(self.repr);
+        builder.clear();
+        builder
+    }
+
+    pub(crate) fn is_match(&self) -> bool {
+        self.repr().is_match()
+    }
+
+    pub(crate) fn is_from_word(&self) -> bool {
+        self.repr().is_from_word()
+    }
+
+    pub(crate) fn look_have(&mut self) -> &mut LookSet {
+        LookSet::from_repr_mut(&mut self.repr[1])
+    }
+
+    pub(crate) fn look_need(&mut self) -> &mut LookSet {
+        LookSet::from_repr_mut(&mut self.repr[2])
+    }
+
+    pub(crate) fn add_nfa_state_id(&mut self, sid: StateID) {
+        ReprVec(&mut self.repr)
+            .add_nfa_state_id(&mut self.prev_nfa_state_id, sid)
+    }
+
+    pub(crate) fn memory_usage(&self) -> usize {
+        self.repr.len()
+    }
+
+    pub(crate) fn as_bytes(&self) -> &[u8] {
+        &self.repr
+    }
+
+    fn repr(&self) -> Repr<'_> {
+        Repr(&self.repr)
+    }
+
+    fn repr_vec(&mut self) -> ReprVec<'_> {
+        ReprVec(&mut self.repr)
+    }
+}
+
+/// Repr is a read-only view into the representation of a DFA state.
+///
+/// Primarily, a Repr is how we achieve DRY: we implement decoding the format
+/// in one place, and then use a Repr to implement the various methods on the
+/// public state types.
+///
+/// The format is as follows:
+///
+/// The first three bytes correspond to bitsets.
+///
+/// Byte 0 is a bitset corresponding to miscellaneous flags associated with the
+/// state. Bit 0 is set to 1 if the state is a match state. Bit 1 is set to 1
+/// if the state has pattern IDs explicitly written to it. (This is a flag that
+/// is not meant to be set by determinization, but rather, is used as part of
+/// an internal space-saving optimization.) 
Bit 2 is set to 1 if the state was
+/// generated by a transition over a "word" byte. (Callers may not always set
+/// this. For example, if the NFA has no word boundary assertion, then needing
+/// to track whether a state came from a word byte or not is superfluous and
+/// wasteful.)
+///
+/// Byte 1 corresponds to the look-behind assertions that were satisfied by
+/// the transition that created this state. This generally only includes the
+/// StartLine and StartText assertions. (Look-ahead assertions are not tracked
+/// as part of states. Instead, these are applied by re-computing the epsilon
+/// closure of a state when computing the transition function. See `next` in
+/// the parent module.)
+///
+/// Byte 2 corresponds to the set of look-around assertions (including both
+/// look-behind and look-ahead) that appear somewhere in this state's set of
+/// NFA state IDs. This is used to determine whether this state's epsilon
+/// closure should be re-computed when computing the transition function.
+/// Namely, look-around assertions are "just" conditional epsilon transitions,
+/// so if there are new assertions available when computing the transition
+/// function, we should only re-compute the epsilon closure if those new
+/// assertions are relevant to this particular state.
+///
+/// Bytes 3..7 correspond to a 32-bit native-endian encoded integer
+/// corresponding to the number of patterns encoded in this state. If the state
+/// is not a match state (byte 0 bit 0 is 0) or if its only pattern ID is
+/// PatternID::ZERO, then no integer is encoded at this position. Instead, byte
+/// offset 3 is the position at which the first NFA state ID is encoded.
+///
+/// For a match state with at least one non-ZERO pattern ID, the next bytes
+/// correspond to a sequence of 32-bit native endian encoded integers that
+/// represent each pattern ID, in order, that this match state represents.
+///
+/// After the pattern IDs (if any), NFA state IDs are delta encoded as
+/// varints.[1] The first NFA state ID is encoded as itself, and each
+/// subsequent NFA state ID is encoded as the difference between itself and the
+/// previous NFA state ID.
+///
+/// [1] - https://developers.google.com/protocol-buffers/docs/encoding#varints
+struct Repr<'a>(&'a [u8]);
+
+impl<'a> Repr<'a> {
+    /// Returns true if and only if this is a match state.
+    ///
+    /// If callers have added pattern IDs to this state, then callers MUST set
+    /// this state as a match state explicitly. However, as a special case, a
+    /// state that is marked as a match state but has no pattern IDs is
+    /// treated as if it had a single pattern ID equivalent to PatternID::ZERO.
+    fn is_match(&self) -> bool {
+        self.0[0] & (1 << 0) > 0
+    }
+
+    /// Returns true if and only if this state has had at least one pattern
+    /// ID added to it.
+    ///
+    /// This is an internal-only flag that permits the representation to save
+    /// space in the common case of an NFA with one pattern in it. In that
+    /// case, a match state can only ever have exactly one pattern ID:
+    /// PatternID::ZERO. So there's no need to represent it.
+    fn has_pattern_ids(&self) -> bool {
+        self.0[0] & (1 << 1) > 0
+    }
+
+    /// Returns true if and only if this state is marked as having been created
+    /// from a transition over a word byte. 
This is useful for checking whether + /// a word boundary assertion is true or not, which requires look-behind + /// (whether the current state came from a word byte or not) and look-ahead + /// (whether the transition byte is a word byte or not). + /// + /// Since states with this set are distinct from states that don't have + /// this set (even if they are otherwise equivalent), callers should not + /// set this assertion unless the underlying NFA has at least one word + /// boundary assertion somewhere. Otherwise, a superfluous number of states + /// may be created. + fn is_from_word(&self) -> bool { + self.0[0] & (1 << 2) > 0 + } + + /// The set of look-behind assertions that were true in the transition that + /// created this state. + /// + /// Generally, this should be empty if 'look_need' is empty, since there is + /// no reason to track which look-behind assertions are true if the state + /// has no conditional epsilon transitions. + /// + /// Satisfied look-ahead assertions are not tracked in states. Instead, + /// these are re-computed on demand via epsilon closure when computing the + /// transition function. + fn look_have(&self) -> LookSet { + LookSet::from_repr(self.0[1]) + } + + /// The set of look-around (both behind and ahead) assertions that appear + /// at least once in this state's set of NFA states. + /// + /// This is used to determine whether the epsilon closure needs to be + /// re-computed when computing the transition function. Namely, if the + /// state has no conditional epsilon transitions, then there is no need + /// to re-compute the epsilon closure. + fn look_need(&self) -> LookSet { + LookSet::from_repr(self.0[2]) + } + + /// Returns the total number of match pattern IDs in this state. + /// + /// If this state is not a match state, then this always returns 0. + fn match_count(&self) -> usize { + if !self.is_match() { + return 0; + } else if !self.has_pattern_ids() { + 1 + } else { + self.encoded_pattern_count() + } + } + + /// Returns the pattern ID for this match state at the given index. + /// + /// If the given index is greater than or equal to `match_count()` for this + /// state, then this could panic or return incorrect results. + fn match_pattern(&self, index: usize) -> PatternID { + if !self.has_pattern_ids() { + PatternID::ZERO + } else { + let offset = 7 + index * PatternID::SIZE; + // This is OK since we only ever serialize valid PatternIDs to + // states. + bytes::read_pattern_id_unchecked(&self.0[offset..]).0 + } + } + + /// Returns a copy of all match pattern IDs in this state. If this state + /// is not a match state, then this returns None. + fn match_pattern_ids(&self) -> Option<Vec<PatternID>> { + if !self.is_match() { + return None; + } + let mut pids = alloc::vec![]; + self.iter_match_pattern_ids(|pid| pids.push(pid)); + Some(pids) + } + + /// Calls the given function on every pattern ID in this state. + fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, mut f: F) { + if !self.is_match() { + return; + } + // As an optimization for a very common case, when this is a match + // state for an NFA with only one pattern, we don't actually write the + // pattern ID to the state representation. Instead, we know it must + // be there since it is the only possible choice. 
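+        //
+        // Concretely (an illustrative example of the representation): such a
+        // state consists of just the three header bytes with the is_match
+        // bit set, e.g., [0b0000_0001, look_have, look_need], followed
+        // immediately by the encoded NFA state IDs and no pattern IDs.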
+ if !self.has_pattern_ids() { + f(PatternID::ZERO); + return; + } + let mut pids = &self.0[7..self.pattern_offset_end()]; + while !pids.is_empty() { + let pid = bytes::read_u32(pids); + pids = &pids[PatternID::SIZE..]; + // This is OK since we only ever serialize valid PatternIDs to + // states. And since pattern IDs can never exceed a usize, the + // unwrap is OK. + f(PatternID::new_unchecked(usize::try_from(pid).unwrap())); + } + } + + /// Calls the given function on every NFA state ID in this state. + fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, mut f: F) { + let mut sids = &self.0[self.pattern_offset_end()..]; + let mut prev = 0i32; + while !sids.is_empty() { + let (delta, nr) = read_vari32(sids); + sids = &sids[nr..]; + let sid = prev + delta; + prev = sid; + // This is OK since we only ever serialize valid StateIDs to + // states. And since state IDs can never exceed an isize, they must + // always be able to fit into a usize, and thus cast is OK. + f(StateID::new_unchecked(sid as usize)) + } + } + + /// Returns the offset into this state's representation where the pattern + /// IDs end and the NFA state IDs begin. + fn pattern_offset_end(&self) -> usize { + let encoded = self.encoded_pattern_count(); + if encoded == 0 { + return 3; + } + // This arithmetic is OK since we were able to address this many bytes + // when writing to the state, thus, it must fit into a usize. + encoded.checked_mul(4).unwrap().checked_add(7).unwrap() + } + + /// Returns the total number of *encoded* pattern IDs in this state. + /// + /// This may return 0 even when this is a match state, since the pattern + /// ID `PatternID::ZERO` is not encoded when it's the only pattern ID in + /// the match state (the overwhelming common case). + fn encoded_pattern_count(&self) -> usize { + if !self.has_pattern_ids() { + return 0; + } + // This unwrap is OK since the total number of patterns is always + // guaranteed to fit into a usize. + usize::try_from(bytes::read_u32(&self.0[3..7])).unwrap() + } +} + +impl<'a> core::fmt::Debug for Repr<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut nfa_ids = alloc::vec![]; + self.iter_nfa_state_ids(|sid| nfa_ids.push(sid)); + f.debug_struct("Repr") + .field("is_match", &self.is_match()) + .field("is_from_word", &self.is_from_word()) + .field("look_have", &self.look_have()) + .field("look_need", &self.look_need()) + .field("match_pattern_ids", &self.match_pattern_ids()) + .field("nfa_state_ids", &nfa_ids) + .finish() + } +} + +/// ReprVec is a write-only view into the representation of a DFA state. +/// +/// See Repr for more details on the purpose of this type and also the format. +/// +/// Note that not all possible combinations of methods may be called. This is +/// precisely what the various StateBuilder types encapsulate: they only +/// permit valid combinations via Rust's linear typing. +struct ReprVec<'a>(&'a mut Vec<u8>); + +impl<'a> ReprVec<'a> { + /// Set this state as a match state. + /// + /// This should not be exposed explicitly outside of this module. It is + /// set automatically when a pattern ID is added. + fn set_is_match(&mut self) { + self.0[0] |= 1 << 0; + } + + /// Set that this state has pattern IDs explicitly written to it. + /// + /// This should not be exposed explicitly outside of this module. This is + /// used internally as a space saving optimization. Namely, if the state + /// is a match state but does not have any pattern IDs written to it, + /// then it is automatically inferred to have a pattern ID of ZERO. 
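+    ///
+    /// (To illustrate: a match state whose only pattern is PatternID::ZERO
+    /// sets just the is_match bit and leaves this bit unset; any other set
+    /// of pattern IDs sets both bits and writes the IDs out explicitly.)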
+ fn set_has_pattern_ids(&mut self) { + self.0[0] |= 1 << 1; + } + + /// Set this state as being built from a transition over a word byte. + /// + /// Setting this is only necessary when one needs to deal with word + /// boundary assertions. Therefore, if the underlying NFA has no word + /// boundary assertions, callers should not set this. + fn set_is_from_word(&mut self) { + self.0[0] |= 1 << 2; + } + + /// Return a mutable reference to the 'look_have' assertion set. + fn look_have_mut(&mut self) -> &mut LookSet { + LookSet::from_repr_mut(&mut self.0[1]) + } + + /// Return a mutable reference to the 'look_need' assertion set. + fn look_need_mut(&mut self) -> &mut LookSet { + LookSet::from_repr_mut(&mut self.0[2]) + } + + /// Add a pattern ID to this state. All match states must have at least + /// one pattern ID associated with it. + /// + /// Callers must never add duplicative pattern IDs. + /// + /// The order in which patterns are added must correspond to the order + /// in which patterns are reported as matches. + fn add_match_pattern_id(&mut self, pid: PatternID) { + // As a (somewhat small) space saving optimization, in the case where + // a matching state has exactly one pattern ID, PatternID::ZERO, we do + // not write either the pattern ID or the number of patterns encoded. + // Instead, all we do is set the 'is_match' bit on this state. Overall, + // this saves 8 bytes per match state for the overwhelming majority of + // match states. + // + // In order to know whether pattern IDs need to be explicitly read or + // not, we use another internal-only bit, 'has_pattern_ids', to + // indicate whether they have been explicitly written or not. + if !self.repr().has_pattern_ids() { + if pid == PatternID::ZERO { + self.set_is_match(); + return; + } + // Make room for 'close_match_pattern_ids' to write the total + // number of pattern IDs written. + self.0.extend(core::iter::repeat(0).take(PatternID::SIZE)); + self.set_has_pattern_ids(); + // If this was already a match state, then the only way that's + // possible when the state doesn't have pattern IDs is if + // PatternID::ZERO was added by the caller previously. In this + // case, we are now adding a non-ZERO pattern ID after it, in + // which case, we want to make sure to represent ZERO explicitly + // now. + if self.repr().is_match() { + write_u32(self.0, 0) + } else { + // Otherwise, just make sure the 'is_match' bit is set. + self.set_is_match(); + } + } + write_u32(self.0, pid.as_u32()); + } + + /// Indicate that no more pattern IDs will be added to this state. + /// + /// Once this is called, callers must not call it or 'add_match_pattern_id' + /// again. + /// + /// This should not be exposed explicitly outside of this module. It + /// should be called only when converting a StateBuilderMatches into a + /// StateBuilderNFA. + fn close_match_pattern_ids(&mut self) { + // If we never wrote any pattern IDs, then there's nothing to do here. + if !self.repr().has_pattern_ids() { + return; + } + let patsize = PatternID::SIZE; + let pattern_bytes = self.0.len() - 7; + // Every pattern ID uses 4 bytes, so number of bytes should be + // divisible by 4. + assert_eq!(pattern_bytes % patsize, 0); + // This unwrap is OK since we are guaranteed that the maximum number + // of possible patterns fits into a u32. + let count32 = u32::try_from(pattern_bytes / patsize).unwrap(); + bytes::NE::write_u32(count32, &mut self.0[3..7]); + } + + /// Add an NFA state ID to this state. The order in which NFA states are + /// added matters. 
It is the caller's responsibility to ensure that
+    /// duplicate NFA state IDs are not added.
+    fn add_nfa_state_id(&mut self, prev: &mut StateID, sid: StateID) {
+        let delta = sid.as_i32() - prev.as_i32();
+        write_vari32(self.0, delta);
+        *prev = sid;
+    }
+
+    /// Return a read-only view of this state's representation.
+    fn repr(&self) -> Repr<'_> {
+        Repr(self.0.as_slice())
+    }
+}
+
+/// Write a signed 32-bit integer using zig-zag encoding. (For example, 0
+/// encodes as 0, -1 as 1, 1 as 2 and -2 as 3.)
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn write_vari32(data: &mut Vec<u8>, n: i32) {
+    let mut un = (n as u32) << 1;
+    if n < 0 {
+        un = !un;
+    }
+    write_varu32(data, un)
+}
+
+/// Read a signed 32-bit integer using zig-zag encoding. Also, return the
+/// number of bytes read.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_vari32(data: &[u8]) -> (i32, usize) {
+    let (un, i) = read_varu32(data);
+    let mut n = (un >> 1) as i32;
+    if un & 1 != 0 {
+        n = !n;
+    }
+    (n, i)
+}
+
+/// Write an unsigned 32-bit integer as a varint. In essence, `n` is written
+/// as a sequence of bytes where all bytes except for the last one have the
+/// most significant bit set. The least significant 7 bits correspond to the
+/// actual bits of `n`. So in the worst case, a varint uses 5 bytes, but in
+/// very common cases, it uses fewer than 4.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
+    while n >= 0b1000_0000 {
+        data.push((n as u8) | 0b1000_0000);
+        n >>= 7;
+    }
+    data.push(n as u8);
+}
+
+/// Read an unsigned 32-bit varint. Also, return the number of bytes read.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_varu32(data: &[u8]) -> (u32, usize) {
+    // N.B. We can assume correctness here since we know that all varuints are
+    // written with write_varu32. Hence, the 'as' uses and unchecked arithmetic
+    // is all okay.
+    let mut n: u32 = 0;
+    let mut shift: u32 = 0;
+    for (i, &b) in data.iter().enumerate() {
+        if b < 0b1000_0000 {
+            return (n | ((b as u32) << shift), i + 1);
+        }
+        n |= ((b as u32) & 0b0111_1111) << shift;
+        shift += 7;
+    }
+    (0, 0)
+}
+
+/// Push a native-endian encoded `n` on to `dst`.
+fn write_u32(dst: &mut Vec<u8>, n: u32) {
+    use crate::util::bytes::{Endian, NE};
+
+    let start = dst.len();
+    dst.extend(core::iter::repeat(0).take(mem::size_of::<u32>()));
+    NE::write_u32(n, &mut dst[start..]);
+}
+
+#[cfg(test)]
+mod tests {
+    use alloc::vec;
+
+    use quickcheck::quickcheck;
+
+    use super::*;
+
+    quickcheck! {
+        fn prop_state_read_write_nfa_state_ids(sids: Vec<StateID>) -> bool {
+            // Builder states do not permit duplicate IDs.
+            let sids = dedup_state_ids(sids);
+
+            let mut b = StateBuilderEmpty::new().into_matches().into_nfa();
+            for &sid in &sids {
+                b.add_nfa_state_id(sid);
+            }
+            let s = b.to_state();
+            let mut got = vec![];
+            s.iter_nfa_state_ids(|sid| got.push(sid));
+            got == sids
+        }
+
+        fn prop_state_read_write_pattern_ids(pids: Vec<PatternID>) -> bool {
+            // Builder states do not permit duplicate IDs.
+            let pids = dedup_pattern_ids(pids);
+
+            let mut b = StateBuilderEmpty::new().into_matches();
+            for &pid in &pids {
+                b.add_match_pattern_id(pid);
+            }
+            let s = b.into_nfa().to_state();
+            let mut got = vec![];
+            s.iter_match_pattern_ids(|pid| got.push(pid));
+            got == pids
+        }
+
+        fn prop_state_read_write_nfa_state_and_pattern_ids(
+            sids: Vec<StateID>,
+            pids: Vec<PatternID>
+        ) -> bool {
+            // Builder states do not permit duplicate IDs. 
+
+            let sids = dedup_state_ids(sids);
+            let pids = dedup_pattern_ids(pids);
+
+            let mut b = StateBuilderEmpty::new().into_matches();
+            for &pid in &pids {
+                b.add_match_pattern_id(pid);
+            }
+
+            let mut b = b.into_nfa();
+            for &sid in &sids {
+                b.add_nfa_state_id(sid);
+            }
+
+            let s = b.to_state();
+            let mut got_pids = vec![];
+            s.iter_match_pattern_ids(|pid| got_pids.push(pid));
+            let mut got_sids = vec![];
+            s.iter_nfa_state_ids(|sid| got_sids.push(sid));
+            got_pids == pids && got_sids == sids
+        }
+
+        fn prop_read_write_varu32(n: u32) -> bool {
+            let mut buf = vec![];
+            write_varu32(&mut buf, n);
+            let (got, nread) = read_varu32(&buf);
+            nread == buf.len() && got == n
+        }
+
+        fn prop_read_write_vari32(n: i32) -> bool {
+            let mut buf = vec![];
+            write_vari32(&mut buf, n);
+            let (got, nread) = read_vari32(&buf);
+            nread == buf.len() && got == n
+        }
+    }
+
+    fn dedup_state_ids(sids: Vec<StateID>) -> Vec<StateID> {
+        let mut set = alloc::collections::BTreeSet::new();
+        let mut deduped = vec![];
+        for sid in sids {
+            if set.contains(&sid) {
+                continue;
+            }
+            set.insert(sid);
+            deduped.push(sid);
+        }
+        deduped
+    }
+
+    fn dedup_pattern_ids(pids: Vec<PatternID>) -> Vec<PatternID> {
+        let mut set = alloc::collections::BTreeSet::new();
+        let mut deduped = vec![];
+        for pid in pids {
+            if set.contains(&pid) {
+                continue;
+            }
+            set.insert(pid);
+            deduped.push(pid);
+        }
+        deduped
+    }
+}
diff --git a/vendor/regex-automata/src/util/id.rs b/vendor/regex-automata/src/util/id.rs
new file mode 100644
index 000000000..70bf0a93b
--- /dev/null
+++ b/vendor/regex-automata/src/util/id.rs
@@ -0,0 +1,608 @@
+/*!
+Type definitions for identifier types.
+
+A [`StateID`] represents the possible set of identifiers used in regex engine
+implementations in this crate. For example, they are used to identify both NFA
+and DFA states.
+
+A [`PatternID`] represents the possible set of identifiers for patterns. All
+regex engine implementations in this crate support searching for multiple
+patterns simultaneously. A `PatternID` is how each pattern is uniquely
+identified for a particular instance of a regex engine. Namely, a pattern is
+assigned an auto-incrementing integer, starting at `0`, based on the order of
+patterns supplied during the construction of the regex engine.
+
+These identifier types represent a way for this crate to make correctness
+guarantees around the possible set of values that a `StateID` or a `PatternID`
+might represent. Similarly, they also provide a way of constraining the size of
+these identifiers to reduce space usage while still guaranteeing that all such
+identifiers are representable by a `usize` for the current target.
+
+Moreover, the identifier types clamp the range of permissible values to a range
+that is typically smaller than its internal representation. (With the maximum
+value being, e.g., `StateID::MAX`.) Users of these types may not rely on this
+clamping for the purpose of memory safety. Users may, however, rely on these
+invariants to avoid panics or other types of logic bugs.
+*/
+
+// Continuing from the above comment about correctness guarantees, an example
+// of a way in which we use the guarantees on these types is delta encoding.
+// Namely, we require that IDs can be at most 2^31 - 2, which means the
+// difference between any two IDs is always representable as an i32.
+
+use core::{
+    convert::{Infallible, TryFrom},
+    mem, ops,
+};
+
+#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
+/// An identifier for a regex pattern. 
+/// +/// The identifier for a pattern corresponds to its relative position among +/// other patterns in a single finite state machine. Namely, when building +/// a multi-pattern regex engine, one must supply a sequence of patterns to +/// match. The position (starting at 0) of each pattern in that sequence +/// represents its identifier. This identifier is in turn used to identify and +/// report matches of that pattern in various APIs. +/// +/// A pattern ID is guaranteed to be representable by a `usize`. Similarly, +/// the number of patterns in any regex engine in this crate is guaranteed to +/// be representable by a `usize`. This applies to regex engines that have +/// been deserialized; a deserialization error will be returned if it contains +/// pattern IDs that violate these requirements in your current environment. +/// +/// For extra convenience in some cases, this type also guarantees that all +/// IDs can fit into an `i32` and an `isize` without overflowing. +/// +/// # Representation +/// +/// This type is always represented internally by a `u32` and is marked as +/// `repr(transparent)`. Thus, this type always has the same representation as +/// a `u32`. +/// +/// # Indexing +/// +/// For convenience, callers may use a `PatternID` to index slices. +/// +/// # Safety +/// +/// While a `PatternID` is meant to guarantee that its value fits into `usize` +/// (while using a possibly smaller representation than `usize` on some +/// targets), callers must not rely on this property for safety. Callers may +/// choose to rely on this property for correctness however. +#[repr(transparent)] +#[derive( + Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, +)] +pub struct PatternID(u32); + +impl PatternID { + /// The maximum pattern ID value, represented as a `usize`. + #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] + pub const MAX: PatternID = + PatternID::new_unchecked(core::i32::MAX as usize - 1); + + /// The maximum pattern ID value, represented as a `usize`. + #[cfg(target_pointer_width = "16")] + pub const MAX: PatternID = PatternID::new_unchecked(core::isize::MAX - 1); + + /// The total number of patterns that are allowed in any single regex + /// engine. + pub const LIMIT: usize = PatternID::MAX.as_usize() + 1; + + /// The zero pattern ID value. + pub const ZERO: PatternID = PatternID::new_unchecked(0); + + /// The number of bytes that a single `PatternID` uses in memory. + pub const SIZE: usize = core::mem::size_of::<PatternID>(); + + /// Create a new pattern ID. + /// + /// If the given identifier exceeds [`PatternID::MAX`], then this returns + /// an error. + #[inline] + pub fn new(id: usize) -> Result<PatternID, PatternIDError> { + PatternID::try_from(id) + } + + /// Create a new pattern ID without checking whether the given value + /// exceeds [`PatternID::MAX`]. + /// + /// While this is unchecked, providing an incorrect value must never + /// sacrifice memory safety, as documented above. + #[inline] + pub const fn new_unchecked(id: usize) -> PatternID { + PatternID(id as u32) + } + + /// Like [`PatternID::new`], but panics if the given ID is not valid. + #[inline] + pub fn must(id: usize) -> PatternID { + PatternID::new(id).unwrap() + } + + /// Return this pattern ID as a `usize`. + #[inline] + pub const fn as_usize(&self) -> usize { + self.0 as usize + } + + /// Return the internal u32 of this pattern ID. 
+
+    #[inline]
+    pub const fn as_u32(&self) -> u32 {
+        self.0
+    }
+
+    /// Return the internal u32 of this pattern ID represented as an i32.
+    ///
+    /// This is guaranteed to never overflow an `i32`.
+    #[inline]
+    pub const fn as_i32(&self) -> i32 {
+        self.0 as i32
+    }
+
+    /// Returns one more than this pattern ID as a usize.
+    ///
+    /// Since a pattern ID has constraints on its maximum value, adding `1` to
+    /// it will always fit in a `usize` (and a `u32`).
+    #[inline]
+    pub fn one_more(&self) -> usize {
+        self.as_usize().checked_add(1).unwrap()
+    }
+
+    /// Decode this pattern ID from the bytes given using the native endian
+    /// byte order for the current target.
+    ///
+    /// If the decoded integer is not representable as a pattern ID for the
+    /// current target, then this returns an error.
+    #[inline]
+    pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<PatternID, PatternIDError> {
+        let id = u32::from_ne_bytes(bytes);
+        if id > PatternID::MAX.as_u32() {
+            return Err(PatternIDError { attempted: id as u64 });
+        }
+        Ok(PatternID::new_unchecked(id as usize))
+    }
+
+    /// Decode this pattern ID from the bytes given using the native endian
+    /// byte order for the current target.
+    ///
+    /// This is analogous to [`PatternID::new_unchecked`] in that it does not
+    /// check whether the decoded integer is representable as a pattern ID.
+    #[inline]
+    pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> PatternID {
+        PatternID::new_unchecked(u32::from_ne_bytes(bytes) as usize)
+    }
+
+    /// Return the underlying pattern ID integer as raw bytes in native endian
+    /// format.
+    #[inline]
+    pub fn to_ne_bytes(&self) -> [u8; 4] {
+        self.0.to_ne_bytes()
+    }
+
+    /// Returns an iterator over all pattern IDs from 0 up to and not including
+    /// the given length.
+    ///
+    /// If the given length exceeds [`PatternID::LIMIT`], then this panics.
+    #[cfg(feature = "alloc")]
+    pub(crate) fn iter(len: usize) -> PatternIDIter {
+        PatternIDIter::new(len)
+    }
+}
+
+/// This error occurs when a pattern ID could not be constructed.
+///
+/// This occurs when given an integer exceeding the maximum pattern ID value.
+///
+/// When the `std` feature is enabled, this implements the `Error` trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct PatternIDError {
+    attempted: u64,
+}
+
+impl PatternIDError {
+    /// Returns the value that failed to construct a pattern ID.
+    pub fn attempted(&self) -> u64 {
+        self.attempted
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for PatternIDError {}
+
+impl core::fmt::Display for PatternIDError {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(
+            f,
+            "failed to create PatternID from {:?}, which exceeds {:?}",
+            self.attempted(),
+            PatternID::MAX,
+        )
+    }
+}
+
+/// An identifier for a state in a regex engine.
+///
+/// A state ID is guaranteed to be representable by a `usize`. Similarly, the
+/// number of states in any regex engine in this crate is guaranteed to be
+/// representable by a `usize`. This applies to regex engines that have been
+/// deserialized; a deserialization error will be returned if it contains state
+/// IDs that violate these requirements in your current environment.
+///
+/// For extra convenience in some cases, this type also guarantees that all
+/// IDs can fit into an `i32` and an `isize` without overflowing.
+///
+/// # Representation
+///
+/// This type is always represented internally by a `u32` and is marked as
+/// `repr(transparent)`. Thus, this type always has the same representation as
+/// a `u32`. 
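+///
+/// For example (a sketch; the module path is assumed from this crate's layout
+/// and error handling uses `unwrap` for brevity):
+///
+/// ```
+/// use regex_automata::util::id::StateID;
+///
+/// let sid = StateID::new(5).unwrap();
+/// assert_eq!(5, sid.as_usize());
+/// // The raw native-endian bytes round-trip back to the same ID.
+/// assert_eq!(sid, StateID::from_ne_bytes(sid.to_ne_bytes()).unwrap());
+/// ```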
+///
+/// # Indexing
+///
+/// For convenience, callers may use a `StateID` to index slices.
+///
+/// # Safety
+///
+/// While a `StateID` is meant to guarantee that its value fits into `usize`
+/// (while using a possibly smaller representation than `usize` on some
+/// targets), callers must not rely on this property for safety. Callers may
+/// choose to rely on this property for correctness however.
+#[repr(transparent)]
+#[derive(
+    Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
+)]
+pub struct StateID(u32);
+
+impl StateID {
+    /// The maximum state ID value.
+    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+    pub const MAX: StateID =
+        StateID::new_unchecked(core::i32::MAX as usize - 1);
+
+    /// The maximum state ID value.
+    #[cfg(target_pointer_width = "16")]
+    pub const MAX: StateID =
+        StateID::new_unchecked(core::isize::MAX as usize - 1);
+
+    /// The total number of states that are allowed in any single regex
+    /// engine, represented as a `usize`.
+    pub const LIMIT: usize = StateID::MAX.as_usize() + 1;
+
+    /// The zero state ID value.
+    pub const ZERO: StateID = StateID::new_unchecked(0);
+
+    /// The number of bytes that a single `StateID` uses in memory.
+    pub const SIZE: usize = core::mem::size_of::<StateID>();
+
+    /// Create a new state ID.
+    ///
+    /// If the given identifier exceeds [`StateID::MAX`], then this returns
+    /// an error.
+    #[inline]
+    pub fn new(id: usize) -> Result<StateID, StateIDError> {
+        StateID::try_from(id)
+    }
+
+    /// Create a new state ID without checking whether the given value
+    /// exceeds [`StateID::MAX`].
+    ///
+    /// While this is unchecked, providing an incorrect value must never
+    /// sacrifice memory safety, as documented above.
+    #[inline]
+    pub const fn new_unchecked(id: usize) -> StateID {
+        StateID(id as u32)
+    }
+
+    /// Like [`StateID::new`], but panics if the given ID is not valid.
+    #[inline]
+    pub fn must(id: usize) -> StateID {
+        StateID::new(id).unwrap()
+    }
+
+    /// Return this state ID as a `usize`.
+    #[inline]
+    pub const fn as_usize(&self) -> usize {
+        self.0 as usize
+    }
+
+    /// Return the internal u32 of this state ID.
+    #[inline]
+    pub const fn as_u32(&self) -> u32 {
+        self.0
+    }
+
+    /// Return the internal u32 of this state ID represented as an i32.
+    ///
+    /// This is guaranteed to never overflow an `i32`.
+    #[inline]
+    pub const fn as_i32(&self) -> i32 {
+        self.0 as i32
+    }
+
+    /// Returns one more than this state ID as a usize.
+    ///
+    /// Since a state ID has constraints on its maximum value, adding `1` to
+    /// it will always fit in a `usize` (and a `u32`).
+    #[inline]
+    pub fn one_more(&self) -> usize {
+        self.as_usize().checked_add(1).unwrap()
+    }
+
+    /// Decode this state ID from the bytes given using the native endian byte
+    /// order for the current target.
+    ///
+    /// If the decoded integer is not representable as a state ID for the
+    /// current target, then this returns an error.
+    #[inline]
+    pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<StateID, StateIDError> {
+        let id = u32::from_ne_bytes(bytes);
+        if id > StateID::MAX.as_u32() {
+            return Err(StateIDError { attempted: id as u64 });
+        }
+        Ok(StateID::new_unchecked(id as usize))
+    }
+
+    /// Decode this state ID from the bytes given using the native endian
+    /// byte order for the current target.
+    ///
+    /// This is analogous to [`StateID::new_unchecked`] in that it does not
+    /// check whether the decoded integer is representable as a state ID.
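+    ///
+    /// An editorial sketch of the round trip with [`StateID::to_ne_bytes`]
+    /// (again assuming the crate-root re-export):
+    ///
+    /// ```
+    /// use regex_automata::StateID;
+    ///
+    /// let sid = StateID::must(1234);
+    /// assert_eq!(sid, StateID::from_ne_bytes_unchecked(sid.to_ne_bytes()));
+    /// ```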
+    #[inline]
+    pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> StateID {
+        StateID::new_unchecked(u32::from_ne_bytes(bytes) as usize)
+    }
+
+    /// Return the underlying state ID integer as raw bytes in native endian
+    /// format.
+    #[inline]
+    pub fn to_ne_bytes(&self) -> [u8; 4] {
+        self.0.to_ne_bytes()
+    }
+
+    /// Returns an iterator over all state IDs from 0 up to and not including
+    /// the given length.
+    ///
+    /// If the given length exceeds [`StateID::LIMIT`], then this panics.
+    #[cfg(feature = "alloc")]
+    pub(crate) fn iter(len: usize) -> StateIDIter {
+        StateIDIter::new(len)
+    }
+}
+
+/// This error occurs when a state ID could not be constructed.
+///
+/// This occurs when given an integer exceeding the maximum state ID value.
+///
+/// When the `std` feature is enabled, this implements the `Error` trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct StateIDError {
+    attempted: u64,
+}
+
+impl StateIDError {
+    /// Returns the value that failed to construct a state ID.
+    pub fn attempted(&self) -> u64 {
+        self.attempted
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for StateIDError {}
+
+impl core::fmt::Display for StateIDError {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(
+            f,
+            "failed to create StateID from {:?}, which exceeds {:?}",
+            self.attempted(),
+            StateID::MAX,
+        )
+    }
+}
+
+/// A macro for defining exactly identical (modulo names) impls for ID types.
+macro_rules! impls {
+    ($ty:ident, $tyerr:ident, $tyiter:ident) => {
+        #[derive(Clone, Debug)]
+        pub(crate) struct $tyiter {
+            rng: ops::Range<usize>,
+        }
+
+        impl $tyiter {
+            #[cfg(feature = "alloc")]
+            fn new(len: usize) -> $tyiter {
+                assert!(
+                    len <= $ty::LIMIT,
+                    "cannot create iterator with IDs when number of \
+                     elements exceeds {:?}",
+                    $ty::LIMIT,
+                );
+                $tyiter { rng: 0..len }
+            }
+        }
+
+        impl Iterator for $tyiter {
+            type Item = $ty;
+
+            fn next(&mut self) -> Option<$ty> {
+                if self.rng.start >= self.rng.end {
+                    return None;
+                }
+                let next_id = self.rng.start + 1;
+                let id = mem::replace(&mut self.rng.start, next_id);
+                // new_unchecked is OK since we asserted that the number of
+                // elements in this iterator will fit in an ID at construction.
+ Some($ty::new_unchecked(id)) + } + } + + impl<T> core::ops::Index<$ty> for [T] { + type Output = T; + + #[inline] + fn index(&self, index: $ty) -> &T { + &self[index.as_usize()] + } + } + + impl<T> core::ops::IndexMut<$ty> for [T] { + #[inline] + fn index_mut(&mut self, index: $ty) -> &mut T { + &mut self[index.as_usize()] + } + } + + #[cfg(feature = "alloc")] + impl<T> core::ops::Index<$ty> for Vec<T> { + type Output = T; + + #[inline] + fn index(&self, index: $ty) -> &T { + &self[index.as_usize()] + } + } + + #[cfg(feature = "alloc")] + impl<T> core::ops::IndexMut<$ty> for Vec<T> { + #[inline] + fn index_mut(&mut self, index: $ty) -> &mut T { + &mut self[index.as_usize()] + } + } + + impl TryFrom<usize> for $ty { + type Error = $tyerr; + + fn try_from(id: usize) -> Result<$ty, $tyerr> { + if id > $ty::MAX.as_usize() { + return Err($tyerr { attempted: id as u64 }); + } + Ok($ty::new_unchecked(id)) + } + } + + impl TryFrom<u8> for $ty { + type Error = Infallible; + + fn try_from(id: u8) -> Result<$ty, Infallible> { + Ok($ty::new_unchecked(id as usize)) + } + } + + impl TryFrom<u16> for $ty { + type Error = $tyerr; + + fn try_from(id: u16) -> Result<$ty, $tyerr> { + if id as u32 > $ty::MAX.as_u32() { + return Err($tyerr { attempted: id as u64 }); + } + Ok($ty::new_unchecked(id as usize)) + } + } + + impl TryFrom<u32> for $ty { + type Error = $tyerr; + + fn try_from(id: u32) -> Result<$ty, $tyerr> { + if id > $ty::MAX.as_u32() { + return Err($tyerr { attempted: id as u64 }); + } + Ok($ty::new_unchecked(id as usize)) + } + } + + impl TryFrom<u64> for $ty { + type Error = $tyerr; + + fn try_from(id: u64) -> Result<$ty, $tyerr> { + if id > $ty::MAX.as_u32() as u64 { + return Err($tyerr { attempted: id }); + } + Ok($ty::new_unchecked(id as usize)) + } + } + + #[cfg(test)] + impl quickcheck::Arbitrary for $ty { + fn arbitrary(gen: &mut quickcheck::Gen) -> $ty { + use core::cmp::max; + + let id = max(i32::MIN + 1, i32::arbitrary(gen)).abs(); + if id > $ty::MAX.as_i32() { + $ty::MAX + } else { + $ty::new(usize::try_from(id).unwrap()).unwrap() + } + } + } + }; +} + +impls!(PatternID, PatternIDError, PatternIDIter); +impls!(StateID, StateIDError, StateIDIter); + +/// A utility trait that defines a couple of adapters for making it convenient +/// to access indices as ID types. We require ExactSizeIterator so that +/// iterator construction can do a single check to make sure the index of each +/// element is representable by its ID type. +#[cfg(feature = "alloc")] +pub(crate) trait IteratorIDExt: Iterator { + fn with_pattern_ids(self) -> WithPatternIDIter<Self> + where + Self: Sized + ExactSizeIterator, + { + WithPatternIDIter::new(self) + } + + fn with_state_ids(self) -> WithStateIDIter<Self> + where + Self: Sized + ExactSizeIterator, + { + WithStateIDIter::new(self) + } +} + +#[cfg(feature = "alloc")] +impl<I: Iterator> IteratorIDExt for I {} + +#[cfg(feature = "alloc")] +macro_rules! iditer { + ($ty:ident, $iterty:ident, $withiterty:ident) => { + /// An iterator adapter that is like std::iter::Enumerate, but attaches + /// IDs. It requires ExactSizeIterator. At construction, it ensures + /// that the index of each element in the iterator is representable in + /// the corresponding ID type. 
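+        ///
+        /// An editorial sketch of the intended crate-internal use (hence
+        /// `ignore`):
+        ///
+        /// ```ignore
+        /// let pats = vec!["foo", "bar"];
+        /// for (pid, pat) in pats.iter().with_pattern_ids() {
+        ///     // `pid` is the PatternID for `pat`'s position: 0, then 1.
+        /// }
+        /// ```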
+        #[derive(Clone, Debug)]
+        pub(crate) struct $withiterty<I> {
+            it: I,
+            ids: $iterty,
+        }
+
+        impl<I: Iterator + ExactSizeIterator> $withiterty<I> {
+            fn new(it: I) -> $withiterty<I> {
+                let ids = $ty::iter(it.len());
+                $withiterty { it, ids }
+            }
+        }
+
+        impl<I: Iterator + ExactSizeIterator> Iterator for $withiterty<I> {
+            type Item = ($ty, I::Item);
+
+            fn next(&mut self) -> Option<($ty, I::Item)> {
+                let item = self.it.next()?;
+                // The number of elements in this iterator must match, per the
+                // contract of ExactSizeIterator.
+                let id = self.ids.next().unwrap();
+                Some((id, item))
+            }
+        }
+    };
+}
+
+#[cfg(feature = "alloc")]
+iditer!(PatternID, PatternIDIter, WithPatternIDIter);
+#[cfg(feature = "alloc")]
+iditer!(StateID, StateIDIter, WithStateIDIter);
diff --git a/vendor/regex-automata/src/util/lazy.rs b/vendor/regex-automata/src/util/lazy.rs
new file mode 100644
index 000000000..d8cac6ef4
--- /dev/null
+++ b/vendor/regex-automata/src/util/lazy.rs
@@ -0,0 +1,31 @@
+use core::{
+    ptr,
+    sync::atomic::{AtomicPtr, Ordering},
+};
+
+use alloc::boxed::Box;
+
+/// Lazily initialize the value behind `location` exactly once and return a
+/// `'static` reference to it. If multiple threads race to initialize it, all
+/// but one of the freshly built values are dropped.
+#[inline(always)]
+pub(crate) fn get_or_init<T: Send + Sync + 'static>(
+    location: &'static AtomicPtr<T>,
+    init: impl FnOnce() -> T,
+) -> &'static T {
+    let mut ptr = location.load(Ordering::Acquire);
+    if ptr.is_null() {
+        // Not yet initialized: build a value and try to publish it.
+        let new_value = Box::new(init());
+        ptr = Box::into_raw(new_value);
+        let result = location.compare_exchange(
+            ptr::null_mut(),
+            ptr,
+            Ordering::AcqRel,
+            Ordering::Acquire,
+        );
+        if let Err(old) = result {
+            // Another thread won the race. Reclaim and drop our copy, and
+            // use the pointer that was actually published instead.
+            let redundant = unsafe { Box::from_raw(ptr) };
+            drop(redundant);
+            ptr = old;
+        }
+    }
+    unsafe { &*ptr }
+}
diff --git a/vendor/regex-automata/src/util/matchtypes.rs b/vendor/regex-automata/src/util/matchtypes.rs
new file mode 100644
index 000000000..de0fa65bf
--- /dev/null
+++ b/vendor/regex-automata/src/util/matchtypes.rs
@@ -0,0 +1,356 @@
+use crate::util::id::PatternID;
+
+/// The kind of match semantics to use for a DFA.
+///
+/// The default match kind is `LeftmostFirst`.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum MatchKind {
+    /// Report all possible matches.
+    All,
+    /// Report only the leftmost matches. When multiple leftmost matches exist,
+    /// report the match corresponding to the part of the regex that appears
+    /// first in the syntax.
+    LeftmostFirst,
+    /// Hints that destructuring should not be exhaustive.
+    ///
+    /// This enum may grow additional variants, so this makes sure clients
+    /// don't count on exhaustive matching. (Otherwise, adding a new variant
+    /// could break existing code.)
+    #[doc(hidden)]
+    __Nonexhaustive,
+    // There is prior art in RE2 that shows that we should be able to add
+    // LeftmostLongest too. The tricky part of it is supporting ungreedy
+    // repetitions. Instead of treating all NFA states as having equivalent
+    // priority (as in 'All') or treating all NFA states as having distinct
+    // priority based on order (as in 'LeftmostFirst'), we instead group NFA
+    // states into sets, and treat members of each set as having equivalent
+    // priority, but having greater priority than all following members
+    // of different sets.
+    //
+    // However, it's not clear whether it's really worth adding this. After
+    // all, leftmost-longest can be emulated when using literals by using
+    // leftmost-first and sorting the literals by length in descending order.
+    // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will
+    // always match `a` in `ab` when using leftmost-first, but leftmost-longest
+    // would match `ab`.
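+    //
+    // As an editorial illustration of the distinction described above: for
+    // the pattern `sam|samwise` and the haystack `samwise`, leftmost-first
+    // semantics report the match for `sam` (the branch that appears first in
+    // the syntax wins), while 'All' semantics keep both `sam` and `samwise`
+    // as match states.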
+}
+
+impl MatchKind {
+    #[cfg(feature = "alloc")]
+    pub(crate) fn continue_past_first_match(&self) -> bool {
+        *self == MatchKind::All
+    }
+}
+
+impl Default for MatchKind {
+    fn default() -> MatchKind {
+        MatchKind::LeftmostFirst
+    }
+}
+
+/// A representation of a match reported by a regex engine.
+///
+/// A match records the start and end offsets of the match in the haystack.
+///
+/// Every match guarantees that `start <= end`.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct Match {
+    /// The start offset of the match, inclusive.
+    start: usize,
+    /// The end offset of the match, exclusive.
+    end: usize,
+}
+
+impl Match {
+    /// Create a new match from a byte offset span.
+    ///
+    /// # Panics
+    ///
+    /// This panics if `end < start`.
+    #[inline]
+    pub fn new(start: usize, end: usize) -> Match {
+        assert!(start <= end);
+        Match { start, end }
+    }
+
+    /// The starting position of the match.
+    #[inline]
+    pub fn start(&self) -> usize {
+        self.start
+    }
+
+    /// The ending position of the match.
+    #[inline]
+    pub fn end(&self) -> usize {
+        self.end
+    }
+
+    /// Returns the match location as a range.
+    #[inline]
+    pub fn range(&self) -> core::ops::Range<usize> {
+        self.start..self.end
+    }
+
+    /// Returns true if and only if this match is empty. That is, when
+    /// `start() == end()`.
+    ///
+    /// An empty match can only be returned when the pattern used to build the
+    /// regex engine can match the empty string.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.start == self.end
+    }
+}
+
+/// A representation of a match reported by a DFA.
+///
+/// This is called a "half" match because it only includes the end location
+/// (or start location for a reverse match) of a match. This corresponds to the
+/// information that a single DFA scan can report. Getting the other half of
+/// the match requires a second scan with a reversed DFA.
+///
+/// A half match also includes the pattern that matched. The pattern is
+/// identified by an ID, which corresponds to its position (starting from `0`)
+/// relative to other patterns used to construct the corresponding DFA. If only
+/// a single pattern is provided to the DFA, then all matches are guaranteed to
+/// have a pattern ID of `0`.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub struct HalfMatch {
+    /// The pattern ID.
+    pub(crate) pattern: PatternID,
+    /// The offset of the match.
+    ///
+    /// For forward searches, the offset is exclusive. For reverse searches,
+    /// the offset is inclusive.
+    pub(crate) offset: usize,
+}
+
+impl HalfMatch {
+    /// Create a new half match from a pattern ID and a byte offset.
+    #[inline]
+    pub fn new(pattern: PatternID, offset: usize) -> HalfMatch {
+        HalfMatch { pattern, offset }
+    }
+
+    /// Create a new half match from a pattern ID and a byte offset.
+    ///
+    /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a
+    /// [`PatternID`]. This panics if the given `usize` is not representable
+    /// as a `PatternID`.
+    #[inline]
+    pub fn must(pattern: usize, offset: usize) -> HalfMatch {
+        HalfMatch::new(PatternID::new(pattern).unwrap(), offset)
+    }
+
+    /// Returns the ID of the pattern that matched.
+    ///
+    /// The ID of a pattern is derived from the position in which it was
+    /// originally inserted into the corresponding DFA. The first pattern has
+    /// identifier `0`, and each subsequent pattern is `1`, `2` and so on.
+    #[inline]
+    pub fn pattern(&self) -> PatternID {
+        self.pattern
+    }
+
+    /// The position of the match.
+    ///
+    /// If this match was produced by a forward search, then the offset is
+    /// exclusive. If this match was produced by a reverse search, then the
+    /// offset is inclusive.
+    #[inline]
+    pub fn offset(&self) -> usize {
+        self.offset
+    }
+}
+
+/// A representation of a multi match reported by a regex engine.
+///
+/// A multi match has two essential pieces of information: the identifier of
+/// the pattern that matched, along with the start and end offsets of the match
+/// in the haystack.
+///
+/// The pattern is identified by an ID, which corresponds to its position
+/// (starting from `0`) relative to other patterns used to construct the
+/// corresponding regex engine. If only a single pattern is provided, then all
+/// multi matches are guaranteed to have a pattern ID of `0`.
+///
+/// Every multi match guarantees that `start <= end`.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct MultiMatch {
+    /// The pattern ID.
+    pattern: PatternID,
+    /// The start offset of the match, inclusive.
+    start: usize,
+    /// The end offset of the match, exclusive.
+    end: usize,
+}
+
+impl MultiMatch {
+    /// Create a new match from a pattern ID and a byte offset span.
+    ///
+    /// # Panics
+    ///
+    /// This panics if `end < start`.
+    #[inline]
+    pub fn new(pattern: PatternID, start: usize, end: usize) -> MultiMatch {
+        assert!(start <= end);
+        MultiMatch { pattern, start, end }
+    }
+
+    /// Create a new match from a pattern ID and a byte offset span.
+    ///
+    /// This is like [`MultiMatch::new`], but accepts a `usize` instead of a
+    /// [`PatternID`].
+    ///
+    /// # Panics
+    ///
+    /// This panics if `end < start` or if `pattern > PatternID::MAX`.
+    #[inline]
+    pub fn must(pattern: usize, start: usize, end: usize) -> MultiMatch {
+        MultiMatch::new(PatternID::new(pattern).unwrap(), start, end)
+    }
+
+    /// Returns the ID of the pattern that matched.
+    ///
+    /// The ID of a pattern is derived from the position in which it was
+    /// originally inserted into the corresponding regex engine. The first
+    /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and
+    /// so on.
+    #[inline]
+    pub fn pattern(&self) -> PatternID {
+        self.pattern
+    }
+
+    /// The starting position of the match.
+    #[inline]
+    pub fn start(&self) -> usize {
+        self.start
+    }
+
+    /// The ending position of the match.
+    #[inline]
+    pub fn end(&self) -> usize {
+        self.end
+    }
+
+    /// Returns the match location as a range.
+    #[inline]
+    pub fn range(&self) -> core::ops::Range<usize> {
+        self.start..self.end
+    }
+
+    /// Returns true if and only if this match is empty. That is, when
+    /// `start() == end()`.
+    ///
+    /// An empty match can only be returned when one of the patterns used to
+    /// build the regex engine can match the empty string.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.start == self.end
+    }
+}
+
+/// An error type indicating that a search stopped prematurely without finding
+/// a match.
+///
+/// This error type implies that one cannot assume that no matches occur, since
+/// the search stopped before completing.
+///
+/// Normally, when one searches for something, the response is either an
+/// affirmative "it was found at this location" or a negative "not found at
+/// all." However, in some cases, a regex engine can be configured to stop its
+/// search before concluding whether a match exists or not. When this happens,
+/// it may be important for the caller to know why the regex engine gave up
+/// and where in the input it did so. This error type exposes both the 'why'
+/// and the 'where.'
+///
+/// For example, the DFAs provided by this library generally cannot correctly
+/// implement Unicode word boundaries. Instead, they provide an option to
+/// eagerly support them on ASCII text (since Unicode word boundaries are
+/// equivalent to ASCII word boundaries when searching ASCII text), but will
+/// "give up" if a non-ASCII byte is seen. In such cases, one is usually
+/// required to either report the failure to the caller (unergonomic) or
+/// otherwise fall back to some other regex engine (ergonomic, but potentially
+/// costly).
+///
+/// More generally, some regex engines offer the ability for callers to specify
+/// certain bytes that will trigger the regex engine to automatically quit if
+/// they are seen.
+///
+/// Still other reasons for a failed match exist. For example, the hybrid DFA
+/// provided by this crate can be configured to give up if it believes that it
+/// is not being efficient. This in turn permits callers to choose a different
+/// regex engine.
+///
+/// # Advice
+///
+/// While this form of error reporting adds complexity, it is generally
+/// possible for callers to configure regex engines to never give up a search,
+/// and thus never return an error. Indeed, the default configuration for every
+/// regex engine in this crate is such that they will never stop searching
+/// early. Therefore, the only way to get a match error is if the regex engine
+/// is explicitly configured to do so. Options that enable this behavior
+/// document the new error conditions they imply.
+///
+/// Regex engines for which no errors are possible for any configuration will
+/// return the normal `Option<Match>` and not use this error type at all.
+///
+/// For example, regex engines in the `dfa` sub-module will only report
+/// `MatchError::Quit` if instructed by either
+/// [enabling Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary)
+/// or by
+/// [explicitly specifying one or more quit bytes](crate::dfa::dense::Config::quit).
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub enum MatchError {
+    // Note that the first version of this type was called `SearchError` and it
+    // included a third `None` variant to indicate that the search completed
+    // and no match was found. However, this was problematic for iterator
+    // APIs where the `None` sentinel for stopping iteration corresponds
+    // precisely to the "match not found" case. The fact that the `None`
+    // variant was buried inside this type was in turn quite awkward. So
+    // instead, I removed the `None` variant, renamed the type and used
+    // `Result<Option<Match>, MatchError>` in non-iterator APIs instead of the
+    // conceptually simpler `Result<Match, MatchError>`. However, we "regain"
+    // ergonomics by only putting the more complex API in the `try_` variants
+    // ("fallible") of search methods. The infallible APIs will instead just
+    // return `Option<Match>` and panic on error.
+    /// The search saw a "quit" byte at which it was instructed to stop
+    /// searching.
+    Quit {
+        /// The "quit" byte that was observed and that caused the search to
+        /// stop.
+        byte: u8,
+        /// The offset at which the quit byte was observed.
+ offset: usize, + }, + /// The search, based on heuristics, determined that it would be better + /// to stop, typically to provide the caller an opportunity to use an + /// alternative regex engine. + /// + /// Currently, the only way for this to occur is via the lazy DFA and + /// only when it is configured to do so (it will not return this error by + /// default). + GaveUp { + /// The offset at which the search stopped. This corresponds to the + /// position immediately following the last byte scanned. + offset: usize, + }, +} + +#[cfg(feature = "std")] +impl std::error::Error for MatchError {} + +impl core::fmt::Display for MatchError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + match *self { + MatchError::Quit { byte, offset } => write!( + f, + "quit search after observing byte \\x{:02X} at offset {}", + byte, offset, + ), + MatchError::GaveUp { offset } => { + write!(f, "gave up searching at offset {}", offset) + } + } + } +} diff --git a/vendor/regex-automata/src/util/mod.rs b/vendor/regex-automata/src/util/mod.rs new file mode 100644 index 000000000..798507da2 --- /dev/null +++ b/vendor/regex-automata/src/util/mod.rs @@ -0,0 +1,275 @@ +/*! +TODO +*/ + +use core::{ascii, fmt, str}; + +#[cfg(feature = "alloc")] +use alloc::vec::Vec; + +pub mod alphabet; +pub(crate) mod bytes; +#[cfg(feature = "alloc")] +pub(crate) mod determinize; +pub mod id; +#[cfg(feature = "alloc")] +pub(crate) mod lazy; +pub(crate) mod matchtypes; +pub mod prefilter; +#[cfg(feature = "alloc")] +pub(crate) mod sparse_set; +pub(crate) mod start; +#[cfg(feature = "alloc")] +pub(crate) mod syntax; + +/// The offset, in bytes, that a match is delayed by in the DFAs generated by +/// this crate. (This includes lazy DFAs.) +/// +/// The purpose of this delay is to support look-ahead such as \b (ASCII-only) +/// and $. In particular, both of these operators may require the +/// identification of the end of input in order to confirm a match. Not only +/// does this mean that all matches must therefore be delayed by a single byte, +/// but that a special EOI value is added to the alphabet of all DFAs. (Which +/// means that even though the alphabet of a DFA is typically all byte values, +/// the actual maximum alphabet size is 257 due to the extra EOI value.) +/// +/// Since we delay matches by only 1 byte, this can't fully support a +/// Unicode-aware \b operator, which requires multi-byte look-ahead. Indeed, +/// DFAs in this crate do not support it. (It's not as simple as just +/// increasing the match offset to do it---otherwise we would---but building +/// the full Unicode-aware word boundary detection into an automaton is quite +/// tricky.) +pub(crate) const MATCH_OFFSET: usize = 1; + +/// A type that wraps a single byte with a convenient fmt::Debug impl that +/// escapes the byte. +pub(crate) struct DebugByte(pub u8); + +impl fmt::Debug for DebugByte { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + // 10 bytes is enough to cover any output from ascii::escape_default. + let mut bytes = [0u8; 10]; + let mut len = 0; + for (i, mut b) in ascii::escape_default(self.0).enumerate() { + // capitalize \xab to \xAB + if i >= 2 && b'a' <= b && b <= b'f' { + b -= 32; + } + bytes[len] = b; + len += 1; + } + write!(f, "{}", str::from_utf8(&bytes[..len]).unwrap()) + } +} + +/// Returns the smallest possible index of the next valid UTF-8 sequence +/// starting after `i`. 
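+///
+/// An editorial sketch of the stepping behavior (crate-internal, hence
+/// `ignore`):
+///
+/// ```ignore
+/// let text = "aβc".as_bytes(); // 'β' is encoded as two bytes
+/// assert_eq!(1, next_utf8(text, 0)); // past 'a'
+/// assert_eq!(3, next_utf8(text, 1)); // past 'β', which spans bytes 1..3
+/// ```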
+/// +/// For all inputs, including invalid UTF-8 and any value of `i`, the return +/// value is guaranteed to be greater than `i`. +/// +/// Generally speaking, this should only be called on `text` when it is +/// permitted to assume that it is valid UTF-8 and where either `i >= +/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence. +#[inline(always)] +pub(crate) fn next_utf8(text: &[u8], i: usize) -> usize { + let b = match text.get(i) { + None => return i.checked_add(1).unwrap(), + Some(&b) => b, + }; + // For cases where we see an invalid UTF-8 byte, there isn't much we can do + // other than just start at the next byte. + let inc = utf8_len(b).unwrap_or(1); + i.checked_add(inc).unwrap() +} + +/// Returns true if and only if the given byte is considered a word character. +/// This only applies to ASCII. +/// +/// This was copied from regex-syntax so that we can use it to determine the +/// starting DFA state while searching without depending on regex-syntax. The +/// definition is never going to change, so there's no maintenance/bit-rot +/// hazard here. +#[inline(always)] +pub(crate) fn is_word_byte(b: u8) -> bool { + match b { + b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true, + _ => false, + } +} + +/// Decodes the next UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the beginning of the given +/// byte slice, then the first byte is returned instead. +/// +/// This returns `None` if and only if `bytes` is empty. +#[inline(always)] +pub(crate) fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> { + if bytes.is_empty() { + return None; + } + let len = match utf8_len(bytes[0]) { + None => return Some(Err(bytes[0])), + Some(len) if len > bytes.len() => return Some(Err(bytes[0])), + Some(1) => return Some(Ok(bytes[0] as char)), + Some(len) => len, + }; + match str::from_utf8(&bytes[..len]) { + Ok(s) => Some(Ok(s.chars().next().unwrap())), + Err(_) => Some(Err(bytes[0])), + } +} + +/// Decodes the last UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the end of the given byte +/// slice, then the last byte is returned instead. +/// +/// This returns `None` if and only if `bytes` is empty. +#[inline(always)] +pub(crate) fn decode_last_utf8(bytes: &[u8]) -> Option<Result<char, u8>> { + if bytes.is_empty() { + return None; + } + let mut start = bytes.len() - 1; + let limit = bytes.len().saturating_sub(4); + while start > limit && !is_leading_or_invalid_utf8_byte(bytes[start]) { + start -= 1; + } + match decode_utf8(&bytes[start..]) { + None => None, + Some(Ok(ch)) => Some(Ok(ch)), + Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])), + } +} + +/// Given a UTF-8 leading byte, this returns the total number of code units +/// in the following encoded codepoint. +/// +/// If the given byte is not a valid UTF-8 leading byte, then this returns +/// `None`. +#[inline(always)] +fn utf8_len(byte: u8) -> Option<usize> { + if byte <= 0x7F { + return Some(1); + } else if byte & 0b1100_0000 == 0b1000_0000 { + return None; + } else if byte <= 0b1101_1111 { + Some(2) + } else if byte <= 0b1110_1111 { + Some(3) + } else if byte <= 0b1111_0111 { + Some(4) + } else { + None + } +} + +/// Returns true if and only if the given byte is either a valid leading UTF-8 +/// byte, or is otherwise an invalid byte that can never appear anywhere in a +/// valid UTF-8 sequence. 
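+///
+/// An editorial sketch of the classification (crate-internal, hence
+/// `ignore`):
+///
+/// ```ignore
+/// assert!(is_leading_or_invalid_utf8_byte(b'a')); // ASCII
+/// assert!(is_leading_or_invalid_utf8_byte(0xE2)); // leading byte
+/// assert!(is_leading_or_invalid_utf8_byte(0xFF)); // never valid in UTF-8
+/// assert!(!is_leading_or_invalid_utf8_byte(0x80)); // continuation byte
+/// ```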
+#[inline(always)] +fn is_leading_or_invalid_utf8_byte(b: u8) -> bool { + // In the ASCII case, the most significant bit is never set. The leading + // byte of a 2/3/4-byte sequence always has the top two most significant + // bits set. For bytes that can never appear anywhere in valid UTF-8, this + // also returns true, since every such byte has its two most significant + // bits set: + // + // \xC0 :: 11000000 + // \xC1 :: 11000001 + // \xF5 :: 11110101 + // \xF6 :: 11110110 + // \xF7 :: 11110111 + // \xF8 :: 11111000 + // \xF9 :: 11111001 + // \xFA :: 11111010 + // \xFB :: 11111011 + // \xFC :: 11111100 + // \xFD :: 11111101 + // \xFE :: 11111110 + // \xFF :: 11111111 + (b & 0b1100_0000) != 0b1000_0000 +} + +#[cfg(feature = "alloc")] +#[inline(always)] +pub(crate) fn is_word_char_fwd(bytes: &[u8], mut at: usize) -> bool { + use core::{ptr, sync::atomic::AtomicPtr}; + + use crate::{ + dfa::{ + dense::{self, DFA}, + Automaton, + }, + util::lazy, + }; + + static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut()); + + let dfa = lazy::get_or_init(&WORD, || { + // TODO: Should we use a lazy DFA here instead? It does complicate + // things somewhat, since we then need a mutable cache, which probably + // means a thread local. + dense::Builder::new() + .configure(dense::Config::new().anchored(true)) + .build(r"\w") + .unwrap() + }); + // This is OK since '\w' contains no look-around. + let mut sid = dfa.universal_start_state(); + while at < bytes.len() { + let byte = bytes[at]; + sid = dfa.next_state(sid, byte); + at += 1; + if dfa.is_special_state(sid) { + if dfa.is_match_state(sid) { + return true; + } else if dfa.is_dead_state(sid) { + return false; + } + } + } + dfa.is_match_state(dfa.next_eoi_state(sid)) +} + +#[cfg(feature = "alloc")] +#[inline(always)] +pub(crate) fn is_word_char_rev(bytes: &[u8], mut at: usize) -> bool { + use core::{ptr, sync::atomic::AtomicPtr}; + + use crate::{ + dfa::{ + dense::{self, DFA}, + Automaton, + }, + nfa::thompson::NFA, + }; + + static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut()); + + let dfa = lazy::get_or_init(&WORD, || { + dense::Builder::new() + .configure(dense::Config::new().anchored(true)) + .thompson(NFA::config().reverse(true).shrink(true)) + .build(r"\w") + .unwrap() + }); + + // This is OK since '\w' contains no look-around. + let mut sid = dfa.universal_start_state(); + while at > 0 { + at -= 1; + let byte = bytes[at]; + sid = dfa.next_state(sid, byte); + if dfa.is_special_state(sid) { + if dfa.is_match_state(sid) { + return true; + } else if dfa.is_dead_state(sid) { + return false; + } + } + } + dfa.is_match_state(dfa.next_eoi_state(sid)) +} diff --git a/vendor/regex-automata/src/util/prefilter.rs b/vendor/regex-automata/src/util/prefilter.rs new file mode 100644 index 000000000..5fe151524 --- /dev/null +++ b/vendor/regex-automata/src/util/prefilter.rs @@ -0,0 +1,281 @@ +use crate::Match; + +/// A candidate is the result of running a prefilter on a haystack at a +/// particular position. The result is one of no match, a confirmed match or +/// a possible match. +/// +/// When no match is returned, the prefilter is guaranteeing that no possible +/// match can be found in the haystack, and the caller may trust this. That is, +/// all correct prefilters must never report false negatives. +/// +/// In some cases, a prefilter can confirm a match very quickly, in which case, +/// the caller may use this to stop what it's doing and report the match. 
In
+/// this case, prefilter implementations must never report a false positive.
+/// In other cases, the prefilter can only report a potential match, which the
+/// caller must then attempt to confirm. For potential matches, prefilter
+/// implementations are permitted to return false positives.
+#[derive(Clone, Debug)]
+pub enum Candidate {
+    /// The prefilter reports that no match is possible. Prefilter
+    /// implementations will never report false negatives.
+    None,
+    /// The prefilter reports that a match has been confirmed at the provided
+    /// byte offsets. When this variant is reported, the prefilter is
+    /// guaranteeing a match. No false positives are permitted.
+    Match(Match),
+    /// The prefilter reports that a match *may* start at the given position.
+    /// When this variant is reported, it may correspond to a false positive.
+    PossibleStartOfMatch(usize),
+}
+
+impl Candidate {
+    /// Convert this candidate into an option. This is useful when callers do
+    /// not distinguish between true positives and false positives (i.e., the
+    /// caller must always confirm the match in order to update some other
+    /// state).
+    ///
+    /// The byte offset in the option returned corresponds to the starting
+    /// position of the possible match.
+    pub fn into_option(self) -> Option<usize> {
+        match self {
+            Candidate::None => None,
+            Candidate::Match(ref m) => Some(m.start()),
+            Candidate::PossibleStartOfMatch(start) => Some(start),
+        }
+    }
+}
+
+/// A prefilter describes the behavior of fast literal scanners for quickly
+/// skipping past bytes in the haystack that we know cannot possibly
+/// participate in a match.
+pub trait Prefilter: core::fmt::Debug {
+    /// Returns the next possible match candidate. This may yield false
+    /// positives, so callers must confirm a match starting at the position
+    /// returned. This, however, must never produce false negatives. That is,
+    /// this must, at minimum, return the starting position of the next match
+    /// in the given haystack after or at the given position.
+    fn next_candidate(
+        &self,
+        state: &mut State,
+        haystack: &[u8],
+        at: usize,
+    ) -> Candidate;
+
+    /// Returns the approximate total amount of heap used by this prefilter, in
+    /// units of bytes.
+    fn heap_bytes(&self) -> usize;
+
+    /// Returns true if and only if this prefilter may return false positives
+    /// via the `Candidate::PossibleStartOfMatch` variant. This is most useful
+    /// when false positives are not possible (in which case, implementations
+    /// should return false), which may allow completely avoiding heavier regex
+    /// machinery when the prefilter can quickly confirm its own matches.
+    ///
+    /// By default, this returns true, which is conservative; it is always
+    /// correct to return `true`. Returning `false` here and reporting a false
+    /// positive will result in incorrect searches.
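+    ///
+    /// As an editorial sketch, a hypothetical prefilter that scans for a
+    /// required leading byte only ever reports *possible* starts, so it would
+    /// keep this default of `true` (this type is not part of the crate):
+    ///
+    /// ```ignore
+    /// #[derive(Debug)]
+    /// struct StartByte(u8);
+    ///
+    /// impl Prefilter for StartByte {
+    ///     fn next_candidate(
+    ///         &self,
+    ///         _state: &mut State,
+    ///         haystack: &[u8],
+    ///         at: usize,
+    ///     ) -> Candidate {
+    ///         match haystack[at..].iter().position(|&b| b == self.0) {
+    ///             None => Candidate::None,
+    ///             Some(i) => Candidate::PossibleStartOfMatch(at + i),
+    ///         }
+    ///     }
+    ///
+    ///     fn heap_bytes(&self) -> usize {
+    ///         0
+    ///     }
+    /// }
+    /// ```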
+ fn reports_false_positives(&self) -> bool { + true + } +} + +impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P { + #[inline] + fn next_candidate( + &self, + state: &mut State, + haystack: &[u8], + at: usize, + ) -> Candidate { + (**self).next_candidate(state, haystack, at) + } + + fn heap_bytes(&self) -> usize { + (**self).heap_bytes() + } + + fn reports_false_positives(&self) -> bool { + (**self).reports_false_positives() + } +} + +#[derive(Clone)] +pub struct Scanner<'p> { + prefilter: &'p dyn Prefilter, + state: State, +} + +impl<'p> Scanner<'p> { + pub fn new(prefilter: &'p dyn Prefilter) -> Scanner<'p> { + Scanner { prefilter, state: State::new() } + } + + pub(crate) fn is_effective(&mut self, at: usize) -> bool { + self.state.is_effective(at) + } + + pub(crate) fn reports_false_positives(&self) -> bool { + self.prefilter.reports_false_positives() + } + + pub(crate) fn next_candidate( + &mut self, + bytes: &[u8], + at: usize, + ) -> Candidate { + let cand = self.prefilter.next_candidate(&mut self.state, bytes, at); + match cand { + Candidate::None => { + self.state.update_skipped_bytes(bytes.len() - at); + } + Candidate::Match(ref m) => { + self.state.update_skipped_bytes(m.start() - at); + } + Candidate::PossibleStartOfMatch(i) => { + self.state.update_skipped_bytes(i - at); + } + } + cand + } +} + +impl<'p> core::fmt::Debug for Scanner<'p> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_struct("Scanner").field("state", &self.state).finish() + } +} + +/// State tracks state associated with the effectiveness of a +/// prefilter. It is used to track how many bytes, on average, are skipped by +/// the prefilter. If this average dips below a certain threshold over time, +/// then the state renders the prefilter inert and stops using it. +/// +/// A prefilter state should be created for each search. (Where creating an +/// iterator via, e.g., `find_iter`, is treated as a single search.) +#[derive(Clone, Debug)] +pub struct State { + /// The number of skips that has been executed. + skips: usize, + /// The total number of bytes that have been skipped. + skipped: usize, + /// Once this heuristic has been deemed permanently ineffective, it will be + /// inert throughout the rest of its lifetime. This serves as a cheap way + /// to check inertness. + inert: bool, + /// The last (absolute) position at which a prefilter scanned to. + /// Prefilters can use this position to determine whether to re-scan or + /// not. + /// + /// Unlike other things that impact effectiveness, this is a fleeting + /// condition. That is, a prefilter can be considered ineffective if it is + /// at a position before `last_scan_at`, but can become effective again + /// once the search moves past `last_scan_at`. + /// + /// The utility of this is to both avoid additional overhead from calling + /// the prefilter and to avoid quadratic behavior. This ensures that a + /// prefilter will scan any particular byte at most once. (Note that some + /// prefilters, like the start-byte prefilter, do not need to use this + /// field at all, since it only looks for starting bytes.) + last_scan_at: usize, +} + +impl State { + /// The minimum number of skip attempts to try before considering whether + /// a prefilter is effective or not. + const MIN_SKIPS: usize = 40; + + /// The minimum amount of bytes that skipping must average. 
+    ///
+    /// That is, after MIN_SKIPS have occurred, if the average number of bytes
+    /// skipped ever falls below MIN_AVG_SKIP, then the prefilter will be
+    /// rendered inert.
+    const MIN_AVG_SKIP: usize = 16;
+
+    /// Create a fresh prefilter state.
+    pub fn new() -> State {
+        State { skips: 0, skipped: 0, inert: false, last_scan_at: 0 }
+    }
+
+    /// Updates the position at which the last scan stopped. This may be
+    /// greater than the position of the last candidate reported. For example,
+    /// searching for the byte `z` in `abczdef` for the pattern `abcz` will
+    /// report a candidate at position `0`, but the end of its last scan will
+    /// be at position `3`.
+    ///
+    /// This position factors into the effectiveness of this prefilter. If the
+    /// current position is less than the last position at which a scan ended,
+    /// then the prefilter should not be re-run until the search moves past
+    /// that position.
+    ///
+    /// It is always correct to never update the last scan position. In fact,
+    /// it is also always correct to set the last scan position to an arbitrary
+    /// value. The key is setting it to a position in the future at which it
+    /// makes sense to restart the prefilter.
+    pub fn update_last_scan(&mut self, at: usize) {
+        if at > self.last_scan_at {
+            self.last_scan_at = at;
+        }
+    }
+
+    /// Return true if and only if this state indicates that a prefilter is
+    /// still effective. If the prefilter is not effective, then this state
+    /// is rendered "inert," at which point all subsequent calls to
+    /// `is_effective` on this state will return `false`.
+    ///
+    /// `at` should correspond to the current starting position of the search.
+    ///
+    /// Callers typically do not need to use this directly, as it is invoked
+    /// via `Scanner::is_effective`.
+    fn is_effective(&mut self, at: usize) -> bool {
+        if self.inert {
+            return false;
+        }
+        if at < self.last_scan_at {
+            return false;
+        }
+        if self.skips < State::MIN_SKIPS {
+            return true;
+        }
+
+        if self.skipped >= State::MIN_AVG_SKIP * self.skips {
+            return true;
+        }
+
+        // We're inert.
+        self.inert = true;
+        false
+    }
+
+    /// Update this state with the number of bytes skipped on the last
+    /// invocation of the prefilter.
+    fn update_skipped_bytes(&mut self, skipped: usize) {
+        self.skips += 1;
+        self.skipped += skipped;
+    }
+}
+
+/// A `Prefilter` implementation that reports a possible match at every
+/// position.
+///
+/// This should generally not be used as an actual prefilter. It is only
+/// useful when one needs to represent the absence of a prefilter in a generic
+/// context. For example, a [`dfa::regex::Regex`](crate::dfa::regex::Regex)
+/// uses this prefilter by default to indicate that no prefilter should be
+/// used.
+///
+/// A `None` prefilter value cannot be constructed.
+#[derive(Clone, Debug)]
+pub struct None {
+    _priv: (),
+}
+
+impl Prefilter for None {
+    fn next_candidate(&self, _: &mut State, _: &[u8], at: usize) -> Candidate {
+        Candidate::PossibleStartOfMatch(at)
+    }
+
+    fn heap_bytes(&self) -> usize {
+        0
+    }
+}
diff --git a/vendor/regex-automata/src/util/sparse_set.rs b/vendor/regex-automata/src/util/sparse_set.rs
new file mode 100644
index 000000000..bf59e4469
--- /dev/null
+++ b/vendor/regex-automata/src/util/sparse_set.rs
@@ -0,0 +1,229 @@
+use alloc::{boxed::Box, vec, vec::Vec};
+
+use crate::util::id::StateID;
+
+/// A pair of sparse sets.
+///
+/// This is useful when one needs to compute NFA epsilon closures from a
+/// previous set of states derived from an epsilon closure. One set can be the
+/// starting states, whereas the other set can be the destination states after
+/// following the transitions for a particular byte of input.
+///
+/// There is no significance to 'set1' or 'set2'. They are both sparse sets of
+/// the same size.
+///
+/// The members of this struct are exposed so that callers may borrow 'set1'
+/// and 'set2' individually without being forced to borrow both at the same
+/// time.
+#[derive(Clone, Debug)]
+pub(crate) struct SparseSets {
+    pub(crate) set1: SparseSet,
+    pub(crate) set2: SparseSet,
+}
+
+impl SparseSets {
+    /// Create a new pair of sparse sets where each set has the given capacity.
+    ///
+    /// This panics if the capacity given is bigger than `StateID::LIMIT`.
+    pub(crate) fn new(capacity: usize) -> SparseSets {
+        SparseSets {
+            set1: SparseSet::new(capacity),
+            set2: SparseSet::new(capacity),
+        }
+    }
+
+    /// Resizes these sparse sets to have the new capacity given.
+    ///
+    /// The sets are automatically cleared.
+    ///
+    /// This panics if the capacity given is bigger than `StateID::LIMIT`.
+    #[inline]
+    pub(crate) fn resize(&mut self, new_capacity: usize) {
+        self.set1.resize(new_capacity);
+        self.set2.resize(new_capacity);
+    }
+
+    /// Clear both sparse sets.
+    pub(crate) fn clear(&mut self) {
+        self.set1.clear();
+        self.set2.clear();
+    }
+
+    /// Swap set1 with set2.
+    pub(crate) fn swap(&mut self) {
+        core::mem::swap(&mut self.set1, &mut self.set2);
+    }
+
+    /// Returns the memory usage, in bytes, used by this pair of sparse sets.
+    pub(crate) fn memory_usage(&self) -> usize {
+        self.set1.memory_usage() + self.set2.memory_usage()
+    }
+}
+
+/// A sparse set used for representing ordered NFA states.
+///
+/// This supports constant time addition and membership testing. Clearing an
+/// entire set can also be done in constant time. Iteration yields elements
+/// in the order in which they were inserted.
+///
+/// The data structure is based on: https://research.swtch.com/sparse
+/// Note though that we don't actually use uninitialized memory. We generally
+/// reuse sparse sets, so the initial allocation cost is bearable. However, its
+/// other properties listed above are extremely useful.
+#[derive(Clone)]
+pub(crate) struct SparseSet {
+    /// The number of elements currently in this set.
+    len: usize,
+    /// Dense contains the ids in the order in which they were inserted.
+    dense: Vec<StateID>,
+    /// Sparse maps ids to their location in dense.
+    ///
+    /// A state ID is in the set if and only if
+    /// sparse[id] < len && id == dense[sparse[id]].
+    sparse: Vec<StateID>,
+}
+
+impl SparseSet {
+    /// Create a new sparse set with the given capacity.
+    ///
+    /// Sparse sets have a fixed size and they cannot grow. Attempting to
+    /// insert more distinct elements than the total capacity of the set will
+    /// result in a panic.
+    ///
+    /// This panics if the capacity given is bigger than `StateID::LIMIT`.
+    #[inline]
+    pub(crate) fn new(capacity: usize) -> SparseSet {
+        let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] };
+        set.resize(capacity);
+        set
+    }
+
+    /// Resizes this sparse set to have the new capacity given.
+    ///
+    /// This set is automatically cleared.
+    ///
+    /// This panics if the capacity given is bigger than `StateID::LIMIT`.
+    #[inline]
+    pub(crate) fn resize(&mut self, new_capacity: usize) {
+        assert!(
+            new_capacity <= StateID::LIMIT,
+            "sparse set capacity cannot exceed {:?}",
+            StateID::LIMIT
+        );
+        self.clear();
+        self.dense.resize(new_capacity, StateID::ZERO);
+        self.sparse.resize(new_capacity, StateID::ZERO);
+    }
+
+    /// Returns the capacity of this set.
+    ///
+    /// The capacity represents a fixed limit on the number of distinct
+    /// elements that are allowed in this set. The capacity cannot be changed.
+    #[inline]
+    pub(crate) fn capacity(&self) -> usize {
+        self.dense.len()
+    }
+
+    /// Returns the number of elements in this set.
+    #[inline]
+    pub(crate) fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Returns true if and only if this set is empty.
+    #[inline]
+    pub(crate) fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Insert the state ID value into this set and return true if the given
+    /// state ID was not previously in this set.
+    ///
+    /// This operation is idempotent. If the given value is already in this
+    /// set, then this is a no-op.
+    ///
+    /// If more than `capacity` ids are inserted, then this panics.
+    ///
+    /// This is marked as inline(always) since the compiler won't inline it
+    /// otherwise, and it's a fairly hot piece of code in DFA determinization.
+    #[inline(always)]
+    pub(crate) fn insert(&mut self, value: StateID) -> bool {
+        if self.contains(value) {
+            return false;
+        }
+
+        let i = self.len();
+        assert!(
+            i < self.capacity(),
+            "{:?} exceeds capacity of {:?} when inserting {:?}",
+            i,
+            self.capacity(),
+            value,
+        );
+        // OK since i < self.capacity() and self.capacity() is guaranteed to
+        // be <= StateID::LIMIT.
+        let id = StateID::new_unchecked(i);
+        self.dense[id] = value;
+        self.sparse[value] = id;
+        self.len += 1;
+        true
+    }
+
+    /// Returns true if and only if this set contains the given value.
+    #[inline]
+    pub(crate) fn contains(&self, value: StateID) -> bool {
+        let i = self.sparse[value];
+        i.as_usize() < self.len() && self.dense[i] == value
+    }
+
+    /// Returns the ith inserted element from this set.
+    ///
+    /// Callers must ensure that `i < self.len()`. This panics when
+    /// `i >= self.capacity()`, but an `i` in the range
+    /// `self.len()..self.capacity()` silently yields a stale value.
+    #[inline]
+    pub(crate) fn get(&self, i: usize) -> StateID {
+        self.dense[i]
+    }
+
+    /// Clear this set such that it has no members.
+    #[inline]
+    pub(crate) fn clear(&mut self) {
+        self.len = 0;
+    }
+
+    /// Returns the heap memory usage, in bytes, used by this sparse set.
+    #[inline]
+    pub(crate) fn memory_usage(&self) -> usize {
+        2 * self.dense.len() * StateID::SIZE
+    }
+}
+
+impl core::fmt::Debug for SparseSet {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        let elements: Vec<StateID> = self.into_iter().collect();
+        f.debug_tuple("SparseSet").field(&elements).finish()
+    }
+}
+
+/// An iterator over all elements in a sparse set.
+///
+/// The lifetime `'a` refers to the lifetime of the set being iterated over.
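+///
+/// An editorial sketch of the insertion-order and idempotence guarantees,
+/// using this crate-internal API (hence `ignore`):
+///
+/// ```ignore
+/// let mut set = SparseSet::new(10);
+/// set.insert(StateID::must(3));
+/// set.insert(StateID::must(1));
+/// set.insert(StateID::must(3)); // no-op: already present
+/// let ids: Vec<StateID> = (&set).into_iter().collect();
+/// assert_eq!(ids, vec![StateID::must(3), StateID::must(1)]);
+/// ```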
+#[derive(Debug)]
+pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>);
+
+impl<'a> IntoIterator for &'a SparseSet {
+    type Item = StateID;
+    type IntoIter = SparseSetIter<'a>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        SparseSetIter(self.dense[..self.len()].iter())
+    }
+}
+
+impl<'a> Iterator for SparseSetIter<'a> {
+    type Item = StateID;
+
+    #[inline(always)]
+    fn next(&mut self) -> Option<StateID> {
+        self.0.next().copied()
+    }
+}
diff --git a/vendor/regex-automata/src/util/start.rs b/vendor/regex-automata/src/util/start.rs
new file mode 100644
index 000000000..3c756fc26
--- /dev/null
+++ b/vendor/regex-automata/src/util/start.rs
@@ -0,0 +1,109 @@
+/// Represents the four possible starting configurations of a DFA search.
+///
+/// The starting configuration is determined by inspecting the beginning of
+/// the haystack (up to 1 byte). Ultimately, this along with a pattern ID (if
+/// specified) is what selects the start state to use in a DFA.
+///
+/// In a DFA that doesn't have starting states for each pattern, there will be
+/// a maximum of four DFA start states. If the DFA was compiled with start
+/// states for each pattern, then it will have a maximum of four DFA start
+/// states for searching for any pattern, and then another maximum of four DFA
+/// start states for executing an anchored search for each pattern.
+///
+/// This ends up being represented as a table in the DFA (whether lazy or fully
+/// built) where the stride of that table is 4, and each entry is an index into
+/// the state transition table. Note though that multiple entries in the table
+/// might point to the same state if the states would otherwise be equivalent.
+/// (This is guaranteed by DFA minimization and may even be accomplished by
+/// normal determinization, since it attempts to reuse equivalent states too.)
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(crate) enum Start {
+    /// This occurs when the starting position is not any of the ones below.
+    NonWordByte = 0,
+    /// This occurs when the byte immediately preceding the start of the search
+    /// is an ASCII word byte.
+    WordByte = 1,
+    /// This occurs when the starting position of the search corresponds to the
+    /// beginning of the haystack.
+    Text = 2,
+    /// This occurs when the byte immediately preceding the start of the search
+    /// is a line terminator. Specifically, `\n`.
+    Line = 3,
+}
+
+impl Start {
+    /// Return the starting state corresponding to the given integer. If no
+    /// starting state exists for the given integer, then None is returned.
+    pub(crate) fn from_usize(n: usize) -> Option<Start> {
+        match n {
+            0 => Some(Start::NonWordByte),
+            1 => Some(Start::WordByte),
+            2 => Some(Start::Text),
+            3 => Some(Start::Line),
+            _ => None,
+        }
+    }
+
+    /// Returns the total number of starting state configurations.
+    pub(crate) fn count() -> usize {
+        4
+    }
+
+    /// Returns the starting state configuration for the given search
+    /// parameters. If the given offset range is not valid, then this panics.
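+    ///
+    /// An editorial sketch of the classification (crate-internal, hence
+    /// `ignore`):
+    ///
+    /// ```ignore
+    /// assert_eq!(Start::Text, Start::from_position_fwd(b"abc", 0, 3));
+    /// assert_eq!(Start::Line, Start::from_position_fwd(b"\nabc", 1, 4));
+    /// assert_eq!(Start::WordByte, Start::from_position_fwd(b"abc", 1, 3));
+    /// assert_eq!(Start::NonWordByte, Start::from_position_fwd(b" abc", 1, 4));
+    /// ```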
+ #[inline(always)] + pub(crate) fn from_position_fwd( + bytes: &[u8], + start: usize, + end: usize, + ) -> Start { + assert!( + bytes.get(start..end).is_some(), + "{}..{} is invalid", + start, + end + ); + if start == 0 { + Start::Text + } else if bytes[start - 1] == b'\n' { + Start::Line + } else if crate::util::is_word_byte(bytes[start - 1]) { + Start::WordByte + } else { + Start::NonWordByte + } + } + + /// Returns the starting state configuration for a reverse search with the + /// given search parameters. If the given offset range is not valid, then + /// this panics. + #[inline(always)] + pub(crate) fn from_position_rev( + bytes: &[u8], + start: usize, + end: usize, + ) -> Start { + assert!( + bytes.get(start..end).is_some(), + "{}..{} is invalid", + start, + end + ); + if end == bytes.len() { + Start::Text + } else if bytes[end] == b'\n' { + Start::Line + } else if crate::util::is_word_byte(bytes[end]) { + Start::WordByte + } else { + Start::NonWordByte + } + } + + /// Return this starting configuration as an integer. It is guaranteed to + /// be less than `Start::count()`. + #[inline(always)] + pub(crate) fn as_usize(&self) -> usize { + *self as usize + } +} diff --git a/vendor/regex-automata/src/util/syntax.rs b/vendor/regex-automata/src/util/syntax.rs new file mode 100644 index 000000000..88beeee75 --- /dev/null +++ b/vendor/regex-automata/src/util/syntax.rs @@ -0,0 +1,272 @@ +use regex_syntax::ParserBuilder; + +/// A common set of configuration options that apply to the syntax of a regex. +/// +/// This represents a group of configuration options that specifically apply +/// to how the concrete syntax of a regular expression is interpreted. In +/// particular, they are generally forwarded to the +/// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html) +/// in the +/// [`regex-syntax`](https://docs.rs/regex-syntax) +/// crate when building a regex from its concrete syntax directly. +/// +/// These options are defined as a group since they apply to every regex engine +/// in this crate. Instead of re-defining them on every engine's builder, they +/// are instead provided here as one cohesive unit. +#[derive(Clone, Copy, Debug)] +pub struct SyntaxConfig { + case_insensitive: bool, + multi_line: bool, + dot_matches_new_line: bool, + swap_greed: bool, + ignore_whitespace: bool, + unicode: bool, + utf8: bool, + nest_limit: u32, + octal: bool, +} + +impl SyntaxConfig { + /// Return a new default syntax configuration. + pub fn new() -> SyntaxConfig { + // These defaults match the ones used in regex-syntax. + SyntaxConfig { + case_insensitive: false, + multi_line: false, + dot_matches_new_line: false, + swap_greed: false, + ignore_whitespace: false, + unicode: true, + utf8: true, + nest_limit: 250, + octal: false, + } + } + + /// Enable or disable the case insensitive flag by default. + /// + /// When Unicode mode is enabled, case insensitivity is Unicode-aware. + /// Specifically, it will apply the "simple" case folding rules as + /// specified by Unicode. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `i` flag. + pub fn case_insensitive(mut self, yes: bool) -> SyntaxConfig { + self.case_insensitive = yes; + self + } + + /// Enable or disable the multi-line matching flag by default. + /// + /// When this is enabled, the `^` and `$` look-around assertions will + /// match immediately after and immediately before a new line character, + /// respectively. 
Note that the `\A` and `\z` look-around assertions are
+    /// unaffected by this setting and always correspond to matching at the
+    /// beginning and end of the input.
+    ///
+    /// By default this is disabled. It may alternatively be selectively
+    /// enabled in the regular expression itself via the `m` flag.
+    pub fn multi_line(mut self, yes: bool) -> SyntaxConfig {
+        self.multi_line = yes;
+        self
+    }
+
+    /// Enable or disable the "dot matches any character" flag by default.
+    ///
+    /// When this is enabled, `.` will match any character. When it's disabled,
+    /// then `.` will match any character except for a new line character.
+    ///
+    /// Note that `.` is impacted by whether the "unicode" setting is enabled
+    /// or not. When Unicode is enabled (the default), `.` will match any UTF-8
+    /// encoding of any Unicode scalar value (sans a new line, depending on
+    /// whether this "dot matches new line" option is enabled). When Unicode
+    /// mode is disabled, `.` will match any byte instead. Because of this,
+    /// when Unicode mode is disabled, `.` can only be used when the "allow
+    /// invalid UTF-8" option is enabled, since `.` could otherwise match
+    /// invalid UTF-8.
+    ///
+    /// By default this is disabled. It may alternatively be selectively
+    /// enabled in the regular expression itself via the `s` flag.
+    pub fn dot_matches_new_line(mut self, yes: bool) -> SyntaxConfig {
+        self.dot_matches_new_line = yes;
+        self
+    }
+
+    /// Enable or disable the "swap greed" flag by default.
+    ///
+    /// When this is enabled, `.*` (for example) will become ungreedy and `.*?`
+    /// will become greedy.
+    ///
+    /// By default this is disabled. It may alternatively be selectively
+    /// enabled in the regular expression itself via the `U` flag.
+    pub fn swap_greed(mut self, yes: bool) -> SyntaxConfig {
+        self.swap_greed = yes;
+        self
+    }
+
+    /// Enable verbose mode in the regular expression.
+    ///
+    /// When enabled, verbose mode permits insignificant whitespace in many
+    /// places in the regular expression, as well as comments. Comments are
+    /// started using `#` and continue until the end of the line.
+    ///
+    /// By default, this is disabled. It may be selectively enabled in the
+    /// regular expression by using the `x` flag regardless of this setting.
+    pub fn ignore_whitespace(mut self, yes: bool) -> SyntaxConfig {
+        self.ignore_whitespace = yes;
+        self
+    }
+
+    /// Enable or disable the Unicode flag (`u`) by default.
+    ///
+    /// By default this is **enabled**. It may alternatively be selectively
+    /// disabled in the regular expression itself via the `u` flag.
+    ///
+    /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by
+    /// default), a regular expression will fail to parse if Unicode mode is
+    /// disabled and a sub-expression could possibly match invalid UTF-8.
+    ///
+    /// **WARNING**: Unicode mode can greatly increase the size of the compiled
+    /// DFA, which can noticeably impact both memory usage and compilation
+    /// time. This is especially noticeable if your regex contains character
+    /// classes like `\w` that are impacted by whether Unicode is enabled or
+    /// not. If Unicode is not necessary, you are encouraged to disable it.
+    pub fn unicode(mut self, yes: bool) -> SyntaxConfig {
+        self.unicode = yes;
+        self
+    }
+
+    /// When disabled, the builder will permit the construction of a regular
+    /// expression that may match invalid UTF-8.
+
+    /// When disabled, the builder will permit the construction of a regular
+    /// expression that may match invalid UTF-8.
+    ///
+    /// For example, when [`SyntaxConfig::unicode`] is disabled, then
+    /// expressions like `[^a]` may match invalid UTF-8 since they can match
+    /// any single byte that is not `a`. By default, these sub-expressions
+    /// are disallowed to avoid returning offsets that split a UTF-8
+    /// encoded codepoint. However, in cases where matching at arbitrary
+    /// locations is desired, this option can be disabled to permit all such
+    /// sub-expressions.
+    ///
+    /// When enabled (the default), the builder is guaranteed to produce a
+    /// regex that will only ever match valid UTF-8 (otherwise, the builder
+    /// will return an error).
+    pub fn utf8(mut self, yes: bool) -> SyntaxConfig {
+        self.utf8 = yes;
+        self
+    }
+
+    /// Set the nesting limit used for the regular expression parser.
+    ///
+    /// The nesting limit controls how deep the abstract syntax tree is
+    /// allowed to be. If the AST exceeds the given limit (e.g., with too
+    /// many nested groups), then an error is returned by the parser.
+    ///
+    /// The purpose of this limit is to act as a heuristic to prevent stack
+    /// overflow when building a finite automaton from a regular expression's
+    /// abstract syntax tree. In particular, construction currently uses
+    /// recursion. In the future, the implementation may stop using recursion
+    /// and this option will no longer be necessary.
+    ///
+    /// This limit is not checked until the entire AST is parsed. Therefore,
+    /// if callers want to put a limit on the amount of heap space used, then
+    /// they should impose a limit on the length, in bytes, of the concrete
+    /// pattern string. In particular, this is viable since the parser will
+    /// limit itself to heap space proportional to the length of the pattern
+    /// string.
+    ///
+    /// Note that a nest limit of `0` will return a nest limit error for most
+    /// patterns but not all. For example, a nest limit of `0` permits `a`
+    /// but not `ab`, since `ab` requires a concatenation AST item, which
+    /// results in a nest depth of `1`. In general, a nest limit is not
+    /// something that manifests in an obvious way in the concrete syntax,
+    /// therefore, it should not be used in a granular way.
+    pub fn nest_limit(mut self, limit: u32) -> SyntaxConfig {
+        self.nest_limit = limit;
+        self
+    }
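+
+    // An illustrative sketch, not part of the upstream source, of the
+    // zero nest limit behavior described above, exercised directly on the
+    // `regex_syntax::ParserBuilder` that this config is applied to. The
+    // pattern `a` parses, while `ab` trips the limit because concatenation
+    // introduces one level of nesting:
+    //
+    //     let mut parser =
+    //         regex_syntax::ParserBuilder::new().nest_limit(0).build();
+    //     assert!(parser.parse("a").is_ok());
+    //     assert!(parser.parse("ab").is_err());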
+
+    /// Whether to support octal syntax or not.
+    ///
+    /// Octal syntax is a little-known way of uttering Unicode codepoints in
+    /// a regular expression. For example, `a`, `\x61`, `\u0061` and `\141`
+    /// are all equivalent regular expressions, where the last example shows
+    /// octal syntax.
+    ///
+    /// While supporting octal syntax isn't in and of itself a problem, it
+    /// does make good error messages harder. That is, in PCRE-based regex
+    /// engines, syntax like `\1` invokes a backreference, which is
+    /// explicitly unsupported in Rust's regex engine. However, many users
+    /// expect it to be supported. Therefore, when octal support is disabled,
+    /// the error message will explicitly mention that backreferences aren't
+    /// supported.
+    ///
+    /// Octal syntax is disabled by default.
+    pub fn octal(mut self, yes: bool) -> SyntaxConfig {
+        self.octal = yes;
+        self
+    }
+
+    /// Returns whether "unicode" mode is enabled.
+    pub fn get_unicode(&self) -> bool {
+        self.unicode
+    }
+
+    /// Returns whether "case insensitive" mode is enabled.
+    pub fn get_case_insensitive(&self) -> bool {
+        self.case_insensitive
+    }
+
+    /// Returns whether "multi line" mode is enabled.
+    pub fn get_multi_line(&self) -> bool {
+        self.multi_line
+    }
+
+    /// Returns whether "dot matches new line" mode is enabled.
+    pub fn get_dot_matches_new_line(&self) -> bool {
+        self.dot_matches_new_line
+    }
+
+    /// Returns whether "swap greed" mode is enabled.
+    pub fn get_swap_greed(&self) -> bool {
+        self.swap_greed
+    }
+
+    /// Returns whether "ignore whitespace" mode is enabled.
+    pub fn get_ignore_whitespace(&self) -> bool {
+        self.ignore_whitespace
+    }
+
+    /// Returns whether UTF-8 mode is enabled.
+    pub fn get_utf8(&self) -> bool {
+        self.utf8
+    }
+
+    /// Returns the "nest limit" setting.
+    pub fn get_nest_limit(&self) -> u32 {
+        self.nest_limit
+    }
+
+    /// Returns whether "octal" mode is enabled.
+    pub fn get_octal(&self) -> bool {
+        self.octal
+    }
+
+    /// Applies this configuration to the given parser builder.
+    pub(crate) fn apply(&self, builder: &mut ParserBuilder) {
+        builder
+            .unicode(self.unicode)
+            .case_insensitive(self.case_insensitive)
+            .multi_line(self.multi_line)
+            .dot_matches_new_line(self.dot_matches_new_line)
+            .swap_greed(self.swap_greed)
+            .ignore_whitespace(self.ignore_whitespace)
+            .allow_invalid_utf8(!self.utf8)
+            .nest_limit(self.nest_limit)
+            .octal(self.octal);
+    }
+}
+
+impl Default for SyntaxConfig {
+    fn default() -> SyntaxConfig {
+        SyntaxConfig::new()
+    }
+}
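+
+// An illustrative sketch, not part of the upstream source, of how the
+// crate-internal `apply` above might be used: every knob on this config
+// is forwarded wholesale onto a `regex_syntax::ParserBuilder` before a
+// pattern is parsed. (`apply` is `pub(crate)`, so this only works from
+// within this crate.)
+//
+//     let config = SyntaxConfig::new().unicode(false).utf8(false);
+//     let mut builder = regex_syntax::ParserBuilder::new();
+//     config.apply(&mut builder);
+//     let hir = builder.build().parse(r"[^a]").unwrap();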