author    | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:47:55 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:47:55 +0000
commit    | 2aadc03ef15cb5ca5cc2af8a7c08e070742f0ac4 (patch)
tree      | 033cc839730fda84ff08db877037977be94e5e3a /vendor/regex-automata/src/util
parent    | Initial commit. (diff)
Adding upstream version 0.70.1+ds1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata/src/util')
29 files changed, 19091 insertions, 0 deletions
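Of the modules added below, alphabet.rs is the largest conceptual piece: it collapses the 256 possible byte values into equivalence classes so that a DFA's transition table only needs one column per class rather than one per byte. As a quick orientation, here is a minimal sketch of how that surfaces through the public API, assembled from the doc examples contained in the patch itself (the `[a-z]+` pattern and the assertions are copied from those examples; only the `main` wrapper and the loop over representatives are added here):

use regex_automata::nfa::thompson::NFA;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let nfa = NFA::new("[a-z]+")?;
    let classes = nfa.byte_classes();
    // 'a' and 'z' never discriminate a match for this regex, so they share a class...
    assert_eq!(classes.get(b'a'), classes.get(b'z'));
    // ...while 'a' and 'A' do not.
    assert_ne!(classes.get(b'a'), classes.get(b'A'));
    // One representative byte per class (plus the EOI sentinel) is enough to
    // visit every distinct transition of the automaton.
    for unit in classes.representatives(..) {
        println!("{:?}", unit);
    }
    Ok(())
}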
diff --git a/vendor/regex-automata/src/util/alphabet.rs b/vendor/regex-automata/src/util/alphabet.rs
new file mode 100644
index 0000000..22b5a76
--- /dev/null
+++ b/vendor/regex-automata/src/util/alphabet.rs
@@ -0,0 +1,1139 @@
+/*!
+This module provides APIs for dealing with the alphabets of finite state
+machines.
+
+There are two principal types in this module, [`ByteClasses`] and [`Unit`].
+The former defines the alphabet of a finite state machine while the latter
+represents an element of that alphabet.
+
+To a first approximation, the alphabet of all automata in this crate is just
+a `u8`. Namely, every distinct byte value. All 256 of them. In practice, this
+can be quite wasteful when building a transition table for a DFA, since it
+requires storing a state identifier for each element in the alphabet. Instead,
+we collapse the alphabet of an automaton down into equivalence classes, where
+every byte in the same equivalence class never discriminates between a match or
+a non-match from any other byte in the same class. For example, in the regex
+`[a-z]+`, then you could consider it having an alphabet consisting of two
+equivalence classes: `a-z` and everything else. In terms of the transitions on
+an automaton, it doesn't actually require representing every distinct byte.
+Just the equivalence classes.
+
+The downside of equivalence classes is that, of course, searching a haystack
+deals with individual byte values. Those byte values need to be mapped to
+their corresponding equivalence class. This is what `ByteClasses` does. In
+practice, doing this for every state transition has negligible impact on modern
+CPUs. Moreover, it helps make more efficient use of the CPU cache by (possibly
+considerably) shrinking the size of the transition table.
+
+One last hiccup concerns `Unit`. Namely, because of look-around and how the
+DFAs in this crate work, we need to add a sentinel value to our alphabet
+of equivalence classes that represents the "end" of a search. We call that
+sentinel [`Unit::eoi`] or "end of input." Thus, a `Unit` is either an
+equivalence class corresponding to a set of bytes, or it is a special "end of
+input" sentinel.
+
+In general, you should not expect to need either of these types unless you're
+doing lower level shenanigans with DFAs, or even building your own DFAs.
+(Although, you don't have to use these types to build your own DFAs of course.)
+For example, if you're walking a DFA's state graph, it's probably useful to
+make use of [`ByteClasses`] to visit each element in the DFA's alphabet instead
+of just visiting every distinct `u8` value. The latter isn't necessarily wrong,
+but it could be potentially very wasteful.
+*/
+use crate::util::{
+    escape::DebugByte,
+    wire::{self, DeserializeError, SerializeError},
+};
+
+/// Unit represents a single unit of haystack for DFA based regex engines.
+///
+/// It is not expected for consumers of this crate to need to use this type
+/// unless they are implementing their own DFA. And even then, it's not
+/// required: implementors may use other techniques to handle haystack units.
+///
+/// Typically, a single unit of haystack for a DFA would be a single byte.
+/// However, for the DFAs in this crate, matches are delayed by a single byte
+/// in order to handle look-ahead assertions (`\b`, `$` and `\z`). Thus, once
+/// we have consumed the haystack, we must run the DFA through one additional
+/// transition using a unit that indicates the haystack has ended.
+/// +/// There is no way to represent a sentinel with a `u8` since all possible +/// values *may* be valid haystack units to a DFA, therefore this type +/// explicitly adds room for a sentinel value. +/// +/// The sentinel EOI value is always its own equivalence class and is +/// ultimately represented by adding 1 to the maximum equivalence class value. +/// So for example, the regex `^[a-z]+$` might be split into the following +/// equivalence classes: +/// +/// ```text +/// 0 => [\x00-`] +/// 1 => [a-z] +/// 2 => [{-\xFF] +/// 3 => [EOI] +/// ``` +/// +/// Where EOI is the special sentinel value that is always in its own +/// singleton equivalence class. +#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] +pub struct Unit(UnitKind); + +#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] +enum UnitKind { + /// Represents a byte value, or more typically, an equivalence class + /// represented as a byte value. + U8(u8), + /// Represents the "end of input" sentinel. We regretably use a `u16` + /// here since the maximum sentinel value is `256`. Thankfully, we don't + /// actually store a `Unit` anywhere, so this extra space shouldn't be too + /// bad. + EOI(u16), +} + +impl Unit { + /// Create a new haystack unit from a byte value. + /// + /// All possible byte values are legal. However, when creating a haystack + /// unit for a specific DFA, one should be careful to only construct units + /// that are in that DFA's alphabet. Namely, one way to compact a DFA's + /// in-memory representation is to collapse its transitions to a set of + /// equivalence classes into a set of all possible byte values. If a DFA + /// uses equivalence classes instead of byte values, then the byte given + /// here should be the equivalence class. + pub fn u8(byte: u8) -> Unit { + Unit(UnitKind::U8(byte)) + } + + /// Create a new "end of input" haystack unit. + /// + /// The value given is the sentinel value used by this unit to represent + /// the "end of input." The value should be the total number of equivalence + /// classes in the corresponding alphabet. Its maximum value is `256`, + /// which occurs when every byte is its own equivalence class. + /// + /// # Panics + /// + /// This panics when `num_byte_equiv_classes` is greater than `256`. + pub fn eoi(num_byte_equiv_classes: usize) -> Unit { + assert!( + num_byte_equiv_classes <= 256, + "max number of byte-based equivalent classes is 256, but got {}", + num_byte_equiv_classes, + ); + Unit(UnitKind::EOI(u16::try_from(num_byte_equiv_classes).unwrap())) + } + + /// If this unit is not an "end of input" sentinel, then returns its + /// underlying byte value. Otherwise return `None`. + pub fn as_u8(self) -> Option<u8> { + match self.0 { + UnitKind::U8(b) => Some(b), + UnitKind::EOI(_) => None, + } + } + + /// If this unit is an "end of input" sentinel, then return the underlying + /// sentinel value that was given to [`Unit::eoi`]. Otherwise return + /// `None`. + pub fn as_eoi(self) -> Option<u16> { + match self.0 { + UnitKind::U8(_) => None, + UnitKind::EOI(sentinel) => Some(sentinel), + } + } + + /// Return this unit as a `usize`, regardless of whether it is a byte value + /// or an "end of input" sentinel. In the latter case, the underlying + /// sentinel value given to [`Unit::eoi`] is returned. + pub fn as_usize(self) -> usize { + match self.0 { + UnitKind::U8(b) => usize::from(b), + UnitKind::EOI(eoi) => usize::from(eoi), + } + } + + /// Returns true if and only of this unit is a byte value equivalent to the + /// byte given. 
This always returns false when this is an "end of input" + /// sentinel. + pub fn is_byte(self, byte: u8) -> bool { + self.as_u8().map_or(false, |b| b == byte) + } + + /// Returns true when this unit represents an "end of input" sentinel. + pub fn is_eoi(self) -> bool { + self.as_eoi().is_some() + } + + /// Returns true when this unit corresponds to an ASCII word byte. + /// + /// This always returns false when this unit represents an "end of input" + /// sentinel. + pub fn is_word_byte(self) -> bool { + self.as_u8().map_or(false, crate::util::utf8::is_word_byte) + } +} + +impl core::fmt::Debug for Unit { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + match self.0 { + UnitKind::U8(b) => write!(f, "{:?}", DebugByte(b)), + UnitKind::EOI(_) => write!(f, "EOI"), + } + } +} + +/// A representation of byte oriented equivalence classes. +/// +/// This is used in a DFA to reduce the size of the transition table. This can +/// have a particularly large impact not only on the total size of a dense DFA, +/// but also on compile times. +/// +/// The essential idea here is that the alphabet of a DFA is shrunk from the +/// usual 256 distinct byte values down to a set of equivalence classes. The +/// guarantee you get is that any byte belonging to the same equivalence class +/// can be treated as if it were any other byte in the same class, and the +/// result of a search wouldn't change. +/// +/// # Example +/// +/// This example shows how to get byte classes from an +/// [`NFA`](crate::nfa::thompson::NFA) and ask for the class of various bytes. +/// +/// ``` +/// use regex_automata::nfa::thompson::NFA; +/// +/// let nfa = NFA::new("[a-z]+")?; +/// let classes = nfa.byte_classes(); +/// // 'a' and 'z' are in the same class for this regex. +/// assert_eq!(classes.get(b'a'), classes.get(b'z')); +/// // But 'a' and 'A' are not. +/// assert_ne!(classes.get(b'a'), classes.get(b'A')); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Copy)] +pub struct ByteClasses([u8; 256]); + +impl ByteClasses { + /// Creates a new set of equivalence classes where all bytes are mapped to + /// the same class. + #[inline] + pub fn empty() -> ByteClasses { + ByteClasses([0; 256]) + } + + /// Creates a new set of equivalence classes where each byte belongs to + /// its own equivalence class. + #[inline] + pub fn singletons() -> ByteClasses { + let mut classes = ByteClasses::empty(); + for b in 0..=255 { + classes.set(b, b); + } + classes + } + + /// Deserializes a byte class map from the given slice. If the slice is of + /// insufficient length or otherwise contains an impossible mapping, then + /// an error is returned. Upon success, the number of bytes read along with + /// the map are returned. The number of bytes read is always a multiple of + /// 8. + pub(crate) fn from_bytes( + slice: &[u8], + ) -> Result<(ByteClasses, usize), DeserializeError> { + wire::check_slice_len(slice, 256, "byte class map")?; + let mut classes = ByteClasses::empty(); + for (b, &class) in slice[..256].iter().enumerate() { + classes.set(u8::try_from(b).unwrap(), class); + } + // We specifically don't use 'classes.iter()' here because that + // iterator depends on 'classes.alphabet_len()' being correct. But that + // is precisely the thing we're trying to verify below! 
+ for &b in classes.0.iter() { + if usize::from(b) >= classes.alphabet_len() { + return Err(DeserializeError::generic( + "found equivalence class greater than alphabet len", + )); + } + } + Ok((classes, 256)) + } + + /// Writes this byte class map to the given byte buffer. if the given + /// buffer is too small, then an error is returned. Upon success, the total + /// number of bytes written is returned. The number of bytes written is + /// guaranteed to be a multiple of 8. + pub(crate) fn write_to( + &self, + mut dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("byte class map")); + } + for b in 0..=255 { + dst[0] = self.get(b); + dst = &mut dst[1..]; + } + Ok(nwrite) + } + + /// Returns the total number of bytes written by `write_to`. + pub(crate) fn write_to_len(&self) -> usize { + 256 + } + + /// Set the equivalence class for the given byte. + #[inline] + pub fn set(&mut self, byte: u8, class: u8) { + self.0[usize::from(byte)] = class; + } + + /// Get the equivalence class for the given byte. + #[inline] + pub fn get(&self, byte: u8) -> u8 { + self.0[usize::from(byte)] + } + + /// Get the equivalence class for the given haystack unit and return the + /// class as a `usize`. + #[inline] + pub fn get_by_unit(&self, unit: Unit) -> usize { + match unit.0 { + UnitKind::U8(b) => usize::from(self.get(b)), + UnitKind::EOI(b) => usize::from(b), + } + } + + /// Create a unit that represents the "end of input" sentinel based on the + /// number of equivalence classes. + #[inline] + pub fn eoi(&self) -> Unit { + // The alphabet length already includes the EOI sentinel, hence why + // we subtract 1. + Unit::eoi(self.alphabet_len().checked_sub(1).unwrap()) + } + + /// Return the total number of elements in the alphabet represented by + /// these equivalence classes. Equivalently, this returns the total number + /// of equivalence classes. + #[inline] + pub fn alphabet_len(&self) -> usize { + // Add one since the number of equivalence classes is one bigger than + // the last one. But add another to account for the final EOI class + // that isn't explicitly represented. + usize::from(self.0[255]) + 1 + 1 + } + + /// Returns the stride, as a base-2 exponent, required for these + /// equivalence classes. + /// + /// The stride is always the smallest power of 2 that is greater than or + /// equal to the alphabet length, and the `stride2` returned here is the + /// exponent applied to `2` to get the smallest power. This is done so that + /// converting between premultiplied state IDs and indices can be done with + /// shifts alone, which is much faster than integer division. + #[inline] + pub fn stride2(&self) -> usize { + let zeros = self.alphabet_len().next_power_of_two().trailing_zeros(); + usize::try_from(zeros).unwrap() + } + + /// Returns true if and only if every byte in this class maps to its own + /// equivalence class. Equivalently, there are 257 equivalence classes + /// and each class contains either exactly one byte or corresponds to the + /// singleton class containing the "end of input" sentinel. + #[inline] + pub fn is_singleton(&self) -> bool { + self.alphabet_len() == 257 + } + + /// Returns an iterator over all equivalence classes in this set. + #[inline] + pub fn iter(&self) -> ByteClassIter<'_> { + ByteClassIter { classes: self, i: 0 } + } + + /// Returns an iterator over a sequence of representative bytes from each + /// equivalence class within the range of bytes given. 
+ /// + /// When the given range is unbounded on both sides, the iterator yields + /// exactly N items, where N is equivalent to the number of equivalence + /// classes. Each item is an arbitrary byte drawn from each equivalence + /// class. + /// + /// This is useful when one is determinizing an NFA and the NFA's alphabet + /// hasn't been converted to equivalence classes. Picking an arbitrary byte + /// from each equivalence class then permits a full exploration of the NFA + /// instead of using every possible byte value and thus potentially saves + /// quite a lot of redundant work. + /// + /// # Example + /// + /// This shows an example of what a complete sequence of representatives + /// might look like from a real example. + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit}; + /// + /// let nfa = NFA::new("[a-z]+")?; + /// let classes = nfa.byte_classes(); + /// let reps: Vec<Unit> = classes.representatives(..).collect(); + /// // Note that the specific byte values yielded are not guaranteed! + /// let expected = vec![ + /// Unit::u8(b'\x00'), + /// Unit::u8(b'a'), + /// Unit::u8(b'{'), + /// Unit::eoi(3), + /// ]; + /// assert_eq!(expected, reps); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Note though, that you can ask for an arbitrary range of bytes, and only + /// representatives for that range will be returned: + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit}; + /// + /// let nfa = NFA::new("[a-z]+")?; + /// let classes = nfa.byte_classes(); + /// let reps: Vec<Unit> = classes.representatives(b'A'..=b'z').collect(); + /// // Note that the specific byte values yielded are not guaranteed! + /// let expected = vec![ + /// Unit::u8(b'A'), + /// Unit::u8(b'a'), + /// ]; + /// assert_eq!(expected, reps); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn representatives<R: core::ops::RangeBounds<u8>>( + &self, + range: R, + ) -> ByteClassRepresentatives<'_> { + use core::ops::Bound; + + let cur_byte = match range.start_bound() { + Bound::Included(&i) => usize::from(i), + Bound::Excluded(&i) => usize::from(i).checked_add(1).unwrap(), + Bound::Unbounded => 0, + }; + let end_byte = match range.end_bound() { + Bound::Included(&i) => { + Some(usize::from(i).checked_add(1).unwrap()) + } + Bound::Excluded(&i) => Some(usize::from(i)), + Bound::Unbounded => None, + }; + assert_ne!( + cur_byte, + usize::MAX, + "start range must be less than usize::MAX", + ); + ByteClassRepresentatives { + classes: self, + cur_byte, + end_byte, + last_class: None, + } + } + + /// Returns an iterator of the bytes in the given equivalence class. + /// + /// This is useful when one needs to know the actual bytes that belong to + /// an equivalence class. For example, conceptually speaking, accelerating + /// a DFA state occurs when a state only has a few outgoing transitions. + /// But in reality, what is required is that there are only a small + /// number of distinct bytes that can lead to an outgoing transition. The + /// difference is that any one transition can correspond to an equivalence + /// class which may contains many bytes. Therefore, DFA state acceleration + /// considers the actual elements in each equivalence class of each + /// outgoing transition. + /// + /// # Example + /// + /// This shows an example of how to get all of the elements in an + /// equivalence class. 
+ /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit}; + /// + /// let nfa = NFA::new("[a-z]+")?; + /// let classes = nfa.byte_classes(); + /// let elements: Vec<Unit> = classes.elements(Unit::u8(1)).collect(); + /// let expected: Vec<Unit> = (b'a'..=b'z').map(Unit::u8).collect(); + /// assert_eq!(expected, elements); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn elements(&self, class: Unit) -> ByteClassElements { + ByteClassElements { classes: self, class, byte: 0 } + } + + /// Returns an iterator of byte ranges in the given equivalence class. + /// + /// That is, a sequence of contiguous ranges are returned. Typically, every + /// class maps to a single contiguous range. + fn element_ranges(&self, class: Unit) -> ByteClassElementRanges { + ByteClassElementRanges { elements: self.elements(class), range: None } + } +} + +impl Default for ByteClasses { + fn default() -> ByteClasses { + ByteClasses::singletons() + } +} + +impl core::fmt::Debug for ByteClasses { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + if self.is_singleton() { + write!(f, "ByteClasses({{singletons}})") + } else { + write!(f, "ByteClasses(")?; + for (i, class) in self.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{:?} => [", class.as_usize())?; + for (start, end) in self.element_ranges(class) { + if start == end { + write!(f, "{:?}", start)?; + } else { + write!(f, "{:?}-{:?}", start, end)?; + } + } + write!(f, "]")?; + } + write!(f, ")") + } + } +} + +/// An iterator over each equivalence class. +/// +/// The last element in this iterator always corresponds to [`Unit::eoi`]. +/// +/// This is created by the [`ByteClasses::iter`] method. +/// +/// The lifetime `'a` refers to the lifetime of the byte classes that this +/// iterator was created from. +#[derive(Debug)] +pub struct ByteClassIter<'a> { + classes: &'a ByteClasses, + i: usize, +} + +impl<'a> Iterator for ByteClassIter<'a> { + type Item = Unit; + + fn next(&mut self) -> Option<Unit> { + if self.i + 1 == self.classes.alphabet_len() { + self.i += 1; + Some(self.classes.eoi()) + } else if self.i < self.classes.alphabet_len() { + let class = u8::try_from(self.i).unwrap(); + self.i += 1; + Some(Unit::u8(class)) + } else { + None + } + } +} + +/// An iterator over representative bytes from each equivalence class. +/// +/// This is created by the [`ByteClasses::representatives`] method. +/// +/// The lifetime `'a` refers to the lifetime of the byte classes that this +/// iterator was created from. +#[derive(Debug)] +pub struct ByteClassRepresentatives<'a> { + classes: &'a ByteClasses, + cur_byte: usize, + end_byte: Option<usize>, + last_class: Option<u8>, +} + +impl<'a> Iterator for ByteClassRepresentatives<'a> { + type Item = Unit; + + fn next(&mut self) -> Option<Unit> { + while self.cur_byte < self.end_byte.unwrap_or(256) { + let byte = u8::try_from(self.cur_byte).unwrap(); + let class = self.classes.get(byte); + self.cur_byte += 1; + + if self.last_class != Some(class) { + self.last_class = Some(class); + return Some(Unit::u8(byte)); + } + } + if self.cur_byte != usize::MAX && self.end_byte.is_none() { + // Using usize::MAX as a sentinel is OK because we ban usize::MAX + // from appearing as a start bound in iterator construction. But + // why do it this way? 
Well, we want to return the EOI class + // whenever the end of the given range is unbounded because EOI + // isn't really a "byte" per se, so the only way it should be + // excluded is if there is a bounded end to the range. Therefore, + // when the end is unbounded, we just need to know whether we've + // reported EOI or not. When we do, we set cur_byte to a value it + // can never otherwise be. + self.cur_byte = usize::MAX; + return Some(self.classes.eoi()); + } + None + } +} + +/// An iterator over all elements in an equivalence class. +/// +/// This is created by the [`ByteClasses::elements`] method. +/// +/// The lifetime `'a` refers to the lifetime of the byte classes that this +/// iterator was created from. +#[derive(Debug)] +pub struct ByteClassElements<'a> { + classes: &'a ByteClasses, + class: Unit, + byte: usize, +} + +impl<'a> Iterator for ByteClassElements<'a> { + type Item = Unit; + + fn next(&mut self) -> Option<Unit> { + while self.byte < 256 { + let byte = u8::try_from(self.byte).unwrap(); + self.byte += 1; + if self.class.is_byte(self.classes.get(byte)) { + return Some(Unit::u8(byte)); + } + } + if self.byte < 257 { + self.byte += 1; + if self.class.is_eoi() { + return Some(Unit::eoi(256)); + } + } + None + } +} + +/// An iterator over all elements in an equivalence class expressed as a +/// sequence of contiguous ranges. +#[derive(Debug)] +struct ByteClassElementRanges<'a> { + elements: ByteClassElements<'a>, + range: Option<(Unit, Unit)>, +} + +impl<'a> Iterator for ByteClassElementRanges<'a> { + type Item = (Unit, Unit); + + fn next(&mut self) -> Option<(Unit, Unit)> { + loop { + let element = match self.elements.next() { + None => return self.range.take(), + Some(element) => element, + }; + match self.range.take() { + None => { + self.range = Some((element, element)); + } + Some((start, end)) => { + if end.as_usize() + 1 != element.as_usize() + || element.is_eoi() + { + self.range = Some((element, element)); + return Some((start, end)); + } + self.range = Some((start, element)); + } + } + } + } +} + +/// A partitioning of bytes into equivalence classes. +/// +/// A byte class set keeps track of an *approximation* of equivalence classes +/// of bytes during NFA construction. That is, every byte in an equivalence +/// class cannot discriminate between a match and a non-match. +/// +/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the +/// same equivalence class because it never matters whether an `a` or a `b` is +/// seen, and no combination of `a`s and `b`s in the text can discriminate a +/// match. +/// +/// Note though that this does not compute the minimal set of equivalence +/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the +/// same equivalence class for the same reason that `a` and `b` are in the +/// same equivalence class in the aforementioned regex. However, in this +/// implementation, `a` and `c` are put into distinct equivalence classes. The +/// reason for this is implementation complexity. In the future, we should +/// endeavor to compute the minimal equivalence classes since they can have a +/// rather large impact on the size of the DFA. (Doing this will likely require +/// rethinking how equivalence classes are computed, including changing the +/// representation here, which is only able to group contiguous bytes into the +/// same equivalence class.) 
+#[cfg(feature = "alloc")] +#[derive(Clone, Debug)] +pub(crate) struct ByteClassSet(ByteSet); + +#[cfg(feature = "alloc")] +impl Default for ByteClassSet { + fn default() -> ByteClassSet { + ByteClassSet::empty() + } +} + +#[cfg(feature = "alloc")] +impl ByteClassSet { + /// Create a new set of byte classes where all bytes are part of the same + /// equivalence class. + pub(crate) fn empty() -> Self { + ByteClassSet(ByteSet::empty()) + } + + /// Indicate the the range of byte given (inclusive) can discriminate a + /// match between it and all other bytes outside of the range. + pub(crate) fn set_range(&mut self, start: u8, end: u8) { + debug_assert!(start <= end); + if start > 0 { + self.0.add(start - 1); + } + self.0.add(end); + } + + /// Add the contiguous ranges in the set given to this byte class set. + pub(crate) fn add_set(&mut self, set: &ByteSet) { + for (start, end) in set.iter_ranges() { + self.set_range(start, end); + } + } + + /// Convert this boolean set to a map that maps all byte values to their + /// corresponding equivalence class. The last mapping indicates the largest + /// equivalence class identifier (which is never bigger than 255). + pub(crate) fn byte_classes(&self) -> ByteClasses { + let mut classes = ByteClasses::empty(); + let mut class = 0u8; + let mut b = 0u8; + loop { + classes.set(b, class); + if b == 255 { + break; + } + if self.0.contains(b) { + class = class.checked_add(1).unwrap(); + } + b = b.checked_add(1).unwrap(); + } + classes + } +} + +/// A simple set of bytes that is reasonably cheap to copy and allocation free. +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub(crate) struct ByteSet { + bits: BitSet, +} + +/// The representation of a byte set. Split out so that we can define a +/// convenient Debug impl for it while keeping "ByteSet" in the output. +#[derive(Clone, Copy, Default, Eq, PartialEq)] +struct BitSet([u128; 2]); + +impl ByteSet { + /// Create an empty set of bytes. + pub(crate) fn empty() -> ByteSet { + ByteSet { bits: BitSet([0; 2]) } + } + + /// Add a byte to this set. + /// + /// If the given byte already belongs to this set, then this is a no-op. + pub(crate) fn add(&mut self, byte: u8) { + let bucket = byte / 128; + let bit = byte % 128; + self.bits.0[usize::from(bucket)] |= 1 << bit; + } + + /// Remove a byte from this set. + /// + /// If the given byte is not in this set, then this is a no-op. + pub(crate) fn remove(&mut self, byte: u8) { + let bucket = byte / 128; + let bit = byte % 128; + self.bits.0[usize::from(bucket)] &= !(1 << bit); + } + + /// Return true if and only if the given byte is in this set. + pub(crate) fn contains(&self, byte: u8) -> bool { + let bucket = byte / 128; + let bit = byte % 128; + self.bits.0[usize::from(bucket)] & (1 << bit) > 0 + } + + /// Return true if and only if the given inclusive range of bytes is in + /// this set. + pub(crate) fn contains_range(&self, start: u8, end: u8) -> bool { + (start..=end).all(|b| self.contains(b)) + } + + /// Returns an iterator over all bytes in this set. + pub(crate) fn iter(&self) -> ByteSetIter { + ByteSetIter { set: self, b: 0 } + } + + /// Returns an iterator over all contiguous ranges of bytes in this set. + pub(crate) fn iter_ranges(&self) -> ByteSetRangeIter { + ByteSetRangeIter { set: self, b: 0 } + } + + /// Return true if and only if this set is empty. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_empty(&self) -> bool { + self.bits.0 == [0, 0] + } + + /// Deserializes a byte set from the given slice. 
If the slice is of + /// incorrect length or is otherwise malformed, then an error is returned. + /// Upon success, the number of bytes read along with the set are returned. + /// The number of bytes read is always a multiple of 8. + pub(crate) fn from_bytes( + slice: &[u8], + ) -> Result<(ByteSet, usize), DeserializeError> { + use core::mem::size_of; + + wire::check_slice_len(slice, 2 * size_of::<u128>(), "byte set")?; + let mut nread = 0; + let (low, nr) = wire::try_read_u128(slice, "byte set low bucket")?; + nread += nr; + let (high, nr) = wire::try_read_u128(slice, "byte set high bucket")?; + nread += nr; + Ok((ByteSet { bits: BitSet([low, high]) }, nread)) + } + + /// Writes this byte set to the given byte buffer. If the given buffer is + /// too small, then an error is returned. Upon success, the total number of + /// bytes written is returned. The number of bytes written is guaranteed to + /// be a multiple of 8. + pub(crate) fn write_to<E: crate::util::wire::Endian>( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + use core::mem::size_of; + + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("byte set")); + } + let mut nw = 0; + E::write_u128(self.bits.0[0], &mut dst[nw..]); + nw += size_of::<u128>(); + E::write_u128(self.bits.0[1], &mut dst[nw..]); + nw += size_of::<u128>(); + assert_eq!(nwrite, nw, "expected to write certain number of bytes",); + assert_eq!( + nw % 8, + 0, + "expected to write multiple of 8 bytes for byte set", + ); + Ok(nw) + } + + /// Returns the total number of bytes written by `write_to`. + pub(crate) fn write_to_len(&self) -> usize { + 2 * core::mem::size_of::<u128>() + } +} + +impl core::fmt::Debug for BitSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut fmtd = f.debug_set(); + for b in 0u8..=255 { + if (ByteSet { bits: *self }).contains(b) { + fmtd.entry(&b); + } + } + fmtd.finish() + } +} + +#[derive(Debug)] +pub(crate) struct ByteSetIter<'a> { + set: &'a ByteSet, + b: usize, +} + +impl<'a> Iterator for ByteSetIter<'a> { + type Item = u8; + + fn next(&mut self) -> Option<u8> { + while self.b <= 255 { + let b = u8::try_from(self.b).unwrap(); + self.b += 1; + if self.set.contains(b) { + return Some(b); + } + } + None + } +} + +#[derive(Debug)] +pub(crate) struct ByteSetRangeIter<'a> { + set: &'a ByteSet, + b: usize, +} + +impl<'a> Iterator for ByteSetRangeIter<'a> { + type Item = (u8, u8); + + fn next(&mut self) -> Option<(u8, u8)> { + let asu8 = |n: usize| u8::try_from(n).unwrap(); + while self.b <= 255 { + let start = asu8(self.b); + self.b += 1; + if !self.set.contains(start) { + continue; + } + + let mut end = start; + while self.b <= 255 && self.set.contains(asu8(self.b)) { + end = asu8(self.b); + self.b += 1; + } + return Some((start, end)); + } + None + } +} + +#[cfg(all(test, feature = "alloc"))] +mod tests { + use alloc::{vec, vec::Vec}; + + use super::*; + + #[test] + fn byte_classes() { + let mut set = ByteClassSet::empty(); + set.set_range(b'a', b'z'); + + let classes = set.byte_classes(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(b'a' - 1), 0); + assert_eq!(classes.get(b'a'), 1); + assert_eq!(classes.get(b'm'), 1); + assert_eq!(classes.get(b'z'), 1); + assert_eq!(classes.get(b'z' + 1), 2); + assert_eq!(classes.get(254), 2); + assert_eq!(classes.get(255), 2); + + let mut set = ByteClassSet::empty(); + set.set_range(0, 2); + set.set_range(4, 6); + let 
classes = set.byte_classes(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(3), 1); + assert_eq!(classes.get(4), 2); + assert_eq!(classes.get(5), 2); + assert_eq!(classes.get(6), 2); + assert_eq!(classes.get(7), 3); + assert_eq!(classes.get(255), 3); + } + + #[test] + fn full_byte_classes() { + let mut set = ByteClassSet::empty(); + for b in 0u8..=255 { + set.set_range(b, b); + } + assert_eq!(set.byte_classes().alphabet_len(), 257); + } + + #[test] + fn elements_typical() { + let mut set = ByteClassSet::empty(); + set.set_range(b'b', b'd'); + set.set_range(b'g', b'm'); + set.set_range(b'z', b'z'); + let classes = set.byte_classes(); + // class 0: \x00-a + // class 1: b-d + // class 2: e-f + // class 3: g-m + // class 4: n-y + // class 5: z-z + // class 6: \x7B-\xFF + // class 7: EOI + assert_eq!(classes.alphabet_len(), 8); + + let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>(); + assert_eq!(elements.len(), 98); + assert_eq!(elements[0], Unit::u8(b'\x00')); + assert_eq!(elements[97], Unit::u8(b'a')); + + let elements = classes.elements(Unit::u8(1)).collect::<Vec<_>>(); + assert_eq!( + elements, + vec![Unit::u8(b'b'), Unit::u8(b'c'), Unit::u8(b'd')], + ); + + let elements = classes.elements(Unit::u8(2)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::u8(b'e'), Unit::u8(b'f')],); + + let elements = classes.elements(Unit::u8(3)).collect::<Vec<_>>(); + assert_eq!( + elements, + vec![ + Unit::u8(b'g'), + Unit::u8(b'h'), + Unit::u8(b'i'), + Unit::u8(b'j'), + Unit::u8(b'k'), + Unit::u8(b'l'), + Unit::u8(b'm'), + ], + ); + + let elements = classes.elements(Unit::u8(4)).collect::<Vec<_>>(); + assert_eq!(elements.len(), 12); + assert_eq!(elements[0], Unit::u8(b'n')); + assert_eq!(elements[11], Unit::u8(b'y')); + + let elements = classes.elements(Unit::u8(5)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::u8(b'z')]); + + let elements = classes.elements(Unit::u8(6)).collect::<Vec<_>>(); + assert_eq!(elements.len(), 133); + assert_eq!(elements[0], Unit::u8(b'\x7B')); + assert_eq!(elements[132], Unit::u8(b'\xFF')); + + let elements = classes.elements(Unit::eoi(7)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::eoi(256)]); + } + + #[test] + fn elements_singletons() { + let classes = ByteClasses::singletons(); + assert_eq!(classes.alphabet_len(), 257); + + let elements = classes.elements(Unit::u8(b'a')).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::u8(b'a')]); + + let elements = classes.elements(Unit::eoi(5)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::eoi(256)]); + } + + #[test] + fn elements_empty() { + let classes = ByteClasses::empty(); + assert_eq!(classes.alphabet_len(), 2); + + let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>(); + assert_eq!(elements.len(), 256); + assert_eq!(elements[0], Unit::u8(b'\x00')); + assert_eq!(elements[255], Unit::u8(b'\xFF')); + + let elements = classes.elements(Unit::eoi(1)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::eoi(256)]); + } + + #[test] + fn representatives() { + let mut set = ByteClassSet::empty(); + set.set_range(b'b', b'd'); + set.set_range(b'g', b'm'); + set.set_range(b'z', b'z'); + let classes = set.byte_classes(); + + let got: Vec<Unit> = classes.representatives(..).collect(); + let expected = vec![ + Unit::u8(b'\x00'), + Unit::u8(b'b'), + Unit::u8(b'e'), + Unit::u8(b'g'), + Unit::u8(b'n'), + Unit::u8(b'z'), + Unit::u8(b'\x7B'), + Unit::eoi(7), + ]; + assert_eq!(expected, got); + + let got: 
Vec<Unit> = classes.representatives(..0).collect(); + assert!(got.is_empty()); + let got: Vec<Unit> = classes.representatives(1..1).collect(); + assert!(got.is_empty()); + let got: Vec<Unit> = classes.representatives(255..255).collect(); + assert!(got.is_empty()); + + // A weird case that is the only guaranteed to way to get an iterator + // of just the EOI class by excluding all possible byte values. + let got: Vec<Unit> = classes + .representatives(( + core::ops::Bound::Excluded(255), + core::ops::Bound::Unbounded, + )) + .collect(); + let expected = vec![Unit::eoi(7)]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(..=255).collect(); + let expected = vec![ + Unit::u8(b'\x00'), + Unit::u8(b'b'), + Unit::u8(b'e'), + Unit::u8(b'g'), + Unit::u8(b'n'), + Unit::u8(b'z'), + Unit::u8(b'\x7B'), + ]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'b'..=b'd').collect(); + let expected = vec![Unit::u8(b'b')]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'a'..=b'd').collect(); + let expected = vec![Unit::u8(b'a'), Unit::u8(b'b')]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'b'..=b'e').collect(); + let expected = vec![Unit::u8(b'b'), Unit::u8(b'e')]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'A'..=b'Z').collect(); + let expected = vec![Unit::u8(b'A')]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'A'..=b'z').collect(); + let expected = vec![ + Unit::u8(b'A'), + Unit::u8(b'b'), + Unit::u8(b'e'), + Unit::u8(b'g'), + Unit::u8(b'n'), + Unit::u8(b'z'), + ]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'z'..).collect(); + let expected = vec![Unit::u8(b'z'), Unit::u8(b'\x7B'), Unit::eoi(7)]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'z'..=0xFF).collect(); + let expected = vec![Unit::u8(b'z'), Unit::u8(b'\x7B')]; + assert_eq!(expected, got); + } +} diff --git a/vendor/regex-automata/src/util/captures.rs b/vendor/regex-automata/src/util/captures.rs new file mode 100644 index 0000000..05db6a9 --- /dev/null +++ b/vendor/regex-automata/src/util/captures.rs @@ -0,0 +1,2548 @@ +/*! +Provides types for dealing with capturing groups. + +Capturing groups refer to sub-patterns of regexes that some regex engines can +report matching offsets for. For example, matching `[a-z]([0-9]+)` against +`a789` would give `a789` as the overall match (for the implicit capturing group +at index `0`) and `789` as the match for the capturing group `([0-9]+)` (an +explicit capturing group at index `1`). + +Not all regex engines can report match offsets for capturing groups. Indeed, +to a first approximation, regex engines that can report capturing group offsets +tend to be quite a bit slower than regex engines that can't. This is because +tracking capturing groups at search time usually requires more "power" that +in turn adds overhead. + +Other regex implementations might call capturing groups "submatches." + +# Overview + +The main types in this module are: + +* [`Captures`] records the capturing group offsets found during a search. It +provides convenience routines for looking up capturing group offsets by either +index or name. +* [`GroupInfo`] records the mapping between capturing groups and "slots," +where the latter are how capturing groups are recorded during a regex search. 
+This also keeps a mapping from capturing group name to index, and capture +group index to name. A `GroupInfo` is used by `Captures` internally to +provide a convenient API. It is unlikely that you'll use a `GroupInfo` +directly, but for example, if you've compiled an Thompson NFA, then you can use +[`thompson::NFA::group_info`](crate::nfa::thompson::NFA::group_info) to get its +underlying `GroupInfo`. +*/ + +use alloc::{string::String, sync::Arc, vec, vec::Vec}; + +use crate::util::{ + interpolate, + primitives::{ + NonMaxUsize, PatternID, PatternIDError, PatternIDIter, SmallIndex, + }, + search::{Match, Span}, +}; + +/// The span offsets of capturing groups after a match has been found. +/// +/// This type represents the output of regex engines that can report the +/// offsets at which capturing groups matches or "submatches" occur. For +/// example, the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM). When a match +/// occurs, it will at minimum contain the [`PatternID`] of the pattern that +/// matched. Depending upon how it was constructed, it may also contain the +/// start/end offsets of the entire match of the pattern and the start/end +/// offsets of each capturing group that participated in the match. +/// +/// Values of this type are always created for a specific [`GroupInfo`]. It is +/// unspecified behavior to use a `Captures` value in a search with any regex +/// engine that has a different `GroupInfo` than the one the `Captures` were +/// created with. +/// +/// # Constructors +/// +/// There are three constructors for this type that control what kind of +/// information is available upon a match: +/// +/// * [`Captures::all`]: Will store overall pattern match offsets in addition +/// to the offsets of capturing groups that participated in the match. +/// * [`Captures::matches`]: Will store only the overall pattern +/// match offsets. The offsets of capturing groups (even ones that participated +/// in the match) are not available. +/// * [`Captures::empty`]: Will only store the pattern ID that matched. No +/// match offsets are available at all. +/// +/// If you aren't sure which to choose, then pick the first one. The first one +/// is what convenience routines like, +/// [`PikeVM::create_captures`](crate::nfa::thompson::pikevm::PikeVM::create_captures), +/// will use automatically. +/// +/// The main difference between these choices is performance. Namely, if you +/// ask for _less_ information, then the execution of regex search may be able +/// to run more quickly. +/// +/// # Notes +/// +/// It is worth pointing out that this type is not coupled to any one specific +/// regex engine. Instead, its coupling is with [`GroupInfo`], which is the +/// thing that is responsible for mapping capturing groups to "slot" offsets. +/// Slot offsets are indices into a single sequence of memory at which matching +/// haystack offsets for the corresponding group are written by regex engines. 
+/// +/// # Example +/// +/// This example shows how to parse a simple date and extract the components of +/// the date via capturing groups: +/// +/// ``` +/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; +/// +/// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?; +/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); +/// +/// re.captures(&mut cache, "2010-03-14", &mut caps); +/// assert!(caps.is_match()); +/// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); +/// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); +/// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: named capturing groups +/// +/// This example is like the one above, but leverages the ability to name +/// capturing groups in order to make the code a bit clearer: +/// +/// ``` +/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; +/// +/// let re = PikeVM::new(r"^(?P<y>[0-9]{4})-(?P<m>[0-9]{2})-(?P<d>[0-9]{2})$")?; +/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); +/// +/// re.captures(&mut cache, "2010-03-14", &mut caps); +/// assert!(caps.is_match()); +/// assert_eq!(Some(Span::from(0..4)), caps.get_group_by_name("y")); +/// assert_eq!(Some(Span::from(5..7)), caps.get_group_by_name("m")); +/// assert_eq!(Some(Span::from(8..10)), caps.get_group_by_name("d")); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone)] +pub struct Captures { + /// The group info that these capture groups are coupled to. This is what + /// gives the "convenience" of the `Captures` API. Namely, it provides the + /// slot mapping and the name|-->index mapping for capture lookups by name. + group_info: GroupInfo, + /// The ID of the pattern that matched. Regex engines must set this to + /// None when no match occurs. + pid: Option<PatternID>, + /// The slot values, i.e., submatch offsets. + /// + /// In theory, the smallest sequence of slots would be something like + /// `max(groups(pattern) for pattern in regex) * 2`, but instead, we use + /// `sum(groups(pattern) for pattern in regex) * 2`. Why? + /// + /// Well, the former could be used in theory, because we don't generally + /// have any overlapping APIs that involve capturing groups. Therefore, + /// there's technically never any need to have slots set for multiple + /// patterns. However, this might change some day, in which case, we would + /// need to have slots available. + /// + /// The other reason is that during the execution of some regex engines, + /// there exists a point in time where multiple slots for different + /// patterns may be written to before knowing which pattern has matched. + /// Therefore, the regex engines themselves, in order to support multiple + /// patterns correctly, must have all slots available. If `Captures` + /// doesn't have all slots available, then regex engines can't write + /// directly into the caller provided `Captures` and must instead write + /// into some other storage and then copy the slots involved in the match + /// at the end of the search. + /// + /// So overall, at least as of the time of writing, it seems like the path + /// of least resistance is to just require allocating all possible slots + /// instead of the conceptual minimum. 
Another way to justify this is that + /// the most common case is a single pattern, in which case, there is no + /// inefficiency here since the 'max' and 'sum' calculations above are + /// equivalent in that case. + /// + /// N.B. The mapping from group index to slot is maintained by `GroupInfo` + /// and is considered an API guarantee. See `GroupInfo` for more details on + /// that mapping. + /// + /// N.B. `Option<NonMaxUsize>` has the same size as a `usize`. + slots: Vec<Option<NonMaxUsize>>, +} + +impl Captures { + /// Create new storage for the offsets of all matching capturing groups. + /// + /// This routine provides the most information for matches---namely, the + /// spans of matching capturing groups---but also requires the regex search + /// routines to do the most work. + /// + /// It is unspecified behavior to use the returned `Captures` value in a + /// search with a `GroupInfo` other than the one that is provided to this + /// constructor. + /// + /// # Example + /// + /// This example shows that all capturing groups---but only ones that + /// participated in a match---are available to query after a match has + /// been found: + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::captures::Captures, + /// Span, Match, + /// }; + /// + /// let re = PikeVM::new( + /// r"^(?:(?P<lower>[a-z]+)|(?P<upper>[A-Z]+))(?P<digits>[0-9]+)$", + /// )?; + /// let mut cache = re.create_cache(); + /// let mut caps = Captures::all(re.get_nfa().group_info().clone()); + /// + /// re.captures(&mut cache, "ABC123", &mut caps); + /// assert!(caps.is_match()); + /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match()); + /// // The 'lower' group didn't match, so it won't have any offsets. + /// assert_eq!(None, caps.get_group_by_name("lower")); + /// assert_eq!(Some(Span::from(0..3)), caps.get_group_by_name("upper")); + /// assert_eq!(Some(Span::from(3..6)), caps.get_group_by_name("digits")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn all(group_info: GroupInfo) -> Captures { + let slots = group_info.slot_len(); + Captures { group_info, pid: None, slots: vec![None; slots] } + } + + /// Create new storage for only the full match spans of a pattern. This + /// does not include any capturing group offsets. + /// + /// It is unspecified behavior to use the returned `Captures` value in a + /// search with a `GroupInfo` other than the one that is provided to this + /// constructor. + /// + /// # Example + /// + /// This example shows that only overall match offsets are reported when + /// this constructor is used. Accessing any capturing groups other than + /// the 0th will always return `None`. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::captures::Captures, + /// Match, + /// }; + /// + /// let re = PikeVM::new( + /// r"^(?:(?P<lower>[a-z]+)|(?P<upper>[A-Z]+))(?P<digits>[0-9]+)$", + /// )?; + /// let mut cache = re.create_cache(); + /// let mut caps = Captures::matches(re.get_nfa().group_info().clone()); + /// + /// re.captures(&mut cache, "ABC123", &mut caps); + /// assert!(caps.is_match()); + /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match()); + /// // We didn't ask for capturing group offsets, so they aren't available. 
+ /// assert_eq!(None, caps.get_group_by_name("lower")); + /// assert_eq!(None, caps.get_group_by_name("upper")); + /// assert_eq!(None, caps.get_group_by_name("digits")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn matches(group_info: GroupInfo) -> Captures { + // This is OK because we know there are at least this many slots, + // and GroupInfo construction guarantees that the number of slots fits + // into a usize. + let slots = group_info.pattern_len().checked_mul(2).unwrap(); + Captures { group_info, pid: None, slots: vec![None; slots] } + } + + /// Create new storage for only tracking which pattern matched. No offsets + /// are stored at all. + /// + /// It is unspecified behavior to use the returned `Captures` value in a + /// search with a `GroupInfo` other than the one that is provided to this + /// constructor. + /// + /// # Example + /// + /// This example shows that only the pattern that matched can be accessed + /// from a `Captures` value created via this constructor. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::captures::Captures, + /// PatternID, + /// }; + /// + /// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?; + /// let mut cache = re.create_cache(); + /// let mut caps = Captures::empty(re.get_nfa().group_info().clone()); + /// + /// re.captures(&mut cache, "aABCz", &mut caps); + /// assert!(caps.is_match()); + /// assert_eq!(Some(PatternID::must(0)), caps.pattern()); + /// // We didn't ask for any offsets, so they aren't available. + /// assert_eq!(None, caps.get_match()); + /// + /// re.captures(&mut cache, &"aABCz"[1..], &mut caps); + /// assert!(caps.is_match()); + /// assert_eq!(Some(PatternID::must(1)), caps.pattern()); + /// // We didn't ask for any offsets, so they aren't available. + /// assert_eq!(None, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn empty(group_info: GroupInfo) -> Captures { + Captures { group_info, pid: None, slots: vec![] } + } + + /// Returns true if and only if this capturing group represents a match. + /// + /// This is a convenience routine for `caps.pattern().is_some()`. + /// + /// # Example + /// + /// When using the PikeVM (for example), the lightest weight way of + /// detecting whether a match exists is to create capturing groups that + /// only track the ID of the pattern that match (if any): + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::captures::Captures, + /// }; + /// + /// let re = PikeVM::new(r"[a-z]+")?; + /// let mut cache = re.create_cache(); + /// let mut caps = Captures::empty(re.get_nfa().group_info().clone()); + /// + /// re.captures(&mut cache, "aABCz", &mut caps); + /// assert!(caps.is_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_match(&self) -> bool { + self.pid.is_some() + } + + /// Returns the identifier of the pattern that matched when this + /// capturing group represents a match. If no match was found, then this + /// always returns `None`. + /// + /// This returns a pattern ID in precisely the cases in which `is_match` + /// returns `true`. Similarly, the pattern ID returned is always the + /// same pattern ID found in the `Match` returned by `get_match`. 
+ /// + /// # Example + /// + /// When using the PikeVM (for example), the lightest weight way of + /// detecting which pattern matched is to create capturing groups that only + /// track the ID of the pattern that match (if any): + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::captures::Captures, + /// PatternID, + /// }; + /// + /// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?; + /// let mut cache = re.create_cache(); + /// let mut caps = Captures::empty(re.get_nfa().group_info().clone()); + /// + /// re.captures(&mut cache, "ABC", &mut caps); + /// assert_eq!(Some(PatternID::must(1)), caps.pattern()); + /// // Recall that offsets are only available when using a non-empty + /// // Captures value. So even though a match occurred, this returns None! + /// assert_eq!(None, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn pattern(&self) -> Option<PatternID> { + self.pid + } + + /// Returns the pattern ID and the span of the match, if one occurred. + /// + /// This always returns `None` when `Captures` was created with + /// [`Captures::empty`], even if a match was found. + /// + /// If this routine returns a non-`None` value, then `is_match` is + /// guaranteed to return `true` and `pattern` is also guaranteed to return + /// a non-`None` value. + /// + /// # Example + /// + /// This example shows how to get the full match from a search: + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "ABC", &mut caps); + /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn get_match(&self) -> Option<Match> { + Some(Match::new(self.pattern()?, self.get_group(0)?)) + } + + /// Returns the span of a capturing group match corresponding to the group + /// index given, only if both the overall pattern matched and the capturing + /// group participated in that match. + /// + /// This returns `None` if `index` is invalid. `index` is valid if and only + /// if it's less than [`Captures::group_len`] for the matching pattern. + /// + /// This always returns `None` when `Captures` was created with + /// [`Captures::empty`], even if a match was found. This also always + /// returns `None` for any `index > 0` when `Captures` was created with + /// [`Captures::matches`]. + /// + /// If this routine returns a non-`None` value, then `is_match` is + /// guaranteed to return `true`, `pattern` is guaranteed to return a + /// non-`None` value and `get_match` is guaranteed to return a non-`None` + /// value. + /// + /// By convention, the 0th capture group will always return the same + /// span as the span returned by `get_match`. This is because the 0th + /// capture group always corresponds to the entirety of the pattern's + /// match. (It is similarly always unnamed because it is implicit.) This + /// isn't necessarily true of all regex engines. For example, one can + /// hand-compile a [`thompson::NFA`](crate::nfa::thompson::NFA) via a + /// [`thompson::Builder`](crate::nfa::thompson::Builder), which isn't + /// technically forced to make the 0th capturing group always correspond to + /// the entire match. 
+ /// + /// # Example + /// + /// This example shows how to get the capturing groups, by index, from a + /// match: + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match}; + /// + /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Bruce Springsteen", &mut caps); + /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match()); + /// assert_eq!(Some(Span::from(0..5)), caps.get_group(1)); + /// assert_eq!(Some(Span::from(6..17)), caps.get_group(2)); + /// // Looking for a non-existent capturing group will return None: + /// assert_eq!(None, caps.get_group(3)); + /// # // literals are too big for 32-bit usize: #1039 + /// # #[cfg(target_pointer_width = "64")] + /// assert_eq!(None, caps.get_group(9944060567225171988)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn get_group(&self, index: usize) -> Option<Span> { + let pid = self.pattern()?; + // There's a little bit of work needed to map captures to slots in the + // fully general case. But in the overwhelming common case of a single + // pattern, we can just do some simple arithmetic. + let (slot_start, slot_end) = if self.group_info().pattern_len() == 1 { + (index.checked_mul(2)?, index.checked_mul(2)?.checked_add(1)?) + } else { + self.group_info().slots(pid, index)? + }; + let start = self.slots.get(slot_start).copied()??; + let end = self.slots.get(slot_end).copied()??; + Some(Span { start: start.get(), end: end.get() }) + } + + /// Returns the span of a capturing group match corresponding to the group + /// name given, only if both the overall pattern matched and the capturing + /// group participated in that match. + /// + /// This returns `None` if `name` does not correspond to a valid capturing + /// group for the pattern that matched. + /// + /// This always returns `None` when `Captures` was created with + /// [`Captures::empty`], even if a match was found. This also always + /// returns `None` for any `index > 0` when `Captures` was created with + /// [`Captures::matches`]. + /// + /// If this routine returns a non-`None` value, then `is_match` is + /// guaranteed to return `true`, `pattern` is guaranteed to return a + /// non-`None` value and `get_match` is guaranteed to return a non-`None` + /// value. 
+ /// + /// # Example + /// + /// This example shows how to get the capturing groups, by name, from a + /// match: + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match}; + /// + /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Bruce Springsteen", &mut caps); + /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match()); + /// assert_eq!(Some(Span::from(0..5)), caps.get_group_by_name("first")); + /// assert_eq!(Some(Span::from(6..17)), caps.get_group_by_name("last")); + /// // Looking for a non-existent capturing group will return None: + /// assert_eq!(None, caps.get_group_by_name("middle")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn get_group_by_name(&self, name: &str) -> Option<Span> { + let index = self.group_info().to_index(self.pattern()?, name)?; + self.get_group(index) + } + + /// Returns an iterator of possible spans for every capturing group in the + /// matching pattern. + /// + /// If this `Captures` value does not correspond to a match, then the + /// iterator returned yields no elements. + /// + /// Note that the iterator returned yields elements of type `Option<Span>`. + /// A span is present if and only if it corresponds to a capturing group + /// that participated in a match. + /// + /// # Example + /// + /// This example shows how to collect all capturing groups: + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; + /// + /// let re = PikeVM::new( + /// // Matches first/last names, with an optional middle name. + /// r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$", + /// )?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Harry James Potter", &mut caps); + /// assert!(caps.is_match()); + /// let groups: Vec<Option<Span>> = caps.iter().collect(); + /// assert_eq!(groups, vec![ + /// Some(Span::from(0..18)), + /// Some(Span::from(0..5)), + /// Some(Span::from(6..11)), + /// Some(Span::from(12..18)), + /// ]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// This example uses the same regex as the previous example, but with a + /// haystack that omits the middle name. This results in a capturing group + /// that is present in the elements yielded by the iterator but without a + /// match: + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; + /// + /// let re = PikeVM::new( + /// // Matches first/last names, with an optional middle name. 
+ /// r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$", + /// )?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Harry Potter", &mut caps); + /// assert!(caps.is_match()); + /// let groups: Vec<Option<Span>> = caps.iter().collect(); + /// assert_eq!(groups, vec![ + /// Some(Span::from(0..12)), + /// Some(Span::from(0..5)), + /// None, + /// Some(Span::from(6..12)), + /// ]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn iter(&self) -> CapturesPatternIter<'_> { + let names = self + .pattern() + .map_or(GroupInfoPatternNames::empty().enumerate(), |pid| { + self.group_info().pattern_names(pid).enumerate() + }); + CapturesPatternIter { caps: self, names } + } + + /// Return the total number of capturing groups for the matching pattern. + /// + /// If this `Captures` value does not correspond to a match, then this + /// always returns `0`. + /// + /// This always returns the same number of elements yielded by + /// [`Captures::iter`]. That is, the number includes capturing groups even + /// if they don't participate in the match. + /// + /// # Example + /// + /// This example shows how to count the total number of capturing groups + /// associated with a pattern. Notice that it includes groups that did not + /// participate in a match (just like `Captures::iter` does). + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new( + /// // Matches first/last names, with an optional middle name. + /// r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$", + /// )?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Harry Potter", &mut caps); + /// assert_eq!(4, caps.group_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn group_len(&self) -> usize { + let pid = match self.pattern() { + None => return 0, + Some(pid) => pid, + }; + self.group_info().group_len(pid) + } + + /// Returns a reference to the underlying group info on which these + /// captures are based. + /// + /// The difference between `GroupInfo` and `Captures` is that the former + /// defines the structure of capturing groups where as the latter is what + /// stores the actual match information. So where as `Captures` only gives + /// you access to the current match, `GroupInfo` lets you query any + /// information about all capturing groups, even ones for patterns that + /// weren't involved in a match. + /// + /// Note that a `GroupInfo` uses reference counting internally, so it may + /// be cloned cheaply. + /// + /// # Example + /// + /// This example shows how to get all capturing group names from the + /// underlying `GroupInfo`. Notice that we don't even need to run a + /// search. 
+ /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID}; + /// + /// let re = PikeVM::new_many(&[ + /// r"(?P<foo>a)", + /// r"(a)(b)", + /// r"ab", + /// r"(?P<bar>a)(?P<quux>a)", + /// r"(?P<foo>z)", + /// ])?; + /// let caps = re.create_captures(); + /// + /// let expected = vec![ + /// (PatternID::must(0), 0, None), + /// (PatternID::must(0), 1, Some("foo")), + /// (PatternID::must(1), 0, None), + /// (PatternID::must(1), 1, None), + /// (PatternID::must(1), 2, None), + /// (PatternID::must(2), 0, None), + /// (PatternID::must(3), 0, None), + /// (PatternID::must(3), 1, Some("bar")), + /// (PatternID::must(3), 2, Some("quux")), + /// (PatternID::must(4), 0, None), + /// (PatternID::must(4), 1, Some("foo")), + /// ]; + /// // We could also just use 're.get_nfa().group_info()'. + /// let got: Vec<(PatternID, usize, Option<&str>)> = + /// caps.group_info().all_names().collect(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn group_info(&self) -> &GroupInfo { + &self.group_info + } + + /// Interpolates the capture references in `replacement` with the + /// corresponding substrings in `haystack` matched by each reference. The + /// interpolated string is returned. + /// + /// See the [`interpolate` module](interpolate) for documentation on the + /// format of the replacement string. + /// + /// # Example + /// + /// This example shows how to use interpolation, and also shows how it + /// can work with multi-pattern regexes. + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID}; + /// + /// let re = PikeVM::new_many(&[ + /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})", + /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})", + /// ])?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let replacement = "year=$year, month=$month, day=$day"; + /// + /// // This matches the first pattern. + /// let hay = "On 14-03-2010, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let result = caps.interpolate_string(hay, replacement); + /// assert_eq!("year=2010, month=03, day=14", result); + /// + /// // And this matches the second pattern. + /// let hay = "On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let result = caps.interpolate_string(hay, replacement); + /// assert_eq!("year=2010, month=03, day=14", result); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn interpolate_string( + &self, + haystack: &str, + replacement: &str, + ) -> String { + let mut dst = String::new(); + self.interpolate_string_into(haystack, replacement, &mut dst); + dst + } + + /// Interpolates the capture references in `replacement` with the + /// corresponding substrings in `haystack` matched by each reference. The + /// interpolated string is written to `dst`. + /// + /// See the [`interpolate` module](interpolate) for documentation on the + /// format of the replacement string. + /// + /// # Example + /// + /// This example shows how to use interpolation, and also shows how it + /// can work with multi-pattern regexes. 
+ /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID}; + /// + /// let re = PikeVM::new_many(&[ + /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})", + /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})", + /// ])?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let replacement = "year=$year, month=$month, day=$day"; + /// + /// // This matches the first pattern. + /// let hay = "On 14-03-2010, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let mut dst = String::new(); + /// caps.interpolate_string_into(hay, replacement, &mut dst); + /// assert_eq!("year=2010, month=03, day=14", dst); + /// + /// // And this matches the second pattern. + /// let hay = "On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let mut dst = String::new(); + /// caps.interpolate_string_into(hay, replacement, &mut dst); + /// assert_eq!("year=2010, month=03, day=14", dst); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn interpolate_string_into( + &self, + haystack: &str, + replacement: &str, + dst: &mut String, + ) { + interpolate::string( + replacement, + |index, dst| { + let span = match self.get_group(index) { + None => return, + Some(span) => span, + }; + dst.push_str(&haystack[span]); + }, + |name| self.group_info().to_index(self.pattern()?, name), + dst, + ); + } + + /// Interpolates the capture references in `replacement` with the + /// corresponding substrings in `haystack` matched by each reference. The + /// interpolated byte string is returned. + /// + /// See the [`interpolate` module](interpolate) for documentation on the + /// format of the replacement string. + /// + /// # Example + /// + /// This example shows how to use interpolation, and also shows how it + /// can work with multi-pattern regexes. + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID}; + /// + /// let re = PikeVM::new_many(&[ + /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})", + /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})", + /// ])?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let replacement = b"year=$year, month=$month, day=$day"; + /// + /// // This matches the first pattern. + /// let hay = b"On 14-03-2010, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let result = caps.interpolate_bytes(hay, replacement); + /// assert_eq!(&b"year=2010, month=03, day=14"[..], result); + /// + /// // And this matches the second pattern. + /// let hay = b"On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let result = caps.interpolate_bytes(hay, replacement); + /// assert_eq!(&b"year=2010, month=03, day=14"[..], result); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn interpolate_bytes( + &self, + haystack: &[u8], + replacement: &[u8], + ) -> Vec<u8> { + let mut dst = vec![]; + self.interpolate_bytes_into(haystack, replacement, &mut dst); + dst + } + + /// Interpolates the capture references in `replacement` with the + /// corresponding substrings in `haystack` matched by each reference. The + /// interpolated byte string is written to `dst`. + /// + /// See the [`interpolate` module](interpolate) for documentation on the + /// format of the replacement string. 
+ /// + /// # Example + /// + /// This example shows how to use interpolation, and also shows how it + /// can work with multi-pattern regexes. + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID}; + /// + /// let re = PikeVM::new_many(&[ + /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})", + /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})", + /// ])?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let replacement = b"year=$year, month=$month, day=$day"; + /// + /// // This matches the first pattern. + /// let hay = b"On 14-03-2010, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let mut dst = vec![]; + /// caps.interpolate_bytes_into(hay, replacement, &mut dst); + /// assert_eq!(&b"year=2010, month=03, day=14"[..], dst); + /// + /// // And this matches the second pattern. + /// let hay = b"On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let mut dst = vec![]; + /// caps.interpolate_bytes_into(hay, replacement, &mut dst); + /// assert_eq!(&b"year=2010, month=03, day=14"[..], dst); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn interpolate_bytes_into( + &self, + haystack: &[u8], + replacement: &[u8], + dst: &mut Vec<u8>, + ) { + interpolate::bytes( + replacement, + |index, dst| { + let span = match self.get_group(index) { + None => return, + Some(span) => span, + }; + dst.extend_from_slice(&haystack[span]); + }, + |name| self.group_info().to_index(self.pattern()?, name), + dst, + ); + } + + /// This is a convenience routine for extracting the substrings + /// corresponding to matching capture groups in the given `haystack`. The + /// `haystack` should be the same substring used to find the match spans in + /// this `Captures` value. + /// + /// This is identical to [`Captures::extract_bytes`], except it works with + /// `&str` instead of `&[u8]`. + /// + /// # Panics + /// + /// This panics if the number of explicit matching groups in this + /// `Captures` value is less than `N`. This also panics if this `Captures` + /// value does not correspond to a match. + /// + /// Note that this does *not* panic if the number of explicit matching + /// groups is bigger than `N`. In that case, only the first `N` matching + /// groups are extracted. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})")?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let hay = "On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// assert!(caps.is_match()); + /// let (full, [year, month, day]) = caps.extract(hay); + /// assert_eq!("2010-03-14", full); + /// assert_eq!("2010", year); + /// assert_eq!("03", month); + /// assert_eq!("14", day); + /// + /// // We can also ask for fewer than all capture groups. 
+ /// let (full, [year]) = caps.extract(hay); + /// assert_eq!("2010-03-14", full); + /// assert_eq!("2010", year); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn extract<'h, const N: usize>( + &self, + haystack: &'h str, + ) -> (&'h str, [&'h str; N]) { + let mut matched = self.iter().flatten(); + let whole_match = &haystack[matched.next().expect("a match")]; + let group_matches = [0; N].map(|_| { + let sp = matched.next().expect("too few matching groups"); + &haystack[sp] + }); + (whole_match, group_matches) + } + + /// This is a convenience routine for extracting the substrings + /// corresponding to matching capture groups in the given `haystack`. The + /// `haystack` should be the same substring used to find the match spans in + /// this `Captures` value. + /// + /// This is identical to [`Captures::extract`], except it works with + /// `&[u8]` instead of `&str`. + /// + /// # Panics + /// + /// This panics if the number of explicit matching groups in this + /// `Captures` value is less than `N`. This also panics if this `Captures` + /// value does not correspond to a match. + /// + /// Note that this does *not* panic if the number of explicit matching + /// groups is bigger than `N`. In that case, only the first `N` matching + /// groups are extracted. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})")?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let hay = b"On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// assert!(caps.is_match()); + /// let (full, [year, month, day]) = caps.extract_bytes(hay); + /// assert_eq!(b"2010-03-14", full); + /// assert_eq!(b"2010", year); + /// assert_eq!(b"03", month); + /// assert_eq!(b"14", day); + /// + /// // We can also ask for fewer than all capture groups. + /// let (full, [year]) = caps.extract_bytes(hay); + /// assert_eq!(b"2010-03-14", full); + /// assert_eq!(b"2010", year); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn extract_bytes<'h, const N: usize>( + &self, + haystack: &'h [u8], + ) -> (&'h [u8], [&'h [u8]; N]) { + let mut matched = self.iter().flatten(); + let whole_match = &haystack[matched.next().expect("a match")]; + let group_matches = [0; N].map(|_| { + let sp = matched.next().expect("too few matching groups"); + &haystack[sp] + }); + (whole_match, group_matches) + } +} + +/// Lower level "slot" oriented APIs. One does not typically need to use these +/// when executing a search. They are instead mostly intended for folks that +/// are writing their own regex engine while reusing this `Captures` type. +impl Captures { + /// Clear this `Captures` value. + /// + /// After clearing, all slots inside this `Captures` value will be set to + /// `None`. Similarly, any pattern ID that it was previously associated + /// with (for a match) is erased. + /// + /// It is not usually necessary to call this routine. Namely, a `Captures` + /// value only provides high level access to the capturing groups of the + /// pattern that matched, and only low level access to individual slots. + /// Thus, even if slots corresponding to groups that aren't associated + /// with the matching pattern are set, then it won't impact the higher + /// level APIs. 
Namely, higher level APIs like [`Captures::get_group`] will + /// return `None` if no pattern ID is present, even if there are spans set + /// in the underlying slots. + /// + /// Thus, to "clear" a `Captures` value of a match, it is usually only + /// necessary to call [`Captures::set_pattern`] with `None`. + /// + /// # Example + /// + /// This example shows what happens when a `Captures` value is cleared. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Bruce Springsteen", &mut caps); + /// assert!(caps.is_match()); + /// let slots: Vec<Option<usize>> = + /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect(); + /// // Note that the following ordering is considered an API guarantee. + /// assert_eq!(slots, vec![ + /// Some(0), + /// Some(17), + /// Some(0), + /// Some(5), + /// Some(6), + /// Some(17), + /// ]); + /// + /// // Now clear the slots. Everything is gone and it is no longer a match. + /// caps.clear(); + /// assert!(!caps.is_match()); + /// let slots: Vec<Option<usize>> = + /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect(); + /// assert_eq!(slots, vec![ + /// None, + /// None, + /// None, + /// None, + /// None, + /// None, + /// ]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn clear(&mut self) { + self.pid = None; + for slot in self.slots.iter_mut() { + *slot = None; + } + } + + /// Set the pattern on this `Captures` value. + /// + /// When the pattern ID is `None`, then this `Captures` value does not + /// correspond to a match (`is_match` will return `false`). Otherwise, it + /// corresponds to a match. + /// + /// This is useful in search implementations where you might want to + /// initially call `set_pattern(None)` in order to avoid the cost of + /// calling `clear()` if it turns out to not be necessary. + /// + /// # Example + /// + /// This example shows that `set_pattern` merely overwrites the pattern ID. + /// It does not actually change the underlying slot values. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Bruce Springsteen", &mut caps); + /// assert!(caps.is_match()); + /// assert!(caps.pattern().is_some()); + /// let slots: Vec<Option<usize>> = + /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect(); + /// // Note that the following ordering is considered an API guarantee. + /// assert_eq!(slots, vec![ + /// Some(0), + /// Some(17), + /// Some(0), + /// Some(5), + /// Some(6), + /// Some(17), + /// ]); + /// + /// // Now set the pattern to None. Note that the slot values remain. + /// caps.set_pattern(None); + /// assert!(!caps.is_match()); + /// assert!(!caps.pattern().is_some()); + /// let slots: Vec<Option<usize>> = + /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect(); + /// // Note that the following ordering is considered an API guarantee. 
+ /// assert_eq!(slots, vec![ + /// Some(0), + /// Some(17), + /// Some(0), + /// Some(5), + /// Some(6), + /// Some(17), + /// ]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn set_pattern(&mut self, pid: Option<PatternID>) { + self.pid = pid; + } + + /// Returns the underlying slots, where each slot stores a single offset. + /// + /// Every matching capturing group generally corresponds to two slots: one + /// slot for the starting position and another for the ending position. + /// Typically, either both are present or neither are. (The weasel word + /// "typically" is used here because it really depends on the regex engine + /// implementation. Every sensible regex engine likely adheres to this + /// invariant, and every regex engine in this crate is sensible.) + /// + /// Generally speaking, callers should prefer to use higher level routines + /// like [`Captures::get_match`] or [`Captures::get_group`]. + /// + /// An important note here is that a regex engine may not reset all of the + /// slots to `None` values when no match occurs, or even when a match of + /// a different pattern occurs. But this depends on how the regex engine + /// implementation deals with slots. + /// + /// # Example + /// + /// This example shows how to get the underlying slots from a regex match. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::primitives::{PatternID, NonMaxUsize}, + /// }; + /// + /// let re = PikeVM::new_many(&[ + /// r"[a-z]+", + /// r"[0-9]+", + /// ])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "123", &mut caps); + /// assert_eq!(Some(PatternID::must(1)), caps.pattern()); + /// // Note that the only guarantee we have here is that slots 2 and 3 + /// // are set to correct values. The contents of the first two slots are + /// // unspecified since the 0th pattern did not match. + /// let expected = &[ + /// None, + /// None, + /// NonMaxUsize::new(0), + /// NonMaxUsize::new(3), + /// ]; + /// assert_eq!(expected, caps.slots()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn slots(&self) -> &[Option<NonMaxUsize>] { + &self.slots + } + + /// Returns the underlying slots as a mutable slice, where each slot stores + /// a single offset. + /// + /// This tends to be most useful for regex engine implementations for + /// writing offsets for matching capturing groups to slots. + /// + /// See [`Captures::slots`] for more information about slots. + #[inline] + pub fn slots_mut(&mut self) -> &mut [Option<NonMaxUsize>] { + &mut self.slots + } +} + +impl core::fmt::Debug for Captures { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut dstruct = f.debug_struct("Captures"); + dstruct.field("pid", &self.pid); + if let Some(pid) = self.pid { + dstruct.field("spans", &CapturesDebugMap { pid, caps: self }); + } + dstruct.finish() + } +} + +/// A little helper type to provide a nice map-like debug representation for +/// our capturing group spans. 
+struct CapturesDebugMap<'a> { + pid: PatternID, + caps: &'a Captures, +} + +impl<'a> core::fmt::Debug for CapturesDebugMap<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + struct Key<'a>(usize, Option<&'a str>); + + impl<'a> core::fmt::Debug for Key<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{}", self.0)?; + if let Some(name) = self.1 { + write!(f, "/{:?}", name)?; + } + Ok(()) + } + } + + let mut map = f.debug_map(); + let names = self.caps.group_info().pattern_names(self.pid); + for (group_index, maybe_name) in names.enumerate() { + let key = Key(group_index, maybe_name); + match self.caps.get_group(group_index) { + None => map.entry(&key, &None::<()>), + Some(span) => map.entry(&key, &span), + }; + } + map.finish() + } +} + +/// An iterator over all capturing groups in a `Captures` value. +/// +/// This iterator includes capturing groups that did not participate in a +/// match. See the [`Captures::iter`] method documentation for more details +/// and examples. +/// +/// The lifetime parameter `'a` refers to the lifetime of the underlying +/// `Captures` value. +#[derive(Clone, Debug)] +pub struct CapturesPatternIter<'a> { + caps: &'a Captures, + names: core::iter::Enumerate<GroupInfoPatternNames<'a>>, +} + +impl<'a> Iterator for CapturesPatternIter<'a> { + type Item = Option<Span>; + + fn next(&mut self) -> Option<Option<Span>> { + let (group_index, _) = self.names.next()?; + Some(self.caps.get_group(group_index)) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.names.size_hint() + } + + fn count(self) -> usize { + self.names.count() + } +} + +impl<'a> ExactSizeIterator for CapturesPatternIter<'a> {} +impl<'a> core::iter::FusedIterator for CapturesPatternIter<'a> {} + +/// Represents information about capturing groups in a compiled regex. +/// +/// The information encapsulated by this type consists of the following. For +/// each pattern: +/// +/// * A map from every capture group name to its corresponding capture group +/// index. +/// * A map from every capture group index to its corresponding capture group +/// name. +/// * A map from capture group index to its corresponding slot index. A slot +/// refers to one half of a capturing group. That is, a capture slot is either +/// the start or end of a capturing group. A slot is usually the mechanism +/// by which a regex engine records offsets for each capturing group during a +/// search. +/// +/// A `GroupInfo` uses reference counting internally and is thus cheap to +/// clone. +/// +/// # Mapping from capture groups to slots +/// +/// One of the main responsibilities of a `GroupInfo` is to build a mapping +/// from `(PatternID, u32)` (where the `u32` is a capture index) to something +/// called a "slot." As mentioned above, a slot refers to one half of a +/// capturing group. Both combined provide the start and end offsets of +/// a capturing group that participated in a match. +/// +/// **The mapping between group indices and slots is an API guarantee.** That +/// is, the mapping won't change within a semver compatible release. +/// +/// Slots exist primarily because this is a convenient mechanism by which +/// regex engines report group offsets at search time. For example, the +/// [`nfa::thompson::State::Capture`](crate::nfa::thompson::State::Capture) +/// NFA state includes the slot index. 
+/// When a regex engine transitions through
+/// this state, it will likely use the slot index to write the current haystack
+/// offset to some region of memory. When a match is found, those slots are
+/// then reported to the caller, typically via a convenient abstraction like a
+/// [`Captures`] value.
+///
+/// Because this crate provides first class support for multi-pattern regexes,
+/// and because of some performance related reasons, the mapping between
+/// capturing groups and slots is a little complex. However, in the case of a
+/// single pattern, the mapping can be described very simply: for all capture
+/// group indices `i`, its corresponding slots are at `i * 2` and `i * 2 + 1`.
+/// Notice that the pattern ID isn't involved at all here: since this mapping
+/// only applies to a single-pattern regex, the pattern ID is always `0`.
+///
+/// In the multi-pattern case, the mapping is a bit more complicated. To talk
+/// about it, we must define what we mean by "implicit" vs "explicit"
+/// capturing groups:
+///
+/// * An **implicit** capturing group refers to the capturing group that is
+/// present for every pattern automatically, and corresponds to the overall
+/// match of a pattern. Every pattern has precisely one implicit capturing
+/// group. It is always unnamed and it always corresponds to the capture group
+/// index `0`.
+/// * An **explicit** capturing group refers to any capturing group that
+/// appears in the concrete syntax of the pattern. (Or, if an NFA was hand
+/// built without any concrete syntax, it refers to any capturing group with an
+/// index greater than `0`.)
+///
+/// Some examples:
+///
+/// * `\w+` has one implicit capturing group and zero explicit capturing
+/// groups.
+/// * `(\w+)` has one implicit group and one explicit group.
+/// * `foo(\d+)(?:\pL+)(\d+)` has one implicit group and two explicit groups.
+///
+/// Turning back to the slot mapping, we can now state it as follows:
+///
+/// * Given a pattern ID `pid`, the slots for its implicit group are always
+/// at `pid * 2` and `pid * 2 + 1`.
+/// * Given a pattern ID of `0`, the slots for its explicit groups start
+/// at `group_info.pattern_len() * 2`.
+/// * Given a pattern ID `pid > 0`, the slots for its explicit groups start
+/// immediately following where the slots for the explicit groups of `pid - 1`
+/// end.
+///
+/// In particular, while there is a concrete formula one can use to determine
+/// where the slots for the implicit group of any pattern are, there is no
+/// general formula for determining where the slots for explicit capturing
+/// groups are. This is because each pattern can contain a different number
+/// of groups.
+///
+/// The intended way of getting the slots for a particular capturing group
+/// (whether implicit or explicit) is via the [`GroupInfo::slot`] or
+/// [`GroupInfo::slots`] method.
+///
+/// See below for a concrete example of how capturing groups get mapped to
+/// slots.
+///
+/// # Example
+///
+/// This example shows how to build a new `GroupInfo` and query it for
+/// information.
+///
+/// ```
+/// use regex_automata::util::{captures::GroupInfo, primitives::PatternID};
+///
+/// let info = GroupInfo::new(vec![
+/// vec![None, Some("foo")],
+/// vec![None],
+/// vec![None, None, None, Some("bar"), None],
+/// vec![None, None, Some("foo")],
+/// ])?;
+/// // The number of patterns being tracked.
+/// assert_eq!(4, info.pattern_len());
+/// // We can query the number of groups for any pattern.
+/// assert_eq!(2, info.group_len(PatternID::must(0))); +/// assert_eq!(1, info.group_len(PatternID::must(1))); +/// assert_eq!(5, info.group_len(PatternID::must(2))); +/// assert_eq!(3, info.group_len(PatternID::must(3))); +/// // An invalid pattern always has zero groups. +/// assert_eq!(0, info.group_len(PatternID::must(999))); +/// // 2 slots per group +/// assert_eq!(22, info.slot_len()); +/// +/// // We can map a group index for a particular pattern to its name, if +/// // one exists. +/// assert_eq!(Some("foo"), info.to_name(PatternID::must(3), 2)); +/// assert_eq!(None, info.to_name(PatternID::must(2), 4)); +/// // Or map a name to its group index. +/// assert_eq!(Some(1), info.to_index(PatternID::must(0), "foo")); +/// assert_eq!(Some(2), info.to_index(PatternID::must(3), "foo")); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: mapping from capture groups to slots +/// +/// This example shows the specific mapping from capture group indices for +/// each pattern to their corresponding slots. The slot values shown in this +/// example are considered an API guarantee. +/// +/// ``` +/// use regex_automata::util::{captures::GroupInfo, primitives::PatternID}; +/// +/// let info = GroupInfo::new(vec![ +/// vec![None, Some("foo")], +/// vec![None], +/// vec![None, None, None, Some("bar"), None], +/// vec![None, None, Some("foo")], +/// ])?; +/// +/// // We first show the slots for each pattern's implicit group. +/// assert_eq!(Some((0, 1)), info.slots(PatternID::must(0), 0)); +/// assert_eq!(Some((2, 3)), info.slots(PatternID::must(1), 0)); +/// assert_eq!(Some((4, 5)), info.slots(PatternID::must(2), 0)); +/// assert_eq!(Some((6, 7)), info.slots(PatternID::must(3), 0)); +/// +/// // And now we show the slots for each pattern's explicit group. +/// assert_eq!(Some((8, 9)), info.slots(PatternID::must(0), 1)); +/// assert_eq!(Some((10, 11)), info.slots(PatternID::must(2), 1)); +/// assert_eq!(Some((12, 13)), info.slots(PatternID::must(2), 2)); +/// assert_eq!(Some((14, 15)), info.slots(PatternID::must(2), 3)); +/// assert_eq!(Some((16, 17)), info.slots(PatternID::must(2), 4)); +/// assert_eq!(Some((18, 19)), info.slots(PatternID::must(3), 1)); +/// assert_eq!(Some((20, 21)), info.slots(PatternID::must(3), 2)); +/// +/// // Asking for the slots for an invalid pattern ID or even for an invalid +/// // group index for a specific pattern will return None. So for example, +/// // you're guaranteed to not get the slots for a different pattern than the +/// // one requested. +/// assert_eq!(None, info.slots(PatternID::must(5), 0)); +/// assert_eq!(None, info.slots(PatternID::must(1), 1)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug, Default)] +pub struct GroupInfo(Arc<GroupInfoInner>); + +impl GroupInfo { + /// Creates a new group info from a sequence of patterns, where each + /// sequence of patterns yields a sequence of possible group names. The + /// index of each pattern in the sequence corresponds to its `PatternID`, + /// and the index of each group in each pattern's sequence corresponds to + /// its corresponding group index. + /// + /// While this constructor is very generic and therefore perhaps hard to + /// chew on, an example of a valid concrete type that can be passed to + /// this constructor is `Vec<Vec<Option<String>>>`. The outer `Vec` + /// corresponds to the patterns, i.e., one `Vec<Option<String>>` per + /// pattern. The inner `Vec` corresponds to the capturing groups for + /// each pattern. 
The `Option<String>` corresponds to the name of the + /// capturing group, if present. + /// + /// It is legal to pass an empty iterator to this constructor. It will + /// return an empty group info with zero slots. An empty group info is + /// useful for cases where you have no patterns or for cases where slots + /// aren't being used at all (e.g., for most DFAs in this crate). + /// + /// # Errors + /// + /// This constructor returns an error if the given capturing groups are + /// invalid in some way. Those reasons include, but are not necessarily + /// limited to: + /// + /// * Too many patterns (i.e., `PatternID` would overflow). + /// * Too many capturing groups (e.g., `u32` would overflow). + /// * A pattern is given that has no capturing groups. (All patterns must + /// have at least an implicit capturing group at index `0`.) + /// * The capturing group at index `0` has a name. It must be unnamed. + /// * There are duplicate capturing group names within the same pattern. + /// (Multiple capturing groups with the same name may exist, but they + /// must be in different patterns.) + /// + /// An example below shows how to trigger some of the above error + /// conditions. + /// + /// # Example + /// + /// This example shows how to build a new `GroupInfo` and query it for + /// information. + /// + /// ``` + /// use regex_automata::util::captures::GroupInfo; + /// + /// let info = GroupInfo::new(vec![ + /// vec![None, Some("foo")], + /// vec![None], + /// vec![None, None, None, Some("bar"), None], + /// vec![None, None, Some("foo")], + /// ])?; + /// // The number of patterns being tracked. + /// assert_eq!(4, info.pattern_len()); + /// // 2 slots per group + /// assert_eq!(22, info.slot_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: empty `GroupInfo` + /// + /// This example shows how to build a new `GroupInfo` and query it for + /// information. + /// + /// ``` + /// use regex_automata::util::captures::GroupInfo; + /// + /// let info = GroupInfo::empty(); + /// // Everything is zero. + /// assert_eq!(0, info.pattern_len()); + /// assert_eq!(0, info.slot_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: error conditions + /// + /// This example shows how to provoke some of the ways in which building + /// a `GroupInfo` can fail. + /// + /// ``` + /// use regex_automata::util::captures::GroupInfo; + /// + /// // Either the group info is empty, or all patterns must have at least + /// // one capturing group. + /// assert!(GroupInfo::new(vec![ + /// vec![None, Some("a")], // ok + /// vec![None], // ok + /// vec![], // not ok + /// ]).is_err()); + /// // Note that building an empty group info is OK. + /// assert!(GroupInfo::new(Vec::<Vec<Option<String>>>::new()).is_ok()); + /// + /// // The first group in each pattern must correspond to an implicit + /// // anonymous group. i.e., One that is not named. By convention, this + /// // group corresponds to the overall match of a regex. Every other group + /// // in a pattern is explicit and optional. + /// assert!(GroupInfo::new(vec![vec![Some("foo")]]).is_err()); + /// + /// // There must not be duplicate group names within the same pattern. + /// assert!(GroupInfo::new(vec![ + /// vec![None, Some("foo"), Some("foo")], + /// ]).is_err()); + /// // But duplicate names across distinct patterns is OK. 
+ /// assert!(GroupInfo::new(vec![
+ /// vec![None, Some("foo")],
+ /// vec![None, Some("foo")],
+ /// ]).is_ok());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// There are other ways in which building a `GroupInfo` can fail, but they
+ /// are difficult to show here. For example, building fails if the number
+ /// of patterns given would overflow `PatternID`.
+ pub fn new<P, G, N>(pattern_groups: P) -> Result<GroupInfo, GroupInfoError>
+ where
+ P: IntoIterator<Item = G>,
+ G: IntoIterator<Item = Option<N>>,
+ N: AsRef<str>,
+ {
+ let mut group_info = GroupInfoInner {
+ slot_ranges: vec![],
+ name_to_index: vec![],
+ index_to_name: vec![],
+ memory_extra: 0,
+ };
+ for (pattern_index, groups) in pattern_groups.into_iter().enumerate() {
+ // If we can't convert the pattern index to an ID, then the caller
+ // tried to build capture info for too many patterns.
+ let pid = PatternID::new(pattern_index)
+ .map_err(GroupInfoError::too_many_patterns)?;
+
+ let mut groups_iter = groups.into_iter().enumerate();
+ match groups_iter.next() {
+ None => return Err(GroupInfoError::missing_groups(pid)),
+ Some((_, Some(_))) => {
+ return Err(GroupInfoError::first_must_be_unnamed(pid))
+ }
+ Some((_, None)) => {}
+ }
+ group_info.add_first_group(pid);
+ // Now iterate over the rest, which correspond to all of the
+ // (conventionally) explicit capture groups in a regex pattern.
+ for (group_index, maybe_name) in groups_iter {
+ // Just like for patterns, if the group index can't be
+ // converted to a "small" index, then the caller has given too
+ // many groups for a particular pattern.
+ let group = SmallIndex::new(group_index).map_err(|_| {
+ GroupInfoError::too_many_groups(pid, group_index)
+ })?;
+ group_info.add_explicit_group(pid, group, maybe_name)?;
+ }
+ }
+ group_info.fixup_slot_ranges()?;
+ Ok(GroupInfo(Arc::new(group_info)))
+ }
+
+ /// This creates an empty `GroupInfo`.
+ ///
+ /// This is a convenience routine for calling `GroupInfo::new` with an
+ /// iterator that yields no elements.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a new empty `GroupInfo` and query it
+ /// for information.
+ ///
+ /// ```
+ /// use regex_automata::util::captures::GroupInfo;
+ ///
+ /// let info = GroupInfo::empty();
+ /// // Everything is zero.
+ /// assert_eq!(0, info.pattern_len());
+ /// assert_eq!(0, info.all_group_len());
+ /// assert_eq!(0, info.slot_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn empty() -> GroupInfo {
+ GroupInfo::new(core::iter::empty::<[Option<&str>; 0]>())
+ .expect("empty group info is always valid")
+ }
+
+ /// Return the capture group index corresponding to the given name in the
+ /// given pattern. If no such capture group name exists in the given
+ /// pattern, then this returns `None`.
+ ///
+ /// If the given pattern ID is invalid, then this returns `None`.
+ ///
+ /// This also returns `None` for all inputs if these captures are empty
+ /// (e.g., built from an empty [`GroupInfo`]). To check whether captures
+ /// are present for a specific pattern, use [`GroupInfo::group_len`].
+ ///
+ /// # Example
+ ///
+ /// This example shows how to find the capture index for the given pattern
+ /// and group name.
+ ///
+ /// Remember that capture indices are relative to the pattern, such that
+ /// the same capture index value may refer to different capturing groups
+ /// for distinct patterns.
+ /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let (pid0, pid1) = (PatternID::must(0), PatternID::must(1)); + /// + /// let nfa = NFA::new_many(&[ + /// r"a(?P<quux>\w+)z(?P<foo>\s+)", + /// r"a(?P<foo>\d+)z", + /// ])?; + /// let groups = nfa.group_info(); + /// assert_eq!(Some(2), groups.to_index(pid0, "foo")); + /// // Recall that capture index 0 is always unnamed and refers to the + /// // entire pattern. So the first capturing group present in the pattern + /// // itself always starts at index 1. + /// assert_eq!(Some(1), groups.to_index(pid1, "foo")); + /// + /// // And if a name does not exist for a particular pattern, None is + /// // returned. + /// assert!(groups.to_index(pid0, "quux").is_some()); + /// assert!(groups.to_index(pid1, "quux").is_none()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn to_index(&self, pid: PatternID, name: &str) -> Option<usize> { + let indices = self.0.name_to_index.get(pid.as_usize())?; + indices.get(name).cloned().map(|i| i.as_usize()) + } + + /// Return the capture name for the given index and given pattern. If the + /// corresponding group does not have a name, then this returns `None`. + /// + /// If the pattern ID is invalid, then this returns `None`. + /// + /// If the group index is invalid for the given pattern, then this returns + /// `None`. A group `index` is valid for a pattern `pid` in an `nfa` if and + /// only if `index < nfa.pattern_capture_len(pid)`. + /// + /// This also returns `None` for all inputs if these captures are empty + /// (e.g., built from an empty [`GroupInfo`]). To check whether captures + /// are are present for a specific pattern, use [`GroupInfo::group_len`]. + /// + /// # Example + /// + /// This example shows how to find the capture group name for the given + /// pattern and group index. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let (pid0, pid1) = (PatternID::must(0), PatternID::must(1)); + /// + /// let nfa = NFA::new_many(&[ + /// r"a(?P<foo>\w+)z(\s+)x(\d+)", + /// r"a(\d+)z(?P<foo>\s+)", + /// ])?; + /// let groups = nfa.group_info(); + /// assert_eq!(None, groups.to_name(pid0, 0)); + /// assert_eq!(Some("foo"), groups.to_name(pid0, 1)); + /// assert_eq!(None, groups.to_name(pid0, 2)); + /// assert_eq!(None, groups.to_name(pid0, 3)); + /// + /// assert_eq!(None, groups.to_name(pid1, 0)); + /// assert_eq!(None, groups.to_name(pid1, 1)); + /// assert_eq!(Some("foo"), groups.to_name(pid1, 2)); + /// // '3' is not a valid capture index for the second pattern. + /// assert_eq!(None, groups.to_name(pid1, 3)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn to_name(&self, pid: PatternID, group_index: usize) -> Option<&str> { + let pattern_names = self.0.index_to_name.get(pid.as_usize())?; + pattern_names.get(group_index)?.as_deref() + } + + /// Return an iterator of all capture groups and their names (if present) + /// for a particular pattern. + /// + /// If the given pattern ID is invalid or if this `GroupInfo` is empty, + /// then the iterator yields no elements. + /// + /// The number of elements yielded by this iterator is always equal to + /// the result of calling [`GroupInfo::group_len`] with the same + /// `PatternID`. 
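+ ///
+ /// A minimal sketch of that equivalence (the two-pattern NFA below is just
+ /// an illustrative choice):
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, PatternID};
+ ///
+ /// let nfa = NFA::new_many(&[r"(?P<foo>a)(b)", r"c"])?;
+ /// let groups = nfa.group_info();
+ /// let pid = PatternID::must(0);
+ /// assert_eq!(groups.group_len(pid), groups.pattern_names(pid).count());
+ /// // An invalid pattern ID yields zero on both sides.
+ /// let bad = PatternID::must(99);
+ /// assert_eq!(groups.group_len(bad), groups.pattern_names(bad).count());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```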
+ /// + /// # Example + /// + /// This example shows how to get a list of all capture group names for + /// a particular pattern. + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let nfa = NFA::new(r"(a)(?P<foo>b)(c)(d)(?P<bar>e)")?; + /// // The first is the implicit group that is always unnammed. The next + /// // 5 groups are the explicit groups found in the concrete syntax above. + /// let expected = vec![None, None, Some("foo"), None, None, Some("bar")]; + /// let got: Vec<Option<&str>> = + /// nfa.group_info().pattern_names(PatternID::ZERO).collect(); + /// assert_eq!(expected, got); + /// + /// // Using an invalid pattern ID will result in nothing yielded. + /// let got = nfa.group_info().pattern_names(PatternID::must(999)).count(); + /// assert_eq!(0, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn pattern_names(&self, pid: PatternID) -> GroupInfoPatternNames<'_> { + GroupInfoPatternNames { + it: self + .0 + .index_to_name + .get(pid.as_usize()) + .map(|indices| indices.iter()) + .unwrap_or([].iter()), + } + } + + /// Return an iterator of all capture groups for all patterns supported by + /// this `GroupInfo`. Each item yielded is a triple of the group's pattern + /// ID, index in the pattern and the group's name, if present. + /// + /// # Example + /// + /// This example shows how to get a list of all capture groups found in + /// one NFA, potentially spanning multiple patterns. + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let nfa = NFA::new_many(&[ + /// r"(?P<foo>a)", + /// r"a", + /// r"(a)", + /// ])?; + /// let expected = vec![ + /// (PatternID::must(0), 0, None), + /// (PatternID::must(0), 1, Some("foo")), + /// (PatternID::must(1), 0, None), + /// (PatternID::must(2), 0, None), + /// (PatternID::must(2), 1, None), + /// ]; + /// let got: Vec<(PatternID, usize, Option<&str>)> = + /// nfa.group_info().all_names().collect(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Unlike other capturing group related routines, this routine doesn't + /// panic even if captures aren't enabled on this NFA: + /// + /// ``` + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; + /// + /// let nfa = NFA::compiler() + /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// .build_many(&[ + /// r"(?P<foo>a)", + /// r"a", + /// r"(a)", + /// ])?; + /// // When captures aren't enabled, there's nothing to return. + /// assert_eq!(0, nfa.group_info().all_names().count()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn all_names(&self) -> GroupInfoAllNames<'_> { + GroupInfoAllNames { + group_info: self, + pids: PatternID::iter(self.pattern_len()), + current_pid: None, + names: None, + } + } + + /// Returns the starting and ending slot corresponding to the given + /// capturing group for the given pattern. The ending slot is always one + /// more than the starting slot returned. + /// + /// Note that this is like [`GroupInfo::slot`], except that it also returns + /// the ending slot value for convenience. + /// + /// If either the pattern ID or the capture index is invalid, then this + /// returns None. + /// + /// # Example + /// + /// This example shows that the starting slots for the first capturing + /// group of each pattern are distinct. 
+ /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let nfa = NFA::new_many(&["a", "b"])?; + /// assert_ne!( + /// nfa.group_info().slots(PatternID::must(0), 0), + /// nfa.group_info().slots(PatternID::must(1), 0), + /// ); + /// + /// // Also, the start and end slot values are never equivalent. + /// let (start, end) = nfa.group_info().slots(PatternID::ZERO, 0).unwrap(); + /// assert_ne!(start, end); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn slots( + &self, + pid: PatternID, + group_index: usize, + ) -> Option<(usize, usize)> { + // Since 'slot' only even returns valid starting slots, we know that + // there must also be an end slot and that end slot is always one more + // than the start slot. + self.slot(pid, group_index).map(|start| (start, start + 1)) + } + + /// Returns the starting slot corresponding to the given capturing group + /// for the given pattern. The ending slot is always one more than the + /// value returned. + /// + /// If either the pattern ID or the capture index is invalid, then this + /// returns None. + /// + /// # Example + /// + /// This example shows that the starting slots for the first capturing + /// group of each pattern are distinct. + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let nfa = NFA::new_many(&["a", "b"])?; + /// assert_ne!( + /// nfa.group_info().slot(PatternID::must(0), 0), + /// nfa.group_info().slot(PatternID::must(1), 0), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn slot(&self, pid: PatternID, group_index: usize) -> Option<usize> { + if group_index >= self.group_len(pid) { + return None; + } + // At this point, we know that 'pid' refers to a real pattern and that + // 'group_index' refers to a real group. We therefore also know that + // the pattern and group can be combined to return a correct slot. + // That's why we don't need to use checked arithmetic below. + if group_index == 0 { + Some(pid.as_usize() * 2) + } else { + // As above, we don't need to check that our slot is less than the + // end of our range since we already know the group index is a + // valid index for the given pattern. + let (start, _) = self.0.slot_ranges[pid]; + Some(start.as_usize() + ((group_index - 1) * 2)) + } + } + + /// Returns the total number of patterns in this `GroupInfo`. + /// + /// This may return zero if the `GroupInfo` was constructed with no + /// patterns. + /// + /// This is guaranteed to be no bigger than [`PatternID::LIMIT`] because + /// `GroupInfo` construction will fail if too many patterns are added. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::NFA; + /// + /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// assert_eq!(3, nfa.group_info().pattern_len()); + /// + /// let nfa = NFA::never_match(); + /// assert_eq!(0, nfa.group_info().pattern_len()); + /// + /// let nfa = NFA::always_match(); + /// assert_eq!(1, nfa.group_info().pattern_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn pattern_len(&self) -> usize { + self.0.pattern_len() + } + + /// Return the number of capture groups in a pattern. + /// + /// If the pattern ID is invalid, then this returns `0`. + /// + /// # Example + /// + /// This example shows how the values returned by this routine may vary + /// for different patterns and NFA configurations. 
+ /// + /// ``` + /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID}; + /// + /// let nfa = NFA::new(r"(a)(b)(c)")?; + /// // There are 3 explicit groups in the pattern's concrete syntax and + /// // 1 unnamed and implicit group spanning the entire pattern. + /// assert_eq!(4, nfa.group_info().group_len(PatternID::ZERO)); + /// + /// let nfa = NFA::new(r"abc")?; + /// // There is just the unnamed implicit group. + /// assert_eq!(1, nfa.group_info().group_len(PatternID::ZERO)); + /// + /// let nfa = NFA::compiler() + /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"abc")?; + /// // We disabled capturing groups, so there are none. + /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO)); + /// + /// let nfa = NFA::compiler() + /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"(a)(b)(c)")?; + /// // We disabled capturing groups, so there are none, even if there are + /// // explicit groups in the concrete syntax. + /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn group_len(&self, pid: PatternID) -> usize { + self.0.group_len(pid) + } + + /// Return the total number of capture groups across all patterns. + /// + /// This includes implicit groups that represent the entire match of a + /// pattern. + /// + /// # Example + /// + /// This example shows how the values returned by this routine may vary + /// for different patterns and NFA configurations. + /// + /// ``` + /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID}; + /// + /// let nfa = NFA::new(r"(a)(b)(c)")?; + /// // There are 3 explicit groups in the pattern's concrete syntax and + /// // 1 unnamed and implicit group spanning the entire pattern. + /// assert_eq!(4, nfa.group_info().all_group_len()); + /// + /// let nfa = NFA::new(r"abc")?; + /// // There is just the unnamed implicit group. + /// assert_eq!(1, nfa.group_info().all_group_len()); + /// + /// let nfa = NFA::new_many(&["(a)", "b", "(c)"])?; + /// // Each pattern has one implicit groups, and two + /// // patterns have one explicit group each. + /// assert_eq!(5, nfa.group_info().all_group_len()); + /// + /// let nfa = NFA::compiler() + /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"abc")?; + /// // We disabled capturing groups, so there are none. + /// assert_eq!(0, nfa.group_info().all_group_len()); + /// + /// let nfa = NFA::compiler() + /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"(a)(b)(c)")?; + /// // We disabled capturing groups, so there are none, even if there are + /// // explicit groups in the concrete syntax. + /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn all_group_len(&self) -> usize { + self.slot_len() / 2 + } + + /// Returns the total number of slots in this `GroupInfo` across all + /// patterns. + /// + /// The total number of slots is always twice the total number of capturing + /// groups, including both implicit and explicit groups. + /// + /// # Example + /// + /// This example shows the relationship between the number of capturing + /// groups and slots. + /// + /// ``` + /// use regex_automata::util::captures::GroupInfo; + /// + /// // There are 11 total groups here. 
+ /// let info = GroupInfo::new(vec![
+ /// vec![None, Some("foo")],
+ /// vec![None],
+ /// vec![None, None, None, Some("bar"), None],
+ /// vec![None, None, Some("foo")],
+ /// ])?;
+ /// // 2 slots per group gives us 11*2=22 slots.
+ /// assert_eq!(22, info.slot_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn slot_len(&self) -> usize {
+ self.0.small_slot_len().as_usize()
+ }
+
+ /// Returns the total number of slots for implicit capturing groups.
+ ///
+ /// This is like [`GroupInfo::slot_len`], except it doesn't include the
+ /// explicit slots for each pattern. Since there are always exactly 2
+ /// implicit slots for each pattern, the number of implicit slots is always
+ /// equal to twice the number of patterns.
+ ///
+ /// # Example
+ ///
+ /// This example shows the relationship between the number of capturing
+ /// groups, implicit slots and explicit slots.
+ ///
+ /// ```
+ /// use regex_automata::util::captures::GroupInfo;
+ ///
+ /// // There are 3 total groups here: 1 implicit and 2 explicit.
+ /// let info = GroupInfo::new(vec![vec![None, Some("foo"), Some("bar")]])?;
+ /// // 2 slots per group gives us 3*2=6 slots.
+ /// assert_eq!(6, info.slot_len());
+ /// // 2 implicit slots per pattern gives us 2 implicit slots since there
+ /// // is 1 pattern.
+ /// assert_eq!(2, info.implicit_slot_len());
+ /// // 2 explicit capturing groups gives us 2*2=4 explicit slots.
+ /// assert_eq!(4, info.explicit_slot_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn implicit_slot_len(&self) -> usize {
+ self.pattern_len() * 2
+ }
+
+ /// Returns the total number of slots for explicit capturing groups.
+ ///
+ /// This is like [`GroupInfo::slot_len`], except it doesn't include the
+ /// implicit slots for each pattern. (There are always 2 implicit slots for
+ /// each pattern.)
+ ///
+ /// For a non-empty `GroupInfo`, it is always the case that `slot_len` is
+ /// strictly greater than `explicit_slot_len`. For an empty `GroupInfo`,
+ /// both the total number of slots and the number of explicit slots are
+ /// `0`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the relationship between the number of capturing
+ /// groups, implicit slots and explicit slots.
+ ///
+ /// ```
+ /// use regex_automata::util::captures::GroupInfo;
+ ///
+ /// // There are 3 total groups here: 1 implicit and 2 explicit.
+ /// let info = GroupInfo::new(vec![vec![None, Some("foo"), Some("bar")]])?;
+ /// // 2 slots per group gives us 3*2=6 slots.
+ /// assert_eq!(6, info.slot_len());
+ /// // 2 implicit slots per pattern gives us 2 implicit slots since there
+ /// // is 1 pattern.
+ /// assert_eq!(2, info.implicit_slot_len());
+ /// // 2 explicit capturing groups gives us 2*2=4 explicit slots.
+ /// assert_eq!(4, info.explicit_slot_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn explicit_slot_len(&self) -> usize {
+ self.slot_len().saturating_sub(self.implicit_slot_len())
+ }
+
+ /// Returns the memory usage, in bytes, of this `GroupInfo`.
+ ///
+ /// This does **not** include the stack size used up by this `GroupInfo`.
+ /// To compute that, use `std::mem::size_of::<GroupInfo>()`.
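+ ///
+ /// # Example
+ ///
+ /// A rough sketch of how the reported heap usage grows as more (named)
+ /// groups are added. The exact byte counts aren't asserted here since they
+ /// aren't an API guarantee; only the relative relationship is checked.
+ ///
+ /// ```
+ /// use regex_automata::util::captures::GroupInfo;
+ ///
+ /// let small = GroupInfo::new(vec![vec![None]])?;
+ /// let big = GroupInfo::new(vec![vec![None, Some("foo"), Some("bar")]])?;
+ /// // More groups (and more group names) means more heap memory.
+ /// assert!(big.memory_usage() > small.memory_usage());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```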
+ #[inline] + pub fn memory_usage(&self) -> usize { + use core::mem::size_of as s; + + s::<GroupInfoInner>() + + self.0.slot_ranges.len() * s::<(SmallIndex, SmallIndex)>() + + self.0.name_to_index.len() * s::<CaptureNameMap>() + + self.0.index_to_name.len() * s::<Vec<Option<Arc<str>>>>() + + self.0.memory_extra + } +} + +/// A map from capture group name to its corresponding capture group index. +/// +/// This type is actually wrapped inside a Vec indexed by pattern ID on a +/// `GroupInfo`, since multiple patterns may have the same capture group name. +/// That is, each pattern gets its own namespace of capture group names. +/// +/// Perhaps a more memory efficient representation would be +/// HashMap<(PatternID, Arc<str>), usize>, but this makes it difficult to look +/// up a capture index by name without producing a `Arc<str>`, which requires +/// an allocation. To fix this, I think we'd need to define our own unsized +/// type or something? Anyway, I didn't give this much thought since it +/// probably doesn't matter much in the grand scheme of things. But it did +/// stand out to me as mildly wasteful. +#[cfg(feature = "std")] +type CaptureNameMap = std::collections::HashMap<Arc<str>, SmallIndex>; +#[cfg(not(feature = "std"))] +type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, SmallIndex>; + +/// The inner guts of `GroupInfo`. This type only exists so that it can +/// be wrapped in an `Arc` to make `GroupInfo` reference counted. +#[derive(Debug, Default)] +struct GroupInfoInner { + slot_ranges: Vec<(SmallIndex, SmallIndex)>, + name_to_index: Vec<CaptureNameMap>, + index_to_name: Vec<Vec<Option<Arc<str>>>>, + memory_extra: usize, +} + +impl GroupInfoInner { + /// This adds the first unnamed group for the given pattern ID. The given + /// pattern ID must be zero if this is the first time this method is + /// called, or must be exactly one more than the pattern ID supplied to the + /// previous call to this method. (This method panics if this rule is + /// violated.) + /// + /// This can be thought of as initializing the GroupInfo state for the + /// given pattern and closing off the state for any previous pattern. + fn add_first_group(&mut self, pid: PatternID) { + assert_eq!(pid.as_usize(), self.slot_ranges.len()); + assert_eq!(pid.as_usize(), self.name_to_index.len()); + assert_eq!(pid.as_usize(), self.index_to_name.len()); + // This is the start of our slots for the explicit capturing groups. + // Note that since the slots for the 0th group for every pattern appear + // before any slots for the nth group (where n > 0) in any pattern, we + // will have to fix up the slot ranges once we know how many patterns + // we've added capture groups for. + let slot_start = self.small_slot_len(); + self.slot_ranges.push((slot_start, slot_start)); + self.name_to_index.push(CaptureNameMap::new()); + self.index_to_name.push(vec![None]); + self.memory_extra += core::mem::size_of::<Option<Arc<str>>>(); + } + + /// Add an explicit capturing group for the given pattern with the given + /// index. If the group has a name, then that must be given as well. + /// + /// Note that every capturing group except for the first or zeroth group is + /// explicit. + /// + /// This returns an error if adding this group would result in overflowing + /// slot indices or if a capturing group with the same name for this + /// pattern has already been added. 
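+ ///
+ /// As a concrete illustration of the slot accounting: with a single
+ /// pattern, `add_first_group` starts the slot range at `(0, 0)`. Each call
+ /// to this method bumps the end by 2, so three explicit groups give
+ /// `(0, 6)`, and `fixup_slot_ranges` later shifts both ends by
+ /// `pattern_len() * 2 = 2`, yielding `(2, 8)`, so that the implicit slots
+ /// of every pattern come before any explicit slots.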
+ fn add_explicit_group<N: AsRef<str>>( + &mut self, + pid: PatternID, + group: SmallIndex, + maybe_name: Option<N>, + ) -> Result<(), GroupInfoError> { + // We also need to check that the slot index generated for + // this group is also valid. Although, this is a little weird + // because we offset these indices below, at which point, we'll + // have to recheck them. Gosh this is annoying. Note that + // the '+2' below is OK because 'end' is guaranteed to be less + // than isize::MAX. + let end = &mut self.slot_ranges[pid].1; + *end = SmallIndex::new(end.as_usize() + 2).map_err(|_| { + GroupInfoError::too_many_groups(pid, group.as_usize()) + })?; + if let Some(name) = maybe_name { + let name = Arc::<str>::from(name.as_ref()); + if self.name_to_index[pid].contains_key(&*name) { + return Err(GroupInfoError::duplicate(pid, &name)); + } + let len = name.len(); + self.name_to_index[pid].insert(Arc::clone(&name), group); + self.index_to_name[pid].push(Some(name)); + // Adds the memory used by the Arc<str> in both maps. + self.memory_extra += + 2 * (len + core::mem::size_of::<Option<Arc<str>>>()); + // And also the value entry for the 'name_to_index' map. + // This is probably an underestimate for 'name_to_index' since + // hashmaps/btrees likely have some non-zero overhead, but we + // assume here that they have zero overhead. + self.memory_extra += core::mem::size_of::<SmallIndex>(); + } else { + self.index_to_name[pid].push(None); + self.memory_extra += core::mem::size_of::<Option<Arc<str>>>(); + } + // This is a sanity assert that checks that our group index + // is in line with the number of groups added so far for this + // pattern. + assert_eq!(group.one_more(), self.group_len(pid)); + // And is also in line with the 'index_to_name' map. + assert_eq!(group.one_more(), self.index_to_name[pid].len()); + Ok(()) + } + + /// This corrects the slot ranges to account for the slots corresponding + /// to the zeroth group of each pattern. That is, every slot range is + /// offset by 'pattern_len() * 2', since each pattern uses two slots to + /// represent the zeroth group. + fn fixup_slot_ranges(&mut self) -> Result<(), GroupInfoError> { + use crate::util::primitives::IteratorIndexExt; + // Since we know number of patterns fits in PatternID and + // PatternID::MAX < isize::MAX, it follows that multiplying by 2 will + // never overflow usize. + let offset = self.pattern_len().checked_mul(2).unwrap(); + for (pid, &mut (ref mut start, ref mut end)) in + self.slot_ranges.iter_mut().with_pattern_ids() + { + let group_len = 1 + ((end.as_usize() - start.as_usize()) / 2); + let new_end = match end.as_usize().checked_add(offset) { + Some(new_end) => new_end, + None => { + return Err(GroupInfoError::too_many_groups( + pid, group_len, + )) + } + }; + *end = SmallIndex::new(new_end).map_err(|_| { + GroupInfoError::too_many_groups(pid, group_len) + })?; + // Since start <= end, if end is valid then start must be too. + *start = SmallIndex::new(start.as_usize() + offset).unwrap(); + } + Ok(()) + } + + /// Return the total number of patterns represented by this capture slot + /// info. + fn pattern_len(&self) -> usize { + self.slot_ranges.len() + } + + /// Return the total number of capturing groups for the given pattern. If + /// the given pattern isn't valid for this capture slot info, then 0 is + /// returned. 
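+ ///
+ /// For example, a pattern whose (fixed up) slot range is `(2, 8)` has
+ /// `1 + (8 - 2) / 2 = 4` capturing groups: the implicit group plus three
+ /// explicit groups.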
+ fn group_len(&self, pid: PatternID) -> usize { + let (start, end) = match self.slot_ranges.get(pid.as_usize()) { + None => return 0, + Some(range) => range, + }; + // The difference between any two SmallIndex values always fits in a + // usize since we know that SmallIndex::MAX <= isize::MAX-1. We also + // know that start<=end by construction and that the number of groups + // never exceeds SmallIndex and thus never overflows usize. + 1 + ((end.as_usize() - start.as_usize()) / 2) + } + + /// Return the total number of slots in this capture slot info as a + /// "small index." + fn small_slot_len(&self) -> SmallIndex { + // Since slots are allocated in order of pattern (starting at 0) and + // then in order of capture group, it follows that the number of slots + // is the end of the range of slots for the last pattern. This is + // true even when the last pattern has no capturing groups, since + // 'slot_ranges' will still represent it explicitly with an empty + // range. + self.slot_ranges.last().map_or(SmallIndex::ZERO, |&(_, end)| end) + } +} + +/// An error that may occur when building a `GroupInfo`. +/// +/// Building a `GroupInfo` does a variety of checks to make sure the +/// capturing groups satisfy a number of invariants. This includes, but is not +/// limited to, ensuring that the first capturing group is unnamed and that +/// there are no duplicate capture groups for a specific pattern. +#[derive(Clone, Debug)] +pub struct GroupInfoError { + kind: GroupInfoErrorKind, +} + +/// The kind of error that occurs when building a `GroupInfo` fails. +/// +/// We keep this un-exported because it's not clear how useful it is to +/// export it. +#[derive(Clone, Debug)] +enum GroupInfoErrorKind { + /// This occurs when too many patterns have been added. i.e., It would + /// otherwise overflow a `PatternID`. + TooManyPatterns { err: PatternIDError }, + /// This occurs when too many capturing groups have been added for a + /// particular pattern. + TooManyGroups { + /// The ID of the pattern that had too many groups. + pattern: PatternID, + /// The minimum number of groups that the caller has tried to add for + /// a pattern. + minimum: usize, + }, + /// An error that occurs when a pattern has no capture groups. Either the + /// group info must be empty, or all patterns must have at least one group + /// (corresponding to the unnamed group for the entire pattern). + MissingGroups { + /// The ID of the pattern that had no capturing groups. + pattern: PatternID, + }, + /// An error that occurs when one tries to provide a name for the capture + /// group at index 0. This capturing group must currently always be + /// unnamed. + FirstMustBeUnnamed { + /// The ID of the pattern that was found to have a named first + /// capturing group. + pattern: PatternID, + }, + /// An error that occurs when duplicate capture group names for the same + /// pattern are added. + /// + /// NOTE: At time of writing, this error can never occur if you're using + /// regex-syntax, since the parser itself will reject patterns with + /// duplicate capture group names. This error can only occur when the + /// builder is used to hand construct NFAs. + Duplicate { + /// The pattern in which the duplicate capture group name was found. + pattern: PatternID, + /// The duplicate name. 
+ name: String, + }, +} + +impl GroupInfoError { + fn too_many_patterns(err: PatternIDError) -> GroupInfoError { + GroupInfoError { kind: GroupInfoErrorKind::TooManyPatterns { err } } + } + + fn too_many_groups(pattern: PatternID, minimum: usize) -> GroupInfoError { + GroupInfoError { + kind: GroupInfoErrorKind::TooManyGroups { pattern, minimum }, + } + } + + fn missing_groups(pattern: PatternID) -> GroupInfoError { + GroupInfoError { kind: GroupInfoErrorKind::MissingGroups { pattern } } + } + + fn first_must_be_unnamed(pattern: PatternID) -> GroupInfoError { + GroupInfoError { + kind: GroupInfoErrorKind::FirstMustBeUnnamed { pattern }, + } + } + + fn duplicate(pattern: PatternID, name: &str) -> GroupInfoError { + GroupInfoError { + kind: GroupInfoErrorKind::Duplicate { + pattern, + name: String::from(name), + }, + } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for GroupInfoError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self.kind { + GroupInfoErrorKind::TooManyPatterns { .. } + | GroupInfoErrorKind::TooManyGroups { .. } + | GroupInfoErrorKind::MissingGroups { .. } + | GroupInfoErrorKind::FirstMustBeUnnamed { .. } + | GroupInfoErrorKind::Duplicate { .. } => None, + } + } +} + +impl core::fmt::Display for GroupInfoError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + use self::GroupInfoErrorKind::*; + + match self.kind { + TooManyPatterns { ref err } => { + write!(f, "too many patterns to build capture info: {}", err) + } + TooManyGroups { pattern, minimum } => { + write!( + f, + "too many capture groups (at least {}) were \ + found for pattern {}", + minimum, + pattern.as_usize() + ) + } + MissingGroups { pattern } => write!( + f, + "no capturing groups found for pattern {} \ + (either all patterns have zero groups or all patterns have \ + at least one group)", + pattern.as_usize(), + ), + FirstMustBeUnnamed { pattern } => write!( + f, + "first capture group (at index 0) for pattern {} has a name \ + (it must be unnamed)", + pattern.as_usize(), + ), + Duplicate { pattern, ref name } => write!( + f, + "duplicate capture group name '{}' found for pattern {}", + name, + pattern.as_usize(), + ), + } + } +} + +/// An iterator over capturing groups and their names for a specific pattern. +/// +/// This iterator is created by [`GroupInfo::pattern_names`]. +/// +/// The lifetime parameter `'a` refers to the lifetime of the `GroupInfo` +/// from which this iterator was created. +#[derive(Clone, Debug)] +pub struct GroupInfoPatternNames<'a> { + it: core::slice::Iter<'a, Option<Arc<str>>>, +} + +impl GroupInfoPatternNames<'static> { + fn empty() -> GroupInfoPatternNames<'static> { + GroupInfoPatternNames { it: [].iter() } + } +} + +impl<'a> Iterator for GroupInfoPatternNames<'a> { + type Item = Option<&'a str>; + + fn next(&mut self) -> Option<Option<&'a str>> { + self.it.next().map(|x| x.as_deref()) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.it.size_hint() + } + + fn count(self) -> usize { + self.it.count() + } +} + +impl<'a> ExactSizeIterator for GroupInfoPatternNames<'a> {} +impl<'a> core::iter::FusedIterator for GroupInfoPatternNames<'a> {} + +/// An iterator over capturing groups and their names for a `GroupInfo`. +/// +/// This iterator is created by [`GroupInfo::all_names`]. +/// +/// The lifetime parameter `'a` refers to the lifetime of the `GroupInfo` +/// from which this iterator was created. 
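+///
+/// # Example
+///
+/// An illustrative sketch of the `(pattern, group index, optional name)`
+/// items this iterator yields:
+///
+/// ```
+/// use regex_automata::{nfa::thompson::NFA, PatternID};
+///
+/// let nfa = NFA::new_many(&[r"(?P<foo>a)", r"(b)"])?;
+/// let names: Vec<_> = nfa.group_info().all_names().collect();
+/// assert_eq!(names, vec![
+///     // Every pattern gets an unnamed implicit group at index 0.
+///     (PatternID::must(0), 0, None),
+///     (PatternID::must(0), 1, Some("foo")),
+///     (PatternID::must(1), 0, None),
+///     (PatternID::must(1), 1, None),
+/// ]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```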
+#[derive(Debug)] +pub struct GroupInfoAllNames<'a> { + group_info: &'a GroupInfo, + pids: PatternIDIter, + current_pid: Option<PatternID>, + names: Option<core::iter::Enumerate<GroupInfoPatternNames<'a>>>, +} + +impl<'a> Iterator for GroupInfoAllNames<'a> { + type Item = (PatternID, usize, Option<&'a str>); + + fn next(&mut self) -> Option<(PatternID, usize, Option<&'a str>)> { + // If the group info has no captures, then we never have anything + // to yield. We need to consider this case explicitly (at time of + // writing) because 'pattern_capture_names' will panic if captures + // aren't enabled. + if self.group_info.0.index_to_name.is_empty() { + return None; + } + if self.current_pid.is_none() { + self.current_pid = Some(self.pids.next()?); + } + let pid = self.current_pid.unwrap(); + if self.names.is_none() { + self.names = Some(self.group_info.pattern_names(pid).enumerate()); + } + let (group_index, name) = match self.names.as_mut().unwrap().next() { + Some((group_index, name)) => (group_index, name), + None => { + self.current_pid = None; + self.names = None; + return self.next(); + } + }; + Some((pid, group_index, name)) + } +} diff --git a/vendor/regex-automata/src/util/determinize/mod.rs b/vendor/regex-automata/src/util/determinize/mod.rs new file mode 100644 index 0000000..ba32991 --- /dev/null +++ b/vendor/regex-automata/src/util/determinize/mod.rs @@ -0,0 +1,682 @@ +/*! +This module contains types and routines for implementing determinization. + +In this crate, there are at least two places where we implement +determinization: fully ahead-of-time compiled DFAs in the `dfa` module and +lazily compiled DFAs in the `hybrid` module. The stuff in this module +corresponds to the things that are in common between these implementations. + +There are three broad things that our implementations of determinization have +in common, as defined by this module: + +* The classification of start states. That is, whether we're dealing with +word boundaries, line boundaries, etc., is all the same. This also includes +the look-behind assertions that are satisfied by each starting state +classification. +* The representation of DFA states as sets of NFA states, including +convenience types for building these DFA states that are amenable to reusing +allocations. +* Routines for the "classical" parts of determinization: computing the +epsilon closure, tracking match states (with corresponding pattern IDs, since +we support multi-pattern finite automata) and, of course, computing the +transition function between states for units of input. + +I did consider a couple of alternatives to this particular form of code reuse: + +1. Don't do any code reuse. The problem here is that we *really* want both +forms of determinization to do exactly identical things when it comes to +their handling of NFA states. While our tests generally ensure this, the code +is tricky and large enough where not reusing code is a pretty big bummer. + +2. Implement all of determinization once and make it generic over fully +compiled DFAs and lazily compiled DFAs. While I didn't actually try this +approach, my instinct is that it would be more complex than is needed here. +And the interface required would be pretty hairy. Instead, I think splitting +it into logical sub-components works better. 
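+
+To make the "classical" part above concrete, here is a deliberately
+simplified, self-contained sketch of one determinization step: compute the
+epsilon closure of a start state and then the transition function for a
+single byte. It is only an illustration of the idea; the real code in this
+module additionally tracks match pattern IDs and look-around assertions,
+and uses sparse sets and `StateID`s instead of a `BTreeSet` of `usize`s.
+
+```
+use std::collections::BTreeSet;
+
+// A toy NFA state: a byte transition, an epsilon split or a match.
+enum State {
+    Byte { byte: u8, next: usize },
+    Split { alts: Vec<usize> },
+    Match,
+}
+
+// Collect the epsilon closure of `start` into `set`.
+fn closure(nfa: &[State], start: usize, set: &mut BTreeSet<usize>) {
+    let mut stack = vec![start];
+    while let Some(id) = stack.pop() {
+        if !set.insert(id) {
+            continue;
+        }
+        if let State::Split { alts } = &nfa[id] {
+            stack.extend(alts.iter().copied());
+        }
+    }
+}
+
+// Compute the next DFA state (a set of NFA states) for one input byte.
+fn next(nfa: &[State], current: &BTreeSet<usize>, input: u8) -> BTreeSet<usize> {
+    let mut out = BTreeSet::new();
+    for &id in current {
+        if let State::Byte { byte, next } = &nfa[id] {
+            if *byte == input {
+                closure(nfa, *next, &mut out);
+            }
+        }
+    }
+    out
+}
+
+// An NFA for `a(b|c)`: 0 -a-> 1, 1 -eps-> {2, 3}, 2 -b-> 4, 3 -c-> 4, 4 = match.
+let nfa = vec![
+    State::Byte { byte: b'a', next: 1 },
+    State::Split { alts: vec![2, 3] },
+    State::Byte { byte: b'b', next: 4 },
+    State::Byte { byte: b'c', next: 4 },
+    State::Match,
+];
+let mut start = BTreeSet::new();
+closure(&nfa, 0, &mut start);
+let after_a = next(&nfa, &start, b'a');
+assert_eq!(after_a, BTreeSet::from([1usize, 2, 3]));
+assert_eq!(next(&nfa, &after_a, b'b'), BTreeSet::from([4usize]));
+```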
+*/ + +use alloc::vec::Vec; + +pub(crate) use self::state::{ + State, StateBuilderEmpty, StateBuilderMatches, StateBuilderNFA, +}; + +use crate::{ + nfa::thompson, + util::{ + alphabet, + look::{Look, LookSet}, + primitives::StateID, + search::MatchKind, + sparse_set::{SparseSet, SparseSets}, + start::Start, + utf8, + }, +}; + +mod state; + +/// Compute the set of all reachable NFA states, including the full epsilon +/// closure, from a DFA state for a single unit of input. The set of reachable +/// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned +/// also includes any look-behind assertions satisfied by `unit`, in addition +/// to whether it is a match state. For multi-pattern DFAs, the builder will +/// also include the pattern IDs that match (in the order seen). +/// +/// `nfa` must be able to resolve any NFA state in `state` and any NFA state +/// reachable via the epsilon closure of any NFA state in `state`. `sparses` +/// must have capacity equivalent to `nfa.len()`. +/// +/// `match_kind` should correspond to the match semantics implemented by the +/// DFA being built. Generally speaking, for leftmost-first match semantics, +/// states that appear after the first NFA match state will not be included in +/// the `StateBuilderNFA` returned since they are impossible to visit. +/// +/// `sparses` is used as scratch space for NFA traversal. Other than their +/// capacity requirements (detailed above), there are no requirements on what's +/// contained within them (if anything). Similarly, what's inside of them once +/// this routine returns is unspecified. +/// +/// `stack` must have length 0. It is used as scratch space for depth first +/// traversal. After returning, it is guaranteed that `stack` will have length +/// 0. +/// +/// `state` corresponds to the current DFA state on which one wants to compute +/// the transition for the input `unit`. +/// +/// `empty_builder` corresponds to the builder allocation to use to produce a +/// complete `StateBuilderNFA` state. If the state is not needed (or is already +/// cached), then it can be cleared and reused without needing to create a new +/// `State`. The `StateBuilderNFA` state returned is final and ready to be +/// turned into a `State` if necessary. +pub(crate) fn next( + nfa: &thompson::NFA, + match_kind: MatchKind, + sparses: &mut SparseSets, + stack: &mut Vec<StateID>, + state: &State, + unit: alphabet::Unit, + empty_builder: StateBuilderEmpty, +) -> StateBuilderNFA { + sparses.clear(); + + // Whether the NFA is matched in reverse or not. We use this in some + // conditional logic for dealing with the exceptionally annoying CRLF-aware + // line anchors. + let rev = nfa.is_reverse(); + // The look-around matcher that our NFA is configured with. We don't + // actually use it to match look-around assertions, but we do need its + // configuration for constructing states consistent with how it matches. + let lookm = nfa.look_matcher(); + + // Put the NFA state IDs into a sparse set in case we need to + // re-compute their epsilon closure. + // + // Doing this state shuffling is technically not necessary unless some + // kind of look-around is used in the DFA. Some ad hoc experiments + // suggested that avoiding this didn't lead to much of an improvement, + // but perhaps more rigorous experimentation should be done. And in + // particular, avoiding this check requires some light refactoring of + // the code below. 
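+ //
+ // Throughout this function, 'sparses.set1' holds the NFA state IDs of the
+ // current DFA state (re-computed below if new assertions become relevant),
+ // while 'sparses.set2' accumulates the epsilon closure that will form the
+ // next DFA state.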
+ state.iter_nfa_state_ids(|nfa_id| { + sparses.set1.insert(nfa_id); + }); + + // Compute look-ahead assertions originating from the current state. Based + // on the input unit we're transitioning over, some additional set of + // assertions may be true. Thus, we re-compute this state's epsilon closure + // (but only if necessary). Notably, when we build a DFA state initially, + // we don't enable any look-ahead assertions because we don't know whether + // they're true or not at that point. + if !state.look_need().is_empty() { + // Add look-ahead assertions that are now true based on the current + // input unit. + let mut look_have = state.look_have().clone(); + match unit.as_u8() { + Some(b'\r') => { + if !rev || !state.is_half_crlf() { + look_have = look_have.insert(Look::EndCRLF); + } + } + Some(b'\n') => { + if rev || !state.is_half_crlf() { + look_have = look_have.insert(Look::EndCRLF); + } + } + Some(_) => {} + None => { + look_have = look_have + .insert(Look::End) + .insert(Look::EndLF) + .insert(Look::EndCRLF); + } + } + if unit.is_byte(lookm.get_line_terminator()) { + look_have = look_have.insert(Look::EndLF); + } + if state.is_half_crlf() + && ((rev && !unit.is_byte(b'\r')) + || (!rev && !unit.is_byte(b'\n'))) + { + look_have = look_have.insert(Look::StartCRLF); + } + if state.is_from_word() == unit.is_word_byte() { + look_have = look_have + .insert(Look::WordAsciiNegate) + .insert(Look::WordUnicodeNegate); + } else { + look_have = + look_have.insert(Look::WordAscii).insert(Look::WordUnicode); + } + if !unit.is_word_byte() { + look_have = look_have + .insert(Look::WordEndHalfAscii) + .insert(Look::WordEndHalfUnicode); + } + if state.is_from_word() && !unit.is_word_byte() { + look_have = look_have + .insert(Look::WordEndAscii) + .insert(Look::WordEndUnicode); + } else if !state.is_from_word() && unit.is_word_byte() { + look_have = look_have + .insert(Look::WordStartAscii) + .insert(Look::WordStartUnicode); + } + // If we have new assertions satisfied that are among the set of + // assertions that exist in this state (that is, just because we added + // an EndLF assertion above doesn't mean there is an EndLF conditional + // epsilon transition in this state), then we re-compute this state's + // epsilon closure using the updated set of assertions. + // + // Note that since our DFA states omit unconditional epsilon + // transitions, this check is necessary for correctness. If we re-did + // the epsilon closure below needlessly, it could change based on the + // fact that we omitted epsilon states originally. + if !look_have + .subtract(state.look_have()) + .intersect(state.look_need()) + .is_empty() + { + for nfa_id in sparses.set1.iter() { + epsilon_closure( + nfa, + nfa_id, + look_have, + stack, + &mut sparses.set2, + ); + } + sparses.swap(); + sparses.set2.clear(); + } + } + + // Convert our empty builder into one that can record assertions and match + // pattern IDs. + let mut builder = empty_builder.into_matches(); + // Set whether the StartLF look-behind assertion is true for this + // transition or not. The look-behind assertion for ASCII word boundaries + // is handled below. + if nfa.look_set_any().contains_anchor_line() + && unit.is_byte(lookm.get_line_terminator()) + { + // Why only handle StartLF here and not Start? That's because Start + // can only impact the starting state, which is special cased in + // start state handling. + builder.set_look_have(|have| have.insert(Look::StartLF)); + } + // We also need to add StartCRLF to our assertions too, if we can. 
This + // is unfortunately a bit more complicated, because it depends on the + // direction of the search. In the forward direction, ^ matches after a + // \n, but in the reverse direction, ^ only matches after a \r. (This is + // further complicated by the fact that reverse a regex means changing a ^ + // to a $ and vice versa.) + if nfa.look_set_any().contains_anchor_crlf() + && ((rev && unit.is_byte(b'\r')) || (!rev && unit.is_byte(b'\n'))) + { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } + // And also for the start-half word boundary assertions. As long as the + // look-behind byte is not a word char, then the assertions are satisfied. + if nfa.look_set_any().contains_word() && !unit.is_word_byte() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } + for nfa_id in sparses.set1.iter() { + match *nfa.state(nfa_id) { + thompson::State::Union { .. } + | thompson::State::BinaryUnion { .. } + | thompson::State::Fail + | thompson::State::Look { .. } + | thompson::State::Capture { .. } => {} + thompson::State::Match { pattern_id } => { + // Notice here that we are calling the NEW state a match + // state if the OLD state we are transitioning from + // contains an NFA match state. This is precisely how we + // delay all matches by one byte and also what therefore + // guarantees that starting states cannot be match states. + // + // If we didn't delay matches by one byte, then whether + // a DFA is a matching state or not would be determined + // by whether one of its own constituent NFA states + // was a match state. (And that would be done in + // 'add_nfa_states'.) + // + // Also, 'add_match_pattern_id' requires that callers never + // pass duplicative pattern IDs. We do in fact uphold that + // guarantee here, but it's subtle. In particular, a Thompson + // NFA guarantees that each pattern has exactly one match + // state. Moreover, since we're iterating over the NFA state + // IDs in a set, we are guarateed not to have any duplicative + // match states. Thus, it is impossible to add the same pattern + // ID more than once. + // + // N.B. We delay matches by 1 byte as a way to hack 1-byte + // look-around into DFA searches. This lets us support ^, $ + // and ASCII-only \b. The delay is also why we need a special + // "end-of-input" (EOI) sentinel and why we need to follow the + // EOI sentinel at the end of every search. This final EOI + // transition is necessary to report matches found at the end + // of a haystack. + builder.add_match_pattern_id(pattern_id); + if !match_kind.continue_past_first_match() { + break; + } + } + thompson::State::ByteRange { ref trans } => { + if trans.matches_unit(unit) { + epsilon_closure( + nfa, + trans.next, + builder.look_have(), + stack, + &mut sparses.set2, + ); + } + } + thompson::State::Sparse(ref sparse) => { + if let Some(next) = sparse.matches_unit(unit) { + epsilon_closure( + nfa, + next, + builder.look_have(), + stack, + &mut sparses.set2, + ); + } + } + thompson::State::Dense(ref dense) => { + if let Some(next) = dense.matches_unit(unit) { + epsilon_closure( + nfa, + next, + builder.look_have(), + stack, + &mut sparses.set2, + ); + } + } + } + } + // We only set the word byte if there's a word boundary look-around + // anywhere in this regex. Otherwise, there's no point in bloating the + // number of states if we don't have one. + // + // We also only set it when the state has a non-zero number of NFA states. 
+ // Otherwise, we could wind up with states that *should* be DEAD states + // but are otherwise distinct from DEAD states because of this look-behind + // assertion being set. While this can't technically impact correctness *in + // theory*, it can create pathological DFAs that consume input until EOI or + // a quit byte is seen. Consuming until EOI isn't a correctness problem, + // but a (serious) perf problem. Hitting a quit byte, however, could be a + // correctness problem since it could cause search routines to report an + // error instead of a detected match once the quit state is entered. (The + // search routine could be made to be a bit smarter by reporting a match + // if one was detected once it enters a quit state (and indeed, the search + // routines in this crate do just that), but it seems better to prevent + // these things by construction if possible.) + if !sparses.set2.is_empty() { + if nfa.look_set_any().contains_word() && unit.is_word_byte() { + builder.set_is_from_word(); + } + if nfa.look_set_any().contains_anchor_crlf() + && ((rev && unit.is_byte(b'\n')) || (!rev && unit.is_byte(b'\r'))) + { + builder.set_is_half_crlf(); + } + } + let mut builder_nfa = builder.into_nfa(); + add_nfa_states(nfa, &sparses.set2, &mut builder_nfa); + builder_nfa +} + +/// Compute the epsilon closure for the given NFA state. The epsilon closure +/// consists of all NFA state IDs, including `start_nfa_id`, that can be +/// reached from `start_nfa_id` without consuming any input. These state IDs +/// are written to `set` in the order they are visited, but only if they are +/// not already in `set`. `start_nfa_id` must be a valid state ID for the NFA +/// given. +/// +/// `look_have` consists of the satisfied assertions at the current +/// position. For conditional look-around epsilon transitions, these are +/// only followed if they are satisfied by `look_have`. +/// +/// `stack` must have length 0. It is used as scratch space for depth first +/// traversal. After returning, it is guaranteed that `stack` will have length +/// 0. +pub(crate) fn epsilon_closure( + nfa: &thompson::NFA, + start_nfa_id: StateID, + look_have: LookSet, + stack: &mut Vec<StateID>, + set: &mut SparseSet, +) { + assert!(stack.is_empty()); + // If this isn't an epsilon state, then the epsilon closure is always just + // itself, so there's no need to spin up the machinery below to handle it. + if !nfa.state(start_nfa_id).is_epsilon() { + set.insert(start_nfa_id); + return; + } + + stack.push(start_nfa_id); + while let Some(mut id) = stack.pop() { + // In many cases, we can avoid stack operations when an NFA state only + // adds one new state to visit. In that case, we just set our ID to + // that state and mush on. We only use the stack when an NFA state + // introduces multiple new states to visit. + loop { + // Insert this NFA state, and if it's already in the set and thus + // already visited, then we can move on to the next one. + if !set.insert(id) { + break; + } + match *nfa.state(id) { + thompson::State::ByteRange { .. } + | thompson::State::Sparse { .. } + | thompson::State::Dense { .. } + | thompson::State::Fail + | thompson::State::Match { .. 
} => break, + thompson::State::Look { look, next } => { + if !look_have.contains(look) { + break; + } + id = next; + } + thompson::State::Union { ref alternates } => { + id = match alternates.get(0) { + None => break, + Some(&id) => id, + }; + // We need to process our alternates in order to preserve + // match preferences, so put the earliest alternates closer + // to the top of the stack. + stack.extend(alternates[1..].iter().rev()); + } + thompson::State::BinaryUnion { alt1, alt2 } => { + id = alt1; + stack.push(alt2); + } + thompson::State::Capture { next, .. } => { + id = next; + } + } + } + } +} + +/// Add the NFA state IDs in the given `set` to the given DFA builder state. +/// The order in which states are added corresponds to the order in which they +/// were added to `set`. +/// +/// The DFA builder state given should already have its complete set of match +/// pattern IDs added (if any) and any look-behind assertions (StartLF, Start +/// and whether this state is being generated for a transition over a word byte +/// when applicable) that are true immediately prior to transitioning into this +/// state (via `builder.look_have()`). The match pattern IDs should correspond +/// to matches that occurred on the previous transition, since all matches are +/// delayed by one byte. The things that should _not_ be set are look-ahead +/// assertions (EndLF, End and whether the next byte is a word byte or not). +/// The builder state should also not have anything in `look_need` set, as this +/// routine will compute that for you. +/// +/// The given NFA should be able to resolve all identifiers in `set` to a +/// particular NFA state. Additionally, `set` must have capacity equivalent +/// to `nfa.len()`. +pub(crate) fn add_nfa_states( + nfa: &thompson::NFA, + set: &SparseSet, + builder: &mut StateBuilderNFA, +) { + for nfa_id in set.iter() { + match *nfa.state(nfa_id) { + thompson::State::ByteRange { .. } => { + builder.add_nfa_state_id(nfa_id); + } + thompson::State::Sparse { .. } => { + builder.add_nfa_state_id(nfa_id); + } + thompson::State::Dense { .. } => { + builder.add_nfa_state_id(nfa_id); + } + thompson::State::Look { look, .. } => { + builder.add_nfa_state_id(nfa_id); + builder.set_look_need(|need| need.insert(look)); + } + thompson::State::Union { .. } + | thompson::State::BinaryUnion { .. } => { + // Pure epsilon transitions don't need to be tracked as part + // of the DFA state. Tracking them is actually superfluous; + // they won't cause any harm other than making determinization + // slower. + // + // Why aren't these needed? Well, in an NFA, epsilon + // transitions are really just jumping points to other states. + // So once you hit an epsilon transition, the same set of + // resulting states always appears. Therefore, putting them in + // a DFA's set of ordered NFA states is strictly redundant. + // + // Look-around states are also epsilon transitions, but + // they are *conditional*. So their presence could be + // discriminatory, and thus, they are tracked above. + // + // But wait... why are epsilon states in our `set` in the first + // place? Why not just leave them out? They're in our `set` + // because it was generated by computing an epsilon closure, + // and we want to keep track of all states we visited to avoid + // re-visiting them. In exchange, we have to do this second + // iteration over our collected states to finalize our DFA + // state. 
In theory, we could avoid this second iteration if + // we maintained two sets during epsilon closure: the set of + // visited states (to avoid cycles) and the set of states that + // will actually be used to construct the next DFA state. + // + // Note that this optimization requires that we re-compute the + // epsilon closure to account for look-ahead in 'next' *only + // when necessary*. Namely, only when the set of look-around + // assertions changes and only when those changes are within + // the set of assertions that are needed in order to step + // through the closure correctly. Otherwise, if we re-do the + // epsilon closure needlessly, it could change based on the + // fact that we are omitting epsilon states here. + // + // ----- + // + // Welp, scratch the above. It turns out that recording these + // is in fact necessary to seemingly handle one particularly + // annoying case: when a conditional epsilon transition is + // put inside of a repetition operator. One specific case I + // ran into was the regex `(?:\b|%)+` on the haystack `z%`. + // The correct leftmost first matches are: [0, 0] and [1, 1]. + // But the DFA was reporting [0, 0] and [1, 2]. To understand + // why this happens, consider the NFA for the aforementioned + // regex: + // + // >000000: binary-union(4, 1) + // 000001: \x00-\xFF => 0 + // 000002: WordAscii => 5 + // 000003: % => 5 + // ^000004: binary-union(2, 3) + // 000005: binary-union(4, 6) + // 000006: MATCH(0) + // + // The problem here is that one of the DFA start states is + // going to consist of the NFA states [2, 3] by computing the + // epsilon closure of state 4. State 4 isn't included because + // we previously were not keeping track of union states. But + // only a subset of transitions out of this state will be able + // to follow WordAscii, and in those cases, the epsilon closure + // is redone. The only problem is that computing the epsilon + // closure from [2, 3] is different than computing the epsilon + // closure from [4]. In the former case, assuming the WordAscii + // assertion is satisfied, you get: [2, 3, 6]. In the latter + // case, you get: [2, 6, 3]. Notice that '6' is the match state + // and appears AFTER '3' in the former case. This leads to a + // preferential but incorrect match of '%' before returning + // a match. In the latter case, the match is preferred over + // continuing to accept the '%'. + // + // It almost feels like we might be able to fix the NFA states + // to avoid this, or to at least only keep track of union + // states where this actually matters, since in the vast + // majority of cases, this doesn't matter. + // + // Another alternative would be to define a new HIR property + // called "assertion is repeated anywhere" and compute it + // inductively over the entire pattern. If it happens anywhere, + // which is probably pretty rare, then we record union states. + // Otherwise we don't. + builder.add_nfa_state_id(nfa_id); + } + // Capture states we definitely do not need to record, since they + // are unconditional epsilon transitions with no branching. + thompson::State::Capture { .. } => {} + // It's not totally clear whether we need to record fail states or + // not, but we do so out of an abundance of caution. Since they are + // quite rare in practice, there isn't much cost to recording them. + thompson::State::Fail => { + builder.add_nfa_state_id(nfa_id); + } + thompson::State::Match { .. } => { + // Normally, the NFA match state doesn't actually need to + // be inside the DFA state. 
But since we delay matches by + // one byte, the matching DFA state corresponds to states + // that transition from the one we're building here. And + // the way we detect those cases is by looking for an NFA + // match state. See 'next' for how this is handled. + builder.add_nfa_state_id(nfa_id); + } + } + } + // If we know this state contains no look-around assertions, then + // there's no reason to track which look-around assertions were + // satisfied when this state was created. + if builder.look_need().is_empty() { + builder.set_look_have(|_| LookSet::empty()); + } +} + +/// Sets the appropriate look-behind assertions on the given state based on +/// this starting configuration. +pub(crate) fn set_lookbehind_from_start( + nfa: &thompson::NFA, + start: &Start, + builder: &mut StateBuilderMatches, +) { + let rev = nfa.is_reverse(); + let lineterm = nfa.look_matcher().get_line_terminator(); + let lookset = nfa.look_set_any(); + match *start { + Start::NonWordByte => { + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } + } + Start::WordByte => { + if lookset.contains_word() { + builder.set_is_from_word(); + } + } + Start::Text => { + if lookset.contains_anchor_haystack() { + builder.set_look_have(|have| have.insert(Look::Start)); + } + if lookset.contains_anchor_line() { + builder.set_look_have(|have| { + have.insert(Look::StartLF).insert(Look::StartCRLF) + }); + } + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } + } + Start::LineLF => { + if rev { + if lookset.contains_anchor_crlf() { + builder.set_is_half_crlf(); + } + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } + } else { + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } + } + if lookset.contains_anchor_line() && lineterm == b'\n' { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } + } + Start::LineCR => { + if lookset.contains_anchor_crlf() { + if rev { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } else { + builder.set_is_half_crlf(); + } + } + if lookset.contains_anchor_line() && lineterm == b'\r' { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } + } + Start::CustomLineTerminator => { + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } + // This is a bit of a tricky case, but if the line terminator was + // set to a word byte, then we also need to behave as if the start + // configuration is Start::WordByte. That is, we need to mark our + // state as having come from a word byte. 
+ if lookset.contains_word() { + if utf8::is_word_byte(lineterm) { + builder.set_is_from_word(); + } else { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } + } + } + } +} diff --git a/vendor/regex-automata/src/util/determinize/state.rs b/vendor/regex-automata/src/util/determinize/state.rs new file mode 100644 index 0000000..effa6f4 --- /dev/null +++ b/vendor/regex-automata/src/util/determinize/state.rs @@ -0,0 +1,907 @@ +/*! +This module defines a DFA state representation and builders for constructing +DFA states. + +This representation is specifically for use in implementations of NFA-to-DFA +conversion via powerset construction. (Also called "determinization" in this +crate.) + +The term "DFA state" is somewhat overloaded in this crate. In some cases, it +refers to the set of transitions over an alphabet for a particular state. In +other cases, it refers to a set of NFA states. The former is really about the +final representation of a state in a DFA's transition table, where as the +latter---what this module is focused on---is closer to an intermediate form +that is used to help eventually build the transition table. + +This module exports four types. All four types represent the same idea: an +ordered set of NFA states. This ordered set represents the epsilon closure of a +particular NFA state, where the "epsilon closure" is the set of NFA states that +can be transitioned to without consuming any input. i.e., Follow all of the NFA +state's epsilon transitions. In addition, this implementation of DFA states +cares about two other things: the ordered set of pattern IDs corresponding +to the patterns that match if the state is a match state, and the set of +look-behind assertions that were true when the state was created. + +The first, `State`, is a frozen representation of a state that cannot be +modified. It may be cheaply cloned without copying the state itself and can be +accessed safely from multiple threads simultaneously. This type is useful for +when one knows that the DFA state being constructed is distinct from any other +previously constructed states. Namely, powerset construction, in practice, +requires one to keep a cache of previously created DFA states. Otherwise, +the number of DFA states created in memory balloons to an impractically +large number. For this reason, equivalent states should endeavor to have an +equivalent byte-level representation. (In general, "equivalency" here means, +"equivalent assertions, pattern IDs and NFA state IDs." We do not require that +full DFA minimization be implemented here. This form of equivalency is only +surface deep and is more-or-less a practical necessity.) + +The other three types represent different phases in the construction of a +DFA state. Internally, these three types (and `State`) all use the same +byte-oriented representation. That means one can use any of the builder types +to check whether the state it represents already exists or not. If it does, +then there is no need to freeze it into a `State` (which requires an alloc and +a copy). Here are the three types described succinctly: + +* `StateBuilderEmpty` represents a state with no pattern IDs, no assertions +and no NFA states. Creating a `StateBuilderEmpty` performs no allocs. A +`StateBuilderEmpty` can only be used to query its underlying memory capacity, +or to convert into a builder for recording pattern IDs and/or assertions. 
+
+* `StateBuilderMatches` represents a state with zero or more pattern IDs, zero
+or more satisfied assertions and zero NFA state IDs. A `StateBuilderMatches`
+can only be used for adding pattern IDs and recording assertions.
+
+* `StateBuilderNFA` represents a state with zero or more pattern IDs, zero or
+more satisfied assertions and zero or more NFA state IDs. A `StateBuilderNFA`
+can only be used for adding NFA state IDs and recording some assertions.
+
+The expected flow here is to use the above builders to construct a candidate
+DFA state to check if it already exists. If it does, then there's no need to
+freeze it into a `State`. If it doesn't exist, then `StateBuilderNFA::to_state`
+can be called to freeze the builder into an immutable `State`. In either
+case, `clear` should be called on the builder to turn it back into a
+`StateBuilderEmpty` that reuses the underlying memory.
+
+The main purpose for splitting the builder into these distinct types is to
+make it impossible to do things like adding a pattern ID after adding an NFA
+state ID. Namely, this makes it simpler to use a space-and-time efficient
+binary representation for the state. (The format is documented on the `Repr`
+type below.) If we just used one type for everything, it would be possible for
+callers to use an incorrect interleaving of calls and thus result in a corrupt
+representation. I chose to use more type machinery to make this impossible to
+do because 1) determinization is itself pretty complex and it wouldn't be too
+hard to foul this up and 2) there isn't too much machinery involved and it's
+well contained.
+
+As an optimization, sometimes states won't have certain things set. For
+example, if the underlying NFA has no word boundary assertions, then there is
+no reason to set a state's look-behind assertion as to whether it was generated
+from a word byte or not. Similarly, if a state has no NFA states corresponding
+to look-around assertions, then there is no reason to set `look_have` to a
+non-empty set. Finally, callers usually omit unconditional epsilon transitions
+when adding NFA state IDs since they aren't discriminatory.
+
+Finally, the binary representation used by these states is, thankfully, not
+serialized anywhere. So any kind of change can be made with reckless abandon,
+as long as everything in this module agrees.
+*/
+
+use core::{convert::TryFrom, mem};
+
+use alloc::{sync::Arc, vec::Vec};
+
+use crate::util::{
+ int::{I32, U32},
+ look::LookSet,
+ primitives::{PatternID, StateID},
+ wire::{self, Endian},
+};
+
+/// A DFA state that, at its core, is represented by an ordered set of NFA
+/// states.
+///
+/// This type is intended to be used only in NFA-to-DFA conversion via powerset
+/// construction.
+///
+/// It may be cheaply cloned and accessed safely from multiple threads
+/// simultaneously.
+#[derive(Clone, Eq, Hash, PartialEq, PartialOrd, Ord)]
+pub(crate) struct State(Arc<[u8]>);
+
+/// This Borrow impl permits us to look up any state in a map by its byte
+/// representation. This is particularly convenient when one has a StateBuilder
+/// and wants to see if a correspondingly equivalent state already exists. If
+/// one does exist, then we can reuse the allocation required by StateBuilder
+/// without having to convert it into a State first.
+impl core::borrow::Borrow<[u8]> for State { + fn borrow(&self) -> &[u8] { + &*self.0 + } +} + +impl core::fmt::Debug for State { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple("State").field(&self.repr()).finish() + } +} + +/// For docs on these routines, see the internal Repr and ReprVec types below. +impl State { + pub(crate) fn dead() -> State { + StateBuilderEmpty::new().into_matches().into_nfa().to_state() + } + + pub(crate) fn is_match(&self) -> bool { + self.repr().is_match() + } + + pub(crate) fn is_from_word(&self) -> bool { + self.repr().is_from_word() + } + + pub(crate) fn is_half_crlf(&self) -> bool { + self.repr().is_half_crlf() + } + + pub(crate) fn look_have(&self) -> LookSet { + self.repr().look_have() + } + + pub(crate) fn look_need(&self) -> LookSet { + self.repr().look_need() + } + + pub(crate) fn match_len(&self) -> usize { + self.repr().match_len() + } + + pub(crate) fn match_pattern(&self, index: usize) -> PatternID { + self.repr().match_pattern(index) + } + + pub(crate) fn match_pattern_ids(&self) -> Option<Vec<PatternID>> { + self.repr().match_pattern_ids() + } + + #[cfg(all(test, not(miri)))] + pub(crate) fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, f: F) { + self.repr().iter_match_pattern_ids(f) + } + + pub(crate) fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, f: F) { + self.repr().iter_nfa_state_ids(f) + } + + pub(crate) fn memory_usage(&self) -> usize { + self.0.len() + } + + fn repr(&self) -> Repr<'_> { + Repr(&*self.0) + } +} + +/// A state builder that represents an empty state. +/// +/// This is a useful "initial condition" for state construction. It has no +/// NFA state IDs, no assertions set and no pattern IDs. No allocations are +/// made when new() is called. Its main use is for being converted into a +/// builder that can capture assertions and pattern IDs. +#[derive(Clone, Debug)] +pub(crate) struct StateBuilderEmpty(Vec<u8>); + +/// For docs on these routines, see the internal Repr and ReprVec types below. +impl StateBuilderEmpty { + pub(crate) fn new() -> StateBuilderEmpty { + StateBuilderEmpty(alloc::vec![]) + } + + pub(crate) fn into_matches(mut self) -> StateBuilderMatches { + self.0.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0]); + StateBuilderMatches(self.0) + } + + fn clear(&mut self) { + self.0.clear(); + } + + pub(crate) fn capacity(&self) -> usize { + self.0.capacity() + } +} + +/// A state builder that collects assertions and pattern IDs. +/// +/// When collecting pattern IDs is finished, this can be converted into a +/// builder that collects NFA state IDs. +#[derive(Clone)] +pub(crate) struct StateBuilderMatches(Vec<u8>); + +impl core::fmt::Debug for StateBuilderMatches { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple("StateBuilderMatches").field(&self.repr()).finish() + } +} + +/// For docs on these routines, see the internal Repr and ReprVec types below. 
+impl StateBuilderMatches {
+ pub(crate) fn into_nfa(mut self) -> StateBuilderNFA {
+ self.repr_vec().close_match_pattern_ids();
+ StateBuilderNFA { repr: self.0, prev_nfa_state_id: StateID::ZERO }
+ }
+
+ pub(crate) fn set_is_from_word(&mut self) {
+ self.repr_vec().set_is_from_word()
+ }
+
+ pub(crate) fn set_is_half_crlf(&mut self) {
+ self.repr_vec().set_is_half_crlf()
+ }
+
+ pub(crate) fn look_have(&self) -> LookSet {
+ LookSet::read_repr(&self.0[1..])
+ }
+
+ pub(crate) fn set_look_have(
+ &mut self,
+ set: impl FnMut(LookSet) -> LookSet,
+ ) {
+ self.repr_vec().set_look_have(set)
+ }
+
+ pub(crate) fn add_match_pattern_id(&mut self, pid: PatternID) {
+ self.repr_vec().add_match_pattern_id(pid)
+ }
+
+ fn repr(&self) -> Repr<'_> {
+ Repr(&self.0)
+ }
+
+ fn repr_vec(&mut self) -> ReprVec<'_> {
+ ReprVec(&mut self.0)
+ }
+}
+
+/// A state builder that collects some assertions and NFA state IDs.
+///
+/// When collecting NFA state IDs is finished, this can be used to build a
+/// `State` if necessary.
+///
+/// When done with building a state (regardless of whether it got kept or not),
+/// it's usually a good idea to call `clear` to get an empty builder back so
+/// that it can be reused to build the next state.
+#[derive(Clone)]
+pub(crate) struct StateBuilderNFA {
+ repr: Vec<u8>,
+ prev_nfa_state_id: StateID,
+}
+
+impl core::fmt::Debug for StateBuilderNFA {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_tuple("StateBuilderNFA").field(&self.repr()).finish()
+ }
+}
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl StateBuilderNFA {
+ pub(crate) fn to_state(&self) -> State {
+ State(Arc::from(&*self.repr))
+ }
+
+ pub(crate) fn clear(self) -> StateBuilderEmpty {
+ let mut builder = StateBuilderEmpty(self.repr);
+ builder.clear();
+ builder
+ }
+
+ pub(crate) fn look_need(&self) -> LookSet {
+ self.repr().look_need()
+ }
+
+ pub(crate) fn set_look_have(
+ &mut self,
+ set: impl FnMut(LookSet) -> LookSet,
+ ) {
+ self.repr_vec().set_look_have(set)
+ }
+
+ pub(crate) fn set_look_need(
+ &mut self,
+ set: impl FnMut(LookSet) -> LookSet,
+ ) {
+ self.repr_vec().set_look_need(set)
+ }
+
+ pub(crate) fn add_nfa_state_id(&mut self, sid: StateID) {
+ ReprVec(&mut self.repr)
+ .add_nfa_state_id(&mut self.prev_nfa_state_id, sid)
+ }
+
+ pub(crate) fn as_bytes(&self) -> &[u8] {
+ &self.repr
+ }
+
+ fn repr(&self) -> Repr<'_> {
+ Repr(&self.repr)
+ }
+
+ fn repr_vec(&mut self) -> ReprVec<'_> {
+ ReprVec(&mut self.repr)
+ }
+}
+
+/// Repr is a read-only view into the representation of a DFA state.
+///
+/// Primarily, a Repr is how we achieve DRY: we implement decoding the format
+/// in one place, and then use a Repr to implement the various methods on the
+/// public state types.
+///
+/// The format is as follows:
+///
+/// The first nine bytes correspond to three bitsets.
+///
+/// Byte 0 is a bitset corresponding to miscellaneous flags associated with the
+/// state. Bit 0 is set to 1 if the state is a match state. Bit 1 is set to 1
+/// if the state has pattern IDs explicitly written to it. (This is a flag that
+/// is not meant to be set by determinization, but rather, is used as part of
+/// an internal space-saving optimization.) Bit 2 is set to 1 if the state was
+/// generated by a transition over a "word" byte. (Callers may not always set
+/// this. For example, if the NFA has no word boundary assertion, then needing
+/// to track whether a state came from a word byte or not is superfluous and
+/// wasteful.) Bit 3 is set to 1 if the state was generated by a transition
+/// from a `\r` (forward search) or a `\n` (reverse search) when CRLF mode is
+/// enabled.
+///
+/// Bytes 1..5 correspond to the look-behind assertions that were satisfied
+/// by the transition that created this state. (Look-ahead assertions are not
+/// tracked as part of states. Instead, these are applied by re-computing the
+/// epsilon closure of a state when computing the transition function. See
+/// `next` in the parent module.)
+///
+/// Bytes 5..9 correspond to the set of look-around assertions (including both
+/// look-behind and look-ahead) that appear somewhere in this state's set of
+/// NFA state IDs. This is used to determine whether this state's epsilon
+/// closure should be re-computed when computing the transition function.
+/// Namely, look-around assertions are "just" conditional epsilon transitions,
+/// so if there are new assertions available when computing the transition
+/// function, we should only re-compute the epsilon closure if those new
+/// assertions are relevant to this particular state.
+///
+/// Bytes 9..13 correspond to a 32-bit native-endian encoded integer
+/// corresponding to the number of patterns encoded in this state. If the state
+/// is not a match state (byte 0 bit 0 is 0) or if its only pattern ID is
+/// PatternID::ZERO, then no integer is encoded at this position. Instead, byte
+/// offset 9 is the position at which the first NFA state ID is encoded.
+///
+/// For a match state with at least one non-ZERO pattern ID, the next bytes
+/// correspond to a sequence of 32-bit native endian encoded integers that
+/// represent each pattern ID, in order, that this match state represents.
+///
+/// After the pattern IDs (if any), NFA state IDs are delta encoded as
+/// varints.[1] The first NFA state ID is encoded as itself, and each
+/// subsequent NFA state ID is encoded as the difference between itself and the
+/// previous NFA state ID.
+///
+/// [1] - https://developers.google.com/protocol-buffers/docs/encoding#varints
+struct Repr<'a>(&'a [u8]);
+
+impl<'a> Repr<'a> {
+ /// Returns true if and only if this is a match state.
+ ///
+ /// If callers have added pattern IDs to this state, then callers MUST set
+ /// this state as a match state explicitly. However, as a special case, a
+ /// state that is marked as a match state but has no pattern IDs is treated
+ /// as if it had a single pattern ID equivalent to PatternID::ZERO.
+ fn is_match(&self) -> bool {
+ self.0[0] & (1 << 0) > 0
+ }
+
+ /// Returns true if and only if this state has had at least one pattern
+ /// ID added to it.
+ ///
+ /// This is an internal-only flag that permits the representation to save
+ /// space in the common case of an NFA with one pattern in it. In that
+ /// case, a match state can only ever have exactly one pattern ID:
+ /// PatternID::ZERO. So there's no need to represent it.
+ fn has_pattern_ids(&self) -> bool {
+ self.0[0] & (1 << 1) > 0
+ }
+
+ /// Returns true if and only if this state is marked as having been created
+ /// from a transition over a word byte. This is useful for checking whether
+ /// a word boundary assertion is true or not, which requires look-behind
+ /// (whether the current state came from a word byte or not) and look-ahead
+ /// (whether the transition byte is a word byte or not).
+ /// + /// Since states with this set are distinct from states that don't have + /// this set (even if they are otherwise equivalent), callers should not + /// set this assertion unless the underlying NFA has at least one word + /// boundary assertion somewhere. Otherwise, a superfluous number of states + /// may be created. + fn is_from_word(&self) -> bool { + self.0[0] & (1 << 2) > 0 + } + + /// Returns true if and only if this state is marked as being inside of a + /// CRLF terminator. In the forward direction, this means the state was + /// created after seeing a `\r`. In the reverse direction, this means the + /// state was created after seeing a `\n`. + fn is_half_crlf(&self) -> bool { + self.0[0] & (1 << 3) > 0 + } + + /// The set of look-behind assertions that were true in the transition that + /// created this state. + /// + /// Generally, this should be empty if 'look_need' is empty, since there is + /// no reason to track which look-behind assertions are true if the state + /// has no conditional epsilon transitions. + /// + /// Satisfied look-ahead assertions are not tracked in states. Instead, + /// these are re-computed on demand via epsilon closure when computing the + /// transition function. + fn look_have(&self) -> LookSet { + LookSet::read_repr(&self.0[1..]) + } + + /// The set of look-around (both behind and ahead) assertions that appear + /// at least once in this state's set of NFA states. + /// + /// This is used to determine whether the epsilon closure needs to be + /// re-computed when computing the transition function. Namely, if the + /// state has no conditional epsilon transitions, then there is no need + /// to re-compute the epsilon closure. + fn look_need(&self) -> LookSet { + LookSet::read_repr(&self.0[5..]) + } + + /// Returns the total number of match pattern IDs in this state. + /// + /// If this state is not a match state, then this always returns 0. + fn match_len(&self) -> usize { + if !self.is_match() { + return 0; + } else if !self.has_pattern_ids() { + 1 + } else { + self.encoded_pattern_len() + } + } + + /// Returns the pattern ID for this match state at the given index. + /// + /// If the given index is greater than or equal to `match_len()` for this + /// state, then this could panic or return incorrect results. + fn match_pattern(&self, index: usize) -> PatternID { + if !self.has_pattern_ids() { + PatternID::ZERO + } else { + let offset = 13 + index * PatternID::SIZE; + // This is OK since we only ever serialize valid PatternIDs to + // states. + wire::read_pattern_id_unchecked(&self.0[offset..]).0 + } + } + + /// Returns a copy of all match pattern IDs in this state. If this state + /// is not a match state, then this returns None. + fn match_pattern_ids(&self) -> Option<Vec<PatternID>> { + if !self.is_match() { + return None; + } + let mut pids = alloc::vec![]; + self.iter_match_pattern_ids(|pid| pids.push(pid)); + Some(pids) + } + + /// Calls the given function on every pattern ID in this state. + fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, mut f: F) { + if !self.is_match() { + return; + } + // As an optimization for a very common case, when this is a match + // state for an NFA with only one pattern, we don't actually write the + // pattern ID to the state representation. Instead, we know it must + // be there since it is the only possible choice. 
+ if !self.has_pattern_ids() { + f(PatternID::ZERO); + return; + } + let mut pids = &self.0[13..self.pattern_offset_end()]; + while !pids.is_empty() { + let pid = wire::read_u32(pids); + pids = &pids[PatternID::SIZE..]; + // This is OK since we only ever serialize valid PatternIDs to + // states. And since pattern IDs can never exceed a usize, the + // unwrap is OK. + f(PatternID::new_unchecked(usize::try_from(pid).unwrap())); + } + } + + /// Calls the given function on every NFA state ID in this state. + fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, mut f: F) { + let mut sids = &self.0[self.pattern_offset_end()..]; + let mut prev = 0i32; + while !sids.is_empty() { + let (delta, nr) = read_vari32(sids); + sids = &sids[nr..]; + let sid = prev + delta; + prev = sid; + // This is OK since we only ever serialize valid StateIDs to + // states. And since state IDs can never exceed an isize, they must + // always be able to fit into a usize, and thus cast is OK. + f(StateID::new_unchecked(sid.as_usize())) + } + } + + /// Returns the offset into this state's representation where the pattern + /// IDs end and the NFA state IDs begin. + fn pattern_offset_end(&self) -> usize { + let encoded = self.encoded_pattern_len(); + if encoded == 0 { + return 9; + } + // This arithmetic is OK since we were able to address this many bytes + // when writing to the state, thus, it must fit into a usize. + encoded.checked_mul(4).unwrap().checked_add(13).unwrap() + } + + /// Returns the total number of *encoded* pattern IDs in this state. + /// + /// This may return 0 even when this is a match state, since the pattern + /// ID `PatternID::ZERO` is not encoded when it's the only pattern ID in + /// the match state (the overwhelming common case). + fn encoded_pattern_len(&self) -> usize { + if !self.has_pattern_ids() { + return 0; + } + // This unwrap is OK since the total number of patterns is always + // guaranteed to fit into a usize. + usize::try_from(wire::read_u32(&self.0[9..13])).unwrap() + } +} + +impl<'a> core::fmt::Debug for Repr<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut nfa_ids = alloc::vec![]; + self.iter_nfa_state_ids(|sid| nfa_ids.push(sid)); + f.debug_struct("Repr") + .field("is_match", &self.is_match()) + .field("is_from_word", &self.is_from_word()) + .field("is_half_crlf", &self.is_half_crlf()) + .field("look_have", &self.look_have()) + .field("look_need", &self.look_need()) + .field("match_pattern_ids", &self.match_pattern_ids()) + .field("nfa_state_ids", &nfa_ids) + .finish() + } +} + +/// ReprVec is a write-only view into the representation of a DFA state. +/// +/// See Repr for more details on the purpose of this type and also the format. +/// +/// Note that not all possible combinations of methods may be called. This is +/// precisely what the various StateBuilder types encapsulate: they only +/// permit valid combinations via Rust's linear typing. +struct ReprVec<'a>(&'a mut Vec<u8>); + +impl<'a> ReprVec<'a> { + /// Set this state as a match state. + /// + /// This should not be exposed explicitly outside of this module. It is + /// set automatically when a pattern ID is added. + fn set_is_match(&mut self) { + self.0[0] |= 1 << 0; + } + + /// Set that this state has pattern IDs explicitly written to it. + /// + /// This should not be exposed explicitly outside of this module. This is + /// used internally as a space saving optimization. 
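Stepping back to `iter_nfa_state_ids` above: NFA state IDs are stored as deltas from the previously written ID. A minimal sketch of that scheme follows (illustrative only, with made-up function names and the varint layer omitted; the varint routines appear further down in this file):

```rust
// Illustrative only: delta encoding of NFA state IDs, shown with plain i32s.
// The real representation additionally writes each delta as a zig-zag varint.
fn encode_deltas(ids: &[i32]) -> Vec<i32> {
    let mut prev = 0;
    ids.iter()
        .map(|&id| {
            let delta = id - prev;
            prev = id;
            delta
        })
        .collect()
}

fn decode_deltas(deltas: &[i32]) -> Vec<i32> {
    let mut prev = 0;
    deltas
        .iter()
        .map(|&delta| {
            prev += delta;
            prev
        })
        .collect()
}

fn main() {
    let ids = vec![5, 7, 8, 30];
    let deltas = encode_deltas(&ids);
    assert_eq!(deltas, vec![5, 2, 1, 22]);
    assert_eq!(decode_deltas(&deltas), ids);
}
```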
Namely, if the state + /// is a match state but does not have any pattern IDs written to it, + /// then it is automatically inferred to have a pattern ID of ZERO. + fn set_has_pattern_ids(&mut self) { + self.0[0] |= 1 << 1; + } + + /// Set this state as being built from a transition over a word byte. + /// + /// Setting this is only necessary when one needs to deal with word + /// boundary assertions. Therefore, if the underlying NFA has no word + /// boundary assertions, callers should not set this. + fn set_is_from_word(&mut self) { + self.0[0] |= 1 << 2; + } + + /// Set this state as having seen half of a CRLF terminator. + /// + /// In the forward direction, this should be set when a `\r` has been seen. + /// In the reverse direction, this should be set when a `\n` has been seen. + fn set_is_half_crlf(&mut self) { + self.0[0] |= 1 << 3; + } + + /// The set of look-behind assertions that were true in the transition that + /// created this state. + fn look_have(&self) -> LookSet { + self.repr().look_have() + } + + /// The set of look-around (both behind and ahead) assertions that appear + /// at least once in this state's set of NFA states. + fn look_need(&self) -> LookSet { + self.repr().look_need() + } + + /// Mutate the set of look-behind assertions that were true in the + /// transition that created this state. + fn set_look_have(&mut self, mut set: impl FnMut(LookSet) -> LookSet) { + set(self.look_have()).write_repr(&mut self.0[1..]); + } + + /// Mutate the set of look-around (both behind and ahead) assertions that + /// appear at least once in this state's set of NFA states. + fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) { + set(self.look_need()).write_repr(&mut self.0[5..]); + } + + /// Add a pattern ID to this state. All match states must have at least + /// one pattern ID associated with it. + /// + /// Callers must never add duplicative pattern IDs. + /// + /// The order in which patterns are added must correspond to the order + /// in which patterns are reported as matches. + fn add_match_pattern_id(&mut self, pid: PatternID) { + // As a (somewhat small) space saving optimization, in the case where + // a matching state has exactly one pattern ID, PatternID::ZERO, we do + // not write either the pattern ID or the number of patterns encoded. + // Instead, all we do is set the 'is_match' bit on this state. Overall, + // this saves 8 bytes per match state for the overwhelming majority of + // match states. + // + // In order to know whether pattern IDs need to be explicitly read or + // not, we use another internal-only bit, 'has_pattern_ids', to + // indicate whether they have been explicitly written or not. + if !self.repr().has_pattern_ids() { + if pid == PatternID::ZERO { + self.set_is_match(); + return; + } + // Make room for 'close_match_pattern_ids' to write the total + // number of pattern IDs written. + self.0.extend(core::iter::repeat(0).take(PatternID::SIZE)); + self.set_has_pattern_ids(); + // If this was already a match state, then the only way that's + // possible when the state doesn't have pattern IDs is if + // PatternID::ZERO was added by the caller previously. In this + // case, we are now adding a non-ZERO pattern ID after it, in + // which case, we want to make sure to represent ZERO explicitly + // now. + if self.repr().is_match() { + write_u32(self.0, 0) + } else { + // Otherwise, just make sure the 'is_match' bit is set. 
+ self.set_is_match(); + } + } + write_u32(self.0, pid.as_u32()); + } + + /// Indicate that no more pattern IDs will be added to this state. + /// + /// Once this is called, callers must not call it or 'add_match_pattern_id' + /// again. + /// + /// This should not be exposed explicitly outside of this module. It + /// should be called only when converting a StateBuilderMatches into a + /// StateBuilderNFA. + fn close_match_pattern_ids(&mut self) { + // If we never wrote any pattern IDs, then there's nothing to do here. + if !self.repr().has_pattern_ids() { + return; + } + let patsize = PatternID::SIZE; + let pattern_bytes = self.0.len() - 13; + // Every pattern ID uses 4 bytes, so number of bytes should be + // divisible by 4. + assert_eq!(pattern_bytes % patsize, 0); + // This unwrap is OK since we are guaranteed that the maximum number + // of possible patterns fits into a u32. + let count32 = u32::try_from(pattern_bytes / patsize).unwrap(); + wire::NE::write_u32(count32, &mut self.0[9..13]); + } + + /// Add an NFA state ID to this state. The order in which NFA states are + /// added matters. It is the caller's responsibility to ensure that + /// duplicate NFA state IDs are not added. + fn add_nfa_state_id(&mut self, prev: &mut StateID, sid: StateID) { + let delta = sid.as_i32() - prev.as_i32(); + write_vari32(self.0, delta); + *prev = sid; + } + + /// Return a read-only view of this state's representation. + fn repr(&self) -> Repr<'_> { + Repr(self.0.as_slice()) + } +} + +/// Write a signed 32-bit integer using zig-zag encoding. +/// +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn write_vari32(data: &mut Vec<u8>, n: i32) { + let mut un = n.to_bits() << 1; + if n < 0 { + un = !un; + } + write_varu32(data, un) +} + +/// Read a signed 32-bit integer using zig-zag encoding. Also, return the +/// number of bytes read. +/// +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn read_vari32(data: &[u8]) -> (i32, usize) { + let (un, i) = read_varu32(data); + let mut n = i32::from_bits(un >> 1); + if un & 1 != 0 { + n = !n; + } + (n, i) +} + +/// Write an unsigned 32-bit integer as a varint. In essence, `n` is written +/// as a sequence of bytes where all bytes except for the last one have the +/// most significant bit set. The least significant 7 bits correspond to the +/// actual bits of `n`. So in the worst case, a varint uses 5 bytes, but in +/// very common cases, it uses fewer than 4. +/// +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn write_varu32(data: &mut Vec<u8>, mut n: u32) { + while n >= 0b1000_0000 { + data.push(n.low_u8() | 0b1000_0000); + n >>= 7; + } + data.push(n.low_u8()); +} + +/// Read an unsigned 32-bit varint. Also, return the number of bytes read. +/// +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn read_varu32(data: &[u8]) -> (u32, usize) { + // N.B. We can assume correctness here since we know that all varuints are + // written with write_varu32. Hence, the 'as' uses and unchecked arithmetic + // is all okay. + let mut n: u32 = 0; + let mut shift: u32 = 0; + for (i, &b) in data.iter().enumerate() { + if b < 0b1000_0000 { + return (n | (u32::from(b) << shift), i + 1); + } + n |= (u32::from(b) & 0b0111_1111) << shift; + shift += 7; + } + (0, 0) +} + +/// Push a native-endian encoded `n` on to `dst`. 
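Before the last helper (`write_u32`), here is a self-contained round-trip sketch of the zig-zag varint scheme defined just above (illustrative only; it uses plain casts instead of the crate's integer helper traits, but follows the same logic):

```rust
// Illustrative only: the zig-zag varint encoding used for NFA state ID
// deltas, written without the crate's low_u8/to_bits helpers.
fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
    while n >= 0b1000_0000 {
        data.push((n as u8) | 0b1000_0000);
        n >>= 7;
    }
    data.push(n as u8);
}

fn write_vari32(data: &mut Vec<u8>, n: i32) {
    // Zig-zag: small negative and positive values map to small unsigned ones.
    let mut un = (n as u32) << 1;
    if n < 0 {
        un = !un;
    }
    write_varu32(data, un);
}

fn read_varu32(data: &[u8]) -> (u32, usize) {
    let (mut n, mut shift) = (0u32, 0u32);
    for (i, &b) in data.iter().enumerate() {
        if b < 0b1000_0000 {
            return (n | (u32::from(b) << shift), i + 1);
        }
        n |= (u32::from(b) & 0b0111_1111) << shift;
        shift += 7;
    }
    (0, 0)
}

fn read_vari32(data: &[u8]) -> (i32, usize) {
    let (un, i) = read_varu32(data);
    let mut n = (un >> 1) as i32;
    if un & 1 != 0 {
        n = !n;
    }
    (n, i)
}

fn main() {
    for &n in &[0i32, 1, -1, 63, -64, 300, -300, i32::MAX, i32::MIN] {
        let mut buf = vec![];
        write_vari32(&mut buf, n);
        let (got, nread) = read_vari32(&buf);
        assert_eq!((got, nread), (n, buf.len()));
    }
}
```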
+fn write_u32(dst: &mut Vec<u8>, n: u32) { + use crate::util::wire::NE; + + let start = dst.len(); + dst.extend(core::iter::repeat(0).take(mem::size_of::<u32>())); + NE::write_u32(n, &mut dst[start..]); +} + +#[cfg(test)] +mod tests { + use alloc::vec; + + use quickcheck::quickcheck; + + use super::*; + + #[cfg(not(miri))] + quickcheck! { + fn prop_state_read_write_nfa_state_ids(sids: Vec<StateID>) -> bool { + // Builders states do not permit duplicate IDs. + let sids = dedup_state_ids(sids); + + let mut b = StateBuilderEmpty::new().into_matches().into_nfa(); + for &sid in &sids { + b.add_nfa_state_id(sid); + } + let s = b.to_state(); + let mut got = vec![]; + s.iter_nfa_state_ids(|sid| got.push(sid)); + got == sids + } + + fn prop_state_read_write_pattern_ids(pids: Vec<PatternID>) -> bool { + // Builders states do not permit duplicate IDs. + let pids = dedup_pattern_ids(pids); + + let mut b = StateBuilderEmpty::new().into_matches(); + for &pid in &pids { + b.add_match_pattern_id(pid); + } + let s = b.into_nfa().to_state(); + let mut got = vec![]; + s.iter_match_pattern_ids(|pid| got.push(pid)); + got == pids + } + + fn prop_state_read_write_nfa_state_and_pattern_ids( + sids: Vec<StateID>, + pids: Vec<PatternID> + ) -> bool { + // Builders states do not permit duplicate IDs. + let sids = dedup_state_ids(sids); + let pids = dedup_pattern_ids(pids); + + let mut b = StateBuilderEmpty::new().into_matches(); + for &pid in &pids { + b.add_match_pattern_id(pid); + } + + let mut b = b.into_nfa(); + for &sid in &sids { + b.add_nfa_state_id(sid); + } + + let s = b.to_state(); + let mut got_pids = vec![]; + s.iter_match_pattern_ids(|pid| got_pids.push(pid)); + let mut got_sids = vec![]; + s.iter_nfa_state_ids(|sid| got_sids.push(sid)); + got_pids == pids && got_sids == sids + } + } + + quickcheck! { + fn prop_read_write_varu32(n: u32) -> bool { + let mut buf = vec![]; + write_varu32(&mut buf, n); + let (got, nread) = read_varu32(&buf); + nread == buf.len() && got == n + } + + fn prop_read_write_vari32(n: i32) -> bool { + let mut buf = vec![]; + write_vari32(&mut buf, n); + let (got, nread) = read_vari32(&buf); + nread == buf.len() && got == n + } + } + + #[cfg(not(miri))] + fn dedup_state_ids(sids: Vec<StateID>) -> Vec<StateID> { + let mut set = alloc::collections::BTreeSet::new(); + let mut deduped = vec![]; + for sid in sids { + if set.contains(&sid) { + continue; + } + set.insert(sid); + deduped.push(sid); + } + deduped + } + + #[cfg(not(miri))] + fn dedup_pattern_ids(pids: Vec<PatternID>) -> Vec<PatternID> { + let mut set = alloc::collections::BTreeSet::new(); + let mut deduped = vec![]; + for pid in pids { + if set.contains(&pid) { + continue; + } + set.insert(pid); + deduped.push(pid); + } + deduped + } +} diff --git a/vendor/regex-automata/src/util/empty.rs b/vendor/regex-automata/src/util/empty.rs new file mode 100644 index 0000000..e16af3b --- /dev/null +++ b/vendor/regex-automata/src/util/empty.rs @@ -0,0 +1,265 @@ +/*! +This module provides helper routines for dealing with zero-width matches. + +The main problem being solved here is this: + +1. The caller wants to search something that they know is valid UTF-8, such +as a Rust `&str`. +2. The regex used by the caller can match the empty string. For example, `a*`. +3. The caller should never get match offsets returned that occur within the +encoding of a UTF-8 codepoint. It is logically incorrect, and also means that, +e.g., slicing the `&str` at those offsets will lead to a panic. 
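To make point (3) concrete, here is a tiny standalone illustration (not part of the vendored file) of why an offset inside a codepoint is unusable for a caller holding a `&str`:

```rust
fn main() {
    let haystack = "☃"; // UTF-8: \xE2\x98\x83
    // Offsets 0 and 3 are codepoint boundaries; 1 and 2 are not.
    let boundaries: Vec<bool> =
        (0..=haystack.len()).map(|i| haystack.is_char_boundary(i)).collect();
    assert_eq!(boundaries, vec![true, false, false, true]);
    // A reported empty match at [1, 1] could not even be sliced:
    // `&haystack[1..1]` panics because 1 is not a char boundary.
}
```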
+ +So the question here is, how do we prevent the caller from getting match +offsets that split a codepoint? For example, strictly speaking, the regex `a*` +matches `☃` at the positions `[0, 0]`, `[1, 1]`, `[2, 2]` and `[3, 3]` since +the UTF-8 encoding of `☃` is `\xE2\x98\x83`. In particular, the `NFA` that +underlies all of the matching engines in this crate doesn't have anything in +its state graph that prevents matching between UTF-8 code units. Indeed, any +engine derived from the `NFA` will match at those positions by virtue of the +fact that the `NFA` is byte oriented. That is, its transitions are defined over +bytes and the matching engines work by proceeding one byte at a time. + +(An alternative architecture would be to define the transitions in an `NFA` +over codepoints, or `char`. And then make the matching engines proceed by +decoding one codepoint at a time. This is a viable strategy, but it doesn't +work for DFA matching engines because designing a fast and memory efficient +transition table for an alphabet as large as Unicode is quite difficult. More +to the point, the top-level `regex` crate supports matching on arbitrary bytes +when Unicode mode is disabled and one is searching a `&[u8]`. So in that case, +you can't just limit yourself to decoding codepoints and matching those. You +really do need to be able to follow byte oriented transitions on the `NFA`.) + +In an older version of the regex crate, we handled this case not in the regex +engine, but in the iterators over matches. Namely, since this case only arises +when the match is empty, we "just" incremented the next starting position +of the search by `N`, where `N` is the length of the codepoint encoded at +the current position. The alternative or more "natural" solution of just +incrementing by `1` would result in executing a search of `a*` on `☃` like +this: + +* Start search at `0`. +* Found match at `[0, 0]`. +* Next start position is `0`. +* To avoid an infinite loop, since it's an empty match, increment by `1`. +* Start search at `1`. +* Found match at `[1, 1]`. Oops. + +But if we instead incremented by `3` (the length in bytes of `☃`), then we get +the following: + +* Start search at `0`. +* Found match at `[0, 0]`. +* Next start position is `0`. +* To avoid an infinite loop, since it's an empty match, increment by `3`. +* Start search at `3`. +* Found match at `[3, 3]`. + +And we get the correct result. But does this technique work in all cases? +Crucially, it requires that a zero-width match that splits a codepoint never +occurs beyond the starting position of the search. Because if it did, merely +incrementing the start position by the number of bytes in the codepoint at +the current position wouldn't be enough. A zero-width match could just occur +anywhere. It turns out that it is _almost_ true. We can convince ourselves by +looking at all possible patterns that can match the empty string: + +* Patterns like `a*`, `a{0}`, `(?:)`, `a|` and `|a` all unconditionally match +the empty string. That is, assuming there isn't an `a` at the current position, +they will all match the empty string at the start of a search. There is no way +to move past it because any other match would not be "leftmost." +* `^` only matches at the beginning of the haystack, where the start position +is `0`. 
Since we know we're searching valid UTF-8 (if it isn't valid UTF-8, +then this entire problem goes away because it implies your string type supports +invalid UTF-8 and thus must deal with offsets that not only split a codepoint +but occur in entirely invalid UTF-8 somehow), it follows that `^` never matches +between the code units of a codepoint because the start of a valid UTF-8 string +is never within the encoding of a codepoint. +* `$` basically the same logic as `^`, but for the end of a string. A valid +UTF-8 string can't have an incomplete codepoint at the end of it. +* `(?m:^)` follows similarly to `^`, but it can match immediately following +a `\n`. However, since a `\n` is always a codepoint itself and can never +appear within a codepoint, it follows that the position immediately following +a `\n` in a string that is valid UTF-8 is guaranteed to not be between the +code units of another codepoint. (One caveat here is that the line terminator +for multi-line anchors can now be changed to any arbitrary byte, including +things like `\x98` which might occur within a codepoint. However, this wasn't +supported by the old regex crate. If it was, it pose the same problems as +`(?-u:\B)`, as we'll discuss below.) +* `(?m:$)` a similar argument as for `(?m:^)`. The only difference is that a +`(?m:$)` matches just before a `\n`. But the same argument applies. +* `(?Rm:^)` and `(?Rm:$)` weren't supported by the old regex crate, but the +CRLF aware line anchors follow a similar argument as for `(?m:^)` and `(?m:$)`. +Namely, since they only ever match at a boundary where one side is either a +`\r` or a `\n`, neither of which can occur within a codepoint. +* `\b` only matches at positions where both sides are valid codepoints, so +this cannot split a codepoint. +* `\B`, like `\b`, also only matches at positions where both sides are valid +codepoints. So this cannot split a codepoint either. +* `(?-u:\b)` matches only at positions where at least one side of it is an ASCII +word byte. Since ASCII bytes cannot appear as code units in non-ASCII codepoints +(one of the many amazing qualities of UTF-8), it follows that this too cannot +split a codepoint. +* `(?-u:\B)` finally represents a problem. It can matches between *any* two +bytes that are either both word bytes or non-word bytes. Since code units like +`\xE2` and `\x98` (from the UTF-8 encoding of `☃`) are both non-word bytes, +`(?-u:\B)` will match at the position between them. + +Thus, our approach of incrementing one codepoint at a time after seeing an +empty match is flawed because `(?-u:\B)` can result in an empty match that +splits a codepoint at a position past the starting point of a search. For +example, searching `(?-u:\B)` on `a☃` would produce the following matches: `[2, +2]`, `[3, 3]` and `[4, 4]`. The positions at `0` and `1` don't match because +they correspond to word boundaries since `a` is an ASCII word byte. + +So what did the old regex crate do to avoid this? It banned `(?-u:\B)` from +regexes that could match `&str`. That might sound extreme, but a lot of other +things were banned too. For example, all of `(?-u:.)`, `(?-u:[^a])` and +`(?-u:\W)` can match invalid UTF-8 too, including individual code units with a +codepoint. The key difference is that those expressions could never produce an +empty match. That ban happens when translating an `Ast` to an `Hir`, because +that process that reason about whether an `Hir` can produce *non-empty* matches +at invalid UTF-8 boundaries. 
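To make the `(?-u:\B)` case concrete, here is a small standalone check (illustrative only; the helper name is made up) of the claim about `a☃`:

```rust
// Illustrative only: why `(?-u:\B)` can match inside a codepoint. An ASCII
// word byte is one of `[0-9A-Za-z_]`, and both code units between offsets
// 1 and 3 of `a☃` are non-word bytes.
fn is_ascii_word_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'_'
}

fn main() {
    let bytes = "a☃".as_bytes(); // [0x61, 0xE2, 0x98, 0x83]
    // Offset 2 sits between 0xE2 and 0x98.
    assert!(!is_ascii_word_byte(bytes[1]));
    assert!(!is_ascii_word_byte(bytes[2]));
    // Both sides are non-word bytes, so `(?-u:\B)` holds at offset 2, which
    // splits the encoding of `☃`.
}
```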
Bottom line though is that we side-stepped the +`(?-u:\B)` issue by banning it. + +If banning `(?-u:\B)` were the only issue with the old regex crate's approach, +then I probably would have kept it. `\B` is rarely used, so it's not such a big +deal to have to work-around it. However, the problem with the above approach +is that it doesn't compose. The logic for avoiding splitting a codepoint only +lived in the iterator, which means if anyone wants to implement their own +iterator over regex matches, they have to deal with this extremely subtle edge +case to get full correctness. + +Instead, in this crate, we take the approach of pushing this complexity down +to the lowest layers of each regex engine. The approach is pretty simple: + +* If this corner case doesn't apply, don't do anything. (For example, if UTF-8 +mode isn't enabled or if the regex cannot match the empty string.) +* If an empty match is reported, explicitly check if it splits a codepoint. +* If it doesn't, we're done, return the match. +* If it does, then ignore the match and re-run the search. +* Repeat the above process until the end of the haystack is reached or a match +is found that doesn't split a codepoint or isn't zero width. + +And that's pretty much what this module provides. Every regex engine uses these +methods in their lowest level public APIs, but just above the layer where +their internal engine is used. That way, all regex engines can be arbitrarily +composed without worrying about handling this case, and iterators don't need to +handle it explicitly. + +(It turns out that a new feature I added, support for changing the line +terminator in a regex to any arbitrary byte, also provokes the above problem. +Namely, the byte could be invalid UTF-8 or a UTF-8 continuation byte. So that +support would need to be limited or banned when UTF-8 mode is enabled, just +like we did for `(?-u:\B)`. But thankfully our more robust approach in this +crate handles that case just fine too.) +*/ + +use crate::util::search::{Input, MatchError}; + +#[cold] +#[inline(never)] +pub(crate) fn skip_splits_fwd<T, F>( + input: &Input<'_>, + init_value: T, + match_offset: usize, + find: F, +) -> Result<Option<T>, MatchError> +where + F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>, +{ + skip_splits(true, input, init_value, match_offset, find) +} + +#[cold] +#[inline(never)] +pub(crate) fn skip_splits_rev<T, F>( + input: &Input<'_>, + init_value: T, + match_offset: usize, + find: F, +) -> Result<Option<T>, MatchError> +where + F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>, +{ + skip_splits(false, input, init_value, match_offset, find) +} + +fn skip_splits<T, F>( + forward: bool, + input: &Input<'_>, + init_value: T, + mut match_offset: usize, + mut find: F, +) -> Result<Option<T>, MatchError> +where + F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>, +{ + // If our config says to do an anchored search, then we're definitely + // done. We just need to determine whether we have a valid match or + // not. If we don't, then we're not allowed to continue, so we report + // no match. + // + // This is actually quite a subtle correctness thing. The key here is + // that if we got an empty match that splits a codepoint after doing an + // anchored search in UTF-8 mode, then that implies that we must have + // *started* the search at a location that splits a codepoint. 
This + // follows from the fact that if a match is reported from an anchored + // search, then the start offset of the match *must* match the start + // offset of the search. + // + // It also follows that no other non-empty match is possible. For + // example, you might write a regex like '(?:)|SOMETHING' and start its + // search in the middle of a codepoint. The first branch is an empty + // regex that will bubble up a match at the first position, and then + // get rejected here and report no match. But what if 'SOMETHING' could + // have matched? We reason that such a thing is impossible, because + // if it does, it must report a match that starts in the middle of a + // codepoint. This in turn implies that a match is reported whose span + // does not correspond to valid UTF-8, and this breaks the promise + // made when UTF-8 mode is enabled. (That promise *can* be broken, for + // example, by enabling UTF-8 mode but building an by hand NFA that + // produces non-empty matches that span invalid UTF-8. This is an unchecked + // but documented precondition violation of UTF-8 mode, and is documented + // to have unspecified behavior.) + // + // I believe this actually means that if an anchored search is run, and + // UTF-8 mode is enabled and the start position splits a codepoint, + // then it is correct to immediately report no match without even + // executing the regex engine. But it doesn't really seem worth writing + // out that case in every regex engine to save a tiny bit of work in an + // extremely pathological case, so we just handle it here. + if input.get_anchored().is_anchored() { + return Ok(if input.is_char_boundary(match_offset) { + Some(init_value) + } else { + None + }); + } + // Otherwise, we have an unanchored search, so just keep looking for + // matches until we have one that does not split a codepoint or we hit + // EOI. + let mut value = init_value; + let mut input = input.clone(); + while !input.is_char_boundary(match_offset) { + if forward { + // The unwrap is OK here because overflowing usize while + // iterating over a slice is impossible, at it would require + // a slice of length greater than isize::MAX, which is itself + // impossible. + input.set_start(input.start().checked_add(1).unwrap()); + } else { + input.set_end(match input.end().checked_sub(1) { + None => return Ok(None), + Some(end) => end, + }); + } + match find(&input)? { + None => return Ok(None), + Some((new_value, new_match_end)) => { + value = new_value; + match_offset = new_match_end; + } + } + } + Ok(Some(value)) +} diff --git a/vendor/regex-automata/src/util/escape.rs b/vendor/regex-automata/src/util/escape.rs new file mode 100644 index 0000000..7f6aa15 --- /dev/null +++ b/vendor/regex-automata/src/util/escape.rs @@ -0,0 +1,84 @@ +/*! +Provides convenience routines for escaping raw bytes. + +Since this crate tends to deal with `&[u8]` everywhere and the default +`Debug` implementation just shows decimal integers, it makes debugging those +representations quite difficult. This module provides types that show `&[u8]` +as if it were a string, with invalid UTF-8 escaped into its byte-by-byte hex +representation. +*/ + +use crate::util::utf8; + +/// Provides a convenient `Debug` implementation for a `u8`. +/// +/// The `Debug` impl treats the byte as an ASCII, and emits a human readable +/// representation of it. If the byte isn't ASCII, then it's emitted as a hex +/// escape sequence. 
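In rough terms, the rendering just described amounts to the following (a much-simplified sketch with a made-up function name, not the actual `Debug` impl that follows; the real code also escapes control characters and quotes the space byte):

```rust
// Illustrative only: print a byte as itself when it is printable ASCII and
// as a hex escape otherwise.
fn render_byte(b: u8) -> String {
    if b.is_ascii_graphic() || b == b' ' {
        (b as char).to_string()
    } else {
        format!("\\x{:02X}", b)
    }
}

fn main() {
    assert_eq!(render_byte(b'a'), "a");
    assert_eq!(render_byte(0xFF), "\\xFF");
}
```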
+#[derive(Clone, Copy)] +pub struct DebugByte(pub u8); + +impl core::fmt::Debug for DebugByte { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + // Special case ASCII space. It's too hard to read otherwise, so + // put quotes around it. I sometimes wonder whether just '\x20' would + // be better... + if self.0 == b' ' { + return write!(f, "' '"); + } + // 10 bytes is enough to cover any output from ascii::escape_default. + let mut bytes = [0u8; 10]; + let mut len = 0; + for (i, mut b) in core::ascii::escape_default(self.0).enumerate() { + // capitalize \xab to \xAB + if i >= 2 && b'a' <= b && b <= b'f' { + b -= 32; + } + bytes[len] = b; + len += 1; + } + write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap()) + } +} + +/// Provides a convenient `Debug` implementation for `&[u8]`. +/// +/// This generally works best when the bytes are presumed to be mostly UTF-8, +/// but will work for anything. For any bytes that aren't UTF-8, they are +/// emitted as hex escape sequences. +pub struct DebugHaystack<'a>(pub &'a [u8]); + +impl<'a> core::fmt::Debug for DebugHaystack<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "\"")?; + // This is a sad re-implementation of a similar impl found in bstr. + let mut bytes = self.0; + while let Some(result) = utf8::decode(bytes) { + let ch = match result { + Ok(ch) => ch, + Err(byte) => { + write!(f, r"\x{:02x}", byte)?; + bytes = &bytes[1..]; + continue; + } + }; + bytes = &bytes[ch.len_utf8()..]; + match ch { + '\0' => write!(f, "\\0")?, + // ASCII control characters except \0, \n, \r, \t + '\x01'..='\x08' + | '\x0b' + | '\x0c' + | '\x0e'..='\x19' + | '\x7f' => { + write!(f, "\\x{:02x}", u32::from(ch))?; + } + '\n' | '\r' | '\t' | _ => { + write!(f, "{}", ch.escape_debug())?; + } + } + } + write!(f, "\"")?; + Ok(()) + } +} diff --git a/vendor/regex-automata/src/util/int.rs b/vendor/regex-automata/src/util/int.rs new file mode 100644 index 0000000..e6b13bf --- /dev/null +++ b/vendor/regex-automata/src/util/int.rs @@ -0,0 +1,252 @@ +/*! +This module provides several integer oriented traits for converting between +both fixed size integers and integers whose size varies based on the target +(like `usize`). + +The driving design principle of this module is to attempt to centralize as many +`as` casts as possible here. And in particular, we separate casts into two +buckets: + +* Casts that we use for their truncating behavior. In this case, we use more +descriptive names, like `low_u32` and `high_u32`. +* Casts that we use for converting back-and-forth between `usize`. These +conversions are generally necessary because we often store indices in different +formats to save on memory, which requires converting to and from `usize`. In +this case, we very specifically do not want to overflow, and so the methods +defined here will panic if the `as` cast would be lossy in debug mode. (A +normal `as` cast will never panic!) + +For `as` casts between raw pointers, we use `cast`, so `as` isn't needed there. + +For regex engines, floating point is just never used, so we don't have to worry +about `as` casts for those. + +Otherwise, this module pretty much covers all of our `as` needs except for one +thing: const contexts. There are a select few places in this crate where we +still need to use `as` because const functions on traits aren't stable yet. +If we wind up significantly expanding our const footprint in this crate, it +might be worth defining free functions to handle those cases. 
But at the time +of writing, that just seemed like too much ceremony. Instead, I comment each +such use of `as` in a const context with a "fixme" notice. + +NOTE: for simplicity, we don't take target pointer width into account here for +`usize` conversions. Since we currently only panic in debug mode, skipping the +check when it can be proven it isn't needed at compile time doesn't really +matter. Now, if we wind up wanting to do as many checks as possible in release +mode, then we would want to skip those when we know the conversions are always +non-lossy. + +NOTE: this module isn't an exhaustive API. For example, we still use things +like `u64::from` where possible, or even `usize::try_from()` for when we do +explicitly want to panic or when we want to return an error for overflow. +*/ + +pub(crate) trait U8 { + fn as_usize(self) -> usize; +} + +impl U8 for u8 { + fn as_usize(self) -> usize { + usize::from(self) + } +} + +pub(crate) trait U16 { + fn as_usize(self) -> usize; + fn low_u8(self) -> u8; + fn high_u8(self) -> u8; +} + +impl U16 for u16 { + fn as_usize(self) -> usize { + usize::from(self) + } + + fn low_u8(self) -> u8 { + self as u8 + } + + fn high_u8(self) -> u8 { + (self >> 8) as u8 + } +} + +pub(crate) trait U32 { + fn as_usize(self) -> usize; + fn low_u8(self) -> u8; + fn low_u16(self) -> u16; + fn high_u16(self) -> u16; +} + +impl U32 for u32 { + fn as_usize(self) -> usize { + #[cfg(debug_assertions)] + { + usize::try_from(self).expect("u32 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn low_u8(self) -> u8 { + self as u8 + } + + fn low_u16(self) -> u16 { + self as u16 + } + + fn high_u16(self) -> u16 { + (self >> 16) as u16 + } +} + +pub(crate) trait U64 { + fn as_usize(self) -> usize; + fn low_u8(self) -> u8; + fn low_u16(self) -> u16; + fn low_u32(self) -> u32; + fn high_u32(self) -> u32; +} + +impl U64 for u64 { + fn as_usize(self) -> usize { + #[cfg(debug_assertions)] + { + usize::try_from(self).expect("u64 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn low_u8(self) -> u8 { + self as u8 + } + + fn low_u16(self) -> u16 { + self as u16 + } + + fn low_u32(self) -> u32 { + self as u32 + } + + fn high_u32(self) -> u32 { + (self >> 32) as u32 + } +} + +pub(crate) trait I32 { + fn as_usize(self) -> usize; + fn to_bits(self) -> u32; + fn from_bits(n: u32) -> i32; +} + +impl I32 for i32 { + fn as_usize(self) -> usize { + #[cfg(debug_assertions)] + { + usize::try_from(self).expect("i32 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn to_bits(self) -> u32 { + self as u32 + } + + fn from_bits(n: u32) -> i32 { + n as i32 + } +} + +pub(crate) trait Usize { + fn as_u8(self) -> u8; + fn as_u16(self) -> u16; + fn as_u32(self) -> u32; + fn as_u64(self) -> u64; +} + +impl Usize for usize { + fn as_u8(self) -> u8 { + #[cfg(debug_assertions)] + { + u8::try_from(self).expect("usize overflowed u8") + } + #[cfg(not(debug_assertions))] + { + self as u8 + } + } + + fn as_u16(self) -> u16 { + #[cfg(debug_assertions)] + { + u16::try_from(self).expect("usize overflowed u16") + } + #[cfg(not(debug_assertions))] + { + self as u16 + } + } + + fn as_u32(self) -> u32 { + #[cfg(debug_assertions)] + { + u32::try_from(self).expect("usize overflowed u32") + } + #[cfg(not(debug_assertions))] + { + self as u32 + } + } + + fn as_u64(self) -> u64 { + #[cfg(debug_assertions)] + { + u64::try_from(self).expect("usize overflowed u64") + } + #[cfg(not(debug_assertions))] + { + self as u64 + 
} + } +} + +// Pointers aren't integers, but we convert pointers to integers to perform +// offset arithmetic in some places. (And no, we don't convert the integers +// back to pointers.) So add 'as_usize' conversions here too for completeness. +// +// These 'as' casts are actually okay because they're always non-lossy. But the +// idea here is to just try and remove as much 'as' as possible, particularly +// in this crate where we are being really paranoid about offsets and making +// sure we don't panic on inputs that might be untrusted. This way, the 'as' +// casts become easier to audit if they're all in one place, even when some of +// them are actually okay 100% of the time. + +pub(crate) trait Pointer { + fn as_usize(self) -> usize; +} + +impl<T> Pointer for *const T { + fn as_usize(self) -> usize { + self as usize + } +} + +pub(crate) trait PointerMut { + fn as_usize(self) -> usize; +} + +impl<T> PointerMut for *mut T { + fn as_usize(self) -> usize { + self as usize + } +} diff --git a/vendor/regex-automata/src/util/interpolate.rs b/vendor/regex-automata/src/util/interpolate.rs new file mode 100644 index 0000000..f274629 --- /dev/null +++ b/vendor/regex-automata/src/util/interpolate.rs @@ -0,0 +1,579 @@ +/*! +Provides routines for interpolating capture group references. + +That is, if a replacement string contains references like `$foo` or `${foo1}`, +then they are replaced with the corresponding capture values for the groups +named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}` +is supported as well, with `1` corresponding to a capture group index and not +a name. + +This module provides the free functions [`string`] and [`bytes`], which +interpolate Rust Unicode strings and byte strings, respectively. + +# Format + +These routines support two different kinds of capture references: unbraced and +braced. + +For the unbraced format, the format supported is `$ref` where `name` can be +any character in the class `[0-9A-Za-z_]`. `ref` is always the longest +possible parse. So for example, `$1a` corresponds to the capture group named +`1a` and not the capture group at index `1`. If `ref` matches `^[0-9]+$`, then +it is treated as a capture group index itself and not a name. + +For the braced format, the format supported is `${ref}` where `ref` can be any +sequence of bytes except for `}`. If no closing brace occurs, then it is not +considered a capture reference. As with the unbraced format, if `ref` matches +`^[0-9]+$`, then it is treated as a capture group index and not a name. + +The braced format is useful for exerting precise control over the name of the +capture reference. For example, `${1}a` corresponds to the capture group +reference `1` followed by the letter `a`, where as `$1a` (as mentioned above) +corresponds to the capture group reference `1a`. The braced format is also +useful for expressing capture group names that use characters not supported by +the unbraced format. For example, `${foo[bar].baz}` refers to the capture group +named `foo[bar].baz`. + +If a capture group reference is found and it does not refer to a valid capture +group, then it will be replaced with the empty string. + +To write a literal `$`, use `$$`. + +To be clear, and as exhibited via the type signatures in the routines in this +module, it is impossible for a replacement string to be invalid. A replacement +string may not have the intended semantics, but the interpolation procedure +itself can never fail. 
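To make the rules above concrete, here is a short example using the public `interpolate::string` routine (modeled on the doc examples that follow; the particular replacement string and group setup are illustrative):

```rust
use regex_automata::util::interpolate;

fn main() {
    let mut dst = String::new();
    interpolate::string(
        "${1}a vs $1a",
        // Pretend group 1 captured "X"; all other indices are invalid.
        |index, dst| {
            if index == 1 {
                dst.push_str("X");
            }
        },
        // No groups are named, so every name lookup fails.
        |_name| None,
        &mut dst,
    );
    // `${1}a` is group 1 followed by a literal `a`, while `$1a` refers to a
    // group *named* `1a`, which doesn't exist and interpolates to nothing.
    assert_eq!("Xa vs ", dst);
}
```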
+*/ + +use alloc::{string::String, vec::Vec}; + +use crate::util::memchr::memchr; + +/// Accepts a replacement string and interpolates capture references with their +/// corresponding values. +/// +/// `append` should be a function that appends the string value of a capture +/// group at a particular index to the string given. If the capture group +/// index is invalid, then nothing should be appended. +/// +/// `name_to_index` should be a function that maps a capture group name to a +/// capture group index. If the given name doesn't exist, then `None` should +/// be returned. +/// +/// Finally, `dst` is where the final interpolated contents should be written. +/// If `replacement` contains no capture group references, then `dst` will be +/// equivalent to `replacement`. +/// +/// See the [module documentation](self) for details about the format +/// supported. +/// +/// # Example +/// +/// ``` +/// use regex_automata::util::interpolate; +/// +/// let mut dst = String::new(); +/// interpolate::string( +/// "foo $bar baz", +/// |index, dst| { +/// if index == 0 { +/// dst.push_str("BAR"); +/// } +/// }, +/// |name| { +/// if name == "bar" { +/// Some(0) +/// } else { +/// None +/// } +/// }, +/// &mut dst, +/// ); +/// assert_eq!("foo BAR baz", dst); +/// ``` +pub fn string( + mut replacement: &str, + mut append: impl FnMut(usize, &mut String), + mut name_to_index: impl FnMut(&str) -> Option<usize>, + dst: &mut String, +) { + while !replacement.is_empty() { + match memchr(b'$', replacement.as_bytes()) { + None => break, + Some(i) => { + dst.push_str(&replacement[..i]); + replacement = &replacement[i..]; + } + } + // Handle escaping of '$'. + if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') { + dst.push_str("$"); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement.as_bytes()) { + Some(cap_ref) => cap_ref, + None => { + dst.push_str("$"); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => append(i, dst), + Ref::Named(name) => { + if let Some(i) = name_to_index(name) { + append(i, dst); + } + } + } + } + dst.push_str(replacement); +} + +/// Accepts a replacement byte string and interpolates capture references with +/// their corresponding values. +/// +/// `append` should be a function that appends the byte string value of a +/// capture group at a particular index to the byte string given. If the +/// capture group index is invalid, then nothing should be appended. +/// +/// `name_to_index` should be a function that maps a capture group name to a +/// capture group index. If the given name doesn't exist, then `None` should +/// be returned. +/// +/// Finally, `dst` is where the final interpolated contents should be written. +/// If `replacement` contains no capture group references, then `dst` will be +/// equivalent to `replacement`. +/// +/// See the [module documentation](self) for details about the format +/// supported. 
+/// +/// # Example +/// +/// ``` +/// use regex_automata::util::interpolate; +/// +/// let mut dst = vec![]; +/// interpolate::bytes( +/// b"foo $bar baz", +/// |index, dst| { +/// if index == 0 { +/// dst.extend_from_slice(b"BAR"); +/// } +/// }, +/// |name| { +/// if name == "bar" { +/// Some(0) +/// } else { +/// None +/// } +/// }, +/// &mut dst, +/// ); +/// assert_eq!(&b"foo BAR baz"[..], dst); +/// ``` +pub fn bytes( + mut replacement: &[u8], + mut append: impl FnMut(usize, &mut Vec<u8>), + mut name_to_index: impl FnMut(&str) -> Option<usize>, + dst: &mut Vec<u8>, +) { + while !replacement.is_empty() { + match memchr(b'$', replacement) { + None => break, + Some(i) => { + dst.extend_from_slice(&replacement[..i]); + replacement = &replacement[i..]; + } + } + // Handle escaping of '$'. + if replacement.get(1).map_or(false, |&b| b == b'$') { + dst.push(b'$'); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement) { + Some(cap_ref) => cap_ref, + None => { + dst.push(b'$'); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => append(i, dst), + Ref::Named(name) => { + if let Some(i) = name_to_index(name) { + append(i, dst); + } + } + } + } + dst.extend_from_slice(replacement); +} + +/// `CaptureRef` represents a reference to a capture group inside some text. +/// The reference is either a capture group name or a number. +/// +/// It is also tagged with the position in the text following the +/// capture reference. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct CaptureRef<'a> { + cap: Ref<'a>, + end: usize, +} + +/// A reference to a capture group in some text. +/// +/// e.g., `$2`, `$foo`, `${foo}`. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum Ref<'a> { + Named(&'a str), + Number(usize), +} + +impl<'a> From<&'a str> for Ref<'a> { + fn from(x: &'a str) -> Ref<'a> { + Ref::Named(x) + } +} + +impl From<usize> for Ref<'static> { + fn from(x: usize) -> Ref<'static> { + Ref::Number(x) + } +} + +/// Parses a possible reference to a capture group name in the given text, +/// starting at the beginning of `replacement`. +/// +/// If no such valid reference could be found, None is returned. +/// +/// Note that this returns a "possible" reference because this routine doesn't +/// know whether the reference is to a valid group or not. If it winds up not +/// being a valid reference, then it should be replaced with the empty string. +fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> { + let mut i = 0; + let rep: &[u8] = replacement; + if rep.len() <= 1 || rep[0] != b'$' { + return None; + } + i += 1; + if rep[i] == b'{' { + return find_cap_ref_braced(rep, i + 1); + } + let mut cap_end = i; + while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) { + cap_end += 1; + } + if cap_end == i { + return None; + } + // We just verified that the range 0..cap_end is valid ASCII, so it must + // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 + // check via an unchecked conversion or by parsing the number straight from + // &[u8]. + let cap = core::str::from_utf8(&rep[i..cap_end]) + .expect("valid UTF-8 capture name"); + Some(CaptureRef { + cap: match cap.parse::<usize>() { + Ok(i) => Ref::Number(i), + Err(_) => Ref::Named(cap), + }, + end: cap_end, + }) +} + +/// Looks for a braced reference, e.g., `${foo1}`. 
This assumes that an opening +/// brace has been found at `i-1` in `rep`. This then looks for a closing +/// brace and returns the capture reference within the brace. +fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> { + assert_eq!(b'{', rep[i.checked_sub(1).unwrap()]); + let start = i; + while rep.get(i).map_or(false, |&b| b != b'}') { + i += 1; + } + if !rep.get(i).map_or(false, |&b| b == b'}') { + return None; + } + // When looking at braced names, we don't put any restrictions on the name, + // so it's possible it could be invalid UTF-8. But a capture group name + // can never be invalid UTF-8, so if we have invalid UTF-8, then we can + // safely return None. + let cap = match core::str::from_utf8(&rep[start..i]) { + Err(_) => return None, + Ok(cap) => cap, + }; + Some(CaptureRef { + cap: match cap.parse::<usize>() { + Ok(i) => Ref::Number(i), + Err(_) => Ref::Named(cap), + }, + end: i + 1, + }) +} + +/// Returns true if and only if the given byte is allowed in a capture name +/// written in non-brace form. +fn is_valid_cap_letter(b: u8) -> bool { + match b { + b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true, + _ => false, + } +} + +#[cfg(test)] +mod tests { + use alloc::{string::String, vec, vec::Vec}; + + use super::{find_cap_ref, CaptureRef}; + + macro_rules! find { + ($name:ident, $text:expr) => { + #[test] + fn $name() { + assert_eq!(None, find_cap_ref($text.as_bytes())); + } + }; + ($name:ident, $text:expr, $capref:expr) => { + #[test] + fn $name() { + assert_eq!(Some($capref), find_cap_ref($text.as_bytes())); + } + }; + } + + macro_rules! c { + ($name_or_number:expr, $pos:expr) => { + CaptureRef { cap: $name_or_number.into(), end: $pos } + }; + } + + find!(find_cap_ref1, "$foo", c!("foo", 4)); + find!(find_cap_ref2, "${foo}", c!("foo", 6)); + find!(find_cap_ref3, "$0", c!(0, 2)); + find!(find_cap_ref4, "$5", c!(5, 2)); + find!(find_cap_ref5, "$10", c!(10, 3)); + // See https://github.com/rust-lang/regex/pull/585 + // for more on characters following numbers + find!(find_cap_ref6, "$42a", c!("42a", 4)); + find!(find_cap_ref7, "${42}a", c!(42, 5)); + find!(find_cap_ref8, "${42"); + find!(find_cap_ref9, "${42 "); + find!(find_cap_ref10, " $0 "); + find!(find_cap_ref11, "$"); + find!(find_cap_ref12, " "); + find!(find_cap_ref13, ""); + find!(find_cap_ref14, "$1-$2", c!(1, 2)); + find!(find_cap_ref15, "$1_$2", c!("1_", 3)); + find!(find_cap_ref16, "$x-$y", c!("x", 2)); + find!(find_cap_ref17, "$x_$y", c!("x_", 3)); + find!(find_cap_ref18, "${#}", c!("#", 4)); + find!(find_cap_ref19, "${Z[}", c!("Z[", 5)); + find!(find_cap_ref20, "${¾}", c!("¾", 5)); + find!(find_cap_ref21, "${¾a}", c!("¾a", 6)); + find!(find_cap_ref22, "${a¾}", c!("a¾", 6)); + find!(find_cap_ref23, "${☃}", c!("☃", 6)); + find!(find_cap_ref24, "${a☃}", c!("a☃", 7)); + find!(find_cap_ref25, "${☃a}", c!("☃a", 7)); + find!(find_cap_ref26, "${名字}", c!("名字", 9)); + + fn interpolate_string( + mut name_to_index: Vec<(&'static str, usize)>, + caps: Vec<&'static str>, + replacement: &str, + ) -> String { + name_to_index.sort_by_key(|x| x.0); + + let mut dst = String::new(); + super::string( + replacement, + |i, dst| { + if let Some(&s) = caps.get(i) { + dst.push_str(s); + } + }, + |name| -> Option<usize> { + name_to_index + .binary_search_by_key(&name, |x| x.0) + .ok() + .map(|i| name_to_index[i].1) + }, + &mut dst, + ); + dst + } + + fn interpolate_bytes( + mut name_to_index: Vec<(&'static str, usize)>, + caps: Vec<&'static str>, + replacement: &str, + ) -> String { + 
name_to_index.sort_by_key(|x| x.0); + + let mut dst = vec![]; + super::bytes( + replacement.as_bytes(), + |i, dst| { + if let Some(&s) = caps.get(i) { + dst.extend_from_slice(s.as_bytes()); + } + }, + |name| -> Option<usize> { + name_to_index + .binary_search_by_key(&name, |x| x.0) + .ok() + .map(|i| name_to_index[i].1) + }, + &mut dst, + ); + String::from_utf8(dst).unwrap() + } + + macro_rules! interp { + ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => { + #[test] + fn $name() { + assert_eq!( + $expected, + interpolate_string($map, $caps, $hay), + "interpolate::string failed", + ); + assert_eq!( + $expected, + interpolate_bytes($map, $caps, $hay), + "interpolate::bytes failed", + ); + } + }; + } + + interp!( + interp1, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $foo test", + "test xxx test", + ); + + interp!( + interp2, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test$footest", + "test", + ); + + interp!( + interp3, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test${foo}test", + "testxxxtest", + ); + + interp!( + interp4, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test$2test", + "test", + ); + + interp!( + interp5, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test${2}test", + "testxxxtest", + ); + + interp!( + interp6, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $$foo test", + "test $foo test", + ); + + interp!( + interp7, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $foo", + "test xxx", + ); + + interp!( + interp8, + vec![("foo", 2)], + vec!["", "", "xxx"], + "$foo test", + "xxx test", + ); + + interp!( + interp9, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test $bar$foo", + "test yyyxxx", + ); + + interp!( + interp10, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test $ test", + "test $ test", + ); + + interp!( + interp11, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${} test", + "test test", + ); + + interp!( + interp12, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${ } test", + "test test", + ); + + interp!( + interp13, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${a b} test", + "test test", + ); + + interp!( + interp14, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${a} test", + "test test", + ); + + // This is a funny case where a braced reference is never closed, but + // within the unclosed braced reference, there is an unbraced reference. + // In this case, the braced reference is just treated literally and the + // unbraced reference is found. + interp!( + interp15, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${wat $bar ok", + "test ${wat yyy ok", + ); +} diff --git a/vendor/regex-automata/src/util/iter.rs b/vendor/regex-automata/src/util/iter.rs new file mode 100644 index 0000000..a789fa0 --- /dev/null +++ b/vendor/regex-automata/src/util/iter.rs @@ -0,0 +1,1027 @@ +/*! +Generic helpers for iteration of matches from a regex engine in a haystack. + +The principle type in this module is a [`Searcher`]. A `Searcher` provides +its own lower level iterator-like API in addition to methods for constructing +types that implement `Iterator`. The documentation for `Searcher` explains a +bit more about why these different APIs exist. + +Currently, this module supports iteration over any regex engine that works +with the [`HalfMatch`], [`Match`] or [`Captures`] types. 
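As a preview of the `Searcher` API documented below, the following example (modeled on the doc examples later in this module; the pattern and haystack are illustrative) collects full `Match` values for a regex that can match the empty string, without falling into the infinite loop that naive "resume at the end of the last match" iteration would hit:

```rust
use regex_automata::{
    nfa::thompson::pikevm::PikeVM, util::iter::Searcher, Input, Match,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let re = PikeVM::new(r"a|")?;
    let mut cache = re.create_cache();
    let mut caps = re.create_captures();

    let mut matches = vec![];
    let mut searcher = Searcher::new(Input::new("abba"));
    while let Some(m) = searcher.advance(|input| {
        re.search(&mut cache, input, &mut caps);
        Ok(caps.get_match())
    }) {
        matches.push(m);
    }
    // Empty matches that overlap the end of a previous match are skipped,
    // so iteration terminates.
    assert_eq!(matches, vec![
        Match::must(0, 0..1),
        Match::must(0, 2..2),
        Match::must(0, 3..4),
    ]);
    Ok(())
}
```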
+*/ + +#[cfg(feature = "alloc")] +use crate::util::captures::Captures; +use crate::util::search::{HalfMatch, Input, Match, MatchError}; + +/// A searcher for creating iterators and performing lower level iteration. +/// +/// This searcher encapsulates the logic required for finding all successive +/// non-overlapping matches in a haystack. In theory, iteration would look +/// something like this: +/// +/// 1. Setting the start position to `0`. +/// 2. Execute a regex search. If no match, end iteration. +/// 3. Report the match and set the start position to the end of the match. +/// 4. Go back to (2). +/// +/// And if this were indeed the case, it's likely that `Searcher` wouldn't +/// exist. Unfortunately, because a regex may match the empty string, the above +/// logic won't work for all possible regexes. Namely, if an empty match is +/// found, then step (3) would set the start position of the search to the +/// position it was at. Thus, iteration would never end. +/// +/// Instead, a `Searcher` knows how to detect these cases and forcefully +/// advance iteration in the case of an empty match that overlaps with a +/// previous match. +/// +/// If you know that your regex cannot match any empty string, then the simple +/// algorithm described above will work correctly. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// In particular, a `Searcher` is not itself an iterator. Instead, it provides +/// `advance` routines that permit moving the search along explicitly. It also +/// provides various routines, like [`Searcher::into_matches_iter`], that +/// accept a closure (representing how a regex engine executes a search) and +/// returns a conventional iterator. +/// +/// The lifetime parameters come from the [`Input`] type passed to +/// [`Searcher::new`]: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// # Searcher vs Iterator +/// +/// Why does a search type with "advance" APIs exist at all when we also have +/// iterators? Unfortunately, the reasoning behind this split is a complex +/// combination of the following things: +/// +/// 1. While many of the regex engines expose their own iterators, it is also +/// nice to expose this lower level iteration helper because it permits callers +/// to provide their own `Input` configuration. Moreover, a `Searcher` can work +/// with _any_ regex engine instead of only the ones defined in this crate. +/// This way, everyone benefits from a shared iteration implementation. +/// 2. There are many different regex engines that, while they have the same +/// match semantics, they have slightly different APIs. Iteration is just +/// complex enough to want to share code, and so we need a way of abstracting +/// over those different regex engines. While we could define a new trait that +/// describes any regex engine search API, it would wind up looking very close +/// to a closure. While there may still be reasons for the more generic trait +/// to exist, for now and for the purposes of iteration, we use a closure. +/// Closures also provide a lot of easy flexibility at the call site, in that +/// they permit the caller to borrow any kind of state they want for use during +/// each search call. +/// 3. 
As a result of using closures, and because closures are anonymous types
+/// that cannot be named, it is difficult to encapsulate them without both
+/// costs to speed and added complexity to the public API. For example, in
+/// defining an iterator type like
+/// [`dfa::regex::FindMatches`](crate::dfa::regex::FindMatches),
+/// if we use a closure internally, it's not possible to name this type in the
+/// return type of the iterator constructor. Thus, the only way around it is
+/// to erase the type by boxing it and turning it into a `Box<dyn FnMut ...>`.
+/// This boxed closure is unlikely to be inlined _and_ it infects the public
+/// API in subtle ways. Namely, unless you declare the closure as implementing
+/// `Send` and `Sync`, then the resulting iterator type won't implement it
+/// either. But there are practical issues with requiring the closure to
+/// implement `Send` and `Sync` that result in other API complexities that
+/// are beyond the scope of this already long exposition.
+/// 4. Some regex engines expose more complex match information than just
+/// "which pattern matched" and "at what offsets." For example, the PikeVM
+/// exposes match spans for each capturing group that participated in the
+/// match. In such cases, it can be quite beneficial to reuse the capturing
+/// group allocation on subsequent searches. A proper iterator doesn't permit
+/// this API due to its interface, so it's useful to have something a bit lower
+/// level that permits callers to amortize allocations while also reusing a
+/// shared implementation of iteration. (See the documentation for
+/// [`Searcher::advance`] for an example of using the "advance" API with the
+/// PikeVM.)
+///
+/// What this boils down to is that there are "advance" APIs which require
+/// handing a closure to them for every call, and there are also APIs to create
+/// iterators from a closure. The former are useful for _implementing_
+/// iterators or when you need more flexibility, while the latter are useful
+/// for conveniently writing custom iterators on-the-fly.
+///
+/// # Example: iterating with captures
+///
+/// Several regex engines in this crate offer convenient iterator APIs over
+/// [`Captures`] values. To do so, this requires allocating a new `Captures`
+/// value for each iteration step. This can perhaps be more costly than you
+/// might want. Instead of implementing your own iterator to avoid that
+/// cost (which can be a little subtle if you want to handle empty matches
+/// correctly), you can use this `Searcher` to do it for you:
+///
+/// ```
+/// use regex_automata::{
+/// nfa::thompson::pikevm::PikeVM,
+/// util::iter::Searcher,
+/// Input, Span,
+/// };
+///
+/// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?;
+/// let haystack = "foo1 foo12 foo123";
+///
+/// let mut caps = re.create_captures();
+/// let mut cache = re.create_cache();
+/// let mut matches = vec![];
+/// let mut searcher = Searcher::new(Input::new(haystack));
+/// while let Some(_) = searcher.advance(|input| {
+/// re.search(&mut cache, input, &mut caps);
+/// Ok(caps.get_match())
+/// }) {
+/// // The unwrap is OK since 'numbers' matches if the pattern matches.
+/// matches.push(caps.get_group_by_name("numbers").unwrap());
+/// }
+/// assert_eq!(matches, vec![
+/// Span::from(3..4),
+/// Span::from(8..10),
+/// Span::from(14..17),
+/// ]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Searcher<'h> {
+ /// The input parameters to give to each regex engine call.
+ /// + /// The start position of the search is mutated during iteration. + input: Input<'h>, + /// Records the end offset of the most recent match. This is necessary to + /// handle a corner case for preventing empty matches from overlapping with + /// the ending bounds of a prior match. + last_match_end: Option<usize>, +} + +impl<'h> Searcher<'h> { + /// Create a new fallible non-overlapping matches iterator. + /// + /// The given `input` provides the parameters (including the haystack), + /// while the `finder` represents a closure that calls the underlying regex + /// engine. The closure may borrow any additional state that is needed, + /// such as a prefilter scanner. + pub fn new(input: Input<'h>) -> Searcher<'h> { + Searcher { input, last_match_end: None } + } + + /// Returns the current `Input` used by this searcher. + /// + /// The `Input` returned is generally equivalent to the one given to + /// [`Searcher::new`], but its start position may be different to reflect + /// the start of the next search to be executed. + pub fn input<'s>(&'s self) -> &'s Input<'h> { + &self.input + } + + /// Return the next half match for an infallible search if one exists, and + /// advance to the next position. + /// + /// This is like `try_advance_half`, except errors are converted into + /// panics. + /// + /// # Panics + /// + /// If the given closure returns an error, then this panics. This is useful + /// when you know your underlying regex engine has been configured to not + /// return an error. + /// + /// # Example + /// + /// This example shows how to use a `Searcher` to iterate over all matches + /// when using a DFA, which only provides "half" matches. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// util::iter::Searcher, + /// HalfMatch, Input, + /// }; + /// + /// let re = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); + /// let mut it = Searcher::new(input); + /// + /// let expected = Some(HalfMatch::must(0, 10)); + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(HalfMatch::must(0, 21)); + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(HalfMatch::must(0, 32)); + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = None; + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// This correctly moves iteration forward even when an empty match occurs: + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// util::iter::Searcher, + /// HalfMatch, Input, + /// }; + /// + /// let re = DFA::new(r"a|")?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("abba"); + /// let mut it = Searcher::new(input); + /// + /// let expected = Some(HalfMatch::must(0, 1)); + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(HalfMatch::must(0, 2)); + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(HalfMatch::must(0, 4)); + /// let got = 
it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = None; + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn advance_half<F>(&mut self, finder: F) -> Option<HalfMatch> + where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, + { + match self.try_advance_half(finder) { + Ok(m) => m, + Err(err) => panic!( + "unexpected regex half find error: {}\n\ + to handle find errors, use 'try' or 'search' methods", + err, + ), + } + } + + /// Return the next match for an infallible search if one exists, and + /// advance to the next position. + /// + /// The search is advanced even in the presence of empty matches by + /// forbidding empty matches from overlapping with any other match. + /// + /// This is like `try_advance`, except errors are converted into panics. + /// + /// # Panics + /// + /// If the given closure returns an error, then this panics. This is useful + /// when you know your underlying regex engine has been configured to not + /// return an error. + /// + /// # Example + /// + /// This example shows how to use a `Searcher` to iterate over all matches + /// when using a regex based on lazy DFAs: + /// + /// ``` + /// use regex_automata::{ + /// hybrid::regex::Regex, + /// util::iter::Searcher, + /// Match, Input, + /// }; + /// + /// let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); + /// let mut it = Searcher::new(input); + /// + /// let expected = Some(Match::must(0, 0..10)); + /// let got = it.advance(|input| re.try_search(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(Match::must(0, 11..21)); + /// let got = it.advance(|input| re.try_search(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(Match::must(0, 22..32)); + /// let got = it.advance(|input| re.try_search(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = None; + /// let got = it.advance(|input| re.try_search(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// This example shows the same as above, but with the PikeVM. This example + /// is useful because it shows how to use this API even when the regex + /// engine doesn't directly return a `Match`. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::iter::Searcher, + /// Match, Input, + /// }; + /// + /// let re = PikeVM::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); + /// let mut it = Searcher::new(input); + /// + /// let expected = Some(Match::must(0, 0..10)); + /// let got = it.advance(|input| { + /// re.search(&mut cache, input, &mut caps); + /// Ok(caps.get_match()) + /// }); + /// // Note that if we wanted to extract capturing group spans, we could + /// // do that here with 'caps'. 
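+ /// // (For example, via 'caps.get_group' or 'caps.get_group_by_name',
+ /// // assuming the pattern defined some capture groups.)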
+ /// assert_eq!(expected, got); + /// + /// let expected = Some(Match::must(0, 11..21)); + /// let got = it.advance(|input| { + /// re.search(&mut cache, input, &mut caps); + /// Ok(caps.get_match()) + /// }); + /// assert_eq!(expected, got); + /// + /// let expected = Some(Match::must(0, 22..32)); + /// let got = it.advance(|input| { + /// re.search(&mut cache, input, &mut caps); + /// Ok(caps.get_match()) + /// }); + /// assert_eq!(expected, got); + /// + /// let expected = None; + /// let got = it.advance(|input| { + /// re.search(&mut cache, input, &mut caps); + /// Ok(caps.get_match()) + /// }); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn advance<F>(&mut self, finder: F) -> Option<Match> + where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, + { + match self.try_advance(finder) { + Ok(m) => m, + Err(err) => panic!( + "unexpected regex find error: {}\n\ + to handle find errors, use 'try' or 'search' methods", + err, + ), + } + } + + /// Return the next half match for a fallible search if one exists, and + /// advance to the next position. + /// + /// This is like `advance_half`, except it permits callers to handle errors + /// during iteration. + #[inline] + pub fn try_advance_half<F>( + &mut self, + mut finder: F, + ) -> Result<Option<HalfMatch>, MatchError> + where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, + { + let mut m = match finder(&self.input)? { + None => return Ok(None), + Some(m) => m, + }; + if Some(m.offset()) == self.last_match_end { + m = match self.handle_overlapping_empty_half_match(m, finder)? { + None => return Ok(None), + Some(m) => m, + }; + } + self.input.set_start(m.offset()); + self.last_match_end = Some(m.offset()); + Ok(Some(m)) + } + + /// Return the next match for a fallible search if one exists, and advance + /// to the next position. + /// + /// This is like `advance`, except it permits callers to handle errors + /// during iteration. + #[inline] + pub fn try_advance<F>( + &mut self, + mut finder: F, + ) -> Result<Option<Match>, MatchError> + where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, + { + let mut m = match finder(&self.input)? { + None => return Ok(None), + Some(m) => m, + }; + if m.is_empty() && Some(m.end()) == self.last_match_end { + m = match self.handle_overlapping_empty_match(m, finder)? { + None => return Ok(None), + Some(m) => m, + }; + } + self.input.set_start(m.end()); + self.last_match_end = Some(m.end()); + Ok(Some(m)) + } + + /// Given a closure that executes a single search, return an iterator over + /// all successive non-overlapping half matches. + /// + /// The iterator returned yields result values. If the underlying regex + /// engine is configured to never return an error, consider calling + /// [`TryHalfMatchesIter::infallible`] to convert errors into panics. + /// + /// # Example + /// + /// This example shows how to use a `Searcher` to create a proper + /// iterator over half matches. 
+ /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// util::iter::Searcher, + /// HalfMatch, Input, + /// }; + /// + /// let re = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); + /// let mut it = Searcher::new(input).into_half_matches_iter(|input| { + /// re.try_search_fwd(&mut cache, input) + /// }); + /// + /// let expected = Some(Ok(HalfMatch::must(0, 10))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = Some(Ok(HalfMatch::must(0, 21))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = Some(Ok(HalfMatch::must(0, 32))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = None; + /// assert_eq!(expected, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn into_half_matches_iter<F>( + self, + finder: F, + ) -> TryHalfMatchesIter<'h, F> + where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, + { + TryHalfMatchesIter { it: self, finder } + } + + /// Given a closure that executes a single search, return an iterator over + /// all successive non-overlapping matches. + /// + /// The iterator returned yields result values. If the underlying regex + /// engine is configured to never return an error, consider calling + /// [`TryMatchesIter::infallible`] to convert errors into panics. + /// + /// # Example + /// + /// This example shows how to use a `Searcher` to create a proper + /// iterator over matches. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::regex::Regex, + /// util::iter::Searcher, + /// Match, Input, + /// }; + /// + /// let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); + /// let mut it = Searcher::new(input).into_matches_iter(|input| { + /// re.try_search(&mut cache, input) + /// }); + /// + /// let expected = Some(Ok(Match::must(0, 0..10))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = Some(Ok(Match::must(0, 11..21))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = Some(Ok(Match::must(0, 22..32))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = None; + /// assert_eq!(expected, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn into_matches_iter<F>(self, finder: F) -> TryMatchesIter<'h, F> + where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, + { + TryMatchesIter { it: self, finder } + } + + /// Given a closure that executes a single search, return an iterator over + /// all successive non-overlapping `Captures` values. + /// + /// The iterator returned yields result values. If the underlying regex + /// engine is configured to never return an error, consider calling + /// [`TryCapturesIter::infallible`] to convert errors into panics. + /// + /// Unlike the other iterator constructors, this accepts an initial + /// `Captures` value. This `Captures` value is reused for each search, and + /// the iterator implementation clones it before returning it. The caller + /// must provide this value because the iterator is purposely ignorant + /// of the underlying regex engine and thus doesn't know how to create + /// one itself. More to the point, a `Captures` value itself has a few + /// different constructors, which change which kind of information is + /// available to query in exchange for search performance. 
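+ ///
+ /// For instance, as a sketch (the constructors named here live on
+ /// `Captures` itself and are not specific to this routine), a caller could
+ /// trade capturing group spans for speed by building a "match spans only"
+ /// value:
+ ///
+ /// ```
+ /// use regex_automata::{
+ ///     nfa::thompson::pikevm::PikeVM,
+ ///     util::captures::Captures,
+ /// };
+ ///
+ /// let re = PikeVM::new(r"foo(?P<n>[0-9]+)")?;
+ /// // Only records which pattern matched and the overall match span,
+ /// // which is cheaper to fill in than tracking every capturing group.
+ /// let caps = Captures::matches(re.get_nfa().group_info().clone());
+ /// // A freshly created value reports no match until a search sets one.
+ /// assert!(caps.get_match().is_none());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```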
+ /// + /// # Example + /// + /// This example shows how to use a `Searcher` to create a proper iterator + /// over `Captures` values, which provides access to all capturing group + /// spans for each match. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::iter::Searcher, + /// Input, + /// }; + /// + /// let re = PikeVM::new( + /// r"(?P<y>[0-9]{4})-(?P<m>[0-9]{2})-(?P<d>[0-9]{2})", + /// )?; + /// let (mut cache, caps) = (re.create_cache(), re.create_captures()); + /// + /// let haystack = "2010-03-14 2016-10-08 2020-10-22"; + /// let input = Input::new(haystack); + /// let mut it = Searcher::new(input) + /// .into_captures_iter(caps, |input, caps| { + /// re.search(&mut cache, input, caps); + /// Ok(()) + /// }); + /// + /// let got = it.next().expect("first date")?; + /// let year = got.get_group_by_name("y").expect("must match"); + /// assert_eq!("2010", &haystack[year]); + /// + /// let got = it.next().expect("second date")?; + /// let month = got.get_group_by_name("m").expect("must match"); + /// assert_eq!("10", &haystack[month]); + /// + /// let got = it.next().expect("third date")?; + /// let day = got.get_group_by_name("d").expect("must match"); + /// assert_eq!("22", &haystack[day]); + /// + /// assert!(it.next().is_none()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "alloc")] + #[inline] + pub fn into_captures_iter<F>( + self, + caps: Captures, + finder: F, + ) -> TryCapturesIter<'h, F> + where + F: FnMut(&Input<'_>, &mut Captures) -> Result<(), MatchError>, + { + TryCapturesIter { it: self, caps, finder } + } + + /// Handles the special case of a match that begins where the previous + /// match ended. Without this special handling, it'd be possible to get + /// stuck where an empty match never results in forward progress. This + /// also makes it more consistent with how presiding general purpose regex + /// engines work. + #[cold] + #[inline(never)] + fn handle_overlapping_empty_half_match<F>( + &mut self, + _: HalfMatch, + mut finder: F, + ) -> Result<Option<HalfMatch>, MatchError> + where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, + { + // Since we are only here when 'm.offset()' matches the offset of the + // last match, it follows that this must have been an empty match. + // Since we both need to make progress *and* prevent overlapping + // matches, we discard this match and advance the search by 1. + // + // Note that this may start a search in the middle of a codepoint. The + // regex engines themselves are expected to deal with that and not + // report any matches within a codepoint if they are configured in + // UTF-8 mode. + self.input.set_start(self.input.start().checked_add(1).unwrap()); + finder(&self.input) + } + + /// Handles the special case of an empty match by ensuring that 1) the + /// iterator always advances and 2) empty matches never overlap with other + /// matches. + /// + /// (1) is necessary because we principally make progress by setting the + /// starting location of the next search to the ending location of the last + /// match. But if a match is empty, then this results in a search that does + /// not advance and thus does not terminate. + /// + /// (2) is not strictly necessary, but makes intuitive sense and matches + /// the presiding behavior of most general purpose regex engines. The + /// "intuitive sense" here is that we want to report NON-overlapping + /// matches. 
So for example, given the regex 'a|(?:)' against the haystack + /// 'a', without the special handling, you'd get the matches [0, 1) and [1, + /// 1), where the latter overlaps with the end bounds of the former. + /// + /// Note that we mark this cold and forcefully prevent inlining because + /// handling empty matches like this is extremely rare and does require + /// quite a bit of code, comparatively. Keeping this code out of the main + /// iterator function keeps it smaller and more amenable to inlining + /// itself. + #[cold] + #[inline(never)] + fn handle_overlapping_empty_match<F>( + &mut self, + m: Match, + mut finder: F, + ) -> Result<Option<Match>, MatchError> + where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, + { + assert!(m.is_empty()); + self.input.set_start(self.input.start().checked_add(1).unwrap()); + finder(&self.input) + } +} + +/// An iterator over all non-overlapping half matches for a fallible search. +/// +/// The iterator yields a `Result<HalfMatch, MatchError>` value until no more +/// matches could be found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// This iterator is created by [`Searcher::into_half_matches_iter`]. +pub struct TryHalfMatchesIter<'h, F> { + it: Searcher<'h>, + finder: F, +} + +impl<'h, F> TryHalfMatchesIter<'h, F> { + /// Return an infallible version of this iterator. + /// + /// Any item yielded that corresponds to an error results in a panic. This + /// is useful if your underlying regex engine is configured in a way that + /// it is guaranteed to never return an error. + pub fn infallible(self) -> HalfMatchesIter<'h, F> { + HalfMatchesIter(self) + } + + /// Returns the current `Input` used by this iterator. + /// + /// The `Input` returned is generally equivalent to the one used to + /// construct this iterator, but its start position may be different to + /// reflect the start of the next search to be executed. + pub fn input<'i>(&'i self) -> &'i Input<'h> { + self.it.input() + } +} + +impl<'h, F> Iterator for TryHalfMatchesIter<'h, F> +where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, +{ + type Item = Result<HalfMatch, MatchError>; + + #[inline] + fn next(&mut self) -> Option<Result<HalfMatch, MatchError>> { + self.it.try_advance_half(&mut self.finder).transpose() + } +} + +impl<'h, F> core::fmt::Debug for TryHalfMatchesIter<'h, F> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("TryHalfMatchesIter") + .field("it", &self.it) + .field("finder", &"<closure>") + .finish() + } +} + +/// An iterator over all non-overlapping half matches for an infallible search. +/// +/// The iterator yields a [`HalfMatch`] value until no more matches could be +/// found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. 
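+///
+/// For example, this sketch reuses the date regex from the fallible example
+/// on [`Searcher::into_half_matches_iter`], but converts the iterator so that
+/// it yields `HalfMatch` values directly (reasonable here since this simple
+/// search won't return an error):
+///
+/// ```
+/// use regex_automata::{hybrid::dfa::DFA, util::iter::Searcher, HalfMatch, Input};
+///
+/// let re = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
+/// let mut cache = re.create_cache();
+///
+/// let input = Input::new("2010-03-14 2016-10-08 2020-10-22");
+/// let mut it = Searcher::new(input)
+///     .into_half_matches_iter(|input| re.try_search_fwd(&mut cache, input))
+///     .infallible();
+///
+/// assert_eq!(Some(HalfMatch::must(0, 10)), it.next());
+/// assert_eq!(Some(HalfMatch::must(0, 21)), it.next());
+/// assert_eq!(Some(HalfMatch::must(0, 32)), it.next());
+/// assert_eq!(None, it.next());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```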
+/// +/// This iterator is created by [`Searcher::into_half_matches_iter`] and +/// then calling [`TryHalfMatchesIter::infallible`]. +#[derive(Debug)] +pub struct HalfMatchesIter<'h, F>(TryHalfMatchesIter<'h, F>); + +impl<'h, F> HalfMatchesIter<'h, F> { + /// Returns the current `Input` used by this iterator. + /// + /// The `Input` returned is generally equivalent to the one used to + /// construct this iterator, but its start position may be different to + /// reflect the start of the next search to be executed. + pub fn input<'i>(&'i self) -> &'i Input<'h> { + self.0.it.input() + } +} + +impl<'h, F> Iterator for HalfMatchesIter<'h, F> +where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, +{ + type Item = HalfMatch; + + #[inline] + fn next(&mut self) -> Option<HalfMatch> { + match self.0.next()? { + Ok(m) => Some(m), + Err(err) => panic!( + "unexpected regex half find error: {}\n\ + to handle find errors, use 'try' or 'search' methods", + err, + ), + } + } +} + +/// An iterator over all non-overlapping matches for a fallible search. +/// +/// The iterator yields a `Result<Match, MatchError>` value until no more +/// matches could be found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// This iterator is created by [`Searcher::into_matches_iter`]. +pub struct TryMatchesIter<'h, F> { + it: Searcher<'h>, + finder: F, +} + +impl<'h, F> TryMatchesIter<'h, F> { + /// Return an infallible version of this iterator. + /// + /// Any item yielded that corresponds to an error results in a panic. This + /// is useful if your underlying regex engine is configured in a way that + /// it is guaranteed to never return an error. + pub fn infallible(self) -> MatchesIter<'h, F> { + MatchesIter(self) + } + + /// Returns the current `Input` used by this iterator. + /// + /// The `Input` returned is generally equivalent to the one used to + /// construct this iterator, but its start position may be different to + /// reflect the start of the next search to be executed. + pub fn input<'i>(&'i self) -> &'i Input<'h> { + self.it.input() + } +} + +impl<'h, F> Iterator for TryMatchesIter<'h, F> +where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, +{ + type Item = Result<Match, MatchError>; + + #[inline] + fn next(&mut self) -> Option<Result<Match, MatchError>> { + self.it.try_advance(&mut self.finder).transpose() + } +} + +impl<'h, F> core::fmt::Debug for TryMatchesIter<'h, F> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("TryMatchesIter") + .field("it", &self.it) + .field("finder", &"<closure>") + .finish() + } +} + +/// An iterator over all non-overlapping matches for an infallible search. +/// +/// The iterator yields a [`Match`] value until no more matches could be found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. 
This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// This iterator is created by [`Searcher::into_matches_iter`] and +/// then calling [`TryMatchesIter::infallible`]. +#[derive(Debug)] +pub struct MatchesIter<'h, F>(TryMatchesIter<'h, F>); + +impl<'h, F> MatchesIter<'h, F> { + /// Returns the current `Input` used by this iterator. + /// + /// The `Input` returned is generally equivalent to the one used to + /// construct this iterator, but its start position may be different to + /// reflect the start of the next search to be executed. + pub fn input<'i>(&'i self) -> &'i Input<'h> { + self.0.it.input() + } +} + +impl<'h, F> Iterator for MatchesIter<'h, F> +where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, +{ + type Item = Match; + + #[inline] + fn next(&mut self) -> Option<Match> { + match self.0.next()? { + Ok(m) => Some(m), + Err(err) => panic!( + "unexpected regex find error: {}\n\ + to handle find errors, use 'try' or 'search' methods", + err, + ), + } + } +} + +/// An iterator over all non-overlapping captures for a fallible search. +/// +/// The iterator yields a `Result<Captures, MatchError>` value until no more +/// matches could be found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// This iterator is created by [`Searcher::into_captures_iter`]. +#[cfg(feature = "alloc")] +pub struct TryCapturesIter<'h, F> { + it: Searcher<'h>, + caps: Captures, + finder: F, +} + +#[cfg(feature = "alloc")] +impl<'h, F> TryCapturesIter<'h, F> { + /// Return an infallible version of this iterator. + /// + /// Any item yielded that corresponds to an error results in a panic. This + /// is useful if your underlying regex engine is configured in a way that + /// it is guaranteed to never return an error. + pub fn infallible(self) -> CapturesIter<'h, F> { + CapturesIter(self) + } +} + +#[cfg(feature = "alloc")] +impl<'h, F> Iterator for TryCapturesIter<'h, F> +where + F: FnMut(&Input<'_>, &mut Captures) -> Result<(), MatchError>, +{ + type Item = Result<Captures, MatchError>; + + #[inline] + fn next(&mut self) -> Option<Result<Captures, MatchError>> { + let TryCapturesIter { ref mut it, ref mut caps, ref mut finder } = + *self; + let result = it + .try_advance(|input| { + (finder)(input, caps)?; + Ok(caps.get_match()) + }) + .transpose()?; + match result { + Ok(_) => Some(Ok(caps.clone())), + Err(err) => Some(Err(err)), + } + } +} + +#[cfg(feature = "alloc")] +impl<'h, F> core::fmt::Debug for TryCapturesIter<'h, F> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("TryCapturesIter") + .field("it", &self.it) + .field("caps", &self.caps) + .field("finder", &"<closure>") + .finish() + } +} + +/// An iterator over all non-overlapping captures for an infallible search. +/// +/// The iterator yields a [`Captures`] value until no more matches could be +/// found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. 
+/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// This iterator is created by [`Searcher::into_captures_iter`] and then +/// calling [`TryCapturesIter::infallible`]. +#[cfg(feature = "alloc")] +#[derive(Debug)] +pub struct CapturesIter<'h, F>(TryCapturesIter<'h, F>); + +#[cfg(feature = "alloc")] +impl<'h, F> Iterator for CapturesIter<'h, F> +where + F: FnMut(&Input<'_>, &mut Captures) -> Result<(), MatchError>, +{ + type Item = Captures; + + #[inline] + fn next(&mut self) -> Option<Captures> { + match self.0.next()? { + Ok(m) => Some(m), + Err(err) => panic!( + "unexpected regex captures error: {}\n\ + to handle find errors, use 'try' or 'search' methods", + err, + ), + } + } +} diff --git a/vendor/regex-automata/src/util/lazy.rs b/vendor/regex-automata/src/util/lazy.rs new file mode 100644 index 0000000..0d0b4fb --- /dev/null +++ b/vendor/regex-automata/src/util/lazy.rs @@ -0,0 +1,461 @@ +/*! +A lazily initialized value for safe sharing between threads. + +The principal type in this module is `Lazy`, which makes it easy to construct +values that are shared safely across multiple threads simultaneously. +*/ + +use core::fmt; + +/// A lazily initialized value that implements `Deref` for `T`. +/// +/// A `Lazy` takes an initialization function and permits callers from any +/// thread to access the result of that initialization function in a safe +/// manner. In effect, this permits one-time initialization of global resources +/// in a (possibly) multi-threaded program. +/// +/// This type and its functionality are available even when neither the `alloc` +/// nor the `std` features are enabled. In exchange, a `Lazy` does **not** +/// guarantee that the given `create` function is called at most once. It +/// might be called multiple times. Moreover, a call to `Lazy::get` (either +/// explicitly or implicitly via `Lazy`'s `Deref` impl) may block until a `T` +/// is available. +/// +/// This is very similar to `lazy_static` or `once_cell`, except it doesn't +/// guarantee that the initialization function will be run once and it works +/// in no-alloc no-std environments. With that said, if you need stronger +/// guarantees or a more flexible API, then it is recommended to use either +/// `lazy_static` or `once_cell`. +/// +/// # Warning: may use a spin lock +/// +/// When this crate is compiled _without_ the `alloc` feature, then this type +/// may used a spin lock internally. This can have subtle effects that may +/// be undesirable. See [Spinlocks Considered Harmful][spinharm] for a more +/// thorough treatment of this topic. +/// +/// [spinharm]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html +/// +/// # Example +/// +/// This type is useful for creating regexes once, and then using them from +/// multiple threads simultaneously without worrying about synchronization. +/// +/// ``` +/// use regex_automata::{dfa::regex::Regex, util::lazy::Lazy, Match}; +/// +/// static RE: Lazy<Regex> = Lazy::new(|| Regex::new("foo[0-9]+bar").unwrap()); +/// +/// let expected = Some(Match::must(0, 3..14)); +/// assert_eq!(expected, RE.find(b"zzzfoo12345barzzz")); +/// ``` +pub struct Lazy<T, F = fn() -> T>(lazy::Lazy<T, F>); + +impl<T, F> Lazy<T, F> { + /// Create a new `Lazy` value that is initialized via the given function. + /// + /// The `T` type is automatically inferred from the return type of the + /// `create` function given. 
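+ ///
+ /// # Example
+ ///
+ /// A small sketch:
+ ///
+ /// ```
+ /// use regex_automata::util::lazy::Lazy;
+ ///
+ /// static NAMES: Lazy<Vec<&'static str>> = Lazy::new(|| vec!["foo", "bar"]);
+ /// assert_eq!(2, NAMES.len());
+ /// ```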
+ pub const fn new(create: F) -> Lazy<T, F> { + Lazy(lazy::Lazy::new(create)) + } +} + +impl<T, F: Fn() -> T> Lazy<T, F> { + /// Return a reference to the lazily initialized value. + /// + /// This routine may block if another thread is initializing a `T`. + /// + /// Note that given a `x` which has type `Lazy`, this must be called via + /// `Lazy::get(x)` and not `x.get()`. This routine is defined this way + /// because `Lazy` impls `Deref` with a target of `T`. + /// + /// # Panics + /// + /// This panics if the `create` function inside this lazy value panics. + /// If the panic occurred in another thread, then this routine _may_ also + /// panic (but is not guaranteed to do so). + pub fn get(this: &Lazy<T, F>) -> &T { + this.0.get() + } +} + +impl<T, F: Fn() -> T> core::ops::Deref for Lazy<T, F> { + type Target = T; + + fn deref(&self) -> &T { + Lazy::get(self) + } +} + +impl<T: fmt::Debug, F: Fn() -> T> fmt::Debug for Lazy<T, F> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.0.fmt(f) + } +} + +#[cfg(feature = "alloc")] +mod lazy { + use core::{ + fmt, + marker::PhantomData, + sync::atomic::{AtomicPtr, Ordering}, + }; + + use alloc::boxed::Box; + + /// A non-std lazy initialized value. + /// + /// This might run the initialization function more than once, but will + /// never block. + /// + /// I wish I could get these semantics into the non-alloc non-std Lazy + /// type below, but I'm not sure how to do it. If you can do an alloc, + /// then the implementation becomes very simple if you don't care about + /// redundant work precisely because a pointer can be atomically swapped. + /// + /// Perhaps making this approach work in the non-alloc non-std case + /// requires asking the caller for a pointer? It would make the API less + /// convenient I think. + pub(super) struct Lazy<T, F> { + data: AtomicPtr<T>, + create: F, + // This indicates to the compiler that this type can drop T. It's not + // totally clear how the absence of this marker could lead to trouble, + // but putting here doesn't have any downsides so we hedge until somone + // can from the Unsafe Working Group can tell us definitively that we + // don't need it. + // + // See: https://github.com/BurntSushi/regex-automata/issues/30 + owned: PhantomData<Box<T>>, + } + + // SAFETY: So long as T and &T (and F and &F) can themselves be safely + // shared among threads, so to can a Lazy<T, _>. Namely, the Lazy API only + // permits accessing a &T and initialization is free of data races. So if T + // is thread safe, then so to is Lazy<T, _>. + // + // We specifically require that T: Send in order for Lazy<T> to be Sync. + // Without that requirement, it's possible to send a T from one thread to + // another via Lazy's destructor. + // + // It's not clear whether we need F: Send+Sync for Lazy to be Sync. But + // we're conservative for now and keep both. + unsafe impl<T: Send + Sync, F: Send + Sync> Sync for Lazy<T, F> {} + + impl<T, F> Lazy<T, F> { + /// Create a new alloc but non-std lazy value that is racily + /// initialized. That is, the 'create' function may be called more than + /// once. + pub(super) const fn new(create: F) -> Lazy<T, F> { + Lazy { + data: AtomicPtr::new(core::ptr::null_mut()), + create, + owned: PhantomData, + } + } + } + + impl<T, F: Fn() -> T> Lazy<T, F> { + /// Get the underlying lazy value. 
If it hasn't been initialized + /// yet, then always attempt to initialize it (even if some other + /// thread is initializing it) and atomically attach it to this lazy + /// value before returning it. + pub(super) fn get(&self) -> &T { + if let Some(data) = self.poll() { + return data; + } + let data = (self.create)(); + let mut ptr = Box::into_raw(Box::new(data)); + // We attempt to stuff our initialized value into our atomic + // pointer. Upon success, we don't need to do anything. But if + // someone else beat us to the punch, then we need to make sure + // our newly created value is dropped. + let result = self.data.compare_exchange( + core::ptr::null_mut(), + ptr, + Ordering::AcqRel, + Ordering::Acquire, + ); + if let Err(old) = result { + // SAFETY: We created 'ptr' via Box::into_raw above, so turning + // it back into a Box via from_raw is safe. + drop(unsafe { Box::from_raw(ptr) }); + ptr = old; + } + // SAFETY: We just set the pointer above to a non-null value, even + // in the error case, and set it to a fully initialized value + // returned by 'create'. + unsafe { &*ptr } + } + + /// If this lazy value has been initialized successfully, then return + /// that value. Otherwise return None immediately. This never attempts + /// to run initialization itself. + fn poll(&self) -> Option<&T> { + let ptr = self.data.load(Ordering::Acquire); + if ptr.is_null() { + return None; + } + // SAFETY: We just checked that the pointer is not null. Since it's + // not null, it must have been fully initialized by 'get' at some + // point. + Some(unsafe { &*ptr }) + } + } + + impl<T: fmt::Debug, F: Fn() -> T> fmt::Debug for Lazy<T, F> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Lazy").field("data", &self.poll()).finish() + } + } + + impl<T, F> Drop for Lazy<T, F> { + fn drop(&mut self) { + let ptr = *self.data.get_mut(); + if !ptr.is_null() { + // SAFETY: We just checked that 'ptr' is not null. And since + // we have exclusive access, there are no races to worry about. + drop(unsafe { Box::from_raw(ptr) }); + } + } + } +} + +#[cfg(not(feature = "alloc"))] +mod lazy { + use core::{ + cell::Cell, + fmt, + mem::MaybeUninit, + panic::{RefUnwindSafe, UnwindSafe}, + sync::atomic::{AtomicU8, Ordering}, + }; + + /// Our 'Lazy' value can be in one of three states: + /// + /// * INIT is where it starts, and also ends up back here if the + /// 'create' routine panics. + /// * BUSY is where it sits while initialization is running in exactly + /// one thread. + /// * DONE is where it sits after 'create' has completed and 'data' has + /// been fully initialized. + const LAZY_STATE_INIT: u8 = 0; + const LAZY_STATE_BUSY: u8 = 1; + const LAZY_STATE_DONE: u8 = 2; + + /// A non-alloc non-std lazy initialized value. + /// + /// This guarantees initialization only happens once, but uses a spinlock + /// to block in the case of simultaneous access. Blocking occurs so that + /// one thread waits while another thread initializes the value. + /// + /// I would much rather have the semantics of the 'alloc' Lazy type above. + /// Namely, that we might run the initialization function more than once, + /// but we never otherwise block. However, I don't know how to do that in + /// a non-alloc non-std context. + pub(super) struct Lazy<T, F> { + state: AtomicU8, + create: Cell<Option<F>>, + data: Cell<MaybeUninit<T>>, + } + + // SAFETY: So long as T and &T (and F and &F) can themselves be safely + // shared among threads, so to can a Lazy<T, _>. 
Namely, the Lazy API only + // permits accessing a &T and initialization is free of data races. So if T + // is thread safe, then so to is Lazy<T, _>. + unsafe impl<T: Send + Sync, F: Send + Sync> Sync for Lazy<T, F> {} + // A reference to a Lazy is unwind safe because we specifically take + // precautions to poison all accesses to a Lazy if the caller-provided + // 'create' function panics. + impl<T: UnwindSafe, F: UnwindSafe + RefUnwindSafe> RefUnwindSafe + for Lazy<T, F> + { + } + + impl<T, F> Lazy<T, F> { + /// Create a new non-alloc non-std lazy value that is initialized + /// exactly once on first use using the given function. + pub(super) const fn new(create: F) -> Lazy<T, F> { + Lazy { + state: AtomicU8::new(LAZY_STATE_INIT), + create: Cell::new(Some(create)), + data: Cell::new(MaybeUninit::uninit()), + } + } + } + + impl<T, F: FnOnce() -> T> Lazy<T, F> { + /// Get the underlying lazy value. If it isn't been initialized + /// yet, then either initialize it or block until some other thread + /// initializes it. If the 'create' function given to Lazy::new panics + /// (even in another thread), then this panics too. + pub(super) fn get(&self) -> &T { + // This is effectively a spinlock. We loop until we enter a DONE + // state, and if possible, initialize it ourselves. The only way + // we exit the loop is if 'create' panics, we initialize 'data' or + // some other thread initializes 'data'. + // + // Yes, I have read spinlocks considered harmful[1]. And that + // article is why this spinlock is only active when 'alloc' isn't + // enabled. I did this because I don't think there is really + // another choice without 'alloc', other than not providing this at + // all. But I think that's a big bummer. + // + // [1]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html + while self.state.load(Ordering::Acquire) != LAZY_STATE_DONE { + // Check if we're the first ones to get here. If so, we'll be + // the ones who initialize. + let result = self.state.compare_exchange( + LAZY_STATE_INIT, + LAZY_STATE_BUSY, + Ordering::AcqRel, + Ordering::Acquire, + ); + // This means we saw the INIT state and nobody else can. So we + // must take responsibility for initializing. And by virtue of + // observing INIT, we have also told anyone else trying to + // get here that we are BUSY. If someone else sees BUSY, then + // they will spin until we finish initialization. + if let Ok(_) = result { + // Since we are guaranteed to be the only ones here, we + // know that 'create' is there... Unless someone else got + // here before us and 'create' panicked. In which case, + // 'self.create' is now 'None' and we forward the panic + // to the caller. (i.e., We implement poisoning.) + // + // SAFETY: Our use of 'self.state' guarantees that we are + // the only thread executing this line, and thus there are + // no races. + let create = unsafe { + (*self.create.as_ptr()).take().expect( + "Lazy's create function panicked, \ + preventing initialization, + poisoning current thread", + ) + }; + let guard = Guard { state: &self.state }; + // SAFETY: Our use of 'self.state' guarantees that we are + // the only thread executing this line, and thus there are + // no races. + unsafe { + (*self.data.as_ptr()).as_mut_ptr().write(create()); + } + // All is well. 'self.create' ran successfully, so we + // forget the guard. + core::mem::forget(guard); + // Everything is initialized, so we can declare success. 
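+ // (This 'Release' store pairs with the 'Acquire' loads in the
+ // spin loop above and in 'poll' below, which is what publishes
+ // the write to 'self.data' to other threads.)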
+ self.state.store(LAZY_STATE_DONE, Ordering::Release); + break; + } + core::hint::spin_loop(); + } + // We only get here if data is fully initialized, and thus poll + // will always return something. + self.poll().unwrap() + } + + /// If this lazy value has been initialized successfully, then return + /// that value. Otherwise return None immediately. This never blocks. + fn poll(&self) -> Option<&T> { + if self.state.load(Ordering::Acquire) == LAZY_STATE_DONE { + // SAFETY: The DONE state only occurs when data has been fully + // initialized. + Some(unsafe { &*(*self.data.as_ptr()).as_ptr() }) + } else { + None + } + } + } + + impl<T: fmt::Debug, F: FnMut() -> T> fmt::Debug for Lazy<T, F> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Lazy") + .field("state", &self.state.load(Ordering::Acquire)) + .field("create", &"<closure>") + .field("data", &self.poll()) + .finish() + } + } + + impl<T, F> Drop for Lazy<T, F> { + fn drop(&mut self) { + if *self.state.get_mut() == LAZY_STATE_DONE { + // SAFETY: state is DONE if and only if data has been fully + // initialized. At which point, it is safe to drop. + unsafe { + self.data.get_mut().assume_init_drop(); + } + } + } + } + + /// A guard that will reset a Lazy's state back to INIT when dropped. The + /// idea here is to 'forget' this guard on success. On failure (when a + /// panic occurs), the Drop impl runs and causes all in-progress and future + /// 'get' calls to panic. Without this guard, all in-progress and future + /// 'get' calls would spin forever. Crashing is much better than getting + /// stuck in an infinite loop. + struct Guard<'a> { + state: &'a AtomicU8, + } + + impl<'a> Drop for Guard<'a> { + fn drop(&mut self) { + // We force ourselves back into an INIT state. This will in turn + // cause any future 'get' calls to attempt calling 'self.create' + // again which will in turn panic because 'self.create' will now + // be 'None'. + self.state.store(LAZY_STATE_INIT, Ordering::Release); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn assert_send<T: Send>() {} + fn assert_sync<T: Sync>() {} + fn assert_unwind<T: core::panic::UnwindSafe>() {} + fn assert_refunwind<T: core::panic::RefUnwindSafe>() {} + + #[test] + fn oibits() { + assert_send::<Lazy<u64>>(); + assert_sync::<Lazy<u64>>(); + assert_unwind::<Lazy<u64>>(); + assert_refunwind::<Lazy<u64>>(); + } + + // This is a regression test because we used to rely on the inferred Sync + // impl for the Lazy type defined above (for 'alloc' mode). In the + // inferred impl, it only requires that T: Sync for Lazy<T>: Sync. But + // if we have that, we can actually make use of the fact that Lazy<T> drops + // T to create a value on one thread and drop it on another. This *should* + // require T: Send, but our missing bounds before let it sneak by. + // + // Basically, this test should not compile, so we... comment it out. We + // don't have a great way of testing compile-fail tests right now. + // + // See: https://github.com/BurntSushi/regex-automata/issues/30 + /* + #[test] + fn sync_not_send() { + #[allow(dead_code)] + fn inner<T: Sync + Default>() { + let lazy = Lazy::new(move || T::default()); + std::thread::scope(|scope| { + scope.spawn(|| { + Lazy::get(&lazy); // We create T in this thread + }); + }); + // And drop in this thread. + drop(lazy); + // So we have send a !Send type over threads. 
(with some more + // legwork, its possible to even sneak the value out of drop + // through thread local) + } + } + */ +} diff --git a/vendor/regex-automata/src/util/look.rs b/vendor/regex-automata/src/util/look.rs new file mode 100644 index 0000000..73e51c0 --- /dev/null +++ b/vendor/regex-automata/src/util/look.rs @@ -0,0 +1,2547 @@ +/*! +Types and routines for working with look-around assertions. + +This module principally defines two types: + +* [`Look`] enumerates all of the assertions supported by this crate. +* [`LookSet`] provides a way to efficiently store a set of [`Look`] values. +* [`LookMatcher`] provides routines for checking whether a `Look` or a +`LookSet` matches at a particular position in a haystack. +*/ + +// LAMENTATION: Sadly, a lot of the API of `Look` and `LookSet` were basically +// copied verbatim from the regex-syntax crate. I would have no problems using +// the regex-syntax types and defining the matching routines (only found +// in this crate) as free functions, except the `Look` and `LookSet` types +// are used in lots of places. Including in places we expect to work when +// regex-syntax is *not* enabled, such as in the definition of the NFA itself. +// +// Thankfully the code we copy is pretty simple and there isn't much of it. +// Otherwise, the rest of this module deals with *matching* the assertions, +// which is not something that regex-syntax handles. + +use crate::util::{escape::DebugByte, utf8}; + +/// A look-around assertion. +/// +/// An assertion matches at a position between characters in a haystack. +/// Namely, it does not actually "consume" any input as most parts of a regular +/// expression do. Assertions are a way of stating that some property must be +/// true at a particular point during matching. +/// +/// For example, `(?m)^[a-z]+$` is a pattern that: +/// +/// * Scans the haystack for a position at which `(?m:^)` is satisfied. That +/// occurs at either the beginning of the haystack, or immediately following +/// a `\n` character. +/// * Looks for one or more occurrences of `[a-z]`. +/// * Once `[a-z]+` has matched as much as it can, an overall match is only +/// reported when `[a-z]+` stops just before a `\n`. +/// +/// So in this case, `abc` and `\nabc\n` match, but `\nabc1\n` does not. +/// +/// Assertions are also called "look-around," "look-behind" and "look-ahead." +/// Specifically, some assertions are look-behind (like `^`), other assertions +/// are look-ahead (like `$`) and yet other assertions are both look-ahead and +/// look-behind (like `\b`). +/// +/// # Assertions in an NFA +/// +/// An assertion in a [`thompson::NFA`](crate::nfa::thompson::NFA) can be +/// thought of as a conditional epsilon transition. That is, a matching engine +/// like the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) only permits +/// moving through conditional epsilon transitions when their condition +/// is satisfied at whatever position the `PikeVM` is currently at in the +/// haystack. +/// +/// How assertions are handled in a `DFA` is trickier, since a DFA does not +/// have epsilon transitions at all. In this case, they are compiled into the +/// automaton itself, at the expense of more states than what would be required +/// without an assertion. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Look { + /// Match the beginning of text. Specifically, this matches at the starting + /// position of the input. + Start = 1 << 0, + /// Match the end of text. Specifically, this matches at the ending + /// position of the input. 
+ End = 1 << 1, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following a `\n` character. + StartLF = 1 << 2, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\n` character. + EndLF = 1 << 3, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following either a `\r` or `\n` character, but never after + /// a `\r` when a `\n` follows. + StartCRLF = 1 << 4, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` + /// precedes it. + EndCRLF = 1 << 5, + /// Match an ASCII-only word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + WordAscii = 1 << 6, + /// Match an ASCII-only negation of a word boundary. + WordAsciiNegate = 1 << 7, + /// Match a Unicode-aware word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + WordUnicode = 1 << 8, + /// Match a Unicode-aware negation of a word boundary. + WordUnicodeNegate = 1 << 9, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartAscii = 1 << 10, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndAscii = 1 << 11, + /// Match the start of a Unicode word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartUnicode = 1 << 12, + /// Match the end of a Unicode word boundary. That is, this matches a + /// position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndUnicode = 1 << 13, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfAscii = 1 << 14, + /// Match the end half of an ASCII-only word boundary. That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalfAscii = 1 << 15, + /// Match the start half of a Unicode word boundary. That is, this matches + /// a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfUnicode = 1 << 16, + /// Match the end half of a Unicode word boundary. 
That is, this matches + /// a position at either the end of the haystack or where the following + /// character is not a word character. + WordEndHalfUnicode = 1 << 17, +} + +impl Look { + /// Flip the look-around assertion to its equivalent for reverse searches. + /// For example, `StartLF` gets translated to `EndLF`. + /// + /// Some assertions, such as `WordUnicode`, remain the same since they + /// match the same positions regardless of the direction of the search. + #[inline] + pub const fn reversed(self) -> Look { + match self { + Look::Start => Look::End, + Look::End => Look::Start, + Look::StartLF => Look::EndLF, + Look::EndLF => Look::StartLF, + Look::StartCRLF => Look::EndCRLF, + Look::EndCRLF => Look::StartCRLF, + Look::WordAscii => Look::WordAscii, + Look::WordAsciiNegate => Look::WordAsciiNegate, + Look::WordUnicode => Look::WordUnicode, + Look::WordUnicodeNegate => Look::WordUnicodeNegate, + Look::WordStartAscii => Look::WordEndAscii, + Look::WordEndAscii => Look::WordStartAscii, + Look::WordStartUnicode => Look::WordEndUnicode, + Look::WordEndUnicode => Look::WordStartUnicode, + Look::WordStartHalfAscii => Look::WordEndHalfAscii, + Look::WordEndHalfAscii => Look::WordStartHalfAscii, + Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, + Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, + } + } + + /// Return the underlying representation of this look-around enumeration + /// as an integer. Giving the return value to the [`Look::from_repr`] + /// constructor is guaranteed to return the same look-around variant that + /// one started with within a semver compatible release of this crate. + #[inline] + pub const fn as_repr(self) -> u32 { + // AFAIK, 'as' is the only way to zero-cost convert an int enum to an + // actual int. + self as u32 + } + + /// Given the underlying representation of a `Look` value, return the + /// corresponding `Look` value if the representation is valid. Otherwise + /// `None` is returned. + #[inline] + pub const fn from_repr(repr: u32) -> Option<Look> { + match repr { + 0b00_0000_0000_0000_0001 => Some(Look::Start), + 0b00_0000_0000_0000_0010 => Some(Look::End), + 0b00_0000_0000_0000_0100 => Some(Look::StartLF), + 0b00_0000_0000_0000_1000 => Some(Look::EndLF), + 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), + 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), + 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), + 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), + 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), + 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), + 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), + 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), + 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), + 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), + 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), + 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), + 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), + _ => None, + } + } + + /// Returns a convenient single codepoint representation of this + /// look-around assertion. Each assertion is guaranteed to be represented + /// by a distinct character. + /// + /// This is useful for succinctly representing a look-around assertion in + /// human friendly but succinct output intended for a programmer working on + /// regex internals. 
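+ ///
+ /// For example (a small sketch):
+ ///
+ /// ```
+ /// use regex_automata::util::look::Look;
+ ///
+ /// assert_eq!('b', Look::WordAscii.as_char());
+ /// assert_eq!('B', Look::WordAsciiNegate.as_char());
+ /// ```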
+ #[inline] + pub const fn as_char(self) -> char { + match self { + Look::Start => 'A', + Look::End => 'z', + Look::StartLF => '^', + Look::EndLF => '$', + Look::StartCRLF => 'r', + Look::EndCRLF => 'R', + Look::WordAscii => 'b', + Look::WordAsciiNegate => 'B', + Look::WordUnicode => '𝛃', + Look::WordUnicodeNegate => '𝚩', + Look::WordStartAscii => '<', + Look::WordEndAscii => '>', + Look::WordStartUnicode => '〈', + Look::WordEndUnicode => '〉', + Look::WordStartHalfAscii => '◁', + Look::WordEndHalfAscii => '▷', + Look::WordStartHalfUnicode => '◀', + Look::WordEndHalfUnicode => '▶', + } + } +} + +/// LookSet is a memory-efficient set of look-around assertions. +/// +/// This is useful for efficiently tracking look-around assertions. For +/// example, a [`thompson::NFA`](crate::nfa::thompson::NFA) provides properties +/// that return `LookSet`s. +#[derive(Clone, Copy, Default, Eq, PartialEq)] +pub struct LookSet { + /// The underlying representation this set is exposed to make it possible + /// to store it somewhere efficiently. The representation is that + /// of a bitset, where each assertion occupies bit `i` where + /// `i = Look::as_repr()`. + /// + /// Note that users of this internal representation must permit the full + /// range of `u16` values to be represented. For example, even if the + /// current implementation only makes use of the 10 least significant bits, + /// it may use more bits in a future semver compatible release. + pub bits: u32, +} + +impl LookSet { + /// Create an empty set of look-around assertions. + #[inline] + pub fn empty() -> LookSet { + LookSet { bits: 0 } + } + + /// Create a full set of look-around assertions. + /// + /// This set contains all possible look-around assertions. + #[inline] + pub fn full() -> LookSet { + LookSet { bits: !0 } + } + + /// Create a look-around set containing the look-around assertion given. + /// + /// This is a convenience routine for creating an empty set and inserting + /// one look-around assertions. + #[inline] + pub fn singleton(look: Look) -> LookSet { + LookSet::empty().insert(look) + } + + /// Returns the total number of look-around assertions in this set. + #[inline] + pub fn len(self) -> usize { + // OK because max value always fits in a u8, which in turn always + // fits in a usize, regardless of target. + usize::try_from(self.bits.count_ones()).unwrap() + } + + /// Returns true if and only if this set is empty. + #[inline] + pub fn is_empty(self) -> bool { + self.len() == 0 + } + + /// Returns true if and only if the given look-around assertion is in this + /// set. + #[inline] + pub fn contains(self, look: Look) -> bool { + self.bits & look.as_repr() != 0 + } + + /// Returns true if and only if this set contains any anchor assertions. + /// This includes both "start/end of haystack" and "start/end of line." + #[inline] + pub fn contains_anchor(&self) -> bool { + self.contains_anchor_haystack() || self.contains_anchor_line() + } + + /// Returns true if and only if this set contains any "start/end of + /// haystack" anchors. This doesn't include "start/end of line" anchors. + #[inline] + pub fn contains_anchor_haystack(&self) -> bool { + self.contains(Look::Start) || self.contains(Look::End) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors. This doesn't include "start/end of haystack" anchors. This + /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors. 
+ #[inline] + pub fn contains_anchor_line(&self) -> bool { + self.contains(Look::StartLF) + || self.contains(Look::EndLF) + || self.contains(Look::StartCRLF) + || self.contains(Look::EndCRLF) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors that only treat `\n` as line terminators. This does not include + /// haystack anchors or CRLF aware line anchors. + #[inline] + pub fn contains_anchor_lf(&self) -> bool { + self.contains(Look::StartLF) || self.contains(Look::EndLF) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors that are CRLF-aware. This doesn't include "start/end of + /// haystack" or "start/end of line-feed" anchors. + #[inline] + pub fn contains_anchor_crlf(&self) -> bool { + self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF) + } + + /// Returns true if and only if this set contains any word boundary or + /// negated word boundary assertions. This include both Unicode and ASCII + /// word boundaries. + #[inline] + pub fn contains_word(self) -> bool { + self.contains_word_unicode() || self.contains_word_ascii() + } + + /// Returns true if and only if this set contains any Unicode word boundary + /// or negated Unicode word boundary assertions. + #[inline] + pub fn contains_word_unicode(self) -> bool { + self.contains(Look::WordUnicode) + || self.contains(Look::WordUnicodeNegate) + || self.contains(Look::WordStartUnicode) + || self.contains(Look::WordEndUnicode) + || self.contains(Look::WordStartHalfUnicode) + || self.contains(Look::WordEndHalfUnicode) + } + + /// Returns true if and only if this set contains any ASCII word boundary + /// or negated ASCII word boundary assertions. + #[inline] + pub fn contains_word_ascii(self) -> bool { + self.contains(Look::WordAscii) + || self.contains(Look::WordAsciiNegate) + || self.contains(Look::WordStartAscii) + || self.contains(Look::WordEndAscii) + || self.contains(Look::WordStartHalfAscii) + || self.contains(Look::WordEndHalfAscii) + } + + /// Returns an iterator over all of the look-around assertions in this set. + #[inline] + pub fn iter(self) -> LookSetIter { + LookSetIter { set: self } + } + + /// Return a new set that is equivalent to the original, but with the given + /// assertion added to it. If the assertion is already in the set, then the + /// returned set is equivalent to the original. + #[inline] + pub fn insert(self, look: Look) -> LookSet { + LookSet { bits: self.bits | look.as_repr() } + } + + /// Updates this set in place with the result of inserting the given + /// assertion into this set. + #[inline] + pub fn set_insert(&mut self, look: Look) { + *self = self.insert(look); + } + + /// Return a new set that is equivalent to the original, but with the given + /// assertion removed from it. If the assertion is not in the set, then the + /// returned set is equivalent to the original. + #[inline] + pub fn remove(self, look: Look) -> LookSet { + LookSet { bits: self.bits & !look.as_repr() } + } + + /// Updates this set in place with the result of removing the given + /// assertion from this set. + #[inline] + pub fn set_remove(&mut self, look: Look) { + *self = self.remove(look); + } + + /// Returns a new set that is the result of subtracting the given set from + /// this set. + #[inline] + pub fn subtract(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits & !other.bits } + } + + /// Updates this set in place with the result of subtracting the given set + /// from this set. 
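// A doctest-style sketch of the bitset combinators above: build a set,
// query it, and derive new sets without mutating the original.
use regex_automata::util::look::{Look, LookSet};

let set = LookSet::empty().insert(Look::StartLF).insert(Look::WordAscii);
assert_eq!(2, set.len());
assert!(set.contains(Look::WordAscii));
assert!(set.contains_anchor_line());
assert!(!set.contains_word_unicode());
// Iteration yields each assertion once, lowest bit first.
assert_eq!(2, set.iter().count());
// `remove` and `subtract` return new sets; `set` itself is unchanged.
assert!(!set.remove(Look::WordAscii).contains(Look::WordAscii));
assert!(set.subtract(LookSet::singleton(Look::StartLF)).contains(Look::WordAscii));
assert!(set.contains(Look::StartLF));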
+ #[inline] + pub fn set_subtract(&mut self, other: LookSet) { + *self = self.subtract(other); + } + + /// Returns a new set that is the union of this and the one given. + #[inline] + pub fn union(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits | other.bits } + } + + /// Updates this set in place with the result of unioning it with the one + /// given. + #[inline] + pub fn set_union(&mut self, other: LookSet) { + *self = self.union(other); + } + + /// Returns a new set that is the intersection of this and the one given. + #[inline] + pub fn intersect(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits & other.bits } + } + + /// Updates this set in place with the result of intersecting it with the + /// one given. + #[inline] + pub fn set_intersect(&mut self, other: LookSet) { + *self = self.intersect(other); + } + + /// Return a `LookSet` from the slice given as a native endian 32-bit + /// integer. + /// + /// # Panics + /// + /// This panics if `slice.len() < 4`. + #[inline] + pub fn read_repr(slice: &[u8]) -> LookSet { + let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); + LookSet { bits } + } + + /// Write a `LookSet` as a native endian 32-bit integer to the beginning + /// of the slice given. + /// + /// # Panics + /// + /// This panics if `slice.len() < 4`. + #[inline] + pub fn write_repr(self, slice: &mut [u8]) { + let raw = self.bits.to_ne_bytes(); + slice[0] = raw[0]; + slice[1] = raw[1]; + slice[2] = raw[2]; + slice[3] = raw[3]; + } + + /// Checks that all assertions in this set can be matched. + /// + /// Some assertions, such as Unicode word boundaries, require optional (but + /// enabled by default) tables that may not be available. If there are + /// assertions in this set that require tables that are not available, then + /// this will return an error. + /// + /// Specifically, this returns an error when the the + /// `unicode-word-boundary` feature is _not_ enabled _and_ this set + /// contains a Unicode word boundary assertion. + /// + /// It can be useful to use this on the result of + /// [`NFA::look_set_any`](crate::nfa::thompson::NFA::look_set_any) + /// when building a matcher engine to ensure methods like + /// [`LookMatcher::matches_set`] do not panic at search time. + pub fn available(self) -> Result<(), UnicodeWordBoundaryError> { + if self.contains_word_unicode() { + UnicodeWordBoundaryError::check()?; + } + Ok(()) + } +} + +impl core::fmt::Debug for LookSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + if self.is_empty() { + return write!(f, "∅"); + } + for look in self.iter() { + write!(f, "{}", look.as_char())?; + } + Ok(()) + } +} + +/// An iterator over all look-around assertions in a [`LookSet`]. +/// +/// This iterator is created by [`LookSet::iter`]. +#[derive(Clone, Debug)] +pub struct LookSetIter { + set: LookSet, +} + +impl Iterator for LookSetIter { + type Item = Look; + + #[inline] + fn next(&mut self) -> Option<Look> { + if self.set.is_empty() { + return None; + } + // We'll never have more than u8::MAX distinct look-around assertions, + // so 'bit' will always fit into a u16. + let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(1 << bit)?; + self.set = self.set.remove(look); + Some(look) + } +} + +/// A matcher for look-around assertions. +/// +/// This matcher permits configuring aspects of how look-around assertions are +/// matched. 
+/// +/// # Example +/// +/// A `LookMatcher` can change the line terminator used for matching multi-line +/// anchors such as `(?m:^)` and `(?m:$)`. +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::{self, pikevm::PikeVM}, +/// util::look::LookMatcher, +/// Match, Input, +/// }; +/// +/// let mut lookm = LookMatcher::new(); +/// lookm.set_line_terminator(b'\x00'); +/// +/// let re = PikeVM::builder() +/// .thompson(thompson::Config::new().look_matcher(lookm)) +/// .build(r"(?m)^[a-z]+$")?; +/// let mut cache = re.create_cache(); +/// +/// // Multi-line assertions now use NUL as a terminator. +/// assert_eq!( +/// Some(Match::must(0, 1..4)), +/// re.find(&mut cache, b"\x00abc\x00"), +/// ); +/// // ... and \n is no longer recognized as a terminator. +/// assert_eq!( +/// None, +/// re.find(&mut cache, b"\nabc\n"), +/// ); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct LookMatcher { + lineterm: DebugByte, +} + +impl LookMatcher { + /// Creates a new default matcher for look-around assertions. + pub fn new() -> LookMatcher { + LookMatcher { lineterm: DebugByte(b'\n') } + } + + /// Sets the line terminator for use with `(?m:^)` and `(?m:$)`. + /// + /// Namely, instead of `^` matching after `\n` and `$` matching immediately + /// before a `\n`, this will cause it to match after and before the byte + /// given. + /// + /// It can occasionally be useful to use this to configure the line + /// terminator to the NUL byte when searching binary data. + /// + /// Note that this does not apply to CRLF-aware line anchors such as + /// `(?Rm:^)` and `(?Rm:$)`. CRLF-aware line anchors are hard-coded to + /// use `\r` and `\n`. + pub fn set_line_terminator(&mut self, byte: u8) -> &mut LookMatcher { + self.lineterm.0 = byte; + self + } + + /// Returns the line terminator that was configured for this matcher. + /// + /// If no line terminator was configured, then this returns `\n`. + /// + /// Note that the line terminator should only be used for matching `(?m:^)` + /// and `(?m:$)` assertions. It specifically should _not_ be used for + /// matching the CRLF aware assertions `(?Rm:^)` and `(?Rm:$)`. + pub fn get_line_terminator(&self) -> u8 { + self.lineterm.0 + } + + /// Returns true when the position `at` in `haystack` satisfies the given + /// look-around assertion. + /// + /// # Panics + /// + /// This panics when testing any Unicode word boundary assertion in this + /// set and when the Unicode word data is not available. Specifically, this + /// only occurs when the `unicode-word-boundary` feature is not enabled. + /// + /// Since it's generally expected that this routine is called inside of + /// a matching engine, callers should check the error condition when + /// building the matching engine. If there is a Unicode word boundary + /// in the matcher and the data isn't available, then the matcher should + /// fail to build. + /// + /// Callers can check the error condition with [`LookSet::available`]. + /// + /// This also may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn matches(&self, look: Look, haystack: &[u8], at: usize) -> bool { + self.matches_inline(look, haystack, at) + } + + /// Like `matches`, but forcefully inlined. + /// + /// # Panics + /// + /// This panics when testing any Unicode word boundary assertion in this + /// set and when the Unicode word data is not available. 
Specifically, this + /// only occurs when the `unicode-word-boundary` feature is not enabled. + /// + /// Since it's generally expected that this routine is called inside of + /// a matching engine, callers should check the error condition when + /// building the matching engine. If there is a Unicode word boundary + /// in the matcher and the data isn't available, then the matcher should + /// fail to build. + /// + /// Callers can check the error condition with [`LookSet::available`]. + /// + /// This also may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn matches_inline( + &self, + look: Look, + haystack: &[u8], + at: usize, + ) -> bool { + match look { + Look::Start => self.is_start(haystack, at), + Look::End => self.is_end(haystack, at), + Look::StartLF => self.is_start_lf(haystack, at), + Look::EndLF => self.is_end_lf(haystack, at), + Look::StartCRLF => self.is_start_crlf(haystack, at), + Look::EndCRLF => self.is_end_crlf(haystack, at), + Look::WordAscii => self.is_word_ascii(haystack, at), + Look::WordAsciiNegate => self.is_word_ascii_negate(haystack, at), + Look::WordUnicode => self.is_word_unicode(haystack, at).unwrap(), + Look::WordUnicodeNegate => { + self.is_word_unicode_negate(haystack, at).unwrap() + } + Look::WordStartAscii => self.is_word_start_ascii(haystack, at), + Look::WordEndAscii => self.is_word_end_ascii(haystack, at), + Look::WordStartUnicode => { + self.is_word_start_unicode(haystack, at).unwrap() + } + Look::WordEndUnicode => { + self.is_word_end_unicode(haystack, at).unwrap() + } + Look::WordStartHalfAscii => { + self.is_word_start_half_ascii(haystack, at) + } + Look::WordEndHalfAscii => { + self.is_word_end_half_ascii(haystack, at) + } + Look::WordStartHalfUnicode => { + self.is_word_start_half_unicode(haystack, at).unwrap() + } + Look::WordEndHalfUnicode => { + self.is_word_end_half_unicode(haystack, at).unwrap() + } + } + } + + /// Returns true when _all_ of the assertions in the given set match at the + /// given position in the haystack. + /// + /// # Panics + /// + /// This panics when testing any Unicode word boundary assertion in this + /// set and when the Unicode word data is not available. Specifically, this + /// only occurs when the `unicode-word-boundary` feature is not enabled. + /// + /// Since it's generally expected that this routine is called inside of + /// a matching engine, callers should check the error condition when + /// building the matching engine. If there is a Unicode word boundary + /// in the matcher and the data isn't available, then the matcher should + /// fail to build. + /// + /// Callers can check the error condition with [`LookSet::available`]. + /// + /// This also may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn matches_set( + &self, + set: LookSet, + haystack: &[u8], + at: usize, + ) -> bool { + self.matches_set_inline(set, haystack, at) + } + + /// Like `LookSet::matches`, but forcefully inlined for perf. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn matches_set_inline( + &self, + set: LookSet, + haystack: &[u8], + at: usize, + ) -> bool { + // This used to luse LookSet::iter with Look::matches on each element, + // but that proved to be quite diastrous for perf. The manual "if + // the set has this assertion, check it" turns out to be quite a bit + // faster. 
+ if set.contains(Look::Start) { + if !self.is_start(haystack, at) { + return false; + } + } + if set.contains(Look::End) { + if !self.is_end(haystack, at) { + return false; + } + } + if set.contains(Look::StartLF) { + if !self.is_start_lf(haystack, at) { + return false; + } + } + if set.contains(Look::EndLF) { + if !self.is_end_lf(haystack, at) { + return false; + } + } + if set.contains(Look::StartCRLF) { + if !self.is_start_crlf(haystack, at) { + return false; + } + } + if set.contains(Look::EndCRLF) { + if !self.is_end_crlf(haystack, at) { + return false; + } + } + if set.contains(Look::WordAscii) { + if !self.is_word_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordAsciiNegate) { + if !self.is_word_ascii_negate(haystack, at) { + return false; + } + } + if set.contains(Look::WordUnicode) { + if !self.is_word_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordUnicodeNegate) { + if !self.is_word_unicode_negate(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordStartAscii) { + if !self.is_word_start_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordEndAscii) { + if !self.is_word_end_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordStartUnicode) { + if !self.is_word_start_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordEndUnicode) { + if !self.is_word_end_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordStartHalfAscii) { + if !self.is_word_start_half_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordEndHalfAscii) { + if !self.is_word_end_half_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordStartHalfUnicode) { + if !self.is_word_start_half_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordEndHalfUnicode) { + if !self.is_word_end_half_unicode(haystack, at).unwrap() { + return false; + } + } + true + } + + /// Split up the given byte classes into equivalence classes in a way that + /// is consistent with this look-around assertion. + #[cfg(feature = "alloc")] + pub(crate) fn add_to_byteset( + &self, + look: Look, + set: &mut crate::util::alphabet::ByteClassSet, + ) { + match look { + Look::Start | Look::End => {} + Look::StartLF | Look::EndLF => { + set.set_range(self.lineterm.0, self.lineterm.0); + } + Look::StartCRLF | Look::EndCRLF => { + set.set_range(b'\r', b'\r'); + set.set_range(b'\n', b'\n'); + } + Look::WordAscii + | Look::WordAsciiNegate + | Look::WordUnicode + | Look::WordUnicodeNegate + | Look::WordStartAscii + | Look::WordEndAscii + | Look::WordStartUnicode + | Look::WordEndUnicode + | Look::WordStartHalfAscii + | Look::WordEndHalfAscii + | Look::WordStartHalfUnicode + | Look::WordEndHalfUnicode => { + // We need to mark all ranges of bytes whose pairs result in + // evaluating \b differently. This isn't technically correct + // for Unicode word boundaries, but DFAs can't handle those + // anyway, and thus, the byte classes don't need to either + // since they are themselves only used in DFAs. + // + // FIXME: It seems like the calls to 'set_range' here are + // completely invariant, which means we could just hard-code + // them here without needing to write a loop. And we only need + // to do this dance at most once per regex. + // + // FIXME: Is this correct for \B? 
+ let iswb = utf8::is_word_byte; + // This unwrap is OK because we guard every use of 'asu8' with + // a check that the input is <= 255. + let asu8 = |b: u16| u8::try_from(b).unwrap(); + let mut b1: u16 = 0; + let mut b2: u16; + while b1 <= 255 { + b2 = b1 + 1; + while b2 <= 255 && iswb(asu8(b1)) == iswb(asu8(b2)) { + b2 += 1; + } + // The guards above guarantee that b2 can never get any + // bigger. + assert!(b2 <= 256); + // Subtracting 1 from b2 is always OK because it is always + // at least 1 greater than b1, and the assert above + // guarantees that the asu8 conversion will succeed. + set.set_range(asu8(b1), asu8(b2.checked_sub(1).unwrap())); + b1 = b2; + } + } + } + } + + /// Returns true when [`Look::Start`] is satisfied `at` the given position + /// in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_start(&self, _haystack: &[u8], at: usize) -> bool { + at == 0 + } + + /// Returns true when [`Look::End`] is satisfied `at` the given position in + /// `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_end(&self, haystack: &[u8], at: usize) -> bool { + at == haystack.len() + } + + /// Returns true when [`Look::StartLF`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_start_lf(&self, haystack: &[u8], at: usize) -> bool { + self.is_start(haystack, at) || haystack[at - 1] == self.lineterm.0 + } + + /// Returns true when [`Look::EndLF`] is satisfied `at` the given position + /// in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_end_lf(&self, haystack: &[u8], at: usize) -> bool { + self.is_end(haystack, at) || haystack[at] == self.lineterm.0 + } + + /// Returns true when [`Look::StartCRLF`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_start_crlf(&self, haystack: &[u8], at: usize) -> bool { + self.is_start(haystack, at) + || haystack[at - 1] == b'\n' + || (haystack[at - 1] == b'\r' + && (at >= haystack.len() || haystack[at] != b'\n')) + } + + /// Returns true when [`Look::EndCRLF`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_end_crlf(&self, haystack: &[u8], at: usize) -> bool { + self.is_end(haystack, at) + || haystack[at] == b'\r' + || (haystack[at] == b'\n' + && (at == 0 || haystack[at - 1] != b'\r')) + } + + /// Returns true when [`Look::WordAscii`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. 
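// A doctest-style sketch of the `matches` and `matches_set` routines
// documented above, using only assertions that need no Unicode tables.
use regex_automata::util::look::{Look, LookMatcher, LookSet};

let lookm = LookMatcher::new();
let hay = b"foo\nbar";
assert!(lookm.matches(Look::Start, hay, 0));
assert!(lookm.matches(Look::StartLF, hay, 4)); // just after the '\n'
assert!(lookm.matches(Look::EndLF, hay, 3));   // just before the '\n'
// With a set, every assertion must hold at the given position.
let set = LookSet::singleton(Look::StartLF).insert(Look::WordStartAscii);
assert!(lookm.matches_set(set, hay, 4));
assert!(!lookm.matches_set(set, hay, 3));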
+ #[inline] + pub fn is_word_ascii(&self, haystack: &[u8], at: usize) -> bool { + let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + word_before != word_after + } + + /// Returns true when [`Look::WordAsciiNegate`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_word_ascii_negate(&self, haystack: &[u8], at: usize) -> bool { + !self.is_word_ascii(haystack, at) + } + + /// Returns true when [`Look::WordUnicode`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. + #[inline] + pub fn is_word_unicode( + &self, + haystack: &[u8], + at: usize, + ) -> Result<bool, UnicodeWordBoundaryError> { + let word_before = is_word_char::rev(haystack, at)?; + let word_after = is_word_char::fwd(haystack, at)?; + Ok(word_before != word_after) + } + + /// Returns true when [`Look::WordUnicodeNegate`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. + #[inline] + pub fn is_word_unicode_negate( + &self, + haystack: &[u8], + at: usize, + ) -> Result<bool, UnicodeWordBoundaryError> { + // This is pretty subtle. Why do we need to do UTF-8 decoding here? + // Well... at time of writing, the is_word_char_{fwd,rev} routines will + // only return true if there is a valid UTF-8 encoding of a "word" + // codepoint, and false in every other case (including invalid UTF-8). + // This means that in regions of invalid UTF-8 (which might be a + // subset of valid UTF-8!), it would result in \B matching. While this + // would be questionable in the context of truly invalid UTF-8, it is + // *certainly* wrong to report match boundaries that split the encoding + // of a codepoint. So to work around this, we ensure that we can decode + // a codepoint on either side of `at`. If either direction fails, then + // we don't permit \B to match at all. + // + // Now, this isn't exactly optimal from a perf perspective. We could + // try and detect this in is_word_char::{fwd,rev}, but it's not clear + // if it's worth it. \B is, after all, rarely used. Even worse, + // is_word_char::{fwd,rev} could do its own UTF-8 decoding, and so this + // will wind up doing UTF-8 decoding twice. Owch. We could fix this + // with more code complexity, but it just doesn't feel worth it for \B. + // + // And in particular, we do *not* have to do this with \b, because \b + // *requires* that at least one side of `at` be a "word" codepoint, + // which in turn implies one side of `at` must be valid UTF-8. This in + // turn implies that \b can never split a valid UTF-8 encoding of a + // codepoint. 
In the case where one side of `at` is truly invalid UTF-8 + // and the other side IS a word codepoint, then we want \b to match + // since it represents a valid UTF-8 boundary. It also makes sense. For + // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'. + // + // Note also that this is not just '!is_word_unicode(..)' like it is + // for the ASCII case. For example, neither \b nor \B is satisfied + // within invalid UTF-8 sequences. + let word_before = at > 0 + && match utf8::decode_last(&haystack[..at]) { + None | Some(Err(_)) => return Ok(false), + Some(Ok(_)) => is_word_char::rev(haystack, at)?, + }; + let word_after = at < haystack.len() + && match utf8::decode(&haystack[at..]) { + None | Some(Err(_)) => return Ok(false), + Some(Ok(_)) => is_word_char::fwd(haystack, at)?, + }; + Ok(word_before == word_after) + } + + /// Returns true when [`Look::WordStartAscii`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_word_start_ascii(&self, haystack: &[u8], at: usize) -> bool { + let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + !word_before && word_after + } + + /// Returns true when [`Look::WordEndAscii`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_word_end_ascii(&self, haystack: &[u8], at: usize) -> bool { + let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + word_before && !word_after + } + + /// Returns true when [`Look::WordStartUnicode`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. + #[inline] + pub fn is_word_start_unicode( + &self, + haystack: &[u8], + at: usize, + ) -> Result<bool, UnicodeWordBoundaryError> { + let word_before = is_word_char::rev(haystack, at)?; + let word_after = is_word_char::fwd(haystack, at)?; + Ok(!word_before && word_after) + } + + /// Returns true when [`Look::WordEndUnicode`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. + #[inline] + pub fn is_word_end_unicode( + &self, + haystack: &[u8], + at: usize, + ) -> Result<bool, UnicodeWordBoundaryError> { + let word_before = is_word_char::rev(haystack, at)?; + let word_after = is_word_char::fwd(haystack, at)?; + Ok(word_before && !word_after) + } + + /// Returns true when [`Look::WordStartHalfAscii`] is satisfied `at` the + /// given position in `haystack`. 
+ /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_word_start_half_ascii( + &self, + haystack: &[u8], + at: usize, + ) -> bool { + let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); + !word_before + } + + /// Returns true when [`Look::WordEndHalfAscii`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_word_end_half_ascii(&self, haystack: &[u8], at: usize) -> bool { + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + !word_after + } + + /// Returns true when [`Look::WordStartHalfUnicode`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. + #[inline] + pub fn is_word_start_half_unicode( + &self, + haystack: &[u8], + at: usize, + ) -> Result<bool, UnicodeWordBoundaryError> { + // See `is_word_unicode_negate` for why we need to do this. We don't + // need to do it for `is_word_start_unicode` because that guarantees + // that the position matched falls on a valid UTF-8 boundary given + // that the right side must be in \w. + let word_before = at > 0 + && match utf8::decode_last(&haystack[..at]) { + None | Some(Err(_)) => return Ok(false), + Some(Ok(_)) => is_word_char::rev(haystack, at)?, + }; + Ok(!word_before) + } + + /// Returns true when [`Look::WordEndHalfUnicode`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. + #[inline] + pub fn is_word_end_half_unicode( + &self, + haystack: &[u8], + at: usize, + ) -> Result<bool, UnicodeWordBoundaryError> { + // See `is_word_unicode_negate` for why we need to do this. We don't + // need to do it for `is_word_end_unicode` because that guarantees + // that the position matched falls on a valid UTF-8 boundary given + // that the left side must be in \w. + let word_after = at < haystack.len() + && match utf8::decode(&haystack[at..]) { + None | Some(Err(_)) => return Ok(false), + Some(Ok(_)) => is_word_char::fwd(haystack, at)?, + }; + Ok(!word_after) + } +} + +impl Default for LookMatcher { + fn default() -> LookMatcher { + LookMatcher::new() + } +} + +/// An error that occurs when the Unicode-aware `\w` class is unavailable. +/// +/// This error can occur when the data tables necessary for the Unicode aware +/// Perl character class `\w` are unavailable. The `\w` class is used to +/// determine whether a codepoint is considered a word character or not when +/// determining whether a Unicode aware `\b` (or `\B`) matches at a particular +/// position. +/// +/// This error can only occur when the `unicode-word-boundary` feature is +/// disabled. 
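// A sketch of the ASCII vs. Unicode word predicates and of checking
// availability up front. This assumes the default `unicode-word-boundary`
// feature; without it, the Unicode calls below return this error instead.
use regex_automata::util::look::{Look, LookMatcher, LookSet};

let lookm = LookMatcher::new();
let hay = "𝛃".as_bytes(); // 𝛃 is in \w, but is not ASCII
// The ASCII-only check sees no word byte on either side of position 0...
assert!(!lookm.is_word_ascii(hay, 0));
// ...while the Unicode-aware check reports a boundary there.
assert_eq!(Some(true), lookm.is_word_unicode(hay, 0).ok());
// Engines can verify once, at build time, that the required tables exist.
assert!(LookSet::singleton(Look::WordUnicode).available().is_ok());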
+#[derive(Clone, Debug)] +pub struct UnicodeWordBoundaryError(()); + +impl UnicodeWordBoundaryError { + #[cfg(not(feature = "unicode-word-boundary"))] + pub(crate) fn new() -> UnicodeWordBoundaryError { + UnicodeWordBoundaryError(()) + } + + /// Returns an error if and only if Unicode word boundary data is + /// unavailable. + pub fn check() -> Result<(), UnicodeWordBoundaryError> { + is_word_char::check() + } +} + +#[cfg(feature = "std")] +impl std::error::Error for UnicodeWordBoundaryError {} + +impl core::fmt::Display for UnicodeWordBoundaryError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!( + f, + "Unicode-aware \\b and \\B are unavailable because the \ + requisite data tables are missing, please enable the \ + unicode-word-boundary feature" + ) + } +} + +// Below are FOUR different ways for checking whether whether a "word" +// codepoint exists at a particular position in the haystack. The four +// different approaches are, in order of preference: +// +// 1. Parse '\w', convert to an NFA, convert to a fully compiled DFA on the +// first call, and then use that DFA for all subsequent calls. +// 2. Do UTF-8 decoding and use regex_syntax::is_word_character if available. +// 3. Do UTF-8 decoding and use our own 'perl_word' table. +// 4. Return an error. +// +// The reason for all of these approaches is a combination of perf and +// permitting one to build regex-automata without the Unicode data necessary +// for handling Unicode-aware word boundaries. (In which case, '(?-u:\b)' would +// still work.) +// +// The DFA approach is the fastest, but it requires the regex parser, the +// NFA compiler, the DFA builder and the DFA search runtime. That's a lot to +// bring in, but if it's available, it's (probably) the best we can do. +// +// Approaches (2) and (3) are effectively equivalent, but (2) reuses the +// data in regex-syntax and avoids duplicating it in regex-automata. +// +// Finally, (4) unconditionally returns an error since the requisite data isn't +// available anywhere. +// +// There are actually more approaches possible that we didn't implement. For +// example, if the DFA builder is available but the syntax parser is not, we +// could technically hand construct our own NFA from the 'perl_word' data +// table. But to avoid some pretty hairy code duplication, we would in turn +// need to pull the UTF-8 compiler out of the NFA compiler. Yikes. +// +// A possibly more sensible alternative is to use a lazy DFA when the full +// DFA builder isn't available... +// +// Yet another choice would be to build the full DFA and then embed it into the +// source. Then we'd only need to bring in the DFA search runtime, which is +// considerably smaller than the DFA builder code. The problem here is that the +// Debian people have spooked me[1] into avoiding cyclic dependencies. Namely, +// we'd need to build regex-cli, which depends on regex-automata in order to +// build some part of regex-automata. But to be honest, something like this has +// to be allowed somehow? I just don't know what the right process is. +// +// There are perhaps other choices as well. Why did I stop at these 4? Because +// I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA +// approach eventually, as the benefits of the DFA approach are somewhat +// compelling. The 'boundary-words-holmes' benchmark tests this. (Note that +// the commands below no longer work. If necessary, we should re-capitulate +// the benchmark from whole cloth in rebar.) 
+// +// $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv +// +// Then I changed the code below so that the util/unicode_data/perl_word table +// was used and re-ran the benchmark: +// +// $ regex-cli bench measure -f boundary-words-holmes -e pikevm > table.csv +// +// And compared them: +// +// $ regex-cli bench diff dfa.csv table.csv +// benchmark engine dfa table +// --------- ------ --- ----- +// internal/count/boundary-words-holmes regex/automata/pikevm 18.6 MB/s 12.9 MB/s +// +// Which is a nice improvement. +// +// UPDATE: It turns out that it takes approximately 22ms to build the reverse +// DFA for \w. (And about 3ms for the forward DFA.) It's probably not much in +// the grand scheme things, but that is a significant latency cost. So I'm not +// sure that's a good idea. I then tried using a lazy DFA instead, and that +// eliminated the overhead, but since the lazy DFA requires mutable working +// memory, that requires introducing a 'Cache' for every simultaneous call. +// +// I ended up deciding for now to just keep the "UTF-8 decode and check the +// table." The DFA and lazy DFA approaches are still below, but commented out. +// +// [1]: https://github.com/BurntSushi/ucd-generate/issues/11 + +/* +/// A module that looks for word codepoints using lazy DFAs. +#[cfg(all( + feature = "unicode-word-boundary", + feature = "syntax", + feature = "unicode-perl", + feature = "hybrid" +))] +mod is_word_char { + use alloc::vec::Vec; + + use crate::{ + hybrid::dfa::{Cache, DFA}, + nfa::thompson::NFA, + util::{lazy::Lazy, pool::Pool, primitives::StateID}, + Anchored, Input, + }; + + pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { + Ok(()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn fwd( + haystack: &[u8], + mut at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + static WORD: Lazy<DFA> = Lazy::new(|| DFA::new(r"\w").unwrap()); + static CACHE: Lazy<Pool<Cache>> = + Lazy::new(|| Pool::new(|| WORD.create_cache())); + let dfa = Lazy::get(&WORD); + let mut cache = Lazy::get(&CACHE).get(); + let mut sid = dfa + .start_state_forward( + &mut cache, + &Input::new("").anchored(Anchored::Yes), + ) + .unwrap(); + while at < haystack.len() { + let byte = haystack[at]; + sid = dfa.next_state(&mut cache, sid, byte).unwrap(); + at += 1; + if sid.is_tagged() { + if sid.is_match() { + return Ok(true); + } else if sid.is_dead() { + return Ok(false); + } + } + } + Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn rev( + haystack: &[u8], + mut at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + static WORD: Lazy<DFA> = Lazy::new(|| { + DFA::builder() + .thompson(NFA::config().reverse(true)) + .build(r"\w") + .unwrap() + }); + static CACHE: Lazy<Pool<Cache>> = + Lazy::new(|| Pool::new(|| WORD.create_cache())); + let dfa = Lazy::get(&WORD); + let mut cache = Lazy::get(&CACHE).get(); + let mut sid = dfa + .start_state_reverse( + &mut cache, + &Input::new("").anchored(Anchored::Yes), + ) + .unwrap(); + while at > 0 { + at -= 1; + let byte = haystack[at]; + sid = dfa.next_state(&mut cache, sid, byte).unwrap(); + if sid.is_tagged() { + if sid.is_match() { + return Ok(true); + } else if sid.is_dead() { + return Ok(false); + } + } + } + Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match()) + } +} +*/ + +/* +/// A module that looks for word codepoints using fully compiled DFAs. 
+#[cfg(all( + feature = "unicode-word-boundary", + feature = "syntax", + feature = "unicode-perl", + feature = "dfa-build" +))] +mod is_word_char { + use alloc::vec::Vec; + + use crate::{ + dfa::{dense::DFA, Automaton, StartKind}, + nfa::thompson::NFA, + util::{lazy::Lazy, primitives::StateID}, + Anchored, Input, + }; + + pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { + Ok(()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn fwd( + haystack: &[u8], + mut at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| { + let dfa = DFA::builder() + .configure(DFA::config().start_kind(StartKind::Anchored)) + .build(r"\w") + .unwrap(); + // OK because our regex has no look-around. + let start_id = dfa.universal_start_state(Anchored::Yes).unwrap(); + (dfa, start_id) + }); + let &(ref dfa, mut sid) = Lazy::get(&WORD); + while at < haystack.len() { + let byte = haystack[at]; + sid = dfa.next_state(sid, byte); + at += 1; + if dfa.is_special_state(sid) { + if dfa.is_match_state(sid) { + return Ok(true); + } else if dfa.is_dead_state(sid) { + return Ok(false); + } + } + } + Ok(dfa.is_match_state(dfa.next_eoi_state(sid))) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn rev( + haystack: &[u8], + mut at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| { + let dfa = DFA::builder() + .configure(DFA::config().start_kind(StartKind::Anchored)) + // From ad hoc measurements, it looks like setting + // shrink==false is slightly faster than shrink==true. I kind + // of feel like this indicates that shrinking is probably a + // failure, although it can help in some cases. Sigh. + .thompson(NFA::config().reverse(true).shrink(false)) + .build(r"\w") + .unwrap(); + // OK because our regex has no look-around. + let start_id = dfa.universal_start_state(Anchored::Yes).unwrap(); + (dfa, start_id) + }); + let &(ref dfa, mut sid) = Lazy::get(&WORD); + while at > 0 { + at -= 1; + let byte = haystack[at]; + sid = dfa.next_state(sid, byte); + if dfa.is_special_state(sid) { + if dfa.is_match_state(sid) { + return Ok(true); + } else if dfa.is_dead_state(sid) { + return Ok(false); + } + } + } + Ok(dfa.is_match_state(dfa.next_eoi_state(sid))) + } +} +*/ + +/// A module that looks for word codepoints using regex-syntax's data tables. 
+#[cfg(all( + feature = "unicode-word-boundary", + feature = "syntax", + feature = "unicode-perl", +))] +mod is_word_char { + use regex_syntax::try_is_word_character; + + use crate::util::utf8; + + pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { + Ok(()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn fwd( + haystack: &[u8], + at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Ok(match utf8::decode(&haystack[at..]) { + None | Some(Err(_)) => false, + Some(Ok(ch)) => try_is_word_character(ch).expect( + "since unicode-word-boundary, syntax and unicode-perl \ + are all enabled, it is expected that \ + try_is_word_character succeeds", + ), + }) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn rev( + haystack: &[u8], + at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Ok(match utf8::decode_last(&haystack[..at]) { + None | Some(Err(_)) => false, + Some(Ok(ch)) => try_is_word_character(ch).expect( + "since unicode-word-boundary, syntax and unicode-perl \ + are all enabled, it is expected that \ + try_is_word_character succeeds", + ), + }) + } +} + +/// A module that looks for word codepoints using regex-automata's data tables +/// (which are only compiled when regex-syntax's tables aren't available). +/// +/// Note that the cfg should match the one in src/util/unicode_data/mod.rs for +/// perl_word. +#[cfg(all( + feature = "unicode-word-boundary", + not(all(feature = "syntax", feature = "unicode-perl")), +))] +mod is_word_char { + use crate::util::utf8; + + pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { + Ok(()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn fwd( + haystack: &[u8], + at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Ok(match utf8::decode(&haystack[at..]) { + None | Some(Err(_)) => false, + Some(Ok(ch)) => is_word_character(ch), + }) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn rev( + haystack: &[u8], + at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Ok(match utf8::decode_last(&haystack[..at]) { + None | Some(Err(_)) => false, + Some(Ok(ch)) => is_word_character(ch), + }) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_word_character(c: char) -> bool { + use crate::util::{unicode_data::perl_word::PERL_WORD, utf8}; + + if u8::try_from(c).map_or(false, utf8::is_word_byte) { + return true; + } + PERL_WORD + .binary_search_by(|&(start, end)| { + use core::cmp::Ordering; + + if start <= c && c <= end { + Ordering::Equal + } else if start > c { + Ordering::Greater + } else { + Ordering::Less + } + }) + .is_ok() + } +} + +/// A module that always returns an error if Unicode word boundaries are +/// disabled. When this feature is disabled, then regex-automata will not +/// include its own data tables even if regex-syntax is disabled. 
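// A standalone sketch of the range lookup that the table-based fallback
// above performs: a binary search over sorted, inclusive (start, end)
// codepoint ranges. The tiny RANGES table here is hypothetical; the real
// data lives in util/unicode_data/perl_word.
use core::cmp::Ordering;

fn in_ranges(ranges: &[(char, char)], c: char) -> bool {
    ranges
        .binary_search_by(|&(start, end)| {
            if start <= c && c <= end {
                Ordering::Equal
            } else if start > c {
                Ordering::Greater
            } else {
                Ordering::Less
            }
        })
        .is_ok()
}

// Hypothetical, deliberately small table: ASCII letters plus Greek letters.
const RANGES: &[(char, char)] = &[('A', 'Z'), ('a', 'z'), ('Α', 'ω')];
assert!(in_ranges(RANGES, 'q'));
assert!(in_ranges(RANGES, 'β'));
assert!(!in_ranges(RANGES, ' '));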
+#[cfg(not(feature = "unicode-word-boundary"))] +mod is_word_char { + pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { + Err(super::UnicodeWordBoundaryError::new()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn fwd( + _bytes: &[u8], + _at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Err(super::UnicodeWordBoundaryError::new()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn rev( + _bytes: &[u8], + _at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Err(super::UnicodeWordBoundaryError::new()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + macro_rules! testlook { + ($look:expr, $haystack:expr, $at:expr) => { + LookMatcher::default().matches($look, $haystack.as_bytes(), $at) + }; + } + + #[test] + fn look_matches_start_line() { + let look = Look::StartLF; + + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "\n", 0)); + assert!(testlook!(look, "\n", 1)); + assert!(testlook!(look, "a", 0)); + assert!(testlook!(look, "\na", 1)); + + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a\na", 1)); + } + + #[test] + fn look_matches_end_line() { + let look = Look::EndLF; + + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "\n", 1)); + assert!(testlook!(look, "\na", 0)); + assert!(testlook!(look, "\na", 2)); + assert!(testlook!(look, "a\na", 1)); + + assert!(!testlook!(look, "a", 0)); + assert!(!testlook!(look, "\na", 1)); + assert!(!testlook!(look, "a\na", 0)); + assert!(!testlook!(look, "a\na", 2)); + } + + #[test] + fn look_matches_start_text() { + let look = Look::Start; + + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "\n", 0)); + assert!(testlook!(look, "a", 0)); + + assert!(!testlook!(look, "\n", 1)); + assert!(!testlook!(look, "\na", 1)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a\na", 1)); + } + + #[test] + fn look_matches_end_text() { + let look = Look::End; + + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "\n", 1)); + assert!(testlook!(look, "\na", 2)); + + assert!(!testlook!(look, "\na", 0)); + assert!(!testlook!(look, "a\na", 1)); + assert!(!testlook!(look, "a", 0)); + assert!(!testlook!(look, "\na", 1)); + assert!(!testlook!(look, "a\na", 0)); + assert!(!testlook!(look, "a\na", 2)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_unicode() { + let look = Look::WordUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. 
+ assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_ascii() { + let look = Look::WordAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_unicode_negate() { + let look = Look::WordUnicodeNegate; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. 
+ assert!(testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + // These don't match because they could otherwise return an offset that + // splits the UTF-8 encoding of a codepoint. + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. These also don't + // match because they could otherwise return an offset that splits the + // UTF-8 encoding of a codepoint. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end + // of the haystack. So the "end" of the haystack isn't a word and 𐆀 + // isn't a word, thus, \B matches. + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_ascii_negate() { + let look = Look::WordAsciiNegate; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(testlook!(look, "𝛃", 1)); + assert!(testlook!(look, "𝛃", 2)); + assert!(testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 1)); + assert!(testlook!(look, "𝛃𐆀", 2)); + assert!(testlook!(look, "𝛃𐆀", 3)); + assert!(testlook!(look, "𝛃𐆀", 5)); + assert!(testlook!(look, "𝛃𐆀", 6)); + assert!(testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_start_ascii() { + let look = Look::WordStartAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. 
Again, since + // this is an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_end_ascii() { + let look = Look::WordEndAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_start_unicode() { + let look = Look::WordStartUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. 
+ assert!(testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_end_unicode() { + let look = Look::WordEndUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_start_half_ascii() { + let look = Look::WordStartHalfAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. 
+ assert!(testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(testlook!(look, "𝛃", 1)); + assert!(testlook!(look, "𝛃", 2)); + assert!(testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 1)); + assert!(testlook!(look, "𝛃𐆀", 2)); + assert!(testlook!(look, "𝛃𐆀", 3)); + assert!(testlook!(look, "𝛃𐆀", 5)); + assert!(testlook!(look, "𝛃𐆀", 6)); + assert!(testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_end_half_ascii() { + let look = Look::WordEndHalfAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(testlook!(look, "𝛃", 1)); + assert!(testlook!(look, "𝛃", 2)); + assert!(testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 1)); + assert!(testlook!(look, "𝛃𐆀", 2)); + assert!(testlook!(look, "𝛃𐆀", 3)); + assert!(testlook!(look, "𝛃𐆀", 5)); + assert!(testlook!(look, "𝛃𐆀", 6)); + assert!(testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_start_half_unicode() { + let look = Look::WordStartHalfUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. 
+ assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_end_half_unicode() { + let look = Look::WordEndHalfUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. 
+ assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_set() { + let mut f = LookSet::default(); + assert!(!f.contains(Look::Start)); + assert!(!f.contains(Look::End)); + assert!(!f.contains(Look::StartLF)); + assert!(!f.contains(Look::EndLF)); + assert!(!f.contains(Look::WordUnicode)); + assert!(!f.contains(Look::WordUnicodeNegate)); + assert!(!f.contains(Look::WordAscii)); + assert!(!f.contains(Look::WordAsciiNegate)); + + f = f.insert(Look::Start); + assert!(f.contains(Look::Start)); + f = f.remove(Look::Start); + assert!(!f.contains(Look::Start)); + + f = f.insert(Look::End); + assert!(f.contains(Look::End)); + f = f.remove(Look::End); + assert!(!f.contains(Look::End)); + + f = f.insert(Look::StartLF); + assert!(f.contains(Look::StartLF)); + f = f.remove(Look::StartLF); + assert!(!f.contains(Look::StartLF)); + + f = f.insert(Look::EndLF); + assert!(f.contains(Look::EndLF)); + f = f.remove(Look::EndLF); + assert!(!f.contains(Look::EndLF)); + + f = f.insert(Look::StartCRLF); + assert!(f.contains(Look::StartCRLF)); + f = f.remove(Look::StartCRLF); + assert!(!f.contains(Look::StartCRLF)); + + f = f.insert(Look::EndCRLF); + assert!(f.contains(Look::EndCRLF)); + f = f.remove(Look::EndCRLF); + assert!(!f.contains(Look::EndCRLF)); + + f = f.insert(Look::WordUnicode); + assert!(f.contains(Look::WordUnicode)); + f = f.remove(Look::WordUnicode); + assert!(!f.contains(Look::WordUnicode)); + + f = f.insert(Look::WordUnicodeNegate); + assert!(f.contains(Look::WordUnicodeNegate)); + f = f.remove(Look::WordUnicodeNegate); + assert!(!f.contains(Look::WordUnicodeNegate)); + + f = f.insert(Look::WordAscii); + assert!(f.contains(Look::WordAscii)); + f = f.remove(Look::WordAscii); + assert!(!f.contains(Look::WordAscii)); + + f = f.insert(Look::WordAsciiNegate); + assert!(f.contains(Look::WordAsciiNegate)); + f = f.remove(Look::WordAsciiNegate); + assert!(!f.contains(Look::WordAsciiNegate)); + + f = f.insert(Look::WordStartAscii); + assert!(f.contains(Look::WordStartAscii)); + f = f.remove(Look::WordStartAscii); + assert!(!f.contains(Look::WordStartAscii)); + + f = f.insert(Look::WordEndAscii); + assert!(f.contains(Look::WordEndAscii)); + f = f.remove(Look::WordEndAscii); + assert!(!f.contains(Look::WordEndAscii)); + + f = f.insert(Look::WordStartUnicode); + assert!(f.contains(Look::WordStartUnicode)); + f = f.remove(Look::WordStartUnicode); + assert!(!f.contains(Look::WordStartUnicode)); + + f = f.insert(Look::WordEndUnicode); + assert!(f.contains(Look::WordEndUnicode)); + f = f.remove(Look::WordEndUnicode); + assert!(!f.contains(Look::WordEndUnicode)); + + f = f.insert(Look::WordStartHalfAscii); + assert!(f.contains(Look::WordStartHalfAscii)); + f = f.remove(Look::WordStartHalfAscii); + assert!(!f.contains(Look::WordStartHalfAscii)); + + f = f.insert(Look::WordEndHalfAscii); + assert!(f.contains(Look::WordEndHalfAscii)); + f = f.remove(Look::WordEndHalfAscii); + assert!(!f.contains(Look::WordEndHalfAscii)); + + f = f.insert(Look::WordStartHalfUnicode); + assert!(f.contains(Look::WordStartHalfUnicode)); + f = f.remove(Look::WordStartHalfUnicode); + assert!(!f.contains(Look::WordStartHalfUnicode)); + + f = f.insert(Look::WordEndHalfUnicode); + assert!(f.contains(Look::WordEndHalfUnicode)); + f = f.remove(Look::WordEndHalfUnicode); + 
assert!(!f.contains(Look::WordEndHalfUnicode)); + } + + #[test] + fn look_set_iter() { + let set = LookSet::empty(); + assert_eq!(0, set.iter().count()); + + let set = LookSet::full(); + assert_eq!(18, set.iter().count()); + + let set = + LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); + assert_eq!(2, set.iter().count()); + + let set = LookSet::empty().insert(Look::StartLF); + assert_eq!(1, set.iter().count()); + + let set = LookSet::empty().insert(Look::WordAsciiNegate); + assert_eq!(1, set.iter().count()); + + let set = LookSet::empty().insert(Look::WordEndHalfUnicode); + assert_eq!(1, set.iter().count()); + } + + #[test] + #[cfg(feature = "alloc")] + fn look_set_debug() { + let res = alloc::format!("{:?}", LookSet::empty()); + assert_eq!("∅", res); + let res = alloc::format!("{:?}", LookSet::full()); + assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res); + } +} diff --git a/vendor/regex-automata/src/util/memchr.rs b/vendor/regex-automata/src/util/memchr.rs new file mode 100644 index 0000000..a2cbb07 --- /dev/null +++ b/vendor/regex-automata/src/util/memchr.rs @@ -0,0 +1,93 @@ +/*! +This module defines simple wrapper routines for the memchr functions from the +`memchr` crate. Basically, when the `memchr` crate is available, we use it, +otherwise we use a naive implementation which is still pretty fast. +*/ + +pub(crate) use self::inner::*; + +#[cfg(feature = "perf-literal-substring")] +pub(super) mod inner { + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { + memchr::memchr(n1, haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + memchr::memchr2(n1, n2, haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], + ) -> Option<usize> { + memchr::memchr3(n1, n2, n3, haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { + memchr::memrchr(n1, haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + memchr::memrchr2(n1, n2, haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memrchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], + ) -> Option<usize> { + memchr::memrchr3(n1, n2, n3, haystack) + } +} + +#[cfg(not(feature = "perf-literal-substring"))] +pub(super) mod inner { + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().position(|&b| b == n1) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().position(|&b| b == n1 || b == n2) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], + ) -> Option<usize> { + haystack.iter().position(|&b| b == n1 || b == n2 || b == n3) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().rposition(|&b| b == n1) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().rposition(|&b| b == n1 || b == n2) + } + + #[cfg_attr(feature = 
"perf-inline", inline(always))] + pub(crate) fn memrchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], + ) -> Option<usize> { + haystack.iter().rposition(|&b| b == n1 || b == n2 || b == n3) + } +} diff --git a/vendor/regex-automata/src/util/mod.rs b/vendor/regex-automata/src/util/mod.rs new file mode 100644 index 0000000..b3eef64 --- /dev/null +++ b/vendor/regex-automata/src/util/mod.rs @@ -0,0 +1,57 @@ +/*! +A collection of modules that provide APIs that are useful across many regex +engines. + +While one should explore the sub-modules directly to get a sense of what's +there, here are some highlights that tie the sub-modules to higher level +use cases: + +* `alphabet` contains APIs that are useful if you're doing low level things +with the DFAs in this crate. For example, implementing determinization or +walking its state graph directly. +* `captures` contains APIs for dealing with capture group matches and their +mapping to "slots" used inside an NFA graph. This is also where you can find +iterators over capture group names. +* `escape` contains types for pretty-printing raw byte slices as strings. +* `iter` contains API helpers for writing regex iterators. +* `lazy` contains a no-std and no-alloc variant of `lazy_static!` and +`once_cell`. +* `look` contains APIs for matching and configuring look-around assertions. +* `pool` provides a way to reuse mutable memory allocated in a thread safe +manner. +* `prefilter` provides APIs for building prefilters and using them in searches. +* `primitives` are what you might use if you're doing lower level work on +automata, such as walking an NFA state graph. +* `syntax` provides some higher level convenience functions for interacting +with the `regex-syntax` crate. +* `wire` is useful if you're working with DFA serialization. +*/ + +pub mod alphabet; +#[cfg(feature = "alloc")] +pub mod captures; +pub mod escape; +#[cfg(feature = "alloc")] +pub mod interpolate; +pub mod iter; +pub mod lazy; +pub mod look; +#[cfg(feature = "alloc")] +pub mod pool; +pub mod prefilter; +pub mod primitives; +pub mod start; +#[cfg(feature = "syntax")] +pub mod syntax; +pub mod wire; + +#[cfg(any(feature = "dfa-build", feature = "hybrid"))] +pub(crate) mod determinize; +pub(crate) mod empty; +pub(crate) mod int; +pub(crate) mod memchr; +pub(crate) mod search; +#[cfg(feature = "alloc")] +pub(crate) mod sparse_set; +pub(crate) mod unicode_data; +pub(crate) mod utf8; diff --git a/vendor/regex-automata/src/util/pool.rs b/vendor/regex-automata/src/util/pool.rs new file mode 100644 index 0000000..d90d4ec --- /dev/null +++ b/vendor/regex-automata/src/util/pool.rs @@ -0,0 +1,1199 @@ +// This module provides a relatively simple thread-safe pool of reusable +// objects. For the most part, it's implemented by a stack represented by a +// Mutex<Vec<T>>. It has one small trick: because unlocking a mutex is somewhat +// costly, in the case where a pool is accessed by the first thread that tried +// to get a value, we bypass the mutex. Here are some benchmarks showing the +// difference. +// +// 2022-10-15: These benchmarks are from the old regex crate and they aren't +// easy to reproduce because some rely on older implementations of Pool that +// are no longer around. I've left the results here for posterity, but any +// enterprising individual should feel encouraged to re-litigate the way Pool +// works. I am not at all certain it is the best approach. 
+//
+// 1) misc::anchored_literal_long_non_match 21 (18571 MB/s)
+// 2) misc::anchored_literal_long_non_match 107 (3644 MB/s)
+// 3) misc::anchored_literal_long_non_match 45 (8666 MB/s)
+// 4) misc::anchored_literal_long_non_match 19 (20526 MB/s)
+//
+// (1) represents our baseline: the master branch at the time of writing when
+// using the 'thread_local' crate to implement the pool below.
+//
+// (2) represents a naive pool implemented completely via Mutex<Vec<T>>. There
+// is no special trick for bypassing the mutex.
+//
+// (3) is the same as (2), except it uses Mutex<Vec<Box<T>>>. It is twice as
+// fast because a Box<T> is much smaller than the T we use with a Pool in this
+// crate. So pushing and popping a Box<T> from a Vec is quite a bit faster
+// than for T.
+//
+// (4) is the same as (3), but with the trick for bypassing the mutex in the
+// case of the first-to-get thread.
+//
+// Why move off of thread_local? Even though (4) is a hair faster than (1)
+// above, this was not the main goal. The main goal was to move off of
+// thread_local and find a way to *simply* re-capture some of its speed for
+// regex's specific case. So again, why move off of it? The *primary* reason is
+// because of memory leaks. See https://github.com/rust-lang/regex/issues/362
+// for example. (Why do I want it to be simple? Well, I suppose what I mean is,
+// "use as much safe code as possible to minimize risk and be as sure as I can
+// be that it is correct.")
+//
+// My guess is that the thread_local design is probably not appropriate for
+// regex since its memory usage scales to the number of active threads that
+// have used a regex, whereas the pool below scales to the number of threads
+// that simultaneously use a regex. While neither case permits contraction,
+// since we own the pool data structure below, we can add contraction if a
+// clear use case pops up in the wild. More pressingly though, it seems that
+// there are at least some use case patterns where one might have many threads
+// sitting around that might have used a regex at one point. While thread_local
+// does try to reuse space previously used by a thread that has since stopped,
+// its maximal memory usage still scales with the total number of active
+// threads. In contrast, the pool below scales with the total number of threads
+// *simultaneously* using the pool. The hope is that this uses less memory
+// overall. And if it doesn't, we can hopefully tune it somehow.
+//
+// It seems that these sorts of conditions happen frequently
+// in FFI inside of other more "managed" languages. This was
+// mentioned in the issue linked above, and also mentioned here:
+// https://github.com/BurntSushi/rure-go/issues/3. And in particular, users
+// confirm that disabling the use of thread_local resolves the leak.
+//
+// There were other weaker reasons for moving off of thread_local as well.
+// Namely, at the time, I was looking to reduce dependencies. And for something
+// like regex, maintenance can be simpler when we own the full dependency tree.
+//
+// Note that I am not entirely happy with this pool. It has some subtle
+// implementation details and is overall still observable (even with the
+// thread owner optimization) in benchmarks. If someone wants to take a crack
+// at building something better, please file an issue. Even if it means a
+// different API. The API exposed by this pool is not the minimal thing that
+// something like a 'Regex' actually needs. It could adapt to, for example,
+// an API more like what is found in the 'thread_local' crate. However, we do
+// really need to support the no-std alloc-only context, or else the regex
+// crate wouldn't be able to support no-std alloc-only. That said, I'm generally
+// okay with making the alloc-only context slower (as it is here), although I
+// do find it unfortunate.
+
+/*!
+A thread safe memory pool.
+
+The principal type in this module is a [`Pool`]. Its main use case is for
+holding a thread safe collection of mutable scratch spaces (usually called
+`Cache` in this crate) that regex engines need to execute a search. This then
+permits sharing the same read-only regex object across multiple threads while
+having a quick way of reusing scratch space in a thread safe way. This avoids
+needing to re-create the scratch space for every search, which could wind up
+being quite expensive.
+*/
+
+/// A thread safe pool that works in an `alloc`-only context.
+///
+/// Getting a value out comes with a guard. When that guard is dropped, the
+/// value is automatically put back in the pool. The guard provides both a
+/// `Deref` and a `DerefMut` implementation for easy access to an underlying
+/// `T`.
+///
+/// A `Pool` impls `Sync` when `T` is `Send` (even if `T` is not `Sync`). This
+/// is possible because a pool is guaranteed to provide a value to exactly one
+/// thread at any time.
+///
+/// Currently, a pool never contracts in size. Its size is proportional to the
+/// maximum number of simultaneous uses. This may change in the future.
+///
+/// A `Pool` is a particularly useful data structure for this crate because
+/// many of the regex engines require a mutable "cache" in order to execute
+/// a search. Since regexes themselves tend to be global, the problem is then:
+/// how do you get a mutable cache to execute a search? You could:
+///
+/// 1. Use a `thread_local!`, which requires the standard library and requires
+/// that the regex pattern be statically known.
+/// 2. Use a `Pool`.
+/// 3. Make the cache an explicit dependency in your code and pass it around.
+/// 4. Put the cache state in a `Mutex`, but this means only one search can
+/// execute at a time.
+/// 5. Create a new cache for every search.
+///
+/// A `thread_local!` is perhaps the best choice if it works for your use case.
+/// Putting the cache in a mutex or creating a new cache for every search are
+/// perhaps the worst choices. Of the remaining two choices, whether you use
+/// this `Pool` or thread through a cache explicitly in your code is a matter
+/// of taste and depends on your code architecture.
+///
+/// # Warning: may use a spin lock
+///
+/// When this crate is compiled _without_ the `std` feature, then this type
+/// may use a spin lock internally. This can have subtle effects that may
+/// be undesirable. See [Spinlocks Considered Harmful][spinharm] for a more
+/// thorough treatment of this topic.
+///
+/// [spinharm]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html
+///
+/// # Example
+///
+/// This example shows how to share a single hybrid regex among multiple
+/// threads, while also safely getting exclusive access to a hybrid's
+/// [`Cache`](crate::hybrid::regex::Cache) without preventing other searches
+/// from running while your thread uses the `Cache`.
+/// +/// ``` +/// use regex_automata::{ +/// hybrid::regex::{Cache, Regex}, +/// util::{lazy::Lazy, pool::Pool}, +/// Match, +/// }; +/// +/// static RE: Lazy<Regex> = +/// Lazy::new(|| Regex::new("foo[0-9]+bar").unwrap()); +/// static CACHE: Lazy<Pool<Cache>> = +/// Lazy::new(|| Pool::new(|| RE.create_cache())); +/// +/// let expected = Some(Match::must(0, 3..14)); +/// assert_eq!(expected, RE.find(&mut CACHE.get(), b"zzzfoo12345barzzz")); +/// ``` +pub struct Pool<T, F = fn() -> T>(alloc::boxed::Box<inner::Pool<T, F>>); + +impl<T, F> Pool<T, F> { + /// Create a new pool. The given closure is used to create values in + /// the pool when necessary. + pub fn new(create: F) -> Pool<T, F> { + Pool(alloc::boxed::Box::new(inner::Pool::new(create))) + } +} + +impl<T: Send, F: Fn() -> T> Pool<T, F> { + /// Get a value from the pool. The caller is guaranteed to have + /// exclusive access to the given value. Namely, it is guaranteed that + /// this will never return a value that was returned by another call to + /// `get` but was not put back into the pool. + /// + /// When the guard goes out of scope and its destructor is called, then + /// it will automatically be put back into the pool. Alternatively, + /// [`PoolGuard::put`] may be used to explicitly put it back in the pool + /// without relying on its destructor. + /// + /// Note that there is no guarantee provided about which value in the + /// pool is returned. That is, calling get, dropping the guard (causing + /// the value to go back into the pool) and then calling get again is + /// *not* guaranteed to return the same value received in the first `get` + /// call. + #[inline] + pub fn get(&self) -> PoolGuard<'_, T, F> { + PoolGuard(self.0.get()) + } +} + +impl<T: core::fmt::Debug, F> core::fmt::Debug for Pool<T, F> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple("Pool").field(&self.0).finish() + } +} + +/// A guard that is returned when a caller requests a value from the pool. +/// +/// The purpose of the guard is to use RAII to automatically put the value +/// back in the pool once it's dropped. +pub struct PoolGuard<'a, T: Send, F: Fn() -> T>(inner::PoolGuard<'a, T, F>); + +impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { + /// Consumes this guard and puts it back into the pool. + /// + /// This circumvents the guard's `Drop` implementation. This can be useful + /// in circumstances where the automatic `Drop` results in poorer codegen, + /// such as calling non-inlined functions. + #[inline] + pub fn put(this: PoolGuard<'_, T, F>) { + inner::PoolGuard::put(this.0); + } +} + +impl<'a, T: Send, F: Fn() -> T> core::ops::Deref for PoolGuard<'a, T, F> { + type Target = T; + + #[inline] + fn deref(&self) -> &T { + self.0.value() + } +} + +impl<'a, T: Send, F: Fn() -> T> core::ops::DerefMut for PoolGuard<'a, T, F> { + #[inline] + fn deref_mut(&mut self) -> &mut T { + self.0.value_mut() + } +} + +impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug + for PoolGuard<'a, T, F> +{ + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple("PoolGuard").field(&self.0).finish() + } +} + +#[cfg(feature = "std")] +mod inner { + use core::{ + cell::UnsafeCell, + panic::{RefUnwindSafe, UnwindSafe}, + sync::atomic::{AtomicUsize, Ordering}, + }; + + use alloc::{boxed::Box, vec, vec::Vec}; + + use std::{sync::Mutex, thread_local}; + + /// An atomic counter used to allocate thread IDs. 
+ ///
+ /// We specifically start our counter at 3 so that we can use the values
+ /// less than it as sentinels.
+ static COUNTER: AtomicUsize = AtomicUsize::new(3);
+
+ /// A thread ID indicating that there is no owner. This is the initial
+ /// state of a pool. Once a pool has an owner, there is no way to change
+ /// it.
+ static THREAD_ID_UNOWNED: usize = 0;
+
+ /// A thread ID indicating that the special owner value is in use and not
+ /// available. This state is useful for avoiding a case where the owner
+ /// of a pool calls `get` before putting the result of a previous `get`
+ /// call back into the pool.
+ static THREAD_ID_INUSE: usize = 1;
+
+ /// This sentinel is used to indicate that a guard has already been dropped
+ /// and should not be re-dropped. We use this because our drop code can be
+ /// called outside of Drop and thus there could be a bug in the internal
+ /// implementation that results in trying to put the same guard back into
+ /// the same pool multiple times, and *that* could result in UB if we
+ /// didn't mark the guard as already having been put back in the pool.
+ ///
+ /// So this isn't strictly necessary, but this lets us define some
+ /// routines as safe (like PoolGuard::put_imp) that we couldn't otherwise
+ /// do.
+ static THREAD_ID_DROPPED: usize = 2;
+
+ /// The number of stacks we use inside of the pool. These are only used for
+ /// non-owners. That is, these represent the "slow" path.
+ ///
+ /// In the original implementation of this pool, we only used a single
+ /// stack. While this might be okay for a couple threads, the prevalence of
+ /// 32, 64 and even 128 core CPUs has made it untenable. The contention
+ /// such an environment introduces when threads are doing a lot of searches
+ /// on short haystacks (a not uncommon use case) is palpable and leads to
+ /// huge slowdowns.
+ ///
+ /// This constant reflects a change from using one stack to the number of
+ /// stacks that this constant is set to. The stack for a particular thread
+ /// is simply chosen by `thread_id % MAX_POOL_STACKS`. The idea behind
+ /// this setup is that there should be a good chance that accesses to the
+ /// pool will be distributed over several stacks instead of all of them
+ /// converging to one.
+ ///
+ /// This is not a particularly smart or dynamic strategy. Fixing this to a
+ /// specific number has at least two downsides. First is that it will help,
+ /// say, an 8 core CPU more than it will a 128 core CPU. (But, crucially,
+ /// it will still help the 128 core case.) Second is that this may wind
+ /// up being a little wasteful with respect to memory usage. Namely, if a
+ /// regex is used on one thread and then moved to another thread, then it
+ /// could result in creating a new copy of the data in the pool even though
+ /// only one is actually needed.
+ ///
+ /// And that memory usage bit is why this is set to 8 and not, say, 64.
+ /// Keeping it at 8 limits, to an extent, how much unnecessary memory can
+ /// be allocated.
+ ///
+ /// In an ideal world, we'd be able to have something like this:
+ ///
+ /// * Grow the number of stacks as the number of concurrent callers
+ /// increases. I spent a little time trying this, but even just adding an
+ /// atomic addition/subtraction for each pop/push for tracking concurrent
+ /// callers led to a big perf hit. Since even more work would seemingly be
+ /// required than just an addition/subtraction, I abandoned this approach.
+ /// * The maximum amount of memory used should scale with respect to the
+ /// number of concurrent callers and *not* the total number of existing
+ /// threads. This is primarily why the `thread_local` crate isn't used, as
+ /// some environments spin up a lot of threads. This led to multiple
+ /// reports of extremely high memory usage (often described as memory
+ /// leaks).
+ /// * Even more ideally, the pool should contract in size. That is, it
+ /// should grow with bursts and then shrink. But this is a pretty thorny
+ /// issue to tackle and it might be better to just not.
+ /// * It would be nice to explore the use of, say, a lock-free stack
+ /// instead of using a mutex to guard a `Vec` that is ultimately just
+ /// treated as a stack. The main thing preventing me from exploring this
+ /// is the ABA problem. The `crossbeam` crate has tools for dealing with
+ /// this sort of problem (via its epoch based memory reclamation strategy),
+ /// but I can't justify bringing in all of `crossbeam` as a dependency of
+ /// `regex` for this.
+ ///
+ /// See this issue for more context and discussion:
+ /// https://github.com/rust-lang/regex/issues/934
+ const MAX_POOL_STACKS: usize = 8;
+
+ thread_local!(
+ /// A thread local used to assign an ID to a thread.
+ static THREAD_ID: usize = {
+ let next = COUNTER.fetch_add(1, Ordering::Relaxed);
+ // SAFETY: We cannot permit the reuse of thread IDs since reusing a
+ // thread ID might result in more than one thread "owning" a pool,
+ // and thus, permit accessing a mutable value from multiple threads
+ // simultaneously without synchronization. The intent of this panic
+ // is to be a sanity check. It is not expected that the thread ID
+ // space will actually be exhausted in practice. Even on a 32-bit
+ // system, it would require spawning 2^32 threads (although they
+ // wouldn't all need to run simultaneously, so it is in theory
+ // possible).
+ //
+ // This checks that the counter never wraps around, since atomic
+ // addition wraps around on overflow.
+ if next == 0 {
+ panic!("regex: thread ID allocation space exhausted");
+ }
+ next
+ };
+ );
+
+ /// This puts each stack in the pool below into its own cache line. This is
+ /// an absolutely critical optimization that tends to have the most impact
+ /// in high contention workloads. Without forcing each mutex-protected
+ /// stack into its own cache line, high contention exacerbates the performance
+ /// problem by causing "false sharing." By putting each mutex in its own
+ /// cache-line, we avoid the false sharing problem and the effects of
+ /// contention are greatly reduced.
+ #[derive(Debug)]
+ #[repr(C, align(64))]
+ struct CacheLine<T>(T);
+
+ /// A thread safe pool utilizing std-only features.
+ ///
+ /// The main difference between this and the simplistic alloc-only pool is
+ /// the use of std::sync::Mutex and an "owner thread" optimization that
+ /// makes accesses by the owner of a pool faster than all other threads.
+ /// This makes the common case of running a regex within a single thread
+ /// faster by avoiding mutex unlocking.
+ pub(super) struct Pool<T, F> {
+ /// A function to create more T values when stack is empty and a caller
+ /// has requested a T.
+ create: F,
+ /// Multiple stacks of T values to hand out. These are used when a Pool
+ /// is accessed by a thread that didn't create it.
+ ///
+ /// Conceptually this is `Mutex<Vec<Box<T>>>`, but sharded out to make
+ /// it scale better under high contention work-loads. We index into
+ /// this sequence via `thread_id % stacks.len()`.
+ stacks: Vec<CacheLine<Mutex<Vec<Box<T>>>>>,
+ /// The ID of the thread that owns this pool. The owner is the thread
+ /// that makes the first call to 'get'. When the owner calls 'get', it
+ /// gets 'owner_val' directly instead of returning a T from 'stacks'.
+ /// See comments elsewhere for details, but this is intended to be an
+ /// optimization for the common case that makes getting a T faster.
+ ///
+ /// It is initialized to a value of zero (an impossible thread ID) as a
+ /// sentinel to indicate that it is unowned.
+ owner: AtomicUsize,
+ /// A value to return when the caller is in the same thread that
+ /// first called `Pool::get`.
+ ///
+ /// This is set to None when a Pool is first created, and set to Some
+ /// once the first thread calls Pool::get.
+ owner_val: UnsafeCell<Option<T>>,
+ }
+
+ // SAFETY: Since we want to use a Pool from multiple threads simultaneously
+ // behind an Arc, we need for it to be Sync. In cases where T is Sync,
+ // Pool<T> would be Sync. However, since we use a Pool to store mutable
+ // scratch space, we wind up using a T that has interior mutability and is
+ // thus itself not Sync. So what we *really* want is for our Pool<T> to be
+ // Sync even when T is not Sync (but is at least Send).
+ //
+ // The only non-sync aspect of a Pool is its 'owner_val' field, which is
+ // used to implement faster access to a pool value in the common case of
+ // a pool being accessed in the same thread in which it was created. The
+ // 'stacks' field is also shared, but a Mutex<T> where T: Send is already
+ // Sync. So we only need to worry about 'owner_val'.
+ //
+ // The key is to guarantee that 'owner_val' can only ever be accessed from
+ // one thread. In our implementation below, we guarantee this by only
+ // returning the 'owner_val' when the ID of the current thread matches the
+ // ID of the thread that first called 'Pool::get'. Since this can only ever
+ // be one thread, it follows that only one thread can access 'owner_val' at
+ // any point in time. Thus, it is safe to declare that Pool<T> is Sync when
+ // T is Send.
+ //
+ // If there is a way to achieve our performance goals using safe code, then
+ // I would very much welcome a patch. As it stands, the implementation
+ // below tries to balance safety with performance. The case where a Regex
+ // is used from multiple threads simultaneously will suffer a bit since
+ // getting a value out of the pool will require unlocking a mutex.
+ //
+ // We require `F: Send + Sync` because we call `F` at any point on demand,
+ // potentially from multiple threads simultaneously.
+ unsafe impl<T: Send, F: Send + Sync> Sync for Pool<T, F> {}
+
+ // If T is UnwindSafe, then since we provide exclusive access to any
+ // particular value in the pool, the pool should therefore also be
+ // considered UnwindSafe.
+ //
+ // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any
+ // point on demand, so it needs to be unwind safe on both dimensions for
+ // the entire Pool to be unwind safe.
+ impl<T: UnwindSafe, F: UnwindSafe + RefUnwindSafe> UnwindSafe for Pool<T, F> {}
+
+ // If T is UnwindSafe, then since we provide exclusive access to any
+ // particular value in the pool, the pool should therefore also be
+ // considered RefUnwindSafe.
+ //
+ // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any
+ // point on demand, so it needs to be unwind safe on both dimensions for
+ // the entire Pool to be unwind safe.
+ impl<T: UnwindSafe, F: UnwindSafe + RefUnwindSafe> RefUnwindSafe
+ for Pool<T, F>
+ {
+ }
+
+ impl<T, F> Pool<T, F> {
+ /// Create a new pool. The given closure is used to create values in
+ /// the pool when necessary.
+ pub(super) fn new(create: F) -> Pool<T, F> {
+ // FIXME: Now that we require 1.65+, Mutex::new is available as
+ // const... So we can almost mark this function as const. But of
+ // course, we're creating a Vec of stacks below (we didn't when I
+ // originally wrote this code). It seems like the best way to work
+ // around this would be to use a `[Stack; MAX_POOL_STACKS]` instead
+ // of a `Vec<Stack>`. I refrained from making this change at the time
+ // of writing (2023/10/08) because I was making a lot of other
+ // changes at the same time and wanted to do this more carefully.
+ // Namely, because of the cache line optimization, that `[Stack;
+ // MAX_POOL_STACKS]` would be quite big. It's unclear how bad (if
+ // at all) that would be.
+ //
+ // Another choice would be to lazily allocate the stacks, but...
+ // I'm not so sure about that. Seems like a fair bit of complexity?
+ //
+ // Maybe there's a simple solution I'm missing.
+ //
+ // ... OK, I tried to fix this. First, I did it by putting `stacks`
+ // in an `UnsafeCell` and using a `Once` to lazily initialize it.
+ // I benchmarked it and everything looked okay. I then made this
+ // function `const` and thought I was just about done. But the
+ // public pool type wraps its inner pool in a `Box` to keep its
+ // size down. Blech.
+ //
+ // So then I thought that I could push the box down into this
+ // type (and leave the non-std version unboxed) and use the same
+ // `UnsafeCell` technique to lazily initialize it. This has the
+ // downside of the `Once` now needing to get hit in the owner fast
+ // path, but maybe that's OK? However, I then realized that we can
+ // only lazily initialize `stacks`, `owner` and `owner_val`. The
+ // `create` function needs to be put somewhere outside of the box.
+ // So now the pool is a `Box`, `Once` and a function. Now we're
+ // starting to defeat the point of boxing in the first place. So I
+ // backed out that change too.
+ //
+ // Back to square one. Maybe we just don't make a pool's
+ // constructor const and live with it. It's probably not a huge
+ // deal.
+ let mut stacks = Vec::with_capacity(MAX_POOL_STACKS);
+ for _ in 0..stacks.capacity() {
+ stacks.push(CacheLine(Mutex::new(vec![])));
+ }
+ let owner = AtomicUsize::new(THREAD_ID_UNOWNED);
+ let owner_val = UnsafeCell::new(None); // init'd on first access
+ Pool { create, stacks, owner, owner_val }
+ }
+ }
+
+ impl<T: Send, F: Fn() -> T> Pool<T, F> {
+ /// Get a value from the pool. This may block if another thread is also
+ /// attempting to retrieve a value from the pool.
+ #[inline]
+ pub(super) fn get(&self) -> PoolGuard<'_, T, F> {
+ // Our fast path checks if the caller is the thread that "owns"
+ // this pool. Or stated differently, whether it is the first thread
+ // that tried to extract a value from the pool. If it is, then we
+ // can return a T to the caller without going through a mutex.
+ //
+ // SAFETY: We must guarantee that only one thread gets access
+ // to this value. Since a thread is uniquely identified by the
+ // THREAD_ID thread local, it follows that if the caller's thread
+ // ID is equal to the owner, then only one thread may receive this
+ // value. This is also why we can get away with what looks like a
+ // racy load and a store. We know that if 'owner == caller', then
+ // only one thread can be here, so we don't need to worry about any
+ // other thread setting the owner to something else.
+ let caller = THREAD_ID.with(|id| *id);
+ let owner = self.owner.load(Ordering::Acquire);
+ if caller == owner {
+ // N.B. We could also do a CAS here instead of a load/store,
+ // but ad hoc benchmarking suggests it is slower. And a lot
+ // slower in the case where `get_slow` is common.
+ self.owner.store(THREAD_ID_INUSE, Ordering::Release);
+ return self.guard_owned(caller);
+ }
+ self.get_slow(caller, owner)
+ }
+
+ /// This is the "slow" version that goes through a mutex to pop an
+ /// allocated value off a stack to return to the caller. (Or, if the
+ /// stack is empty, a new value is created.)
+ ///
+ /// If the pool has no owner, then this will set the owner.
+ #[cold]
+ fn get_slow(
+ &self,
+ caller: usize,
+ owner: usize,
+ ) -> PoolGuard<'_, T, F> {
+ if owner == THREAD_ID_UNOWNED {
+ // This sentinel means this pool is not yet owned. We try to
+ // atomically set the owner. If we do, then this thread becomes
+ // the owner and we can return a guard that represents the
+ // special T for the owner.
+ //
+ // Note that we set the owner to a different sentinel that
+ // indicates that the owned value is in use. The owner ID will
+ // get updated to the actual ID of this thread once the guard
+ // returned by this function is put back into the pool.
+ let res = self.owner.compare_exchange(
+ THREAD_ID_UNOWNED,
+ THREAD_ID_INUSE,
+ Ordering::AcqRel,
+ Ordering::Acquire,
+ );
+ if res.is_ok() {
+ // SAFETY: A successful CAS above implies this thread is
+ // the owner and that this is the only such thread that
+ // can reach here. Thus, there is no data race.
+ unsafe {
+ *self.owner_val.get() = Some((self.create)());
+ }
+ return self.guard_owned(caller);
+ }
+ }
+ let stack_id = caller % self.stacks.len();
+ // We try to acquire exclusive access to this thread's stack, and
+ // if so, grab a value from it if we can. We put this in a loop so
+ // that it's easy to tweak and experiment with a different number
+ // of tries. In the end, I couldn't see anything obviously better
+ // than one attempt in ad hoc testing.
+ for _ in 0..1 {
+ let mut stack = match self.stacks[stack_id].0.try_lock() {
+ Err(_) => continue,
+ Ok(stack) => stack,
+ };
+ if let Some(value) = stack.pop() {
+ return self.guard_stack(value);
+ }
+ // Unlock the mutex guarding the stack before creating a fresh
+ // value since we no longer need the stack.
+ drop(stack);
+ let value = Box::new((self.create)());
+ return self.guard_stack(value);
+ }
+ // We're only here if we couldn't get access to our stack, so just
+ // create a new value. This seems like it could be wasteful, but
+ // waiting for exclusive access to a stack when there's high
+ // contention is brutal for perf.
+ self.guard_stack_transient(Box::new((self.create)()))
+ }
+
+ /// Puts a value back into the pool. Callers don't need to call this.
+ /// Once the guard that's returned by 'get' is dropped, it is put back
+ /// into the pool automatically.
+ #[inline]
+ fn put_value(&self, value: Box<T>) {
+ let caller = THREAD_ID.with(|id| *id);
+ let stack_id = caller % self.stacks.len();
+ // As with trying to pop a value from this thread's stack, we
+ // merely attempt to get access to push this value back on the
+ // stack. If there's too much contention, we just give up and throw
+ // the value away.
+ // + // Interestingly, in ad hoc benchmarking, it is beneficial to + // attempt to push the value back more than once, unlike when + // popping the value. I don't have a good theory for why this is. + // I guess if we drop too many values then that winds up forcing + // the pop operation to create new fresh values and thus leads to + // less reuse. There's definitely a balancing act here. + for _ in 0..10 { + let mut stack = match self.stacks[stack_id].0.try_lock() { + Err(_) => continue, + Ok(stack) => stack, + }; + stack.push(value); + return; + } + } + + /// Create a guard that represents the special owned T. + #[inline] + fn guard_owned(&self, caller: usize) -> PoolGuard<'_, T, F> { + PoolGuard { pool: self, value: Err(caller), discard: false } + } + + /// Create a guard that contains a value from the pool's stack. + #[inline] + fn guard_stack(&self, value: Box<T>) -> PoolGuard<'_, T, F> { + PoolGuard { pool: self, value: Ok(value), discard: false } + } + + /// Create a guard that contains a value from the pool's stack with an + /// instruction to throw away the value instead of putting it back + /// into the pool. + #[inline] + fn guard_stack_transient(&self, value: Box<T>) -> PoolGuard<'_, T, F> { + PoolGuard { pool: self, value: Ok(value), discard: true } + } + } + + impl<T: core::fmt::Debug, F> core::fmt::Debug for Pool<T, F> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("Pool") + .field("stacks", &self.stacks) + .field("owner", &self.owner) + .field("owner_val", &self.owner_val) + .finish() + } + } + + /// A guard that is returned when a caller requests a value from the pool. + pub(super) struct PoolGuard<'a, T: Send, F: Fn() -> T> { + /// The pool that this guard is attached to. + pool: &'a Pool<T, F>, + /// This is Err when the guard represents the special "owned" value. + /// In which case, the value is retrieved from 'pool.owner_val'. And + /// in the special case of `Err(THREAD_ID_DROPPED)`, it means the + /// guard has been put back into the pool and should no longer be used. + value: Result<Box<T>, usize>, + /// When true, the value should be discarded instead of being pushed + /// back into the pool. We tend to use this under high contention, and + /// this allows us to avoid inflating the size of the pool. (Because + /// under contention, we tend to create more values instead of waiting + /// for access to a stack of existing values.) + discard: bool, + } + + impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { + /// Return the underlying value. + #[inline] + pub(super) fn value(&self) -> &T { + match self.value { + Ok(ref v) => &**v, + // SAFETY: This is safe because the only way a PoolGuard gets + // created for self.value=Err is when the current thread + // corresponds to the owning thread, of which there can only + // be one. Thus, we are guaranteed to be providing exclusive + // access here which makes this safe. + // + // Also, since 'owner_val' is guaranteed to be initialized + // before an owned PoolGuard is created, the unchecked unwrap + // is safe. + Err(id) => unsafe { + // This assert is *not* necessary for safety, since we + // should never be here if the guard had been put back into + // the pool. This is a sanity check to make sure we didn't + // break an internal invariant. + debug_assert_ne!(THREAD_ID_DROPPED, id); + (*self.pool.owner_val.get()).as_ref().unwrap_unchecked() + }, + } + } + + /// Return the underlying value as a mutable borrow. 
+ #[inline]
+ pub(super) fn value_mut(&mut self) -> &mut T {
+ match self.value {
+ Ok(ref mut v) => &mut **v,
+ // SAFETY: This is safe because the only way a PoolGuard gets
+ // created for self.value=Err is when the current thread
+ // corresponds to the owning thread, of which there can only
+ // be one. Thus, we are guaranteed to be providing exclusive
+ // access here which makes this safe.
+ //
+ // Also, since 'owner_val' is guaranteed to be initialized
+ // before an owned PoolGuard is created, the unwrap_unchecked
+ // is safe.
+ Err(id) => unsafe {
+ // This assert is *not* necessary for safety, since we
+ // should never be here if the guard had been put back into
+ // the pool. This is a sanity check to make sure we didn't
+ // break an internal invariant.
+ debug_assert_ne!(THREAD_ID_DROPPED, id);
+ (*self.pool.owner_val.get()).as_mut().unwrap_unchecked()
+ },
+ }
+ }
+
+ /// Consumes this guard and puts it back into the pool.
+ #[inline]
+ pub(super) fn put(this: PoolGuard<'_, T, F>) {
+ // Since this is effectively consuming the guard and putting the
+ // value back into the pool, there's no reason to run its Drop
+ // impl after doing this. I don't believe there is a correctness
+ // problem with doing so, but there's definitely a perf problem
+ // by redoing this work. So we avoid it.
+ let mut this = core::mem::ManuallyDrop::new(this);
+ this.put_imp();
+ }
+
+ /// Puts this guard back into the pool by only borrowing the guard as
+ /// mutable. This should be called at most once.
+ #[inline(always)]
+ fn put_imp(&mut self) {
+ match core::mem::replace(&mut self.value, Err(THREAD_ID_DROPPED)) {
+ Ok(value) => {
+ // If we were told to discard this value then don't bother
+ // trying to put it back into the pool. This occurs when
+ // the pop operation failed to acquire a lock and we
+ // decided to create a new value in lieu of contending for
+ // the lock.
+ if self.discard {
+ return;
+ }
+ self.pool.put_value(value);
+ }
+ // If this guard has a value "owned" by the thread, then
+ // the Pool guarantees that this is the ONLY such guard.
+ // Therefore, in order to place it back into the pool and make
+ // it available, we need to change the owner back to the owning
+ // thread's ID. But note that we use the ID that was stored in
+ // the guard, since a guard can be moved to another thread and
+ // dropped. (A previous iteration of this code read from the
+ // THREAD_ID thread local, which uses the ID of the current
+ // thread which may not be the ID of the owning thread! This
+ // also avoids the TLS access, which is likely a hair faster.)
+ Err(owner) => {
+ // If we hit this point, it implies 'put_imp' has been
+ // called multiple times for the same guard which in turn
+ // corresponds to a bug in this implementation.
+ assert_ne!(THREAD_ID_DROPPED, owner);
+ self.pool.owner.store(owner, Ordering::Release);
+ }
+ }
+ }
+ }
+
+ impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> {
+ #[inline]
+ fn drop(&mut self) {
+ self.put_imp();
+ }
+ }
+
+ impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug
+ for PoolGuard<'a, T, F>
+ {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_struct("PoolGuard")
+ .field("pool", &self.pool)
+ .field("value", &self.value)
+ .finish()
+ }
+ }
+}
+
+// FUTURE: We should consider using Mara Bos's nearly-lock-free version of this
+// here: https://gist.github.com/m-ou-se/5fdcbdf7dcf4585199ce2de697f367a4.
+//
+// One reason why I did things with a "mutex" below is that it isolates the
+// safety concerns to just the Mutex, whereas the safety of Mara's pool is a
+// bit more sprawling. I also expect this code to not be used that much, and
+// so it is unlikely to get as much real world usage with which to test it.
+// That means the "obviously correct" lever is an important one.
+//
+// The specific reason to use Mara's pool is that it is likely faster and also
+// less likely to hit problems with spin-locks, although it is not completely
+// impervious to them.
+//
+// The best solution to this problem, probably, is a truly lock free pool. That
+// could be done with a lock free linked list. The issue is the ABA problem. It
+// is difficult to avoid, and doing so is complex. BUT, the upshot of that is
+// that if we had a truly lock free pool, then we could also use it above in
+// the 'std' pool instead of a Mutex because it should be completely free of
+// the problems that come from spin-locks.
+#[cfg(not(feature = "std"))]
+mod inner {
+    use core::{
+        cell::UnsafeCell,
+        panic::{RefUnwindSafe, UnwindSafe},
+        sync::atomic::{AtomicBool, Ordering},
+    };
+
+    use alloc::{boxed::Box, vec, vec::Vec};
+
+    /// A thread safe pool utilizing alloc-only features.
+    ///
+    /// Unlike the std version, it doesn't seem possible(?) to implement the
+    /// "thread owner" optimization because alloc-only doesn't have any concept
+    /// of threads. So the best we can do is just a normal stack. This will
+    /// increase latency in alloc-only environments.
+    pub(super) struct Pool<T, F> {
+        /// A stack of T values to hand out. These are used when a Pool is
+        /// accessed by a thread that didn't create it.
+        stack: Mutex<Vec<Box<T>>>,
+        /// A function to create more T values when stack is empty and a caller
+        /// has requested a T.
+        create: F,
+    }
+
+    // If T is UnwindSafe, then since we provide exclusive access to any
+    // particular value in the pool, it should therefore also be considered
+    // RefUnwindSafe.
+    impl<T: UnwindSafe, F: UnwindSafe> RefUnwindSafe for Pool<T, F> {}
+
+    impl<T, F> Pool<T, F> {
+        /// Create a new pool. The given closure is used to create values in
+        /// the pool when necessary.
+        pub(super) const fn new(create: F) -> Pool<T, F> {
+            Pool { stack: Mutex::new(vec![]), create }
+        }
+    }
+
+    impl<T: Send, F: Fn() -> T> Pool<T, F> {
+        /// Get a value from the pool. This may block if another thread is also
+        /// attempting to retrieve a value from the pool.
+        #[inline]
+        pub(super) fn get(&self) -> PoolGuard<'_, T, F> {
+            let mut stack = self.stack.lock();
+            let value = match stack.pop() {
+                None => Box::new((self.create)()),
+                Some(value) => value,
+            };
+            PoolGuard { pool: self, value: Some(value) }
+        }
+
+        #[inline]
+        fn put(&self, guard: PoolGuard<'_, T, F>) {
+            let mut guard = core::mem::ManuallyDrop::new(guard);
+            if let Some(value) = guard.value.take() {
+                self.put_value(value);
+            }
+        }
+
+        /// Puts a value back into the pool. Callers don't need to call this.
+        /// Once the guard that's returned by 'get' is dropped, it is put back
+        /// into the pool automatically.
+        #[inline]
+        fn put_value(&self, value: Box<T>) {
+            let mut stack = self.stack.lock();
+            stack.push(value);
+        }
+    }
+
+    impl<T: core::fmt::Debug, F> core::fmt::Debug for Pool<T, F> {
+        fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+            f.debug_struct("Pool").field("stack", &self.stack).finish()
+        }
+    }
+
+    /// A guard that is returned when a caller requests a value from the pool.
+    pub(super) struct PoolGuard<'a, T: Send, F: Fn() -> T> {
+        /// The pool that this guard is attached to.
+        pool: &'a Pool<T, F>,
+        /// This is None after the guard has been put back into the pool.
+        value: Option<Box<T>>,
+    }
+
+    impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> {
+        /// Return the underlying value.
+        #[inline]
+        pub(super) fn value(&self) -> &T {
+            self.value.as_deref().unwrap()
+        }
+
+        /// Return the underlying value as a mutable borrow.
+        #[inline]
+        pub(super) fn value_mut(&mut self) -> &mut T {
+            self.value.as_deref_mut().unwrap()
+        }
+
+        /// Consumes this guard and puts it back into the pool.
+        #[inline]
+        pub(super) fn put(this: PoolGuard<'_, T, F>) {
+            // Since this is effectively consuming the guard and putting the
+            // value back into the pool, there's no reason to run its Drop
+            // impl after doing this. I don't believe there is a correctness
+            // problem with doing so, but there's definitely a perf problem
+            // by redoing this work. So we avoid it.
+            let mut this = core::mem::ManuallyDrop::new(this);
+            this.put_imp();
+        }
+
+        /// Puts this guard back into the pool by only borrowing the guard as
+        /// mutable. This should be called at most once.
+        #[inline(always)]
+        fn put_imp(&mut self) {
+            if let Some(value) = self.value.take() {
+                self.pool.put_value(value);
+            }
+        }
+    }
+
+    impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> {
+        #[inline]
+        fn drop(&mut self) {
+            self.put_imp();
+        }
+    }
+
+    impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug
+        for PoolGuard<'a, T, F>
+    {
+        fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+            f.debug_struct("PoolGuard")
+                .field("pool", &self.pool)
+                .field("value", &self.value)
+                .finish()
+        }
+    }
+
+    /// A spin-lock based mutex. Yes, I have read spinlocks considered
+    /// harmful[1], and if there's a reasonable alternative choice, I'll
+    /// happily take it.
+    ///
+    /// I suspect the most likely alternative here is a Treiber stack, but
+    /// implementing one correctly in a way that avoids the ABA problem looks
+    /// subtle enough that I'm not sure I want to attempt that. But otherwise,
+    /// we only need a mutex in order to implement our pool, so if there's
+    /// something simpler we can use that works for our `Pool` use case, then
+    /// that would be great.
+    ///
+    /// Note that this mutex does not do poisoning.
+    ///
+    /// [1]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html
+    #[derive(Debug)]
+    struct Mutex<T> {
+        locked: AtomicBool,
+        data: UnsafeCell<T>,
+    }
+
+    // SAFETY: Since a Mutex guarantees exclusive access, as long as we can
+    // send it across threads, it must also be Sync.
+    unsafe impl<T: Send> Sync for Mutex<T> {}
+
+    impl<T> Mutex<T> {
+        /// Create a new mutex for protecting access to the given value across
+        /// multiple threads simultaneously.
+        const fn new(value: T) -> Mutex<T> {
+            Mutex {
+                locked: AtomicBool::new(false),
+                data: UnsafeCell::new(value),
+            }
+        }
+
+        /// Lock this mutex and return a guard providing exclusive access to
+        /// `T`. This blocks if some other thread has already locked this
+        /// mutex.
+        #[inline]
+        fn lock(&self) -> MutexGuard<'_, T> {
+            while self
+                .locked
+                .compare_exchange(
+                    false,
+                    true,
+                    Ordering::AcqRel,
+                    Ordering::Acquire,
+                )
+                .is_err()
+            {
+                core::hint::spin_loop();
+            }
+            // SAFETY: The only way we're here is if we successfully set
+            // 'locked' to true, which implies we must be the only thread here
+            // and thus have exclusive access to 'data'.
+ let data = unsafe { &mut *self.data.get() }; + MutexGuard { locked: &self.locked, data } + } + } + + /// A guard that derefs to &T and &mut T. When it's dropped, the lock is + /// released. + #[derive(Debug)] + struct MutexGuard<'a, T> { + locked: &'a AtomicBool, + data: &'a mut T, + } + + impl<'a, T> core::ops::Deref for MutexGuard<'a, T> { + type Target = T; + + #[inline] + fn deref(&self) -> &T { + self.data + } + } + + impl<'a, T> core::ops::DerefMut for MutexGuard<'a, T> { + #[inline] + fn deref_mut(&mut self) -> &mut T { + self.data + } + } + + impl<'a, T> Drop for MutexGuard<'a, T> { + #[inline] + fn drop(&mut self) { + // Drop means 'data' is no longer accessible, so we can unlock + // the mutex. + self.locked.store(false, Ordering::Release); + } + } +} + +#[cfg(test)] +mod tests { + use core::panic::{RefUnwindSafe, UnwindSafe}; + + use alloc::{boxed::Box, vec, vec::Vec}; + + use super::*; + + #[test] + fn oibits() { + fn assert_oitbits<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {} + assert_oitbits::<Pool<Vec<u32>>>(); + assert_oitbits::<Pool<core::cell::RefCell<Vec<u32>>>>(); + assert_oitbits::< + Pool< + Vec<u32>, + Box< + dyn Fn() -> Vec<u32> + + Send + + Sync + + UnwindSafe + + RefUnwindSafe, + >, + >, + >(); + } + + // Tests that Pool implements the "single owner" optimization. That is, the + // thread that first accesses the pool gets its own copy, while all other + // threads get distinct copies. + #[cfg(feature = "std")] + #[test] + fn thread_owner_optimization() { + use std::{cell::RefCell, sync::Arc, vec}; + + let pool: Arc<Pool<RefCell<Vec<char>>>> = + Arc::new(Pool::new(|| RefCell::new(vec!['a']))); + pool.get().borrow_mut().push('x'); + + let pool1 = pool.clone(); + let t1 = std::thread::spawn(move || { + let guard = pool1.get(); + guard.borrow_mut().push('y'); + }); + + let pool2 = pool.clone(); + let t2 = std::thread::spawn(move || { + let guard = pool2.get(); + guard.borrow_mut().push('z'); + }); + + t1.join().unwrap(); + t2.join().unwrap(); + + // If we didn't implement the single owner optimization, then one of + // the threads above is likely to have mutated the [a, x] vec that + // we stuffed in the pool before spawning the threads. But since + // neither thread was first to access the pool, and because of the + // optimization, we should be guaranteed that neither thread mutates + // the special owned pool value. + // + // (Technically this is an implementation detail and not a contract of + // Pool's API.) + assert_eq!(vec!['a', 'x'], *pool.get().borrow()); + } + + // This tests that if the "owner" of a pool asks for two values, then it + // gets two distinct values and not the same one. This test failed in the + // course of developing the pool, which in turn resulted in UB because it + // permitted getting aliasing &mut borrows to the same place in memory. + #[test] + fn thread_owner_distinct() { + let pool = Pool::new(|| vec!['a']); + + { + let mut g1 = pool.get(); + let v1 = &mut *g1; + let mut g2 = pool.get(); + let v2 = &mut *g2; + v1.push('b'); + v2.push('c'); + assert_eq!(&mut vec!['a', 'b'], v1); + assert_eq!(&mut vec!['a', 'c'], v2); + } + // This isn't technically guaranteed, but we + // expect to now get the "owned" value (the first + // call to 'get()' above) now that it's back in + // the pool. + assert_eq!(&mut vec!['a', 'b'], &mut *pool.get()); + } + + // This tests that we can share a guard with another thread, mutate the + // underlying value and everything works. 
This failed in the course of + // developing a pool since the pool permitted 'get()' to return the same + // value to the owner thread, even before the previous value was put back + // into the pool. This in turn resulted in this test producing a data race. + #[cfg(feature = "std")] + #[test] + fn thread_owner_sync() { + let pool = Pool::new(|| vec!['a']); + { + let mut g1 = pool.get(); + let mut g2 = pool.get(); + std::thread::scope(|s| { + s.spawn(|| { + g1.push('b'); + }); + s.spawn(|| { + g2.push('c'); + }); + }); + + let v1 = &mut *g1; + let v2 = &mut *g2; + assert_eq!(&mut vec!['a', 'b'], v1); + assert_eq!(&mut vec!['a', 'c'], v2); + } + + // This isn't technically guaranteed, but we + // expect to now get the "owned" value (the first + // call to 'get()' above) now that it's back in + // the pool. + assert_eq!(&mut vec!['a', 'b'], &mut *pool.get()); + } + + // This tests that if we move a PoolGuard that is owned by the current + // thread to another thread and drop it, then the thread owner doesn't + // change. During development of the pool, this test failed because the + // PoolGuard assumed it was dropped in the same thread from which it was + // created, and thus used the current thread's ID as the owner, which could + // be different than the actual owner of the pool. + #[cfg(feature = "std")] + #[test] + fn thread_owner_send_drop() { + let pool = Pool::new(|| vec!['a']); + // Establishes this thread as the owner. + { + pool.get().push('b'); + } + std::thread::scope(|s| { + // Sanity check that we get the same value back. + // (Not technically guaranteed.) + let mut g = pool.get(); + assert_eq!(&vec!['a', 'b'], &*g); + // Now push it to another thread and drop it. + s.spawn(move || { + g.push('c'); + }) + .join() + .unwrap(); + }); + // Now check that we're still the owner. This is not technically + // guaranteed by the API, but is true in practice given the thread + // owner optimization. + assert_eq!(&vec!['a', 'b', 'c'], &*pool.get()); + } +} diff --git a/vendor/regex-automata/src/util/prefilter/aho_corasick.rs b/vendor/regex-automata/src/util/prefilter/aho_corasick.rs new file mode 100644 index 0000000..50cce82 --- /dev/null +++ b/vendor/regex-automata/src/util/prefilter/aho_corasick.rs @@ -0,0 +1,149 @@ +use crate::util::{ + prefilter::PrefilterI, + search::{MatchKind, Span}, +}; + +#[derive(Clone, Debug)] +pub(crate) struct AhoCorasick { + #[cfg(not(feature = "perf-literal-multisubstring"))] + _unused: (), + #[cfg(feature = "perf-literal-multisubstring")] + ac: aho_corasick::AhoCorasick, +} + +impl AhoCorasick { + pub(crate) fn new<B: AsRef<[u8]>>( + kind: MatchKind, + needles: &[B], + ) -> Option<AhoCorasick> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + None + } + #[cfg(feature = "perf-literal-multisubstring")] + { + // We used to use `aho_corasick::MatchKind::Standard` here when + // `kind` was `MatchKind::All`, but this is not correct. The + // "standard" Aho-Corasick match semantics are to report a match + // immediately as soon as it is seen, but `All` isn't like that. + // In particular, with "standard" semantics, given the needles + // "abc" and "b" and the haystack "abc," it would report a match + // at offset 1 before a match at offset 0. This is never what we + // want in the context of the regex engine, regardless of whether + // we have leftmost-first or 'all' semantics. Namely, we always + // want the leftmost match. 
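+            //
+            // As a rough illustration of that difference (a sketch against
+            // the aho-corasick crate's public API, not code used by this
+            // crate):
+            //
+            //     use aho_corasick::{AhoCorasick, MatchKind};
+            //
+            //     let ac = AhoCorasick::builder()
+            //         .match_kind(MatchKind::Standard)
+            //         .build(["abc", "b"])
+            //         .unwrap();
+            //     let m = ac.find("abc").unwrap();
+            //     // "Standard" semantics report 'b' at 1..2 first.
+            //     assert_eq!((m.start(), m.end()), (1, 2));
+            //
+            // With MatchKind::LeftmostFirst, the same search reports 'abc'
+            // at 0..3 instead, which is the behavior we want here.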
+            let ac_match_kind = match kind {
+                MatchKind::LeftmostFirst | MatchKind::All => {
+                    aho_corasick::MatchKind::LeftmostFirst
+                }
+            };
+            // This is kind of just an arbitrary number, but basically, if we
+            // have a small enough set of literals, then we try to use the VERY
+            // memory hungry DFA. Otherwise, we wimp out and use an NFA. The
+            // upshot is that the NFA is quite lean and decently fast. Faster
+            // than a naive Aho-Corasick NFA anyway.
+            let ac_kind = if needles.len() <= 500 {
+                aho_corasick::AhoCorasickKind::DFA
+            } else {
+                aho_corasick::AhoCorasickKind::ContiguousNFA
+            };
+            let result = aho_corasick::AhoCorasick::builder()
+                .kind(Some(ac_kind))
+                .match_kind(ac_match_kind)
+                .start_kind(aho_corasick::StartKind::Both)
+                // We try to handle all of the prefilter cases in the super
+                // module, and only use Aho-Corasick for the actual automaton.
+                // The aho-corasick crate does have some extra prefilters,
+                // namely, looking for rare bytes to feed to memchr{,2,3}
+                // instead of just the first byte. If we end up wanting
+                // those---and they are somewhat tricky to implement---then
+                // we could port them to this crate.
+                //
+                // The main reason for doing things this way is so we have a
+                // complete and easy to understand picture of which prefilters
+                // are available and how they work. Otherwise it seems too
+                // easy to get into a situation where we have a prefilter
+                // layered on top of a prefilter, and that might have
+                // unintended consequences.
+                .prefilter(false)
+                .build(needles);
+            let ac = match result {
+                Ok(ac) => ac,
+                Err(_err) => {
+                    debug!("aho-corasick prefilter failed to build: {}", _err);
+                    return None;
+                }
+            };
+            Some(AhoCorasick { ac })
+        }
+    }
+}
+
+impl PrefilterI for AhoCorasick {
+    fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
+        #[cfg(not(feature = "perf-literal-multisubstring"))]
+        {
+            unreachable!()
+        }
+        #[cfg(feature = "perf-literal-multisubstring")]
+        {
+            let input =
+                aho_corasick::Input::new(haystack).span(span.start..span.end);
+            self.ac
+                .find(input)
+                .map(|m| Span { start: m.start(), end: m.end() })
+        }
+    }
+
+    fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
+        #[cfg(not(feature = "perf-literal-multisubstring"))]
+        {
+            unreachable!()
+        }
+        #[cfg(feature = "perf-literal-multisubstring")]
+        {
+            let input = aho_corasick::Input::new(haystack)
+                .anchored(aho_corasick::Anchored::Yes)
+                .span(span.start..span.end);
+            self.ac
+                .find(input)
+                .map(|m| Span { start: m.start(), end: m.end() })
+        }
+    }
+
+    fn memory_usage(&self) -> usize {
+        #[cfg(not(feature = "perf-literal-multisubstring"))]
+        {
+            unreachable!()
+        }
+        #[cfg(feature = "perf-literal-multisubstring")]
+        {
+            self.ac.memory_usage()
+        }
+    }
+
+    fn is_fast(&self) -> bool {
+        #[cfg(not(feature = "perf-literal-multisubstring"))]
+        {
+            unreachable!()
+        }
+        #[cfg(feature = "perf-literal-multisubstring")]
+        {
+            // Aho-Corasick is never considered "fast" because it's never
+            // going to be even close to an order of magnitude faster than the
+            // regex engine itself (assuming a DFA is used). In fact, it is
+            // usually slower. The magic of Aho-Corasick is that it can search
+            // a *large* number of literals with a relatively small amount of
+            // memory. The regex engines are far more wasteful.
+            //
+            // Aho-Corasick may be "fast" when the regex engine corresponds
+            // to, say, the PikeVM. That happens when the lazy DFA couldn't be
+            // built or used for some reason.
But in these cases, the regex + // itself is likely quite big and we're probably hosed no matter + // what we do. (In this case, the best bet is for the caller to + // increase some of the memory limits on the hybrid cache capacity + // and hope that's enough.) + false + } + } +} diff --git a/vendor/regex-automata/src/util/prefilter/byteset.rs b/vendor/regex-automata/src/util/prefilter/byteset.rs new file mode 100644 index 0000000..a669d6c --- /dev/null +++ b/vendor/regex-automata/src/util/prefilter/byteset.rs @@ -0,0 +1,58 @@ +use crate::util::{ + prefilter::PrefilterI, + search::{MatchKind, Span}, +}; + +#[derive(Clone, Debug)] +pub(crate) struct ByteSet([bool; 256]); + +impl ByteSet { + pub(crate) fn new<B: AsRef<[u8]>>( + _kind: MatchKind, + needles: &[B], + ) -> Option<ByteSet> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + None + } + #[cfg(feature = "perf-literal-multisubstring")] + { + let mut set = [false; 256]; + for needle in needles.iter() { + let needle = needle.as_ref(); + if needle.len() != 1 { + return None; + } + set[usize::from(needle[0])] = true; + } + Some(ByteSet(set)) + } + } +} + +impl PrefilterI for ByteSet { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + haystack[span].iter().position(|&b| self.0[usize::from(b)]).map(|i| { + let start = span.start + i; + let end = start + 1; + Span { start, end } + }) + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + let b = *haystack.get(span.start)?; + if self.0[usize::from(b)] { + Some(Span { start: span.start, end: span.start + 1 }) + } else { + None + } + } + + fn memory_usage(&self) -> usize { + 0 + } + + fn is_fast(&self) -> bool { + false + } +} diff --git a/vendor/regex-automata/src/util/prefilter/memchr.rs b/vendor/regex-automata/src/util/prefilter/memchr.rs new file mode 100644 index 0000000..3d44b83 --- /dev/null +++ b/vendor/regex-automata/src/util/prefilter/memchr.rs @@ -0,0 +1,186 @@ +use crate::util::{ + prefilter::PrefilterI, + search::{MatchKind, Span}, +}; + +#[derive(Clone, Debug)] +pub(crate) struct Memchr(u8); + +impl Memchr { + pub(crate) fn new<B: AsRef<[u8]>>( + _kind: MatchKind, + needles: &[B], + ) -> Option<Memchr> { + #[cfg(not(feature = "perf-literal-substring"))] + { + None + } + #[cfg(feature = "perf-literal-substring")] + { + if needles.len() != 1 { + return None; + } + if needles[0].as_ref().len() != 1 { + return None; + } + Some(Memchr(needles[0].as_ref()[0])) + } + } +} + +impl PrefilterI for Memchr { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-substring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-substring")] + { + memchr::memchr(self.0, &haystack[span]).map(|i| { + let start = span.start + i; + let end = start + 1; + Span { start, end } + }) + } + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + let b = *haystack.get(span.start)?; + if self.0 == b { + Some(Span { start: span.start, end: span.start + 1 }) + } else { + None + } + } + + fn memory_usage(&self) -> usize { + 0 + } + + fn is_fast(&self) -> bool { + true + } +} + +#[derive(Clone, Debug)] +pub(crate) struct Memchr2(u8, u8); + +impl Memchr2 { + pub(crate) fn new<B: AsRef<[u8]>>( + _kind: MatchKind, + needles: &[B], + ) -> Option<Memchr2> { + #[cfg(not(feature = "perf-literal-substring"))] + { + None + } + #[cfg(feature = "perf-literal-substring")] + { + if needles.len() != 2 { + return None; + } + if !needles.iter().all(|n| n.as_ref().len() == 1) { + return None; + } + let b1 = 
needles[0].as_ref()[0]; + let b2 = needles[1].as_ref()[0]; + Some(Memchr2(b1, b2)) + } + } +} + +impl PrefilterI for Memchr2 { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-substring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-substring")] + { + memchr::memchr2(self.0, self.1, &haystack[span]).map(|i| { + let start = span.start + i; + let end = start + 1; + Span { start, end } + }) + } + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + let b = *haystack.get(span.start)?; + if self.0 == b || self.1 == b { + Some(Span { start: span.start, end: span.start + 1 }) + } else { + None + } + } + + fn memory_usage(&self) -> usize { + 0 + } + + fn is_fast(&self) -> bool { + true + } +} + +#[derive(Clone, Debug)] +pub(crate) struct Memchr3(u8, u8, u8); + +impl Memchr3 { + pub(crate) fn new<B: AsRef<[u8]>>( + _kind: MatchKind, + needles: &[B], + ) -> Option<Memchr3> { + #[cfg(not(feature = "perf-literal-substring"))] + { + None + } + #[cfg(feature = "perf-literal-substring")] + { + if needles.len() != 3 { + return None; + } + if !needles.iter().all(|n| n.as_ref().len() == 1) { + return None; + } + let b1 = needles[0].as_ref()[0]; + let b2 = needles[1].as_ref()[0]; + let b3 = needles[2].as_ref()[0]; + Some(Memchr3(b1, b2, b3)) + } + } +} + +impl PrefilterI for Memchr3 { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-substring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-substring")] + { + memchr::memchr3(self.0, self.1, self.2, &haystack[span]).map(|i| { + let start = span.start + i; + let end = start + 1; + Span { start, end } + }) + } + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + let b = *haystack.get(span.start)?; + if self.0 == b || self.1 == b || self.2 == b { + Some(Span { start: span.start, end: span.start + 1 }) + } else { + None + } + } + + fn memory_usage(&self) -> usize { + 0 + } + + fn is_fast(&self) -> bool { + true + } +} diff --git a/vendor/regex-automata/src/util/prefilter/memmem.rs b/vendor/regex-automata/src/util/prefilter/memmem.rs new file mode 100644 index 0000000..deea17b --- /dev/null +++ b/vendor/regex-automata/src/util/prefilter/memmem.rs @@ -0,0 +1,88 @@ +use crate::util::{ + prefilter::PrefilterI, + search::{MatchKind, Span}, +}; + +#[derive(Clone, Debug)] +pub(crate) struct Memmem { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + _unused: (), + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + finder: memchr::memmem::Finder<'static>, +} + +impl Memmem { + pub(crate) fn new<B: AsRef<[u8]>>( + _kind: MatchKind, + needles: &[B], + ) -> Option<Memmem> { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + { + None + } + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + { + if needles.len() != 1 { + return None; + } + let needle = needles[0].as_ref(); + let finder = memchr::memmem::Finder::new(needle).into_owned(); + Some(Memmem { finder }) + } + } +} + +impl PrefilterI for Memmem { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + { + unreachable!() + } + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + { + self.finder.find(&haystack[span]).map(|i| { + let start = span.start + i; + let end = start + self.finder.needle().len(); + Span { start, end } + }) + } + } + + fn prefix(&self, haystack: &[u8], span: 
Span) -> Option<Span> { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + { + unreachable!() + } + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + { + let needle = self.finder.needle(); + if haystack[span].starts_with(needle) { + Some(Span { end: span.start + needle.len(), ..span }) + } else { + None + } + } + } + + fn memory_usage(&self) -> usize { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + { + unreachable!() + } + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + { + self.finder.needle().len() + } + } + + fn is_fast(&self) -> bool { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + { + unreachable!() + } + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + { + true + } + } +} diff --git a/vendor/regex-automata/src/util/prefilter/mod.rs b/vendor/regex-automata/src/util/prefilter/mod.rs new file mode 100644 index 0000000..51fc922 --- /dev/null +++ b/vendor/regex-automata/src/util/prefilter/mod.rs @@ -0,0 +1,696 @@ +/*! +Defines a prefilter for accelerating regex searches. + +A prefilter can be created by building a [`Prefilter`] value. + +A prefilter represents one of the most important optimizations available for +accelerating regex searches. The idea of a prefilter is to very quickly find +candidate locations in a haystack where a regex _could_ match. Once a candidate +is found, it is then intended for the regex engine to run at that position to +determine whether the candidate is a match or a false positive. + +In the aforementioned description of the prefilter optimization also lay its +demise. Namely, if a prefilter has a high false positive rate and it produces +lots of candidates, then a prefilter can overall make a regex search slower. +It can run more slowly because more time is spent ping-ponging between the +prefilter search and the regex engine attempting to confirm each candidate as +a match. This ping-ponging has overhead that adds up, and is exacerbated by +a high false positive rate. + +Nevertheless, the optimization is still generally worth performing in most +cases. Particularly given just how much throughput can be improved. (It is not +uncommon for prefilter optimizations to improve throughput by one or two orders +of magnitude.) + +Typically a prefilter is used to find occurrences of literal prefixes from a +regex pattern, but this isn't required. A prefilter can be used to look for +suffixes or even inner literals. + +Note that as of now, prefilters throw away information about which pattern +each literal comes from. In other words, when a prefilter finds a match, +there's no way to know which pattern (or patterns) it came from. Therefore, +in order to confirm a match, you'll have to check all of the patterns by +running the full regex engine. +*/ + +mod aho_corasick; +mod byteset; +mod memchr; +mod memmem; +mod teddy; + +use core::{ + borrow::Borrow, + fmt::Debug, + panic::{RefUnwindSafe, UnwindSafe}, +}; + +#[cfg(feature = "alloc")] +use alloc::sync::Arc; + +#[cfg(feature = "syntax")] +use regex_syntax::hir::{literal, Hir}; + +use crate::util::search::{MatchKind, Span}; + +pub(crate) use crate::util::prefilter::{ + aho_corasick::AhoCorasick, + byteset::ByteSet, + memchr::{Memchr, Memchr2, Memchr3}, + memmem::Memmem, + teddy::Teddy, +}; + +/// A prefilter for accelerating regex searches. +/// +/// If you already have your literals that you want to search with, +/// then the vanilla [`Prefilter::new`] constructor is for you. 
But +/// if you have an [`Hir`] value from the `regex-syntax` crate, then +/// [`Prefilter::from_hir_prefix`] might be more convenient. Namely, it uses +/// the [`regex-syntax::hir::literal`](regex_syntax::hir::literal) module to +/// extract literal prefixes for you, optimize them and then select and build a +/// prefilter matcher. +/// +/// A prefilter must have **zero false negatives**. However, by its very +/// nature, it may produce false positives. That is, a prefilter will never +/// skip over a position in the haystack that corresponds to a match of the +/// original regex pattern, but it *may* produce a match for a position +/// in the haystack that does *not* correspond to a match of the original +/// regex pattern. If you use either the [`Prefilter::from_hir_prefix`] or +/// [`Prefilter::from_hirs_prefix`] constructors, then this guarantee is +/// upheld for you automatically. This guarantee is not preserved if you use +/// [`Prefilter::new`] though, since it is up to the caller to provide correct +/// literal strings with respect to the original regex pattern. +/// +/// # Cloning +/// +/// It is an API guarantee that cloning a prefilter is cheap. That is, cloning +/// it will not duplicate whatever heap memory is used to represent the +/// underlying matcher. +/// +/// # Example +/// +/// This example shows how to attach a `Prefilter` to the +/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) in order to accelerate +/// searches. +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::pikevm::PikeVM, +/// util::prefilter::Prefilter, +/// Match, MatchKind, +/// }; +/// +/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Bruce "]) +/// .expect("a prefilter"); +/// let re = PikeVM::builder() +/// .configure(PikeVM::config().prefilter(Some(pre))) +/// .build(r"Bruce \w+")?; +/// let mut cache = re.create_cache(); +/// assert_eq!( +/// Some(Match::must(0, 6..23)), +/// re.find(&mut cache, "Hello Bruce Springsteen!"), +/// ); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// But note that if you get your prefilter incorrect, it could lead to an +/// incorrect result! +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::pikevm::PikeVM, +/// util::prefilter::Prefilter, +/// Match, MatchKind, +/// }; +/// +/// // This prefilter is wrong! +/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Patti "]) +/// .expect("a prefilter"); +/// let re = PikeVM::builder() +/// .configure(PikeVM::config().prefilter(Some(pre))) +/// .build(r"Bruce \w+")?; +/// let mut cache = re.create_cache(); +/// // We find no match even though the regex does match. +/// assert_eq!( +/// None, +/// re.find(&mut cache, "Hello Bruce Springsteen!"), +/// ); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Prefilter { + #[cfg(not(feature = "alloc"))] + _unused: (), + #[cfg(feature = "alloc")] + pre: Arc<dyn PrefilterI>, + #[cfg(feature = "alloc")] + is_fast: bool, +} + +impl Prefilter { + /// Create a new prefilter from a sequence of needles and a corresponding + /// match semantics. + /// + /// This may return `None` for a variety of reasons, for example, if + /// a suitable prefilter could not be constructed. That might occur + /// if they are unavailable (e.g., the `perf-literal-substring` and + /// `perf-literal-multisubstring` features aren't enabled), or it might + /// occur because of heuristics or other artifacts of how the prefilter + /// works. 
+ /// + /// Note that if you have an [`Hir`] expression, it may be more convenient + /// to use [`Prefilter::from_hir_prefix`]. It will automatically handle the + /// task of extracting prefix literals for you. + /// + /// # Example + /// + /// This example shows how match semantics can impact the matching + /// algorithm used by the prefilter. For this reason, it is important to + /// ensure that the match semantics given here are consistent with the + /// match semantics intended for the regular expression that the literals + /// were extracted from. + /// + /// ``` + /// use regex_automata::{ + /// util::{prefilter::Prefilter, syntax}, + /// MatchKind, Span, + /// }; + /// + /// let hay = "Hello samwise"; + /// + /// // With leftmost-first, we find 'samwise' here because it comes + /// // before 'sam' in the sequence we give it.. + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["samwise", "sam"]) + /// .expect("a prefilter"); + /// assert_eq!( + /// Some(Span::from(6..13)), + /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// // Still with leftmost-first but with the literals reverse, now 'sam' + /// // will match instead! + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["sam", "samwise"]) + /// .expect("a prefilter"); + /// assert_eq!( + /// Some(Span::from(6..9)), + /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new<B: AsRef<[u8]>>( + kind: MatchKind, + needles: &[B], + ) -> Option<Prefilter> { + Choice::new(kind, needles).and_then(Prefilter::from_choice) + } + + /// This turns a prefilter selection into a `Prefilter`. That is, in turns + /// the enum given into a trait object. + fn from_choice(choice: Choice) -> Option<Prefilter> { + #[cfg(not(feature = "alloc"))] + { + None + } + #[cfg(feature = "alloc")] + { + let pre: Arc<dyn PrefilterI> = match choice { + Choice::Memchr(p) => Arc::new(p), + Choice::Memchr2(p) => Arc::new(p), + Choice::Memchr3(p) => Arc::new(p), + Choice::Memmem(p) => Arc::new(p), + Choice::Teddy(p) => Arc::new(p), + Choice::ByteSet(p) => Arc::new(p), + Choice::AhoCorasick(p) => Arc::new(p), + }; + let is_fast = pre.is_fast(); + Some(Prefilter { pre, is_fast }) + } + } + + /// This attempts to extract prefixes from the given `Hir` expression for + /// the given match semantics, and if possible, builds a prefilter for + /// them. + /// + /// # Example + /// + /// This example shows how to build a prefilter directly from an [`Hir`] + /// expression, and use to find an occurrence of a prefix from the regex + /// pattern. + /// + /// ``` + /// use regex_automata::{ + /// util::{prefilter::Prefilter, syntax}, + /// MatchKind, Span, + /// }; + /// + /// let hir = syntax::parse(r"(Bruce|Patti) \w+")?; + /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) + /// .expect("a prefilter"); + /// let hay = "Hello Patti Scialfa!"; + /// assert_eq!( + /// Some(Span::from(6..12)), + /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn from_hir_prefix(kind: MatchKind, hir: &Hir) -> Option<Prefilter> { + Prefilter::from_hirs_prefix(kind, &[hir]) + } + + /// This attempts to extract prefixes from the given `Hir` expressions for + /// the given match semantics, and if possible, builds a prefilter for + /// them. 
+ /// + /// Note that as of now, prefilters throw away information about which + /// pattern each literal comes from. In other words, when a prefilter finds + /// a match, there's no way to know which pattern (or patterns) it came + /// from. Therefore, in order to confirm a match, you'll have to check all + /// of the patterns by running the full regex engine. + /// + /// # Example + /// + /// This example shows how to build a prefilter directly from multiple + /// `Hir` expressions expression, and use it to find an occurrence of a + /// prefix from the regex patterns. + /// + /// ``` + /// use regex_automata::{ + /// util::{prefilter::Prefilter, syntax}, + /// MatchKind, Span, + /// }; + /// + /// let hirs = syntax::parse_many(&[ + /// r"(Bruce|Patti) \w+", + /// r"Mrs?\. Doubtfire", + /// ])?; + /// let pre = Prefilter::from_hirs_prefix(MatchKind::LeftmostFirst, &hirs) + /// .expect("a prefilter"); + /// let hay = "Hello Mrs. Doubtfire"; + /// assert_eq!( + /// Some(Span::from(6..20)), + /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn from_hirs_prefix<H: Borrow<Hir>>( + kind: MatchKind, + hirs: &[H], + ) -> Option<Prefilter> { + prefixes(kind, hirs) + .literals() + .and_then(|lits| Prefilter::new(kind, lits)) + } + + /// Run this prefilter on `haystack[span.start..end]` and return a matching + /// span if one exists. + /// + /// The span returned is guaranteed to have a start position greater than + /// or equal to the one given, and an end position less than or equal to + /// the one given. + /// + /// # Example + /// + /// This example shows how to build a prefilter directly from an [`Hir`] + /// expression, and use it to find an occurrence of a prefix from the regex + /// pattern. + /// + /// ``` + /// use regex_automata::{ + /// util::{prefilter::Prefilter, syntax}, + /// MatchKind, Span, + /// }; + /// + /// let hir = syntax::parse(r"Bruce \w+")?; + /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) + /// .expect("a prefilter"); + /// let hay = "Hello Bruce Springsteen!"; + /// assert_eq!( + /// Some(Span::from(6..12)), + /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "alloc"))] + { + unreachable!() + } + #[cfg(feature = "alloc")] + { + self.pre.find(haystack, span) + } + } + + /// Returns the span of a prefix of `haystack[span.start..span.end]` if + /// the prefilter matches. + /// + /// The span returned is guaranteed to have a start position equivalent to + /// the one given, and an end position less than or equal to the one given. + /// + /// # Example + /// + /// This example shows how to build a prefilter directly from an [`Hir`] + /// expression, and use it to find an occurrence of a prefix from the regex + /// pattern that begins at the start of a haystack only. + /// + /// ``` + /// use regex_automata::{ + /// util::{prefilter::Prefilter, syntax}, + /// MatchKind, Span, + /// }; + /// + /// let hir = syntax::parse(r"Bruce \w+")?; + /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) + /// .expect("a prefilter"); + /// let hay = "Hello Bruce Springsteen!"; + /// // Nothing is found here because 'Bruce' does + /// // not occur at the beginning of our search. 
+ /// assert_eq!( + /// None, + /// pre.prefix(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// // But if we change where we start the search + /// // to begin where 'Bruce ' begins, then a + /// // match will be found. + /// assert_eq!( + /// Some(Span::from(6..12)), + /// pre.prefix(hay.as_bytes(), Span::from(6..hay.len())), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "alloc"))] + { + unreachable!() + } + #[cfg(feature = "alloc")] + { + self.pre.prefix(haystack, span) + } + } + + /// Returns the heap memory, in bytes, used by the underlying prefilter. + #[inline] + pub fn memory_usage(&self) -> usize { + #[cfg(not(feature = "alloc"))] + { + unreachable!() + } + #[cfg(feature = "alloc")] + { + self.pre.memory_usage() + } + } + + /// Implementations might return true here if they believe themselves to + /// be "fast." The concept of "fast" is deliberately left vague, but in + /// practice this usually corresponds to whether it's believed that SIMD + /// will be used. + /// + /// Why do we care about this? Well, some prefilter tricks tend to come + /// with their own bits of overhead, and so might only make sense if we + /// know that a scan will be *much* faster than the regex engine itself. + /// Otherwise, the trick may not be worth doing. Whether something is + /// "much" faster than the regex engine generally boils down to whether + /// SIMD is used. (But not always. Even a SIMD matcher with a high false + /// positive rate can become quite slow.) + /// + /// Even if this returns true, it is still possible for the prefilter to + /// be "slow." Remember, prefilters are just heuristics. We can't really + /// *know* a prefilter will be fast without actually trying the prefilter. + /// (Which of course we cannot afford to do.) + #[inline] + pub(crate) fn is_fast(&self) -> bool { + #[cfg(not(feature = "alloc"))] + { + unreachable!() + } + #[cfg(feature = "alloc")] + { + self.is_fast + } + } +} + +/// A trait for abstracting over prefilters. Basically, a prefilter is +/// something that do an unanchored *and* an anchored search in a haystack +/// within a given span. +/// +/// This exists pretty much only so that we can use prefilters as a trait +/// object (which is what `Prefilter` is). If we ever move off of trait objects +/// and to an enum, then it's likely this trait could be removed. +pub(crate) trait PrefilterI: + Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static +{ + /// Run this prefilter on `haystack[span.start..end]` and return a matching + /// span if one exists. + /// + /// The span returned is guaranteed to have a start position greater than + /// or equal to the one given, and an end position less than or equal to + /// the one given. + fn find(&self, haystack: &[u8], span: Span) -> Option<Span>; + + /// Returns the span of a prefix of `haystack[span.start..span.end]` if + /// the prefilter matches. + /// + /// The span returned is guaranteed to have a start position equivalent to + /// the one given, and an end position less than or equal to the one given. + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span>; + + /// Returns the heap memory, in bytes, used by the underlying prefilter. + fn memory_usage(&self) -> usize; + + /// Implementations might return true here if they believe themselves to + /// be "fast." See [`Prefilter::is_fast`] for more details. 
+    fn is_fast(&self) -> bool;
+}
+
+#[cfg(feature = "alloc")]
+impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> {
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
+        (&**self).find(haystack, span)
+    }
+
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
+        (&**self).prefix(haystack, span)
+    }
+
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn memory_usage(&self) -> usize {
+        (&**self).memory_usage()
+    }
+
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn is_fast(&self) -> bool {
+        (&**self).is_fast()
+    }
+}
+
+/// A type that encapsulates the selection of a prefilter algorithm from a
+/// sequence of needles.
+///
+/// The existence of this type is a little tricky, because we don't (currently)
+/// use it for performing a search. Instead, we really only consume it by
+/// converting the underlying prefilter into a trait object, whether that be
+/// `dyn PrefilterI` or `dyn Strategy` (for the meta regex engine). In order
+/// to avoid re-copying the prefilter selection logic, we isolate it here, and
+/// then force anything downstream that wants to convert it to a trait object
+/// to do trivial case analysis on it.
+///
+/// One wonders whether we *should* use an enum instead of a trait object.
+/// At time of writing, I chose trait objects based on instinct because 1) I
+/// knew I wasn't going to inline anything and 2) there would potentially be
+/// many different choices. However, as of time of writing, I haven't actually
+/// compared the trait object approach to the enum approach. That probably
+/// should be litigated, but I ran out of steam.
+///
+/// Note that if the `alloc` feature is disabled, then values of this type
+/// are (and should) never be constructed. Also, in practice, for any of the
+/// prefilters to be selected, you'll need at least one of the `perf-literal-*`
+/// features enabled.
+#[derive(Clone, Debug)]
+pub(crate) enum Choice {
+    Memchr(Memchr),
+    Memchr2(Memchr2),
+    Memchr3(Memchr3),
+    Memmem(Memmem),
+    Teddy(Teddy),
+    ByteSet(ByteSet),
+    AhoCorasick(AhoCorasick),
+}
+
+impl Choice {
+    /// Select what is believed to be the best prefilter algorithm for the
+    /// match semantics and sequence of needles given.
+    ///
+    /// This selection algorithm uses the needles as given without any
+    /// modification. For example, if `[bar]` is given, then this doesn't
+    /// try to select `memchr` for `b`. Instead, it would select `memmem`
+    /// for `bar`. If callers would want `memchr` selected for `[bar]`, then
+    /// callers should massage the literals themselves. That is, callers are
+    /// responsible for heuristics surrounding which sequence of literals is
+    /// best.
+    ///
+    /// What this selection algorithm does is attempt to use the fastest
+    /// prefilter that works for the literals given. So if `[a, b]` is given,
+    /// then `memchr2` is selected.
+    ///
+    /// Of course, which prefilter is selected is also subject to what
+    /// is available. For example, if `alloc` isn't enabled, then
+    /// that limits which prefilters can be selected. Similarly, if
+    /// `perf-literal-substring` isn't enabled, then nothing from the `memchr`
+    /// crate can be returned.
+    pub(crate) fn new<B: AsRef<[u8]>>(
+        kind: MatchKind,
+        needles: &[B],
+    ) -> Option<Choice> {
+        // An empty set means the regex matches nothing, so no sense in
+        // building a prefilter.
+ if needles.len() == 0 { + debug!("prefilter building failed: found empty set of literals"); + return None; + } + // If the regex can match the empty string, then the prefilter + // will by definition match at every position. This is obviously + // completely ineffective. + if needles.iter().any(|n| n.as_ref().is_empty()) { + debug!("prefilter building failed: literals match empty string"); + return None; + } + // BREADCRUMBS: Perhaps the literal optimizer should special case + // sequences of length two or three if the leading bytes of each are + // "rare"? Or perhaps, if there are two or three total possible leading + // bytes, regardless of the number of literals, and all are rare... + // Then well, perhaps we should use memchr2 or memchr3 in those cases? + if let Some(pre) = Memchr::new(kind, needles) { + debug!("prefilter built: memchr"); + return Some(Choice::Memchr(pre)); + } + if let Some(pre) = Memchr2::new(kind, needles) { + debug!("prefilter built: memchr2"); + return Some(Choice::Memchr2(pre)); + } + if let Some(pre) = Memchr3::new(kind, needles) { + debug!("prefilter built: memchr3"); + return Some(Choice::Memchr3(pre)); + } + if let Some(pre) = Memmem::new(kind, needles) { + debug!("prefilter built: memmem"); + return Some(Choice::Memmem(pre)); + } + if let Some(pre) = Teddy::new(kind, needles) { + debug!("prefilter built: teddy"); + return Some(Choice::Teddy(pre)); + } + if let Some(pre) = ByteSet::new(kind, needles) { + debug!("prefilter built: byteset"); + return Some(Choice::ByteSet(pre)); + } + if let Some(pre) = AhoCorasick::new(kind, needles) { + debug!("prefilter built: aho-corasick"); + return Some(Choice::AhoCorasick(pre)); + } + debug!("prefilter building failed: no strategy could be found"); + None + } +} + +/// Extracts all of the prefix literals from the given HIR expressions into a +/// single `Seq`. The literals in the sequence are ordered with respect to the +/// order of the given HIR expressions and consistent with the match semantics +/// given. +/// +/// The sequence returned is "optimized." That is, they may be shrunk or even +/// truncated according to heuristics with the intent of making them more +/// useful as a prefilter. (Which translates to both using faster algorithms +/// and minimizing the false positive rate.) +/// +/// Note that this erases any connection between the literals and which pattern +/// (or patterns) they came from. +/// +/// The match kind given must correspond to the match semantics of the regex +/// that is represented by the HIRs given. The match semantics may change the +/// literal sequence returned. +#[cfg(feature = "syntax")] +pub(crate) fn prefixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq +where + H: core::borrow::Borrow<Hir>, +{ + let mut extractor = literal::Extractor::new(); + extractor.kind(literal::ExtractKind::Prefix); + + let mut prefixes = literal::Seq::empty(); + for hir in hirs { + prefixes.union(&mut extractor.extract(hir.borrow())); + } + debug!( + "prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}", + prefixes.len(), + prefixes.is_exact(), + prefixes + ); + match kind { + MatchKind::All => { + prefixes.sort(); + prefixes.dedup(); + } + MatchKind::LeftmostFirst => { + prefixes.optimize_for_prefix_by_preference(); + } + } + debug!( + "prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}", + prefixes.len(), + prefixes.is_exact(), + prefixes + ); + prefixes +} + +/// Like `prefixes`, but for all suffixes of all matches for the given HIRs. 
+#[cfg(feature = "syntax")] +pub(crate) fn suffixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq +where + H: core::borrow::Borrow<Hir>, +{ + let mut extractor = literal::Extractor::new(); + extractor.kind(literal::ExtractKind::Suffix); + + let mut suffixes = literal::Seq::empty(); + for hir in hirs { + suffixes.union(&mut extractor.extract(hir.borrow())); + } + debug!( + "suffixes (len={:?}, exact={:?}) extracted before optimization: {:?}", + suffixes.len(), + suffixes.is_exact(), + suffixes + ); + match kind { + MatchKind::All => { + suffixes.sort(); + suffixes.dedup(); + } + MatchKind::LeftmostFirst => { + suffixes.optimize_for_suffix_by_preference(); + } + } + debug!( + "suffixes (len={:?}, exact={:?}) extracted after optimization: {:?}", + suffixes.len(), + suffixes.is_exact(), + suffixes + ); + suffixes +} diff --git a/vendor/regex-automata/src/util/prefilter/teddy.rs b/vendor/regex-automata/src/util/prefilter/teddy.rs new file mode 100644 index 0000000..fc79f2b --- /dev/null +++ b/vendor/regex-automata/src/util/prefilter/teddy.rs @@ -0,0 +1,160 @@ +use crate::util::{ + prefilter::PrefilterI, + search::{MatchKind, Span}, +}; + +#[derive(Clone, Debug)] +pub(crate) struct Teddy { + #[cfg(not(feature = "perf-literal-multisubstring"))] + _unused: (), + /// The actual Teddy searcher. + /// + /// Technically, it's possible that Teddy doesn't actually get used, since + /// Teddy does require its haystack to at least be of a certain size + /// (usually around the size of whatever vector is being used, so ~16 + /// or ~32 bytes). For haystacks shorter than that, the implementation + /// currently uses Rabin-Karp. + #[cfg(feature = "perf-literal-multisubstring")] + searcher: aho_corasick::packed::Searcher, + /// When running an anchored search, the packed searcher can't handle it so + /// we defer to Aho-Corasick itself. Kind of sad, but changing the packed + /// searchers to support anchored search would be difficult at worst and + /// annoying at best. Since packed searchers only apply to small numbers of + /// literals, we content ourselves that this is not much of an added cost. + /// (That packed searchers only work with a small number of literals is + /// also why we use a DFA here. Otherwise, the memory usage of a DFA would + /// likely be unacceptable.) + #[cfg(feature = "perf-literal-multisubstring")] + anchored_ac: aho_corasick::dfa::DFA, + /// The length of the smallest literal we look for. + /// + /// We use this as a heuristic to figure out whether this will be "fast" or + /// not. Generally, the longer the better, because longer needles are more + /// discriminating and thus reduce false positive rate. + #[cfg(feature = "perf-literal-multisubstring")] + minimum_len: usize, +} + +impl Teddy { + pub(crate) fn new<B: AsRef<[u8]>>( + kind: MatchKind, + needles: &[B], + ) -> Option<Teddy> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + None + } + #[cfg(feature = "perf-literal-multisubstring")] + { + // We only really support leftmost-first semantics. In + // theory we could at least support leftmost-longest, as the + // aho-corasick crate does, but regex-automata doesn't know about + // leftmost-longest currently. + // + // And like the aho-corasick prefilter, if we're using `All` + // semantics, then we can still use leftmost semantics for a + // prefilter. (This might be a suspicious choice for the literal + // engine, which uses a prefilter as a regex engine directly, but + // that only happens when using leftmost-first semantics.) 
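+            //
+            // For reference, the "packed" searcher built below comes from
+            // the aho-corasick crate's packed API. A minimal sketch of using
+            // it directly (an illustration, not code from this crate):
+            //
+            //     use aho_corasick::packed::{Config, MatchKind};
+            //
+            //     let searcher = Config::new()
+            //         .match_kind(MatchKind::LeftmostFirst)
+            //         .builder()
+            //         .extend(["Sherlock", "Watson"])
+            //         .build()
+            //         .unwrap(); // None if the literals aren't supported
+            //     let m = searcher.find("Dr. Watson").unwrap();
+            //     assert_eq!((m.start(), m.end()), (4, 10));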
+ let (packed_match_kind, ac_match_kind) = match kind { + MatchKind::LeftmostFirst | MatchKind::All => ( + aho_corasick::packed::MatchKind::LeftmostFirst, + aho_corasick::MatchKind::LeftmostFirst, + ), + }; + let minimum_len = + needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0); + let packed = aho_corasick::packed::Config::new() + .match_kind(packed_match_kind) + .builder() + .extend(needles) + .build()?; + let anchored_ac = aho_corasick::dfa::DFA::builder() + .match_kind(ac_match_kind) + .start_kind(aho_corasick::StartKind::Anchored) + .prefilter(false) + .build(needles) + .ok()?; + Some(Teddy { searcher: packed, anchored_ac, minimum_len }) + } + } +} + +impl PrefilterI for Teddy { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + let ac_span = + aho_corasick::Span { start: span.start, end: span.end }; + self.searcher + .find_in(haystack, ac_span) + .map(|m| Span { start: m.start(), end: m.end() }) + } + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + use aho_corasick::automaton::Automaton; + let input = aho_corasick::Input::new(haystack) + .anchored(aho_corasick::Anchored::Yes) + .span(span.start..span.end); + self.anchored_ac + .try_find(&input) + // OK because we build the DFA with anchored support. + .expect("aho-corasick DFA should never fail") + .map(|m| Span { start: m.start(), end: m.end() }) + } + } + + fn memory_usage(&self) -> usize { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + use aho_corasick::automaton::Automaton; + self.searcher.memory_usage() + self.anchored_ac.memory_usage() + } + } + + fn is_fast(&self) -> bool { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + // Teddy is usually quite fast, but I have seen some cases where + // a large number of literals can overwhelm it and make it not so + // fast. We make an educated but conservative guess at a limit, at + // which point, we're not so comfortable thinking Teddy is "fast." + // + // Well... this used to incorporate a "limit" on the *number* + // of literals, but I have since changed it to a minimum on the + // *smallest* literal. Namely, when there is a very small literal + // (1 or 2 bytes), it is far more likely that it leads to a higher + // false positive rate. (Although, of course, not always. For + // example, 'zq' is likely to have a very low false positive rate.) + // But when we have 3 bytes, we have a really good chance of being + // quite discriminatory and thus fast. + // + // We may still want to add some kind of limit on the number of + // literals here, but keep in mind that Teddy already has its own + // somewhat small limit (64 at time of writing). The main issue + // here is that if 'is_fast' is false, it opens the door for the + // reverse inner optimization to kick in. We really only want to + // resort to the reverse inner optimization if we absolutely must. 
+            self.minimum_len >= 3
+        }
+    }
+}
diff --git a/vendor/regex-automata/src/util/primitives.rs b/vendor/regex-automata/src/util/primitives.rs
new file mode 100644
index 0000000..5c5d187
--- /dev/null
+++ b/vendor/regex-automata/src/util/primitives.rs
@@ -0,0 +1,776 @@
+/*!
+Lower level primitive types that are useful in a variety of circumstances.
+
+# Overview
+
+This list represents the principal types in this module and briefly describes
+when you might want to use them.
+
+* [`PatternID`] - A type that represents the identifier of a regex pattern.
+This is probably the most widely used type in this module (which is why it's
+also re-exported in the crate root).
+* [`StateID`] - A type that represents the identifier of a finite automaton
+state. This is used for both NFAs and DFAs, with the notable exception of
+the hybrid NFA/DFA. (The hybrid NFA/DFA uses a special purpose "lazy" state
+identifier.)
+* [`SmallIndex`] - The internal representation of both a `PatternID` and a
+`StateID`. Its purpose is to serve as a type that can index memory without
+being as big as a `usize` on 64-bit targets. The main idea behind this type
+is that there are many things in regex engines that will, in practice, never
+overflow a 32-bit integer. (For example, the number of patterns in a regex
+or the number of states in an NFA.) Thus, a `SmallIndex` can be used to index
+memory without peppering `as` casts everywhere. Moreover, it forces callers
+to handle errors in the case where, somehow, the value would otherwise overflow
+either a 32-bit integer or a `usize` (e.g., on 16-bit targets).
+* [`NonMaxUsize`] - Represents a `usize` that cannot be `usize::MAX`. As a
+result, `Option<NonMaxUsize>` has the same size in memory as a `usize`. This
+is useful, for example, when representing the offsets of submatches since it
+reduces memory usage by a factor of 2. It is a legal optimization since Rust
+guarantees that slices never have a length that exceeds `isize::MAX`.
+*/
+
+use core::num::NonZeroUsize;
+
+#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
+use crate::util::int::{Usize, U16, U32, U64};
+
+/// A `usize` that can never be `usize::MAX`.
+///
+/// This is similar to `core::num::NonZeroUsize`, but instead of not permitting
+/// a zero value, this does not permit a max value.
+///
+/// This is useful in certain contexts where one wants to optimize the memory
+/// usage of things that contain match offsets. Namely, since Rust slices
+/// are guaranteed to never have a length exceeding `isize::MAX`, we can use
+/// `usize::MAX` as a sentinel to indicate that no match was found. Indeed,
+/// types like `Option<NonMaxUsize>` have exactly the same size in memory as a
+/// `usize`.
+///
+/// This type is defined to be `repr(transparent)` for
+/// `core::num::NonZeroUsize`, which is in turn defined to be
+/// `repr(transparent)` for `usize`.
+#[derive(Clone, Copy, Eq, Hash, PartialEq, PartialOrd, Ord)]
+#[repr(transparent)]
+pub struct NonMaxUsize(NonZeroUsize);
+
+impl NonMaxUsize {
+    /// Create a new `NonMaxUsize` from the given value.
+    ///
+    /// This returns `None` only when the given value is equal to `usize::MAX`.
+    #[inline]
+    pub fn new(value: usize) -> Option<NonMaxUsize> {
+        NonZeroUsize::new(value.wrapping_add(1)).map(NonMaxUsize)
+    }
+
+    /// Return the underlying `usize` value. The returned value is guaranteed
+    /// to not equal `usize::MAX`.
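+    // A minimal sketch of how the offset-by-one representation above behaves,
+    // using only `new` and `get` (and the niche guarantee that makes
+    // `Option<NonMaxUsize>` usize-sized):
+    //
+    //     assert_eq!(
+    //         core::mem::size_of::<usize>(),
+    //         core::mem::size_of::<Option<NonMaxUsize>>(),
+    //     );
+    //     let n = NonMaxUsize::new(5).unwrap();
+    //     assert_eq!(5, n.get()); // stored internally as NonZeroUsize(6)
+    //     assert!(NonMaxUsize::new(usize::MAX).is_none());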
+ #[inline] + pub fn get(self) -> usize { + self.0.get().wrapping_sub(1) + } +} + +// We provide our own Debug impl because seeing the internal repr can be quite +// surprising if you aren't expecting it. e.g., 'NonMaxUsize(5)' vs just '5'. +impl core::fmt::Debug for NonMaxUsize { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{:?}", self.get()) + } +} + +/// A type that represents a "small" index. +/// +/// The main idea of this type is to provide something that can index memory, +/// but uses less memory than `usize` on 64-bit systems. Specifically, its +/// representation is always a `u32` and has `repr(transparent)` enabled. (So +/// it is safe to transmute between a `u32` and a `SmallIndex`.) +/// +/// A small index is typically useful in cases where there is no practical way +/// that the index will overflow a 32-bit integer. A good example of this is +/// an NFA state. If you could somehow build an NFA with `2^30` states, its +/// memory usage would be exorbitant and its runtime execution would be so +/// slow as to be completely worthless. Therefore, this crate generally deems +/// it acceptable to return an error if it would otherwise build an NFA that +/// requires a slice longer than what a 32-bit integer can index. In exchange, +/// we can use 32-bit indices instead of 64-bit indices in various places. +/// +/// This type ensures this by providing a constructor that will return an error +/// if its argument cannot fit into the type. This makes it much easier to +/// handle these sorts of boundary cases that are otherwise extremely subtle. +/// +/// On all targets, this type guarantees that its value will fit in a `u32`, +/// `i32`, `usize` and an `isize`. This means that on 16-bit targets, for +/// example, this type's maximum value will never overflow an `isize`, +/// which means it will never overflow a `i16` even though its internal +/// representation is still a `u32`. +/// +/// The purpose for making the type fit into even signed integer types like +/// `isize` is to guarantee that the difference between any two small indices +/// is itself also a small index. This is useful in certain contexts, e.g., +/// for delta encoding. +/// +/// # Other types +/// +/// The following types wrap `SmallIndex` to provide a more focused use case: +/// +/// * [`PatternID`] is for representing the identifiers of patterns. +/// * [`StateID`] is for representing the identifiers of states in finite +/// automata. It is used for both NFAs and DFAs. +/// +/// # Representation +/// +/// This type is always represented internally by a `u32` and is marked as +/// `repr(transparent)`. Thus, this type always has the same representation as +/// a `u32`. It is thus safe to transmute between a `u32` and a `SmallIndex`. +/// +/// # Indexing +/// +/// For convenience, callers may use a `SmallIndex` to index slices. +/// +/// # Safety +/// +/// While a `SmallIndex` is meant to guarantee that its value fits into `usize` +/// without using as much space as a `usize` on all targets, callers must +/// not rely on this property for safety. Callers may choose to rely on this +/// property for correctness however. For example, creating a `SmallIndex` with +/// an invalid value can be done in entirely safe code. This may in turn result +/// in panics or silent logical errors. +#[derive( + Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, +)] +#[repr(transparent)] +pub struct SmallIndex(u32); + +impl SmallIndex { + /// The maximum index value. 
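+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch of the relationship between the maximum value
+    /// and the limit (assuming the `regex_automata::util::primitives` path
+    /// referenced in this module's docs):
+    ///
+    /// ```
+    /// use regex_automata::util::primitives::SmallIndex;
+    ///
+    /// // The limit is always one more than the maximum value.
+    /// assert_eq!(SmallIndex::MAX.as_usize() + 1, SmallIndex::LIMIT);
+    /// // The maximum value itself is still a valid small index.
+    /// assert!(SmallIndex::new(SmallIndex::MAX.as_usize()).is_ok());
+    /// ```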
+ #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] + pub const MAX: SmallIndex = + // FIXME: Use as_usize() once const functions in traits are stable. + SmallIndex::new_unchecked(core::i32::MAX as usize - 1); + + /// The maximum index value. + #[cfg(target_pointer_width = "16")] + pub const MAX: SmallIndex = + SmallIndex::new_unchecked(core::isize::MAX - 1); + + /// The total number of values that can be represented as a small index. + pub const LIMIT: usize = SmallIndex::MAX.as_usize() + 1; + + /// The zero index value. + pub const ZERO: SmallIndex = SmallIndex::new_unchecked(0); + + /// The number of bytes that a single small index uses in memory. + pub const SIZE: usize = core::mem::size_of::<SmallIndex>(); + + /// Create a new small index. + /// + /// If the given index exceeds [`SmallIndex::MAX`], then this returns + /// an error. + #[inline] + pub fn new(index: usize) -> Result<SmallIndex, SmallIndexError> { + SmallIndex::try_from(index) + } + + /// Create a new small index without checking whether the given value + /// exceeds [`SmallIndex::MAX`]. + /// + /// Using this routine with an invalid index value will result in + /// unspecified behavior, but *not* undefined behavior. In particular, an + /// invalid index value is likely to cause panics or possibly even silent + /// logical errors. + /// + /// Callers must never rely on a `SmallIndex` to be within a certain range + /// for memory safety. + #[inline] + pub const fn new_unchecked(index: usize) -> SmallIndex { + // FIXME: Use as_u32() once const functions in traits are stable. + SmallIndex(index as u32) + } + + /// Like [`SmallIndex::new`], but panics if the given index is not valid. + #[inline] + pub fn must(index: usize) -> SmallIndex { + SmallIndex::new(index).expect("invalid small index") + } + + /// Return this small index as a `usize`. This is guaranteed to never + /// overflow `usize`. + #[inline] + pub const fn as_usize(&self) -> usize { + // FIXME: Use as_usize() once const functions in traits are stable. + self.0 as usize + } + + /// Return this small index as a `u64`. This is guaranteed to never + /// overflow. + #[inline] + pub const fn as_u64(&self) -> u64 { + // FIXME: Use u64::from() once const functions in traits are stable. + self.0 as u64 + } + + /// Return the internal `u32` of this small index. This is guaranteed to + /// never overflow `u32`. + #[inline] + pub const fn as_u32(&self) -> u32 { + self.0 + } + + /// Return the internal `u32` of this small index represented as an `i32`. + /// This is guaranteed to never overflow an `i32`. + #[inline] + pub const fn as_i32(&self) -> i32 { + // This is OK because we guarantee that our max value is <= i32::MAX. + self.0 as i32 + } + + /// Returns one more than this small index as a usize. + /// + /// Since a small index has constraints on its maximum value, adding `1` to + /// it will always fit in a `usize`, `u32` and a `i32`. + #[inline] + pub fn one_more(&self) -> usize { + self.as_usize() + 1 + } + + /// Decode this small index from the bytes given using the native endian + /// byte order for the current target. + /// + /// If the decoded integer is not representable as a small index for the + /// current target, then this returns an error. 
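+    ///
+    /// # Example
+    ///
+    /// A small sketch showing a native endian round-trip (assuming the
+    /// `regex_automata::util::primitives` path referenced in this module's
+    /// docs):
+    ///
+    /// ```
+    /// use regex_automata::util::primitives::SmallIndex;
+    ///
+    /// let index = SmallIndex::must(1234);
+    /// assert_eq!(Ok(index), SmallIndex::from_ne_bytes(index.to_ne_bytes()));
+    /// ```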
+ #[inline] + pub fn from_ne_bytes( + bytes: [u8; 4], + ) -> Result<SmallIndex, SmallIndexError> { + let id = u32::from_ne_bytes(bytes); + if id > SmallIndex::MAX.as_u32() { + return Err(SmallIndexError { attempted: u64::from(id) }); + } + Ok(SmallIndex::new_unchecked(id.as_usize())) + } + + /// Decode this small index from the bytes given using the native endian + /// byte order for the current target. + /// + /// This is analogous to [`SmallIndex::new_unchecked`] in that is does not + /// check whether the decoded integer is representable as a small index. + #[inline] + pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> SmallIndex { + SmallIndex::new_unchecked(u32::from_ne_bytes(bytes).as_usize()) + } + + /// Return the underlying small index integer as raw bytes in native endian + /// format. + #[inline] + pub fn to_ne_bytes(&self) -> [u8; 4] { + self.0.to_ne_bytes() + } +} + +impl<T> core::ops::Index<SmallIndex> for [T] { + type Output = T; + + #[inline] + fn index(&self, index: SmallIndex) -> &T { + &self[index.as_usize()] + } +} + +impl<T> core::ops::IndexMut<SmallIndex> for [T] { + #[inline] + fn index_mut(&mut self, index: SmallIndex) -> &mut T { + &mut self[index.as_usize()] + } +} + +#[cfg(feature = "alloc")] +impl<T> core::ops::Index<SmallIndex> for Vec<T> { + type Output = T; + + #[inline] + fn index(&self, index: SmallIndex) -> &T { + &self[index.as_usize()] + } +} + +#[cfg(feature = "alloc")] +impl<T> core::ops::IndexMut<SmallIndex> for Vec<T> { + #[inline] + fn index_mut(&mut self, index: SmallIndex) -> &mut T { + &mut self[index.as_usize()] + } +} + +impl From<u8> for SmallIndex { + fn from(index: u8) -> SmallIndex { + SmallIndex::new_unchecked(usize::from(index)) + } +} + +impl TryFrom<u16> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: u16) -> Result<SmallIndex, SmallIndexError> { + if u32::from(index) > SmallIndex::MAX.as_u32() { + return Err(SmallIndexError { attempted: u64::from(index) }); + } + Ok(SmallIndex::new_unchecked(index.as_usize())) + } +} + +impl TryFrom<u32> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: u32) -> Result<SmallIndex, SmallIndexError> { + if index > SmallIndex::MAX.as_u32() { + return Err(SmallIndexError { attempted: u64::from(index) }); + } + Ok(SmallIndex::new_unchecked(index.as_usize())) + } +} + +impl TryFrom<u64> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: u64) -> Result<SmallIndex, SmallIndexError> { + if index > SmallIndex::MAX.as_u64() { + return Err(SmallIndexError { attempted: index }); + } + Ok(SmallIndex::new_unchecked(index.as_usize())) + } +} + +impl TryFrom<usize> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: usize) -> Result<SmallIndex, SmallIndexError> { + if index > SmallIndex::MAX.as_usize() { + return Err(SmallIndexError { attempted: index.as_u64() }); + } + Ok(SmallIndex::new_unchecked(index)) + } +} + +#[cfg(test)] +impl quickcheck::Arbitrary for SmallIndex { + fn arbitrary(gen: &mut quickcheck::Gen) -> SmallIndex { + use core::cmp::max; + + let id = max(i32::MIN + 1, i32::arbitrary(gen)).abs(); + if id > SmallIndex::MAX.as_i32() { + SmallIndex::MAX + } else { + SmallIndex::new(usize::try_from(id).unwrap()).unwrap() + } + } +} + +/// This error occurs when a small index could not be constructed. +/// +/// This occurs when given an integer exceeding the maximum small index value. +/// +/// When the `std` feature is enabled, this implements the `Error` trait. 
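+///
+/// # Example
+///
+/// A brief sketch of how this error arises and what it reports (assuming the
+/// `regex_automata::util::primitives` path referenced in this module's docs):
+///
+/// ```
+/// use regex_automata::util::primitives::SmallIndex;
+///
+/// // SmallIndex::LIMIT is the smallest value that is rejected.
+/// let err = SmallIndex::new(SmallIndex::LIMIT).unwrap_err();
+/// assert_eq!(SmallIndex::LIMIT as u64, err.attempted());
+/// ```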
+#[derive(Clone, Debug, Eq, PartialEq)] +pub struct SmallIndexError { + attempted: u64, +} + +impl SmallIndexError { + /// Returns the value that could not be converted to a small index. + pub fn attempted(&self) -> u64 { + self.attempted + } +} + +#[cfg(feature = "std")] +impl std::error::Error for SmallIndexError {} + +impl core::fmt::Display for SmallIndexError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!( + f, + "failed to create small index from {:?}, which exceeds {:?}", + self.attempted(), + SmallIndex::MAX, + ) + } +} + +#[derive(Clone, Debug)] +pub(crate) struct SmallIndexIter { + rng: core::ops::Range<usize>, +} + +impl Iterator for SmallIndexIter { + type Item = SmallIndex; + + fn next(&mut self) -> Option<SmallIndex> { + if self.rng.start >= self.rng.end { + return None; + } + let next_id = self.rng.start + 1; + let id = core::mem::replace(&mut self.rng.start, next_id); + // new_unchecked is OK since we asserted that the number of + // elements in this iterator will fit in an ID at construction. + Some(SmallIndex::new_unchecked(id)) + } +} + +macro_rules! index_type_impls { + ($name:ident, $err:ident, $iter:ident, $withiter:ident) => { + impl $name { + /// The maximum value. + pub const MAX: $name = $name(SmallIndex::MAX); + + /// The total number of values that can be represented. + pub const LIMIT: usize = SmallIndex::LIMIT; + + /// The zero value. + pub const ZERO: $name = $name(SmallIndex::ZERO); + + /// The number of bytes that a single value uses in memory. + pub const SIZE: usize = SmallIndex::SIZE; + + /// Create a new value that is represented by a "small index." + /// + /// If the given index exceeds the maximum allowed value, then this + /// returns an error. + #[inline] + pub fn new(value: usize) -> Result<$name, $err> { + SmallIndex::new(value).map($name).map_err($err) + } + + /// Create a new value without checking whether the given argument + /// exceeds the maximum. + /// + /// Using this routine with an invalid value will result in + /// unspecified behavior, but *not* undefined behavior. In + /// particular, an invalid ID value is likely to cause panics or + /// possibly even silent logical errors. + /// + /// Callers must never rely on this type to be within a certain + /// range for memory safety. + #[inline] + pub const fn new_unchecked(value: usize) -> $name { + $name(SmallIndex::new_unchecked(value)) + } + + /// Like `new`, but panics if the given value is not valid. + #[inline] + pub fn must(value: usize) -> $name { + $name::new(value).expect(concat!( + "invalid ", + stringify!($name), + " value" + )) + } + + /// Return the internal value as a `usize`. This is guaranteed to + /// never overflow `usize`. + #[inline] + pub const fn as_usize(&self) -> usize { + self.0.as_usize() + } + + /// Return the internal value as a `u64`. This is guaranteed to + /// never overflow. + #[inline] + pub const fn as_u64(&self) -> u64 { + self.0.as_u64() + } + + /// Return the internal value as a `u32`. This is guaranteed to + /// never overflow `u32`. + #[inline] + pub const fn as_u32(&self) -> u32 { + self.0.as_u32() + } + + /// Return the internal value as a i32`. This is guaranteed to + /// never overflow an `i32`. + #[inline] + pub const fn as_i32(&self) -> i32 { + self.0.as_i32() + } + + /// Returns one more than this value as a usize. + /// + /// Since values represented by a "small index" have constraints + /// on their maximum value, adding `1` to it will always fit in a + /// `usize`, `u32` and a `i32`. 
+ #[inline] + pub fn one_more(&self) -> usize { + self.0.one_more() + } + + /// Decode this value from the bytes given using the native endian + /// byte order for the current target. + /// + /// If the decoded integer is not representable as a small index + /// for the current target, then this returns an error. + #[inline] + pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<$name, $err> { + SmallIndex::from_ne_bytes(bytes).map($name).map_err($err) + } + + /// Decode this value from the bytes given using the native endian + /// byte order for the current target. + /// + /// This is analogous to `new_unchecked` in that is does not check + /// whether the decoded integer is representable as a small index. + #[inline] + pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> $name { + $name(SmallIndex::from_ne_bytes_unchecked(bytes)) + } + + /// Return the underlying integer as raw bytes in native endian + /// format. + #[inline] + pub fn to_ne_bytes(&self) -> [u8; 4] { + self.0.to_ne_bytes() + } + + /// Returns an iterator over all values from 0 up to and not + /// including the given length. + /// + /// If the given length exceeds this type's limit, then this + /// panics. + pub(crate) fn iter(len: usize) -> $iter { + $iter::new(len) + } + } + + // We write our own Debug impl so that we get things like PatternID(5) + // instead of PatternID(SmallIndex(5)). + impl core::fmt::Debug for $name { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple(stringify!($name)).field(&self.as_u32()).finish() + } + } + + impl<T> core::ops::Index<$name> for [T] { + type Output = T; + + #[inline] + fn index(&self, index: $name) -> &T { + &self[index.as_usize()] + } + } + + impl<T> core::ops::IndexMut<$name> for [T] { + #[inline] + fn index_mut(&mut self, index: $name) -> &mut T { + &mut self[index.as_usize()] + } + } + + #[cfg(feature = "alloc")] + impl<T> core::ops::Index<$name> for Vec<T> { + type Output = T; + + #[inline] + fn index(&self, index: $name) -> &T { + &self[index.as_usize()] + } + } + + #[cfg(feature = "alloc")] + impl<T> core::ops::IndexMut<$name> for Vec<T> { + #[inline] + fn index_mut(&mut self, index: $name) -> &mut T { + &mut self[index.as_usize()] + } + } + + impl From<u8> for $name { + fn from(value: u8) -> $name { + $name(SmallIndex::from(value)) + } + } + + impl TryFrom<u16> for $name { + type Error = $err; + + fn try_from(value: u16) -> Result<$name, $err> { + SmallIndex::try_from(value).map($name).map_err($err) + } + } + + impl TryFrom<u32> for $name { + type Error = $err; + + fn try_from(value: u32) -> Result<$name, $err> { + SmallIndex::try_from(value).map($name).map_err($err) + } + } + + impl TryFrom<u64> for $name { + type Error = $err; + + fn try_from(value: u64) -> Result<$name, $err> { + SmallIndex::try_from(value).map($name).map_err($err) + } + } + + impl TryFrom<usize> for $name { + type Error = $err; + + fn try_from(value: usize) -> Result<$name, $err> { + SmallIndex::try_from(value).map($name).map_err($err) + } + } + + #[cfg(test)] + impl quickcheck::Arbitrary for $name { + fn arbitrary(gen: &mut quickcheck::Gen) -> $name { + $name(SmallIndex::arbitrary(gen)) + } + } + + /// This error occurs when a value could not be constructed. + /// + /// This occurs when given an integer exceeding the maximum allowed + /// value. + /// + /// When the `std` feature is enabled, this implements the `Error` + /// trait. 
+ #[derive(Clone, Debug, Eq, PartialEq)] + pub struct $err(SmallIndexError); + + impl $err { + /// Returns the value that could not be converted to an ID. + pub fn attempted(&self) -> u64 { + self.0.attempted() + } + } + + #[cfg(feature = "std")] + impl std::error::Error for $err {} + + impl core::fmt::Display for $err { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!( + f, + "failed to create {} from {:?}, which exceeds {:?}", + stringify!($name), + self.attempted(), + $name::MAX, + ) + } + } + + #[derive(Clone, Debug)] + pub(crate) struct $iter(SmallIndexIter); + + impl $iter { + fn new(len: usize) -> $iter { + assert!( + len <= $name::LIMIT, + "cannot create iterator for {} when number of \ + elements exceed {:?}", + stringify!($name), + $name::LIMIT, + ); + $iter(SmallIndexIter { rng: 0..len }) + } + } + + impl Iterator for $iter { + type Item = $name; + + fn next(&mut self) -> Option<$name> { + self.0.next().map($name) + } + } + + /// An iterator adapter that is like std::iter::Enumerate, but attaches + /// small index values instead. It requires `ExactSizeIterator`. At + /// construction, it ensures that the index of each element in the + /// iterator is representable in the corresponding small index type. + #[derive(Clone, Debug)] + pub(crate) struct $withiter<I> { + it: I, + ids: $iter, + } + + impl<I: Iterator + ExactSizeIterator> $withiter<I> { + fn new(it: I) -> $withiter<I> { + let ids = $name::iter(it.len()); + $withiter { it, ids } + } + } + + impl<I: Iterator + ExactSizeIterator> Iterator for $withiter<I> { + type Item = ($name, I::Item); + + fn next(&mut self) -> Option<($name, I::Item)> { + let item = self.it.next()?; + // Number of elements in this iterator must match, according + // to contract of ExactSizeIterator. + let id = self.ids.next().unwrap(); + Some((id, item)) + } + } + }; +} + +/// The identifier of a regex pattern, represented by a [`SmallIndex`]. +/// +/// The identifier for a pattern corresponds to its relative position among +/// other patterns in a single finite state machine. Namely, when building +/// a multi-pattern regex engine, one must supply a sequence of patterns to +/// match. The position (starting at 0) of each pattern in that sequence +/// represents its identifier. This identifier is in turn used to identify and +/// report matches of that pattern in various APIs. +/// +/// See the [`SmallIndex`] type for more information about what it means for +/// a pattern ID to be a "small index." +/// +/// Note that this type is defined in the +/// [`util::primitives`](crate::util::primitives) module, but it is also +/// re-exported at the crate root due to how common it is. +#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +#[repr(transparent)] +pub struct PatternID(SmallIndex); + +/// The identifier of a finite automaton state, represented by a +/// [`SmallIndex`]. +/// +/// Most regex engines in this crate are built on top of finite automata. Each +/// state in a finite automaton defines transitions from its state to another. +/// Those transitions point to other states via their identifiers, i.e., a +/// `StateID`. Since finite automata tend to contain many transitions, it is +/// much more memory efficient to define state IDs as small indices. +/// +/// See the [`SmallIndex`] type for more information about what it means for +/// a state ID to be a "small index." 
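+///
+/// # Example
+///
+/// An illustrative sketch (assuming the `regex_automata::util::primitives`
+/// path referenced in the docs above):
+///
+/// ```
+/// use regex_automata::util::primitives::StateID;
+///
+/// let sid = StateID::must(42);
+/// assert_eq!(42, sid.as_usize());
+/// // Identifiers that do not fit in a "small index" are rejected.
+/// assert!(StateID::new(usize::MAX).is_err());
+/// ```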
+#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
+#[repr(transparent)]
+pub struct StateID(SmallIndex);
+
+index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter);
+index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter);
+
+/// A utility trait that defines a couple of adapters for making it convenient
+/// to access indices as "small index" types. We require ExactSizeIterator so
+/// that iterator construction can do a single check to make sure the index of
+/// each element is representable by its small index type.
+pub(crate) trait IteratorIndexExt: Iterator {
+    fn with_pattern_ids(self) -> WithPatternIDIter<Self>
+    where
+        Self: Sized + ExactSizeIterator,
+    {
+        WithPatternIDIter::new(self)
+    }
+
+    fn with_state_ids(self) -> WithStateIDIter<Self>
+    where
+        Self: Sized + ExactSizeIterator,
+    {
+        WithStateIDIter::new(self)
+    }
+}
+
+impl<I: Iterator> IteratorIndexExt for I {}
diff --git a/vendor/regex-automata/src/util/search.rs b/vendor/regex-automata/src/util/search.rs
new file mode 100644
index 0000000..39aec52
--- /dev/null
+++ b/vendor/regex-automata/src/util/search.rs
@@ -0,0 +1,1969 @@
+/*!
+Types and routines that support the search APIs of most regex engines.
+
+This sub-module isn't exposed directly, but rather, its contents are exported
+at the crate root due to the universality of most of the types and routines in
+this module.
+*/
+
+use core::ops::{Range, RangeBounds};
+
+use crate::util::{escape::DebugByte, primitives::PatternID, utf8};
+
+/// The parameters for a regex search including the haystack to search.
+///
+/// It turns out that regex searches have a few parameters, and in most cases,
+/// those parameters have defaults that work in the vast majority of cases.
+/// This `Input` type exists to make that common case seamless while also
+/// providing an avenue for changing the parameters of a search. In particular,
+/// this type enables doing so without a combinatorial explosion of different
+/// methods and/or superfluous parameters in the common cases.
+///
+/// An `Input` permits configuring the following things:
+///
+/// * Search only a substring of a haystack, while taking the broader context
+/// into account for resolving look-around assertions.
+/// * Indicating whether to search for all patterns in a regex, or to
+/// only search for one pattern in particular.
+/// * Whether to perform an anchored or unanchored search.
+/// * Whether to report a match as early as possible.
+///
+/// All of these parameters, except for the haystack, have sensible default
+/// values. This means that the minimal search configuration is simply a call
+/// to [`Input::new`] with your haystack. Setting any other parameter is
+/// optional.
+///
+/// Moreover, for any `H` that implements `AsRef<[u8]>`, there exists a
+/// `From<H> for Input` implementation. This is useful because many of the
+/// search APIs in this crate accept an `Into<Input>`. This means you can
+/// provide string or byte strings to these routines directly, and they'll
+/// automatically get converted into an `Input` for you.
+///
+/// The lifetime parameter `'h` refers to the lifetime of the haystack.
+///
+/// # Organization
+///
+/// The API of `Input` is split into a few different parts:
+///
+/// * A builder-like API that transforms an `Input` by value. Examples:
+/// [`Input::span`] and [`Input::anchored`].
+/// * A setter API that permits mutating parameters in place. Examples:
+/// [`Input::set_span`] and [`Input::set_anchored`].
+/// * A getter API that permits retrieving any of the search parameters.
+/// Examples: [`Input::get_span`] and [`Input::get_anchored`].
+/// * A few convenience getter routines that don't conform to the above naming
+/// pattern due to how common they are. Examples: [`Input::haystack`],
+/// [`Input::start`] and [`Input::end`].
+/// * Miscellaneous predicates and other helper routines that are useful
+/// in some contexts. Examples: [`Input::is_char_boundary`].
+///
+/// An `Input` exposes so much because it is meant to be used by both callers
+/// of regex engines _and_ implementors of regex engines. A constraining factor
+/// is that regex engines should accept a `&Input` as their lowest level API,
+/// which means that implementors should only use the "getter" APIs of an
+/// `Input`.
+///
+/// # Valid bounds and search termination
+///
+/// An `Input` permits setting the bounds of a search via either
+/// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or
+/// else a panic will occur. Bounds are valid if and only if:
+///
+/// * The bounds represent a valid range into the input's haystack.
+/// * **or** the end bound is a valid ending bound for the haystack *and*
+/// the start bound is exactly one greater than the end bound.
+///
+/// In the latter case, [`Input::is_done`] will return true, which indicates
+/// that any search receiving such an input should immediately return with no
+/// match.
+///
+/// Note that while `Input` is used for reverse searches in this crate, the
+/// `Input::is_done` predicate assumes a forward search. Because unsigned
+/// offsets are used internally, there is no way to tell from only the offsets
+/// whether a reverse search is done or not.
+///
+/// # Regex engine support
+///
+/// Any regex engine accepting an `Input` must support at least the following
+/// things:
+///
+/// * Searching a `&[u8]` for matches.
+/// * Searching a substring of `&[u8]` for a match, such that any match
+/// reported must appear entirely within that substring.
+/// * For a forwards search, a match should never be reported when
+/// [`Input::is_done`] returns true. (For reverse searches, termination should
+/// be handled outside of `Input`.)
+///
+/// Supporting other aspects of an `Input` is optional, but regex engines
+/// should handle aspects they don't support gracefully. How this is done is
+/// generally up to the regex engine. For example, this crate generally treats
+/// unsupported anchored modes as an error to report, but for simplicity, in
+/// the meta regex engine, trying to search with an invalid pattern ID just
+/// results in no match being reported.
+#[derive(Clone)]
+pub struct Input<'h> {
+    haystack: &'h [u8],
+    span: Span,
+    anchored: Anchored,
+    earliest: bool,
+}
+
+impl<'h> Input<'h> {
+    /// Create a new search configuration for the given haystack.
+    #[inline]
+    pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> {
+        Input {
+            haystack: haystack.as_ref(),
+            span: Span { start: 0, end: haystack.as_ref().len() },
+            anchored: Anchored::No,
+            earliest: false,
+        }
+    }
+
+    /// Set the span for this search.
+    ///
+    /// Note that [`Input::set_span`], to which this routine delegates, panics
+    /// if the span given is not a valid range for this search's haystack.
+    ///
+    /// This routine is generic over how a span is provided. While
+    /// a [`Span`] may be given directly, one may also provide a
+    /// `std::ops::Range<usize>`.
To provide anything supported by range + /// syntax, use the [`Input::range`] method. + /// + /// The default span is the entire haystack. + /// + /// Note that [`Input::range`] overrides this method and vice versa. + /// + /// # Panics + /// + /// This panics if the given span does not correspond to valid bounds in + /// the haystack or the termination of a search. + /// + /// # Example + /// + /// This example shows how the span of the search can impact whether a + /// match is reported or not. This is particularly relevant for look-around + /// operators, which might take things outside of the span into account + /// when determining whether they match. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// Match, Input, + /// }; + /// + /// // Look for 'at', but as a distinct word. + /// let re = PikeVM::new(r"\bat\b")?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// // Our haystack contains 'at', but not as a distinct word. + /// let haystack = "batter"; + /// + /// // A standard search finds nothing, as expected. + /// let input = Input::new(haystack); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(None, caps.get_match()); + /// + /// // But if we wanted to search starting at position '1', we might + /// // slice the haystack. If we do this, it's impossible for the \b + /// // anchors to take the surrounding context into account! And thus, + /// // a match is produced. + /// let input = Input::new(&haystack[1..3]); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match()); + /// + /// // But if we specify the span of the search instead of slicing the + /// // haystack, then the regex engine can "see" outside of the span + /// // and resolve the anchors correctly. + /// let input = Input::new(haystack).span(1..3); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(None, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// This may seem a little ham-fisted, but this scenario tends to come up + /// if some other regex engine found the match span and now you need to + /// re-process that span to look for capturing groups. (e.g., Run a faster + /// DFA first, find a match, then run the PikeVM on just the match span to + /// resolve capturing groups.) In order to implement that sort of logic + /// correctly, you need to set the span on the search instead of slicing + /// the haystack directly. + /// + /// The other advantage of using this routine to specify the bounds of the + /// search is that the match offsets are still reported in terms of the + /// original haystack. For example, the second search in the example above + /// reported a match at position `0`, even though `at` starts at offset + /// `1` because we sliced the haystack. + #[inline] + pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> { + self.set_span(span); + self + } + + /// Like `Input::span`, but accepts any range instead. + /// + /// This routine does not panic if the range given is not a valid range for + /// this search's haystack. If this search is run with an invalid range, + /// then the most likely outcome is that the actual search execution will + /// panic. + /// + /// The default range is the entire haystack. + /// + /// Note that [`Input::span`] overrides this method and vice versa. 
+ /// + /// # Panics + /// + /// This routine will panic if the given range could not be converted + /// to a valid [`Range`]. For example, this would panic when given + /// `0..=usize::MAX` since it cannot be represented using a half-open + /// interval in terms of `usize`. + /// + /// This also panics if the given range does not correspond to valid bounds + /// in the haystack or the termination of a search. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// + /// let input = Input::new("foobar").range(2..=4); + /// assert_eq!(2..5, input.get_range()); + /// ``` + #[inline] + pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> { + self.set_range(range); + self + } + + /// Sets the anchor mode of a search. + /// + /// When a search is anchored (so that's [`Anchored::Yes`] or + /// [`Anchored::Pattern`]), a match must begin at the start of a search. + /// When a search is not anchored (that's [`Anchored::No`]), regex engines + /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix + /// permits a match to appear anywhere. + /// + /// By default, the anchored mode is [`Anchored::No`]. + /// + /// **WARNING:** this is subtly different than using a `^` at the start of + /// your regex. A `^` forces a regex to match exclusively at the start of + /// a haystack, regardless of where you begin your search. In contrast, + /// anchoring a search will allow your regex to match anywhere in your + /// haystack, but the match must start at the beginning of a search. + /// + /// For example, consider the haystack `aba` and the following searches: + /// + /// 1. The regex `^a` is compiled with `Anchored::No` and searches `aba` + /// starting at position `2`. Since `^` requires the match to start at + /// the beginning of the haystack and `2 > 0`, no match is found. + /// 2. The regex `a` is compiled with `Anchored::Yes` and searches `aba` + /// starting at position `2`. This reports a match at `[2, 3]` since + /// the match starts where the search started. Since there is no `^`, + /// there is no requirement for the match to start at the beginning of + /// the haystack. + /// 3. The regex `a` is compiled with `Anchored::Yes` and searches `aba` + /// starting at position `1`. Since `b` corresponds to position `1` and + /// since the search is anchored, it finds no match. While the regex + /// matches at other positions, configuring the search to be anchored + /// requires that it only report a match that begins at the same offset + /// as the beginning of the search. + /// 4. The regex `a` is compiled with `Anchored::No` and searches `aba` + /// starting at position `1`. Since the search is not anchored and + /// the regex does not start with `^`, the search executes as if there + /// is a `(?s:.)*?` prefix that permits it to match anywhere. Thus, it + /// reports a match at `[2, 3]`. + /// + /// Note that the [`Anchored::Pattern`] mode is like `Anchored::Yes`, + /// except it only reports matches for a particular pattern. + /// + /// # Example + /// + /// This demonstrates the differences between an anchored search and + /// a pattern that begins with `^` (as described in the above warning + /// message). 
+ /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// Anchored, Match, Input, + /// }; + /// + /// let haystack = "aba"; + /// + /// let re = PikeVM::new(r"^a")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let input = Input::new(haystack).span(2..3).anchored(Anchored::No); + /// re.search(&mut cache, &input, &mut caps); + /// // No match is found because 2 is not the beginning of the haystack, + /// // which is what ^ requires. + /// assert_eq!(None, caps.get_match()); + /// + /// let re = PikeVM::new(r"a")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let input = Input::new(haystack).span(2..3).anchored(Anchored::Yes); + /// re.search(&mut cache, &input, &mut caps); + /// // An anchored search can still match anywhere in the haystack, it just + /// // must begin at the start of the search which is '2' in this case. + /// assert_eq!(Some(Match::must(0, 2..3)), caps.get_match()); + /// + /// let re = PikeVM::new(r"a")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let input = Input::new(haystack).span(1..3).anchored(Anchored::Yes); + /// re.search(&mut cache, &input, &mut caps); + /// // No match is found since we start searching at offset 1 which + /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match + /// // is found. + /// assert_eq!(None, caps.get_match()); + /// + /// let re = PikeVM::new(r"a")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let input = Input::new(haystack).span(1..3).anchored(Anchored::No); + /// re.search(&mut cache, &input, &mut caps); + /// // Since anchored=no, an implicit '(?s:.)*?' prefix was added to the + /// // pattern. Even though the search starts at 'b', the 'match anything' + /// // prefix allows the search to match 'a'. + /// let expected = Some(Match::must(0, 2..3)); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn anchored(mut self, mode: Anchored) -> Input<'h> { + self.set_anchored(mode); + self + } + + /// Whether to execute an "earliest" search or not. + /// + /// When running a non-overlapping search, an "earliest" search will return + /// the match location as early as possible. For example, given a pattern + /// of `foo[0-9]+` and a haystack of `foo12345`, a normal leftmost search + /// will return `foo12345` as a match. But an "earliest" search for regex + /// engines that support "earliest" semantics will return `foo1` as a + /// match, since as soon as the first digit following `foo` is seen, it is + /// known to have found a match. + /// + /// Note that "earliest" semantics generally depend on the regex engine. + /// Different regex engines may determine there is a match at different + /// points. So there is no guarantee that "earliest" matches will always + /// return the same offsets for all regex engines. The "earliest" notion + /// is really about when the particular regex engine determines there is + /// a match rather than a consistent semantic unto itself. This is often + /// useful for implementing "did a match occur or not" predicates, but + /// sometimes the offset is useful as well. + /// + /// This is disabled by default. + /// + /// # Example + /// + /// This example shows the difference between "earliest" searching and + /// normal searching. 
+ /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; + /// + /// let re = PikeVM::new(r"foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// // A normal search implements greediness like you expect. + /// let input = Input::new("foo12345"); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match()); + /// + /// // When 'earliest' is enabled and the regex engine supports + /// // it, the search will bail once it knows a match has been + /// // found. + /// let input = Input::new("foo12345").earliest(true); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 0..4)), caps.get_match()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn earliest(mut self, yes: bool) -> Input<'h> { + self.set_earliest(yes); + self + } + + /// Set the span for this search configuration. + /// + /// This is like the [`Input::span`] method, except this mutates the + /// span in place. + /// + /// This routine is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. + /// + /// # Panics + /// + /// This panics if the given span does not correspond to valid bounds in + /// the haystack or the termination of a search. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_span(2..4); + /// assert_eq!(2..4, input.get_range()); + /// ``` + #[inline] + pub fn set_span<S: Into<Span>>(&mut self, span: S) { + let span = span.into(); + assert!( + span.end <= self.haystack.len() + && span.start <= span.end.wrapping_add(1), + "invalid span {:?} for haystack of length {}", + span, + self.haystack.len(), + ); + self.span = span; + } + + /// Set the span for this search configuration given any range. + /// + /// This is like the [`Input::range`] method, except this mutates the + /// span in place. + /// + /// This routine does not panic if the range given is not a valid range for + /// this search's haystack. If this search is run with an invalid range, + /// then the most likely outcome is that the actual search execution will + /// panic. + /// + /// # Panics + /// + /// This routine will panic if the given range could not be converted + /// to a valid [`Range`]. For example, this would panic when given + /// `0..=usize::MAX` since it cannot be represented using a half-open + /// interval in terms of `usize`. + /// + /// This also panics if the given span does not correspond to valid bounds + /// in the haystack or the termination of a search. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_range(2..=4); + /// assert_eq!(2..5, input.get_range()); + /// ``` + #[inline] + pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) { + use core::ops::Bound; + + // It's a little weird to convert ranges into spans, and then spans + // back into ranges when we actually slice the haystack. Because + // of that process, we always represent everything as a half-open + // internal. Therefore, handling things like m..=n is a little awkward. + let start = match range.start_bound() { + Bound::Included(&i) => i, + // Can this case ever happen? Range syntax doesn't support it... 
+ Bound::Excluded(&i) => i.checked_add(1).unwrap(), + Bound::Unbounded => 0, + }; + let end = match range.end_bound() { + Bound::Included(&i) => i.checked_add(1).unwrap(), + Bound::Excluded(&i) => i, + Bound::Unbounded => self.haystack().len(), + }; + self.set_span(Span { start, end }); + } + + /// Set the starting offset for the span for this search configuration. + /// + /// This is a convenience routine for only mutating the start of a span + /// without having to set the entire span. + /// + /// # Panics + /// + /// This panics if the span resulting from the new start position does not + /// correspond to valid bounds in the haystack or the termination of a + /// search. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_start(5); + /// assert_eq!(5..6, input.get_range()); + /// ``` + #[inline] + pub fn set_start(&mut self, start: usize) { + self.set_span(Span { start, ..self.get_span() }); + } + + /// Set the ending offset for the span for this search configuration. + /// + /// This is a convenience routine for only mutating the end of a span + /// without having to set the entire span. + /// + /// # Panics + /// + /// This panics if the span resulting from the new end position does not + /// correspond to valid bounds in the haystack or the termination of a + /// search. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_end(5); + /// assert_eq!(0..5, input.get_range()); + /// ``` + #[inline] + pub fn set_end(&mut self, end: usize) { + self.set_span(Span { end, ..self.get_span() }); + } + + /// Set the anchor mode of a search. + /// + /// This is like [`Input::anchored`], except it mutates the search + /// configuration in place. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Anchored, Input, PatternID}; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(Anchored::No, input.get_anchored()); + /// + /// let pid = PatternID::must(5); + /// input.set_anchored(Anchored::Pattern(pid)); + /// assert_eq!(Anchored::Pattern(pid), input.get_anchored()); + /// ``` + #[inline] + pub fn set_anchored(&mut self, mode: Anchored) { + self.anchored = mode; + } + + /// Set whether the search should execute in "earliest" mode or not. + /// + /// This is like [`Input::earliest`], except it mutates the search + /// configuration in place. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert!(!input.get_earliest()); + /// input.set_earliest(true); + /// assert!(input.get_earliest()); + /// ``` + #[inline] + pub fn set_earliest(&mut self, yes: bool) { + self.earliest = yes; + } + + /// Return a borrow of the underlying haystack as a slice of bytes. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(b"foobar", input.haystack()); + /// ``` + #[inline] + pub fn haystack(&self) -> &[u8] { + self.haystack + } + + /// Return the start position of this search. + /// + /// This is a convenience routine for `search.get_span().start()`. + /// + /// When [`Input::is_done`] is `false`, this is guaranteed to return + /// an offset that is less than or equal to [`Input::end`]. Otherwise, + /// the offset is one greater than [`Input::end`]. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(0, input.start()); + /// + /// let input = Input::new("foobar").span(2..4); + /// assert_eq!(2, input.start()); + /// ``` + #[inline] + pub fn start(&self) -> usize { + self.get_span().start + } + + /// Return the end position of this search. + /// + /// This is a convenience routine for `search.get_span().end()`. + /// + /// This is guaranteed to return an offset that is a valid exclusive end + /// bound for this input's haystack. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(6, input.end()); + /// + /// let input = Input::new("foobar").span(2..4); + /// assert_eq!(4, input.end()); + /// ``` + #[inline] + pub fn end(&self) -> usize { + self.get_span().end + } + + /// Return the span for this search configuration. + /// + /// If one was not explicitly set, then the span corresponds to the entire + /// range of the haystack. + /// + /// When [`Input::is_done`] is `false`, the span returned is guaranteed + /// to correspond to valid bounds for this input's haystack. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Input, Span}; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(Span { start: 0, end: 6 }, input.get_span()); + /// ``` + #[inline] + pub fn get_span(&self) -> Span { + self.span + } + + /// Return the span as a range for this search configuration. + /// + /// If one was not explicitly set, then the span corresponds to the entire + /// range of the haystack. + /// + /// When [`Input::is_done`] is `false`, the range returned is guaranteed + /// to correspond to valid bounds for this input's haystack. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// ``` + #[inline] + pub fn get_range(&self) -> Range<usize> { + self.get_span().range() + } + + /// Return the anchored mode for this search configuration. + /// + /// If no anchored mode was set, then it defaults to [`Anchored::No`]. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Anchored, Input, PatternID}; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(Anchored::No, input.get_anchored()); + /// + /// let pid = PatternID::must(5); + /// input.set_anchored(Anchored::Pattern(pid)); + /// assert_eq!(Anchored::Pattern(pid), input.get_anchored()); + /// ``` + #[inline] + pub fn get_anchored(&self) -> Anchored { + self.anchored + } + + /// Return whether this search should execute in "earliest" mode. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert!(!input.get_earliest()); + /// ``` + #[inline] + pub fn get_earliest(&self) -> bool { + self.earliest + } + + /// Return true if and only if this search can never return any other + /// matches. + /// + /// This occurs when the start position of this search is greater than the + /// end position of the search. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert!(!input.is_done()); + /// input.set_start(6); + /// assert!(!input.is_done()); + /// input.set_start(7); + /// assert!(input.is_done()); + /// ``` + #[inline] + pub fn is_done(&self) -> bool { + self.get_span().start > self.get_span().end + } + + /// Returns true if and only if the given offset in this search's haystack + /// falls on a valid UTF-8 encoded codepoint boundary. + /// + /// If the haystack is not valid UTF-8, then the behavior of this routine + /// is unspecified. + /// + /// # Example + /// + /// This shows where codepoint boundaries do and don't exist in valid + /// UTF-8. + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("☃"); + /// assert!(input.is_char_boundary(0)); + /// assert!(!input.is_char_boundary(1)); + /// assert!(!input.is_char_boundary(2)); + /// assert!(input.is_char_boundary(3)); + /// assert!(!input.is_char_boundary(4)); + /// ``` + #[inline] + pub fn is_char_boundary(&self, offset: usize) -> bool { + utf8::is_boundary(self.haystack(), offset) + } +} + +impl<'h> core::fmt::Debug for Input<'h> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use crate::util::escape::DebugHaystack; + + f.debug_struct("Input") + .field("haystack", &DebugHaystack(self.haystack())) + .field("span", &self.span) + .field("anchored", &self.anchored) + .field("earliest", &self.earliest) + .finish() + } +} + +impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> { + fn from(haystack: &'h H) -> Input<'h> { + Input::new(haystack) + } +} + +/// A representation of a span reported by a regex engine. +/// +/// A span corresponds to the starting and ending _byte offsets_ of a +/// contiguous region of bytes. The starting offset is inclusive while the +/// ending offset is exclusive. That is, a span is a half-open interval. +/// +/// A span is used to report the offsets of a match, but it is also used to +/// convey which region of a haystack should be searched via routines like +/// [`Input::span`]. +/// +/// This is basically equivalent to a `std::ops::Range<usize>`, except this +/// type implements `Copy` which makes it more ergonomic to use in the context +/// of this crate. Like a range, this implements `Index` for `[u8]` and `str`, +/// and `IndexMut` for `[u8]`. For convenience, this also impls `From<Range>`, +/// which means things like `Span::from(5..10)` work. +#[derive(Clone, Copy, Eq, Hash, PartialEq)] +pub struct Span { + /// The start offset of the span, inclusive. + pub start: usize, + /// The end offset of the span, exclusive. + pub end: usize, +} + +impl Span { + /// Returns this span as a range. + #[inline] + pub fn range(&self) -> Range<usize> { + Range::from(*self) + } + + /// Returns true when this span is empty. That is, when `start >= end`. + #[inline] + pub fn is_empty(&self) -> bool { + self.start >= self.end + } + + /// Returns the length of this span. + /// + /// This returns `0` in precisely the cases that `is_empty` returns `true`. + #[inline] + pub fn len(&self) -> usize { + self.end.saturating_sub(self.start) + } + + /// Returns true when the given offset is contained within this span. + /// + /// Note that an empty span contains no offsets and will always return + /// false. 
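+    ///
+    /// # Example
+    ///
+    /// A brief illustrative sketch of the behavior described above:
+    ///
+    /// ```
+    /// use regex_automata::Span;
+    ///
+    /// let span = Span::from(3..6);
+    /// assert!(span.contains(3));
+    /// assert!(span.contains(4));
+    /// assert!(!span.contains(0));
+    /// // An empty span contains no offsets at all.
+    /// assert!(!Span::from(5..5).contains(5));
+    /// ```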
+ #[inline] + pub fn contains(&self, offset: usize) -> bool { + !self.is_empty() && self.start <= offset && offset <= self.end + } + + /// Returns a new span with `offset` added to this span's `start` and `end` + /// values. + #[inline] + pub fn offset(&self, offset: usize) -> Span { + Span { start: self.start + offset, end: self.end + offset } + } +} + +impl core::fmt::Debug for Span { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{}..{}", self.start, self.end) + } +} + +impl core::ops::Index<Span> for [u8] { + type Output = [u8]; + + #[inline] + fn index(&self, index: Span) -> &[u8] { + &self[index.range()] + } +} + +impl core::ops::IndexMut<Span> for [u8] { + #[inline] + fn index_mut(&mut self, index: Span) -> &mut [u8] { + &mut self[index.range()] + } +} + +impl core::ops::Index<Span> for str { + type Output = str; + + #[inline] + fn index(&self, index: Span) -> &str { + &self[index.range()] + } +} + +impl From<Range<usize>> for Span { + #[inline] + fn from(range: Range<usize>) -> Span { + Span { start: range.start, end: range.end } + } +} + +impl From<Span> for Range<usize> { + #[inline] + fn from(span: Span) -> Range<usize> { + Range { start: span.start, end: span.end } + } +} + +impl PartialEq<Range<usize>> for Span { + #[inline] + fn eq(&self, range: &Range<usize>) -> bool { + self.start == range.start && self.end == range.end + } +} + +impl PartialEq<Span> for Range<usize> { + #[inline] + fn eq(&self, span: &Span) -> bool { + self.start == span.start && self.end == span.end + } +} + +/// A representation of "half" of a match reported by a DFA. +/// +/// This is called a "half" match because it only includes the end location (or +/// start location for a reverse search) of a match. This corresponds to the +/// information that a single DFA scan can report. Getting the other half of +/// the match requires a second scan with a reversed DFA. +/// +/// A half match also includes the pattern that matched. The pattern is +/// identified by an ID, which corresponds to its position (starting from `0`) +/// relative to other patterns used to construct the corresponding DFA. If only +/// a single pattern is provided to the DFA, then all matches are guaranteed to +/// have a pattern ID of `0`. +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub struct HalfMatch { + /// The pattern ID. + pattern: PatternID, + /// The offset of the match. + /// + /// For forward searches, the offset is exclusive. For reverse searches, + /// the offset is inclusive. + offset: usize, +} + +impl HalfMatch { + /// Create a new half match from a pattern ID and a byte offset. + #[inline] + pub fn new(pattern: PatternID, offset: usize) -> HalfMatch { + HalfMatch { pattern, offset } + } + + /// Create a new half match from a pattern ID and a byte offset. + /// + /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a + /// [`PatternID`]. This panics if the given `usize` is not representable + /// as a `PatternID`. + #[inline] + pub fn must(pattern: usize, offset: usize) -> HalfMatch { + HalfMatch::new(PatternID::new(pattern).unwrap(), offset) + } + + /// Returns the ID of the pattern that matched. + /// + /// The ID of a pattern is derived from the position in which it was + /// originally inserted into the corresponding DFA. The first pattern has + /// identifier `0`, and each subsequent pattern is `1`, `2` and so on. + #[inline] + pub fn pattern(&self) -> PatternID { + self.pattern + } + + /// The position of the match. 
+ /// + /// If this match was produced by a forward search, then the offset is + /// exclusive. If this match was produced by a reverse search, then the + /// offset is inclusive. + #[inline] + pub fn offset(&self) -> usize { + self.offset + } +} + +/// A representation of a match reported by a regex engine. +/// +/// A match has two essential pieces of information: the [`PatternID`] that +/// matches, and the [`Span`] of the match in a haystack. +/// +/// The pattern is identified by an ID, which corresponds to its position +/// (starting from `0`) relative to other patterns used to construct the +/// corresponding regex engine. If only a single pattern is provided, then all +/// matches are guaranteed to have a pattern ID of `0`. +/// +/// Every match reported by a regex engine guarantees that its span has its +/// start offset as less than or equal to its end offset. +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub struct Match { + /// The pattern ID. + pattern: PatternID, + /// The underlying match span. + span: Span, +} + +impl Match { + /// Create a new match from a pattern ID and a span. + /// + /// This constructor is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. + /// + /// # Panics + /// + /// This panics if `end < start`. + /// + /// # Example + /// + /// This shows how to create a match for the first pattern in a regex + /// object using convenient range syntax. + /// + /// ``` + /// use regex_automata::{Match, PatternID}; + /// + /// let m = Match::new(PatternID::ZERO, 5..10); + /// assert_eq!(0, m.pattern().as_usize()); + /// assert_eq!(5, m.start()); + /// assert_eq!(10, m.end()); + /// ``` + #[inline] + pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match { + let span: Span = span.into(); + assert!(span.start <= span.end, "invalid match span"); + Match { pattern, span } + } + + /// Create a new match from a pattern ID and a byte offset span. + /// + /// This constructor is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. + /// + /// This is like [`Match::new`], but accepts a `usize` instead of a + /// [`PatternID`]. This panics if the given `usize` is not representable + /// as a `PatternID`. + /// + /// # Panics + /// + /// This panics if `end < start` or if `pattern > PatternID::MAX`. + /// + /// # Example + /// + /// This shows how to create a match for the third pattern in a regex + /// object using convenient range syntax. + /// + /// ``` + /// use regex_automata::Match; + /// + /// let m = Match::must(3, 5..10); + /// assert_eq!(3, m.pattern().as_usize()); + /// assert_eq!(5, m.start()); + /// assert_eq!(10, m.end()); + /// ``` + #[inline] + pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match { + Match::new(PatternID::must(pattern), span) + } + + /// Returns the ID of the pattern that matched. + /// + /// The ID of a pattern is derived from the position in which it was + /// originally inserted into the corresponding regex engine. The first + /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and + /// so on. + #[inline] + pub fn pattern(&self) -> PatternID { + self.pattern + } + + /// The starting position of the match. + /// + /// This is a convenience routine for `Match::span().start`. + #[inline] + pub fn start(&self) -> usize { + self.span().start + } + + /// The ending position of the match. 
+ /// + /// This is a convenience routine for `Match::span().end`. + #[inline] + pub fn end(&self) -> usize { + self.span().end + } + + /// Returns the match span as a range. + /// + /// This is a convenience routine for `Match::span().range()`. + #[inline] + pub fn range(&self) -> core::ops::Range<usize> { + self.span().range() + } + + /// Returns the span for this match. + #[inline] + pub fn span(&self) -> Span { + self.span + } + + /// Returns true when the span in this match is empty. + /// + /// An empty match can only be returned when the regex itself can match + /// the empty string. + #[inline] + pub fn is_empty(&self) -> bool { + self.span().is_empty() + } + + /// Returns the length of this match. + /// + /// This returns `0` in precisely the cases that `is_empty` returns `true`. + #[inline] + pub fn len(&self) -> usize { + self.span().len() + } +} + +/// A set of `PatternID`s. +/// +/// A set of pattern identifiers is useful for recording which patterns have +/// matched a particular haystack. A pattern set _only_ includes pattern +/// identifiers. It does not include offset information. +/// +/// # Example +/// +/// This shows basic usage of a set. +/// +/// ``` +/// use regex_automata::{PatternID, PatternSet}; +/// +/// let pid1 = PatternID::must(5); +/// let pid2 = PatternID::must(8); +/// // Create a new empty set. +/// let mut set = PatternSet::new(10); +/// // Insert pattern IDs. +/// set.insert(pid1); +/// set.insert(pid2); +/// // Test membership. +/// assert!(set.contains(pid1)); +/// assert!(set.contains(pid2)); +/// // Get all members. +/// assert_eq!( +/// vec![5, 8], +/// set.iter().map(|p| p.as_usize()).collect::<Vec<usize>>(), +/// ); +/// // Clear the set. +/// set.clear(); +/// // Test that it is indeed empty. +/// assert!(set.is_empty()); +/// ``` +#[cfg(feature = "alloc")] +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct PatternSet { + /// The number of patterns set to 'true' in this set. + len: usize, + /// A map from PatternID to boolean of whether a pattern matches or not. + /// + /// This should probably be a bitset, but it's probably unlikely to matter + /// much in practice. + /// + /// The main downside of this representation (and similarly for a bitset) + /// is that iteration scales with the capacity of the set instead of + /// the length of the set. This doesn't seem likely to be a problem in + /// practice. + /// + /// Another alternative is to just use a 'SparseSet' for this. It does use + /// more memory (quite a bit more), but that seems fine I think compared + /// to the memory being used by the regex engine. The real hiccup with + /// it is that it yields pattern IDs in the order they were inserted. + /// Which is actually kind of nice, but at the time of writing, pattern + /// IDs are yielded in ascending order in the regex crate RegexSet API. + /// If we did change to 'SparseSet', we could provide an additional + /// 'iter_match_order' iterator, but keep the ascending order one for + /// compatibility. + which: alloc::boxed::Box<[bool]>, +} + +#[cfg(feature = "alloc")] +impl PatternSet { + /// Create a new set of pattern identifiers with the given capacity. + /// + /// The given capacity typically corresponds to (at least) the number of + /// patterns in a compiled regex object. + /// + /// # Panics + /// + /// This panics if the given capacity exceeds [`PatternID::LIMIT`]. This is + /// impossible if you use the `pattern_len()` method as defined on any of + /// the regex engines in this crate. 
Namely, a regex will fail to build by + /// returning an error if the number of patterns given to it exceeds the + /// limit. Therefore, the number of patterns in a valid regex is always + /// a correct capacity to provide here. + pub fn new(capacity: usize) -> PatternSet { + assert!( + capacity <= PatternID::LIMIT, + "pattern set capacity exceeds limit of {}", + PatternID::LIMIT, + ); + PatternSet { + len: 0, + which: alloc::vec![false; capacity].into_boxed_slice(), + } + } + + /// Clear this set such that it contains no pattern IDs. + pub fn clear(&mut self) { + self.len = 0; + for matched in self.which.iter_mut() { + *matched = false; + } + } + + /// Return true if and only if the given pattern identifier is in this set. + pub fn contains(&self, pid: PatternID) -> bool { + pid.as_usize() < self.capacity() && self.which[pid] + } + + /// Insert the given pattern identifier into this set and return `true` if + /// the given pattern ID was not previously in this set. + /// + /// If the pattern identifier is already in this set, then this is a no-op. + /// + /// Use [`PatternSet::try_insert`] for a fallible version of this routine. + /// + /// # Panics + /// + /// This panics if this pattern set has insufficient capacity to + /// store the given pattern ID. + pub fn insert(&mut self, pid: PatternID) -> bool { + self.try_insert(pid) + .expect("PatternSet should have sufficient capacity") + } + + /// Insert the given pattern identifier into this set and return `true` if + /// the given pattern ID was not previously in this set. + /// + /// If the pattern identifier is already in this set, then this is a no-op. + /// + /// # Errors + /// + /// This returns an error if this pattern set has insufficient capacity to + /// store the given pattern ID. + pub fn try_insert( + &mut self, + pid: PatternID, + ) -> Result<bool, PatternSetInsertError> { + if pid.as_usize() >= self.capacity() { + return Err(PatternSetInsertError { + attempted: pid, + capacity: self.capacity(), + }); + } + if self.which[pid] { + return Ok(false); + } + self.len += 1; + self.which[pid] = true; + Ok(true) + } + + /* + // This is currently commented out because it is unused and it is unclear + // whether it's useful or not. What's the harm in having it? When, if + // we ever wanted to change our representation to a 'SparseSet', then + // supporting this method would be a bit tricky. So in order to keep some + // API evolution flexibility, we leave it out for now. + + /// Remove the given pattern identifier from this set. + /// + /// If the pattern identifier was not previously in this set, then this + /// does not change the set and returns `false`. + /// + /// # Panics + /// + /// This panics if `pid` exceeds the capacity of this set. + pub fn remove(&mut self, pid: PatternID) -> bool { + if !self.which[pid] { + return false; + } + self.len -= 1; + self.which[pid] = false; + true + } + */ + + /// Return true if and only if this set has no pattern identifiers in it. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Return true if and only if this set has the maximum number of pattern + /// identifiers in the set. This occurs precisely when `PatternSet::len() + /// == PatternSet::capacity()`. + /// + /// This particular property is useful to test because it may allow one to + /// stop a search earlier than you might otherwise. 
Namely, if a search is + /// only reporting which patterns match a haystack and if you know all of + /// the patterns match at a given point, then there's no new information + /// that can be learned by continuing the search. (Because a pattern set + /// does not keep track of offset information.) + pub fn is_full(&self) -> bool { + self.len() == self.capacity() + } + + /// Returns the total number of pattern identifiers in this set. + pub fn len(&self) -> usize { + self.len + } + + /// Returns the total number of pattern identifiers that may be stored + /// in this set. + /// + /// This is guaranteed to be less than or equal to [`PatternID::LIMIT`]. + /// + /// Typically, the capacity of a pattern set matches the number of patterns + /// in a regex object with which you are searching. + pub fn capacity(&self) -> usize { + self.which.len() + } + + /// Returns an iterator over all pattern identifiers in this set. + /// + /// The iterator yields pattern identifiers in ascending order, starting + /// at zero. + pub fn iter(&self) -> PatternSetIter<'_> { + PatternSetIter { it: self.which.iter().enumerate() } + } +} + +/// An error that occurs when a `PatternID` failed to insert into a +/// `PatternSet`. +/// +/// An insert fails when the given `PatternID` exceeds the configured capacity +/// of the `PatternSet`. +/// +/// This error is created by the [`PatternSet::try_insert`] routine. +#[cfg(feature = "alloc")] +#[derive(Clone, Debug)] +pub struct PatternSetInsertError { + attempted: PatternID, + capacity: usize, +} + +#[cfg(feature = "std")] +impl std::error::Error for PatternSetInsertError {} + +#[cfg(feature = "alloc")] +impl core::fmt::Display for PatternSetInsertError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!( + f, + "failed to insert pattern ID {} into pattern set \ + with insufficiet capacity of {}", + self.attempted.as_usize(), + self.capacity, + ) + } +} + +/// An iterator over all pattern identifiers in a [`PatternSet`]. +/// +/// The lifetime parameter `'a` refers to the lifetime of the pattern set being +/// iterated over. +/// +/// This iterator is created by the [`PatternSet::iter`] method. +#[cfg(feature = "alloc")] +#[derive(Clone, Debug)] +pub struct PatternSetIter<'a> { + it: core::iter::Enumerate<core::slice::Iter<'a, bool>>, +} + +#[cfg(feature = "alloc")] +impl<'a> Iterator for PatternSetIter<'a> { + type Item = PatternID; + + fn next(&mut self) -> Option<PatternID> { + while let Some((index, &yes)) = self.it.next() { + if yes { + // Only valid 'PatternID' values can be inserted into the set + // and construction of the set panics if the capacity would + // permit storing invalid pattern IDs. Thus, 'yes' is only true + // precisely when 'index' corresponds to a valid 'PatternID'. + return Some(PatternID::new_unchecked(index)); + } + } + None + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.it.size_hint() + } +} + +#[cfg(feature = "alloc")] +impl<'a> DoubleEndedIterator for PatternSetIter<'a> { + fn next_back(&mut self) -> Option<PatternID> { + while let Some((index, &yes)) = self.it.next_back() { + if yes { + // Only valid 'PatternID' values can be inserted into the set + // and construction of the set panics if the capacity would + // permit storing invalid pattern IDs. Thus, 'yes' is only true + // precisely when 'index' corresponds to a valid 'PatternID'. + return Some(PatternID::new_unchecked(index)); + } + } + None + } +} + +/// The type of anchored search to perform. +/// +/// This is *almost* a boolean option. 
That is, you can either do an unanchored +/// search for any pattern in a regex, or you can do an anchored search for any +/// pattern in a regex. +/// +/// A third option exists that, assuming the regex engine supports it, permits +/// you to do an anchored search for a specific pattern. +/// +/// Note that there is no way to run an unanchored search for a specific +/// pattern. If you need that, you'll need to build separate regexes for each +/// pattern. +/// +/// # Errors +/// +/// If a regex engine does not support the anchored mode selected, then the +/// regex engine will return an error. While any non-trivial regex engine +/// should support at least one of the available anchored modes, there is no +/// singular mode that is guaranteed to be universally supported. Some regex +/// engines might only support unanchored searches (DFAs compiled without +/// anchored starting states) and some regex engines might only support +/// anchored searches (like the one-pass DFA). +/// +/// The specific error returned is a [`MatchError`] with a +/// [`MatchErrorKind::UnsupportedAnchored`] kind. The kind includes the +/// `Anchored` value given that is unsupported. +/// +/// Note that regex engines should report "no match" if, for example, an +/// `Anchored::Pattern` is provided with an invalid pattern ID _but_ where +/// anchored searches for a specific pattern are supported. This is smooths out +/// behavior such that it's possible to guarantee that an error never occurs +/// based on how the regex engine is configured. All regex engines in this +/// crate report "no match" when searching for an invalid pattern ID, but where +/// searching for a valid pattern ID is otherwise supported. +/// +/// # Example +/// +/// This example shows how to use the various `Anchored` modes to run a +/// search. We use the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) +/// because it supports all modes unconditionally. Some regex engines, like +/// the [`onepass::DFA`](crate::dfa::onepass::DFA) cannot support unanchored +/// searches. +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{ +/// nfa::thompson::pikevm::PikeVM, +/// Anchored, Input, Match, PatternID, +/// }; +/// +/// let re = PikeVM::new_many(&[ +/// r"Mrs. \w+", +/// r"Miss \w+", +/// r"Mr. \w+", +/// r"Ms. \w+", +/// ])?; +/// let mut cache = re.create_cache(); +/// let hay = "Hello Mr. Springsteen!"; +/// +/// // The default is to do an unanchored search. +/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay)); +/// // Explicitly ask for an unanchored search. Same as above. +/// let input = Input::new(hay).anchored(Anchored::No); +/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay)); +/// +/// // Now try an anchored search. Since the match doesn't start at the +/// // beginning of the haystack, no match is found! +/// let input = Input::new(hay).anchored(Anchored::Yes); +/// assert_eq!(None, re.find(&mut cache, input)); +/// +/// // We can try an anchored search again, but move the location of where +/// // we start the search. Note that the offsets reported are still in +/// // terms of the overall haystack and not relative to where we started +/// // the search. +/// let input = Input::new(hay).anchored(Anchored::Yes).range(6..); +/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input)); +/// +/// // Now try an anchored search for a specific pattern. 
We specifically +/// // choose a pattern that we know doesn't match to prove that the search +/// // only looks for the pattern we provide. +/// let input = Input::new(hay) +/// .anchored(Anchored::Pattern(PatternID::must(1))) +/// .range(6..); +/// assert_eq!(None, re.find(&mut cache, input)); +/// +/// // But if we switch it to the pattern that we know matches, then we find +/// // the match. +/// let input = Input::new(hay) +/// .anchored(Anchored::Pattern(PatternID::must(2))) +/// .range(6..); +/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Anchored { + /// Run an unanchored search. This means a match may occur anywhere at or + /// after the start position of the search. + /// + /// This search can return a match for any pattern in the regex. + No, + /// Run an anchored search. This means that a match must begin at the + /// start position of the search. + /// + /// This search can return a match for any pattern in the regex. + Yes, + /// Run an anchored search for a specific pattern. This means that a match + /// must be for the given pattern and must begin at the start position of + /// the search. + Pattern(PatternID), +} + +impl Anchored { + /// Returns true if and only if this anchor mode corresponds to any kind of + /// anchored search. + /// + /// # Example + /// + /// This examples shows that both `Anchored::Yes` and `Anchored::Pattern` + /// are considered anchored searches. + /// + /// ``` + /// use regex_automata::{Anchored, PatternID}; + /// + /// assert!(!Anchored::No.is_anchored()); + /// assert!(Anchored::Yes.is_anchored()); + /// assert!(Anchored::Pattern(PatternID::ZERO).is_anchored()); + /// ``` + #[inline] + pub fn is_anchored(&self) -> bool { + matches!(*self, Anchored::Yes | Anchored::Pattern(_)) + } + + /// Returns the pattern ID associated with this configuration if it is an + /// anchored search for a specific pattern. Otherwise `None` is returned. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Anchored, PatternID}; + /// + /// assert_eq!(None, Anchored::No.pattern()); + /// assert_eq!(None, Anchored::Yes.pattern()); + /// + /// let pid = PatternID::must(5); + /// assert_eq!(Some(pid), Anchored::Pattern(pid).pattern()); + /// ``` + #[inline] + pub fn pattern(&self) -> Option<PatternID> { + match *self { + Anchored::Pattern(pid) => Some(pid), + _ => None, + } + } +} + +/// The kind of match semantics to use for a regex pattern. +/// +/// The default match kind is `LeftmostFirst`, and this corresponds to the +/// match semantics used by most backtracking engines, such as Perl. +/// +/// # Leftmost first or "preference order" match semantics +/// +/// Leftmost-first semantics determine which match to report when there are +/// multiple paths through a regex that match at the same position. The tie is +/// essentially broken by how a backtracker would behave. For example, consider +/// running the regex `foofoofoo|foofoo|foo` on the haystack `foofoo`. In this +/// case, both the `foofoo` and `foo` branches match at position `0`. So should +/// the end of the match be `3` or `6`? +/// +/// A backtracker will conceptually work by trying `foofoofoo` and failing. +/// Then it will try `foofoo`, find the match and stop there. Thus, the +/// leftmost-first match position is `6`. 
This is called "leftmost-first" or +/// "preference order" because the order of the branches as written in the +/// regex pattern is what determines how to break the tie. +/// +/// (Note that leftmost-longest match semantics, which break ties by always +/// taking the longest matching string, are not currently supported by this +/// crate. These match semantics tend to be found in POSIX regex engines.) +/// +/// This example shows how leftmost-first semantics work, and how it even +/// applies to multi-pattern regexes: +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::pikevm::PikeVM, +/// Match, +/// }; +/// +/// let re = PikeVM::new_many(&[ +/// r"foofoofoo", +/// r"foofoo", +/// r"foo", +/// ])?; +/// let mut cache = re.create_cache(); +/// let got: Vec<Match> = re.find_iter(&mut cache, "foofoo").collect(); +/// let expected = vec![Match::must(1, 0..6)]; +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # All matches +/// +/// The `All` match semantics report any and all matches, and generally will +/// attempt to match as much as possible. It doesn't respect any sort of match +/// priority at all, so things like non-greedy matching don't work in this +/// mode. +/// +/// The fact that non-greedy matching doesn't work generally makes most forms +/// of unanchored non-overlapping searches have unintuitive behavior. Namely, +/// unanchored searches behave as if there is a `(?s-u:.)*?` prefix at the +/// beginning of the pattern, which is specifically non-greedy. Since it will +/// be treated as greedy in `All` match semantics, this generally means that +/// it will first attempt to consume all of the haystack and is likely to wind +/// up skipping matches. +/// +/// Generally speaking, `All` should only be used in two circumstances: +/// +/// * When running an anchored search and there is a desire to match as much as +/// possible. For example, when building a reverse regex matcher to find the +/// start of a match after finding the end. In this case, the reverse search +/// is anchored to the end of the match found by the forward search. +/// * When running overlapping searches. Since `All` encodes all possible +/// matches, this is generally what you want for an overlapping search. If you +/// try to use leftmost-first in an overlapping search, it is likely to produce +/// counter-intuitive results since leftmost-first specifically excludes some +/// matches from its underlying finite state machine. +/// +/// This example demonstrates the counter-intuitive behavior of `All` semantics +/// when using a standard leftmost unanchored search: +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::pikevm::PikeVM, +/// Match, MatchKind, +/// }; +/// +/// let re = PikeVM::builder() +/// .configure(PikeVM::config().match_kind(MatchKind::All)) +/// .build("foo")?; +/// let hay = "first foo second foo wat"; +/// let mut cache = re.create_cache(); +/// let got: Vec<Match> = re.find_iter(&mut cache, hay).collect(); +/// // Notice that it completely skips the first 'foo'! +/// let expected = vec![Match::must(0, 17..20)]; +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// This second example shows how `All` semantics are useful for an overlapping +/// search. Note that we use lower level lazy DFA APIs here since the NFA +/// engines only currently support a very limited form of overlapping search. 
+/// +/// ``` +/// use regex_automata::{ +/// hybrid::dfa::{DFA, OverlappingState}, +/// HalfMatch, Input, MatchKind, +/// }; +/// +/// let re = DFA::builder() +/// // If we didn't set 'All' semantics here, then the regex would only +/// // match 'foo' at offset 3 and nothing else. Why? Because the state +/// // machine implements preference order and knows that the 'foofoo' and +/// // 'foofoofoo' branches can never match since 'foo' will always match +/// // when they match and take priority. +/// .configure(DFA::config().match_kind(MatchKind::All)) +/// .build(r"foo|foofoo|foofoofoo")?; +/// let mut cache = re.create_cache(); +/// let mut state = OverlappingState::start(); +/// let input = Input::new("foofoofoo"); +/// let mut got = vec![]; +/// loop { +/// re.try_search_overlapping_fwd(&mut cache, &input, &mut state)?; +/// let m = match state.get_match() { +/// None => break, +/// Some(m) => m, +/// }; +/// got.push(m); +/// } +/// let expected = vec![ +/// HalfMatch::must(0, 3), +/// HalfMatch::must(0, 6), +/// HalfMatch::must(0, 9), +/// ]; +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[non_exhaustive] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum MatchKind { + /// Report all possible matches. + All, + /// Report only the leftmost matches. When multiple leftmost matches exist, + /// report the match corresponding to the part of the regex that appears + /// first in the syntax. + LeftmostFirst, + // There is prior art in RE2 that shows that we should be able to add + // LeftmostLongest too. The tricky part of it is supporting ungreedy + // repetitions. Instead of treating all NFA states as having equivalent + // priority (as in 'All') or treating all NFA states as having distinct + // priority based on order (as in 'LeftmostFirst'), we instead group NFA + // states into sets, and treat members of each set as having equivalent + // priority, but having greater priority than all following members + // of different sets. + // + // However, it's not clear whether it's really worth adding this. After + // all, leftmost-longest can be emulated when using literals by using + // leftmost-first and sorting the literals by length in descending order. + // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will + // always match `a` in `ab` when using leftmost-first, but leftmost-longest + // would match `ab`. +} + +impl MatchKind { + #[cfg(feature = "alloc")] + pub(crate) fn continue_past_first_match(&self) -> bool { + *self == MatchKind::All + } +} + +impl Default for MatchKind { + fn default() -> MatchKind { + MatchKind::LeftmostFirst + } +} + +/// An error indicating that a search stopped before reporting whether a +/// match exists or not. +/// +/// To be very clear, this error type implies that one cannot assume that no +/// matches occur, since the search stopped before completing. That is, if +/// you're looking for information about where a search determined that no +/// match can occur, then this error type does *not* give you that. (Indeed, at +/// the time of writing, if you need such a thing, you have to write your own +/// search routine.) +/// +/// Normally, when one searches for something, the response is either an +/// affirmative "it was found at this location" or a negative "not found at +/// all." However, in some cases, a regex engine can be configured to stop its +/// search before concluding whether a match exists or not. 
When this happens, +/// it may be important for the caller to know why the regex engine gave up and +/// where in the input it gave up at. This error type exposes the 'why' and the +/// 'where.' +/// +/// For example, the DFAs provided by this library generally cannot correctly +/// implement Unicode word boundaries. Instead, they provide an option to +/// eagerly support them on ASCII text (since Unicode word boundaries are +/// equivalent to ASCII word boundaries when searching ASCII text), but will +/// "give up" if a non-ASCII byte is seen. In such cases, one is usually +/// required to either report the failure to the caller (unergonomic) or +/// otherwise fall back to some other regex engine (ergonomic, but potentially +/// costly). +/// +/// More generally, some regex engines offer the ability for callers to specify +/// certain bytes that will trigger the regex engine to automatically quit if +/// they are seen. +/// +/// Still yet, there may be other reasons for a failed match. For example, +/// the hybrid DFA provided by this crate can be configured to give up if it +/// believes that it is not efficient. This in turn permits callers to choose a +/// different regex engine. +/// +/// (Note that DFAs are configured by default to never quit or give up in this +/// fashion. For example, by default, a DFA will fail to build if the regex +/// pattern contains a Unicode word boundary. One needs to opt into the "quit" +/// behavior via options, like +/// [`hybrid::dfa::Config::unicode_word_boundary`](crate::hybrid::dfa::Config::unicode_word_boundary).) +/// +/// There are a couple other ways a search +/// can fail. For example, when using the +/// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker) +/// with a haystack that is too long, or trying to run an unanchored search +/// with a [one-pass DFA](crate::dfa::onepass). +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct MatchError( + #[cfg(feature = "alloc")] alloc::boxed::Box<MatchErrorKind>, + #[cfg(not(feature = "alloc"))] MatchErrorKind, +); + +impl MatchError { + /// Create a new error value with the given kind. + /// + /// This is a more verbose version of the kind-specific constructors, + /// e.g., `MatchError::quit`. + pub fn new(kind: MatchErrorKind) -> MatchError { + #[cfg(feature = "alloc")] + { + MatchError(alloc::boxed::Box::new(kind)) + } + #[cfg(not(feature = "alloc"))] + { + MatchError(kind) + } + } + + /// Returns a reference to the underlying error kind. + pub fn kind(&self) -> &MatchErrorKind { + &self.0 + } + + /// Create a new "quit" error. The given `byte` corresponds to the value + /// that tripped a search's quit condition, and `offset` corresponds to the + /// location in the haystack at which the search quit. + /// + /// This is the same as calling `MatchError::new` with a + /// [`MatchErrorKind::Quit`] kind. + pub fn quit(byte: u8, offset: usize) -> MatchError { + MatchError::new(MatchErrorKind::Quit { byte, offset }) + } + + /// Create a new "gave up" error. The given `offset` corresponds to the + /// location in the haystack at which the search gave up. + /// + /// This is the same as calling `MatchError::new` with a + /// [`MatchErrorKind::GaveUp`] kind. + pub fn gave_up(offset: usize) -> MatchError { + MatchError::new(MatchErrorKind::GaveUp { offset }) + } + + /// Create a new "haystack too long" error. The given `len` corresponds to + /// the length of the haystack that was problematic. 
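+    ///
+    /// As a small sketch, constructing this error and inspecting its kind
+    /// via the accessors defined on this type:
+    ///
+    /// ```
+    /// use regex_automata::{MatchError, MatchErrorKind};
+    ///
+    /// let err = MatchError::haystack_too_long(1_000);
+    /// match err.kind() {
+    ///     MatchErrorKind::HaystackTooLong { len } => assert_eq!(1_000, *len),
+    ///     _ => unreachable!(),
+    /// }
+    /// ```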
+ /// + /// This is the same as calling `MatchError::new` with a + /// [`MatchErrorKind::HaystackTooLong`] kind. + pub fn haystack_too_long(len: usize) -> MatchError { + MatchError::new(MatchErrorKind::HaystackTooLong { len }) + } + + /// Create a new "unsupported anchored" error. This occurs when the caller + /// requests a search with an anchor mode that is not supported by the + /// regex engine. + /// + /// This is the same as calling `MatchError::new` with a + /// [`MatchErrorKind::UnsupportedAnchored`] kind. + pub fn unsupported_anchored(mode: Anchored) -> MatchError { + MatchError::new(MatchErrorKind::UnsupportedAnchored { mode }) + } +} + +/// The underlying kind of a [`MatchError`]. +/// +/// This is a **non-exhaustive** enum. That means new variants may be added in +/// a semver-compatible release. +#[non_exhaustive] +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum MatchErrorKind { + /// The search saw a "quit" byte at which it was instructed to stop + /// searching. + Quit { + /// The "quit" byte that was observed that caused the search to stop. + byte: u8, + /// The offset at which the quit byte was observed. + offset: usize, + }, + /// The search, based on heuristics, determined that it would be better + /// to stop, typically to provide the caller an opportunity to use an + /// alternative regex engine. + /// + /// Currently, the only way for this to occur is via the lazy DFA and + /// only when it is configured to do so (it will not return this error by + /// default). + GaveUp { + /// The offset at which the search stopped. This corresponds to the + /// position immediately following the last byte scanned. + offset: usize, + }, + /// This error occurs if the haystack given to the regex engine was too + /// long to be searched. This occurs, for example, with regex engines + /// like the bounded backtracker that have a configurable fixed amount of + /// capacity that is tied to the length of the haystack. Anything beyond + /// that configured limit will result in an error at search time. + HaystackTooLong { + /// The length of the haystack that exceeded the limit. + len: usize, + }, + /// An error indicating that a particular type of anchored search was + /// requested, but that the regex engine does not support it. + /// + /// Note that this error should not be returned by a regex engine simply + /// because the pattern ID is invalid (i.e., equal to or exceeds the number + /// of patterns in the regex). In that case, the regex engine should report + /// a non-match. + UnsupportedAnchored { + /// The anchored mode given that is unsupported. 
+ mode: Anchored, + }, +} + +#[cfg(feature = "std")] +impl std::error::Error for MatchError {} + +impl core::fmt::Display for MatchError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + match *self.kind() { + MatchErrorKind::Quit { byte, offset } => write!( + f, + "quit search after observing byte {:?} at offset {}", + DebugByte(byte), + offset, + ), + MatchErrorKind::GaveUp { offset } => { + write!(f, "gave up searching at offset {}", offset) + } + MatchErrorKind::HaystackTooLong { len } => { + write!(f, "haystack of length {} is too long", len) + } + MatchErrorKind::UnsupportedAnchored { mode: Anchored::Yes } => { + write!(f, "anchored searches are not supported or enabled") + } + MatchErrorKind::UnsupportedAnchored { mode: Anchored::No } => { + write!(f, "unanchored searches are not supported or enabled") + } + MatchErrorKind::UnsupportedAnchored { + mode: Anchored::Pattern(pid), + } => { + write!( + f, + "anchored searches for a specific pattern ({}) are \ + not supported or enabled", + pid.as_usize(), + ) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // We test that our 'MatchError' type is the size we expect. This isn't an + // API guarantee, but if the size increases, we really want to make sure we + // decide to do that intentionally. So this should be a speed bump. And in + // general, we should not increase the size without a very good reason. + // + // Why? Because low level search APIs return Result<.., MatchError>. When + // MatchError gets bigger, so to does the Result type. + // + // Now, when 'alloc' is enabled, we do box the error, which de-emphasizes + // the importance of keeping a small error type. But without 'alloc', we + // still want things to be small. + #[test] + fn match_error_size() { + let expected_size = if cfg!(feature = "alloc") { + core::mem::size_of::<usize>() + } else { + 2 * core::mem::size_of::<usize>() + }; + assert_eq!(expected_size, core::mem::size_of::<MatchError>()); + } + + // Same as above, but for the underlying match error kind. + #[cfg(target_pointer_width = "64")] + #[test] + fn match_error_kind_size() { + let expected_size = 2 * core::mem::size_of::<usize>(); + assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>()); + } + + #[cfg(target_pointer_width = "32")] + #[test] + fn match_error_kind_size() { + let expected_size = 3 * core::mem::size_of::<usize>(); + assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>()); + } +} diff --git a/vendor/regex-automata/src/util/sparse_set.rs b/vendor/regex-automata/src/util/sparse_set.rs new file mode 100644 index 0000000..cbaa0b6 --- /dev/null +++ b/vendor/regex-automata/src/util/sparse_set.rs @@ -0,0 +1,239 @@ +/*! +This module defines a sparse set data structure. Its most interesting +properties are: + +* They preserve insertion order. +* Set membership testing is done in constant time. +* Set insertion is done in constant time. +* Clearing the set is done in constant time. + +The cost for doing this is that the capacity of the set needs to be known up +front, and the elements in the set are limited to state identifiers. + +These sets are principally used when traversing an NFA state graph. This +happens at search time, for example, in the PikeVM. It also happens during DFA +determinization. +*/ + +use alloc::{vec, vec::Vec}; + +use crate::util::primitives::StateID; + +/// A pairse of sparse sets. +/// +/// This is useful when one needs to compute NFA epsilon closures from a +/// previous set of states derived from an epsilon closure. 
One set can be the +/// starting states where as the other set can be the destination states after +/// following the transitions for a particular byte of input. +/// +/// There is no significance to 'set1' or 'set2'. They are both sparse sets of +/// the same size. +/// +/// The members of this struct are exposed so that callers may borrow 'set1' +/// and 'set2' individually without being force to borrow both at the same +/// time. +#[derive(Clone, Debug)] +pub(crate) struct SparseSets { + pub(crate) set1: SparseSet, + pub(crate) set2: SparseSet, +} + +impl SparseSets { + /// Create a new pair of sparse sets where each set has the given capacity. + /// + /// This panics if the capacity given is bigger than `StateID::LIMIT`. + pub(crate) fn new(capacity: usize) -> SparseSets { + SparseSets { + set1: SparseSet::new(capacity), + set2: SparseSet::new(capacity), + } + } + + /// Resizes these sparse sets to have the new capacity given. + /// + /// The sets are automatically cleared. + /// + /// This panics if the capacity given is bigger than `StateID::LIMIT`. + #[inline] + pub(crate) fn resize(&mut self, new_capacity: usize) { + self.set1.resize(new_capacity); + self.set2.resize(new_capacity); + } + + /// Clear both sparse sets. + pub(crate) fn clear(&mut self) { + self.set1.clear(); + self.set2.clear(); + } + + /// Swap set1 with set2. + pub(crate) fn swap(&mut self) { + core::mem::swap(&mut self.set1, &mut self.set2); + } + + /// Returns the memory usage, in bytes, used by this pair of sparse sets. + pub(crate) fn memory_usage(&self) -> usize { + self.set1.memory_usage() + self.set2.memory_usage() + } +} + +/// A sparse set used for representing ordered NFA states. +/// +/// This supports constant time addition and membership testing. Clearing an +/// entire set can also be done in constant time. Iteration yields elements +/// in the order in which they were inserted. +/// +/// The data structure is based on: https://research.swtch.com/sparse +/// Note though that we don't actually use uninitialized memory. We generally +/// reuse sparse sets, so the initial allocation cost is bareable. However, its +/// other properties listed above are extremely useful. +#[derive(Clone)] +pub(crate) struct SparseSet { + /// The number of elements currently in this set. + len: usize, + /// Dense contains the ids in the order in which they were inserted. + dense: Vec<StateID>, + /// Sparse maps ids to their location in dense. + /// + /// A state ID is in the set if and only if + /// sparse[id] < len && id == dense[sparse[id]]. + /// + /// Note that these are indices into 'dense'. It's a little weird to use + /// StateID here, but we know our length can never exceed the bounds of + /// StateID (enforced by 'resize') and StateID will be at most 4 bytes + /// where as a usize is likely double that in most cases. + sparse: Vec<StateID>, +} + +impl SparseSet { + /// Create a new sparse set with the given capacity. + /// + /// Sparse sets have a fixed size and they cannot grow. Attempting to + /// insert more distinct elements than the total capacity of the set will + /// result in a panic. + /// + /// This panics if the capacity given is bigger than `StateID::LIMIT`. + #[inline] + pub(crate) fn new(capacity: usize) -> SparseSet { + let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; + set.resize(capacity); + set + } + + /// Resizes this sparse set to have the new capacity given. + /// + /// This set is automatically cleared. 
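+    ///
+    /// As an illustrative sketch (this type is crate-internal, so the block
+    /// below is not a doctest), resizing discards any previous members:
+    ///
+    /// ```ignore
+    /// let mut set = SparseSet::new(10);
+    /// set.insert(StateID::new(3).unwrap());
+    /// assert!(set.contains(StateID::new(3).unwrap()));
+    /// set.resize(20);
+    /// assert!(set.is_empty());
+    /// assert!(!set.contains(StateID::new(3).unwrap()));
+    /// ```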
+ /// + /// This panics if the capacity given is bigger than `StateID::LIMIT`. + #[inline] + pub(crate) fn resize(&mut self, new_capacity: usize) { + assert!( + new_capacity <= StateID::LIMIT, + "sparse set capacity cannot excced {:?}", + StateID::LIMIT + ); + self.clear(); + self.dense.resize(new_capacity, StateID::ZERO); + self.sparse.resize(new_capacity, StateID::ZERO); + } + + /// Returns the capacity of this set. + /// + /// The capacity represents a fixed limit on the number of distinct + /// elements that are allowed in this set. The capacity cannot be changed. + #[inline] + pub(crate) fn capacity(&self) -> usize { + self.dense.len() + } + + /// Returns the number of elements in this set. + #[inline] + pub(crate) fn len(&self) -> usize { + self.len + } + + /// Returns true if and only if this set is empty. + #[inline] + pub(crate) fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Insert the state ID value into this set and return true if the given + /// state ID was not previously in this set. + /// + /// This operation is idempotent. If the given value is already in this + /// set, then this is a no-op. + /// + /// If more than `capacity` ids are inserted, then this panics. + /// + /// This is marked as inline(always) since the compiler won't inline it + /// otherwise, and it's a fairly hot piece of code in DFA determinization. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn insert(&mut self, id: StateID) -> bool { + if self.contains(id) { + return false; + } + + let i = self.len(); + assert!( + i < self.capacity(), + "{:?} exceeds capacity of {:?} when inserting {:?}", + i, + self.capacity(), + id, + ); + // OK since i < self.capacity() and self.capacity() is guaranteed to + // be <= StateID::LIMIT. + let index = StateID::new_unchecked(i); + self.dense[index] = id; + self.sparse[id] = index; + self.len += 1; + true + } + + /// Returns true if and only if this set contains the given value. + #[inline] + pub(crate) fn contains(&self, id: StateID) -> bool { + let index = self.sparse[id]; + index.as_usize() < self.len() && self.dense[index] == id + } + + /// Clear this set such that it has no members. + #[inline] + pub(crate) fn clear(&mut self) { + self.len = 0; + } + + #[inline] + pub(crate) fn iter(&self) -> SparseSetIter<'_> { + SparseSetIter(self.dense[..self.len()].iter()) + } + + /// Returns the heap memory usage, in bytes, used by this sparse set. + #[inline] + pub(crate) fn memory_usage(&self) -> usize { + self.dense.len() * StateID::SIZE + self.sparse.len() * StateID::SIZE + } +} + +impl core::fmt::Debug for SparseSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let elements: Vec<StateID> = self.iter().collect(); + f.debug_tuple("SparseSet").field(&elements).finish() + } +} + +/// An iterator over all elements in a sparse set. +/// +/// The lifetime `'a` refers to the lifetime of the set being iterated over. +#[derive(Debug)] +pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>); + +impl<'a> Iterator for SparseSetIter<'a> { + type Item = StateID; + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn next(&mut self) -> Option<StateID> { + self.0.next().map(|&id| id) + } +} diff --git a/vendor/regex-automata/src/util/start.rs b/vendor/regex-automata/src/util/start.rs new file mode 100644 index 0000000..2715378 --- /dev/null +++ b/vendor/regex-automata/src/util/start.rs @@ -0,0 +1,479 @@ +/*! +Provides helpers for dealing with start state configurations in DFAs. 
+*/ + +use crate::util::{ + look::LookMatcher, + search::{Anchored, Input}, + wire::{self, DeserializeError, SerializeError}, +}; + +/// The configuration used to determine a DFA's start state for a search. +/// +/// A DFA has a single starting state in the typical textbook description. That +/// is, it corresponds to the set of all starting states for the NFA that built +/// it, along with their espsilon closures. In this crate, however, DFAs have +/// many possible start states due to a few factors: +/// +/// * DFAs support the ability to run either anchored or unanchored searches. +/// Each type of search needs its own start state. For example, an unanchored +/// search requires starting at a state corresponding to a regex with a +/// `(?s-u:.)*?` prefix, which will match through anything. +/// * DFAs also optionally support starting an anchored search for any one +/// specific pattern. Each such pattern requires its own start state. +/// * If a look-behind assertion like `^` or `\b` is used in the regex, then +/// the DFA will need to inspect a single byte immediately before the start of +/// the search to choose the correct start state. +/// +/// Indeed, this configuration precisely encapsulates all of the above factors. +/// The [`Config::anchored`] method sets which kind of anchored search to +/// perform while the [`Config::look_behind`] method provides a way to set +/// the byte that occurs immediately before the start of the search. +/// +/// Generally speaking, this type is only useful when you want to run searches +/// without using an [`Input`]. In particular, an `Input` wants a haystack +/// slice, but callers may not have a contiguous sequence of bytes as a +/// haystack in all cases. This type provides a lower level of control such +/// that callers can provide their own anchored configuration and look-behind +/// byte explicitly. +/// +/// # Example +/// +/// This shows basic usage that permits running a search with a DFA without +/// using the `Input` abstraction. +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// let config = start::Config::new().anchored(Anchored::Yes); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter() { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// assert!(dfa.is_match_state(state)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// This example shows how to correctly run a search that doesn't begin at +/// the start of a haystack. Notice how we set the look-behind byte, and as +/// a result, the `\b` assertion does not match. +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// let config = start::Config::new() +/// .anchored(Anchored::Yes) +/// .look_behind(Some(b'q')); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter().skip(1) { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// // No match! 
+/// assert!(!dfa.is_match_state(state)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// If we had instead not set a look-behind byte, then the DFA would assume +/// that it was starting at the beginning of the haystack, and thus `\b` should +/// match. This in turn would result in erroneously reporting a match: +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// // Whoops, forgot the look-behind byte... +/// let config = start::Config::new().anchored(Anchored::Yes); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter().skip(1) { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// // And now we get a match unexpectedly. +/// assert!(dfa.is_match_state(state)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Config { + look_behind: Option<u8>, + anchored: Anchored, +} + +impl Config { + /// Create a new default start configuration. + /// + /// The default is an unanchored search that starts at the beginning of the + /// haystack. + pub fn new() -> Config { + Config { anchored: Anchored::No, look_behind: None } + } + + /// A convenience routine for building a start configuration from an + /// [`Input`] for a forward search. + /// + /// This automatically sets the look-behind byte to the byte immediately + /// preceding the start of the search. If the start of the search is at + /// offset `0`, then no look-behind byte is set. + pub fn from_input_forward(input: &Input<'_>) -> Config { + let look_behind = input + .start() + .checked_sub(1) + .and_then(|i| input.haystack().get(i).copied()); + Config { look_behind, anchored: input.get_anchored() } + } + + /// A convenience routine for building a start configuration from an + /// [`Input`] for a reverse search. + /// + /// This automatically sets the look-behind byte to the byte immediately + /// following the end of the search. If the end of the search is at + /// offset `haystack.len()`, then no look-behind byte is set. + pub fn from_input_reverse(input: &Input<'_>) -> Config { + let look_behind = input.haystack().get(input.end()).copied(); + Config { look_behind, anchored: input.get_anchored() } + } + + /// Set the look-behind byte at the start of a search. + /// + /// Unless the search is intended to logically start at the beginning of a + /// haystack, this should _always_ be set to the byte immediately preceding + /// the start of the search. If no look-behind byte is set, then the start + /// configuration will assume it is at the beginning of the haystack. For + /// example, the anchor `^` will match. + /// + /// The default is that no look-behind byte is set. + pub fn look_behind(mut self, byte: Option<u8>) -> Config { + self.look_behind = byte; + self + } + + /// Set the anchored mode of a search. + /// + /// The default is an unanchored search. + pub fn anchored(mut self, mode: Anchored) -> Config { + self.anchored = mode; + self + } + + /// Return the look-behind byte in this configuration, if one exists. + pub fn get_look_behind(&self) -> Option<u8> { + self.look_behind + } + + /// Return the anchored mode in this configuration. + pub fn get_anchored(&self) -> Anchored { + self.anchored + } +} + +/// A map from every possible byte value to its corresponding starting +/// configuration. 
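+///
+/// For illustration (this type is crate-internal, so the block below is not a
+/// doctest), a few of the mappings produced for the default `LookMatcher`:
+///
+/// ```ignore
+/// let smap = StartByteMap::new(&LookMatcher::default());
+/// assert_eq!(Start::LineLF, smap.get(b'\n'));
+/// assert_eq!(Start::LineCR, smap.get(b'\r'));
+/// assert_eq!(Start::WordByte, smap.get(b'a'));
+/// assert_eq!(Start::NonWordByte, smap.get(b' '));
+/// ```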
+/// +/// This map is used in order to lookup the start configuration for a particular +/// position in a haystack. This start configuration is then used in +/// combination with things like the anchored mode and pattern ID to fully +/// determine the start state. +/// +/// Generally speaking, this map is only used for fully compiled DFAs and lazy +/// DFAs. For NFAs (including the one-pass DFA), the start state is generally +/// selected by virtue of traversing the NFA state graph. DFAs do the same +/// thing, but at build time and not search time. (Well, technically the lazy +/// DFA does it at search time, but it does enough work to cache the full +/// result of the epsilon closure that the NFA engines tend to need to do.) +#[derive(Clone)] +pub(crate) struct StartByteMap { + map: [Start; 256], +} + +impl StartByteMap { + /// Create a new map from byte values to their corresponding starting + /// configurations. The map is determined, in part, by how look-around + /// assertions are matched via the matcher given. + pub(crate) fn new(lookm: &LookMatcher) -> StartByteMap { + let mut map = [Start::NonWordByte; 256]; + map[usize::from(b'\n')] = Start::LineLF; + map[usize::from(b'\r')] = Start::LineCR; + map[usize::from(b'_')] = Start::WordByte; + + let mut byte = b'0'; + while byte <= b'9' { + map[usize::from(byte)] = Start::WordByte; + byte += 1; + } + byte = b'A'; + while byte <= b'Z' { + map[usize::from(byte)] = Start::WordByte; + byte += 1; + } + byte = b'a'; + while byte <= b'z' { + map[usize::from(byte)] = Start::WordByte; + byte += 1; + } + + let lineterm = lookm.get_line_terminator(); + // If our line terminator is normal, then it is already handled by + // the LineLF and LineCR configurations. But if it's weird, then we + // overwrite whatever was there before for that terminator with a + // special configuration. The trick here is that if the terminator + // is, say, a word byte like `a`, then callers seeing this start + // configuration need to account for that and build their DFA state as + // if it *also* came from a word byte. + if lineterm != b'\r' && lineterm != b'\n' { + map[usize::from(lineterm)] = Start::CustomLineTerminator; + } + StartByteMap { map } + } + + /// Return the starting configuration for the given look-behind byte. + /// + /// If no look-behind exists, callers should use `Start::Text`. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn get(&self, byte: u8) -> Start { + self.map[usize::from(byte)] + } + + /// Deserializes a byte class map from the given slice. If the slice is of + /// insufficient length or otherwise contains an impossible mapping, then + /// an error is returned. Upon success, the number of bytes read along with + /// the map are returned. The number of bytes read is always a multiple of + /// 8. + pub(crate) fn from_bytes( + slice: &[u8], + ) -> Result<(StartByteMap, usize), DeserializeError> { + wire::check_slice_len(slice, 256, "start byte map")?; + let mut map = [Start::NonWordByte; 256]; + for (i, &repr) in slice[..256].iter().enumerate() { + map[i] = match Start::from_usize(usize::from(repr)) { + Some(start) => start, + None => { + return Err(DeserializeError::generic( + "found invalid starting configuration", + )) + } + }; + } + Ok((StartByteMap { map }, 256)) + } + + /// Writes this map to the given byte buffer. if the given buffer is too + /// small, then an error is returned. Upon success, the total number of + /// bytes written is returned. The number of bytes written is guaranteed to + /// be a multiple of 8. 
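+    ///
+    /// A sketch of a round trip through `write_to` and `from_bytes` (again,
+    /// crate-internal, so not a doctest):
+    ///
+    /// ```ignore
+    /// let map = StartByteMap::new(&LookMatcher::default());
+    /// let mut buf = [0u8; 256];
+    /// assert_eq!(256, map.write_to(&mut buf).unwrap());
+    /// let (map2, nread) = StartByteMap::from_bytes(&buf).unwrap();
+    /// assert_eq!(256, nread);
+    /// assert_eq!(Start::LineLF, map2.get(b'\n'));
+    /// ```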
+ pub(crate) fn write_to( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("start byte map")); + } + for (i, &start) in self.map.iter().enumerate() { + dst[i] = start.as_u8(); + } + Ok(nwrite) + } + + /// Returns the total number of bytes written by `write_to`. + pub(crate) fn write_to_len(&self) -> usize { + 256 + } +} + +impl core::fmt::Debug for StartByteMap { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use crate::util::escape::DebugByte; + + write!(f, "StartByteMap{{")?; + for byte in 0..=255 { + if byte > 0 { + write!(f, ", ")?; + } + let start = self.map[usize::from(byte)]; + write!(f, "{:?} => {:?}", DebugByte(byte), start)?; + } + write!(f, "}}")?; + Ok(()) + } +} + +/// Represents the six possible starting configurations of a DFA search. +/// +/// The starting configuration is determined by inspecting the the beginning +/// of the haystack (up to 1 byte). Ultimately, this along with a pattern ID +/// (if specified) and the type of search (anchored or not) is what selects the +/// start state to use in a DFA. +/// +/// As one example, if a DFA only supports unanchored searches and does not +/// support anchored searches for each pattern, then it will have at most 6 +/// distinct start states. (Some start states may be reused if determinization +/// can determine that they will be equivalent.) If the DFA supports both +/// anchored and unanchored searches, then it will have a maximum of 12 +/// distinct start states. Finally, if the DFA also supports anchored searches +/// for each pattern, then it can have up to `12 + (N * 6)` start states, where +/// `N` is the number of patterns. +/// +/// Handling each of these starting configurations in the context of DFA +/// determinization can be *quite* tricky and subtle. But the code is small +/// and can be found at `crate::util::determinize::set_lookbehind_from_start`. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) enum Start { + /// This occurs when the starting position is not any of the ones below. + NonWordByte = 0, + /// This occurs when the byte immediately preceding the start of the search + /// is an ASCII word byte. + WordByte = 1, + /// This occurs when the starting position of the search corresponds to the + /// beginning of the haystack. + Text = 2, + /// This occurs when the byte immediately preceding the start of the search + /// is a line terminator. Specifically, `\n`. + LineLF = 3, + /// This occurs when the byte immediately preceding the start of the search + /// is a line terminator. Specifically, `\r`. + LineCR = 4, + /// This occurs when a custom line terminator has been set via a + /// `LookMatcher`, and when that line terminator is neither a `\r` or a + /// `\n`. + /// + /// If the custom line terminator is a word byte, then this start + /// configuration is still selected. DFAs that implement word boundary + /// assertions will likely need to check whether the custom line terminator + /// is a word byte, in which case, it should behave as if the byte + /// satisfies `\b` in addition to multi-line anchors. + CustomLineTerminator = 5, +} + +impl Start { + /// Return the starting state corresponding to the given integer. If no + /// starting state exists for the given integer, then None is returned. 
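+    ///
+    /// For example (illustrative, crate-internal):
+    ///
+    /// ```ignore
+    /// assert_eq!(Some(Start::LineLF), Start::from_usize(3));
+    /// assert_eq!(None, Start::from_usize(6));
+    /// ```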
+ pub(crate) fn from_usize(n: usize) -> Option<Start> { + match n { + 0 => Some(Start::NonWordByte), + 1 => Some(Start::WordByte), + 2 => Some(Start::Text), + 3 => Some(Start::LineLF), + 4 => Some(Start::LineCR), + 5 => Some(Start::CustomLineTerminator), + _ => None, + } + } + + /// Returns the total number of starting state configurations. + pub(crate) fn len() -> usize { + 6 + } + + /// Return this starting configuration as `u8` integer. It is guaranteed to + /// be less than `Start::len()`. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn as_u8(&self) -> u8 { + // AFAIK, 'as' is the only way to zero-cost convert an int enum to an + // actual int. + *self as u8 + } + + /// Return this starting configuration as a `usize` integer. It is + /// guaranteed to be less than `Start::len()`. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn as_usize(&self) -> usize { + usize::from(self.as_u8()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn start_fwd_done_range() { + let smap = StartByteMap::new(&LookMatcher::default()); + let input = Input::new("").range(1..0); + let config = Config::from_input_forward(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + assert_eq!(Start::Text, start); + } + + #[test] + fn start_rev_done_range() { + let smap = StartByteMap::new(&LookMatcher::default()); + let input = Input::new("").range(1..0); + let config = Config::from_input_reverse(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + assert_eq!(Start::Text, start); + } + + #[test] + fn start_fwd() { + let f = |haystack, start, end| { + let smap = StartByteMap::new(&LookMatcher::default()); + let input = Input::new(haystack).range(start..end); + let config = Config::from_input_forward(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + start + }; + + assert_eq!(Start::Text, f("", 0, 0)); + assert_eq!(Start::Text, f("abc", 0, 3)); + assert_eq!(Start::Text, f("\nabc", 0, 3)); + + assert_eq!(Start::LineLF, f("\nabc", 1, 3)); + + assert_eq!(Start::LineCR, f("\rabc", 1, 3)); + + assert_eq!(Start::WordByte, f("abc", 1, 3)); + + assert_eq!(Start::NonWordByte, f(" abc", 1, 3)); + } + + #[test] + fn start_rev() { + let f = |haystack, start, end| { + let smap = StartByteMap::new(&LookMatcher::default()); + let input = Input::new(haystack).range(start..end); + let config = Config::from_input_reverse(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + start + }; + + assert_eq!(Start::Text, f("", 0, 0)); + assert_eq!(Start::Text, f("abc", 0, 3)); + assert_eq!(Start::Text, f("abc\n", 0, 4)); + + assert_eq!(Start::LineLF, f("abc\nz", 0, 3)); + + assert_eq!(Start::LineCR, f("abc\rz", 0, 3)); + + assert_eq!(Start::WordByte, f("abc", 0, 2)); + + assert_eq!(Start::NonWordByte, f("abc ", 0, 3)); + } +} diff --git a/vendor/regex-automata/src/util/syntax.rs b/vendor/regex-automata/src/util/syntax.rs new file mode 100644 index 0000000..78e3cf9 --- /dev/null +++ b/vendor/regex-automata/src/util/syntax.rs @@ -0,0 +1,482 @@ +/*! +Utilities for dealing with the syntax of a regular expression. + +This module currently only exposes a [`Config`] type that +itself represents a wrapper around the configuration for a +[`regex-syntax::ParserBuilder`](regex_syntax::ParserBuilder). The purpose of +this wrapper is to make configuring syntax options very similar to how other +configuration is done throughout this crate. 
Namely, instead of duplicating +syntax options across every builder (of which there are many), we instead +create small config objects like this one that can be passed around and +composed. +*/ + +use alloc::{vec, vec::Vec}; + +use regex_syntax::{ + ast, + hir::{self, Hir}, + Error, ParserBuilder, +}; + +/// A convenience routine for parsing a pattern into an HIR value with the +/// default configuration. +/// +/// # Example +/// +/// This shows how to parse a pattern into an HIR value: +/// +/// ``` +/// use regex_automata::util::syntax; +/// +/// let hir = syntax::parse(r"([a-z]+)|([0-9]+)")?; +/// assert_eq!(Some(1), hir.properties().static_explicit_captures_len()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +pub fn parse(pattern: &str) -> Result<Hir, Error> { + parse_with(pattern, &Config::default()) +} + +/// A convenience routine for parsing many patterns into HIR value with the +/// default configuration. +/// +/// # Example +/// +/// This shows how to parse many patterns into an corresponding HIR values: +/// +/// ``` +/// use { +/// regex_automata::util::syntax, +/// regex_syntax::hir::Properties, +/// }; +/// +/// let hirs = syntax::parse_many(&[ +/// r"([a-z]+)|([0-9]+)", +/// r"foo(A-Z]+)bar", +/// ])?; +/// let props = Properties::union(hirs.iter().map(|h| h.properties())); +/// assert_eq!(Some(1), props.static_explicit_captures_len()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +pub fn parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error> { + parse_many_with(patterns, &Config::default()) +} + +/// A convenience routine for parsing a pattern into an HIR value using a +/// `Config`. +/// +/// # Example +/// +/// This shows how to parse a pattern into an HIR value with a non-default +/// configuration: +/// +/// ``` +/// use regex_automata::util::syntax; +/// +/// let hir = syntax::parse_with( +/// r"^[a-z]+$", +/// &syntax::Config::new().multi_line(true).crlf(true), +/// )?; +/// assert!(hir.properties().look_set().contains_anchor_crlf()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +pub fn parse_with(pattern: &str, config: &Config) -> Result<Hir, Error> { + let mut builder = ParserBuilder::new(); + config.apply(&mut builder); + builder.build().parse(pattern) +} + +/// A convenience routine for parsing many patterns into HIR values using a +/// `Config`. +/// +/// # Example +/// +/// This shows how to parse many patterns into an corresponding HIR values +/// with a non-default configuration: +/// +/// ``` +/// use { +/// regex_automata::util::syntax, +/// regex_syntax::hir::Properties, +/// }; +/// +/// let patterns = &[ +/// r"([a-z]+)|([0-9]+)", +/// r"\W", +/// r"foo(A-Z]+)bar", +/// ]; +/// let config = syntax::Config::new().unicode(false).utf8(false); +/// let hirs = syntax::parse_many_with(patterns, &config)?; +/// let props = Properties::union(hirs.iter().map(|h| h.properties())); +/// assert!(!props.is_utf8()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +pub fn parse_many_with<P: AsRef<str>>( + patterns: &[P], + config: &Config, +) -> Result<Vec<Hir>, Error> { + let mut builder = ParserBuilder::new(); + config.apply(&mut builder); + let mut hirs = vec![]; + for p in patterns.iter() { + hirs.push(builder.build().parse(p.as_ref())?); + } + Ok(hirs) +} + +/// A common set of configuration options that apply to the syntax of a regex. +/// +/// This represents a group of configuration options that specifically apply +/// to how the concrete syntax of a regular expression is interpreted. 
In +/// particular, they are generally forwarded to the +/// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html) +/// in the +/// [`regex-syntax`](https://docs.rs/regex-syntax) +/// crate when building a regex from its concrete syntax directly. +/// +/// These options are defined as a group since they apply to every regex engine +/// in this crate. Instead of re-defining them on every engine's builder, they +/// are instead provided here as one cohesive unit. +#[derive(Clone, Copy, Debug)] +pub struct Config { + case_insensitive: bool, + multi_line: bool, + dot_matches_new_line: bool, + crlf: bool, + line_terminator: u8, + swap_greed: bool, + ignore_whitespace: bool, + unicode: bool, + utf8: bool, + nest_limit: u32, + octal: bool, +} + +impl Config { + /// Return a new default syntax configuration. + pub fn new() -> Config { + // These defaults match the ones used in regex-syntax. + Config { + case_insensitive: false, + multi_line: false, + dot_matches_new_line: false, + crlf: false, + line_terminator: b'\n', + swap_greed: false, + ignore_whitespace: false, + unicode: true, + utf8: true, + nest_limit: 250, + octal: false, + } + } + + /// Enable or disable the case insensitive flag by default. + /// + /// When Unicode mode is enabled, case insensitivity is Unicode-aware. + /// Specifically, it will apply the "simple" case folding rules as + /// specified by Unicode. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `i` flag. + pub fn case_insensitive(mut self, yes: bool) -> Config { + self.case_insensitive = yes; + self + } + + /// Enable or disable the multi-line matching flag by default. + /// + /// When this is enabled, the `^` and `$` look-around assertions will + /// match immediately after and immediately before a new line character, + /// respectively. Note that the `\A` and `\z` look-around assertions are + /// unaffected by this setting and always correspond to matching at the + /// beginning and end of the input. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `m` flag. + pub fn multi_line(mut self, yes: bool) -> Config { + self.multi_line = yes; + self + } + + /// Enable or disable the "dot matches any character" flag by default. + /// + /// When this is enabled, `.` will match any character. When it's disabled, + /// then `.` will match any character except for a new line character. + /// + /// Note that `.` is impacted by whether the "unicode" setting is enabled + /// or not. When Unicode is enabled (the default), `.` will match any UTF-8 + /// encoding of any Unicode scalar value (sans a new line, depending on + /// whether this "dot matches new line" option is enabled). When Unicode + /// mode is disabled, `.` will match any byte instead. Because of this, + /// when Unicode mode is disabled, `.` can only be used when the "allow + /// invalid UTF-8" option is enabled, since `.` could otherwise match + /// invalid UTF-8. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `s` flag. + pub fn dot_matches_new_line(mut self, yes: bool) -> Config { + self.dot_matches_new_line = yes; + self + } + + /// Enable or disable the "CRLF mode" flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `R` flag. 
+ /// + /// When CRLF mode is enabled, the following happens: + /// + /// * Unless `dot_matches_new_line` is enabled, `.` will match any character + /// except for `\r` and `\n`. + /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`, + /// `\r` and `\n` as line terminators. And in particular, neither will + /// match between a `\r` and a `\n`. + pub fn crlf(mut self, yes: bool) -> Config { + self.crlf = yes; + self + } + + /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. + /// + /// Namely, instead of `.` (by default) matching everything except for `\n`, + /// this will cause `.` to match everything except for the byte given. + /// + /// If `.` is used in a context where Unicode mode is enabled and this byte + /// isn't ASCII, then an error will be returned. When Unicode mode is + /// disabled, then any byte is permitted, but will return an error if UTF-8 + /// mode is enabled and it is a non-ASCII byte. + /// + /// In short, any ASCII value for a line terminator is always okay. But a + /// non-ASCII byte might result in an error depending on whether Unicode + /// mode or UTF-8 mode are enabled. + /// + /// Note that if `R` mode is enabled then it always takes precedence and + /// the line terminator will be treated as `\r` and `\n` simultaneously. + /// + /// Note also that this *doesn't* impact the look-around assertions + /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional + /// configuration in the regex engine itself. + pub fn line_terminator(mut self, byte: u8) -> Config { + self.line_terminator = byte; + self + } + + /// Enable or disable the "swap greed" flag by default. + /// + /// When this is enabled, `.*` (for example) will become ungreedy and `.*?` + /// will become greedy. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `U` flag. + pub fn swap_greed(mut self, yes: bool) -> Config { + self.swap_greed = yes; + self + } + + /// Enable verbose mode in the regular expression. + /// + /// When enabled, verbose mode permits insigificant whitespace in many + /// places in the regular expression, as well as comments. Comments are + /// started using `#` and continue until the end of the line. + /// + /// By default, this is disabled. It may be selectively enabled in the + /// regular expression by using the `x` flag regardless of this setting. + pub fn ignore_whitespace(mut self, yes: bool) -> Config { + self.ignore_whitespace = yes; + self + } + + /// Enable or disable the Unicode flag (`u`) by default. + /// + /// By default this is **enabled**. It may alternatively be selectively + /// disabled in the regular expression itself via the `u` flag. + /// + /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by + /// default), a regular expression will fail to parse if Unicode mode is + /// disabled and a sub-expression could possibly match invalid UTF-8. + /// + /// **WARNING**: Unicode mode can greatly increase the size of the compiled + /// DFA, which can noticeably impact both memory usage and compilation + /// time. This is especially noticeable if your regex contains character + /// classes like `\w` that are impacted by whether Unicode is enabled or + /// not. If Unicode is not necessary, you are encouraged to disable it. + pub fn unicode(mut self, yes: bool) -> Config { + self.unicode = yes; + self + } + + /// When disabled, the builder will permit the construction of a regular + /// expression that may match invalid UTF-8. 
+ /// + /// For example, when [`Config::unicode`] is disabled, then + /// expressions like `[^a]` may match invalid UTF-8 since they can match + /// any single byte that is not `a`. By default, these sub-expressions + /// are disallowed to avoid returning offsets that split a UTF-8 + /// encoded codepoint. However, in cases where matching at arbitrary + /// locations is desired, this option can be disabled to permit all such + /// sub-expressions. + /// + /// When enabled (the default), the builder is guaranteed to produce a + /// regex that will only ever match valid UTF-8 (otherwise, the builder + /// will return an error). + pub fn utf8(mut self, yes: bool) -> Config { + self.utf8 = yes; + self + } + + /// Set the nesting limit used for the regular expression parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow when building a finite automaton from a regular expression's + /// abstract syntax tree. In particular, construction currently uses + /// recursion. In the future, the implementation may stop using recursion + /// and this option will no longer be necessary. + /// + /// This limit is not checked until the entire AST is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since the parser will + /// limit itself to heap space proportional to the length of the pattern + /// string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation AST item, which results + /// in a nest depth of `1`. In general, a nest limit is not something that + /// manifests in an obvious way in the concrete syntax, therefore, it + /// should not be used in a granular way. + pub fn nest_limit(mut self, limit: u32) -> Config { + self.nest_limit = limit; + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\1` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(mut self, yes: bool) -> Config { + self.octal = yes; + self + } + + /// Returns whether "unicode" mode is enabled. + pub fn get_unicode(&self) -> bool { + self.unicode + } + + /// Returns whether "case insensitive" mode is enabled. + pub fn get_case_insensitive(&self) -> bool { + self.case_insensitive + } + + /// Returns whether "multi line" mode is enabled. 
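// A brief usage sketch from a caller's point of view, assuming the crate's
// default features: `Config` is a plain `Copy` value, so one syntax
// configuration can be built once, inspected through the getters below, and
// reused across several parse calls. Only items defined in this module are
// used.

use regex_automata::util::syntax;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let config = syntax::Config::new()
        .case_insensitive(true)
        .multi_line(true)
        .unicode(false)
        .utf8(false);
    // The getters simply report back what was configured.
    assert!(config.get_case_insensitive());
    assert!(config.get_multi_line());
    assert!(!config.get_unicode());
    // The same configuration applies to every pattern parsed with it.
    let hirs = syntax::parse_many_with(&[r"^foo$", r"^bar$"], &config)?;
    assert_eq!(2, hirs.len());
    Ok(())
}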
+ pub fn get_multi_line(&self) -> bool { + self.multi_line + } + + /// Returns whether "dot matches new line" mode is enabled. + pub fn get_dot_matches_new_line(&self) -> bool { + self.dot_matches_new_line + } + + /// Returns whether "CRLF" mode is enabled. + pub fn get_crlf(&self) -> bool { + self.crlf + } + + /// Returns the line terminator in this syntax configuration. + pub fn get_line_terminator(&self) -> u8 { + self.line_terminator + } + + /// Returns whether "swap greed" mode is enabled. + pub fn get_swap_greed(&self) -> bool { + self.swap_greed + } + + /// Returns whether "ignore whitespace" mode is enabled. + pub fn get_ignore_whitespace(&self) -> bool { + self.ignore_whitespace + } + + /// Returns whether UTF-8 mode is enabled. + pub fn get_utf8(&self) -> bool { + self.utf8 + } + + /// Returns the "nest limit" setting. + pub fn get_nest_limit(&self) -> u32 { + self.nest_limit + } + + /// Returns whether "octal" mode is enabled. + pub fn get_octal(&self) -> bool { + self.octal + } + + /// Applies this configuration to the given parser. + pub(crate) fn apply(&self, builder: &mut ParserBuilder) { + builder + .unicode(self.unicode) + .case_insensitive(self.case_insensitive) + .multi_line(self.multi_line) + .dot_matches_new_line(self.dot_matches_new_line) + .crlf(self.crlf) + .line_terminator(self.line_terminator) + .swap_greed(self.swap_greed) + .ignore_whitespace(self.ignore_whitespace) + .utf8(self.utf8) + .nest_limit(self.nest_limit) + .octal(self.octal); + } + + /// Applies this configuration to the given AST parser. + pub(crate) fn apply_ast(&self, builder: &mut ast::parse::ParserBuilder) { + builder + .ignore_whitespace(self.ignore_whitespace) + .nest_limit(self.nest_limit) + .octal(self.octal); + } + + /// Applies this configuration to the given AST-to-HIR translator. + pub(crate) fn apply_hir( + &self, + builder: &mut hir::translate::TranslatorBuilder, + ) { + builder + .unicode(self.unicode) + .case_insensitive(self.case_insensitive) + .multi_line(self.multi_line) + .crlf(self.crlf) + .dot_matches_new_line(self.dot_matches_new_line) + .line_terminator(self.line_terminator) + .swap_greed(self.swap_greed) + .utf8(self.utf8); + } +} + +impl Default for Config { + fn default() -> Config { + Config::new() + } +} diff --git a/vendor/regex-automata/src/util/unicode_data/mod.rs b/vendor/regex-automata/src/util/unicode_data/mod.rs new file mode 100644 index 0000000..fc7b1c7 --- /dev/null +++ b/vendor/regex-automata/src/util/unicode_data/mod.rs @@ -0,0 +1,17 @@ +// This cfg should match the one in src/util/look.rs that uses perl_word. +#[cfg(all( + // We have to explicitly want to support Unicode word boundaries. + feature = "unicode-word-boundary", + not(all( + // If we don't have regex-syntax at all, then we definitely need to + // bring our own \w data table. + feature = "syntax", + // If unicode-perl is enabled, then regex-syntax/unicode-perl is + // also enabled, which in turn means we can use regex-syntax's + // is_word_character routine (and thus use its data tables). But if + // unicode-perl is not enabled, even if syntax is, then we need to + // bring our own. + feature = "unicode-perl", + )), +))] +pub(crate) mod perl_word; diff --git a/vendor/regex-automata/src/util/unicode_data/perl_word.rs b/vendor/regex-automata/src/util/unicode_data/perl_word.rs new file mode 100644 index 0000000..74d6265 --- /dev/null +++ b/vendor/regex-automata/src/util/unicode_data/perl_word.rs @@ -0,0 +1,781 @@ +// DO NOT EDIT THIS FILE. 
IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate perl-word tmp/ucd-15.0.0/ --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.15 is available on crates.io. + +pub const PERL_WORD: &'static [(char, char)] = &[ + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('\u{300}', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('\u{483}', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), + ('ՠ', 'ֈ'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('\u{610}', '\u{61a}'), + ('ؠ', '٩'), + ('ٮ', 'ۓ'), + ('ە', '\u{6dc}'), + ('\u{6df}', '\u{6e8}'), + ('\u{6ea}', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', '\u{74a}'), + ('ݍ', 'ޱ'), + ('߀', 'ߵ'), + ('ߺ', 'ߺ'), + ('\u{7fd}', '\u{7fd}'), + ('ࠀ', '\u{82d}'), + ('ࡀ', '\u{85b}'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('\u{898}', '\u{8e1}'), + ('\u{8e3}', '\u{963}'), + ('०', '९'), + ('ॱ', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('\u{9bc}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', 'ৎ'), + ('\u{9d7}', '\u{9d7}'), + ('ড়', 'ঢ়'), + ('য়', '\u{9e3}'), + ('০', 'ৱ'), + ('ৼ', 'ৼ'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ਃ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('੦', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('ૐ', 'ૐ'), + ('ૠ', '\u{ae3}'), + ('૦', '૯'), + ('ૹ', '\u{aff}'), + ('\u{b01}', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('\u{b3c}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', '\u{b63}'), + ('୦', '୯'), + ('ୱ', 'ୱ'), + ('\u{b82}', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('ௐ', 'ௐ'), + ('\u{bd7}', '\u{bd7}'), + ('௦', '௯'), + ('\u{c00}', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('\u{c3c}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', '\u{c63}'), + ('౦', '౯'), + ('ಀ', 'ಃ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('\u{cbc}', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('ೝ', 'ೞ'), + ('ೠ', '\u{ce3}'), + ('೦', '೯'), + ('ೱ', 'ೳ'), + ('\u{d00}', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', 'ൎ'), + ('ൔ', '\u{d57}'), + ('ൟ', '\u{d63}'), + ('൦', '൯'), + ('ൺ', 'ൿ'), + ('\u{d81}', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('෦', '෯'), + ('ෲ', 'ෳ'), + ('ก', '\u{e3a}'), + ('เ', '\u{e4e}'), + ('๐', '๙'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('\u{ec8}', '\u{ece}'), + ('໐', '໙'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('\u{f18}', '\u{f19}'), + 
('༠', '༩'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('༾', 'ཇ'), + ('ཉ', 'ཬ'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('က', '၉'), + ('ၐ', '\u{109d}'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('\u{135d}', '\u{135f}'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', '᜕'), + ('ᜟ', '᜴'), + ('ᝀ', '\u{1753}'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('\u{1772}', '\u{1773}'), + ('ក', '\u{17d3}'), + ('ៗ', 'ៗ'), + ('ៜ', '\u{17dd}'), + ('០', '៩'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '᠙'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('᥆', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('᧐', '᧙'), + ('ᨀ', '\u{1a1b}'), + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '᪉'), + ('᪐', '᪙'), + ('ᪧ', 'ᪧ'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1b00}', 'ᭌ'), + ('᭐', '᭙'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '᯳'), + ('ᰀ', '\u{1c37}'), + ('᱀', '᱉'), + ('ᱍ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', 'ᳺ'), + ('ᴀ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('\u{200c}', '\u{200d}'), + ('‿', '⁀'), + ('⁔', '⁔'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('\u{20d0}', '\u{20f0}'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℯ', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⓐ', 'ⓩ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('\u{2d7f}', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('\u{2de0}', '\u{2dff}'), + ('ⸯ', 'ⸯ'), + ('々', '〇'), + ('〡', '\u{302f}'), + ('〱', '〵'), + ('〸', '〼'), + ('ぁ', 'ゖ'), + ('\u{3099}', '\u{309a}'), + ('ゝ', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘫ'), + ('Ꙁ', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('ꙿ', '\u{a6f1}'), + ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠧ'), + ('\u{a82c}', '\u{a82c}'), + ('ꡀ', 'ꡳ'), + ('ꢀ', '\u{a8c5}'), + ('꣐', '꣙'), + ('\u{a8e0}', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', '\u{a92d}'), + ('ꤰ', '꥓'), + ('ꥠ', 'ꥼ'), + ('\u{a980}', '꧀'), + ('ꧏ', '꧙'), + ('ꧠ', 'ꧾ'), + ('ꨀ', '\u{aa36}'), + ('ꩀ', 'ꩍ'), + ('꩐', '꩙'), + ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫯ'), + ('ꫲ', '\u{aaf6}'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꯪ'), + ('꯬', '\u{abed}'), + ('꯰', '꯹'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', 
'\u{fe2f}'), + ('︳', '︴'), + ('﹍', '﹏'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('\u{101fd}', '\u{101fd}'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('\u{102e0}', '\u{102e0}'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '\u{1037a}'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐐀', '𐒝'), + ('𐒠', '𐒩'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '\u{10ae6}'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '\u{10d27}'), + ('𐴰', '𐴹'), + ('𐺀', '𐺩'), + ('\u{10eab}', '\u{10eac}'), + ('𐺰', '𐺱'), + ('\u{10efd}', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '\u{10f50}'), + ('𐽰', '\u{10f85}'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀀', '\u{11046}'), + ('𑁦', '𑁵'), + ('\u{1107f}', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('𑃐', '𑃨'), + ('𑃰', '𑃹'), + ('\u{11100}', '\u{11134}'), + ('𑄶', '𑄿'), + ('𑅄', '𑅇'), + ('𑅐', '\u{11173}'), + ('𑅶', '𑅶'), + ('\u{11180}', '𑇄'), + ('\u{111c9}', '\u{111cc}'), + ('𑇎', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '\u{11237}'), + ('\u{1123e}', '\u{11241}'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '\u{112ea}'), + ('𑋰', '𑋹'), + ('\u{11300}', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('\u{1133b}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('𑍐', '𑍐'), + ('\u{11357}', '\u{11357}'), + ('𑍝', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑐀', '𑑊'), + ('𑑐', '𑑙'), + ('\u{1145e}', '𑑡'), + ('𑒀', '𑓅'), + ('𑓇', '𑓇'), + ('𑓐', '𑓙'), + ('𑖀', '\u{115b5}'), + ('𑖸', '\u{115c0}'), + ('𑗘', '\u{115dd}'), + ('𑘀', '\u{11640}'), + ('𑙄', '𑙄'), + ('𑙐', '𑙙'), + ('𑚀', '𑚸'), + ('𑛀', '𑛉'), + ('𑜀', '𑜚'), + ('\u{1171d}', '\u{1172b}'), + ('𑜰', '𑜹'), + ('𑝀', '𑝆'), + ('𑠀', '\u{1183a}'), + ('𑢠', '𑣩'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{11943}'), + ('𑥐', '𑥙'), + ('𑦠', '𑦧'), + ('𑦪', '\u{119d7}'), + ('\u{119da}', '𑧡'), + ('𑧣', '𑧤'), + ('𑨀', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('𑩐', '\u{11a99}'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '\u{11c36}'), + ('\u{11c38}', '𑱀'), + ('𑱐', '𑱙'), + ('𑱲', '𑲏'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('𑵐', '𑵙'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶘'), + ('𑶠', '𑶩'), + ('𑻠', '𑻶'), + ('\u{11f00}', '𑼐'), + ('𑼒', '\u{11f3a}'), + ('𑼾', '\u{11f42}'), + ('𑽐', '𑽙'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('\u{13440}', '\u{13455}'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩠', '𖩩'), + 
('𖩰', '𖪾'), + ('𖫀', '𖫉'), + ('𖫐', '𖫭'), + ('\u{16af0}', '\u{16af4}'), + ('𖬀', '\u{16b36}'), + ('𖭀', '𖭃'), + ('𖭐', '𖭙'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('\u{16f4f}', '𖾇'), + ('\u{16f8f}', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '\u{16fe4}'), + ('𖿰', '𖿱'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d169}'), + ('𝅭', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝟎', '𝟿'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), + ('𞄀', '𞄬'), + ('\u{1e130}', '𞄽'), + ('𞅀', '𞅉'), + ('𞅎', '𞅎'), + ('𞊐', '\u{1e2ae}'), + ('𞋀', '𞋹'), + ('𞓐', '𞓹'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('𞤀', '𞥋'), + ('𞥐', '𞥙'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('🄰', '🅉'), + ('🅐', '🅩'), + ('🅰', '🆉'), + ('🯰', '🯹'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), + ('\u{e0100}', '\u{e01ef}'), +]; diff --git a/vendor/regex-automata/src/util/utf8.rs b/vendor/regex-automata/src/util/utf8.rs new file mode 100644 index 0000000..91b27ef --- /dev/null +++ b/vendor/regex-automata/src/util/utf8.rs @@ -0,0 +1,196 @@ +/*! +Utilities for dealing with UTF-8. + +This module provides some UTF-8 related helper routines, including an +incremental decoder. +*/ + +/// Returns true if and only if the given byte is considered a word character. +/// This only applies to ASCII. +/// +/// This was copied from regex-syntax so that we can use it to determine the +/// starting DFA state while searching without depending on regex-syntax. The +/// definition is never going to change, so there's no maintenance/bit-rot +/// hazard here. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn is_word_byte(b: u8) -> bool { + const fn mkwordset() -> [bool; 256] { + // FIXME: Use as_usize() once const functions in traits are stable. 
+ let mut set = [false; 256]; + set[b'_' as usize] = true; + + let mut byte = b'0'; + while byte <= b'9' { + set[byte as usize] = true; + byte += 1; + } + byte = b'A'; + while byte <= b'Z' { + set[byte as usize] = true; + byte += 1; + } + byte = b'a'; + while byte <= b'z' { + set[byte as usize] = true; + byte += 1; + } + set + } + const WORD: [bool; 256] = mkwordset(); + WORD[b as usize] +} + +/// Decodes the next UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the beginning of the given +/// byte slice, then the first byte is returned instead. +/// +/// This returns `None` if and only if `bytes` is empty. +/// +/// This never panics. +/// +/// *WARNING*: This is not designed for performance. If you're looking for a +/// fast UTF-8 decoder, this is not it. If you feel like you need one in this +/// crate, then please file an issue and discuss your use case. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> { + if bytes.is_empty() { + return None; + } + let len = match len(bytes[0]) { + None => return Some(Err(bytes[0])), + Some(len) if len > bytes.len() => return Some(Err(bytes[0])), + Some(1) => return Some(Ok(char::from(bytes[0]))), + Some(len) => len, + }; + match core::str::from_utf8(&bytes[..len]) { + Ok(s) => Some(Ok(s.chars().next().unwrap())), + Err(_) => Some(Err(bytes[0])), + } +} + +/// Decodes the last UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the end of the given byte +/// slice, then the last byte is returned instead. +/// +/// This returns `None` if and only if `bytes` is empty. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> { + if bytes.is_empty() { + return None; + } + let mut start = bytes.len() - 1; + let limit = bytes.len().saturating_sub(4); + while start > limit && !is_leading_or_invalid_byte(bytes[start]) { + start -= 1; + } + match decode(&bytes[start..]) { + None => None, + Some(Ok(ch)) => Some(Ok(ch)), + Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])), + } +} + +/// Given a UTF-8 leading byte, this returns the total number of code units +/// in the following encoded codepoint. +/// +/// If the given byte is not a valid UTF-8 leading byte, then this returns +/// `None`. +#[cfg_attr(feature = "perf-inline", inline(always))] +fn len(byte: u8) -> Option<usize> { + if byte <= 0x7F { + return Some(1); + } else if byte & 0b1100_0000 == 0b1000_0000 { + return None; + } else if byte <= 0b1101_1111 { + Some(2) + } else if byte <= 0b1110_1111 { + Some(3) + } else if byte <= 0b1111_0111 { + Some(4) + } else { + None + } +} + +/// Returns true if and only if the given offset in the given bytes falls on a +/// valid UTF-8 encoded codepoint boundary. +/// +/// If `bytes` is not valid UTF-8, then the behavior of this routine is +/// unspecified. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool { + match bytes.get(i) { + // The position at the end of the bytes always represents an empty + // string, which is a valid boundary. But anything after that doesn't + // make much sense to call valid a boundary. 
+ None => i == bytes.len(), + // Other than ASCII (where the most significant bit is never set), + // valid starting bytes always have their most significant two bits + // set, where as continuation bytes never have their second most + // significant bit set. Therefore, this only returns true when bytes[i] + // corresponds to a byte that begins a valid UTF-8 encoding of a + // Unicode scalar value. + Some(&b) => b <= 0b0111_1111 || b >= 0b1100_0000, + } +} + +/// Returns true if and only if the given byte is either a valid leading UTF-8 +/// byte, or is otherwise an invalid byte that can never appear anywhere in a +/// valid UTF-8 sequence. +#[cfg_attr(feature = "perf-inline", inline(always))] +fn is_leading_or_invalid_byte(b: u8) -> bool { + // In the ASCII case, the most significant bit is never set. The leading + // byte of a 2/3/4-byte sequence always has the top two most significant + // bits set. For bytes that can never appear anywhere in valid UTF-8, this + // also returns true, since every such byte has its two most significant + // bits set: + // + // \xC0 :: 11000000 + // \xC1 :: 11000001 + // \xF5 :: 11110101 + // \xF6 :: 11110110 + // \xF7 :: 11110111 + // \xF8 :: 11111000 + // \xF9 :: 11111001 + // \xFA :: 11111010 + // \xFB :: 11111011 + // \xFC :: 11111100 + // \xFD :: 11111101 + // \xFE :: 11111110 + // \xFF :: 11111111 + (b & 0b1100_0000) != 0b1000_0000 +} + +/* +/// Returns the smallest possible index of the next valid UTF-8 sequence +/// starting after `i`. +/// +/// For all inputs, including invalid UTF-8 and any value of `i`, the return +/// value is guaranteed to be greater than `i`. (If there is no value greater +/// than `i` that fits in `usize`, then this panics.) +/// +/// Generally speaking, this should only be called on `text` when it is +/// permitted to assume that it is valid UTF-8 and where either `i >= +/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence. +/// +/// NOTE: This method was used in a previous conception of iterators where we +/// specifically tried to skip over empty matches that split a codepoint by +/// simply requiring that our next search begin at the beginning of codepoint. +/// But we ended up changing that technique to always advance by 1 byte and +/// then filter out matches that split a codepoint after-the-fact. Thus, we no +/// longer use this method. But I've kept it around in case we want to switch +/// back to this approach. Its guarantees are a little subtle, so I'd prefer +/// not to rebuild it from whole cloth. +pub(crate) fn next(text: &[u8], i: usize) -> usize { + let b = match text.get(i) { + None => return i.checked_add(1).unwrap(), + Some(&b) => b, + }; + // For cases where we see an invalid UTF-8 byte, there isn't much we can do + // other than just start at the next byte. + let inc = len(b).unwrap_or(1); + i.checked_add(inc).unwrap() +} +*/ diff --git a/vendor/regex-automata/src/util/wire.rs b/vendor/regex-automata/src/util/wire.rs new file mode 100644 index 0000000..ecf4fd8 --- /dev/null +++ b/vendor/regex-automata/src/util/wire.rs @@ -0,0 +1,975 @@ +/*! +Types and routines that support the wire format of finite automata. + +Currently, this module just exports a few error types and some small helpers +for deserializing [dense DFAs](crate::dfa::dense::DFA) using correct alignment. +*/ + +/* +A collection of helper functions, types and traits for serializing automata. + +This crate defines its own bespoke serialization mechanism for some structures +provided in the public API, namely, DFAs. 
A bespoke mechanism was developed +primarily because structures like automata demand a specific binary format. +Attempting to encode their rich structure in an existing serialization +format is just not feasible. Moreover, the format for each structure is +generally designed such that deserialization is cheap. More specifically, that +deserialization can be done in constant time. (The idea being that you can +embed it into your binary or mmap it, and then use it immediately.) + +In order to achieve this, the dense and sparse DFAs in this crate use an +in-memory representation that very closely corresponds to its binary serialized +form. This pervades and complicates everything, and in some cases, requires +dealing with alignment and reasoning about safety. + +This technique does have major advantages. In particular, it permits doing +the potentially costly work of compiling a finite state machine in an offline +manner, and then loading it at runtime not only without having to re-compile +the regex, but even without the code required to do the compilation. This, for +example, permits one to use a pre-compiled DFA not only in environments without +Rust's standard library, but also in environments without a heap. + +In the code below, whenever we insert some kind of padding, it's to enforce a +4-byte alignment, unless otherwise noted. Namely, u32 is the only state ID type +supported. (In a previous version of this library, DFAs were generic over the +state ID representation.) + +Also, serialization generally requires the caller to specify endianness, +where as deserialization always assumes native endianness (otherwise cheap +deserialization would be impossible). This implies that serializing a structure +generally requires serializing both its big-endian and little-endian variants, +and then loading the correct one based on the target's endianness. +*/ + +use core::{ + cmp, + convert::{TryFrom, TryInto}, + mem::size_of, +}; + +#[cfg(feature = "alloc")] +use alloc::{vec, vec::Vec}; + +use crate::util::{ + int::Pointer, + primitives::{PatternID, PatternIDError, StateID, StateIDError}, +}; + +/// A hack to align a smaller type `B` with a bigger type `T`. +/// +/// The usual use of this is with `B = [u8]` and `T = u32`. That is, +/// it permits aligning a sequence of bytes on a 4-byte boundary. This +/// is useful in contexts where one wants to embed a serialized [dense +/// DFA](crate::dfa::dense::DFA) into a Rust a program while guaranteeing the +/// alignment required for the DFA. +/// +/// See [`dense::DFA::from_bytes`](crate::dfa::dense::DFA::from_bytes) for an +/// example of how to use this type. +#[repr(C)] +#[derive(Debug)] +pub struct AlignAs<B: ?Sized, T> { + /// A zero-sized field indicating the alignment we want. + pub _align: [T; 0], + /// A possibly non-sized field containing a sequence of bytes. + pub bytes: B, +} + +/// An error that occurs when serializing an object from this crate. +/// +/// Serialization, as used in this crate, universally refers to the process +/// of transforming a structure (like a DFA) into a custom binary format +/// represented by `&[u8]`. To this end, serialization is generally infallible. +/// However, it can fail when caller provided buffer sizes are too small. When +/// that occurs, a serialization error is reported. +/// +/// A `SerializeError` provides no introspection capabilities. Its only +/// supported operation is conversion to a human readable error message. 
+/// +/// This error type implements the `std::error::Error` trait only when the +/// `std` feature is enabled. Otherwise, this type is defined in all +/// configurations. +#[derive(Debug)] +pub struct SerializeError { + /// The name of the thing that a buffer is too small for. + /// + /// Currently, the only kind of serialization error is one that is + /// committed by a caller: providing a destination buffer that is too + /// small to fit the serialized object. This makes sense conceptually, + /// since every valid inhabitant of a type should be serializable. + /// + /// This is somewhat exposed in the public API of this crate. For example, + /// the `to_bytes_{big,little}_endian` APIs return a `Vec<u8>` and are + /// guaranteed to never panic or error. This is only possible because the + /// implementation guarantees that it will allocate a `Vec<u8>` that is + /// big enough. + /// + /// In summary, if a new serialization error kind needs to be added, then + /// it will need careful consideration. + what: &'static str, +} + +impl SerializeError { + pub(crate) fn buffer_too_small(what: &'static str) -> SerializeError { + SerializeError { what } + } +} + +impl core::fmt::Display for SerializeError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "destination buffer is too small to write {}", self.what) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for SerializeError {} + +/// An error that occurs when deserializing an object defined in this crate. +/// +/// Serialization, as used in this crate, universally refers to the process +/// of transforming a structure (like a DFA) into a custom binary format +/// represented by `&[u8]`. Deserialization, then, refers to the process of +/// cheaply converting this binary format back to the object's in-memory +/// representation as defined in this crate. To the extent possible, +/// deserialization will report this error whenever this process fails. +/// +/// A `DeserializeError` provides no introspection capabilities. Its only +/// supported operation is conversion to a human readable error message. +/// +/// This error type implements the `std::error::Error` trait only when the +/// `std` feature is enabled. Otherwise, this type is defined in all +/// configurations. 
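// Sketch of how this kind of error surfaces to users, assuming the crate is
// built with its DFA support enabled (part of the default feature set):
// `dense::DFA::from_bytes` is the sort of public entry point that returns a
// `DeserializeError`, and the only supported way to inspect a failure is its
// `Display` message.

use regex_automata::dfa::dense;

fn load_dfa(bytes: &[u8]) {
    // `from_bytes` validates alignment, the label, endianness, the version
    // and so on, reporting any problem as a `DeserializeError`.
    let result: Result<(dense::DFA<&[u32]>, usize), _> =
        dense::DFA::from_bytes(bytes);
    match result {
        Ok((_dfa, nread)) => println!("loaded DFA from {} bytes", nread),
        Err(err) => eprintln!("failed to load DFA: {}", err),
    }
}

fn main() {
    // Garbage input: this is expected to hit the error arm above.
    load_dfa(&[0u8; 16]);
}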
+#[derive(Debug)] +pub struct DeserializeError(DeserializeErrorKind); + +#[derive(Debug)] +enum DeserializeErrorKind { + Generic { msg: &'static str }, + BufferTooSmall { what: &'static str }, + InvalidUsize { what: &'static str }, + VersionMismatch { expected: u32, found: u32 }, + EndianMismatch { expected: u32, found: u32 }, + AlignmentMismatch { alignment: usize, address: usize }, + LabelMismatch { expected: &'static str }, + ArithmeticOverflow { what: &'static str }, + PatternID { err: PatternIDError, what: &'static str }, + StateID { err: StateIDError, what: &'static str }, +} + +impl DeserializeError { + pub(crate) fn generic(msg: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::Generic { msg }) + } + + pub(crate) fn buffer_too_small(what: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::BufferTooSmall { what }) + } + + fn invalid_usize(what: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::InvalidUsize { what }) + } + + fn version_mismatch(expected: u32, found: u32) -> DeserializeError { + DeserializeError(DeserializeErrorKind::VersionMismatch { + expected, + found, + }) + } + + fn endian_mismatch(expected: u32, found: u32) -> DeserializeError { + DeserializeError(DeserializeErrorKind::EndianMismatch { + expected, + found, + }) + } + + fn alignment_mismatch( + alignment: usize, + address: usize, + ) -> DeserializeError { + DeserializeError(DeserializeErrorKind::AlignmentMismatch { + alignment, + address, + }) + } + + fn label_mismatch(expected: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::LabelMismatch { expected }) + } + + fn arithmetic_overflow(what: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::ArithmeticOverflow { what }) + } + + fn pattern_id_error( + err: PatternIDError, + what: &'static str, + ) -> DeserializeError { + DeserializeError(DeserializeErrorKind::PatternID { err, what }) + } + + pub(crate) fn state_id_error( + err: StateIDError, + what: &'static str, + ) -> DeserializeError { + DeserializeError(DeserializeErrorKind::StateID { err, what }) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for DeserializeError {} + +impl core::fmt::Display for DeserializeError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use self::DeserializeErrorKind::*; + + match self.0 { + Generic { msg } => write!(f, "{}", msg), + BufferTooSmall { what } => { + write!(f, "buffer is too small to read {}", what) + } + InvalidUsize { what } => { + write!(f, "{} is too big to fit in a usize", what) + } + VersionMismatch { expected, found } => write!( + f, + "unsupported version: \ + expected version {} but found version {}", + expected, found, + ), + EndianMismatch { expected, found } => write!( + f, + "endianness mismatch: expected 0x{:X} but got 0x{:X}. 
\ + (Are you trying to load an object serialized with a \ + different endianness?)", + expected, found, + ), + AlignmentMismatch { alignment, address } => write!( + f, + "alignment mismatch: slice starts at address \ + 0x{:X}, which is not aligned to a {} byte boundary", + address, alignment, + ), + LabelMismatch { expected } => write!( + f, + "label mismatch: start of serialized object should \ + contain a NUL terminated {:?} label, but a different \ + label was found", + expected, + ), + ArithmeticOverflow { what } => { + write!(f, "arithmetic overflow for {}", what) + } + PatternID { ref err, what } => { + write!(f, "failed to read pattern ID for {}: {}", what, err) + } + StateID { ref err, what } => { + write!(f, "failed to read state ID for {}: {}", what, err) + } + } + } +} + +/// Safely converts a `&[u32]` to `&[StateID]` with zero cost. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn u32s_to_state_ids(slice: &[u32]) -> &[StateID] { + // SAFETY: This is safe because StateID is defined to have the same memory + // representation as a u32 (it is repr(transparent)). While not every u32 + // is a "valid" StateID, callers are not permitted to rely on the validity + // of StateIDs for memory safety. It can only lead to logical errors. (This + // is why StateID::new_unchecked is safe.) + unsafe { + core::slice::from_raw_parts( + slice.as_ptr().cast::<StateID>(), + slice.len(), + ) + } +} + +/// Safely converts a `&mut [u32]` to `&mut [StateID]` with zero cost. +pub(crate) fn u32s_to_state_ids_mut(slice: &mut [u32]) -> &mut [StateID] { + // SAFETY: This is safe because StateID is defined to have the same memory + // representation as a u32 (it is repr(transparent)). While not every u32 + // is a "valid" StateID, callers are not permitted to rely on the validity + // of StateIDs for memory safety. It can only lead to logical errors. (This + // is why StateID::new_unchecked is safe.) + unsafe { + core::slice::from_raw_parts_mut( + slice.as_mut_ptr().cast::<StateID>(), + slice.len(), + ) + } +} + +/// Safely converts a `&[u32]` to `&[PatternID]` with zero cost. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn u32s_to_pattern_ids(slice: &[u32]) -> &[PatternID] { + // SAFETY: This is safe because PatternID is defined to have the same + // memory representation as a u32 (it is repr(transparent)). While not + // every u32 is a "valid" PatternID, callers are not permitted to rely + // on the validity of PatternIDs for memory safety. It can only lead to + // logical errors. (This is why PatternID::new_unchecked is safe.) + unsafe { + core::slice::from_raw_parts( + slice.as_ptr().cast::<PatternID>(), + slice.len(), + ) + } +} + +/// Checks that the given slice has an alignment that matches `T`. +/// +/// This is useful for checking that a slice has an appropriate alignment +/// before casting it to a &[T]. Note though that alignment is not itself +/// sufficient to perform the cast for any `T`. +pub(crate) fn check_alignment<T>( + slice: &[u8], +) -> Result<(), DeserializeError> { + let alignment = core::mem::align_of::<T>(); + let address = slice.as_ptr().as_usize(); + if address % alignment == 0 { + return Ok(()); + } + Err(DeserializeError::alignment_mismatch(alignment, address)) +} + +/// Reads a possibly empty amount of padding, up to 7 bytes, from the beginning +/// of the given slice. All padding bytes must be NUL bytes. 
+/// +/// This is useful because it can be theoretically necessary to pad the +/// beginning of a serialized object with NUL bytes to ensure that it starts +/// at a correctly aligned address. These padding bytes should come immediately +/// before the label. +/// +/// This returns the number of bytes read from the given slice. +pub(crate) fn skip_initial_padding(slice: &[u8]) -> usize { + let mut nread = 0; + while nread < 7 && nread < slice.len() && slice[nread] == 0 { + nread += 1; + } + nread +} + +/// Allocate a byte buffer of the given size, along with some initial padding +/// such that `buf[padding..]` has the same alignment as `T`, where the +/// alignment of `T` must be at most `8`. In particular, callers should treat +/// the first N bytes (second return value) as padding bytes that must not be +/// overwritten. In all cases, the following identity holds: +/// +/// ```ignore +/// let (buf, padding) = alloc_aligned_buffer::<StateID>(SIZE); +/// assert_eq!(SIZE, buf[padding..].len()); +/// ``` +/// +/// In practice, padding is often zero. +/// +/// The requirement for `8` as a maximum here is somewhat arbitrary. In +/// practice, we never need anything bigger in this crate, and so this function +/// does some sanity asserts under the assumption of a max alignment of `8`. +#[cfg(feature = "alloc")] +pub(crate) fn alloc_aligned_buffer<T>(size: usize) -> (Vec<u8>, usize) { + // NOTE: This is a kludge because there's no easy way to allocate a Vec<u8> + // with an alignment guaranteed to be greater than 1. We could create a + // Vec<u32>, but this cannot be safely transmuted to a Vec<u8> without + // concern, since reallocing or dropping the Vec<u8> is UB (different + // alignment than the initial allocation). We could define a wrapper type + // to manage this for us, but it seems like more machinery than it's worth. + let buf = vec![0; size]; + let align = core::mem::align_of::<T>(); + let address = buf.as_ptr().as_usize(); + if address % align == 0 { + return (buf, 0); + } + // Let's try this again. We have to create a totally new alloc with + // the maximum amount of bytes we might need. We can't just extend our + // pre-existing 'buf' because that might create a new alloc with a + // different alignment. + let extra = align - 1; + let mut buf = vec![0; size + extra]; + let address = buf.as_ptr().as_usize(); + // The code below handles the case where 'address' is aligned to T, so if + // we got lucky and 'address' is now aligned to T (when it previously + // wasn't), then we're done. + if address % align == 0 { + buf.truncate(size); + return (buf, 0); + } + let padding = ((address & !(align - 1)).checked_add(align).unwrap()) + .checked_sub(address) + .unwrap(); + assert!(padding <= 7, "padding of {} is bigger than 7", padding); + assert!( + padding <= extra, + "padding of {} is bigger than extra {} bytes", + padding, + extra + ); + buf.truncate(size + padding); + assert_eq!(size + padding, buf.len()); + assert_eq!( + 0, + buf[padding..].as_ptr().as_usize() % align, + "expected end of initial padding to be aligned to {}", + align, + ); + (buf, padding) +} + +/// Reads a NUL terminated label starting at the beginning of the given slice. +/// +/// If a NUL terminated label could not be found, then an error is returned. +/// Similarly, if a label is found but doesn't match the expected label, then +/// an error is returned. +/// +/// Upon success, the total number of bytes read (including padding bytes) is +/// returned. 
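// Standalone sketch of the label layout described above: the label bytes,
// a NUL terminator, and then enough additional NUL bytes to round the total
// up to a multiple of 4. The `padding_len` helper here is a hypothetical
// stand-in with the same intent as the crate's internal helper.

fn padded_label_len(label: &str) -> usize {
    assert!(label.len() <= 255, "label must not be longer than 255 bytes");
    assert!(!label.as_bytes().contains(&0), "label must not contain NUL");
    let len = label.len() + 1; // +1 for the NUL terminator
    len + padding_len(len)
}

// Number of extra NUL bytes needed to reach the next multiple of 4.
fn padding_len(len: usize) -> usize {
    (4 - (len % 4)) % 4
}

fn main() {
    assert_eq!(4, padded_label_len("abc")); // 3 bytes + NUL, already aligned
    assert_eq!(8, padded_label_len("dense")); // 5 bytes + NUL = 6, pad to 8
}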
+pub(crate) fn read_label( + slice: &[u8], + expected_label: &'static str, +) -> Result<usize, DeserializeError> { + // Set an upper bound on how many bytes we scan for a NUL. Since no label + // in this crate is longer than 256 bytes, if we can't find one within that + // range, then we have corrupted data. + let first_nul = + slice[..cmp::min(slice.len(), 256)].iter().position(|&b| b == 0); + let first_nul = match first_nul { + Some(first_nul) => first_nul, + None => { + return Err(DeserializeError::generic( + "could not find NUL terminated label \ + at start of serialized object", + )); + } + }; + let len = first_nul + padding_len(first_nul); + if slice.len() < len { + return Err(DeserializeError::generic( + "could not find properly sized label at start of serialized object" + )); + } + if expected_label.as_bytes() != &slice[..first_nul] { + return Err(DeserializeError::label_mismatch(expected_label)); + } + Ok(len) +} + +/// Writes the given label to the buffer as a NUL terminated string. The label +/// given must not contain NUL, otherwise this will panic. Similarly, the label +/// must not be longer than 255 bytes, otherwise this will panic. +/// +/// Additional NUL bytes are written as necessary to ensure that the number of +/// bytes written is always a multiple of 4. +/// +/// Upon success, the total number of bytes written (including padding) is +/// returned. +pub(crate) fn write_label( + label: &str, + dst: &mut [u8], +) -> Result<usize, SerializeError> { + let nwrite = write_label_len(label); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("label")); + } + dst[..label.len()].copy_from_slice(label.as_bytes()); + for i in 0..(nwrite - label.len()) { + dst[label.len() + i] = 0; + } + assert_eq!(nwrite % 4, 0); + Ok(nwrite) +} + +/// Returns the total number of bytes (including padding) that would be written +/// for the given label. This panics if the given label contains a NUL byte or +/// is longer than 255 bytes. (The size restriction exists so that searching +/// for a label during deserialization can be done in small bounded space.) +pub(crate) fn write_label_len(label: &str) -> usize { + if label.len() > 255 { + panic!("label must not be longer than 255 bytes"); + } + if label.as_bytes().iter().position(|&b| b == 0).is_some() { + panic!("label must not contain NUL bytes"); + } + let label_len = label.len() + 1; // +1 for the NUL terminator + label_len + padding_len(label_len) +} + +/// Reads the endianness check from the beginning of the given slice and +/// confirms that the endianness of the serialized object matches the expected +/// endianness. If the slice is too small or if the endianness check fails, +/// this returns an error. +/// +/// Upon success, the total number of bytes read is returned. +pub(crate) fn read_endianness_check( + slice: &[u8], +) -> Result<usize, DeserializeError> { + let (n, nr) = try_read_u32(slice, "endianness check")?; + assert_eq!(nr, write_endianness_check_len()); + if n != 0xFEFF { + return Err(DeserializeError::endian_mismatch(0xFEFF, n)); + } + Ok(nr) +} + +/// Writes 0xFEFF as an integer using the given endianness. +/// +/// This is useful for writing into the header of a serialized object. It can +/// be read during deserialization as a sanity check to ensure the proper +/// endianness is used. +/// +/// Upon success, the total number of bytes written is returned. 
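// Standalone sketch of the endianness check: the serializer writes the
// 32-bit value 0xFEFF in a chosen endianness, and the deserializer reads it
// back with native endianness. If the bytes were produced for the other
// endianness, the value read back is not 0xFEFF and loading fails.

fn main() {
    let word: u32 = 0xFEFF;

    // Pretend we serialized for a little-endian target...
    let serialized = word.to_le_bytes();

    // ...and now deserialize using native endianness.
    let read_back = u32::from_ne_bytes(serialized);
    if read_back == 0xFEFF {
        println!("endianness matches; safe to reinterpret the rest");
    } else {
        // This is the condition that produces the "endianness mismatch"
        // deserialization error described above.
        println!("endianness mismatch: got {:#X}", read_back);
    }
}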
+pub(crate) fn write_endianness_check<E: Endian>(
+    dst: &mut [u8],
+) -> Result<usize, SerializeError> {
+    let nwrite = write_endianness_check_len();
+    if dst.len() < nwrite {
+        return Err(SerializeError::buffer_too_small("endianness check"));
+    }
+    E::write_u32(0xFEFF, dst);
+    Ok(nwrite)
+}
+
+/// Returns the number of bytes written by the endianness check.
+pub(crate) fn write_endianness_check_len() -> usize {
+    size_of::<u32>()
+}
+
+/// Reads a version number from the beginning of the given slice and confirms
+/// that it matches the expected version number given. If the slice is too
+/// small or if the version numbers aren't equivalent, this returns an error.
+///
+/// Upon success, the total number of bytes read is returned.
+///
+/// N.B. Currently, we require that the version number is exactly equivalent.
+/// In the future, if we bump the version number without a semver bump, then
+/// we'll need to relax this a bit and support older versions.
+pub(crate) fn read_version(
+    slice: &[u8],
+    expected_version: u32,
+) -> Result<usize, DeserializeError> {
+    let (n, nr) = try_read_u32(slice, "version")?;
+    assert_eq!(nr, write_version_len());
+    if n != expected_version {
+        return Err(DeserializeError::version_mismatch(expected_version, n));
+    }
+    Ok(nr)
+}
+
+/// Writes the given version number to the beginning of the given slice.
+///
+/// This is useful for writing into the header of a serialized object. It can
+/// be read during deserialization as a sanity check to ensure that the library
+/// code supports the format of the serialized object.
+///
+/// Upon success, the total number of bytes written is returned.
+pub(crate) fn write_version<E: Endian>(
+    version: u32,
+    dst: &mut [u8],
+) -> Result<usize, SerializeError> {
+    let nwrite = write_version_len();
+    if dst.len() < nwrite {
+        return Err(SerializeError::buffer_too_small("version number"));
+    }
+    E::write_u32(version, dst);
+    Ok(nwrite)
+}
+
+/// Returns the number of bytes written by writing the version number.
+pub(crate) fn write_version_len() -> usize {
+    size_of::<u32>()
+}
+
+/// Reads a pattern ID from the given slice. If the slice has insufficient
+/// length, then this panics. If the deserialized integer exceeds the pattern
+/// ID limit for the current target, then this returns an error.
+///
+/// Upon success, this also returns the number of bytes read.
+pub(crate) fn read_pattern_id(
+    slice: &[u8],
+    what: &'static str,
+) -> Result<(PatternID, usize), DeserializeError> {
+    let bytes: [u8; PatternID::SIZE] =
+        slice[..PatternID::SIZE].try_into().unwrap();
+    let pid = PatternID::from_ne_bytes(bytes)
+        .map_err(|err| DeserializeError::pattern_id_error(err, what))?;
+    Ok((pid, PatternID::SIZE))
+}
+
+/// Reads a pattern ID from the given slice. If the slice has insufficient
+/// length, then this panics. Otherwise, the deserialized integer is assumed
+/// to be a valid pattern ID.
+///
+/// This also returns the number of bytes read.
+pub(crate) fn read_pattern_id_unchecked(slice: &[u8]) -> (PatternID, usize) {
+    let pid = PatternID::from_ne_bytes_unchecked(
+        slice[..PatternID::SIZE].try_into().unwrap(),
+    );
+    (pid, PatternID::SIZE)
+}
+
+/// Write the given pattern ID to the beginning of the given slice of bytes
+/// using the specified endianness. The given slice must have length at least
+/// `PatternID::SIZE`, or else this panics. Upon success, the total number of
+/// bytes written is returned.
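+///
+/// As a sketch (using native endianness so that `read_pattern_id`, which
+/// reads native endian bytes, can round trip the value; `PatternID::ZERO` is
+/// just a convenient illustrative value):
+///
+/// ```ignore
+/// let mut buf = [0u8; PatternID::SIZE];
+/// let nwrite = write_pattern_id::<NE>(PatternID::ZERO, &mut buf);
+/// let (pid, nread) = read_pattern_id(&buf, "pattern ID")?;
+/// assert_eq!((PatternID::ZERO, nwrite), (pid, nread));
+/// ```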
+pub(crate) fn write_pattern_id<E: Endian>( + pid: PatternID, + dst: &mut [u8], +) -> usize { + E::write_u32(pid.as_u32(), dst); + PatternID::SIZE +} + +/// Attempts to read a state ID from the given slice. If the slice has an +/// insufficient number of bytes or if the state ID exceeds the limit for +/// the current target, then this returns an error. +/// +/// Upon success, this also returns the number of bytes read. +pub(crate) fn try_read_state_id( + slice: &[u8], + what: &'static str, +) -> Result<(StateID, usize), DeserializeError> { + if slice.len() < StateID::SIZE { + return Err(DeserializeError::buffer_too_small(what)); + } + read_state_id(slice, what) +} + +/// Reads a state ID from the given slice. If the slice has insufficient +/// length, then this panics. If the deserialized integer exceeds the state ID +/// limit for the current target, then this returns an error. +/// +/// Upon success, this also returns the number of bytes read. +pub(crate) fn read_state_id( + slice: &[u8], + what: &'static str, +) -> Result<(StateID, usize), DeserializeError> { + let bytes: [u8; StateID::SIZE] = + slice[..StateID::SIZE].try_into().unwrap(); + let sid = StateID::from_ne_bytes(bytes) + .map_err(|err| DeserializeError::state_id_error(err, what))?; + Ok((sid, StateID::SIZE)) +} + +/// Reads a state ID from the given slice. If the slice has insufficient +/// length, then this panics. Otherwise, the deserialized integer is assumed +/// to be a valid state ID. +/// +/// This also returns the number of bytes read. +pub(crate) fn read_state_id_unchecked(slice: &[u8]) -> (StateID, usize) { + let sid = StateID::from_ne_bytes_unchecked( + slice[..StateID::SIZE].try_into().unwrap(), + ); + (sid, StateID::SIZE) +} + +/// Write the given state ID to the beginning of the given slice of bytes +/// using the specified endianness. The given slice must have length at least +/// `StateID::SIZE`, or else this panics. Upon success, the total number of +/// bytes written is returned. +pub(crate) fn write_state_id<E: Endian>( + sid: StateID, + dst: &mut [u8], +) -> usize { + E::write_u32(sid.as_u32(), dst); + StateID::SIZE +} + +/// Try to read a u16 as a usize from the beginning of the given slice in +/// native endian format. If the slice has fewer than 2 bytes or if the +/// deserialized number cannot be represented by usize, then this returns an +/// error. The error message will include the `what` description of what is +/// being deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub(crate) fn try_read_u16_as_usize( + slice: &[u8], + what: &'static str, +) -> Result<(usize, usize), DeserializeError> { + try_read_u16(slice, what).and_then(|(n, nr)| { + usize::try_from(n) + .map(|n| (n, nr)) + .map_err(|_| DeserializeError::invalid_usize(what)) + }) +} + +/// Try to read a u32 as a usize from the beginning of the given slice in +/// native endian format. If the slice has fewer than 4 bytes or if the +/// deserialized number cannot be represented by usize, then this returns an +/// error. The error message will include the `what` description of what is +/// being deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. 
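+///
+/// A hedged sketch of typical use while walking a serialized object (the
+/// `slice` cursor and the "state count" description are illustrative):
+///
+/// ```ignore
+/// let (count, nread) = try_read_u32_as_usize(slice, "state count")?;
+/// slice = &slice[nread..];
+/// ```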
+pub(crate) fn try_read_u32_as_usize( + slice: &[u8], + what: &'static str, +) -> Result<(usize, usize), DeserializeError> { + try_read_u32(slice, what).and_then(|(n, nr)| { + usize::try_from(n) + .map(|n| (n, nr)) + .map_err(|_| DeserializeError::invalid_usize(what)) + }) +} + +/// Try to read a u16 from the beginning of the given slice in native endian +/// format. If the slice has fewer than 2 bytes, then this returns an error. +/// The error message will include the `what` description of what is being +/// deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub(crate) fn try_read_u16( + slice: &[u8], + what: &'static str, +) -> Result<(u16, usize), DeserializeError> { + check_slice_len(slice, size_of::<u16>(), what)?; + Ok((read_u16(slice), size_of::<u16>())) +} + +/// Try to read a u32 from the beginning of the given slice in native endian +/// format. If the slice has fewer than 4 bytes, then this returns an error. +/// The error message will include the `what` description of what is being +/// deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub(crate) fn try_read_u32( + slice: &[u8], + what: &'static str, +) -> Result<(u32, usize), DeserializeError> { + check_slice_len(slice, size_of::<u32>(), what)?; + Ok((read_u32(slice), size_of::<u32>())) +} + +/// Try to read a u128 from the beginning of the given slice in native endian +/// format. If the slice has fewer than 16 bytes, then this returns an error. +/// The error message will include the `what` description of what is being +/// deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub(crate) fn try_read_u128( + slice: &[u8], + what: &'static str, +) -> Result<(u128, usize), DeserializeError> { + check_slice_len(slice, size_of::<u128>(), what)?; + Ok((read_u128(slice), size_of::<u128>())) +} + +/// Read a u16 from the beginning of the given slice in native endian format. +/// If the slice has fewer than 2 bytes, then this panics. +/// +/// Marked as inline to speed up sparse searching which decodes integers from +/// its automaton at search time. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn read_u16(slice: &[u8]) -> u16 { + let bytes: [u8; 2] = slice[..size_of::<u16>()].try_into().unwrap(); + u16::from_ne_bytes(bytes) +} + +/// Read a u32 from the beginning of the given slice in native endian format. +/// If the slice has fewer than 4 bytes, then this panics. +/// +/// Marked as inline to speed up sparse searching which decodes integers from +/// its automaton at search time. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn read_u32(slice: &[u8]) -> u32 { + let bytes: [u8; 4] = slice[..size_of::<u32>()].try_into().unwrap(); + u32::from_ne_bytes(bytes) +} + +/// Read a u128 from the beginning of the given slice in native endian format. +/// If the slice has fewer than 16 bytes, then this panics. +pub(crate) fn read_u128(slice: &[u8]) -> u128 { + let bytes: [u8; 16] = slice[..size_of::<u128>()].try_into().unwrap(); + u128::from_ne_bytes(bytes) +} + +/// Checks that the given slice has some minimal length. If it's smaller than +/// the bound given, then a "buffer too small" error is returned with `what` +/// describing what the buffer represents. 
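+///
+/// For example, a fixed-size read might be guarded like this (sketch only;
+/// the "alphabet length" description is illustrative):
+///
+/// ```ignore
+/// check_slice_len(slice, size_of::<u32>(), "alphabet length")?;
+/// let n = read_u32(slice);
+/// ```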
+pub(crate) fn check_slice_len<T>( + slice: &[T], + at_least_len: usize, + what: &'static str, +) -> Result<(), DeserializeError> { + if slice.len() < at_least_len { + return Err(DeserializeError::buffer_too_small(what)); + } + Ok(()) +} + +/// Multiply the given numbers, and on overflow, return an error that includes +/// 'what' in the error message. +/// +/// This is useful when doing arithmetic with untrusted data. +pub(crate) fn mul( + a: usize, + b: usize, + what: &'static str, +) -> Result<usize, DeserializeError> { + match a.checked_mul(b) { + Some(c) => Ok(c), + None => Err(DeserializeError::arithmetic_overflow(what)), + } +} + +/// Add the given numbers, and on overflow, return an error that includes +/// 'what' in the error message. +/// +/// This is useful when doing arithmetic with untrusted data. +pub(crate) fn add( + a: usize, + b: usize, + what: &'static str, +) -> Result<usize, DeserializeError> { + match a.checked_add(b) { + Some(c) => Ok(c), + None => Err(DeserializeError::arithmetic_overflow(what)), + } +} + +/// Shift `a` left by `b`, and on overflow, return an error that includes +/// 'what' in the error message. +/// +/// This is useful when doing arithmetic with untrusted data. +pub(crate) fn shl( + a: usize, + b: usize, + what: &'static str, +) -> Result<usize, DeserializeError> { + let amount = u32::try_from(b) + .map_err(|_| DeserializeError::arithmetic_overflow(what))?; + match a.checked_shl(amount) { + Some(c) => Ok(c), + None => Err(DeserializeError::arithmetic_overflow(what)), + } +} + +/// Returns the number of additional bytes required to add to the given length +/// in order to make the total length a multiple of 4. The return value is +/// always less than 4. +pub(crate) fn padding_len(non_padding_len: usize) -> usize { + (4 - (non_padding_len & 0b11)) & 0b11 +} + +/// A simple trait for writing code generic over endianness. +/// +/// This is similar to what byteorder provides, but we only need a very small +/// subset. +pub(crate) trait Endian { + /// Writes a u16 to the given destination buffer in a particular + /// endianness. If the destination buffer has a length smaller than 2, then + /// this panics. + fn write_u16(n: u16, dst: &mut [u8]); + + /// Writes a u32 to the given destination buffer in a particular + /// endianness. If the destination buffer has a length smaller than 4, then + /// this panics. + fn write_u32(n: u32, dst: &mut [u8]); + + /// Writes a u64 to the given destination buffer in a particular + /// endianness. If the destination buffer has a length smaller than 8, then + /// this panics. + fn write_u64(n: u64, dst: &mut [u8]); + + /// Writes a u128 to the given destination buffer in a particular + /// endianness. If the destination buffer has a length smaller than 16, + /// then this panics. + fn write_u128(n: u128, dst: &mut [u8]); +} + +/// Little endian writing. +pub(crate) enum LE {} +/// Big endian writing. 
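+///
+/// As a sketch, `LE`, `BE` and `NE` are uninhabited type-level selectors for
+/// the `Endian` trait (the 4-byte buffer below is illustrative):
+///
+/// ```ignore
+/// let mut dst = [0u8; 4];
+/// BE::write_u32(0xFEFF, &mut dst);
+/// assert_eq!(dst, [0x00, 0x00, 0xFE, 0xFF]);
+/// ```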
+pub(crate) enum BE {}
+
+#[cfg(target_endian = "little")]
+pub(crate) type NE = LE;
+#[cfg(target_endian = "big")]
+pub(crate) type NE = BE;
+
+impl Endian for LE {
+    fn write_u16(n: u16, dst: &mut [u8]) {
+        dst[..2].copy_from_slice(&n.to_le_bytes());
+    }
+
+    fn write_u32(n: u32, dst: &mut [u8]) {
+        dst[..4].copy_from_slice(&n.to_le_bytes());
+    }
+
+    fn write_u64(n: u64, dst: &mut [u8]) {
+        dst[..8].copy_from_slice(&n.to_le_bytes());
+    }
+
+    fn write_u128(n: u128, dst: &mut [u8]) {
+        dst[..16].copy_from_slice(&n.to_le_bytes());
+    }
+}
+
+impl Endian for BE {
+    fn write_u16(n: u16, dst: &mut [u8]) {
+        dst[..2].copy_from_slice(&n.to_be_bytes());
+    }
+
+    fn write_u32(n: u32, dst: &mut [u8]) {
+        dst[..4].copy_from_slice(&n.to_be_bytes());
+    }
+
+    fn write_u64(n: u64, dst: &mut [u8]) {
+        dst[..8].copy_from_slice(&n.to_be_bytes());
+    }
+
+    fn write_u128(n: u128, dst: &mut [u8]) {
+        dst[..16].copy_from_slice(&n.to_be_bytes());
+    }
+}
+
+#[cfg(all(test, feature = "alloc"))]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn labels() {
+        let mut buf = [0; 1024];
+
+        let nwrite = write_label("fooba", &mut buf).unwrap();
+        assert_eq!(nwrite, 8);
+        assert_eq!(&buf[..nwrite], b"fooba\x00\x00\x00");
+
+        let nread = read_label(&buf, "fooba").unwrap();
+        assert_eq!(nread, 8);
+    }
+
+    #[test]
+    #[should_panic]
+    fn bad_label_interior_nul() {
+        // interior NULs are not allowed
+        write_label("foo\x00bar", &mut [0; 1024]).unwrap();
+    }
+
+    #[test]
+    fn bad_label_almost_too_long() {
+        // ok
+        write_label(&"z".repeat(255), &mut [0; 1024]).unwrap();
+    }
+
+    #[test]
+    #[should_panic]
+    fn bad_label_too_long() {
+        // labels longer than 255 bytes are banned
+        write_label(&"z".repeat(256), &mut [0; 1024]).unwrap();
+    }
+
+    #[test]
+    fn padding() {
+        assert_eq!(0, padding_len(8));
+        assert_eq!(3, padding_len(9));
+        assert_eq!(2, padding_len(10));
+        assert_eq!(1, padding_len(11));
+        assert_eq!(0, padding_len(12));
+        assert_eq!(3, padding_len(13));
+        assert_eq!(2, padding_len(14));
+        assert_eq!(1, padding_len(15));
+        assert_eq!(0, padding_len(16));
+    }
+}