author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-30 18:31:44 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-30 18:31:44 +0000
commit     c23a457e72abe608715ac76f076f47dc42af07a5 (patch)
tree       2772049aaf84b5c9d0ed12ec8d86812f7a7904b6 /vendor/regex-automata/src/util
parent     Releasing progress-linux version 1.73.0+dfsg1-1~progress7.99u1. (diff)
download   rustc-c23a457e72abe608715ac76f076f47dc42af07a5.tar.xz
           rustc-c23a457e72abe608715ac76f076f47dc42af07a5.zip
Merging upstream version 1.74.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata/src/util')
32 files changed, 14945 insertions, 2220 deletions
diff --git a/vendor/regex-automata/src/util/alphabet.rs b/vendor/regex-automata/src/util/alphabet.rs index 0bc1ece58..22b5a7644 100644 --- a/vendor/regex-automata/src/util/alphabet.rs +++ b/vendor/regex-automata/src/util/alphabet.rs @@ -1,25 +1,65 @@ -use core::convert::TryFrom; - +/*! +This module provides APIs for dealing with the alphabets of finite state +machines. + +There are two principal types in this module, [`ByteClasses`] and [`Unit`]. +The former defines the alphabet of a finite state machine while the latter +represents an element of that alphabet. + +To a first approximation, the alphabet of all automata in this crate is just +a `u8`. Namely, every distinct byte value. All 256 of them. In practice, this +can be quite wasteful when building a transition table for a DFA, since it +requires storing a state identifier for each element in the alphabet. Instead, +we collapse the alphabet of an automaton down into equivalence classes, where +every byte in the same equivalence class never discriminates between a match or +a non-match from any other byte in the same class. For example, in the regex +`[a-z]+`, then you could consider it having an alphabet consisting of two +equivalence classes: `a-z` and everything else. In terms of the transitions on +an automaton, it doesn't actually require representing every distinct byte. +Just the equivalence classes. + +The downside of equivalence classes is that, of course, searching a haystack +deals with individual byte values. Those byte values need to be mapped to +their corresponding equivalence class. This is what `ByteClasses` does. In +practice, doing this for every state transition has negligible impact on modern +CPUs. Moreover, it helps make more efficient use of the CPU cache by (possibly +considerably) shrinking the size of the transition table. + +One last hiccup concerns `Unit`. Namely, because of look-around and how the +DFAs in this crate work, we need to add a sentinel value to our alphabet +of equivalence classes that represents the "end" of a search. We call that +sentinel [`Unit::eoi`] or "end of input." Thus, a `Unit` is either an +equivalence class corresponding to a set of bytes, or it is a special "end of +input" sentinel. + +In general, you should not expect to need either of these types unless you're +doing lower level shenanigans with DFAs, or even building your own DFAs. +(Although, you don't have to use these types to build your own DFAs of course.) +For example, if you're walking a DFA's state graph, it's probably useful to +make use of [`ByteClasses`] to visit each element in the DFA's alphabet instead +of just visiting every distinct `u8` value. The latter isn't necessarily wrong, +but it could be potentially very wasteful. +*/ use crate::util::{ - bytes::{DeserializeError, SerializeError}, - DebugByte, + escape::DebugByte, + wire::{self, DeserializeError, SerializeError}, }; -/// Unit represents a single unit of input for DFA based regex engines. +/// Unit represents a single unit of haystack for DFA based regex engines. /// -/// **NOTE:** It is not expected for consumers of this crate to need to use -/// this type unless they are implementing their own DFA. And even then, it's -/// not required: implementors may use other techniques to handle input. +/// It is not expected for consumers of this crate to need to use this type +/// unless they are implementing their own DFA. And even then, it's not +/// required: implementors may use other techniques to handle haystack units. 
/// -/// Typically, a single unit of input for a DFA would be a single byte. +/// Typically, a single unit of haystack for a DFA would be a single byte. /// However, for the DFAs in this crate, matches are delayed by a single byte /// in order to handle look-ahead assertions (`\b`, `$` and `\z`). Thus, once /// we have consumed the haystack, we must run the DFA through one additional -/// transition using an input that indicates the haystack has ended. +/// transition using a unit that indicates the haystack has ended. /// -/// Since there is no way to represent a sentinel with a `u8` since all -/// possible values *may* be valid inputs to a DFA, this type explicitly adds -/// room for a sentinel value. +/// There is no way to represent a sentinel with a `u8` since all possible +/// values *may* be valid haystack units to a DFA, therefore this type +/// explicitly adds room for a sentinel value. /// /// The sentinel EOI value is always its own equivalence class and is /// ultimately represented by adding 1 to the maximum equivalence class value. @@ -36,74 +76,108 @@ use crate::util::{ /// Where EOI is the special sentinel value that is always in its own /// singleton equivalence class. #[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] -pub enum Unit { +pub struct Unit(UnitKind); + +#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] +enum UnitKind { + /// Represents a byte value, or more typically, an equivalence class + /// represented as a byte value. U8(u8), + /// Represents the "end of input" sentinel. We regretably use a `u16` + /// here since the maximum sentinel value is `256`. Thankfully, we don't + /// actually store a `Unit` anywhere, so this extra space shouldn't be too + /// bad. EOI(u16), } impl Unit { - /// Create a new input unit from a byte value. + /// Create a new haystack unit from a byte value. /// - /// All possible byte values are legal. However, when creating an input - /// unit for a specific DFA, one should be careful to only construct input - /// units that are in that DFA's alphabet. Namely, one way to compact a - /// DFA's in-memory representation is to collapse its transitions to a set - /// of equivalence classes into a set of all possible byte values. If a - /// DFA uses equivalence classes instead of byte values, then the byte - /// given here should be the equivalence class. + /// All possible byte values are legal. However, when creating a haystack + /// unit for a specific DFA, one should be careful to only construct units + /// that are in that DFA's alphabet. Namely, one way to compact a DFA's + /// in-memory representation is to collapse its transitions to a set of + /// equivalence classes into a set of all possible byte values. If a DFA + /// uses equivalence classes instead of byte values, then the byte given + /// here should be the equivalence class. pub fn u8(byte: u8) -> Unit { - Unit::U8(byte) + Unit(UnitKind::U8(byte)) } + /// Create a new "end of input" haystack unit. + /// + /// The value given is the sentinel value used by this unit to represent + /// the "end of input." The value should be the total number of equivalence + /// classes in the corresponding alphabet. Its maximum value is `256`, + /// which occurs when every byte is its own equivalence class. + /// + /// # Panics + /// + /// This panics when `num_byte_equiv_classes` is greater than `256`. 
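To make the `Unit` API above concrete, here is a small sketch (not part of the upstream diff) that exercises only the constructors and accessors shown in this file; it assumes the `regex_automata::util::alphabet` module is public, as the doc examples added here suggest:

```rust
use regex_automata::util::alphabet::Unit;

fn main() {
    // An ordinary haystack unit wrapping a byte value.
    let a = Unit::u8(b'a');
    assert_eq!(Some(b'a'), a.as_u8());
    assert!(a.is_byte(b'a'));
    assert!(!a.is_eoi());

    // With 4 byte-based equivalence classes, the EOI sentinel value is 4.
    let eoi = Unit::eoi(4);
    assert!(eoi.is_eoi());
    assert_eq!(Some(4), eoi.as_eoi());
    assert_eq!(4, eoi.as_usize());
}
```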
pub fn eoi(num_byte_equiv_classes: usize) -> Unit { assert!( num_byte_equiv_classes <= 256, "max number of byte-based equivalent classes is 256, but got {}", num_byte_equiv_classes, ); - Unit::EOI(u16::try_from(num_byte_equiv_classes).unwrap()) + Unit(UnitKind::EOI(u16::try_from(num_byte_equiv_classes).unwrap())) } + /// If this unit is not an "end of input" sentinel, then returns its + /// underlying byte value. Otherwise return `None`. pub fn as_u8(self) -> Option<u8> { - match self { - Unit::U8(b) => Some(b), - Unit::EOI(_) => None, + match self.0 { + UnitKind::U8(b) => Some(b), + UnitKind::EOI(_) => None, } } - #[cfg(feature = "alloc")] - pub fn as_eoi(self) -> Option<usize> { - match self { - Unit::U8(_) => None, - Unit::EOI(eoi) => Some(eoi as usize), + /// If this unit is an "end of input" sentinel, then return the underlying + /// sentinel value that was given to [`Unit::eoi`]. Otherwise return + /// `None`. + pub fn as_eoi(self) -> Option<u16> { + match self.0 { + UnitKind::U8(_) => None, + UnitKind::EOI(sentinel) => Some(sentinel), } } + /// Return this unit as a `usize`, regardless of whether it is a byte value + /// or an "end of input" sentinel. In the latter case, the underlying + /// sentinel value given to [`Unit::eoi`] is returned. pub fn as_usize(self) -> usize { - match self { - Unit::U8(b) => b as usize, - Unit::EOI(eoi) => eoi as usize, + match self.0 { + UnitKind::U8(b) => usize::from(b), + UnitKind::EOI(eoi) => usize::from(eoi), } } - pub fn is_eoi(&self) -> bool { - match *self { - Unit::EOI(_) => true, - _ => false, - } + /// Returns true if and only of this unit is a byte value equivalent to the + /// byte given. This always returns false when this is an "end of input" + /// sentinel. + pub fn is_byte(self, byte: u8) -> bool { + self.as_u8().map_or(false, |b| b == byte) } - #[cfg(feature = "alloc")] - pub fn is_word_byte(&self) -> bool { - self.as_u8().map_or(false, crate::util::is_word_byte) + /// Returns true when this unit represents an "end of input" sentinel. + pub fn is_eoi(self) -> bool { + self.as_eoi().is_some() + } + + /// Returns true when this unit corresponds to an ASCII word byte. + /// + /// This always returns false when this unit represents an "end of input" + /// sentinel. + pub fn is_word_byte(self) -> bool { + self.as_u8().map_or(false, crate::util::utf8::is_word_byte) } } impl core::fmt::Debug for Unit { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - match *self { - Unit::U8(b) => write!(f, "{:?}", DebugByte(b)), - Unit::EOI(_) => write!(f, "EOI"), + match self.0 { + UnitKind::U8(b) => write!(f, "{:?}", DebugByte(b)), + UnitKind::EOI(_) => write!(f, "EOI"), } } } @@ -113,23 +187,48 @@ impl core::fmt::Debug for Unit { /// This is used in a DFA to reduce the size of the transition table. This can /// have a particularly large impact not only on the total size of a dense DFA, /// but also on compile times. +/// +/// The essential idea here is that the alphabet of a DFA is shrunk from the +/// usual 256 distinct byte values down to a set of equivalence classes. The +/// guarantee you get is that any byte belonging to the same equivalence class +/// can be treated as if it were any other byte in the same class, and the +/// result of a search wouldn't change. +/// +/// # Example +/// +/// This example shows how to get byte classes from an +/// [`NFA`](crate::nfa::thompson::NFA) and ask for the class of various bytes. 
+/// +/// ``` +/// use regex_automata::nfa::thompson::NFA; +/// +/// let nfa = NFA::new("[a-z]+")?; +/// let classes = nfa.byte_classes(); +/// // 'a' and 'z' are in the same class for this regex. +/// assert_eq!(classes.get(b'a'), classes.get(b'z')); +/// // But 'a' and 'A' are not. +/// assert_ne!(classes.get(b'a'), classes.get(b'A')); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` #[derive(Clone, Copy)] pub struct ByteClasses([u8; 256]); impl ByteClasses { /// Creates a new set of equivalence classes where all bytes are mapped to /// the same class. + #[inline] pub fn empty() -> ByteClasses { ByteClasses([0; 256]) } /// Creates a new set of equivalence classes where each byte belongs to /// its own equivalence class. - #[cfg(feature = "alloc")] + #[inline] pub fn singletons() -> ByteClasses { let mut classes = ByteClasses::empty(); - for i in 0..256 { - classes.set(i as u8, i as u8); + for b in 0..=255 { + classes.set(b, b); } classes } @@ -139,18 +238,19 @@ impl ByteClasses { /// an error is returned. Upon success, the number of bytes read along with /// the map are returned. The number of bytes read is always a multiple of /// 8. - pub fn from_bytes( + pub(crate) fn from_bytes( slice: &[u8], ) -> Result<(ByteClasses, usize), DeserializeError> { - if slice.len() < 256 { - return Err(DeserializeError::buffer_too_small("byte class map")); - } + wire::check_slice_len(slice, 256, "byte class map")?; let mut classes = ByteClasses::empty(); for (b, &class) in slice[..256].iter().enumerate() { - classes.set(b as u8, class); + classes.set(u8::try_from(b).unwrap(), class); } - for b in classes.iter() { - if b.as_usize() >= classes.alphabet_len() { + // We specifically don't use 'classes.iter()' here because that + // iterator depends on 'classes.alphabet_len()' being correct. But that + // is precisely the thing we're trying to verify below! + for &b in classes.0.iter() { + if usize::from(b) >= classes.alphabet_len() { return Err(DeserializeError::generic( "found equivalence class greater than alphabet len", )); @@ -163,7 +263,7 @@ impl ByteClasses { /// buffer is too small, then an error is returned. Upon success, the total /// number of bytes written is returned. The number of bytes written is /// guaranteed to be a multiple of 8. - pub fn write_to( + pub(crate) fn write_to( &self, mut dst: &mut [u8], ) -> Result<usize, SerializeError> { @@ -179,41 +279,38 @@ impl ByteClasses { } /// Returns the total number of bytes written by `write_to`. - pub fn write_to_len(&self) -> usize { + pub(crate) fn write_to_len(&self) -> usize { 256 } /// Set the equivalence class for the given byte. #[inline] pub fn set(&mut self, byte: u8, class: u8) { - self.0[byte as usize] = class; + self.0[usize::from(byte)] = class; } /// Get the equivalence class for the given byte. #[inline] pub fn get(&self, byte: u8) -> u8 { - self.0[byte as usize] - } - - /// Get the equivalence class for the given byte while forcefully - /// eliding bounds checks. - #[inline] - pub unsafe fn get_unchecked(&self, byte: u8) -> u8 { - *self.0.get_unchecked(byte as usize) + self.0[usize::from(byte)] } - /// Get the equivalence class for the given input unit and return the + /// Get the equivalence class for the given haystack unit and return the /// class as a `usize`. 
#[inline] pub fn get_by_unit(&self, unit: Unit) -> usize { - match unit { - Unit::U8(b) => usize::try_from(self.get(b)).unwrap(), - Unit::EOI(b) => usize::try_from(b).unwrap(), + match unit.0 { + UnitKind::U8(b) => usize::from(self.get(b)), + UnitKind::EOI(b) => usize::from(b), } } + /// Create a unit that represents the "end of input" sentinel based on the + /// number of equivalence classes. #[inline] pub fn eoi(&self) -> Unit { + // The alphabet length already includes the EOI sentinel, hence why + // we subtract 1. Unit::eoi(self.alphabet_len().checked_sub(1).unwrap()) } @@ -225,49 +322,153 @@ impl ByteClasses { // Add one since the number of equivalence classes is one bigger than // the last one. But add another to account for the final EOI class // that isn't explicitly represented. - self.0[255] as usize + 1 + 1 + usize::from(self.0[255]) + 1 + 1 } /// Returns the stride, as a base-2 exponent, required for these /// equivalence classes. /// /// The stride is always the smallest power of 2 that is greater than or - /// equal to the alphabet length. This is done so that converting between - /// state IDs and indices can be done with shifts alone, which is much - /// faster than integer division. - #[cfg(feature = "alloc")] + /// equal to the alphabet length, and the `stride2` returned here is the + /// exponent applied to `2` to get the smallest power. This is done so that + /// converting between premultiplied state IDs and indices can be done with + /// shifts alone, which is much faster than integer division. + #[inline] pub fn stride2(&self) -> usize { - self.alphabet_len().next_power_of_two().trailing_zeros() as usize + let zeros = self.alphabet_len().next_power_of_two().trailing_zeros(); + usize::try_from(zeros).unwrap() } /// Returns true if and only if every byte in this class maps to its own /// equivalence class. Equivalently, there are 257 equivalence classes - /// and each class contains exactly one byte (plus the special EOI class). + /// and each class contains either exactly one byte or corresponds to the + /// singleton class containing the "end of input" sentinel. #[inline] pub fn is_singleton(&self) -> bool { self.alphabet_len() == 257 } /// Returns an iterator over all equivalence classes in this set. + #[inline] pub fn iter(&self) -> ByteClassIter<'_> { ByteClassIter { classes: self, i: 0 } } /// Returns an iterator over a sequence of representative bytes from each - /// equivalence class. Namely, this yields exactly N items, where N is - /// equivalent to the number of equivalence classes. Each item is an - /// arbitrary byte drawn from each equivalence class. + /// equivalence class within the range of bytes given. + /// + /// When the given range is unbounded on both sides, the iterator yields + /// exactly N items, where N is equivalent to the number of equivalence + /// classes. Each item is an arbitrary byte drawn from each equivalence + /// class. /// /// This is useful when one is determinizing an NFA and the NFA's alphabet - /// hasn't been converted to equivalence classes yet. Picking an arbitrary - /// byte from each equivalence class then permits a full exploration of - /// the NFA instead of using every possible byte value. - #[cfg(feature = "alloc")] - pub fn representatives(&self) -> ByteClassRepresentatives<'_> { - ByteClassRepresentatives { classes: self, byte: 0, last_class: None } + /// hasn't been converted to equivalence classes. 
Picking an arbitrary byte + /// from each equivalence class then permits a full exploration of the NFA + /// instead of using every possible byte value and thus potentially saves + /// quite a lot of redundant work. + /// + /// # Example + /// + /// This shows an example of what a complete sequence of representatives + /// might look like from a real example. + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit}; + /// + /// let nfa = NFA::new("[a-z]+")?; + /// let classes = nfa.byte_classes(); + /// let reps: Vec<Unit> = classes.representatives(..).collect(); + /// // Note that the specific byte values yielded are not guaranteed! + /// let expected = vec![ + /// Unit::u8(b'\x00'), + /// Unit::u8(b'a'), + /// Unit::u8(b'{'), + /// Unit::eoi(3), + /// ]; + /// assert_eq!(expected, reps); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Note though, that you can ask for an arbitrary range of bytes, and only + /// representatives for that range will be returned: + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit}; + /// + /// let nfa = NFA::new("[a-z]+")?; + /// let classes = nfa.byte_classes(); + /// let reps: Vec<Unit> = classes.representatives(b'A'..=b'z').collect(); + /// // Note that the specific byte values yielded are not guaranteed! + /// let expected = vec![ + /// Unit::u8(b'A'), + /// Unit::u8(b'a'), + /// ]; + /// assert_eq!(expected, reps); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn representatives<R: core::ops::RangeBounds<u8>>( + &self, + range: R, + ) -> ByteClassRepresentatives<'_> { + use core::ops::Bound; + + let cur_byte = match range.start_bound() { + Bound::Included(&i) => usize::from(i), + Bound::Excluded(&i) => usize::from(i).checked_add(1).unwrap(), + Bound::Unbounded => 0, + }; + let end_byte = match range.end_bound() { + Bound::Included(&i) => { + Some(usize::from(i).checked_add(1).unwrap()) + } + Bound::Excluded(&i) => Some(usize::from(i)), + Bound::Unbounded => None, + }; + assert_ne!( + cur_byte, + usize::MAX, + "start range must be less than usize::MAX", + ); + ByteClassRepresentatives { + classes: self, + cur_byte, + end_byte, + last_class: None, + } } /// Returns an iterator of the bytes in the given equivalence class. + /// + /// This is useful when one needs to know the actual bytes that belong to + /// an equivalence class. For example, conceptually speaking, accelerating + /// a DFA state occurs when a state only has a few outgoing transitions. + /// But in reality, what is required is that there are only a small + /// number of distinct bytes that can lead to an outgoing transition. The + /// difference is that any one transition can correspond to an equivalence + /// class which may contains many bytes. Therefore, DFA state acceleration + /// considers the actual elements in each equivalence class of each + /// outgoing transition. + /// + /// # Example + /// + /// This shows an example of how to get all of the elements in an + /// equivalence class. 
+ /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit}; + /// + /// let nfa = NFA::new("[a-z]+")?; + /// let classes = nfa.byte_classes(); + /// let elements: Vec<Unit> = classes.elements(Unit::u8(1)).collect(); + /// let expected: Vec<Unit> = (b'a'..=b'z').map(Unit::u8).collect(); + /// assert_eq!(expected, elements); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] pub fn elements(&self, class: Unit) -> ByteClassElements { ByteClassElements { classes: self, class, byte: 0 } } @@ -281,6 +482,12 @@ impl ByteClasses { } } +impl Default for ByteClasses { + fn default() -> ByteClasses { + ByteClasses::singletons() + } +} + impl core::fmt::Debug for ByteClasses { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { if self.is_singleton() { @@ -307,6 +514,13 @@ impl core::fmt::Debug for ByteClasses { } /// An iterator over each equivalence class. +/// +/// The last element in this iterator always corresponds to [`Unit::eoi`]. +/// +/// This is created by the [`ByteClasses::iter`] method. +/// +/// The lifetime `'a` refers to the lifetime of the byte classes that this +/// iterator was created from. #[derive(Debug)] pub struct ByteClassIter<'a> { classes: &'a ByteClasses, @@ -321,7 +535,7 @@ impl<'a> Iterator for ByteClassIter<'a> { self.i += 1; Some(self.classes.eoi()) } else if self.i < self.classes.alphabet_len() { - let class = self.i as u8; + let class = u8::try_from(self.i).unwrap(); self.i += 1; Some(Unit::u8(class)) } else { @@ -331,31 +545,44 @@ impl<'a> Iterator for ByteClassIter<'a> { } /// An iterator over representative bytes from each equivalence class. -#[cfg(feature = "alloc")] +/// +/// This is created by the [`ByteClasses::representatives`] method. +/// +/// The lifetime `'a` refers to the lifetime of the byte classes that this +/// iterator was created from. #[derive(Debug)] pub struct ByteClassRepresentatives<'a> { classes: &'a ByteClasses, - byte: usize, + cur_byte: usize, + end_byte: Option<usize>, last_class: Option<u8>, } -#[cfg(feature = "alloc")] impl<'a> Iterator for ByteClassRepresentatives<'a> { type Item = Unit; fn next(&mut self) -> Option<Unit> { - while self.byte < 256 { - let byte = self.byte as u8; + while self.cur_byte < self.end_byte.unwrap_or(256) { + let byte = u8::try_from(self.cur_byte).unwrap(); let class = self.classes.get(byte); - self.byte += 1; + self.cur_byte += 1; if self.last_class != Some(class) { self.last_class = Some(class); return Some(Unit::u8(byte)); } } - if self.byte == 256 { - self.byte += 1; + if self.cur_byte != usize::MAX && self.end_byte.is_none() { + // Using usize::MAX as a sentinel is OK because we ban usize::MAX + // from appearing as a start bound in iterator construction. But + // why do it this way? Well, we want to return the EOI class + // whenever the end of the given range is unbounded because EOI + // isn't really a "byte" per se, so the only way it should be + // excluded is if there is a bounded end to the range. Therefore, + // when the end is unbounded, we just need to know whether we've + // reported EOI or not. When we do, we set cur_byte to a value it + // can never otherwise be. + self.cur_byte = usize::MAX; return Some(self.classes.eoi()); } None @@ -363,6 +590,11 @@ impl<'a> Iterator for ByteClassRepresentatives<'a> { } /// An iterator over all elements in an equivalence class. +/// +/// This is created by the [`ByteClasses::elements`] method. 
+/// +/// The lifetime `'a` refers to the lifetime of the byte classes that this +/// iterator was created from. #[derive(Debug)] pub struct ByteClassElements<'a> { classes: &'a ByteClasses, @@ -375,9 +607,9 @@ impl<'a> Iterator for ByteClassElements<'a> { fn next(&mut self) -> Option<Unit> { while self.byte < 256 { - let byte = self.byte as u8; + let byte = u8::try_from(self.byte).unwrap(); self.byte += 1; - if self.class.as_u8() == Some(self.classes.get(byte)) { + if self.class.is_byte(self.classes.get(byte)) { return Some(Unit::u8(byte)); } } @@ -394,7 +626,7 @@ impl<'a> Iterator for ByteClassElements<'a> { /// An iterator over all elements in an equivalence class expressed as a /// sequence of contiguous ranges. #[derive(Debug)] -pub struct ByteClassElementRanges<'a> { +struct ByteClassElementRanges<'a> { elements: ByteClassElements<'a>, range: Option<(Unit, Unit)>, } @@ -426,6 +658,8 @@ impl<'a> Iterator for ByteClassElementRanges<'a> { } } +/// A partitioning of bytes into equivalence classes. +/// /// A byte class set keeps track of an *approximation* of equivalence classes /// of bytes during NFA construction. That is, every byte in an equivalence /// class cannot discriminate between a match and a non-match. @@ -446,21 +680,28 @@ impl<'a> Iterator for ByteClassElementRanges<'a> { /// rethinking how equivalence classes are computed, including changing the /// representation here, which is only able to group contiguous bytes into the /// same equivalence class.) +#[cfg(feature = "alloc")] #[derive(Clone, Debug)] -pub struct ByteClassSet(ByteSet); +pub(crate) struct ByteClassSet(ByteSet); +#[cfg(feature = "alloc")] +impl Default for ByteClassSet { + fn default() -> ByteClassSet { + ByteClassSet::empty() + } +} + +#[cfg(feature = "alloc")] impl ByteClassSet { /// Create a new set of byte classes where all bytes are part of the same /// equivalence class. - #[cfg(feature = "alloc")] - pub fn empty() -> Self { + pub(crate) fn empty() -> Self { ByteClassSet(ByteSet::empty()) } /// Indicate the the range of byte given (inclusive) can discriminate a /// match between it and all other bytes outside of the range. - #[cfg(feature = "alloc")] - pub fn set_range(&mut self, start: u8, end: u8) { + pub(crate) fn set_range(&mut self, start: u8, end: u8) { debug_assert!(start <= end); if start > 0 { self.0.add(start - 1); @@ -469,8 +710,7 @@ impl ByteClassSet { } /// Add the contiguous ranges in the set given to this byte class set. - #[cfg(feature = "alloc")] - pub fn add_set(&mut self, set: &ByteSet) { + pub(crate) fn add_set(&mut self, set: &ByteSet) { for (start, end) in set.iter_ranges() { self.set_range(start, end); } @@ -479,8 +719,7 @@ impl ByteClassSet { /// Convert this boolean set to a map that maps all byte values to their /// corresponding equivalence class. The last mapping indicates the largest /// equivalence class identifier (which is never bigger than 255). - #[cfg(feature = "alloc")] - pub fn byte_classes(&self) -> ByteClasses { + pub(crate) fn byte_classes(&self) -> ByteClasses { let mut classes = ByteClasses::empty(); let mut class = 0u8; let mut b = 0u8; @@ -500,7 +739,7 @@ impl ByteClassSet { /// A simple set of bytes that is reasonably cheap to copy and allocation free. #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] -pub struct ByteSet { +pub(crate) struct ByteSet { bits: BitSet, } @@ -511,90 +750,113 @@ struct BitSet([u128; 2]); impl ByteSet { /// Create an empty set of bytes. 
- #[cfg(feature = "alloc")] - pub fn empty() -> ByteSet { + pub(crate) fn empty() -> ByteSet { ByteSet { bits: BitSet([0; 2]) } } /// Add a byte to this set. /// /// If the given byte already belongs to this set, then this is a no-op. - #[cfg(feature = "alloc")] - pub fn add(&mut self, byte: u8) { + pub(crate) fn add(&mut self, byte: u8) { let bucket = byte / 128; let bit = byte % 128; - self.bits.0[bucket as usize] |= 1 << bit; - } - - /// Add an inclusive range of bytes. - #[cfg(feature = "alloc")] - pub fn add_all(&mut self, start: u8, end: u8) { - for b in start..=end { - self.add(b); - } + self.bits.0[usize::from(bucket)] |= 1 << bit; } /// Remove a byte from this set. /// /// If the given byte is not in this set, then this is a no-op. - #[cfg(feature = "alloc")] - pub fn remove(&mut self, byte: u8) { + pub(crate) fn remove(&mut self, byte: u8) { let bucket = byte / 128; let bit = byte % 128; - self.bits.0[bucket as usize] &= !(1 << bit); - } - - /// Remove an inclusive range of bytes. - #[cfg(feature = "alloc")] - pub fn remove_all(&mut self, start: u8, end: u8) { - for b in start..=end { - self.remove(b); - } + self.bits.0[usize::from(bucket)] &= !(1 << bit); } /// Return true if and only if the given byte is in this set. - pub fn contains(&self, byte: u8) -> bool { + pub(crate) fn contains(&self, byte: u8) -> bool { let bucket = byte / 128; let bit = byte % 128; - self.bits.0[bucket as usize] & (1 << bit) > 0 + self.bits.0[usize::from(bucket)] & (1 << bit) > 0 } /// Return true if and only if the given inclusive range of bytes is in /// this set. - #[cfg(feature = "alloc")] - pub fn contains_range(&self, start: u8, end: u8) -> bool { + pub(crate) fn contains_range(&self, start: u8, end: u8) -> bool { (start..=end).all(|b| self.contains(b)) } /// Returns an iterator over all bytes in this set. - #[cfg(feature = "alloc")] - pub fn iter(&self) -> ByteSetIter { + pub(crate) fn iter(&self) -> ByteSetIter { ByteSetIter { set: self, b: 0 } } /// Returns an iterator over all contiguous ranges of bytes in this set. - #[cfg(feature = "alloc")] - pub fn iter_ranges(&self) -> ByteSetRangeIter { + pub(crate) fn iter_ranges(&self) -> ByteSetRangeIter { ByteSetRangeIter { set: self, b: 0 } } - /// Return the number of bytes in this set. - #[cfg(feature = "alloc")] - pub fn len(&self) -> usize { - (self.bits.0[0].count_ones() + self.bits.0[1].count_ones()) as usize - } - /// Return true if and only if this set is empty. - #[cfg(feature = "alloc")] - pub fn is_empty(&self) -> bool { + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_empty(&self) -> bool { self.bits.0 == [0, 0] } + + /// Deserializes a byte set from the given slice. If the slice is of + /// incorrect length or is otherwise malformed, then an error is returned. + /// Upon success, the number of bytes read along with the set are returned. + /// The number of bytes read is always a multiple of 8. + pub(crate) fn from_bytes( + slice: &[u8], + ) -> Result<(ByteSet, usize), DeserializeError> { + use core::mem::size_of; + + wire::check_slice_len(slice, 2 * size_of::<u128>(), "byte set")?; + let mut nread = 0; + let (low, nr) = wire::try_read_u128(slice, "byte set low bucket")?; + nread += nr; + let (high, nr) = wire::try_read_u128(slice, "byte set high bucket")?; + nread += nr; + Ok((ByteSet { bits: BitSet([low, high]) }, nread)) + } + + /// Writes this byte set to the given byte buffer. If the given buffer is + /// too small, then an error is returned. 
Upon success, the total number of + /// bytes written is returned. The number of bytes written is guaranteed to + /// be a multiple of 8. + pub(crate) fn write_to<E: crate::util::wire::Endian>( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + use core::mem::size_of; + + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("byte set")); + } + let mut nw = 0; + E::write_u128(self.bits.0[0], &mut dst[nw..]); + nw += size_of::<u128>(); + E::write_u128(self.bits.0[1], &mut dst[nw..]); + nw += size_of::<u128>(); + assert_eq!(nwrite, nw, "expected to write certain number of bytes",); + assert_eq!( + nw % 8, + 0, + "expected to write multiple of 8 bytes for byte set", + ); + Ok(nw) + } + + /// Returns the total number of bytes written by `write_to`. + pub(crate) fn write_to_len(&self) -> usize { + 2 * core::mem::size_of::<u128>() + } } impl core::fmt::Debug for BitSet { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let mut fmtd = f.debug_set(); - for b in (0..256).map(|b| b as u8) { + for b in 0u8..=255 { if (ByteSet { bits: *self }).contains(b) { fmtd.entry(&b); } @@ -604,7 +866,7 @@ impl core::fmt::Debug for BitSet { } #[derive(Debug)] -pub struct ByteSetIter<'a> { +pub(crate) struct ByteSetIter<'a> { set: &'a ByteSet, b: usize, } @@ -614,7 +876,7 @@ impl<'a> Iterator for ByteSetIter<'a> { fn next(&mut self) -> Option<u8> { while self.b <= 255 { - let b = self.b as u8; + let b = u8::try_from(self.b).unwrap(); self.b += 1; if self.set.contains(b) { return Some(b); @@ -625,7 +887,7 @@ impl<'a> Iterator for ByteSetIter<'a> { } #[derive(Debug)] -pub struct ByteSetRangeIter<'a> { +pub(crate) struct ByteSetRangeIter<'a> { set: &'a ByteSet, b: usize, } @@ -634,16 +896,17 @@ impl<'a> Iterator for ByteSetRangeIter<'a> { type Item = (u8, u8); fn next(&mut self) -> Option<(u8, u8)> { + let asu8 = |n: usize| u8::try_from(n).unwrap(); while self.b <= 255 { - let start = self.b as u8; + let start = asu8(self.b); self.b += 1; if !self.set.contains(start) { continue; } let mut end = start; - while self.b <= 255 && self.set.contains(self.b as u8) { - end = self.b as u8; + while self.b <= 255 && self.set.contains(asu8(self.b)) { + end = asu8(self.b); self.b += 1; } return Some((start, end)); @@ -652,8 +915,7 @@ impl<'a> Iterator for ByteSetRangeIter<'a> { } } -#[cfg(test)] -#[cfg(feature = "alloc")] +#[cfg(all(test, feature = "alloc"))] mod tests { use alloc::{vec, vec::Vec}; @@ -694,8 +956,8 @@ mod tests { #[test] fn full_byte_classes() { let mut set = ByteClassSet::empty(); - for i in 0..256u16 { - set.set_range(i as u8, i as u8); + for b in 0u8..=255 { + set.set_range(b, b); } assert_eq!(set.byte_classes().alphabet_len(), 257); } @@ -787,4 +1049,91 @@ mod tests { let elements = classes.elements(Unit::eoi(1)).collect::<Vec<_>>(); assert_eq!(elements, vec![Unit::eoi(256)]); } + + #[test] + fn representatives() { + let mut set = ByteClassSet::empty(); + set.set_range(b'b', b'd'); + set.set_range(b'g', b'm'); + set.set_range(b'z', b'z'); + let classes = set.byte_classes(); + + let got: Vec<Unit> = classes.representatives(..).collect(); + let expected = vec![ + Unit::u8(b'\x00'), + Unit::u8(b'b'), + Unit::u8(b'e'), + Unit::u8(b'g'), + Unit::u8(b'n'), + Unit::u8(b'z'), + Unit::u8(b'\x7B'), + Unit::eoi(7), + ]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(..0).collect(); + assert!(got.is_empty()); + let got: Vec<Unit> = classes.representatives(1..1).collect(); + assert!(got.is_empty()); + let 
got: Vec<Unit> = classes.representatives(255..255).collect(); + assert!(got.is_empty()); + + // A weird case that is the only guaranteed to way to get an iterator + // of just the EOI class by excluding all possible byte values. + let got: Vec<Unit> = classes + .representatives(( + core::ops::Bound::Excluded(255), + core::ops::Bound::Unbounded, + )) + .collect(); + let expected = vec![Unit::eoi(7)]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(..=255).collect(); + let expected = vec![ + Unit::u8(b'\x00'), + Unit::u8(b'b'), + Unit::u8(b'e'), + Unit::u8(b'g'), + Unit::u8(b'n'), + Unit::u8(b'z'), + Unit::u8(b'\x7B'), + ]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'b'..=b'd').collect(); + let expected = vec![Unit::u8(b'b')]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'a'..=b'd').collect(); + let expected = vec![Unit::u8(b'a'), Unit::u8(b'b')]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'b'..=b'e').collect(); + let expected = vec![Unit::u8(b'b'), Unit::u8(b'e')]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'A'..=b'Z').collect(); + let expected = vec![Unit::u8(b'A')]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'A'..=b'z').collect(); + let expected = vec![ + Unit::u8(b'A'), + Unit::u8(b'b'), + Unit::u8(b'e'), + Unit::u8(b'g'), + Unit::u8(b'n'), + Unit::u8(b'z'), + ]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'z'..).collect(); + let expected = vec![Unit::u8(b'z'), Unit::u8(b'\x7B'), Unit::eoi(7)]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'z'..=0xFF).collect(); + let expected = vec![Unit::u8(b'z'), Unit::u8(b'\x7B')]; + assert_eq!(expected, got); + } } diff --git a/vendor/regex-automata/src/util/captures.rs b/vendor/regex-automata/src/util/captures.rs new file mode 100644 index 000000000..cd3a5f8f7 --- /dev/null +++ b/vendor/regex-automata/src/util/captures.rs @@ -0,0 +1,2547 @@ +/*! +Provides types for dealing with capturing groups. + +Capturing groups refer to sub-patterns of regexes that some regex engines can +report matching offsets for. For example, matching `[a-z]([0-9]+)` against +`a789` would give `a789` as the overall match (for the implicit capturing group +at index `0`) and `789` as the match for the capturing group `([0-9]+)` (an +explicit capturing group at index `1`). + +Not all regex engines can report match offsets for capturing groups. Indeed, +to a first approximation, regex engines that can report capturing group offsets +tend to be quite a bit slower than regex engines that can't. This is because +tracking capturing groups at search time usually requires more "power" that +in turn adds overhead. + +Other regex implementations might call capturing groups "submatches." + +# Overview + +The main types in this module are: + +* [`Captures`] records the capturing group offsets found during a search. It +provides convenience routines for looking up capturing group offsets by either +index or name. +* [`GroupInfo`] records the mapping between capturing groups and "slots," +where the latter are how capturing groups are recorded during a regex search. +This also keeps a mapping from capturing group name to index, and capture +group index to name. A `GroupInfo` is used by `Captures` internally to +provide a convenient API. 
It is unlikely that you'll use a `GroupInfo` +directly, but for example, if you've compiled an Thompson NFA, then you can use +[`thompson::NFA::group_info`](crate::nfa::thompson::NFA::group_info) to get its +underlying `GroupInfo`. +*/ + +use alloc::{string::String, sync::Arc, vec, vec::Vec}; + +use crate::util::{ + interpolate, + primitives::{ + NonMaxUsize, PatternID, PatternIDError, PatternIDIter, SmallIndex, + }, + search::{Match, Span}, +}; + +/// The span offsets of capturing groups after a match has been found. +/// +/// This type represents the output of regex engines that can report the +/// offsets at which capturing groups matches or "submatches" occur. For +/// example, the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM). When a match +/// occurs, it will at minimum contain the [`PatternID`] of the pattern that +/// matched. Depending upon how it was constructed, it may also contain the +/// start/end offsets of the entire match of the pattern and the start/end +/// offsets of each capturing group that participated in the match. +/// +/// Values of this type are always created for a specific [`GroupInfo`]. It is +/// unspecified behavior to use a `Captures` value in a search with any regex +/// engine that has a different `GroupInfo` than the one the `Captures` were +/// created with. +/// +/// # Constructors +/// +/// There are three constructors for this type that control what kind of +/// information is available upon a match: +/// +/// * [`Captures::all`]: Will store overall pattern match offsets in addition +/// to the offsets of capturing groups that participated in the match. +/// * [`Captures::matches`]: Will store only the overall pattern +/// match offsets. The offsets of capturing groups (even ones that participated +/// in the match) are not available. +/// * [`Captures::empty`]: Will only store the pattern ID that matched. No +/// match offsets are available at all. +/// +/// If you aren't sure which to choose, then pick the first one. The first one +/// is what convenience routines like, +/// [`PikeVM::create_captures`](crate::nfa::thompson::pikevm::PikeVM::create_captures), +/// will use automatically. +/// +/// The main difference between these choices is performance. Namely, if you +/// ask for _less_ information, then the execution of regex search may be able +/// to run more quickly. +/// +/// # Notes +/// +/// It is worth pointing out that this type is not coupled to any one specific +/// regex engine. Instead, its coupling is with [`GroupInfo`], which is the +/// thing that is responsible for mapping capturing groups to "slot" offsets. +/// Slot offsets are indices into a single sequence of memory at which matching +/// haystack offsets for the corresponding group are written by regex engines. 
+/// +/// # Example +/// +/// This example shows how to parse a simple date and extract the components of +/// the date via capturing groups: +/// +/// ``` +/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; +/// +/// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?; +/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); +/// +/// re.captures(&mut cache, "2010-03-14", &mut caps); +/// assert!(caps.is_match()); +/// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); +/// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); +/// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: named capturing groups +/// +/// This example is like the one above, but leverages the ability to name +/// capturing groups in order to make the code a bit clearer: +/// +/// ``` +/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; +/// +/// let re = PikeVM::new(r"^(?P<y>[0-9]{4})-(?P<m>[0-9]{2})-(?P<d>[0-9]{2})$")?; +/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); +/// +/// re.captures(&mut cache, "2010-03-14", &mut caps); +/// assert!(caps.is_match()); +/// assert_eq!(Some(Span::from(0..4)), caps.get_group_by_name("y")); +/// assert_eq!(Some(Span::from(5..7)), caps.get_group_by_name("m")); +/// assert_eq!(Some(Span::from(8..10)), caps.get_group_by_name("d")); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone)] +pub struct Captures { + /// The group info that these capture groups are coupled to. This is what + /// gives the "convenience" of the `Captures` API. Namely, it provides the + /// slot mapping and the name|-->index mapping for capture lookups by name. + group_info: GroupInfo, + /// The ID of the pattern that matched. Regex engines must set this to + /// None when no match occurs. + pid: Option<PatternID>, + /// The slot values, i.e., submatch offsets. + /// + /// In theory, the smallest sequence of slots would be something like + /// `max(groups(pattern) for pattern in regex) * 2`, but instead, we use + /// `sum(groups(pattern) for pattern in regex) * 2`. Why? + /// + /// Well, the former could be used in theory, because we don't generally + /// have any overlapping APIs that involve capturing groups. Therefore, + /// there's technically never any need to have slots set for multiple + /// patterns. However, this might change some day, in which case, we would + /// need to have slots available. + /// + /// The other reason is that during the execution of some regex engines, + /// there exists a point in time where multiple slots for different + /// patterns may be written to before knowing which pattern has matched. + /// Therefore, the regex engines themselves, in order to support multiple + /// patterns correctly, must have all slots available. If `Captures` + /// doesn't have all slots available, then regex engines can't write + /// directly into the caller provided `Captures` and must instead write + /// into some other storage and then copy the slots involved in the match + /// at the end of the search. + /// + /// So overall, at least as of the time of writing, it seems like the path + /// of least resistance is to just require allocating all possible slots + /// instead of the conceptual minimum. 
Another way to justify this is that + /// the most common case is a single pattern, in which case, there is no + /// inefficiency here since the 'max' and 'sum' calculations above are + /// equivalent in that case. + /// + /// N.B. The mapping from group index to slot is maintained by `GroupInfo` + /// and is considered an API guarantee. See `GroupInfo` for more details on + /// that mapping. + /// + /// N.B. `Option<NonMaxUsize>` has the same size as a `usize`. + slots: Vec<Option<NonMaxUsize>>, +} + +impl Captures { + /// Create new storage for the offsets of all matching capturing groups. + /// + /// This routine provides the most information for matches---namely, the + /// spans of matching capturing groups---but also requires the regex search + /// routines to do the most work. + /// + /// It is unspecified behavior to use the returned `Captures` value in a + /// search with a `GroupInfo` other than the one that is provided to this + /// constructor. + /// + /// # Example + /// + /// This example shows that all capturing groups---but only ones that + /// participated in a match---are available to query after a match has + /// been found: + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::captures::Captures, + /// Span, Match, + /// }; + /// + /// let re = PikeVM::new( + /// r"^(?:(?P<lower>[a-z]+)|(?P<upper>[A-Z]+))(?P<digits>[0-9]+)$", + /// )?; + /// let mut cache = re.create_cache(); + /// let mut caps = Captures::all(re.get_nfa().group_info().clone()); + /// + /// re.captures(&mut cache, "ABC123", &mut caps); + /// assert!(caps.is_match()); + /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match()); + /// // The 'lower' group didn't match, so it won't have any offsets. + /// assert_eq!(None, caps.get_group_by_name("lower")); + /// assert_eq!(Some(Span::from(0..3)), caps.get_group_by_name("upper")); + /// assert_eq!(Some(Span::from(3..6)), caps.get_group_by_name("digits")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn all(group_info: GroupInfo) -> Captures { + let slots = group_info.slot_len(); + Captures { group_info, pid: None, slots: vec![None; slots] } + } + + /// Create new storage for only the full match spans of a pattern. This + /// does not include any capturing group offsets. + /// + /// It is unspecified behavior to use the returned `Captures` value in a + /// search with a `GroupInfo` other than the one that is provided to this + /// constructor. + /// + /// # Example + /// + /// This example shows that only overall match offsets are reported when + /// this constructor is used. Accessing any capturing groups other than + /// the 0th will always return `None`. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::captures::Captures, + /// Match, + /// }; + /// + /// let re = PikeVM::new( + /// r"^(?:(?P<lower>[a-z]+)|(?P<upper>[A-Z]+))(?P<digits>[0-9]+)$", + /// )?; + /// let mut cache = re.create_cache(); + /// let mut caps = Captures::matches(re.get_nfa().group_info().clone()); + /// + /// re.captures(&mut cache, "ABC123", &mut caps); + /// assert!(caps.is_match()); + /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match()); + /// // We didn't ask for capturing group offsets, so they aren't available. 
+ /// assert_eq!(None, caps.get_group_by_name("lower")); + /// assert_eq!(None, caps.get_group_by_name("upper")); + /// assert_eq!(None, caps.get_group_by_name("digits")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn matches(group_info: GroupInfo) -> Captures { + // This is OK because we know there are at least this many slots, + // and GroupInfo construction guarantees that the number of slots fits + // into a usize. + let slots = group_info.pattern_len().checked_mul(2).unwrap(); + Captures { group_info, pid: None, slots: vec![None; slots] } + } + + /// Create new storage for only tracking which pattern matched. No offsets + /// are stored at all. + /// + /// It is unspecified behavior to use the returned `Captures` value in a + /// search with a `GroupInfo` other than the one that is provided to this + /// constructor. + /// + /// # Example + /// + /// This example shows that only the pattern that matched can be accessed + /// from a `Captures` value created via this constructor. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::captures::Captures, + /// PatternID, + /// }; + /// + /// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?; + /// let mut cache = re.create_cache(); + /// let mut caps = Captures::empty(re.get_nfa().group_info().clone()); + /// + /// re.captures(&mut cache, "aABCz", &mut caps); + /// assert!(caps.is_match()); + /// assert_eq!(Some(PatternID::must(0)), caps.pattern()); + /// // We didn't ask for any offsets, so they aren't available. + /// assert_eq!(None, caps.get_match()); + /// + /// re.captures(&mut cache, &"aABCz"[1..], &mut caps); + /// assert!(caps.is_match()); + /// assert_eq!(Some(PatternID::must(1)), caps.pattern()); + /// // We didn't ask for any offsets, so they aren't available. + /// assert_eq!(None, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn empty(group_info: GroupInfo) -> Captures { + Captures { group_info, pid: None, slots: vec![] } + } + + /// Returns true if and only if this capturing group represents a match. + /// + /// This is a convenience routine for `caps.pattern().is_some()`. + /// + /// # Example + /// + /// When using the PikeVM (for example), the lightest weight way of + /// detecting whether a match exists is to create capturing groups that + /// only track the ID of the pattern that match (if any): + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::captures::Captures, + /// }; + /// + /// let re = PikeVM::new(r"[a-z]+")?; + /// let mut cache = re.create_cache(); + /// let mut caps = Captures::empty(re.get_nfa().group_info().clone()); + /// + /// re.captures(&mut cache, "aABCz", &mut caps); + /// assert!(caps.is_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_match(&self) -> bool { + self.pid.is_some() + } + + /// Returns the identifier of the pattern that matched when this + /// capturing group represents a match. If no match was found, then this + /// always returns `None`. + /// + /// This returns a pattern ID in precisely the cases in which `is_match` + /// returns `true`. Similarly, the pattern ID returned is always the + /// same pattern ID found in the `Match` returned by `get_match`. 
+ /// + /// # Example + /// + /// When using the PikeVM (for example), the lightest weight way of + /// detecting which pattern matched is to create capturing groups that only + /// track the ID of the pattern that match (if any): + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::captures::Captures, + /// PatternID, + /// }; + /// + /// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?; + /// let mut cache = re.create_cache(); + /// let mut caps = Captures::empty(re.get_nfa().group_info().clone()); + /// + /// re.captures(&mut cache, "ABC", &mut caps); + /// assert_eq!(Some(PatternID::must(1)), caps.pattern()); + /// // Recall that offsets are only available when using a non-empty + /// // Captures value. So even though a match occurred, this returns None! + /// assert_eq!(None, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn pattern(&self) -> Option<PatternID> { + self.pid + } + + /// Returns the pattern ID and the span of the match, if one occurred. + /// + /// This always returns `None` when `Captures` was created with + /// [`Captures::empty`], even if a match was found. + /// + /// If this routine returns a non-`None` value, then `is_match` is + /// guaranteed to return `true` and `pattern` is also guaranteed to return + /// a non-`None` value. + /// + /// # Example + /// + /// This example shows how to get the full match from a search: + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "ABC", &mut caps); + /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn get_match(&self) -> Option<Match> { + Some(Match::new(self.pattern()?, self.get_group(0)?)) + } + + /// Returns the span of a capturing group match corresponding to the group + /// index given, only if both the overall pattern matched and the capturing + /// group participated in that match. + /// + /// This returns `None` if `index` is invalid. `index` is valid if and only + /// if it's less than [`Captures::group_len`] for the matching pattern. + /// + /// This always returns `None` when `Captures` was created with + /// [`Captures::empty`], even if a match was found. This also always + /// returns `None` for any `index > 0` when `Captures` was created with + /// [`Captures::matches`]. + /// + /// If this routine returns a non-`None` value, then `is_match` is + /// guaranteed to return `true`, `pattern` is guaranteed to return a + /// non-`None` value and `get_match` is guaranteed to return a non-`None` + /// value. + /// + /// By convention, the 0th capture group will always return the same + /// span as the span returned by `get_match`. This is because the 0th + /// capture group always corresponds to the entirety of the pattern's + /// match. (It is similarly always unnamed because it is implicit.) This + /// isn't necessarily true of all regex engines. For example, one can + /// hand-compile a [`thompson::NFA`](crate::nfa::thompson::NFA) via a + /// [`thompson::Builder`](crate::nfa::thompson::Builder), which isn't + /// technically forced to make the 0th capturing group always correspond to + /// the entire match. 
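The convention that the 0th group always spans the overall match (for NFAs produced by the regex compiler, as noted above) can be checked directly. This is a sketch rather than part of the diff, reusing the `PikeVM` calls that appear in the surrounding examples:

```rust
use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let re = PikeVM::new(r"[a-z]+([0-9]+)")?;
    let (mut cache, mut caps) = (re.create_cache(), re.create_captures());

    re.captures(&mut cache, "abc123", &mut caps);
    // Group 0 is implicit and covers the same span as the overall match.
    assert_eq!(caps.get_match().map(|m| m.span()), caps.get_group(0));
    // Group 1 is the explicit `([0-9]+)` group.
    assert_eq!(Some(Span::from(3..6)), caps.get_group(1));
    Ok(())
}
```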
+ /// + /// # Example + /// + /// This example shows how to get the capturing groups, by index, from a + /// match: + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match}; + /// + /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Bruce Springsteen", &mut caps); + /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match()); + /// assert_eq!(Some(Span::from(0..5)), caps.get_group(1)); + /// assert_eq!(Some(Span::from(6..17)), caps.get_group(2)); + /// // Looking for a non-existent capturing group will return None: + /// assert_eq!(None, caps.get_group(3)); + /// assert_eq!(None, caps.get_group(9944060567225171988)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn get_group(&self, index: usize) -> Option<Span> { + let pid = self.pattern()?; + // There's a little bit of work needed to map captures to slots in the + // fully general case. But in the overwhelming common case of a single + // pattern, we can just do some simple arithmetic. + let (slot_start, slot_end) = if self.group_info().pattern_len() == 1 { + (index.checked_mul(2)?, index.checked_mul(2)?.checked_add(1)?) + } else { + self.group_info().slots(pid, index)? + }; + let start = self.slots.get(slot_start).copied()??; + let end = self.slots.get(slot_end).copied()??; + Some(Span { start: start.get(), end: end.get() }) + } + + /// Returns the span of a capturing group match corresponding to the group + /// name given, only if both the overall pattern matched and the capturing + /// group participated in that match. + /// + /// This returns `None` if `name` does not correspond to a valid capturing + /// group for the pattern that matched. + /// + /// This always returns `None` when `Captures` was created with + /// [`Captures::empty`], even if a match was found. This also always + /// returns `None` for any `index > 0` when `Captures` was created with + /// [`Captures::matches`]. + /// + /// If this routine returns a non-`None` value, then `is_match` is + /// guaranteed to return `true`, `pattern` is guaranteed to return a + /// non-`None` value and `get_match` is guaranteed to return a non-`None` + /// value. 
+ /// + /// # Example + /// + /// This example shows how to get the capturing groups, by name, from a + /// match: + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match}; + /// + /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Bruce Springsteen", &mut caps); + /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match()); + /// assert_eq!(Some(Span::from(0..5)), caps.get_group_by_name("first")); + /// assert_eq!(Some(Span::from(6..17)), caps.get_group_by_name("last")); + /// // Looking for a non-existent capturing group will return None: + /// assert_eq!(None, caps.get_group_by_name("middle")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn get_group_by_name(&self, name: &str) -> Option<Span> { + let index = self.group_info().to_index(self.pattern()?, name)?; + self.get_group(index) + } + + /// Returns an iterator of possible spans for every capturing group in the + /// matching pattern. + /// + /// If this `Captures` value does not correspond to a match, then the + /// iterator returned yields no elements. + /// + /// Note that the iterator returned yields elements of type `Option<Span>`. + /// A span is present if and only if it corresponds to a capturing group + /// that participated in a match. + /// + /// # Example + /// + /// This example shows how to collect all capturing groups: + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; + /// + /// let re = PikeVM::new( + /// // Matches first/last names, with an optional middle name. + /// r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$", + /// )?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Harry James Potter", &mut caps); + /// assert!(caps.is_match()); + /// let groups: Vec<Option<Span>> = caps.iter().collect(); + /// assert_eq!(groups, vec![ + /// Some(Span::from(0..18)), + /// Some(Span::from(0..5)), + /// Some(Span::from(6..11)), + /// Some(Span::from(12..18)), + /// ]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// This example uses the same regex as the previous example, but with a + /// haystack that omits the middle name. This results in a capturing group + /// that is present in the elements yielded by the iterator but without a + /// match: + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; + /// + /// let re = PikeVM::new( + /// // Matches first/last names, with an optional middle name. 
+ /// r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$", + /// )?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Harry Potter", &mut caps); + /// assert!(caps.is_match()); + /// let groups: Vec<Option<Span>> = caps.iter().collect(); + /// assert_eq!(groups, vec![ + /// Some(Span::from(0..12)), + /// Some(Span::from(0..5)), + /// None, + /// Some(Span::from(6..12)), + /// ]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn iter(&self) -> CapturesPatternIter<'_> { + let names = self + .pattern() + .map_or(GroupInfoPatternNames::empty().enumerate(), |pid| { + self.group_info().pattern_names(pid).enumerate() + }); + CapturesPatternIter { caps: self, names } + } + + /// Return the total number of capturing groups for the matching pattern. + /// + /// If this `Captures` value does not correspond to a match, then this + /// always returns `0`. + /// + /// This always returns the same number of elements yielded by + /// [`Captures::iter`]. That is, the number includes capturing groups even + /// if they don't participate in the match. + /// + /// # Example + /// + /// This example shows how to count the total number of capturing groups + /// associated with a pattern. Notice that it includes groups that did not + /// participate in a match (just like `Captures::iter` does). + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new( + /// // Matches first/last names, with an optional middle name. + /// r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$", + /// )?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Harry Potter", &mut caps); + /// assert_eq!(4, caps.group_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn group_len(&self) -> usize { + let pid = match self.pattern() { + None => return 0, + Some(pid) => pid, + }; + self.group_info().group_len(pid) + } + + /// Returns a reference to the underlying group info on which these + /// captures are based. + /// + /// The difference between `GroupInfo` and `Captures` is that the former + /// defines the structure of capturing groups where as the latter is what + /// stores the actual match information. So where as `Captures` only gives + /// you access to the current match, `GroupInfo` lets you query any + /// information about all capturing groups, even ones for patterns that + /// weren't involved in a match. + /// + /// Note that a `GroupInfo` uses reference counting internally, so it may + /// be cloned cheaply. + /// + /// # Example + /// + /// This example shows how to get all capturing group names from the + /// underlying `GroupInfo`. Notice that we don't even need to run a + /// search. 
+ /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID}; + /// + /// let re = PikeVM::new_many(&[ + /// r"(?P<foo>a)", + /// r"(a)(b)", + /// r"ab", + /// r"(?P<bar>a)(?P<quux>a)", + /// r"(?P<foo>z)", + /// ])?; + /// let caps = re.create_captures(); + /// + /// let expected = vec![ + /// (PatternID::must(0), 0, None), + /// (PatternID::must(0), 1, Some("foo")), + /// (PatternID::must(1), 0, None), + /// (PatternID::must(1), 1, None), + /// (PatternID::must(1), 2, None), + /// (PatternID::must(2), 0, None), + /// (PatternID::must(3), 0, None), + /// (PatternID::must(3), 1, Some("bar")), + /// (PatternID::must(3), 2, Some("quux")), + /// (PatternID::must(4), 0, None), + /// (PatternID::must(4), 1, Some("foo")), + /// ]; + /// // We could also just use 're.get_nfa().group_info()'. + /// let got: Vec<(PatternID, usize, Option<&str>)> = + /// caps.group_info().all_names().collect(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn group_info(&self) -> &GroupInfo { + &self.group_info + } + + /// Interpolates the capture references in `replacement` with the + /// corresponding substrings in `haystack` matched by each reference. The + /// interpolated string is returned. + /// + /// See the [`interpolate` module](interpolate) for documentation on the + /// format of the replacement string. + /// + /// # Example + /// + /// This example shows how to use interpolation, and also shows how it + /// can work with multi-pattern regexes. + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID}; + /// + /// let re = PikeVM::new_many(&[ + /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})", + /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})", + /// ])?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let replacement = "year=$year, month=$month, day=$day"; + /// + /// // This matches the first pattern. + /// let hay = "On 14-03-2010, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let result = caps.interpolate_string(hay, replacement); + /// assert_eq!("year=2010, month=03, day=14", result); + /// + /// // And this matches the second pattern. + /// let hay = "On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let result = caps.interpolate_string(hay, replacement); + /// assert_eq!("year=2010, month=03, day=14", result); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn interpolate_string( + &self, + haystack: &str, + replacement: &str, + ) -> String { + let mut dst = String::new(); + self.interpolate_string_into(haystack, replacement, &mut dst); + dst + } + + /// Interpolates the capture references in `replacement` with the + /// corresponding substrings in `haystack` matched by each reference. The + /// interpolated string is written to `dst`. + /// + /// See the [`interpolate` module](interpolate) for documentation on the + /// format of the replacement string. + /// + /// # Example + /// + /// This example shows how to use interpolation, and also shows how it + /// can work with multi-pattern regexes. 
+ /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID}; + /// + /// let re = PikeVM::new_many(&[ + /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})", + /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})", + /// ])?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let replacement = "year=$year, month=$month, day=$day"; + /// + /// // This matches the first pattern. + /// let hay = "On 14-03-2010, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let mut dst = String::new(); + /// caps.interpolate_string_into(hay, replacement, &mut dst); + /// assert_eq!("year=2010, month=03, day=14", dst); + /// + /// // And this matches the second pattern. + /// let hay = "On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let mut dst = String::new(); + /// caps.interpolate_string_into(hay, replacement, &mut dst); + /// assert_eq!("year=2010, month=03, day=14", dst); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn interpolate_string_into( + &self, + haystack: &str, + replacement: &str, + dst: &mut String, + ) { + interpolate::string( + replacement, + |index, dst| { + let span = match self.get_group(index) { + None => return, + Some(span) => span, + }; + dst.push_str(&haystack[span]); + }, + |name| self.group_info().to_index(self.pattern()?, name), + dst, + ); + } + + /// Interpolates the capture references in `replacement` with the + /// corresponding substrings in `haystack` matched by each reference. The + /// interpolated byte string is returned. + /// + /// See the [`interpolate` module](interpolate) for documentation on the + /// format of the replacement string. + /// + /// # Example + /// + /// This example shows how to use interpolation, and also shows how it + /// can work with multi-pattern regexes. + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID}; + /// + /// let re = PikeVM::new_many(&[ + /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})", + /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})", + /// ])?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let replacement = b"year=$year, month=$month, day=$day"; + /// + /// // This matches the first pattern. + /// let hay = b"On 14-03-2010, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let result = caps.interpolate_bytes(hay, replacement); + /// assert_eq!(&b"year=2010, month=03, day=14"[..], result); + /// + /// // And this matches the second pattern. + /// let hay = b"On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let result = caps.interpolate_bytes(hay, replacement); + /// assert_eq!(&b"year=2010, month=03, day=14"[..], result); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn interpolate_bytes( + &self, + haystack: &[u8], + replacement: &[u8], + ) -> Vec<u8> { + let mut dst = vec![]; + self.interpolate_bytes_into(haystack, replacement, &mut dst); + dst + } + + /// Interpolates the capture references in `replacement` with the + /// corresponding substrings in `haystack` matched by each reference. The + /// interpolated byte string is written to `dst`. + /// + /// See the [`interpolate` module](interpolate) for documentation on the + /// format of the replacement string. 
+ /// + /// # Example + /// + /// This example shows how to use interpolation, and also shows how it + /// can work with multi-pattern regexes. + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID}; + /// + /// let re = PikeVM::new_many(&[ + /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})", + /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})", + /// ])?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let replacement = b"year=$year, month=$month, day=$day"; + /// + /// // This matches the first pattern. + /// let hay = b"On 14-03-2010, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let mut dst = vec![]; + /// caps.interpolate_bytes_into(hay, replacement, &mut dst); + /// assert_eq!(&b"year=2010, month=03, day=14"[..], dst); + /// + /// // And this matches the second pattern. + /// let hay = b"On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let mut dst = vec![]; + /// caps.interpolate_bytes_into(hay, replacement, &mut dst); + /// assert_eq!(&b"year=2010, month=03, day=14"[..], dst); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn interpolate_bytes_into( + &self, + haystack: &[u8], + replacement: &[u8], + dst: &mut Vec<u8>, + ) { + interpolate::bytes( + replacement, + |index, dst| { + let span = match self.get_group(index) { + None => return, + Some(span) => span, + }; + dst.extend_from_slice(&haystack[span]); + }, + |name| self.group_info().to_index(self.pattern()?, name), + dst, + ); + } + + /// This is a convenience routine for extracting the substrings + /// corresponding to matching capture groups in the given `haystack`. The + /// `haystack` should be the same substring used to find the match spans in + /// this `Captures` value. + /// + /// This is identical to [`Captures::extract_bytes`], except it works with + /// `&str` instead of `&[u8]`. + /// + /// # Panics + /// + /// This panics if the number of explicit matching groups in this + /// `Captures` value is less than `N`. This also panics if this `Captures` + /// value does not correspond to a match. + /// + /// Note that this does *not* panic if the number of explicit matching + /// groups is bigger than `N`. In that case, only the first `N` matching + /// groups are extracted. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})")?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let hay = "On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// assert!(caps.is_match()); + /// let (full, [year, month, day]) = caps.extract(hay); + /// assert_eq!("2010-03-14", full); + /// assert_eq!("2010", year); + /// assert_eq!("03", month); + /// assert_eq!("14", day); + /// + /// // We can also ask for fewer than all capture groups. 
+ /// let (full, [year]) = caps.extract(hay); + /// assert_eq!("2010-03-14", full); + /// assert_eq!("2010", year); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn extract<'h, const N: usize>( + &self, + haystack: &'h str, + ) -> (&'h str, [&'h str; N]) { + let mut matched = self.iter().flatten(); + let whole_match = &haystack[matched.next().expect("a match")]; + let group_matches = [0; N].map(|_| { + let sp = matched.next().expect("too few matching groups"); + &haystack[sp] + }); + (whole_match, group_matches) + } + + /// This is a convenience routine for extracting the substrings + /// corresponding to matching capture groups in the given `haystack`. The + /// `haystack` should be the same substring used to find the match spans in + /// this `Captures` value. + /// + /// This is identical to [`Captures::extract`], except it works with + /// `&[u8]` instead of `&str`. + /// + /// # Panics + /// + /// This panics if the number of explicit matching groups in this + /// `Captures` value is less than `N`. This also panics if this `Captures` + /// value does not correspond to a match. + /// + /// Note that this does *not* panic if the number of explicit matching + /// groups is bigger than `N`. In that case, only the first `N` matching + /// groups are extracted. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})")?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let hay = b"On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// assert!(caps.is_match()); + /// let (full, [year, month, day]) = caps.extract_bytes(hay); + /// assert_eq!(b"2010-03-14", full); + /// assert_eq!(b"2010", year); + /// assert_eq!(b"03", month); + /// assert_eq!(b"14", day); + /// + /// // We can also ask for fewer than all capture groups. + /// let (full, [year]) = caps.extract_bytes(hay); + /// assert_eq!(b"2010-03-14", full); + /// assert_eq!(b"2010", year); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn extract_bytes<'h, const N: usize>( + &self, + haystack: &'h [u8], + ) -> (&'h [u8], [&'h [u8]; N]) { + let mut matched = self.iter().flatten(); + let whole_match = &haystack[matched.next().expect("a match")]; + let group_matches = [0; N].map(|_| { + let sp = matched.next().expect("too few matching groups"); + &haystack[sp] + }); + (whole_match, group_matches) + } +} + +/// Lower level "slot" oriented APIs. One does not typically need to use these +/// when executing a search. They are instead mostly intended for folks that +/// are writing their own regex engine while reusing this `Captures` type. +impl Captures { + /// Clear this `Captures` value. + /// + /// After clearing, all slots inside this `Captures` value will be set to + /// `None`. Similarly, any pattern ID that it was previously associated + /// with (for a match) is erased. + /// + /// It is not usually necessary to call this routine. Namely, a `Captures` + /// value only provides high level access to the capturing groups of the + /// pattern that matched, and only low level access to individual slots. + /// Thus, even if slots corresponding to groups that aren't associated + /// with the matching pattern are set, then it won't impact the higher + /// level APIs. 
Namely, higher level APIs like [`Captures::get_group`] will + /// return `None` if no pattern ID is present, even if there are spans set + /// in the underlying slots. + /// + /// Thus, to "clear" a `Captures` value of a match, it is usually only + /// necessary to call [`Captures::set_pattern`] with `None`. + /// + /// # Example + /// + /// This example shows what happens when a `Captures` value is cleared. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Bruce Springsteen", &mut caps); + /// assert!(caps.is_match()); + /// let slots: Vec<Option<usize>> = + /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect(); + /// // Note that the following ordering is considered an API guarantee. + /// assert_eq!(slots, vec![ + /// Some(0), + /// Some(17), + /// Some(0), + /// Some(5), + /// Some(6), + /// Some(17), + /// ]); + /// + /// // Now clear the slots. Everything is gone and it is no longer a match. + /// caps.clear(); + /// assert!(!caps.is_match()); + /// let slots: Vec<Option<usize>> = + /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect(); + /// assert_eq!(slots, vec![ + /// None, + /// None, + /// None, + /// None, + /// None, + /// None, + /// ]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn clear(&mut self) { + self.pid = None; + for slot in self.slots.iter_mut() { + *slot = None; + } + } + + /// Set the pattern on this `Captures` value. + /// + /// When the pattern ID is `None`, then this `Captures` value does not + /// correspond to a match (`is_match` will return `false`). Otherwise, it + /// corresponds to a match. + /// + /// This is useful in search implementations where you might want to + /// initially call `set_pattern(None)` in order to avoid the cost of + /// calling `clear()` if it turns out to not be necessary. + /// + /// # Example + /// + /// This example shows that `set_pattern` merely overwrites the pattern ID. + /// It does not actually change the underlying slot values. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Bruce Springsteen", &mut caps); + /// assert!(caps.is_match()); + /// assert!(caps.pattern().is_some()); + /// let slots: Vec<Option<usize>> = + /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect(); + /// // Note that the following ordering is considered an API guarantee. + /// assert_eq!(slots, vec![ + /// Some(0), + /// Some(17), + /// Some(0), + /// Some(5), + /// Some(6), + /// Some(17), + /// ]); + /// + /// // Now set the pattern to None. Note that the slot values remain. + /// caps.set_pattern(None); + /// assert!(!caps.is_match()); + /// assert!(!caps.pattern().is_some()); + /// let slots: Vec<Option<usize>> = + /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect(); + /// // Note that the following ordering is considered an API guarantee. 
+ /// assert_eq!(slots, vec![ + /// Some(0), + /// Some(17), + /// Some(0), + /// Some(5), + /// Some(6), + /// Some(17), + /// ]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn set_pattern(&mut self, pid: Option<PatternID>) { + self.pid = pid; + } + + /// Returns the underlying slots, where each slot stores a single offset. + /// + /// Every matching capturing group generally corresponds to two slots: one + /// slot for the starting position and another for the ending position. + /// Typically, either both are present or neither are. (The weasel word + /// "typically" is used here because it really depends on the regex engine + /// implementation. Every sensible regex engine likely adheres to this + /// invariant, and every regex engine in this crate is sensible.) + /// + /// Generally speaking, callers should prefer to use higher level routines + /// like [`Captures::get_match`] or [`Captures::get_group`]. + /// + /// An important note here is that a regex engine may not reset all of the + /// slots to `None` values when no match occurs, or even when a match of + /// a different pattern occurs. But this depends on how the regex engine + /// implementation deals with slots. + /// + /// # Example + /// + /// This example shows how to get the underlying slots from a regex match. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::primitives::{PatternID, NonMaxUsize}, + /// }; + /// + /// let re = PikeVM::new_many(&[ + /// r"[a-z]+", + /// r"[0-9]+", + /// ])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "123", &mut caps); + /// assert_eq!(Some(PatternID::must(1)), caps.pattern()); + /// // Note that the only guarantee we have here is that slots 2 and 3 + /// // are set to correct values. The contents of the first two slots are + /// // unspecified since the 0th pattern did not match. + /// let expected = &[ + /// None, + /// None, + /// NonMaxUsize::new(0), + /// NonMaxUsize::new(3), + /// ]; + /// assert_eq!(expected, caps.slots()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn slots(&self) -> &[Option<NonMaxUsize>] { + &self.slots + } + + /// Returns the underlying slots as a mutable slice, where each slot stores + /// a single offset. + /// + /// This tends to be most useful for regex engine implementations for + /// writing offsets for matching capturing groups to slots. + /// + /// See [`Captures::slots`] for more information about slots. + #[inline] + pub fn slots_mut(&mut self) -> &mut [Option<NonMaxUsize>] { + &mut self.slots + } +} + +impl core::fmt::Debug for Captures { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut dstruct = f.debug_struct("Captures"); + dstruct.field("pid", &self.pid); + if let Some(pid) = self.pid { + dstruct.field("spans", &CapturesDebugMap { pid, caps: self }); + } + dstruct.finish() + } +} + +/// A little helper type to provide a nice map-like debug representation for +/// our capturing group spans. 
+struct CapturesDebugMap<'a> { + pid: PatternID, + caps: &'a Captures, +} + +impl<'a> core::fmt::Debug for CapturesDebugMap<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + struct Key<'a>(usize, Option<&'a str>); + + impl<'a> core::fmt::Debug for Key<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{}", self.0)?; + if let Some(name) = self.1 { + write!(f, "/{:?}", name)?; + } + Ok(()) + } + } + + let mut map = f.debug_map(); + let names = self.caps.group_info().pattern_names(self.pid); + for (group_index, maybe_name) in names.enumerate() { + let key = Key(group_index, maybe_name); + match self.caps.get_group(group_index) { + None => map.entry(&key, &None::<()>), + Some(span) => map.entry(&key, &span), + }; + } + map.finish() + } +} + +/// An iterator over all capturing groups in a `Captures` value. +/// +/// This iterator includes capturing groups that did not participate in a +/// match. See the [`Captures::iter`] method documentation for more details +/// and examples. +/// +/// The lifetime parameter `'a` refers to the lifetime of the underlying +/// `Captures` value. +#[derive(Clone, Debug)] +pub struct CapturesPatternIter<'a> { + caps: &'a Captures, + names: core::iter::Enumerate<GroupInfoPatternNames<'a>>, +} + +impl<'a> Iterator for CapturesPatternIter<'a> { + type Item = Option<Span>; + + fn next(&mut self) -> Option<Option<Span>> { + let (group_index, _) = self.names.next()?; + Some(self.caps.get_group(group_index)) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.names.size_hint() + } + + fn count(self) -> usize { + self.names.count() + } +} + +impl<'a> ExactSizeIterator for CapturesPatternIter<'a> {} +impl<'a> core::iter::FusedIterator for CapturesPatternIter<'a> {} + +/// Represents information about capturing groups in a compiled regex. +/// +/// The information encapsulated by this type consists of the following. For +/// each pattern: +/// +/// * A map from every capture group name to its corresponding capture group +/// index. +/// * A map from every capture group index to its corresponding capture group +/// name. +/// * A map from capture group index to its corresponding slot index. A slot +/// refers to one half of a capturing group. That is, a capture slot is either +/// the start or end of a capturing group. A slot is usually the mechanism +/// by which a regex engine records offsets for each capturing group during a +/// search. +/// +/// A `GroupInfo` uses reference counting internally and is thus cheap to +/// clone. +/// +/// # Mapping from capture groups to slots +/// +/// One of the main responsibilities of a `GroupInfo` is to build a mapping +/// from `(PatternID, u32)` (where the `u32` is a capture index) to something +/// called a "slot." As mentioned above, a slot refers to one half of a +/// capturing group. Both combined provide the start and end offsets of +/// a capturing group that participated in a match. +/// +/// **The mapping between group indices and slots is an API guarantee.** That +/// is, the mapping won't change within a semver compatible release. +/// +/// Slots exist primarily because this is a convenient mechanism by which +/// regex engines report group offsets at search time. For example, the +/// [`nfa::thompson::State::Capture`](crate::nfa::thompson::State::Capture) +/// NFA state includes the slot index. 
When a regex engine transitions through +/// this state, it will likely use the slot index to write the current haystack +/// offset to some region of memory. When a match is found, those slots are +/// then reported to the caller, typically via a convenient abstraction like a +/// [`Captures`] value. +/// +/// Because this crate provides first class support for multi-pattern regexes, +/// and because of some performance related reasons, the mapping between +/// capturing groups and slots is a little complex. However, in the case of a +/// single pattern, the mapping can be described very simply: for all capture +/// group indices `i`, its corresponding slots are at `i * 2` and `i * 2 + 1`. +/// Notice that the pattern ID isn't involved at all here, because it only +/// applies to a single-pattern regex, it is therefore always `0`. +/// +/// In the multi-pattern case, the mapping is a bit more complicated. To talk +/// about it, we must define what we mean by "implicit" vs "explicit" +/// capturing groups: +/// +/// * An **implicit** capturing group refers to the capturing group that is +/// present for every pattern automatically, and corresponds to the overall +/// match of a pattern. Every pattern has precisely one implicit capturing +/// group. It is always unnamed and it always corresponds to the capture group +/// index `0`. +/// * An **explicit** capturing group refers to any capturing group that +/// appears in the concrete syntax of the pattern. (Or, if an NFA was hand +/// built without any concrete syntax, it refers to any capturing group with an +/// index greater than `0`.) +/// +/// Some examples: +/// +/// * `\w+` has one implicit capturing group and zero explicit capturing +/// groups. +/// * `(\w+)` has one implicit group and one explicit group. +/// * `foo(\d+)(?:\pL+)(\d+)` has one implicit group and two explicit groups. +/// +/// Turning back to the slot mapping, we can now state it as follows: +/// +/// * Given a pattern ID `pid`, the slots for its implicit group are always +/// at `pid * 2` and `pid * 2 + 1`. +/// * Given a pattern ID `0`, the slots for its explicit groups start +/// at `group_info.pattern_len() * 2`. +/// * Given a pattern ID `pid > 0`, the slots for its explicit groups start +/// immediately following where the slots for the explicit groups of `pid - 1` +/// end. +/// +/// In particular, while there is a concrete formula one can use to determine +/// where the slots for the implicit group of any pattern are, there is no +/// general formula for determining where the slots for explicit capturing +/// groups are. This is because each pattern can contain a different number +/// of groups. +/// +/// The intended way of getting the slots for a particular capturing group +/// (whether implicit or explicit) is via the [`GroupInfo::slot`] or +/// [`GroupInfo::slots`] method. +/// +/// See below for a concrete example of how capturing groups get mapped to +/// slots. +/// +/// # Example +/// +/// This example shows how to build a new `GroupInfo` and query it for +/// information. +/// +/// ``` +/// use regex_automata::util::{captures::GroupInfo, primitives::PatternID}; +/// +/// let info = GroupInfo::new(vec![ +/// vec![None, Some("foo")], +/// vec![None], +/// vec![None, None, None, Some("bar"), None], +/// vec![None, None, Some("foo")], +/// ])?; +/// // The number of patterns being tracked. +/// assert_eq!(4, info.pattern_len()); +/// // We can query the number of groups for any pattern. 
+/// assert_eq!(2, info.group_len(PatternID::must(0))); +/// assert_eq!(1, info.group_len(PatternID::must(1))); +/// assert_eq!(5, info.group_len(PatternID::must(2))); +/// assert_eq!(3, info.group_len(PatternID::must(3))); +/// // An invalid pattern always has zero groups. +/// assert_eq!(0, info.group_len(PatternID::must(999))); +/// // 2 slots per group +/// assert_eq!(22, info.slot_len()); +/// +/// // We can map a group index for a particular pattern to its name, if +/// // one exists. +/// assert_eq!(Some("foo"), info.to_name(PatternID::must(3), 2)); +/// assert_eq!(None, info.to_name(PatternID::must(2), 4)); +/// // Or map a name to its group index. +/// assert_eq!(Some(1), info.to_index(PatternID::must(0), "foo")); +/// assert_eq!(Some(2), info.to_index(PatternID::must(3), "foo")); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: mapping from capture groups to slots +/// +/// This example shows the specific mapping from capture group indices for +/// each pattern to their corresponding slots. The slot values shown in this +/// example are considered an API guarantee. +/// +/// ``` +/// use regex_automata::util::{captures::GroupInfo, primitives::PatternID}; +/// +/// let info = GroupInfo::new(vec![ +/// vec![None, Some("foo")], +/// vec![None], +/// vec![None, None, None, Some("bar"), None], +/// vec![None, None, Some("foo")], +/// ])?; +/// +/// // We first show the slots for each pattern's implicit group. +/// assert_eq!(Some((0, 1)), info.slots(PatternID::must(0), 0)); +/// assert_eq!(Some((2, 3)), info.slots(PatternID::must(1), 0)); +/// assert_eq!(Some((4, 5)), info.slots(PatternID::must(2), 0)); +/// assert_eq!(Some((6, 7)), info.slots(PatternID::must(3), 0)); +/// +/// // And now we show the slots for each pattern's explicit group. +/// assert_eq!(Some((8, 9)), info.slots(PatternID::must(0), 1)); +/// assert_eq!(Some((10, 11)), info.slots(PatternID::must(2), 1)); +/// assert_eq!(Some((12, 13)), info.slots(PatternID::must(2), 2)); +/// assert_eq!(Some((14, 15)), info.slots(PatternID::must(2), 3)); +/// assert_eq!(Some((16, 17)), info.slots(PatternID::must(2), 4)); +/// assert_eq!(Some((18, 19)), info.slots(PatternID::must(3), 1)); +/// assert_eq!(Some((20, 21)), info.slots(PatternID::must(3), 2)); +/// +/// // Asking for the slots for an invalid pattern ID or even for an invalid +/// // group index for a specific pattern will return None. So for example, +/// // you're guaranteed to not get the slots for a different pattern than the +/// // one requested. +/// assert_eq!(None, info.slots(PatternID::must(5), 0)); +/// assert_eq!(None, info.slots(PatternID::must(1), 1)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug, Default)] +pub struct GroupInfo(Arc<GroupInfoInner>); + +impl GroupInfo { + /// Creates a new group info from a sequence of patterns, where each + /// sequence of patterns yields a sequence of possible group names. The + /// index of each pattern in the sequence corresponds to its `PatternID`, + /// and the index of each group in each pattern's sequence corresponds to + /// its corresponding group index. + /// + /// While this constructor is very generic and therefore perhaps hard to + /// chew on, an example of a valid concrete type that can be passed to + /// this constructor is `Vec<Vec<Option<String>>>`. The outer `Vec` + /// corresponds to the patterns, i.e., one `Vec<Option<String>>` per + /// pattern. The inner `Vec` corresponds to the capturing groups for + /// each pattern. 
The `Option<String>` corresponds to the name of the + /// capturing group, if present. + /// + /// It is legal to pass an empty iterator to this constructor. It will + /// return an empty group info with zero slots. An empty group info is + /// useful for cases where you have no patterns or for cases where slots + /// aren't being used at all (e.g., for most DFAs in this crate). + /// + /// # Errors + /// + /// This constructor returns an error if the given capturing groups are + /// invalid in some way. Those reasons include, but are not necessarily + /// limited to: + /// + /// * Too many patterns (i.e., `PatternID` would overflow). + /// * Too many capturing groups (e.g., `u32` would overflow). + /// * A pattern is given that has no capturing groups. (All patterns must + /// have at least an implicit capturing group at index `0`.) + /// * The capturing group at index `0` has a name. It must be unnamed. + /// * There are duplicate capturing group names within the same pattern. + /// (Multiple capturing groups with the same name may exist, but they + /// must be in different patterns.) + /// + /// An example below shows how to trigger some of the above error + /// conditions. + /// + /// # Example + /// + /// This example shows how to build a new `GroupInfo` and query it for + /// information. + /// + /// ``` + /// use regex_automata::util::captures::GroupInfo; + /// + /// let info = GroupInfo::new(vec![ + /// vec![None, Some("foo")], + /// vec![None], + /// vec![None, None, None, Some("bar"), None], + /// vec![None, None, Some("foo")], + /// ])?; + /// // The number of patterns being tracked. + /// assert_eq!(4, info.pattern_len()); + /// // 2 slots per group + /// assert_eq!(22, info.slot_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: empty `GroupInfo` + /// + /// This example shows how to build a new `GroupInfo` and query it for + /// information. + /// + /// ``` + /// use regex_automata::util::captures::GroupInfo; + /// + /// let info = GroupInfo::empty(); + /// // Everything is zero. + /// assert_eq!(0, info.pattern_len()); + /// assert_eq!(0, info.slot_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: error conditions + /// + /// This example shows how to provoke some of the ways in which building + /// a `GroupInfo` can fail. + /// + /// ``` + /// use regex_automata::util::captures::GroupInfo; + /// + /// // Either the group info is empty, or all patterns must have at least + /// // one capturing group. + /// assert!(GroupInfo::new(vec![ + /// vec![None, Some("a")], // ok + /// vec![None], // ok + /// vec![], // not ok + /// ]).is_err()); + /// // Note that building an empty group info is OK. + /// assert!(GroupInfo::new(Vec::<Vec<Option<String>>>::new()).is_ok()); + /// + /// // The first group in each pattern must correspond to an implicit + /// // anonymous group. i.e., One that is not named. By convention, this + /// // group corresponds to the overall match of a regex. Every other group + /// // in a pattern is explicit and optional. + /// assert!(GroupInfo::new(vec![vec![Some("foo")]]).is_err()); + /// + /// // There must not be duplicate group names within the same pattern. + /// assert!(GroupInfo::new(vec![ + /// vec![None, Some("foo"), Some("foo")], + /// ]).is_err()); + /// // But duplicate names across distinct patterns is OK. 
+ /// assert!(GroupInfo::new(vec![
+ /// vec![None, Some("foo")],
+ /// vec![None, Some("foo")],
+ /// ]).is_ok());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// There are other ways for building a `GroupInfo` to fail, but they are
+ /// difficult to show. For example, if the number of patterns given would
+ /// overflow `PatternID`.
+ pub fn new<P, G, N>(pattern_groups: P) -> Result<GroupInfo, GroupInfoError>
+ where
+ P: IntoIterator<Item = G>,
+ G: IntoIterator<Item = Option<N>>,
+ N: AsRef<str>,
+ {
+ let mut group_info = GroupInfoInner {
+ slot_ranges: vec![],
+ name_to_index: vec![],
+ index_to_name: vec![],
+ memory_extra: 0,
+ };
+ for (pattern_index, groups) in pattern_groups.into_iter().enumerate() {
+ // If we can't convert the pattern index to an ID, then the caller
+ // tried to build capture info for too many patterns.
+ let pid = PatternID::new(pattern_index)
+ .map_err(GroupInfoError::too_many_patterns)?;
+
+ let mut groups_iter = groups.into_iter().enumerate();
+ match groups_iter.next() {
+ None => return Err(GroupInfoError::missing_groups(pid)),
+ Some((_, Some(_))) => {
+ return Err(GroupInfoError::first_must_be_unnamed(pid))
+ }
+ Some((_, None)) => {}
+ }
+ group_info.add_first_group(pid);
+ // Now iterate over the rest, which correspond to all of the
+ // (conventionally) explicit capture groups in a regex pattern.
+ for (group_index, maybe_name) in groups_iter {
+ // Just like for patterns, if the group index can't be
+ // converted to a "small" index, then the caller has given too
+ // many groups for a particular pattern.
+ let group = SmallIndex::new(group_index).map_err(|_| {
+ GroupInfoError::too_many_groups(pid, group_index)
+ })?;
+ group_info.add_explicit_group(pid, group, maybe_name)?;
+ }
+ }
+ group_info.fixup_slot_ranges()?;
+ Ok(GroupInfo(Arc::new(group_info)))
+ }
+
+ /// This creates an empty `GroupInfo`.
+ ///
+ /// This is a convenience routine for calling `GroupInfo::new` with an
+ /// iterator that yields no elements.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a new empty `GroupInfo` and query it
+ /// for information.
+ ///
+ /// ```
+ /// use regex_automata::util::captures::GroupInfo;
+ ///
+ /// let info = GroupInfo::empty();
+ /// // Everything is zero.
+ /// assert_eq!(0, info.pattern_len());
+ /// assert_eq!(0, info.all_group_len());
+ /// assert_eq!(0, info.slot_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn empty() -> GroupInfo {
+ GroupInfo::new(core::iter::empty::<[Option<&str>; 0]>())
+ .expect("empty group info is always valid")
+ }
+
+ /// Return the capture group index corresponding to the given name in the
+ /// given pattern. If no such capture group name exists in the given
+ /// pattern, then this returns `None`.
+ ///
+ /// If the given pattern ID is invalid, then this returns `None`.
+ ///
+ /// This also returns `None` for all inputs if these captures are empty
+ /// (e.g., built from an empty [`GroupInfo`]). To check whether captures
+ /// are present for a specific pattern, use [`GroupInfo::group_len`].
+ ///
+ /// # Example
+ ///
+ /// This example shows how to find the capture index for the given pattern
+ /// and group name.
+ ///
+ /// Remember that capture indices are relative to the pattern, such that
+ /// the same capture index value may refer to different capturing groups
+ /// for distinct patterns.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{nfa::thompson::NFA, PatternID};
+ ///
+ /// let (pid0, pid1) = (PatternID::must(0), PatternID::must(1));
+ ///
+ /// let nfa = NFA::new_many(&[
+ /// r"a(?P<quux>\w+)z(?P<foo>\s+)",
+ /// r"a(?P<foo>\d+)z",
+ /// ])?;
+ /// let groups = nfa.group_info();
+ /// assert_eq!(Some(2), groups.to_index(pid0, "foo"));
+ /// // Recall that capture index 0 is always unnamed and refers to the
+ /// // entire pattern. So the first capturing group present in the pattern
+ /// // itself always starts at index 1.
+ /// assert_eq!(Some(1), groups.to_index(pid1, "foo"));
+ ///
+ /// // And if a name does not exist for a particular pattern, None is
+ /// // returned.
+ /// assert!(groups.to_index(pid0, "quux").is_some());
+ /// assert!(groups.to_index(pid1, "quux").is_none());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn to_index(&self, pid: PatternID, name: &str) -> Option<usize> {
+ let indices = self.0.name_to_index.get(pid.as_usize())?;
+ indices.get(name).cloned().map(|i| i.as_usize())
+ }
+
+ /// Return the capture name for the given index and given pattern. If the
+ /// corresponding group does not have a name, then this returns `None`.
+ ///
+ /// If the pattern ID is invalid, then this returns `None`.
+ ///
+ /// If the group index is invalid for the given pattern, then this returns
+ /// `None`. A group `index` is valid for a pattern `pid` in an `nfa` if and
+ /// only if `index < nfa.pattern_capture_len(pid)`.
+ ///
+ /// This also returns `None` for all inputs if these captures are empty
+ /// (e.g., built from an empty [`GroupInfo`]). To check whether captures
+ /// are present for a specific pattern, use [`GroupInfo::group_len`].
+ ///
+ /// # Example
+ ///
+ /// This example shows how to find the capture group name for the given
+ /// pattern and group index.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{nfa::thompson::NFA, PatternID};
+ ///
+ /// let (pid0, pid1) = (PatternID::must(0), PatternID::must(1));
+ ///
+ /// let nfa = NFA::new_many(&[
+ /// r"a(?P<foo>\w+)z(\s+)x(\d+)",
+ /// r"a(\d+)z(?P<foo>\s+)",
+ /// ])?;
+ /// let groups = nfa.group_info();
+ /// assert_eq!(None, groups.to_name(pid0, 0));
+ /// assert_eq!(Some("foo"), groups.to_name(pid0, 1));
+ /// assert_eq!(None, groups.to_name(pid0, 2));
+ /// assert_eq!(None, groups.to_name(pid0, 3));
+ ///
+ /// assert_eq!(None, groups.to_name(pid1, 0));
+ /// assert_eq!(None, groups.to_name(pid1, 1));
+ /// assert_eq!(Some("foo"), groups.to_name(pid1, 2));
+ /// // '3' is not a valid capture index for the second pattern.
+ /// assert_eq!(None, groups.to_name(pid1, 3));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn to_name(&self, pid: PatternID, group_index: usize) -> Option<&str> {
+ let pattern_names = self.0.index_to_name.get(pid.as_usize())?;
+ pattern_names.get(group_index)?.as_deref()
+ }
+
+ /// Return an iterator of all capture groups and their names (if present)
+ /// for a particular pattern.
+ ///
+ /// If the given pattern ID is invalid or if this `GroupInfo` is empty,
+ /// then the iterator yields no elements.
+ ///
+ /// The number of elements yielded by this iterator is always equal to
+ /// the result of calling [`GroupInfo::group_len`] with the same
+ /// `PatternID`.
+ /// + /// # Example + /// + /// This example shows how to get a list of all capture group names for + /// a particular pattern. + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let nfa = NFA::new(r"(a)(?P<foo>b)(c)(d)(?P<bar>e)")?; + /// // The first is the implicit group that is always unnammed. The next + /// // 5 groups are the explicit groups found in the concrete syntax above. + /// let expected = vec![None, None, Some("foo"), None, None, Some("bar")]; + /// let got: Vec<Option<&str>> = + /// nfa.group_info().pattern_names(PatternID::ZERO).collect(); + /// assert_eq!(expected, got); + /// + /// // Using an invalid pattern ID will result in nothing yielded. + /// let got = nfa.group_info().pattern_names(PatternID::must(999)).count(); + /// assert_eq!(0, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn pattern_names(&self, pid: PatternID) -> GroupInfoPatternNames<'_> { + GroupInfoPatternNames { + it: self + .0 + .index_to_name + .get(pid.as_usize()) + .map(|indices| indices.iter()) + .unwrap_or([].iter()), + } + } + + /// Return an iterator of all capture groups for all patterns supported by + /// this `GroupInfo`. Each item yielded is a triple of the group's pattern + /// ID, index in the pattern and the group's name, if present. + /// + /// # Example + /// + /// This example shows how to get a list of all capture groups found in + /// one NFA, potentially spanning multiple patterns. + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let nfa = NFA::new_many(&[ + /// r"(?P<foo>a)", + /// r"a", + /// r"(a)", + /// ])?; + /// let expected = vec![ + /// (PatternID::must(0), 0, None), + /// (PatternID::must(0), 1, Some("foo")), + /// (PatternID::must(1), 0, None), + /// (PatternID::must(2), 0, None), + /// (PatternID::must(2), 1, None), + /// ]; + /// let got: Vec<(PatternID, usize, Option<&str>)> = + /// nfa.group_info().all_names().collect(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Unlike other capturing group related routines, this routine doesn't + /// panic even if captures aren't enabled on this NFA: + /// + /// ``` + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; + /// + /// let nfa = NFA::compiler() + /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// .build_many(&[ + /// r"(?P<foo>a)", + /// r"a", + /// r"(a)", + /// ])?; + /// // When captures aren't enabled, there's nothing to return. + /// assert_eq!(0, nfa.group_info().all_names().count()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn all_names(&self) -> GroupInfoAllNames<'_> { + GroupInfoAllNames { + group_info: self, + pids: PatternID::iter(self.pattern_len()), + current_pid: None, + names: None, + } + } + + /// Returns the starting and ending slot corresponding to the given + /// capturing group for the given pattern. The ending slot is always one + /// more than the starting slot returned. + /// + /// Note that this is like [`GroupInfo::slot`], except that it also returns + /// the ending slot value for convenience. + /// + /// If either the pattern ID or the capture index is invalid, then this + /// returns None. + /// + /// # Example + /// + /// This example shows that the starting slots for the first capturing + /// group of each pattern are distinct. 
+ /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let nfa = NFA::new_many(&["a", "b"])?; + /// assert_ne!( + /// nfa.group_info().slots(PatternID::must(0), 0), + /// nfa.group_info().slots(PatternID::must(1), 0), + /// ); + /// + /// // Also, the start and end slot values are never equivalent. + /// let (start, end) = nfa.group_info().slots(PatternID::ZERO, 0).unwrap(); + /// assert_ne!(start, end); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn slots( + &self, + pid: PatternID, + group_index: usize, + ) -> Option<(usize, usize)> { + // Since 'slot' only even returns valid starting slots, we know that + // there must also be an end slot and that end slot is always one more + // than the start slot. + self.slot(pid, group_index).map(|start| (start, start + 1)) + } + + /// Returns the starting slot corresponding to the given capturing group + /// for the given pattern. The ending slot is always one more than the + /// value returned. + /// + /// If either the pattern ID or the capture index is invalid, then this + /// returns None. + /// + /// # Example + /// + /// This example shows that the starting slots for the first capturing + /// group of each pattern are distinct. + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let nfa = NFA::new_many(&["a", "b"])?; + /// assert_ne!( + /// nfa.group_info().slot(PatternID::must(0), 0), + /// nfa.group_info().slot(PatternID::must(1), 0), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn slot(&self, pid: PatternID, group_index: usize) -> Option<usize> { + if group_index >= self.group_len(pid) { + return None; + } + // At this point, we know that 'pid' refers to a real pattern and that + // 'group_index' refers to a real group. We therefore also know that + // the pattern and group can be combined to return a correct slot. + // That's why we don't need to use checked arithmetic below. + if group_index == 0 { + Some(pid.as_usize() * 2) + } else { + // As above, we don't need to check that our slot is less than the + // end of our range since we already know the group index is a + // valid index for the given pattern. + let (start, _) = self.0.slot_ranges[pid]; + Some(start.as_usize() + ((group_index - 1) * 2)) + } + } + + /// Returns the total number of patterns in this `GroupInfo`. + /// + /// This may return zero if the `GroupInfo` was constructed with no + /// patterns. + /// + /// This is guaranteed to be no bigger than [`PatternID::LIMIT`] because + /// `GroupInfo` construction will fail if too many patterns are added. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::NFA; + /// + /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// assert_eq!(3, nfa.group_info().pattern_len()); + /// + /// let nfa = NFA::never_match(); + /// assert_eq!(0, nfa.group_info().pattern_len()); + /// + /// let nfa = NFA::always_match(); + /// assert_eq!(1, nfa.group_info().pattern_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn pattern_len(&self) -> usize { + self.0.pattern_len() + } + + /// Return the number of capture groups in a pattern. + /// + /// If the pattern ID is invalid, then this returns `0`. + /// + /// # Example + /// + /// This example shows how the values returned by this routine may vary + /// for different patterns and NFA configurations. 
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID};
+ ///
+ /// let nfa = NFA::new(r"(a)(b)(c)")?;
+ /// // There are 3 explicit groups in the pattern's concrete syntax and
+ /// // 1 unnamed and implicit group spanning the entire pattern.
+ /// assert_eq!(4, nfa.group_info().group_len(PatternID::ZERO));
+ ///
+ /// let nfa = NFA::new(r"abc")?;
+ /// // There is just the unnamed implicit group.
+ /// assert_eq!(1, nfa.group_info().group_len(PatternID::ZERO));
+ ///
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().which_captures(WhichCaptures::None))
+ /// .build(r"abc")?;
+ /// // We disabled capturing groups, so there are none.
+ /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO));
+ ///
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().which_captures(WhichCaptures::None))
+ /// .build(r"(a)(b)(c)")?;
+ /// // We disabled capturing groups, so there are none, even if there are
+ /// // explicit groups in the concrete syntax.
+ /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn group_len(&self, pid: PatternID) -> usize {
+ self.0.group_len(pid)
+ }
+
+ /// Return the total number of capture groups across all patterns.
+ ///
+ /// This includes implicit groups that represent the entire match of a
+ /// pattern.
+ ///
+ /// # Example
+ ///
+ /// This example shows how the values returned by this routine may vary
+ /// for different patterns and NFA configurations.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID};
+ ///
+ /// let nfa = NFA::new(r"(a)(b)(c)")?;
+ /// // There are 3 explicit groups in the pattern's concrete syntax and
+ /// // 1 unnamed and implicit group spanning the entire pattern.
+ /// assert_eq!(4, nfa.group_info().all_group_len());
+ ///
+ /// let nfa = NFA::new(r"abc")?;
+ /// // There is just the unnamed implicit group.
+ /// assert_eq!(1, nfa.group_info().all_group_len());
+ ///
+ /// let nfa = NFA::new_many(&["(a)", "b", "(c)"])?;
+ /// // Each pattern has one implicit group, and two of the
+ /// // patterns have one explicit group each.
+ /// assert_eq!(5, nfa.group_info().all_group_len());
+ ///
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().which_captures(WhichCaptures::None))
+ /// .build(r"abc")?;
+ /// // We disabled capturing groups, so there are none.
+ /// assert_eq!(0, nfa.group_info().all_group_len());
+ ///
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().which_captures(WhichCaptures::None))
+ /// .build(r"(a)(b)(c)")?;
+ /// // We disabled capturing groups, so there are none, even if there are
+ /// // explicit groups in the concrete syntax.
+ /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn all_group_len(&self) -> usize {
+ self.slot_len() / 2
+ }
+
+ /// Returns the total number of slots in this `GroupInfo` across all
+ /// patterns.
+ ///
+ /// The total number of slots is always twice the total number of capturing
+ /// groups, including both implicit and explicit groups.
+ ///
+ /// # Example
+ ///
+ /// This example shows the relationship between the number of capturing
+ /// groups and slots.
+ ///
+ /// ```
+ /// use regex_automata::util::captures::GroupInfo;
+ ///
+ /// // There are 11 total groups here.
+ /// let info = GroupInfo::new(vec![
+ /// vec![None, Some("foo")],
+ /// vec![None],
+ /// vec![None, None, None, Some("bar"), None],
+ /// vec![None, None, Some("foo")],
+ /// ])?;
+ /// // 2 slots per group gives us 11*2=22 slots.
+ /// assert_eq!(22, info.slot_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn slot_len(&self) -> usize {
+ self.0.small_slot_len().as_usize()
+ }
+
+ /// Returns the total number of slots for implicit capturing groups.
+ ///
+ /// This is like [`GroupInfo::slot_len`], except it doesn't include the
+ /// explicit slots for each pattern. Since there are always exactly 2
+ /// implicit slots for each pattern, the number of implicit slots is always
+ /// equal to twice the number of patterns.
+ ///
+ /// # Example
+ ///
+ /// This example shows the relationship between the number of capturing
+ /// groups, implicit slots and explicit slots.
+ ///
+ /// ```
+ /// use regex_automata::util::captures::GroupInfo;
+ ///
+ /// // There are 3 total groups here.
+ /// let info = GroupInfo::new(vec![vec![None, Some("foo"), Some("bar")]])?;
+ /// // 2 slots per group gives us 3*2=6 slots.
+ /// assert_eq!(6, info.slot_len());
+ /// // 2 implicit slots per pattern gives us 2 implicit slots since there
+ /// // is 1 pattern.
+ /// assert_eq!(2, info.implicit_slot_len());
+ /// // 2 explicit capturing groups gives us 2*2=4 explicit slots.
+ /// assert_eq!(4, info.explicit_slot_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn implicit_slot_len(&self) -> usize {
+ self.pattern_len() * 2
+ }
+
+ /// Returns the total number of slots for explicit capturing groups.
+ ///
+ /// This is like [`GroupInfo::slot_len`], except it doesn't include the
+ /// implicit slots for each pattern. (There are always 2 implicit slots for
+ /// each pattern.)
+ ///
+ /// For a non-empty `GroupInfo`, it is always the case that `slot_len` is
+ /// strictly greater than `explicit_slot_len`. For an empty `GroupInfo`,
+ /// both the total number of slots and the number of explicit slots are
+ /// `0`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the relationship between the number of capturing
+ /// groups, implicit slots and explicit slots.
+ ///
+ /// ```
+ /// use regex_automata::util::captures::GroupInfo;
+ ///
+ /// // There are 3 total groups here.
+ /// let info = GroupInfo::new(vec![vec![None, Some("foo"), Some("bar")]])?;
+ /// // 2 slots per group gives us 3*2=6 slots.
+ /// assert_eq!(6, info.slot_len());
+ /// // 2 implicit slots per pattern gives us 2 implicit slots since there
+ /// // is 1 pattern.
+ /// assert_eq!(2, info.implicit_slot_len());
+ /// // 2 explicit capturing groups gives us 2*2=4 explicit slots.
+ /// assert_eq!(4, info.explicit_slot_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn explicit_slot_len(&self) -> usize {
+ self.slot_len().saturating_sub(self.implicit_slot_len())
+ }
+
+ /// Returns the memory usage, in bytes, of this `GroupInfo`.
+ ///
+ /// This does **not** include the stack size used up by this `GroupInfo`.
+ /// To compute that, use `std::mem::size_of::<GroupInfo>()`.
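+ ///
+ /// # Example
+ ///
+ /// This sketch only checks that the reported usage is non-zero: the exact
+ /// number depends on allocation and representation details and is not an
+ /// API guarantee.
+ ///
+ /// ```
+ /// use regex_automata::util::captures::GroupInfo;
+ ///
+ /// let info = GroupInfo::new(vec![vec![None, Some("foo")]])?;
+ /// // The heap memory behind the group info is never zero bytes.
+ /// assert!(info.memory_usage() > 0);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```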
+ #[inline] + pub fn memory_usage(&self) -> usize { + use core::mem::size_of as s; + + s::<GroupInfoInner>() + + self.0.slot_ranges.len() * s::<(SmallIndex, SmallIndex)>() + + self.0.name_to_index.len() * s::<CaptureNameMap>() + + self.0.index_to_name.len() * s::<Vec<Option<Arc<str>>>>() + + self.0.memory_extra + } +} + +/// A map from capture group name to its corresponding capture group index. +/// +/// This type is actually wrapped inside a Vec indexed by pattern ID on a +/// `GroupInfo`, since multiple patterns may have the same capture group name. +/// That is, each pattern gets its own namespace of capture group names. +/// +/// Perhaps a more memory efficient representation would be +/// HashMap<(PatternID, Arc<str>), usize>, but this makes it difficult to look +/// up a capture index by name without producing a `Arc<str>`, which requires +/// an allocation. To fix this, I think we'd need to define our own unsized +/// type or something? Anyway, I didn't give this much thought since it +/// probably doesn't matter much in the grand scheme of things. But it did +/// stand out to me as mildly wasteful. +#[cfg(feature = "std")] +type CaptureNameMap = std::collections::HashMap<Arc<str>, SmallIndex>; +#[cfg(not(feature = "std"))] +type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, SmallIndex>; + +/// The inner guts of `GroupInfo`. This type only exists so that it can +/// be wrapped in an `Arc` to make `GroupInfo` reference counted. +#[derive(Debug, Default)] +struct GroupInfoInner { + slot_ranges: Vec<(SmallIndex, SmallIndex)>, + name_to_index: Vec<CaptureNameMap>, + index_to_name: Vec<Vec<Option<Arc<str>>>>, + memory_extra: usize, +} + +impl GroupInfoInner { + /// This adds the first unnamed group for the given pattern ID. The given + /// pattern ID must be zero if this is the first time this method is + /// called, or must be exactly one more than the pattern ID supplied to the + /// previous call to this method. (This method panics if this rule is + /// violated.) + /// + /// This can be thought of as initializing the GroupInfo state for the + /// given pattern and closing off the state for any previous pattern. + fn add_first_group(&mut self, pid: PatternID) { + assert_eq!(pid.as_usize(), self.slot_ranges.len()); + assert_eq!(pid.as_usize(), self.name_to_index.len()); + assert_eq!(pid.as_usize(), self.index_to_name.len()); + // This is the start of our slots for the explicit capturing groups. + // Note that since the slots for the 0th group for every pattern appear + // before any slots for the nth group (where n > 0) in any pattern, we + // will have to fix up the slot ranges once we know how many patterns + // we've added capture groups for. + let slot_start = self.small_slot_len(); + self.slot_ranges.push((slot_start, slot_start)); + self.name_to_index.push(CaptureNameMap::new()); + self.index_to_name.push(vec![None]); + self.memory_extra += core::mem::size_of::<Option<Arc<str>>>(); + } + + /// Add an explicit capturing group for the given pattern with the given + /// index. If the group has a name, then that must be given as well. + /// + /// Note that every capturing group except for the first or zeroth group is + /// explicit. + /// + /// This returns an error if adding this group would result in overflowing + /// slot indices or if a capturing group with the same name for this + /// pattern has already been added. 
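As a quick illustration of the per-pattern name spaces described above, here is a minimal sketch using the public `GroupInfo::new` constructor; the group name `x` is arbitrary and chosen only for illustration:

```rust
use regex_automata::util::captures::GroupInfo;

fn main() {
    // A duplicate capture group name within a single pattern is rejected.
    assert!(GroupInfo::new(vec![vec![None, Some("x"), Some("x")]]).is_err());
    // The same name used by two different patterns is fine, since each
    // pattern gets its own namespace of capture group names.
    assert!(GroupInfo::new(vec![
        vec![None, Some("x")],
        vec![None, Some("x")],
    ])
    .is_ok());
}
```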
+ fn add_explicit_group<N: AsRef<str>>( + &mut self, + pid: PatternID, + group: SmallIndex, + maybe_name: Option<N>, + ) -> Result<(), GroupInfoError> { + // We also need to check that the slot index generated for + // this group is also valid. Although, this is a little weird + // because we offset these indices below, at which point, we'll + // have to recheck them. Gosh this is annoying. Note that + // the '+2' below is OK because 'end' is guaranteed to be less + // than isize::MAX. + let end = &mut self.slot_ranges[pid].1; + *end = SmallIndex::new(end.as_usize() + 2).map_err(|_| { + GroupInfoError::too_many_groups(pid, group.as_usize()) + })?; + if let Some(name) = maybe_name { + let name = Arc::<str>::from(name.as_ref()); + if self.name_to_index[pid].contains_key(&*name) { + return Err(GroupInfoError::duplicate(pid, &name)); + } + let len = name.len(); + self.name_to_index[pid].insert(Arc::clone(&name), group); + self.index_to_name[pid].push(Some(name)); + // Adds the memory used by the Arc<str> in both maps. + self.memory_extra += + 2 * (len + core::mem::size_of::<Option<Arc<str>>>()); + // And also the value entry for the 'name_to_index' map. + // This is probably an underestimate for 'name_to_index' since + // hashmaps/btrees likely have some non-zero overhead, but we + // assume here that they have zero overhead. + self.memory_extra += core::mem::size_of::<SmallIndex>(); + } else { + self.index_to_name[pid].push(None); + self.memory_extra += core::mem::size_of::<Option<Arc<str>>>(); + } + // This is a sanity assert that checks that our group index + // is in line with the number of groups added so far for this + // pattern. + assert_eq!(group.one_more(), self.group_len(pid)); + // And is also in line with the 'index_to_name' map. + assert_eq!(group.one_more(), self.index_to_name[pid].len()); + Ok(()) + } + + /// This corrects the slot ranges to account for the slots corresponding + /// to the zeroth group of each pattern. That is, every slot range is + /// offset by 'pattern_len() * 2', since each pattern uses two slots to + /// represent the zeroth group. + fn fixup_slot_ranges(&mut self) -> Result<(), GroupInfoError> { + use crate::util::primitives::IteratorIndexExt; + // Since we know number of patterns fits in PatternID and + // PatternID::MAX < isize::MAX, it follows that multiplying by 2 will + // never overflow usize. + let offset = self.pattern_len().checked_mul(2).unwrap(); + for (pid, &mut (ref mut start, ref mut end)) in + self.slot_ranges.iter_mut().with_pattern_ids() + { + let group_len = 1 + ((end.as_usize() - start.as_usize()) / 2); + let new_end = match end.as_usize().checked_add(offset) { + Some(new_end) => new_end, + None => { + return Err(GroupInfoError::too_many_groups( + pid, group_len, + )) + } + }; + *end = SmallIndex::new(new_end).map_err(|_| { + GroupInfoError::too_many_groups(pid, group_len) + })?; + // Since start <= end, if end is valid then start must be too. + *start = SmallIndex::new(start.as_usize() + offset).unwrap(); + } + Ok(()) + } + + /// Return the total number of patterns represented by this capture slot + /// info. + fn pattern_len(&self) -> usize { + self.slot_ranges.len() + } + + /// Return the total number of capturing groups for the given pattern. If + /// the given pattern isn't valid for this capture slot info, then 0 is + /// returned. 
+ fn group_len(&self, pid: PatternID) -> usize { + let (start, end) = match self.slot_ranges.get(pid.as_usize()) { + None => return 0, + Some(range) => range, + }; + // The difference between any two SmallIndex values always fits in a + // usize since we know that SmallIndex::MAX <= isize::MAX-1. We also + // know that start<=end by construction and that the number of groups + // never exceeds SmallIndex and thus never overflows usize. + 1 + ((end.as_usize() - start.as_usize()) / 2) + } + + /// Return the total number of slots in this capture slot info as a + /// "small index." + fn small_slot_len(&self) -> SmallIndex { + // Since slots are allocated in order of pattern (starting at 0) and + // then in order of capture group, it follows that the number of slots + // is the end of the range of slots for the last pattern. This is + // true even when the last pattern has no capturing groups, since + // 'slot_ranges' will still represent it explicitly with an empty + // range. + self.slot_ranges.last().map_or(SmallIndex::ZERO, |&(_, end)| end) + } +} + +/// An error that may occur when building a `GroupInfo`. +/// +/// Building a `GroupInfo` does a variety of checks to make sure the +/// capturing groups satisfy a number of invariants. This includes, but is not +/// limited to, ensuring that the first capturing group is unnamed and that +/// there are no duplicate capture groups for a specific pattern. +#[derive(Clone, Debug)] +pub struct GroupInfoError { + kind: GroupInfoErrorKind, +} + +/// The kind of error that occurs when building a `GroupInfo` fails. +/// +/// We keep this un-exported because it's not clear how useful it is to +/// export it. +#[derive(Clone, Debug)] +enum GroupInfoErrorKind { + /// This occurs when too many patterns have been added. i.e., It would + /// otherwise overflow a `PatternID`. + TooManyPatterns { err: PatternIDError }, + /// This occurs when too many capturing groups have been added for a + /// particular pattern. + TooManyGroups { + /// The ID of the pattern that had too many groups. + pattern: PatternID, + /// The minimum number of groups that the caller has tried to add for + /// a pattern. + minimum: usize, + }, + /// An error that occurs when a pattern has no capture groups. Either the + /// group info must be empty, or all patterns must have at least one group + /// (corresponding to the unnamed group for the entire pattern). + MissingGroups { + /// The ID of the pattern that had no capturing groups. + pattern: PatternID, + }, + /// An error that occurs when one tries to provide a name for the capture + /// group at index 0. This capturing group must currently always be + /// unnamed. + FirstMustBeUnnamed { + /// The ID of the pattern that was found to have a named first + /// capturing group. + pattern: PatternID, + }, + /// An error that occurs when duplicate capture group names for the same + /// pattern are added. + /// + /// NOTE: At time of writing, this error can never occur if you're using + /// regex-syntax, since the parser itself will reject patterns with + /// duplicate capture group names. This error can only occur when the + /// builder is used to hand construct NFAs. + Duplicate { + /// The pattern in which the duplicate capture group name was found. + pattern: PatternID, + /// The duplicate name. 
+ name: String, + }, +} + +impl GroupInfoError { + fn too_many_patterns(err: PatternIDError) -> GroupInfoError { + GroupInfoError { kind: GroupInfoErrorKind::TooManyPatterns { err } } + } + + fn too_many_groups(pattern: PatternID, minimum: usize) -> GroupInfoError { + GroupInfoError { + kind: GroupInfoErrorKind::TooManyGroups { pattern, minimum }, + } + } + + fn missing_groups(pattern: PatternID) -> GroupInfoError { + GroupInfoError { kind: GroupInfoErrorKind::MissingGroups { pattern } } + } + + fn first_must_be_unnamed(pattern: PatternID) -> GroupInfoError { + GroupInfoError { + kind: GroupInfoErrorKind::FirstMustBeUnnamed { pattern }, + } + } + + fn duplicate(pattern: PatternID, name: &str) -> GroupInfoError { + GroupInfoError { + kind: GroupInfoErrorKind::Duplicate { + pattern, + name: String::from(name), + }, + } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for GroupInfoError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self.kind { + GroupInfoErrorKind::TooManyPatterns { .. } + | GroupInfoErrorKind::TooManyGroups { .. } + | GroupInfoErrorKind::MissingGroups { .. } + | GroupInfoErrorKind::FirstMustBeUnnamed { .. } + | GroupInfoErrorKind::Duplicate { .. } => None, + } + } +} + +impl core::fmt::Display for GroupInfoError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + use self::GroupInfoErrorKind::*; + + match self.kind { + TooManyPatterns { ref err } => { + write!(f, "too many patterns to build capture info: {}", err) + } + TooManyGroups { pattern, minimum } => { + write!( + f, + "too many capture groups (at least {}) were \ + found for pattern {}", + minimum, + pattern.as_usize() + ) + } + MissingGroups { pattern } => write!( + f, + "no capturing groups found for pattern {} \ + (either all patterns have zero groups or all patterns have \ + at least one group)", + pattern.as_usize(), + ), + FirstMustBeUnnamed { pattern } => write!( + f, + "first capture group (at index 0) for pattern {} has a name \ + (it must be unnamed)", + pattern.as_usize(), + ), + Duplicate { pattern, ref name } => write!( + f, + "duplicate capture group name '{}' found for pattern {}", + name, + pattern.as_usize(), + ), + } + } +} + +/// An iterator over capturing groups and their names for a specific pattern. +/// +/// This iterator is created by [`GroupInfo::pattern_names`]. +/// +/// The lifetime parameter `'a` refers to the lifetime of the `GroupInfo` +/// from which this iterator was created. +#[derive(Clone, Debug)] +pub struct GroupInfoPatternNames<'a> { + it: core::slice::Iter<'a, Option<Arc<str>>>, +} + +impl GroupInfoPatternNames<'static> { + fn empty() -> GroupInfoPatternNames<'static> { + GroupInfoPatternNames { it: [].iter() } + } +} + +impl<'a> Iterator for GroupInfoPatternNames<'a> { + type Item = Option<&'a str>; + + fn next(&mut self) -> Option<Option<&'a str>> { + self.it.next().map(|x| x.as_deref()) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.it.size_hint() + } + + fn count(self) -> usize { + self.it.count() + } +} + +impl<'a> ExactSizeIterator for GroupInfoPatternNames<'a> {} +impl<'a> core::iter::FusedIterator for GroupInfoPatternNames<'a> {} + +/// An iterator over capturing groups and their names for a `GroupInfo`. +/// +/// This iterator is created by [`GroupInfo::all_names`]. +/// +/// The lifetime parameter `'a` refers to the lifetime of the `GroupInfo` +/// from which this iterator was created. 
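A minimal usage sketch for this iterator, assuming the `GroupInfo::all_names` method named above; the patterns and the group names `foo` and `bar` are made up only for illustration:

```rust
use regex_automata::nfa::thompson::NFA;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let nfa = NFA::new_many(&[r"(?P<foo>a)", r"(b)(?P<bar>c)"])?;
    // Each item is (PatternID, group index, Option<&str>), covering every
    // group of every pattern, including each pattern's implicit group 0.
    let names: Vec<_> = nfa.group_info().all_names().collect();
    assert_eq!(5, names.len());
    assert_eq!(None, names[0].2); // pattern 0, implicit group 0
    assert_eq!(Some("foo"), names[1].2); // pattern 0, group 1
    assert_eq!(Some("bar"), names[4].2); // pattern 1, group 2
    Ok(())
}
```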
+#[derive(Debug)] +pub struct GroupInfoAllNames<'a> { + group_info: &'a GroupInfo, + pids: PatternIDIter, + current_pid: Option<PatternID>, + names: Option<core::iter::Enumerate<GroupInfoPatternNames<'a>>>, +} + +impl<'a> Iterator for GroupInfoAllNames<'a> { + type Item = (PatternID, usize, Option<&'a str>); + + fn next(&mut self) -> Option<(PatternID, usize, Option<&'a str>)> { + // If the group info has no captures, then we never have anything + // to yield. We need to consider this case explicitly (at time of + // writing) because 'pattern_capture_names' will panic if captures + // aren't enabled. + if self.group_info.0.index_to_name.is_empty() { + return None; + } + if self.current_pid.is_none() { + self.current_pid = Some(self.pids.next()?); + } + let pid = self.current_pid.unwrap(); + if self.names.is_none() { + self.names = Some(self.group_info.pattern_names(pid).enumerate()); + } + let (group_index, name) = match self.names.as_mut().unwrap().next() { + Some((group_index, name)) => (group_index, name), + None => { + self.current_pid = None; + self.names = None; + return self.next(); + } + }; + Some((pid, group_index, name)) + } +} diff --git a/vendor/regex-automata/src/util/determinize/mod.rs b/vendor/regex-automata/src/util/determinize/mod.rs index b384de8e1..30a82afb8 100644 --- a/vendor/regex-automata/src/util/determinize/mod.rs +++ b/vendor/regex-automata/src/util/determinize/mod.rs @@ -13,11 +13,9 @@ in common, as defined by this module: word boundaries, line boundaries, etc., is all the same. This also includes the look-behind assertions that are satisfied by each starting state classification. - * The representation of DFA states as sets of NFA states, including convenience types for building these DFA states that are amenable to reusing allocations. - * Routines for the "classical" parts of determinization: computing the epsilon closure, tracking match states (with corresponding pattern IDs, since we support multi-pattern finite automata) and, of course, computing the @@ -44,19 +42,21 @@ pub(crate) use self::state::{ }; use crate::{ - nfa::thompson::{self, Look, LookSet}, + nfa::thompson, util::{ alphabet, - id::StateID, - matchtypes::MatchKind, + look::{Look, LookSet}, + primitives::StateID, + search::MatchKind, sparse_set::{SparseSet, SparseSets}, start::Start, + utf8, }, }; mod state; -/// Compute the set of all eachable NFA states, including the full epsilon +/// Compute the set of all reachable NFA states, including the full epsilon /// closure, from a DFA state for a single unit of input. The set of reachable /// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned /// also includes any look-behind assertions satisfied by `unit`, in addition @@ -100,6 +100,15 @@ pub(crate) fn next( ) -> StateBuilderNFA { sparses.clear(); + // Whether the NFA is matched in reverse or not. We use this in some + // conditional logic for dealing with the exceptionally annoying CRLF-aware + // line anchors. + let rev = nfa.is_reverse(); + // The look-around matcher that our NFA is configured with. We don't + // actually use it to match look-around assertions, but we do need its + // configuration for constructing states consistent with how it matches. + let lookm = nfa.look_matcher(); + // Put the NFA state IDs into a sparse set in case we need to // re-compute their epsilon closure. // @@ -113,43 +122,66 @@ pub(crate) fn next( sparses.set1.insert(nfa_id); }); - // Compute look-ahead assertions originating from the current state. 
- // Based on the input unit we're transitioning over, some additional - // set of assertions may be true. Thus, we re-compute this state's - // epsilon closure (but only if necessary). + // Compute look-ahead assertions originating from the current state. Based + // on the input unit we're transitioning over, some additional set of + // assertions may be true. Thus, we re-compute this state's epsilon closure + // (but only if necessary). Notably, when we build a DFA state initially, + // we don't enable any look-ahead assertions because we don't know whether + // they're true or not at that point. if !state.look_need().is_empty() { // Add look-ahead assertions that are now true based on the current // input unit. let mut look_have = state.look_have().clone(); match unit.as_u8() { + Some(b'\r') => { + if !rev || !state.is_half_crlf() { + look_have = look_have.insert(Look::EndCRLF); + } + } Some(b'\n') => { - look_have.insert(Look::EndLine); + if rev || !state.is_half_crlf() { + look_have = look_have.insert(Look::EndCRLF); + } } Some(_) => {} None => { - look_have.insert(Look::EndText); - look_have.insert(Look::EndLine); + look_have = look_have.insert(Look::End); + look_have = look_have.insert(Look::EndLF); + look_have = look_have.insert(Look::EndCRLF); } } + if unit.is_byte(lookm.get_line_terminator()) { + look_have = look_have.insert(Look::EndLF); + } + if state.is_half_crlf() + && ((rev && !unit.is_byte(b'\r')) + || (!rev && !unit.is_byte(b'\n'))) + { + look_have = look_have.insert(Look::StartCRLF); + } if state.is_from_word() == unit.is_word_byte() { - look_have.insert(Look::WordBoundaryUnicodeNegate); - look_have.insert(Look::WordBoundaryAsciiNegate); + look_have = look_have.insert(Look::WordUnicodeNegate); + look_have = look_have.insert(Look::WordAsciiNegate); } else { - look_have.insert(Look::WordBoundaryUnicode); - look_have.insert(Look::WordBoundaryAscii); + look_have = look_have.insert(Look::WordUnicode); + look_have = look_have.insert(Look::WordAscii); } // If we have new assertions satisfied that are among the set of - // assertions that exist in this state (that is, just because - // we added an EndLine assertion above doesn't mean there is an - // EndLine conditional epsilon transition in this state), then we - // re-compute this state's epsilon closure using the updated set of - // assertions. + // assertions that exist in this state (that is, just because we added + // an EndLF assertion above doesn't mean there is an EndLF conditional + // epsilon transition in this state), then we re-compute this state's + // epsilon closure using the updated set of assertions. + // + // Note that since our DFA states omit unconditional epsilon + // transitions, this check is necessary for correctness. If we re-did + // the epsilon closure below needlessly, it could change based on the + // fact that we omitted epsilon states originally. if !look_have .subtract(state.look_have()) .intersect(state.look_need()) .is_empty() { - for nfa_id in &sparses.set1 { + for nfa_id in sparses.set1.iter() { epsilon_closure( nfa, nfa_id, @@ -166,24 +198,36 @@ pub(crate) fn next( // Convert our empty builder into one that can record assertions and match // pattern IDs. let mut builder = empty_builder.into_matches(); - // Set whether the StartLine look-behind assertion is true for this + // Set whether the StartLF look-behind assertion is true for this // transition or not. The look-behind assertion for ASCII word boundaries // is handled below. 
- if nfa.has_any_anchor() { - if unit.as_u8().map_or(false, |b| b == b'\n') { - // Why only handle StartLine here and not StartText? That's - // because StartText can only impact the starting state, which - // is speical cased in start state handling. - builder.look_have().insert(Look::StartLine); - } + if nfa.look_set_any().contains_anchor_line() + && unit.is_byte(lookm.get_line_terminator()) + { + // Why only handle StartLF here and not Start? That's because Start + // can only impact the starting state, which is special cased in + // start state handling. + builder.set_look_have(|have| have.insert(Look::StartLF)); + } + // We also need to add StartCRLF to our assertions too, if we can. This + // is unfortunately a bit more complicated, because it depends on the + // direction of the search. In the forward direction, ^ matches after a + // \n, but in the reverse direction, ^ only matches after a \r. (This is + // further complicated by the fact that reverse a regex means changing a ^ + // to a $ and vice versa.) + if nfa.look_set_any().contains_anchor_crlf() + && ((rev && unit.is_byte(b'\r')) || (!rev && unit.is_byte(b'\n'))) + { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); } - for nfa_id in &sparses.set1 { + for nfa_id in sparses.set1.iter() { match *nfa.state(nfa_id) { thompson::State::Union { .. } + | thompson::State::BinaryUnion { .. } | thompson::State::Fail | thompson::State::Look { .. } | thompson::State::Capture { .. } => {} - thompson::State::Match { id } => { + thompson::State::Match { pattern_id } => { // Notice here that we are calling the NEW state a match // state if the OLD state we are transitioning from // contains an NFA match state. This is precisely how we @@ -204,17 +248,25 @@ pub(crate) fn next( // IDs in a set, we are guarateed not to have any duplicative // match states. Thus, it is impossible to add the same pattern // ID more than once. - builder.add_match_pattern_id(id); + // + // N.B. We delay matches by 1 byte as a way to hack 1-byte + // look-around into DFA searches. This lets us support ^, $ + // and ASCII-only \b. The delay is also why we need a special + // "end-of-input" (EOI) sentinel and why we need to follow the + // EOI sentinel at the end of every search. This final EOI + // transition is necessary to report matches found at the end + // of a haystack. + builder.add_match_pattern_id(pattern_id); if !match_kind.continue_past_first_match() { break; } } - thompson::State::Range { range: ref r } => { - if r.matches_unit(unit) { + thompson::State::ByteRange { ref trans } => { + if trans.matches_unit(unit) { epsilon_closure( nfa, - r.next, - *builder.look_have(), + trans.next, + builder.look_have(), stack, &mut sparses.set2, ); @@ -225,7 +277,18 @@ pub(crate) fn next( epsilon_closure( nfa, next, - *builder.look_have(), + builder.look_have(), + stack, + &mut sparses.set2, + ); + } + } + thompson::State::Dense(ref dense) => { + if let Some(next) = dense.matches_unit(unit) { + epsilon_closure( + nfa, + next, + builder.look_have(), stack, &mut sparses.set2, ); @@ -250,11 +313,15 @@ pub(crate) fn next( // if one was detected once it enters a quit state (and indeed, the search // routines in this crate do just that), but it seems better to prevent // these things by construction if possible.) 
- if nfa.has_word_boundary() - && unit.is_word_byte() - && !sparses.set2.is_empty() - { - builder.set_is_from_word(); + if !sparses.set2.is_empty() { + if nfa.look_set_any().contains_word() && unit.is_word_byte() { + builder.set_is_from_word(); + } + if nfa.look_set_any().contains_anchor_crlf() + && ((rev && unit.is_byte(b'\n')) || (!rev && unit.is_byte(b'\r'))) + { + builder.set_is_half_crlf(); + } } let mut builder_nfa = builder.into_nfa(); add_nfa_states(nfa, &sparses.set2, &mut builder_nfa); @@ -303,8 +370,9 @@ pub(crate) fn epsilon_closure( break; } match *nfa.state(id) { - thompson::State::Range { .. } + thompson::State::ByteRange { .. } | thompson::State::Sparse { .. } + | thompson::State::Dense { .. } | thompson::State::Fail | thompson::State::Match { .. } => break, thompson::State::Look { look, next } => { @@ -323,6 +391,10 @@ pub(crate) fn epsilon_closure( // to the top of the stack. stack.extend(alternates[1..].iter().rev()); } + thompson::State::BinaryUnion { alt1, alt2 } => { + id = alt1; + stack.push(alt2); + } thompson::State::Capture { next, .. } => { id = next; } @@ -336,15 +408,15 @@ pub(crate) fn epsilon_closure( /// were added to `set`. /// /// The DFA builder state given should already have its complete set of match -/// pattern IDs added (if any) and any look-behind assertions (StartLine, -/// StartText and whether this state is being generated for a transition over a -/// word byte when applicable) that are true immediately prior to transitioning -/// into this state (via `builder.look_have()`). The match pattern IDs should -/// correspond to matches that occured on the previous transition, since all -/// matches are delayed by one byte. The things that should _not_ be set are -/// look-ahead assertions (EndLine, EndText and whether the next byte is a -/// word byte or not). The builder state should also not have anything in -/// `look_need` set, as this routine will compute that for you. +/// pattern IDs added (if any) and any look-behind assertions (StartLF, Start +/// and whether this state is being generated for a transition over a word byte +/// when applicable) that are true immediately prior to transitioning into this +/// state (via `builder.look_have()`). The match pattern IDs should correspond +/// to matches that occurred on the previous transition, since all matches are +/// delayed by one byte. The things that should _not_ be set are look-ahead +/// assertions (EndLF, End and whether the next byte is a word byte or not). +/// The builder state should also not have anything in `look_need` set, as this +/// routine will compute that for you. /// /// The given NFA should be able to resolve all identifiers in `set` to a /// particular NFA state. Additionally, `set` must have capacity equivalent @@ -354,56 +426,114 @@ pub(crate) fn add_nfa_states( set: &SparseSet, builder: &mut StateBuilderNFA, ) { - for nfa_id in set { + for nfa_id in set.iter() { match *nfa.state(nfa_id) { - thompson::State::Range { .. } => { + thompson::State::ByteRange { .. } => { builder.add_nfa_state_id(nfa_id); } thompson::State::Sparse { .. } => { builder.add_nfa_state_id(nfa_id); } + thompson::State::Dense { .. } => { + builder.add_nfa_state_id(nfa_id); + } thompson::State::Look { look, .. } => { builder.add_nfa_state_id(nfa_id); - builder.look_need().insert(look); + builder.set_look_need(|need| need.insert(look)); } thompson::State::Union { .. } - | thompson::State::Capture { .. } => { - // Pure epsilon transitions don't need to be tracked - // as part of the DFA state. 
Tracking them is actually - // superfluous; they won't cause any harm other than making - // determinization slower. + | thompson::State::BinaryUnion { .. } => { + // Pure epsilon transitions don't need to be tracked as part + // of the DFA state. Tracking them is actually superfluous; + // they won't cause any harm other than making determinization + // slower. // // Why aren't these needed? Well, in an NFA, epsilon - // transitions are really just jumping points to other - // states. So once you hit an epsilon transition, the same - // set of resulting states always appears. Therefore, - // putting them in a DFA's set of ordered NFA states is - // strictly redundant. + // transitions are really just jumping points to other states. + // So once you hit an epsilon transition, the same set of + // resulting states always appears. Therefore, putting them in + // a DFA's set of ordered NFA states is strictly redundant. // // Look-around states are also epsilon transitions, but // they are *conditional*. So their presence could be // discriminatory, and thus, they are tracked above. // - // But wait... why are epsilon states in our `set` in the - // first place? Why not just leave them out? They're in - // our `set` because it was generated by computing an - // epsilon closure, and we want to keep track of all states - // we visited to avoid re-visiting them. In exchange, we - // have to do this second iteration over our collected - // states to finalize our DFA state. + // But wait... why are epsilon states in our `set` in the first + // place? Why not just leave them out? They're in our `set` + // because it was generated by computing an epsilon closure, + // and we want to keep track of all states we visited to avoid + // re-visiting them. In exchange, we have to do this second + // iteration over our collected states to finalize our DFA + // state. In theory, we could avoid this second iteration if + // we maintained two sets during epsilon closure: the set of + // visited states (to avoid cycles) and the set of states that + // will actually be used to construct the next DFA state. + // + // Note that this optimization requires that we re-compute the + // epsilon closure to account for look-ahead in 'next' *only + // when necessary*. Namely, only when the set of look-around + // assertions changes and only when those changes are within + // the set of assertions that are needed in order to step + // through the closure correctly. Otherwise, if we re-do the + // epsilon closure needlessly, it could change based on the + // fact that we are omitting epsilon states here. + // + // ----- + // + // Welp, scratch the above. It turns out that recording these + // is in fact necessary to seemingly handle one particularly + // annoying case: when a conditional epsilon transition is + // put inside of a repetition operator. One specific case I + // ran into was the regex `(?:\b|%)+` on the haystack `z%`. + // The correct leftmost first matches are: [0, 0] and [1, 1]. + // But the DFA was reporting [0, 0] and [1, 2]. To understand + // why this happens, consider the NFA for the aforementioned + // regex: // - // Note that this optimization requires that we re-compute - // the epsilon closure to account for look-ahead in 'next' - // *only when necessary*. Namely, only when the set of - // look-around assertions changes and only when those - // changes are within the set of assertions that are - // needed in order to step through the closure correctly. 
- // Otherwise, if we re-do the epsilon closure needlessly, - // it could change based on the fact that we are omitting - // epsilon states here. + // >000000: binary-union(4, 1) + // 000001: \x00-\xFF => 0 + // 000002: WordAscii => 5 + // 000003: % => 5 + // ^000004: binary-union(2, 3) + // 000005: binary-union(4, 6) + // 000006: MATCH(0) + // + // The problem here is that one of the DFA start states is + // going to consist of the NFA states [2, 3] by computing the + // epsilon closure of state 4. State 4 isn't included because + // we previously were not keeping track of union states. But + // only a subset of transitions out of this state will be able + // to follow WordAscii, and in those cases, the epsilon closure + // is redone. The only problem is that computing the epsilon + // closure from [2, 3] is different than computing the epsilon + // closure from [4]. In the former case, assuming the WordAscii + // assertion is satisfied, you get: [2, 3, 6]. In the latter + // case, you get: [2, 6, 3]. Notice that '6' is the match state + // and appears AFTER '3' in the former case. This leads to a + // preferential but incorrect match of '%' before returning + // a match. In the latter case, the match is preferred over + // continuing to accept the '%'. + // + // It almost feels like we might be able to fix the NFA states + // to avoid this, or to at least only keep track of union + // states where this actually matters, since in the vast + // majority of cases, this doesn't matter. + // + // Another alternative would be to define a new HIR property + // called "assertion is repeated anywhere" and compute it + // inductively over the entire pattern. If it happens anywhere, + // which is probably pretty rare, then we record union states. + // Otherwise we don't. + builder.add_nfa_state_id(nfa_id); } + // Capture states we definitely do not need to record, since they + // are unconditional epsilon transitions with no branching. + thompson::State::Capture { .. } => {} + // It's not totally clear whether we need to record fail states or + // not, but we do so out of an abundance of caution. Since they are + // quite rare in practice, there isn't much cost to recording them. thompson::State::Fail => { - break; + builder.add_nfa_state_id(nfa_id); } thompson::State::Match { .. } => { // Normally, the NFA match state doesn't actually need to @@ -420,74 +550,61 @@ pub(crate) fn add_nfa_states( // there's no reason to track which look-around assertions were // satisfied when this state was created. if builder.look_need().is_empty() { - builder.look_have().clear(); + builder.set_look_have(|_| LookSet::empty()); } } /// Sets the appropriate look-behind assertions on the given state based on /// this starting configuration. 
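The `(?:\b|%)+` case discussed in the comment above can also be exercised from the crate's public API. A minimal sketch, assuming the `meta::Regex` engine and the leftmost-first matches stated in that comment:

```rust
use regex_automata::meta::Regex;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let re = Regex::new(r"(?:\b|%)+")?;
    // Leftmost-first prefers the empty word-boundary match at 0 and at 1
    // over consuming the '%', yielding the [0, 0] and [1, 1] offsets
    // described above.
    let matches: Vec<_> = re.find_iter("z%").map(|m| m.range()).collect();
    assert_eq!(vec![0..0, 1..1], matches);
    Ok(())
}
```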
pub(crate) fn set_lookbehind_from_start( + nfa: &thompson::NFA, start: &Start, builder: &mut StateBuilderMatches, ) { + let rev = nfa.is_reverse(); + let lineterm = nfa.look_matcher().get_line_terminator(); match *start { Start::NonWordByte => {} Start::WordByte => { builder.set_is_from_word(); } Start::Text => { - builder.look_have().insert(Look::StartText); - builder.look_have().insert(Look::StartLine); + builder.set_look_have(|have| { + have.insert(Look::Start) + .insert(Look::StartLF) + .insert(Look::StartCRLF) + }); } - Start::Line => { - builder.look_have().insert(Look::StartLine); + Start::LineLF => { + if rev { + builder.set_is_half_crlf(); + builder.set_look_have(|have| have.insert(Look::StartLF)); + } else { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } + if lineterm == b'\n' { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } + } + Start::LineCR => { + if rev { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } else { + builder.set_is_half_crlf(); + } + if lineterm == b'\r' { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } + } + Start::CustomLineTerminator => { + builder.set_look_have(|have| have.insert(Look::StartLF)); + // This is a bit of a tricky case, but if the line terminator was + // set to a word byte, then we also need to behave as if the start + // configuration is Start::WordByte. That is, we need to mark our + // state as having come from a word byte. + if utf8::is_word_byte(lineterm) { + builder.set_is_from_word(); + } } - } -} - -#[cfg(test)] -mod tests { - use super::Start; - - #[test] - #[should_panic] - fn start_fwd_bad_range() { - Start::from_position_fwd(&[], 0, 1); - } - - #[test] - #[should_panic] - fn start_rev_bad_range() { - Start::from_position_rev(&[], 0, 1); - } - - #[test] - fn start_fwd() { - let f = Start::from_position_fwd; - - assert_eq!(Start::Text, f(&[], 0, 0)); - assert_eq!(Start::Text, f(b"abc", 0, 3)); - assert_eq!(Start::Text, f(b"\nabc", 0, 3)); - - assert_eq!(Start::Line, f(b"\nabc", 1, 3)); - - assert_eq!(Start::WordByte, f(b"abc", 1, 3)); - - assert_eq!(Start::NonWordByte, f(b" abc", 1, 3)); - } - - #[test] - fn start_rev() { - let f = Start::from_position_rev; - - assert_eq!(Start::Text, f(&[], 0, 0)); - assert_eq!(Start::Text, f(b"abc", 0, 3)); - assert_eq!(Start::Text, f(b"abc\n", 0, 4)); - - assert_eq!(Start::Line, f(b"abc\nz", 0, 3)); - - assert_eq!(Start::WordByte, f(b"abc", 0, 2)); - - assert_eq!(Start::NonWordByte, f(b"abc ", 0, 3)); } } diff --git a/vendor/regex-automata/src/util/determinize/state.rs b/vendor/regex-automata/src/util/determinize/state.rs index 567e600d6..e64123587 100644 --- a/vendor/regex-automata/src/util/determinize/state.rs +++ b/vendor/regex-automata/src/util/determinize/state.rs @@ -10,13 +10,13 @@ The term "DFA state" is somewhat overloaded in this crate. In some cases, it refers to the set of transitions over an alphabet for a particular state. In other cases, it refers to a set of NFA states. The former is really about the final representation of a state in a DFA's transition table, where as the -latter---what this module is focusedon---is closer to an intermediate form that -is used to help eventually build the transition table. +latter---what this module is focused on---is closer to an intermediate form +that is used to help eventually build the transition table. This module exports four types. All four types represent the same idea: an ordered set of NFA states. 
This ordered set represents the epsilon closure of a particular NFA state, where the "epsilon closure" is the set of NFA states that -can be transitioned to without consuming any input. i.e., Follow all of theNFA +can be transitioned to without consuming any input. i.e., Follow all of the NFA state's epsilon transitions. In addition, this implementation of DFA states cares about two other things: the ordered set of pattern IDs corresponding to the patterns that match if the state is a match state, and the set of @@ -46,9 +46,11 @@ a copy). Here are the three types described succinctly: and no NFA states. Creating a `StateBuilderEmpty` performs no allocs. A `StateBuilderEmpty` can only be used to query its underlying memory capacity, or to convert into a builder for recording pattern IDs and/or assertions. + * `StateBuilderMatches` represents a state with zero or more pattern IDs, zero or more satisfied assertions and zero NFA state IDs. A `StateBuilderMatches` can only be used for adding pattern IDs and recording assertions. + * `StateBuilderNFA` represents a state with zero or more pattern IDs, zero or more satisfied assertions and zero or more NFA state IDs. A `StateBuilderNFA` can only be used for adding NFA state IDs and recording some assertions. @@ -58,7 +60,7 @@ DFA state to check if it already exists. If it does, then there's no need to freeze it into a `State`. It it doesn't exist, then `StateBuilderNFA::to_state` can be called to freeze the builder into an immutable `State`. In either case, `clear` should be called on the builder to turn it back into a -`StateBuilderEmpty` that reuses the underyling memory. +`StateBuilderEmpty` that reuses the underlying memory. The main purpose for splitting the builder into these distinct types is to make it impossible to do things like adding a pattern ID after adding an NFA @@ -68,7 +70,7 @@ type below.) If we just used one type for everything, it would be possible for callers to use an incorrect interleaving of calls and thus result in a corrupt representation. I chose to use more type machinery to make this impossible to do because 1) determinization is itself pretty complex and it wouldn't be too -hard to foul this up and 2) there isn't too much machinery involve and it's +hard to foul this up and 2) there isn't too much machinery involved and it's well contained. As an optimization, sometimes states won't have certain things set. For @@ -88,12 +90,11 @@ use core::{convert::TryFrom, mem}; use alloc::{sync::Arc, vec::Vec}; -use crate::{ - nfa::thompson::LookSet, - util::{ - bytes::{self, Endian}, - id::{PatternID, StateID}, - }, +use crate::util::{ + int::{I32, U32}, + look::LookSet, + primitives::{PatternID, StateID}, + wire::{self, Endian}, }; /// A DFA state that, at its core, is represented by an ordered set of NFA @@ -102,7 +103,7 @@ use crate::{ /// This type is intended to be used only in NFA-to-DFA conversion via powerset /// construction. /// -/// It may be cheaply cloned and accessed safely from mulitple threads +/// It may be cheaply cloned and accessed safely from multiple threads /// simultaneously. 
#[derive(Clone, Eq, Hash, PartialEq, PartialOrd, Ord)] pub(crate) struct State(Arc<[u8]>); @@ -138,6 +139,10 @@ impl State { self.repr().is_from_word() } + pub(crate) fn is_half_crlf(&self) -> bool { + self.repr().is_half_crlf() + } + pub(crate) fn look_have(&self) -> LookSet { self.repr().look_have() } @@ -146,8 +151,8 @@ impl State { self.repr().look_need() } - pub(crate) fn match_count(&self) -> usize { - self.repr().match_count() + pub(crate) fn match_len(&self) -> usize { + self.repr().match_len() } pub(crate) fn match_pattern(&self, index: usize) -> PatternID { @@ -158,6 +163,7 @@ impl State { self.repr().match_pattern_ids() } + #[cfg(all(test, not(miri)))] pub(crate) fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, f: F) { self.repr().iter_match_pattern_ids(f) } @@ -191,7 +197,7 @@ impl StateBuilderEmpty { } pub(crate) fn into_matches(mut self) -> StateBuilderMatches { - self.0.extend_from_slice(&[0, 0, 0]); + self.0.extend_from_slice(&[0, 0, 0, 0, 0]); StateBuilderMatches(self.0) } @@ -224,30 +230,23 @@ impl StateBuilderMatches { StateBuilderNFA { repr: self.0, prev_nfa_state_id: StateID::ZERO } } - pub(crate) fn clear(self) -> StateBuilderEmpty { - let mut builder = StateBuilderEmpty(self.0); - builder.clear(); - builder - } - - pub(crate) fn is_match(&self) -> bool { - self.repr().is_match() - } - - pub(crate) fn is_from_word(&self) -> bool { - self.repr().is_from_word() - } - pub(crate) fn set_is_from_word(&mut self) { self.repr_vec().set_is_from_word() } - pub(crate) fn look_have(&mut self) -> &mut LookSet { - LookSet::from_repr_mut(&mut self.0[1]) + pub(crate) fn set_is_half_crlf(&mut self) { + self.repr_vec().set_is_half_crlf() + } + + pub(crate) fn look_have(&self) -> LookSet { + LookSet::read_repr(&self.0[1..]) } - pub(crate) fn look_need(&mut self) -> &mut LookSet { - LookSet::from_repr_mut(&mut self.0[2]) + pub(crate) fn set_look_have( + &mut self, + set: impl FnMut(LookSet) -> LookSet, + ) { + self.repr_vec().set_look_have(set) } pub(crate) fn add_match_pattern_id(&mut self, pid: PatternID) { @@ -295,20 +294,22 @@ impl StateBuilderNFA { builder } - pub(crate) fn is_match(&self) -> bool { - self.repr().is_match() - } - - pub(crate) fn is_from_word(&self) -> bool { - self.repr().is_from_word() + pub(crate) fn look_need(&self) -> LookSet { + self.repr().look_need() } - pub(crate) fn look_have(&mut self) -> &mut LookSet { - LookSet::from_repr_mut(&mut self.repr[1]) + pub(crate) fn set_look_have( + &mut self, + set: impl FnMut(LookSet) -> LookSet, + ) { + self.repr_vec().set_look_have(set) } - pub(crate) fn look_need(&mut self) -> &mut LookSet { - LookSet::from_repr_mut(&mut self.repr[2]) + pub(crate) fn set_look_need( + &mut self, + set: impl FnMut(LookSet) -> LookSet, + ) { + self.repr_vec().set_look_need(set) } pub(crate) fn add_nfa_state_id(&mut self, sid: StateID) { @@ -316,10 +317,6 @@ impl StateBuilderNFA { .add_nfa_state_id(&mut self.prev_nfa_state_id, sid) } - pub(crate) fn memory_usage(&self) -> usize { - self.repr.len() - } - pub(crate) fn as_bytes(&self) -> &[u8] { &self.repr } @@ -355,8 +352,8 @@ impl StateBuilderNFA { /// /// Byte 1 corresponds to the look-behind assertions that were satisfied by /// the transition that created this state. This generally only includes the -/// StartLine and StartText assertions. (Look-ahead assertions are not tracked -/// as part of states. Instead, these are applied by re-computing the epsilon +/// StartLF and Start assertions. (Look-ahead assertions are not tracked as +/// part of states. 
Instead, these are applied by re-computing the epsilon /// closure of a state when computing the transition function. See `next` in /// the parent module.) /// @@ -425,6 +422,14 @@ impl<'a> Repr<'a> { self.0[0] & (1 << 2) > 0 } + /// Returns true if and only if this state is marked as being inside of a + /// CRLF terminator. In the forward direction, this means the state was + /// created after seeing a `\r`. In the reverse direction, this means the + /// state was created after seeing a `\n`. + fn is_half_crlf(&self) -> bool { + self.0[0] & (1 << 3) > 0 + } + /// The set of look-behind assertions that were true in the transition that /// created this state. /// @@ -436,7 +441,7 @@ impl<'a> Repr<'a> { /// these are re-computed on demand via epsilon closure when computing the /// transition function. fn look_have(&self) -> LookSet { - LookSet::from_repr(self.0[1]) + LookSet::read_repr(&self.0[1..]) } /// The set of look-around (both behind and ahead) assertions that appear @@ -447,34 +452,34 @@ impl<'a> Repr<'a> { /// state has no conditional epsilon transitions, then there is no need /// to re-compute the epsilon closure. fn look_need(&self) -> LookSet { - LookSet::from_repr(self.0[2]) + LookSet::read_repr(&self.0[3..]) } /// Returns the total number of match pattern IDs in this state. /// /// If this state is not a match state, then this always returns 0. - fn match_count(&self) -> usize { + fn match_len(&self) -> usize { if !self.is_match() { return 0; } else if !self.has_pattern_ids() { 1 } else { - self.encoded_pattern_count() + self.encoded_pattern_len() } } /// Returns the pattern ID for this match state at the given index. /// - /// If the given index is greater than or equal to `match_count()` for this + /// If the given index is greater than or equal to `match_len()` for this /// state, then this could panic or return incorrect results. fn match_pattern(&self, index: usize) -> PatternID { if !self.has_pattern_ids() { PatternID::ZERO } else { - let offset = 7 + index * PatternID::SIZE; + let offset = 9 + index * PatternID::SIZE; // This is OK since we only ever serialize valid PatternIDs to // states. - bytes::read_pattern_id_unchecked(&self.0[offset..]).0 + wire::read_pattern_id_unchecked(&self.0[offset..]).0 } } @@ -502,9 +507,9 @@ impl<'a> Repr<'a> { f(PatternID::ZERO); return; } - let mut pids = &self.0[7..self.pattern_offset_end()]; + let mut pids = &self.0[9..self.pattern_offset_end()]; while !pids.is_empty() { - let pid = bytes::read_u32(pids); + let pid = wire::read_u32(pids); pids = &pids[PatternID::SIZE..]; // This is OK since we only ever serialize valid PatternIDs to // states. And since pattern IDs can never exceed a usize, the @@ -525,20 +530,20 @@ impl<'a> Repr<'a> { // This is OK since we only ever serialize valid StateIDs to // states. And since state IDs can never exceed an isize, they must // always be able to fit into a usize, and thus cast is OK. - f(StateID::new_unchecked(sid as usize)) + f(StateID::new_unchecked(sid.as_usize())) } } /// Returns the offset into this state's representation where the pattern /// IDs end and the NFA state IDs begin. fn pattern_offset_end(&self) -> usize { - let encoded = self.encoded_pattern_count(); + let encoded = self.encoded_pattern_len(); if encoded == 0 { - return 3; + return 5; } // This arithmetic is OK since we were able to address this many bytes // when writing to the state, thus, it must fit into a usize. 
- encoded.checked_mul(4).unwrap().checked_add(7).unwrap() + encoded.checked_mul(4).unwrap().checked_add(9).unwrap() } /// Returns the total number of *encoded* pattern IDs in this state. @@ -546,13 +551,13 @@ impl<'a> Repr<'a> { /// This may return 0 even when this is a match state, since the pattern /// ID `PatternID::ZERO` is not encoded when it's the only pattern ID in /// the match state (the overwhelming common case). - fn encoded_pattern_count(&self) -> usize { + fn encoded_pattern_len(&self) -> usize { if !self.has_pattern_ids() { return 0; } // This unwrap is OK since the total number of patterns is always // guaranteed to fit into a usize. - usize::try_from(bytes::read_u32(&self.0[3..7])).unwrap() + usize::try_from(wire::read_u32(&self.0[5..9])).unwrap() } } @@ -563,6 +568,7 @@ impl<'a> core::fmt::Debug for Repr<'a> { f.debug_struct("Repr") .field("is_match", &self.is_match()) .field("is_from_word", &self.is_from_word()) + .field("is_half_crlf", &self.is_half_crlf()) .field("look_have", &self.look_have()) .field("look_need", &self.look_need()) .field("match_pattern_ids", &self.match_pattern_ids()) @@ -608,14 +614,36 @@ impl<'a> ReprVec<'a> { self.0[0] |= 1 << 2; } - /// Return a mutable reference to the 'look_have' assertion set. - fn look_have_mut(&mut self) -> &mut LookSet { - LookSet::from_repr_mut(&mut self.0[1]) + /// Set this state as having seen half of a CRLF terminator. + /// + /// In the forward direction, this should be set when a `\r` has been seen. + /// In the reverse direction, this should be set when a `\n` has been seen. + fn set_is_half_crlf(&mut self) { + self.0[0] |= 1 << 3; } - /// Return a mutable reference to the 'look_need' assertion set. - fn look_need_mut(&mut self) -> &mut LookSet { - LookSet::from_repr_mut(&mut self.0[2]) + /// The set of look-behind assertions that were true in the transition that + /// created this state. + fn look_have(&self) -> LookSet { + self.repr().look_have() + } + + /// The set of look-around (both behind and ahead) assertions that appear + /// at least once in this state's set of NFA states. + fn look_need(&self) -> LookSet { + self.repr().look_need() + } + + /// Mutate the set of look-behind assertions that were true in the + /// transition that created this state. + fn set_look_have(&mut self, mut set: impl FnMut(LookSet) -> LookSet) { + set(self.look_have()).write_repr(&mut self.0[1..]); + } + + /// Mutate the set of look-around (both behind and ahead) assertions that + /// appear at least once in this state's set of NFA states. + fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) { + set(self.look_need()).write_repr(&mut self.0[3..]); } /// Add a pattern ID to this state. All match states must have at least @@ -675,14 +703,14 @@ impl<'a> ReprVec<'a> { return; } let patsize = PatternID::SIZE; - let pattern_bytes = self.0.len() - 7; + let pattern_bytes = self.0.len() - 9; // Every pattern ID uses 4 bytes, so number of bytes should be // divisible by 4. assert_eq!(pattern_bytes % patsize, 0); // This unwrap is OK since we are guaranteed that the maximum number // of possible patterns fits into a u32. let count32 = u32::try_from(pattern_bytes / patsize).unwrap(); - bytes::NE::write_u32(count32, &mut self.0[3..7]); + wire::NE::write_u32(count32, &mut self.0[5..9]); } /// Add an NFA state ID to this state. 
The order in which NFA states are @@ -704,7 +732,7 @@ impl<'a> ReprVec<'a> { /// /// https://developers.google.com/protocol-buffers/docs/encoding#varints fn write_vari32(data: &mut Vec<u8>, n: i32) { - let mut un = (n as u32) << 1; + let mut un = n.to_bits() << 1; if n < 0 { un = !un; } @@ -717,7 +745,7 @@ fn write_vari32(data: &mut Vec<u8>, n: i32) { /// https://developers.google.com/protocol-buffers/docs/encoding#varints fn read_vari32(data: &[u8]) -> (i32, usize) { let (un, i) = read_varu32(data); - let mut n = (un >> 1) as i32; + let mut n = i32::from_bits(un >> 1); if un & 1 != 0 { n = !n; } @@ -733,10 +761,10 @@ fn read_vari32(data: &[u8]) -> (i32, usize) { /// https://developers.google.com/protocol-buffers/docs/encoding#varints fn write_varu32(data: &mut Vec<u8>, mut n: u32) { while n >= 0b1000_0000 { - data.push((n as u8) | 0b1000_0000); + data.push(n.low_u8() | 0b1000_0000); n >>= 7; } - data.push(n as u8); + data.push(n.low_u8()); } /// Read an unsigned 32-bit varint. Also, return the number of bytes read. @@ -750,9 +778,9 @@ fn read_varu32(data: &[u8]) -> (u32, usize) { let mut shift: u32 = 0; for (i, &b) in data.iter().enumerate() { if b < 0b1000_0000 { - return (n | ((b as u32) << shift), i + 1); + return (n | (u32::from(b) << shift), i + 1); } - n |= ((b as u32) & 0b0111_1111) << shift; + n |= (u32::from(b) & 0b0111_1111) << shift; shift += 7; } (0, 0) @@ -760,7 +788,7 @@ fn read_varu32(data: &[u8]) -> (u32, usize) { /// Push a native-endian encoded `n` on to `dst`. fn write_u32(dst: &mut Vec<u8>, n: u32) { - use crate::util::bytes::{Endian, NE}; + use crate::util::wire::NE; let start = dst.len(); dst.extend(core::iter::repeat(0).take(mem::size_of::<u32>())); @@ -775,6 +803,7 @@ mod tests { use super::*; + #[cfg(not(miri))] quickcheck! { fn prop_state_read_write_nfa_state_ids(sids: Vec<StateID>) -> bool { // Builders states do not permit duplicate IDs. @@ -829,7 +858,9 @@ mod tests { s.iter_nfa_state_ids(|sid| got_sids.push(sid)); got_pids == pids && got_sids == sids } + } + quickcheck! { fn prop_read_write_varu32(n: u32) -> bool { let mut buf = vec![]; write_varu32(&mut buf, n); @@ -845,6 +876,7 @@ mod tests { } } + #[cfg(not(miri))] fn dedup_state_ids(sids: Vec<StateID>) -> Vec<StateID> { let mut set = alloc::collections::BTreeSet::new(); let mut deduped = vec![]; @@ -858,6 +890,7 @@ mod tests { deduped } + #[cfg(not(miri))] fn dedup_pattern_ids(pids: Vec<PatternID>) -> Vec<PatternID> { let mut set = alloc::collections::BTreeSet::new(); let mut deduped = vec![]; diff --git a/vendor/regex-automata/src/util/empty.rs b/vendor/regex-automata/src/util/empty.rs new file mode 100644 index 000000000..e16af3b6e --- /dev/null +++ b/vendor/regex-automata/src/util/empty.rs @@ -0,0 +1,265 @@ +/*! +This module provides helper routines for dealing with zero-width matches. + +The main problem being solved here is this: + +1. The caller wants to search something that they know is valid UTF-8, such +as a Rust `&str`. +2. The regex used by the caller can match the empty string. For example, `a*`. +3. The caller should never get match offsets returned that occur within the +encoding of a UTF-8 codepoint. It is logically incorrect, and also means that, +e.g., slicing the `&str` at those offsets will lead to a panic. + +So the question here is, how do we prevent the caller from getting match +offsets that split a codepoint? For example, strictly speaking, the regex `a*` +matches `☃` at the positions `[0, 0]`, `[1, 1]`, `[2, 2]` and `[3, 3]` since +the UTF-8 encoding of `☃` is `\xE2\x98\x83`. 
In particular, the `NFA` that +underlies all of the matching engines in this crate doesn't have anything in +its state graph that prevents matching between UTF-8 code units. Indeed, any +engine derived from the `NFA` will match at those positions by virtue of the +fact that the `NFA` is byte oriented. That is, its transitions are defined over +bytes and the matching engines work by proceeding one byte at a time. + +(An alternative architecture would be to define the transitions in an `NFA` +over codepoints, or `char`. And then make the matching engines proceed by +decoding one codepoint at a time. This is a viable strategy, but it doesn't +work for DFA matching engines because designing a fast and memory efficient +transition table for an alphabet as large as Unicode is quite difficult. More +to the point, the top-level `regex` crate supports matching on arbitrary bytes +when Unicode mode is disabled and one is searching a `&[u8]`. So in that case, +you can't just limit yourself to decoding codepoints and matching those. You +really do need to be able to follow byte oriented transitions on the `NFA`.) + +In an older version of the regex crate, we handled this case not in the regex +engine, but in the iterators over matches. Namely, since this case only arises +when the match is empty, we "just" incremented the next starting position +of the search by `N`, where `N` is the length of the codepoint encoded at +the current position. The alternative or more "natural" solution of just +incrementing by `1` would result in executing a search of `a*` on `☃` like +this: + +* Start search at `0`. +* Found match at `[0, 0]`. +* Next start position is `0`. +* To avoid an infinite loop, since it's an empty match, increment by `1`. +* Start search at `1`. +* Found match at `[1, 1]`. Oops. + +But if we instead incremented by `3` (the length in bytes of `☃`), then we get +the following: + +* Start search at `0`. +* Found match at `[0, 0]`. +* Next start position is `0`. +* To avoid an infinite loop, since it's an empty match, increment by `3`. +* Start search at `3`. +* Found match at `[3, 3]`. + +And we get the correct result. But does this technique work in all cases? +Crucially, it requires that a zero-width match that splits a codepoint never +occurs beyond the starting position of the search. Because if it did, merely +incrementing the start position by the number of bytes in the codepoint at +the current position wouldn't be enough. A zero-width match could just occur +anywhere. It turns out that it is _almost_ true. We can convince ourselves by +looking at all possible patterns that can match the empty string: + +* Patterns like `a*`, `a{0}`, `(?:)`, `a|` and `|a` all unconditionally match +the empty string. That is, assuming there isn't an `a` at the current position, +they will all match the empty string at the start of a search. There is no way +to move past it because any other match would not be "leftmost." +* `^` only matches at the beginning of the haystack, where the start position +is `0`. Since we know we're searching valid UTF-8 (if it isn't valid UTF-8, +then this entire problem goes away because it implies your string type supports +invalid UTF-8 and thus must deal with offsets that not only split a codepoint +but occur in entirely invalid UTF-8 somehow), it follows that `^` never matches +between the code units of a codepoint because the start of a valid UTF-8 string +is never within the encoding of a codepoint. +* `$` basically the same logic as `^`, but for the end of a string. 
A valid
+UTF-8 string can't have an incomplete codepoint at the end of it.
+* `(?m:^)` follows similarly to `^`, but it can match immediately following
+a `\n`. However, since a `\n` is always a codepoint itself and can never
+appear within a codepoint, it follows that the position immediately following
+a `\n` in a string that is valid UTF-8 is guaranteed to not be between the
+code units of another codepoint. (One caveat here is that the line terminator
+for multi-line anchors can now be changed to any arbitrary byte, including
+things like `\x98` which might occur within a codepoint. However, this wasn't
+supported by the old regex crate. If it were, it would pose the same problems
+as `(?-u:\B)`, as we'll discuss below.)
+* `(?m:$)` follows a similar argument as for `(?m:^)`. The only difference is
+that a `(?m:$)` matches just before a `\n`. But the same argument applies.
+* `(?Rm:^)` and `(?Rm:$)` weren't supported by the old regex crate, but the
+CRLF aware line anchors follow a similar argument as for `(?m:^)` and `(?m:$)`.
+Namely, they only ever match at a boundary where one side is either a `\r` or
+a `\n`, neither of which can occur within a codepoint.
+* `\b` only matches at positions where both sides are valid codepoints, so
+this cannot split a codepoint.
+* `\B`, like `\b`, also only matches at positions where both sides are valid
+codepoints. So this cannot split a codepoint either.
+* `(?-u:\b)` matches only at positions where at least one side of it is an ASCII
+word byte. Since ASCII bytes cannot appear as code units in non-ASCII codepoints
+(one of the many amazing qualities of UTF-8), it follows that this too cannot
+split a codepoint.
+* `(?-u:\B)` finally represents a problem. It can match between *any* two
+bytes that are either both word bytes or non-word bytes. Since code units like
+`\xE2` and `\x98` (from the UTF-8 encoding of `☃`) are both non-word bytes,
+`(?-u:\B)` will match at the position between them.
+
+Thus, our approach of incrementing one codepoint at a time after seeing an
+empty match is flawed because `(?-u:\B)` can result in an empty match that
+splits a codepoint at a position past the starting point of a search. For
+example, searching `(?-u:\B)` on `a☃` would produce the following matches: `[2,
+2]`, `[3, 3]` and `[4, 4]`. The positions at `0` and `1` don't match because
+they correspond to word boundaries since `a` is an ASCII word byte.
+
+So what did the old regex crate do to avoid this? It banned `(?-u:\B)` from
+regexes that could match `&str`. That might sound extreme, but a lot of other
+things were banned too. For example, all of `(?-u:.)`, `(?-u:[^a])` and
+`(?-u:\W)` can match invalid UTF-8 too, including individual code units within
+a codepoint. The key difference is that those expressions could never produce
+an empty match. That ban happens when translating an `Ast` to an `Hir`, because
+that is the process that reasons about whether an `Hir` can produce *non-empty*
+matches at invalid UTF-8 boundaries. Bottom line though is that we side-stepped
+the `(?-u:\B)` issue by banning it.
+
+If banning `(?-u:\B)` were the only issue with the old regex crate's approach,
+then I probably would have kept it. `\B` is rarely used, so it's not such a big
+deal to have to work around it. However, the problem with the above approach
+is that it doesn't compose.
The logic for avoiding splitting a codepoint only +lived in the iterator, which means if anyone wants to implement their own +iterator over regex matches, they have to deal with this extremely subtle edge +case to get full correctness. + +Instead, in this crate, we take the approach of pushing this complexity down +to the lowest layers of each regex engine. The approach is pretty simple: + +* If this corner case doesn't apply, don't do anything. (For example, if UTF-8 +mode isn't enabled or if the regex cannot match the empty string.) +* If an empty match is reported, explicitly check if it splits a codepoint. +* If it doesn't, we're done, return the match. +* If it does, then ignore the match and re-run the search. +* Repeat the above process until the end of the haystack is reached or a match +is found that doesn't split a codepoint or isn't zero width. + +And that's pretty much what this module provides. Every regex engine uses these +methods in their lowest level public APIs, but just above the layer where +their internal engine is used. That way, all regex engines can be arbitrarily +composed without worrying about handling this case, and iterators don't need to +handle it explicitly. + +(It turns out that a new feature I added, support for changing the line +terminator in a regex to any arbitrary byte, also provokes the above problem. +Namely, the byte could be invalid UTF-8 or a UTF-8 continuation byte. So that +support would need to be limited or banned when UTF-8 mode is enabled, just +like we did for `(?-u:\B)`. But thankfully our more robust approach in this +crate handles that case just fine too.) +*/ + +use crate::util::search::{Input, MatchError}; + +#[cold] +#[inline(never)] +pub(crate) fn skip_splits_fwd<T, F>( + input: &Input<'_>, + init_value: T, + match_offset: usize, + find: F, +) -> Result<Option<T>, MatchError> +where + F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>, +{ + skip_splits(true, input, init_value, match_offset, find) +} + +#[cold] +#[inline(never)] +pub(crate) fn skip_splits_rev<T, F>( + input: &Input<'_>, + init_value: T, + match_offset: usize, + find: F, +) -> Result<Option<T>, MatchError> +where + F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>, +{ + skip_splits(false, input, init_value, match_offset, find) +} + +fn skip_splits<T, F>( + forward: bool, + input: &Input<'_>, + init_value: T, + mut match_offset: usize, + mut find: F, +) -> Result<Option<T>, MatchError> +where + F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>, +{ + // If our config says to do an anchored search, then we're definitely + // done. We just need to determine whether we have a valid match or + // not. If we don't, then we're not allowed to continue, so we report + // no match. + // + // This is actually quite a subtle correctness thing. The key here is + // that if we got an empty match that splits a codepoint after doing an + // anchored search in UTF-8 mode, then that implies that we must have + // *started* the search at a location that splits a codepoint. This + // follows from the fact that if a match is reported from an anchored + // search, then the start offset of the match *must* match the start + // offset of the search. + // + // It also follows that no other non-empty match is possible. For + // example, you might write a regex like '(?:)|SOMETHING' and start its + // search in the middle of a codepoint. 
The first branch is an empty
+    // regex that will bubble up a match at the first position, and then
+    // get rejected here and report no match. But what if 'SOMETHING' could
+    // have matched? We reason that such a thing is impossible, because
+    // if it does, it must report a match that starts in the middle of a
+    // codepoint. This in turn implies that a match is reported whose span
+    // does not correspond to valid UTF-8, and this breaks the promise
+    // made when UTF-8 mode is enabled. (That promise *can* be broken, for
+    // example, by enabling UTF-8 mode but building, by hand, an NFA that
+    // produces non-empty matches that span invalid UTF-8. This is an
+    // unchecked precondition violation of UTF-8 mode, and is documented to
+    // have unspecified behavior.)
+    //
+    // I believe this actually means that if an anchored search is run, and
+    // UTF-8 mode is enabled and the start position splits a codepoint,
+    // then it is correct to immediately report no match without even
+    // executing the regex engine. But it doesn't really seem worth writing
+    // out that case in every regex engine to save a tiny bit of work in an
+    // extremely pathological case, so we just handle it here.
+    if input.get_anchored().is_anchored() {
+        return Ok(if input.is_char_boundary(match_offset) {
+            Some(init_value)
+        } else {
+            None
+        });
+    }
+    // Otherwise, we have an unanchored search, so just keep looking for
+    // matches until we have one that does not split a codepoint or we hit
+    // EOI.
+    let mut value = init_value;
+    let mut input = input.clone();
+    while !input.is_char_boundary(match_offset) {
+        if forward {
+            // The unwrap is OK here because overflowing usize while
+            // iterating over a slice is impossible, as it would require
+            // a slice of length greater than isize::MAX, which is itself
+            // impossible.
+            input.set_start(input.start().checked_add(1).unwrap());
+        } else {
+            input.set_end(match input.end().checked_sub(1) {
+                None => return Ok(None),
+                Some(end) => end,
+            });
+        }
+        match find(&input)? {
+            None => return Ok(None),
+            Some((new_value, new_match_end)) => {
+                value = new_value;
+                match_offset = new_match_end;
+            }
+        }
+    }
+    Ok(Some(value))
+}
diff --git a/vendor/regex-automata/src/util/escape.rs b/vendor/regex-automata/src/util/escape.rs
new file mode 100644
index 000000000..7f6aa15f5
--- /dev/null
+++ b/vendor/regex-automata/src/util/escape.rs
@@ -0,0 +1,84 @@
+/*!
+Provides convenience routines for escaping raw bytes.
+
+Since this crate tends to deal with `&[u8]` everywhere and the default
+`Debug` implementation just shows decimal integers, it makes debugging those
+representations quite difficult. This module provides types that show `&[u8]`
+as if it were a string, with invalid UTF-8 escaped into its byte-by-byte hex
+representation.
+*/
+
+use crate::util::utf8;
+
+/// Provides a convenient `Debug` implementation for a `u8`.
+///
+/// The `Debug` impl treats the byte as ASCII and emits a human readable
+/// representation of it. If the byte isn't ASCII, then it's emitted as a hex
+/// escape sequence.
+#[derive(Clone, Copy)]
+pub struct DebugByte(pub u8);
+
+impl core::fmt::Debug for DebugByte {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        // Special case ASCII space. It's too hard to read otherwise, so
+        // put quotes around it. I sometimes wonder whether just '\x20' would
+        // be better...
+        if self.0 == b' ' {
+            return write!(f, "' '");
+        }
+        // 10 bytes is enough to cover any output from ascii::escape_default.
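+        // (For a single byte, the longest output from ascii::escape_default
+        // is a `\xNN` escape of 4 bytes, so 10 is comfortably conservative.)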
+ let mut bytes = [0u8; 10]; + let mut len = 0; + for (i, mut b) in core::ascii::escape_default(self.0).enumerate() { + // capitalize \xab to \xAB + if i >= 2 && b'a' <= b && b <= b'f' { + b -= 32; + } + bytes[len] = b; + len += 1; + } + write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap()) + } +} + +/// Provides a convenient `Debug` implementation for `&[u8]`. +/// +/// This generally works best when the bytes are presumed to be mostly UTF-8, +/// but will work for anything. For any bytes that aren't UTF-8, they are +/// emitted as hex escape sequences. +pub struct DebugHaystack<'a>(pub &'a [u8]); + +impl<'a> core::fmt::Debug for DebugHaystack<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "\"")?; + // This is a sad re-implementation of a similar impl found in bstr. + let mut bytes = self.0; + while let Some(result) = utf8::decode(bytes) { + let ch = match result { + Ok(ch) => ch, + Err(byte) => { + write!(f, r"\x{:02x}", byte)?; + bytes = &bytes[1..]; + continue; + } + }; + bytes = &bytes[ch.len_utf8()..]; + match ch { + '\0' => write!(f, "\\0")?, + // ASCII control characters except \0, \n, \r, \t + '\x01'..='\x08' + | '\x0b' + | '\x0c' + | '\x0e'..='\x19' + | '\x7f' => { + write!(f, "\\x{:02x}", u32::from(ch))?; + } + '\n' | '\r' | '\t' | _ => { + write!(f, "{}", ch.escape_debug())?; + } + } + } + write!(f, "\"")?; + Ok(()) + } +} diff --git a/vendor/regex-automata/src/util/id.rs b/vendor/regex-automata/src/util/id.rs deleted file mode 100644 index 70bf0a93b..000000000 --- a/vendor/regex-automata/src/util/id.rs +++ /dev/null @@ -1,608 +0,0 @@ -/*! -Type definitions for identifier types. - -A [`StateID`] represents the possible set of identifiers used in regex engine -implementations in this crate. For example, they are used to identify both NFA -and DFA states. - -A [`PatternID`] represents the possible set of identifiers for patterns. All -regex engine implementations in this crate support searching for multiple -patterns simultaneously. A `PatternID` is how each pattern is uniquely -identified for a particular instance of a regex engine. Namely, a pattern is -assigned an auto-incrementing integer, starting at `0`, based on the order of -patterns supplied during the construction of the regex engine. - -These identifier types represent a way for this crate to make correctness -guarantees around the possible set of values that a `StateID` or a `PatternID` -might represent. Similarly, they also provide a way of constraining the size of -these identifiers to reduce space usage while still guaranteeing that all such -identifiers are repsentable by a `usize` for the current target. - -Moreover, the identifier types clamp the range of permissible values to a range -that is typically smaller than its internal representation. (With the maximum -value being, e.g., `StateID::MAX`.) Users of these types may not rely this -clamping for the purpose of memory safety. Users may, however, rely on these -invariants to avoid panics or other types of logic bugs. -*/ - -// Continuing from the above comment about correctness guarantees, an example -// of a way in which we use the guarantees on these types is delta encoding. -// Namely, we require that IDs can be at most 2^31 - 2, which means the -// difference between any two IDs is always representable as an i32. - -use core::{ - convert::{Infallible, TryFrom}, - mem, ops, -}; - -#[cfg(feature = "alloc")] -use alloc::vec::Vec; - -/// An identifier for a regex pattern. 
-/// -/// The identifier for a pattern corresponds to its relative position among -/// other patterns in a single finite state machine. Namely, when building -/// a multi-pattern regex engine, one must supply a sequence of patterns to -/// match. The position (starting at 0) of each pattern in that sequence -/// represents its identifier. This identifier is in turn used to identify and -/// report matches of that pattern in various APIs. -/// -/// A pattern ID is guaranteed to be representable by a `usize`. Similarly, -/// the number of patterns in any regex engine in this crate is guaranteed to -/// be representable by a `usize`. This applies to regex engines that have -/// been deserialized; a deserialization error will be returned if it contains -/// pattern IDs that violate these requirements in your current environment. -/// -/// For extra convenience in some cases, this type also guarantees that all -/// IDs can fit into an `i32` and an `isize` without overflowing. -/// -/// # Representation -/// -/// This type is always represented internally by a `u32` and is marked as -/// `repr(transparent)`. Thus, this type always has the same representation as -/// a `u32`. -/// -/// # Indexing -/// -/// For convenience, callers may use a `PatternID` to index slices. -/// -/// # Safety -/// -/// While a `PatternID` is meant to guarantee that its value fits into `usize` -/// (while using a possibly smaller representation than `usize` on some -/// targets), callers must not rely on this property for safety. Callers may -/// choose to rely on this property for correctness however. -#[repr(transparent)] -#[derive( - Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, -)] -pub struct PatternID(u32); - -impl PatternID { - /// The maximum pattern ID value, represented as a `usize`. - #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] - pub const MAX: PatternID = - PatternID::new_unchecked(core::i32::MAX as usize - 1); - - /// The maximum pattern ID value, represented as a `usize`. - #[cfg(target_pointer_width = "16")] - pub const MAX: PatternID = PatternID::new_unchecked(core::isize::MAX - 1); - - /// The total number of patterns that are allowed in any single regex - /// engine. - pub const LIMIT: usize = PatternID::MAX.as_usize() + 1; - - /// The zero pattern ID value. - pub const ZERO: PatternID = PatternID::new_unchecked(0); - - /// The number of bytes that a single `PatternID` uses in memory. - pub const SIZE: usize = core::mem::size_of::<PatternID>(); - - /// Create a new pattern ID. - /// - /// If the given identifier exceeds [`PatternID::MAX`], then this returns - /// an error. - #[inline] - pub fn new(id: usize) -> Result<PatternID, PatternIDError> { - PatternID::try_from(id) - } - - /// Create a new pattern ID without checking whether the given value - /// exceeds [`PatternID::MAX`]. - /// - /// While this is unchecked, providing an incorrect value must never - /// sacrifice memory safety, as documented above. - #[inline] - pub const fn new_unchecked(id: usize) -> PatternID { - PatternID(id as u32) - } - - /// Like [`PatternID::new`], but panics if the given ID is not valid. - #[inline] - pub fn must(id: usize) -> PatternID { - PatternID::new(id).unwrap() - } - - /// Return this pattern ID as a `usize`. - #[inline] - pub const fn as_usize(&self) -> usize { - self.0 as usize - } - - /// Return the internal u32 of this pattern ID. 
- #[inline] - pub const fn as_u32(&self) -> u32 { - self.0 - } - - /// Return the internal u32 of this pattern ID represented as an i32. - /// - /// This is guaranteed to never overflow an `i32`. - #[inline] - pub const fn as_i32(&self) -> i32 { - self.0 as i32 - } - - /// Returns one more than this pattern ID as a usize. - /// - /// Since a pattern ID has constraints on its maximum value, adding `1` to - /// it will always fit in a `usize` (and a `u32`). - #[inline] - pub fn one_more(&self) -> usize { - self.as_usize().checked_add(1).unwrap() - } - - /// Decode this pattern ID from the bytes given using the native endian - /// byte order for the current target. - /// - /// If the decoded integer is not representable as a pattern ID for the - /// current target, then this returns an error. - #[inline] - pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<PatternID, PatternIDError> { - let id = u32::from_ne_bytes(bytes); - if id > PatternID::MAX.as_u32() { - return Err(PatternIDError { attempted: id as u64 }); - } - Ok(PatternID::new_unchecked(id as usize)) - } - - /// Decode this pattern ID from the bytes given using the native endian - /// byte order for the current target. - /// - /// This is analogous to [`PatternID::new_unchecked`] in that is does not - /// check whether the decoded integer is representable as a pattern ID. - #[inline] - pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> PatternID { - PatternID::new_unchecked(u32::from_ne_bytes(bytes) as usize) - } - - /// Return the underlying pattern ID integer as raw bytes in native endian - /// format. - #[inline] - pub fn to_ne_bytes(&self) -> [u8; 4] { - self.0.to_ne_bytes() - } - - /// Returns an iterator over all pattern IDs from 0 up to and not including - /// the given length. - /// - /// If the given length exceeds [`PatternID::LIMIT`], then this panics. - #[cfg(feature = "alloc")] - pub(crate) fn iter(len: usize) -> PatternIDIter { - PatternIDIter::new(len) - } -} - -/// This error occurs when a pattern ID could not be constructed. -/// -/// This occurs when given an integer exceeding the maximum pattern ID value. -/// -/// When the `std` feature is enabled, this implements the `Error` trait. -#[derive(Clone, Debug, Eq, PartialEq)] -pub struct PatternIDError { - attempted: u64, -} - -impl PatternIDError { - /// Returns the value that failed to constructed a pattern ID. - pub fn attempted(&self) -> u64 { - self.attempted - } -} - -#[cfg(feature = "std")] -impl std::error::Error for PatternIDError {} - -impl core::fmt::Display for PatternIDError { - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - write!( - f, - "failed to create PatternID from {:?}, which exceeds {:?}", - self.attempted(), - PatternID::MAX, - ) - } -} - -/// An identifier for a state in a regex engine. -/// -/// A state ID is guaranteed to be representable by a `usize`. Similarly, the -/// number of states in any regex engine in this crate is guaranteed to be -/// representable by a `usize`. This applies to regex engines that have been -/// deserialized; a deserialization error will be returned if it contains state -/// IDs that violate these requirements in your current environment. -/// -/// For extra convenience in some cases, this type also guarantees that all -/// IDs can fit into an `i32` and an `isize` without overflowing. -/// -/// # Representation -/// -/// This type is always represented internally by a `u32` and is marked as -/// `repr(transparent)`. Thus, this type always has the same representation as -/// a `u32`. 
-/// -/// # Indexing -/// -/// For convenience, callers may use a `StateID` to index slices. -/// -/// # Safety -/// -/// While a `StateID` is meant to guarantee that its value fits into `usize` -/// (while using a possibly smaller representation than `usize` on some -/// targets), callers must not rely on this property for safety. Callers may -/// choose to rely on this property for correctness however. -#[repr(transparent)] -#[derive( - Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, -)] -pub struct StateID(u32); - -impl StateID { - /// The maximum state ID value. - #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] - pub const MAX: StateID = - StateID::new_unchecked(core::i32::MAX as usize - 1); - - /// The maximum state ID value. - #[cfg(target_pointer_width = "16")] - pub const MAX: StateID = StateID::new_unchecked(core::isize::MAX - 1); - - /// The total number of states that are allowed in any single regex - /// engine, represented as a `usize`. - pub const LIMIT: usize = StateID::MAX.as_usize() + 1; - - /// The zero state ID value. - pub const ZERO: StateID = StateID::new_unchecked(0); - - /// The number of bytes that a single `StateID` uses in memory. - pub const SIZE: usize = core::mem::size_of::<StateID>(); - - /// Create a new state ID. - /// - /// If the given identifier exceeds [`StateID::MAX`], then this returns - /// an error. - #[inline] - pub fn new(id: usize) -> Result<StateID, StateIDError> { - StateID::try_from(id) - } - - /// Create a new state ID without checking whether the given value - /// exceeds [`StateID::MAX`]. - /// - /// While this is unchecked, providing an incorrect value must never - /// sacrifice memory safety, as documented above. - #[inline] - pub const fn new_unchecked(id: usize) -> StateID { - StateID(id as u32) - } - - /// Like [`StateID::new`], but panics if the given ID is not valid. - #[inline] - pub fn must(id: usize) -> StateID { - StateID::new(id).unwrap() - } - - /// Return this state ID as a `usize`. - #[inline] - pub const fn as_usize(&self) -> usize { - self.0 as usize - } - - /// Return the internal u32 of this state ID. - #[inline] - pub const fn as_u32(&self) -> u32 { - self.0 - } - - /// Return the internal u32 of this pattern ID represented as an i32. - /// - /// This is guaranteed to never overflow an `i32`. - #[inline] - pub const fn as_i32(&self) -> i32 { - self.0 as i32 - } - - /// Returns one more than this state ID as a usize. - /// - /// Since a state ID has constraints on its maximum value, adding `1` to - /// it will always fit in a `usize` (and a `u32`). - #[inline] - pub fn one_more(&self) -> usize { - self.as_usize().checked_add(1).unwrap() - } - - /// Decode this state ID from the bytes given using the native endian byte - /// order for the current target. - /// - /// If the decoded integer is not representable as a state ID for the - /// current target, then this returns an error. - #[inline] - pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<StateID, StateIDError> { - let id = u32::from_ne_bytes(bytes); - if id > StateID::MAX.as_u32() { - return Err(StateIDError { attempted: id as u64 }); - } - Ok(StateID::new_unchecked(id as usize)) - } - - /// Decode this state ID from the bytes given using the native endian - /// byte order for the current target. - /// - /// This is analogous to [`StateID::new_unchecked`] in that is does not - /// check whether the decoded integer is representable as a state ID. 
- #[inline] - pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> StateID { - StateID::new_unchecked(u32::from_ne_bytes(bytes) as usize) - } - - /// Return the underlying state ID integer as raw bytes in native endian - /// format. - #[inline] - pub fn to_ne_bytes(&self) -> [u8; 4] { - self.0.to_ne_bytes() - } - - /// Returns an iterator over all state IDs from 0 up to and not including - /// the given length. - /// - /// If the given length exceeds [`StateID::LIMIT`], then this panics. - #[cfg(feature = "alloc")] - pub(crate) fn iter(len: usize) -> StateIDIter { - StateIDIter::new(len) - } -} - -/// This error occurs when a state ID could not be constructed. -/// -/// This occurs when given an integer exceeding the maximum state ID value. -/// -/// When the `std` feature is enabled, this implements the `Error` trait. -#[derive(Clone, Debug, Eq, PartialEq)] -pub struct StateIDError { - attempted: u64, -} - -impl StateIDError { - /// Returns the value that failed to constructed a state ID. - pub fn attempted(&self) -> u64 { - self.attempted - } -} - -#[cfg(feature = "std")] -impl std::error::Error for StateIDError {} - -impl core::fmt::Display for StateIDError { - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - write!( - f, - "failed to create StateID from {:?}, which exceeds {:?}", - self.attempted(), - StateID::MAX, - ) - } -} - -/// A macro for defining exactly identical (modulo names) impls for ID types. -macro_rules! impls { - ($ty:ident, $tyerr:ident, $tyiter:ident) => { - #[derive(Clone, Debug)] - pub(crate) struct $tyiter { - rng: ops::Range<usize>, - } - - impl $tyiter { - #[cfg(feature = "alloc")] - fn new(len: usize) -> $tyiter { - assert!( - len <= $ty::LIMIT, - "cannot create iterator with IDs when number of \ - elements exceed {:?}", - $ty::LIMIT, - ); - $tyiter { rng: 0..len } - } - } - - impl Iterator for $tyiter { - type Item = $ty; - - fn next(&mut self) -> Option<$ty> { - if self.rng.start >= self.rng.end { - return None; - } - let next_id = self.rng.start + 1; - let id = mem::replace(&mut self.rng.start, next_id); - // new_unchecked is OK since we asserted that the number of - // elements in this iterator will fit in an ID at construction. 
- Some($ty::new_unchecked(id)) - } - } - - impl<T> core::ops::Index<$ty> for [T] { - type Output = T; - - #[inline] - fn index(&self, index: $ty) -> &T { - &self[index.as_usize()] - } - } - - impl<T> core::ops::IndexMut<$ty> for [T] { - #[inline] - fn index_mut(&mut self, index: $ty) -> &mut T { - &mut self[index.as_usize()] - } - } - - #[cfg(feature = "alloc")] - impl<T> core::ops::Index<$ty> for Vec<T> { - type Output = T; - - #[inline] - fn index(&self, index: $ty) -> &T { - &self[index.as_usize()] - } - } - - #[cfg(feature = "alloc")] - impl<T> core::ops::IndexMut<$ty> for Vec<T> { - #[inline] - fn index_mut(&mut self, index: $ty) -> &mut T { - &mut self[index.as_usize()] - } - } - - impl TryFrom<usize> for $ty { - type Error = $tyerr; - - fn try_from(id: usize) -> Result<$ty, $tyerr> { - if id > $ty::MAX.as_usize() { - return Err($tyerr { attempted: id as u64 }); - } - Ok($ty::new_unchecked(id)) - } - } - - impl TryFrom<u8> for $ty { - type Error = Infallible; - - fn try_from(id: u8) -> Result<$ty, Infallible> { - Ok($ty::new_unchecked(id as usize)) - } - } - - impl TryFrom<u16> for $ty { - type Error = $tyerr; - - fn try_from(id: u16) -> Result<$ty, $tyerr> { - if id as u32 > $ty::MAX.as_u32() { - return Err($tyerr { attempted: id as u64 }); - } - Ok($ty::new_unchecked(id as usize)) - } - } - - impl TryFrom<u32> for $ty { - type Error = $tyerr; - - fn try_from(id: u32) -> Result<$ty, $tyerr> { - if id > $ty::MAX.as_u32() { - return Err($tyerr { attempted: id as u64 }); - } - Ok($ty::new_unchecked(id as usize)) - } - } - - impl TryFrom<u64> for $ty { - type Error = $tyerr; - - fn try_from(id: u64) -> Result<$ty, $tyerr> { - if id > $ty::MAX.as_u32() as u64 { - return Err($tyerr { attempted: id }); - } - Ok($ty::new_unchecked(id as usize)) - } - } - - #[cfg(test)] - impl quickcheck::Arbitrary for $ty { - fn arbitrary(gen: &mut quickcheck::Gen) -> $ty { - use core::cmp::max; - - let id = max(i32::MIN + 1, i32::arbitrary(gen)).abs(); - if id > $ty::MAX.as_i32() { - $ty::MAX - } else { - $ty::new(usize::try_from(id).unwrap()).unwrap() - } - } - } - }; -} - -impls!(PatternID, PatternIDError, PatternIDIter); -impls!(StateID, StateIDError, StateIDIter); - -/// A utility trait that defines a couple of adapters for making it convenient -/// to access indices as ID types. We require ExactSizeIterator so that -/// iterator construction can do a single check to make sure the index of each -/// element is representable by its ID type. -#[cfg(feature = "alloc")] -pub(crate) trait IteratorIDExt: Iterator { - fn with_pattern_ids(self) -> WithPatternIDIter<Self> - where - Self: Sized + ExactSizeIterator, - { - WithPatternIDIter::new(self) - } - - fn with_state_ids(self) -> WithStateIDIter<Self> - where - Self: Sized + ExactSizeIterator, - { - WithStateIDIter::new(self) - } -} - -#[cfg(feature = "alloc")] -impl<I: Iterator> IteratorIDExt for I {} - -#[cfg(feature = "alloc")] -macro_rules! iditer { - ($ty:ident, $iterty:ident, $withiterty:ident) => { - /// An iterator adapter that is like std::iter::Enumerate, but attaches - /// IDs. It requires ExactSizeIterator. At construction, it ensures - /// that the index of each element in the iterator is representable in - /// the corresponding ID type. 
- #[derive(Clone, Debug)] - pub(crate) struct $withiterty<I> { - it: I, - ids: $iterty, - } - - impl<I: Iterator + ExactSizeIterator> $withiterty<I> { - fn new(it: I) -> $withiterty<I> { - let ids = $ty::iter(it.len()); - $withiterty { it, ids } - } - } - - impl<I: Iterator + ExactSizeIterator> Iterator for $withiterty<I> { - type Item = ($ty, I::Item); - - fn next(&mut self) -> Option<($ty, I::Item)> { - let item = self.it.next()?; - // Number of elements in this iterator must match, according - // to contract of ExactSizeIterator. - let id = self.ids.next().unwrap(); - Some((id, item)) - } - } - }; -} - -#[cfg(feature = "alloc")] -iditer!(PatternID, PatternIDIter, WithPatternIDIter); -#[cfg(feature = "alloc")] -iditer!(StateID, StateIDIter, WithStateIDIter); diff --git a/vendor/regex-automata/src/util/int.rs b/vendor/regex-automata/src/util/int.rs new file mode 100644 index 000000000..e6b13bff9 --- /dev/null +++ b/vendor/regex-automata/src/util/int.rs @@ -0,0 +1,252 @@ +/*! +This module provides several integer oriented traits for converting between +both fixed size integers and integers whose size varies based on the target +(like `usize`). + +The driving design principle of this module is to attempt to centralize as many +`as` casts as possible here. And in particular, we separate casts into two +buckets: + +* Casts that we use for their truncating behavior. In this case, we use more +descriptive names, like `low_u32` and `high_u32`. +* Casts that we use for converting back-and-forth between `usize`. These +conversions are generally necessary because we often store indices in different +formats to save on memory, which requires converting to and from `usize`. In +this case, we very specifically do not want to overflow, and so the methods +defined here will panic if the `as` cast would be lossy in debug mode. (A +normal `as` cast will never panic!) + +For `as` casts between raw pointers, we use `cast`, so `as` isn't needed there. + +For regex engines, floating point is just never used, so we don't have to worry +about `as` casts for those. + +Otherwise, this module pretty much covers all of our `as` needs except for one +thing: const contexts. There are a select few places in this crate where we +still need to use `as` because const functions on traits aren't stable yet. +If we wind up significantly expanding our const footprint in this crate, it +might be worth defining free functions to handle those cases. But at the time +of writing, that just seemed like too much ceremony. Instead, I comment each +such use of `as` in a const context with a "fixme" notice. + +NOTE: for simplicity, we don't take target pointer width into account here for +`usize` conversions. Since we currently only panic in debug mode, skipping the +check when it can be proven it isn't needed at compile time doesn't really +matter. Now, if we wind up wanting to do as many checks as possible in release +mode, then we would want to skip those when we know the conversions are always +non-lossy. + +NOTE: this module isn't an exhaustive API. For example, we still use things +like `u64::from` where possible, or even `usize::try_from()` for when we do +explicitly want to panic or when we want to return an error for overflow. 
+*/ + +pub(crate) trait U8 { + fn as_usize(self) -> usize; +} + +impl U8 for u8 { + fn as_usize(self) -> usize { + usize::from(self) + } +} + +pub(crate) trait U16 { + fn as_usize(self) -> usize; + fn low_u8(self) -> u8; + fn high_u8(self) -> u8; +} + +impl U16 for u16 { + fn as_usize(self) -> usize { + usize::from(self) + } + + fn low_u8(self) -> u8 { + self as u8 + } + + fn high_u8(self) -> u8 { + (self >> 8) as u8 + } +} + +pub(crate) trait U32 { + fn as_usize(self) -> usize; + fn low_u8(self) -> u8; + fn low_u16(self) -> u16; + fn high_u16(self) -> u16; +} + +impl U32 for u32 { + fn as_usize(self) -> usize { + #[cfg(debug_assertions)] + { + usize::try_from(self).expect("u32 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn low_u8(self) -> u8 { + self as u8 + } + + fn low_u16(self) -> u16 { + self as u16 + } + + fn high_u16(self) -> u16 { + (self >> 16) as u16 + } +} + +pub(crate) trait U64 { + fn as_usize(self) -> usize; + fn low_u8(self) -> u8; + fn low_u16(self) -> u16; + fn low_u32(self) -> u32; + fn high_u32(self) -> u32; +} + +impl U64 for u64 { + fn as_usize(self) -> usize { + #[cfg(debug_assertions)] + { + usize::try_from(self).expect("u64 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn low_u8(self) -> u8 { + self as u8 + } + + fn low_u16(self) -> u16 { + self as u16 + } + + fn low_u32(self) -> u32 { + self as u32 + } + + fn high_u32(self) -> u32 { + (self >> 32) as u32 + } +} + +pub(crate) trait I32 { + fn as_usize(self) -> usize; + fn to_bits(self) -> u32; + fn from_bits(n: u32) -> i32; +} + +impl I32 for i32 { + fn as_usize(self) -> usize { + #[cfg(debug_assertions)] + { + usize::try_from(self).expect("i32 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn to_bits(self) -> u32 { + self as u32 + } + + fn from_bits(n: u32) -> i32 { + n as i32 + } +} + +pub(crate) trait Usize { + fn as_u8(self) -> u8; + fn as_u16(self) -> u16; + fn as_u32(self) -> u32; + fn as_u64(self) -> u64; +} + +impl Usize for usize { + fn as_u8(self) -> u8 { + #[cfg(debug_assertions)] + { + u8::try_from(self).expect("usize overflowed u8") + } + #[cfg(not(debug_assertions))] + { + self as u8 + } + } + + fn as_u16(self) -> u16 { + #[cfg(debug_assertions)] + { + u16::try_from(self).expect("usize overflowed u16") + } + #[cfg(not(debug_assertions))] + { + self as u16 + } + } + + fn as_u32(self) -> u32 { + #[cfg(debug_assertions)] + { + u32::try_from(self).expect("usize overflowed u32") + } + #[cfg(not(debug_assertions))] + { + self as u32 + } + } + + fn as_u64(self) -> u64 { + #[cfg(debug_assertions)] + { + u64::try_from(self).expect("usize overflowed u64") + } + #[cfg(not(debug_assertions))] + { + self as u64 + } + } +} + +// Pointers aren't integers, but we convert pointers to integers to perform +// offset arithmetic in some places. (And no, we don't convert the integers +// back to pointers.) So add 'as_usize' conversions here too for completeness. +// +// These 'as' casts are actually okay because they're always non-lossy. But the +// idea here is to just try and remove as much 'as' as possible, particularly +// in this crate where we are being really paranoid about offsets and making +// sure we don't panic on inputs that might be untrusted. This way, the 'as' +// casts become easier to audit if they're all in one place, even when some of +// them are actually okay 100% of the time. 
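Editorial aside (not part of this diff): to make the comment above concrete, here is a small sketch of the kind of pointer-to-`usize` offset arithmetic being described. The helper name is invented for illustration and is not a crate API.

```rust
// Hypothetical helper: measure how far a sub-slice starts from the beginning
// of its parent slice by comparing pointers as usize. The integers are never
// converted back into pointers.
fn offset_from_start(parent: &[u8], sub: &[u8]) -> usize {
    (sub.as_ptr() as usize) - (parent.as_ptr() as usize)
}

fn main() {
    let haystack = b"hello world";
    let needle = &haystack[6..];
    assert_eq!(offset_from_start(haystack, needle), 6);
}
```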
+
+pub(crate) trait Pointer {
+    fn as_usize(self) -> usize;
+}
+
+impl<T> Pointer for *const T {
+    fn as_usize(self) -> usize {
+        self as usize
+    }
+}
+
+pub(crate) trait PointerMut {
+    fn as_usize(self) -> usize;
+}
+
+impl<T> PointerMut for *mut T {
+    fn as_usize(self) -> usize {
+        self as usize
+    }
+}
diff --git a/vendor/regex-automata/src/util/interpolate.rs b/vendor/regex-automata/src/util/interpolate.rs
new file mode 100644
index 000000000..f274629df
--- /dev/null
+++ b/vendor/regex-automata/src/util/interpolate.rs
@@ -0,0 +1,579 @@
+/*!
+Provides routines for interpolating capture group references.
+
+That is, if a replacement string contains references like `$foo` or `${foo1}`,
+then they are replaced with the corresponding capture values for the groups
+named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}`
+is supported as well, with `1` corresponding to a capture group index and not
+a name.
+
+This module provides the free functions [`string`] and [`bytes`], which
+interpolate Rust Unicode strings and byte strings, respectively.
+
+# Format
+
+These routines support two different kinds of capture references: unbraced and
+braced.
+
+For the unbraced format, the format supported is `$ref` where `ref` can be any
+non-empty sequence of characters from the class `[0-9A-Za-z_]`. `ref` is
+always the longest possible parse. So for example, `$1a` corresponds to the
+capture group named `1a` and not the capture group at index `1`. If `ref`
+matches `^[0-9]+$`, then it is treated as a capture group index itself and not
+a name.
+
+For the braced format, the format supported is `${ref}` where `ref` can be any
+sequence of bytes except for `}`. If no closing brace occurs, then it is not
+considered a capture reference. As with the unbraced format, if `ref` matches
+`^[0-9]+$`, then it is treated as a capture group index and not a name.
+
+The braced format is useful for exerting precise control over the name of the
+capture reference. For example, `${1}a` corresponds to the capture group
+reference `1` followed by the letter `a`, whereas `$1a` (as mentioned above)
+corresponds to the capture group reference `1a`. The braced format is also
+useful for expressing capture group names that use characters not supported by
+the unbraced format. For example, `${foo[bar].baz}` refers to the capture group
+named `foo[bar].baz`.
+
+If a capture group reference is found and it does not refer to a valid capture
+group, then it will be replaced with the empty string.
+
+To write a literal `$`, use `$$`.
+
+To be clear, and as exhibited via the type signatures in the routines in this
+module, it is impossible for a replacement string to be invalid. A replacement
+string may not have the intended semantics, but the interpolation procedure
+itself can never fail.
+*/
+
+use alloc::{string::String, vec::Vec};
+
+use crate::util::memchr::memchr;
+
+/// Accepts a replacement string and interpolates capture references with their
+/// corresponding values.
+///
+/// `append` should be a function that appends the string value of a capture
+/// group at a particular index to the string given. If the capture group
+/// index is invalid, then nothing should be appended.
+///
+/// `name_to_index` should be a function that maps a capture group name to a
+/// capture group index. If the given name doesn't exist, then `None` should
+/// be returned.
+///
+/// Finally, `dst` is where the final interpolated contents should be written.
+/// If `replacement` contains no capture group references, then `dst` will be +/// equivalent to `replacement`. +/// +/// See the [module documentation](self) for details about the format +/// supported. +/// +/// # Example +/// +/// ``` +/// use regex_automata::util::interpolate; +/// +/// let mut dst = String::new(); +/// interpolate::string( +/// "foo $bar baz", +/// |index, dst| { +/// if index == 0 { +/// dst.push_str("BAR"); +/// } +/// }, +/// |name| { +/// if name == "bar" { +/// Some(0) +/// } else { +/// None +/// } +/// }, +/// &mut dst, +/// ); +/// assert_eq!("foo BAR baz", dst); +/// ``` +pub fn string( + mut replacement: &str, + mut append: impl FnMut(usize, &mut String), + mut name_to_index: impl FnMut(&str) -> Option<usize>, + dst: &mut String, +) { + while !replacement.is_empty() { + match memchr(b'$', replacement.as_bytes()) { + None => break, + Some(i) => { + dst.push_str(&replacement[..i]); + replacement = &replacement[i..]; + } + } + // Handle escaping of '$'. + if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') { + dst.push_str("$"); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement.as_bytes()) { + Some(cap_ref) => cap_ref, + None => { + dst.push_str("$"); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => append(i, dst), + Ref::Named(name) => { + if let Some(i) = name_to_index(name) { + append(i, dst); + } + } + } + } + dst.push_str(replacement); +} + +/// Accepts a replacement byte string and interpolates capture references with +/// their corresponding values. +/// +/// `append` should be a function that appends the byte string value of a +/// capture group at a particular index to the byte string given. If the +/// capture group index is invalid, then nothing should be appended. +/// +/// `name_to_index` should be a function that maps a capture group name to a +/// capture group index. If the given name doesn't exist, then `None` should +/// be returned. +/// +/// Finally, `dst` is where the final interpolated contents should be written. +/// If `replacement` contains no capture group references, then `dst` will be +/// equivalent to `replacement`. +/// +/// See the [module documentation](self) for details about the format +/// supported. +/// +/// # Example +/// +/// ``` +/// use regex_automata::util::interpolate; +/// +/// let mut dst = vec![]; +/// interpolate::bytes( +/// b"foo $bar baz", +/// |index, dst| { +/// if index == 0 { +/// dst.extend_from_slice(b"BAR"); +/// } +/// }, +/// |name| { +/// if name == "bar" { +/// Some(0) +/// } else { +/// None +/// } +/// }, +/// &mut dst, +/// ); +/// assert_eq!(&b"foo BAR baz"[..], dst); +/// ``` +pub fn bytes( + mut replacement: &[u8], + mut append: impl FnMut(usize, &mut Vec<u8>), + mut name_to_index: impl FnMut(&str) -> Option<usize>, + dst: &mut Vec<u8>, +) { + while !replacement.is_empty() { + match memchr(b'$', replacement) { + None => break, + Some(i) => { + dst.extend_from_slice(&replacement[..i]); + replacement = &replacement[i..]; + } + } + // Handle escaping of '$'. 
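+        // That is, a literal `$$` in the replacement emits a single `$`
+        // and both bytes are consumed.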
+ if replacement.get(1).map_or(false, |&b| b == b'$') { + dst.push(b'$'); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement) { + Some(cap_ref) => cap_ref, + None => { + dst.push(b'$'); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => append(i, dst), + Ref::Named(name) => { + if let Some(i) = name_to_index(name) { + append(i, dst); + } + } + } + } + dst.extend_from_slice(replacement); +} + +/// `CaptureRef` represents a reference to a capture group inside some text. +/// The reference is either a capture group name or a number. +/// +/// It is also tagged with the position in the text following the +/// capture reference. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct CaptureRef<'a> { + cap: Ref<'a>, + end: usize, +} + +/// A reference to a capture group in some text. +/// +/// e.g., `$2`, `$foo`, `${foo}`. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum Ref<'a> { + Named(&'a str), + Number(usize), +} + +impl<'a> From<&'a str> for Ref<'a> { + fn from(x: &'a str) -> Ref<'a> { + Ref::Named(x) + } +} + +impl From<usize> for Ref<'static> { + fn from(x: usize) -> Ref<'static> { + Ref::Number(x) + } +} + +/// Parses a possible reference to a capture group name in the given text, +/// starting at the beginning of `replacement`. +/// +/// If no such valid reference could be found, None is returned. +/// +/// Note that this returns a "possible" reference because this routine doesn't +/// know whether the reference is to a valid group or not. If it winds up not +/// being a valid reference, then it should be replaced with the empty string. +fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> { + let mut i = 0; + let rep: &[u8] = replacement; + if rep.len() <= 1 || rep[0] != b'$' { + return None; + } + i += 1; + if rep[i] == b'{' { + return find_cap_ref_braced(rep, i + 1); + } + let mut cap_end = i; + while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) { + cap_end += 1; + } + if cap_end == i { + return None; + } + // We just verified that the range 0..cap_end is valid ASCII, so it must + // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 + // check via an unchecked conversion or by parsing the number straight from + // &[u8]. + let cap = core::str::from_utf8(&rep[i..cap_end]) + .expect("valid UTF-8 capture name"); + Some(CaptureRef { + cap: match cap.parse::<usize>() { + Ok(i) => Ref::Number(i), + Err(_) => Ref::Named(cap), + }, + end: cap_end, + }) +} + +/// Looks for a braced reference, e.g., `${foo1}`. This assumes that an opening +/// brace has been found at `i-1` in `rep`. This then looks for a closing +/// brace and returns the capture reference within the brace. +fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> { + assert_eq!(b'{', rep[i.checked_sub(1).unwrap()]); + let start = i; + while rep.get(i).map_or(false, |&b| b != b'}') { + i += 1; + } + if !rep.get(i).map_or(false, |&b| b == b'}') { + return None; + } + // When looking at braced names, we don't put any restrictions on the name, + // so it's possible it could be invalid UTF-8. But a capture group name + // can never be invalid UTF-8, so if we have invalid UTF-8, then we can + // safely return None. 
+ let cap = match core::str::from_utf8(&rep[start..i]) { + Err(_) => return None, + Ok(cap) => cap, + }; + Some(CaptureRef { + cap: match cap.parse::<usize>() { + Ok(i) => Ref::Number(i), + Err(_) => Ref::Named(cap), + }, + end: i + 1, + }) +} + +/// Returns true if and only if the given byte is allowed in a capture name +/// written in non-brace form. +fn is_valid_cap_letter(b: u8) -> bool { + match b { + b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true, + _ => false, + } +} + +#[cfg(test)] +mod tests { + use alloc::{string::String, vec, vec::Vec}; + + use super::{find_cap_ref, CaptureRef}; + + macro_rules! find { + ($name:ident, $text:expr) => { + #[test] + fn $name() { + assert_eq!(None, find_cap_ref($text.as_bytes())); + } + }; + ($name:ident, $text:expr, $capref:expr) => { + #[test] + fn $name() { + assert_eq!(Some($capref), find_cap_ref($text.as_bytes())); + } + }; + } + + macro_rules! c { + ($name_or_number:expr, $pos:expr) => { + CaptureRef { cap: $name_or_number.into(), end: $pos } + }; + } + + find!(find_cap_ref1, "$foo", c!("foo", 4)); + find!(find_cap_ref2, "${foo}", c!("foo", 6)); + find!(find_cap_ref3, "$0", c!(0, 2)); + find!(find_cap_ref4, "$5", c!(5, 2)); + find!(find_cap_ref5, "$10", c!(10, 3)); + // See https://github.com/rust-lang/regex/pull/585 + // for more on characters following numbers + find!(find_cap_ref6, "$42a", c!("42a", 4)); + find!(find_cap_ref7, "${42}a", c!(42, 5)); + find!(find_cap_ref8, "${42"); + find!(find_cap_ref9, "${42 "); + find!(find_cap_ref10, " $0 "); + find!(find_cap_ref11, "$"); + find!(find_cap_ref12, " "); + find!(find_cap_ref13, ""); + find!(find_cap_ref14, "$1-$2", c!(1, 2)); + find!(find_cap_ref15, "$1_$2", c!("1_", 3)); + find!(find_cap_ref16, "$x-$y", c!("x", 2)); + find!(find_cap_ref17, "$x_$y", c!("x_", 3)); + find!(find_cap_ref18, "${#}", c!("#", 4)); + find!(find_cap_ref19, "${Z[}", c!("Z[", 5)); + find!(find_cap_ref20, "${¾}", c!("¾", 5)); + find!(find_cap_ref21, "${¾a}", c!("¾a", 6)); + find!(find_cap_ref22, "${a¾}", c!("a¾", 6)); + find!(find_cap_ref23, "${☃}", c!("☃", 6)); + find!(find_cap_ref24, "${a☃}", c!("a☃", 7)); + find!(find_cap_ref25, "${☃a}", c!("☃a", 7)); + find!(find_cap_ref26, "${名字}", c!("名字", 9)); + + fn interpolate_string( + mut name_to_index: Vec<(&'static str, usize)>, + caps: Vec<&'static str>, + replacement: &str, + ) -> String { + name_to_index.sort_by_key(|x| x.0); + + let mut dst = String::new(); + super::string( + replacement, + |i, dst| { + if let Some(&s) = caps.get(i) { + dst.push_str(s); + } + }, + |name| -> Option<usize> { + name_to_index + .binary_search_by_key(&name, |x| x.0) + .ok() + .map(|i| name_to_index[i].1) + }, + &mut dst, + ); + dst + } + + fn interpolate_bytes( + mut name_to_index: Vec<(&'static str, usize)>, + caps: Vec<&'static str>, + replacement: &str, + ) -> String { + name_to_index.sort_by_key(|x| x.0); + + let mut dst = vec![]; + super::bytes( + replacement.as_bytes(), + |i, dst| { + if let Some(&s) = caps.get(i) { + dst.extend_from_slice(s.as_bytes()); + } + }, + |name| -> Option<usize> { + name_to_index + .binary_search_by_key(&name, |x| x.0) + .ok() + .map(|i| name_to_index[i].1) + }, + &mut dst, + ); + String::from_utf8(dst).unwrap() + } + + macro_rules! 
interp { + ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => { + #[test] + fn $name() { + assert_eq!( + $expected, + interpolate_string($map, $caps, $hay), + "interpolate::string failed", + ); + assert_eq!( + $expected, + interpolate_bytes($map, $caps, $hay), + "interpolate::bytes failed", + ); + } + }; + } + + interp!( + interp1, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $foo test", + "test xxx test", + ); + + interp!( + interp2, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test$footest", + "test", + ); + + interp!( + interp3, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test${foo}test", + "testxxxtest", + ); + + interp!( + interp4, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test$2test", + "test", + ); + + interp!( + interp5, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test${2}test", + "testxxxtest", + ); + + interp!( + interp6, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $$foo test", + "test $foo test", + ); + + interp!( + interp7, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $foo", + "test xxx", + ); + + interp!( + interp8, + vec![("foo", 2)], + vec!["", "", "xxx"], + "$foo test", + "xxx test", + ); + + interp!( + interp9, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test $bar$foo", + "test yyyxxx", + ); + + interp!( + interp10, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test $ test", + "test $ test", + ); + + interp!( + interp11, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${} test", + "test test", + ); + + interp!( + interp12, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${ } test", + "test test", + ); + + interp!( + interp13, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${a b} test", + "test test", + ); + + interp!( + interp14, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${a} test", + "test test", + ); + + // This is a funny case where a braced reference is never closed, but + // within the unclosed braced reference, there is an unbraced reference. + // In this case, the braced reference is just treated literally and the + // unbraced reference is found. + interp!( + interp15, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${wat $bar ok", + "test ${wat yyy ok", + ); +} diff --git a/vendor/regex-automata/src/util/iter.rs b/vendor/regex-automata/src/util/iter.rs new file mode 100644 index 000000000..a789fa042 --- /dev/null +++ b/vendor/regex-automata/src/util/iter.rs @@ -0,0 +1,1027 @@ +/*! +Generic helpers for iteration of matches from a regex engine in a haystack. + +The principle type in this module is a [`Searcher`]. A `Searcher` provides +its own lower level iterator-like API in addition to methods for constructing +types that implement `Iterator`. The documentation for `Searcher` explains a +bit more about why these different APIs exist. + +Currently, this module supports iteration over any regex engine that works +with the [`HalfMatch`], [`Match`] or [`Captures`] types. +*/ + +#[cfg(feature = "alloc")] +use crate::util::captures::Captures; +use crate::util::search::{HalfMatch, Input, Match, MatchError}; + +/// A searcher for creating iterators and performing lower level iteration. +/// +/// This searcher encapsulates the logic required for finding all successive +/// non-overlapping matches in a haystack. In theory, iteration would look +/// something like this: +/// +/// 1. Setting the start position to `0`. +/// 2. Execute a regex search. If no match, end iteration. +/// 3. 
Report the match and set the start position to the end of the match. +/// 4. Go back to (2). +/// +/// And if this were indeed the case, it's likely that `Searcher` wouldn't +/// exist. Unfortunately, because a regex may match the empty string, the above +/// logic won't work for all possible regexes. Namely, if an empty match is +/// found, then step (3) would set the start position of the search to the +/// position it was at. Thus, iteration would never end. +/// +/// Instead, a `Searcher` knows how to detect these cases and forcefully +/// advance iteration in the case of an empty match that overlaps with a +/// previous match. +/// +/// If you know that your regex cannot match any empty string, then the simple +/// algorithm described above will work correctly. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// In particular, a `Searcher` is not itself an iterator. Instead, it provides +/// `advance` routines that permit moving the search along explicitly. It also +/// provides various routines, like [`Searcher::into_matches_iter`], that +/// accept a closure (representing how a regex engine executes a search) and +/// returns a conventional iterator. +/// +/// The lifetime parameters come from the [`Input`] type passed to +/// [`Searcher::new`]: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// # Searcher vs Iterator +/// +/// Why does a search type with "advance" APIs exist at all when we also have +/// iterators? Unfortunately, the reasoning behind this split is a complex +/// combination of the following things: +/// +/// 1. While many of the regex engines expose their own iterators, it is also +/// nice to expose this lower level iteration helper because it permits callers +/// to provide their own `Input` configuration. Moreover, a `Searcher` can work +/// with _any_ regex engine instead of only the ones defined in this crate. +/// This way, everyone benefits from a shared iteration implementation. +/// 2. There are many different regex engines that, while they have the same +/// match semantics, they have slightly different APIs. Iteration is just +/// complex enough to want to share code, and so we need a way of abstracting +/// over those different regex engines. While we could define a new trait that +/// describes any regex engine search API, it would wind up looking very close +/// to a closure. While there may still be reasons for the more generic trait +/// to exist, for now and for the purposes of iteration, we use a closure. +/// Closures also provide a lot of easy flexibility at the call site, in that +/// they permit the caller to borrow any kind of state they want for use during +/// each search call. +/// 3. As a result of using closures, and because closures are anonymous types +/// that cannot be named, it is difficult to encapsulate them without both +/// costs to speed and added complexity to the public API. For example, in +/// defining an iterator type like +/// [`dfa::regex::FindMatches`](crate::dfa::regex::FindMatches), +/// if we use a closure internally, it's not possible to name this type in the +/// return type of the iterator constructor. Thus, the only way around it is +/// to erase the type by boxing it and turning it into a `Box<dyn FnMut ...>`. +/// This boxed closure is unlikely to be inlined _and_ it infects the public +/// API in subtle ways. 
Namely, unless you declare the closure as implementing +/// `Send` and `Sync`, then the resulting iterator type won't implement it +/// either. But there are practical issues with requiring the closure to +/// implement `Send` and `Sync` that result in other API complexities that +/// are beyond the scope of this already long exposition. +/// 4. Some regex engines expose more complex match information than just +/// "which pattern matched" and "at what offsets." For example, the PikeVM +/// exposes match spans for each capturing group that participated in the +/// match. In such cases, it can be quite beneficial to reuse the capturing +/// group allocation on subsequent searches. A proper iterator doesn't permit +/// this API due to its interface, so it's useful to have something a bit lower +/// level that permits callers to amortize allocations while also reusing a +/// shared implementation of iteration. (See the documentation for +/// [`Searcher::advance`] for an example of using the "advance" API with the +/// PikeVM.) +/// +/// What this boils down to is that there are "advance" APIs which require +/// handing a closure to it for every call, and there are also APIs to create +/// iterators from a closure. The former are useful for _implementing_ +/// iterators or when you need more flexibility, while the latter are useful +/// for conveniently writing custom iterators on-the-fly. +/// +/// # Example: iterating with captures +/// +/// Several regex engines in this crate over convenient iterator APIs over +/// [`Captures`] values. To do so, this requires allocating a new `Captures` +/// value for each iteration step. This can perhaps be more costly than you +/// might want. Instead of implementing your own iterator to avoid that +/// cost (which can be a little subtle if you want to handle empty matches +/// correctly), you can use this `Searcher` to do it for you: +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::pikevm::PikeVM, +/// util::iter::Searcher, +/// Input, Span, +/// }; +/// +/// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?; +/// let haystack = "foo1 foo12 foo123"; +/// +/// let mut caps = re.create_captures(); +/// let mut cache = re.create_cache(); +/// let mut matches = vec![]; +/// let mut searcher = Searcher::new(Input::new(haystack)); +/// while let Some(_) = searcher.advance(|input| { +/// re.search(&mut cache, input, &mut caps); +/// Ok(caps.get_match()) +/// }) { +/// // The unwrap is OK since 'numbers' matches if the pattern matches. +/// matches.push(caps.get_group_by_name("numbers").unwrap()); +/// } +/// assert_eq!(matches, vec![ +/// Span::from(3..4), +/// Span::from(8..10), +/// Span::from(14..17), +/// ]); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Searcher<'h> { + /// The input parameters to give to each regex engine call. + /// + /// The start position of the search is mutated during iteration. + input: Input<'h>, + /// Records the end offset of the most recent match. This is necessary to + /// handle a corner case for preventing empty matches from overlapping with + /// the ending bounds of a prior match. + last_match_end: Option<usize>, +} + +impl<'h> Searcher<'h> { + /// Create a new fallible non-overlapping matches iterator. + /// + /// The given `input` provides the parameters (including the haystack), + /// while the `finder` represents a closure that calls the underlying regex + /// engine. 
The closure may borrow any additional state that is needed, + /// such as a prefilter scanner. + pub fn new(input: Input<'h>) -> Searcher<'h> { + Searcher { input, last_match_end: None } + } + + /// Returns the current `Input` used by this searcher. + /// + /// The `Input` returned is generally equivalent to the one given to + /// [`Searcher::new`], but its start position may be different to reflect + /// the start of the next search to be executed. + pub fn input<'s>(&'s self) -> &'s Input<'h> { + &self.input + } + + /// Return the next half match for an infallible search if one exists, and + /// advance to the next position. + /// + /// This is like `try_advance_half`, except errors are converted into + /// panics. + /// + /// # Panics + /// + /// If the given closure returns an error, then this panics. This is useful + /// when you know your underlying regex engine has been configured to not + /// return an error. + /// + /// # Example + /// + /// This example shows how to use a `Searcher` to iterate over all matches + /// when using a DFA, which only provides "half" matches. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// util::iter::Searcher, + /// HalfMatch, Input, + /// }; + /// + /// let re = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); + /// let mut it = Searcher::new(input); + /// + /// let expected = Some(HalfMatch::must(0, 10)); + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(HalfMatch::must(0, 21)); + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(HalfMatch::must(0, 32)); + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = None; + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// This correctly moves iteration forward even when an empty match occurs: + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// util::iter::Searcher, + /// HalfMatch, Input, + /// }; + /// + /// let re = DFA::new(r"a|")?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("abba"); + /// let mut it = Searcher::new(input); + /// + /// let expected = Some(HalfMatch::must(0, 1)); + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(HalfMatch::must(0, 2)); + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(HalfMatch::must(0, 4)); + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = None; + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn advance_half<F>(&mut self, finder: F) -> Option<HalfMatch> + where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, + { + match self.try_advance_half(finder) { + Ok(m) => m, + Err(err) => panic!( + "unexpected regex half find error: {}\n\ + to handle find errors, 
use 'try' or 'search' methods", + err, + ), + } + } + + /// Return the next match for an infallible search if one exists, and + /// advance to the next position. + /// + /// The search is advanced even in the presence of empty matches by + /// forbidding empty matches from overlapping with any other match. + /// + /// This is like `try_advance`, except errors are converted into panics. + /// + /// # Panics + /// + /// If the given closure returns an error, then this panics. This is useful + /// when you know your underlying regex engine has been configured to not + /// return an error. + /// + /// # Example + /// + /// This example shows how to use a `Searcher` to iterate over all matches + /// when using a regex based on lazy DFAs: + /// + /// ``` + /// use regex_automata::{ + /// hybrid::regex::Regex, + /// util::iter::Searcher, + /// Match, Input, + /// }; + /// + /// let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); + /// let mut it = Searcher::new(input); + /// + /// let expected = Some(Match::must(0, 0..10)); + /// let got = it.advance(|input| re.try_search(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(Match::must(0, 11..21)); + /// let got = it.advance(|input| re.try_search(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(Match::must(0, 22..32)); + /// let got = it.advance(|input| re.try_search(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = None; + /// let got = it.advance(|input| re.try_search(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// This example shows the same as above, but with the PikeVM. This example + /// is useful because it shows how to use this API even when the regex + /// engine doesn't directly return a `Match`. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::iter::Searcher, + /// Match, Input, + /// }; + /// + /// let re = PikeVM::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); + /// let mut it = Searcher::new(input); + /// + /// let expected = Some(Match::must(0, 0..10)); + /// let got = it.advance(|input| { + /// re.search(&mut cache, input, &mut caps); + /// Ok(caps.get_match()) + /// }); + /// // Note that if we wanted to extract capturing group spans, we could + /// // do that here with 'caps'. 
+ /// assert_eq!(expected, got); + /// + /// let expected = Some(Match::must(0, 11..21)); + /// let got = it.advance(|input| { + /// re.search(&mut cache, input, &mut caps); + /// Ok(caps.get_match()) + /// }); + /// assert_eq!(expected, got); + /// + /// let expected = Some(Match::must(0, 22..32)); + /// let got = it.advance(|input| { + /// re.search(&mut cache, input, &mut caps); + /// Ok(caps.get_match()) + /// }); + /// assert_eq!(expected, got); + /// + /// let expected = None; + /// let got = it.advance(|input| { + /// re.search(&mut cache, input, &mut caps); + /// Ok(caps.get_match()) + /// }); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn advance<F>(&mut self, finder: F) -> Option<Match> + where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, + { + match self.try_advance(finder) { + Ok(m) => m, + Err(err) => panic!( + "unexpected regex find error: {}\n\ + to handle find errors, use 'try' or 'search' methods", + err, + ), + } + } + + /// Return the next half match for a fallible search if one exists, and + /// advance to the next position. + /// + /// This is like `advance_half`, except it permits callers to handle errors + /// during iteration. + #[inline] + pub fn try_advance_half<F>( + &mut self, + mut finder: F, + ) -> Result<Option<HalfMatch>, MatchError> + where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, + { + let mut m = match finder(&self.input)? { + None => return Ok(None), + Some(m) => m, + }; + if Some(m.offset()) == self.last_match_end { + m = match self.handle_overlapping_empty_half_match(m, finder)? { + None => return Ok(None), + Some(m) => m, + }; + } + self.input.set_start(m.offset()); + self.last_match_end = Some(m.offset()); + Ok(Some(m)) + } + + /// Return the next match for a fallible search if one exists, and advance + /// to the next position. + /// + /// This is like `advance`, except it permits callers to handle errors + /// during iteration. + #[inline] + pub fn try_advance<F>( + &mut self, + mut finder: F, + ) -> Result<Option<Match>, MatchError> + where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, + { + let mut m = match finder(&self.input)? { + None => return Ok(None), + Some(m) => m, + }; + if m.is_empty() && Some(m.end()) == self.last_match_end { + m = match self.handle_overlapping_empty_match(m, finder)? { + None => return Ok(None), + Some(m) => m, + }; + } + self.input.set_start(m.end()); + self.last_match_end = Some(m.end()); + Ok(Some(m)) + } + + /// Given a closure that executes a single search, return an iterator over + /// all successive non-overlapping half matches. + /// + /// The iterator returned yields result values. If the underlying regex + /// engine is configured to never return an error, consider calling + /// [`TryHalfMatchesIter::infallible`] to convert errors into panics. + /// + /// # Example + /// + /// This example shows how to use a `Searcher` to create a proper + /// iterator over half matches. 
+ /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// util::iter::Searcher, + /// HalfMatch, Input, + /// }; + /// + /// let re = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); + /// let mut it = Searcher::new(input).into_half_matches_iter(|input| { + /// re.try_search_fwd(&mut cache, input) + /// }); + /// + /// let expected = Some(Ok(HalfMatch::must(0, 10))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = Some(Ok(HalfMatch::must(0, 21))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = Some(Ok(HalfMatch::must(0, 32))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = None; + /// assert_eq!(expected, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn into_half_matches_iter<F>( + self, + finder: F, + ) -> TryHalfMatchesIter<'h, F> + where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, + { + TryHalfMatchesIter { it: self, finder } + } + + /// Given a closure that executes a single search, return an iterator over + /// all successive non-overlapping matches. + /// + /// The iterator returned yields result values. If the underlying regex + /// engine is configured to never return an error, consider calling + /// [`TryMatchesIter::infallible`] to convert errors into panics. + /// + /// # Example + /// + /// This example shows how to use a `Searcher` to create a proper + /// iterator over matches. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::regex::Regex, + /// util::iter::Searcher, + /// Match, Input, + /// }; + /// + /// let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); + /// let mut it = Searcher::new(input).into_matches_iter(|input| { + /// re.try_search(&mut cache, input) + /// }); + /// + /// let expected = Some(Ok(Match::must(0, 0..10))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = Some(Ok(Match::must(0, 11..21))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = Some(Ok(Match::must(0, 22..32))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = None; + /// assert_eq!(expected, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn into_matches_iter<F>(self, finder: F) -> TryMatchesIter<'h, F> + where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, + { + TryMatchesIter { it: self, finder } + } + + /// Given a closure that executes a single search, return an iterator over + /// all successive non-overlapping `Captures` values. + /// + /// The iterator returned yields result values. If the underlying regex + /// engine is configured to never return an error, consider calling + /// [`TryCapturesIter::infallible`] to convert errors into panics. + /// + /// Unlike the other iterator constructors, this accepts an initial + /// `Captures` value. This `Captures` value is reused for each search, and + /// the iterator implementation clones it before returning it. The caller + /// must provide this value because the iterator is purposely ignorant + /// of the underlying regex engine and thus doesn't know how to create + /// one itself. More to the point, a `Captures` value itself has a few + /// different constructors, which change which kind of information is + /// available to query in exchange for search performance. 
+ /// + /// # Example + /// + /// This example shows how to use a `Searcher` to create a proper iterator + /// over `Captures` values, which provides access to all capturing group + /// spans for each match. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::iter::Searcher, + /// Input, + /// }; + /// + /// let re = PikeVM::new( + /// r"(?P<y>[0-9]{4})-(?P<m>[0-9]{2})-(?P<d>[0-9]{2})", + /// )?; + /// let (mut cache, caps) = (re.create_cache(), re.create_captures()); + /// + /// let haystack = "2010-03-14 2016-10-08 2020-10-22"; + /// let input = Input::new(haystack); + /// let mut it = Searcher::new(input) + /// .into_captures_iter(caps, |input, caps| { + /// re.search(&mut cache, input, caps); + /// Ok(()) + /// }); + /// + /// let got = it.next().expect("first date")?; + /// let year = got.get_group_by_name("y").expect("must match"); + /// assert_eq!("2010", &haystack[year]); + /// + /// let got = it.next().expect("second date")?; + /// let month = got.get_group_by_name("m").expect("must match"); + /// assert_eq!("10", &haystack[month]); + /// + /// let got = it.next().expect("third date")?; + /// let day = got.get_group_by_name("d").expect("must match"); + /// assert_eq!("22", &haystack[day]); + /// + /// assert!(it.next().is_none()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "alloc")] + #[inline] + pub fn into_captures_iter<F>( + self, + caps: Captures, + finder: F, + ) -> TryCapturesIter<'h, F> + where + F: FnMut(&Input<'_>, &mut Captures) -> Result<(), MatchError>, + { + TryCapturesIter { it: self, caps, finder } + } + + /// Handles the special case of a match that begins where the previous + /// match ended. Without this special handling, it'd be possible to get + /// stuck where an empty match never results in forward progress. This + /// also makes it more consistent with how presiding general purpose regex + /// engines work. + #[cold] + #[inline(never)] + fn handle_overlapping_empty_half_match<F>( + &mut self, + _: HalfMatch, + mut finder: F, + ) -> Result<Option<HalfMatch>, MatchError> + where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, + { + // Since we are only here when 'm.offset()' matches the offset of the + // last match, it follows that this must have been an empty match. + // Since we both need to make progress *and* prevent overlapping + // matches, we discard this match and advance the search by 1. + // + // Note that this may start a search in the middle of a codepoint. The + // regex engines themselves are expected to deal with that and not + // report any matches within a codepoint if they are configured in + // UTF-8 mode. + self.input.set_start(self.input.start().checked_add(1).unwrap()); + finder(&self.input) + } + + /// Handles the special case of an empty match by ensuring that 1) the + /// iterator always advances and 2) empty matches never overlap with other + /// matches. + /// + /// (1) is necessary because we principally make progress by setting the + /// starting location of the next search to the ending location of the last + /// match. But if a match is empty, then this results in a search that does + /// not advance and thus does not terminate. + /// + /// (2) is not strictly necessary, but makes intuitive sense and matches + /// the presiding behavior of most general purpose regex engines. The + /// "intuitive sense" here is that we want to report NON-overlapping + /// matches. 
So for example, given the regex 'a|(?:)' against the haystack + /// 'a', without the special handling, you'd get the matches [0, 1) and [1, + /// 1), where the latter overlaps with the end bounds of the former. + /// + /// Note that we mark this cold and forcefully prevent inlining because + /// handling empty matches like this is extremely rare and does require + /// quite a bit of code, comparatively. Keeping this code out of the main + /// iterator function keeps it smaller and more amenable to inlining + /// itself. + #[cold] + #[inline(never)] + fn handle_overlapping_empty_match<F>( + &mut self, + m: Match, + mut finder: F, + ) -> Result<Option<Match>, MatchError> + where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, + { + assert!(m.is_empty()); + self.input.set_start(self.input.start().checked_add(1).unwrap()); + finder(&self.input) + } +} + +/// An iterator over all non-overlapping half matches for a fallible search. +/// +/// The iterator yields a `Result<HalfMatch, MatchError>` value until no more +/// matches could be found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// This iterator is created by [`Searcher::into_half_matches_iter`]. +pub struct TryHalfMatchesIter<'h, F> { + it: Searcher<'h>, + finder: F, +} + +impl<'h, F> TryHalfMatchesIter<'h, F> { + /// Return an infallible version of this iterator. + /// + /// Any item yielded that corresponds to an error results in a panic. This + /// is useful if your underlying regex engine is configured in a way that + /// it is guaranteed to never return an error. + pub fn infallible(self) -> HalfMatchesIter<'h, F> { + HalfMatchesIter(self) + } + + /// Returns the current `Input` used by this iterator. + /// + /// The `Input` returned is generally equivalent to the one used to + /// construct this iterator, but its start position may be different to + /// reflect the start of the next search to be executed. + pub fn input<'i>(&'i self) -> &'i Input<'h> { + self.it.input() + } +} + +impl<'h, F> Iterator for TryHalfMatchesIter<'h, F> +where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, +{ + type Item = Result<HalfMatch, MatchError>; + + #[inline] + fn next(&mut self) -> Option<Result<HalfMatch, MatchError>> { + self.it.try_advance_half(&mut self.finder).transpose() + } +} + +impl<'h, F> core::fmt::Debug for TryHalfMatchesIter<'h, F> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("TryHalfMatchesIter") + .field("it", &self.it) + .field("finder", &"<closure>") + .finish() + } +} + +/// An iterator over all non-overlapping half matches for an infallible search. +/// +/// The iterator yields a [`HalfMatch`] value until no more matches could be +/// found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. 
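+///
+/// For example, a minimal sketch of draining this infallible iterator,
+/// assuming the underlying lazy DFA never returns an error for this input:
+///
+/// ```
+/// use regex_automata::{
+///     hybrid::dfa::DFA,
+///     util::iter::Searcher,
+///     HalfMatch, Input,
+/// };
+///
+/// let re = DFA::new(r"[0-9]+")?;
+/// let mut cache = re.create_cache();
+///
+/// let input = Input::new("123 456");
+/// let it = Searcher::new(input)
+///     .into_half_matches_iter(|input| re.try_search_fwd(&mut cache, input))
+///     .infallible();
+/// let got: Vec<HalfMatch> = it.collect();
+/// assert_eq!(got, vec![HalfMatch::must(0, 3), HalfMatch::must(0, 7)]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```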
+/// +/// This iterator is created by [`Searcher::into_half_matches_iter`] and +/// then calling [`TryHalfMatchesIter::infallible`]. +#[derive(Debug)] +pub struct HalfMatchesIter<'h, F>(TryHalfMatchesIter<'h, F>); + +impl<'h, F> HalfMatchesIter<'h, F> { + /// Returns the current `Input` used by this iterator. + /// + /// The `Input` returned is generally equivalent to the one used to + /// construct this iterator, but its start position may be different to + /// reflect the start of the next search to be executed. + pub fn input<'i>(&'i self) -> &'i Input<'h> { + self.0.it.input() + } +} + +impl<'h, F> Iterator for HalfMatchesIter<'h, F> +where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, +{ + type Item = HalfMatch; + + #[inline] + fn next(&mut self) -> Option<HalfMatch> { + match self.0.next()? { + Ok(m) => Some(m), + Err(err) => panic!( + "unexpected regex half find error: {}\n\ + to handle find errors, use 'try' or 'search' methods", + err, + ), + } + } +} + +/// An iterator over all non-overlapping matches for a fallible search. +/// +/// The iterator yields a `Result<Match, MatchError>` value until no more +/// matches could be found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// This iterator is created by [`Searcher::into_matches_iter`]. +pub struct TryMatchesIter<'h, F> { + it: Searcher<'h>, + finder: F, +} + +impl<'h, F> TryMatchesIter<'h, F> { + /// Return an infallible version of this iterator. + /// + /// Any item yielded that corresponds to an error results in a panic. This + /// is useful if your underlying regex engine is configured in a way that + /// it is guaranteed to never return an error. + pub fn infallible(self) -> MatchesIter<'h, F> { + MatchesIter(self) + } + + /// Returns the current `Input` used by this iterator. + /// + /// The `Input` returned is generally equivalent to the one used to + /// construct this iterator, but its start position may be different to + /// reflect the start of the next search to be executed. + pub fn input<'i>(&'i self) -> &'i Input<'h> { + self.it.input() + } +} + +impl<'h, F> Iterator for TryMatchesIter<'h, F> +where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, +{ + type Item = Result<Match, MatchError>; + + #[inline] + fn next(&mut self) -> Option<Result<Match, MatchError>> { + self.it.try_advance(&mut self.finder).transpose() + } +} + +impl<'h, F> core::fmt::Debug for TryMatchesIter<'h, F> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("TryMatchesIter") + .field("it", &self.it) + .field("finder", &"<closure>") + .finish() + } +} + +/// An iterator over all non-overlapping matches for an infallible search. +/// +/// The iterator yields a [`Match`] value until no more matches could be found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. 
This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// This iterator is created by [`Searcher::into_matches_iter`] and +/// then calling [`TryMatchesIter::infallible`]. +#[derive(Debug)] +pub struct MatchesIter<'h, F>(TryMatchesIter<'h, F>); + +impl<'h, F> MatchesIter<'h, F> { + /// Returns the current `Input` used by this iterator. + /// + /// The `Input` returned is generally equivalent to the one used to + /// construct this iterator, but its start position may be different to + /// reflect the start of the next search to be executed. + pub fn input<'i>(&'i self) -> &'i Input<'h> { + self.0.it.input() + } +} + +impl<'h, F> Iterator for MatchesIter<'h, F> +where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, +{ + type Item = Match; + + #[inline] + fn next(&mut self) -> Option<Match> { + match self.0.next()? { + Ok(m) => Some(m), + Err(err) => panic!( + "unexpected regex find error: {}\n\ + to handle find errors, use 'try' or 'search' methods", + err, + ), + } + } +} + +/// An iterator over all non-overlapping captures for a fallible search. +/// +/// The iterator yields a `Result<Captures, MatchError>` value until no more +/// matches could be found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// This iterator is created by [`Searcher::into_captures_iter`]. +#[cfg(feature = "alloc")] +pub struct TryCapturesIter<'h, F> { + it: Searcher<'h>, + caps: Captures, + finder: F, +} + +#[cfg(feature = "alloc")] +impl<'h, F> TryCapturesIter<'h, F> { + /// Return an infallible version of this iterator. + /// + /// Any item yielded that corresponds to an error results in a panic. This + /// is useful if your underlying regex engine is configured in a way that + /// it is guaranteed to never return an error. + pub fn infallible(self) -> CapturesIter<'h, F> { + CapturesIter(self) + } +} + +#[cfg(feature = "alloc")] +impl<'h, F> Iterator for TryCapturesIter<'h, F> +where + F: FnMut(&Input<'_>, &mut Captures) -> Result<(), MatchError>, +{ + type Item = Result<Captures, MatchError>; + + #[inline] + fn next(&mut self) -> Option<Result<Captures, MatchError>> { + let TryCapturesIter { ref mut it, ref mut caps, ref mut finder } = + *self; + let result = it + .try_advance(|input| { + (finder)(input, caps)?; + Ok(caps.get_match()) + }) + .transpose()?; + match result { + Ok(_) => Some(Ok(caps.clone())), + Err(err) => Some(Err(err)), + } + } +} + +#[cfg(feature = "alloc")] +impl<'h, F> core::fmt::Debug for TryCapturesIter<'h, F> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("TryCapturesIter") + .field("it", &self.it) + .field("caps", &self.caps) + .field("finder", &"<closure>") + .finish() + } +} + +/// An iterator over all non-overlapping captures for an infallible search. +/// +/// The iterator yields a [`Captures`] value until no more matches could be +/// found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. 
+/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// This iterator is created by [`Searcher::into_captures_iter`] and then +/// calling [`TryCapturesIter::infallible`]. +#[cfg(feature = "alloc")] +#[derive(Debug)] +pub struct CapturesIter<'h, F>(TryCapturesIter<'h, F>); + +#[cfg(feature = "alloc")] +impl<'h, F> Iterator for CapturesIter<'h, F> +where + F: FnMut(&Input<'_>, &mut Captures) -> Result<(), MatchError>, +{ + type Item = Captures; + + #[inline] + fn next(&mut self) -> Option<Captures> { + match self.0.next()? { + Ok(m) => Some(m), + Err(err) => panic!( + "unexpected regex captures error: {}\n\ + to handle find errors, use 'try' or 'search' methods", + err, + ), + } + } +} diff --git a/vendor/regex-automata/src/util/lazy.rs b/vendor/regex-automata/src/util/lazy.rs index d8cac6ef4..de27a2a6e 100644 --- a/vendor/regex-automata/src/util/lazy.rs +++ b/vendor/regex-automata/src/util/lazy.rs @@ -1,31 +1,465 @@ -use core::{ - cell::Cell, - ptr, - sync::atomic::{AtomicPtr, Ordering}, -}; - -use alloc::{boxed::Box, vec::Vec}; - -#[inline(always)] -pub(crate) fn get_or_init<T: Send + Sync + 'static>( - location: &'static AtomicPtr<T>, - init: impl FnOnce() -> T, -) -> &'static T { - let mut ptr = location.load(Ordering::Acquire); - if ptr.is_null() { - let new_dfa = Box::new(init()); - ptr = Box::into_raw(new_dfa); - let result = location.compare_exchange( - ptr::null_mut(), - ptr, - Ordering::AcqRel, - Ordering::Acquire, - ); - if let Err(old) = result { - let redundant = unsafe { Box::from_raw(ptr) }; - drop(redundant); - ptr = old; - } - } - unsafe { &*ptr } +/*! +A lazily initialized value for safe sharing between threads. + +The principal type in this module is `Lazy`, which makes it easy to construct +values that are shared safely across multiple threads simultaneously. +*/ + +use core::fmt; + +/// A lazily initialized value that implements `Deref` for `T`. +/// +/// A `Lazy` takes an initialization function and permits callers from any +/// thread to access the result of that initialization function in a safe +/// manner. In effect, this permits one-time initialization of global resources +/// in a (possibly) multi-threaded program. +/// +/// This type and its functionality are available even when neither the `alloc` +/// nor the `std` features are enabled. In exchange, a `Lazy` does **not** +/// guarantee that the given `create` function is called at most once. It +/// might be called multiple times. Moreover, a call to `Lazy::get` (either +/// explicitly or implicitly via `Lazy`'s `Deref` impl) may block until a `T` +/// is available. +/// +/// This is very similar to `lazy_static` or `once_cell`, except it doesn't +/// guarantee that the initialization function will be run once and it works +/// in no-alloc no-std environments. With that said, if you need stronger +/// guarantees or a more flexible API, then it is recommended to use either +/// `lazy_static` or `once_cell`. +/// +/// # Warning: may use a spin lock +/// +/// When this crate is compiled _without_ the `alloc` feature, then this type +/// may used a spin lock internally. This can have subtle effects that may +/// be undesirable. See [Spinlocks Considered Harmful][spinharm] for a more +/// thorough treatment of this topic. 
+/// +/// [spinharm]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html +/// +/// # Example +/// +/// This type is useful for creating regexes once, and then using them from +/// multiple threads simultaneously without worrying about synchronization. +/// +/// ``` +/// use regex_automata::{dfa::regex::Regex, util::lazy::Lazy, Match}; +/// +/// static RE: Lazy<Regex> = Lazy::new(|| Regex::new("foo[0-9]+bar").unwrap()); +/// +/// let expected = Some(Match::must(0, 3..14)); +/// assert_eq!(expected, RE.find(b"zzzfoo12345barzzz")); +/// ``` +pub struct Lazy<T, F = fn() -> T>(lazy::Lazy<T, F>); + +impl<T, F> Lazy<T, F> { + /// Create a new `Lazy` value that is initialized via the given function. + /// + /// The `T` type is automatically inferred from the return type of the + /// `create` function given. + pub const fn new(create: F) -> Lazy<T, F> { + Lazy(lazy::Lazy::new(create)) + } +} + +impl<T, F: Fn() -> T> Lazy<T, F> { + /// Return a reference to the lazily initialized value. + /// + /// This routine may block if another thread is initializing a `T`. + /// + /// Note that given an `x` which has type `Lazy`, this must be called via + /// `Lazy::get(x)` and not `x.get()`. This routine is defined this way + /// because `Lazy` impls `Deref` with a target of `T`. + /// + /// # Panics + /// + /// This panics if the `create` function inside this lazy value panics. + /// If the panic occurred in another thread, then this routine _may_ also + /// panic (but is not guaranteed to do so). + pub fn get(this: &Lazy<T, F>) -> &T { + this.0.get() + } +} + +impl<T, F: Fn() -> T> core::ops::Deref for Lazy<T, F> { + type Target = T; + + fn deref(&self) -> &T { + Lazy::get(self) + } +} + +impl<T: fmt::Debug, F: Fn() -> T> fmt::Debug for Lazy<T, F> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.0.fmt(f) + } +} + +#[cfg(feature = "alloc")] +mod lazy { + use core::{ + fmt, + marker::PhantomData, + sync::atomic::{AtomicPtr, Ordering}, + }; + + use alloc::boxed::Box; + + /// A non-std lazy initialized value. + /// + /// This might run the initialization function more than once, but will + /// never block. + /// + /// I wish I could get these semantics into the non-alloc non-std Lazy + /// type below, but I'm not sure how to do it. If you can do an alloc, + /// then the implementation becomes very simple if you don't care about + /// redundant work precisely because a pointer can be atomically swapped. + /// + /// Perhaps making this approach work in the non-alloc non-std case + /// requires asking the caller for a pointer? It would make the API less + /// convenient I think. + pub(super) struct Lazy<T, F> { + data: AtomicPtr<T>, + create: F, + // This indicates to the compiler that this type can drop T. It's not + // totally clear how the absence of this marker could lead to trouble, + // but putting it here doesn't have any downsides so we hedge until + // someone from the Unsafe Working Group can tell us definitively that + // we don't need it. + // + // See: https://github.com/BurntSushi/regex-automata/issues/30 + owned: PhantomData<Box<T>>, + } + + // SAFETY: So long as T and &T (and F and &F) can themselves be safely + // shared among threads, so too can a Lazy<T, _>. Namely, the Lazy API only + // permits accessing a &T and initialization is free of data races. So if T + // is thread safe, then so too is Lazy<T, _>. + // + // We specifically require that T: Send in order for Lazy<T> to be Sync. 
+ // Without that requirement, it's possible to send a T from one thread to + // another via Lazy's destructor. + // + // It's not clear whether we need F: Send+Sync for Lazy to be Sync. But + // we're conservative for now and keep both. + unsafe impl<T: Send + Sync, F: Send + Sync> Sync for Lazy<T, F> {} + + impl<T, F> Lazy<T, F> { + /// Create a new alloc but non-std lazy value that is racily + /// initialized. That is, the 'create' function may be called more than + /// once. + pub(super) const fn new(create: F) -> Lazy<T, F> { + Lazy { + data: AtomicPtr::new(core::ptr::null_mut()), + create, + owned: PhantomData, + } + } + } + + impl<T, F: Fn() -> T> Lazy<T, F> { + /// Get the underlying lazy value. If it hasn't been initialized + /// yet, then always attempt to initialize it (even if some other + /// thread is initializing it) and atomically attach it to this lazy + /// value before returning it. + pub(super) fn get(&self) -> &T { + if let Some(data) = self.poll() { + return data; + } + let data = (self.create)(); + let mut ptr = Box::into_raw(Box::new(data)); + // We attempt to stuff our initialized value into our atomic + // pointer. Upon success, we don't need to do anything. But if + // someone else beat us to the punch, then we need to make sure + // our newly created value is dropped. + let result = self.data.compare_exchange( + core::ptr::null_mut(), + ptr, + Ordering::AcqRel, + Ordering::Acquire, + ); + if let Err(old) = result { + // SAFETY: We created 'ptr' via Box::into_raw above, so turning + // it back into a Box via from_raw is safe. + drop(unsafe { Box::from_raw(ptr) }); + ptr = old; + } + // SAFETY: We just set the pointer above to a non-null value, even + // in the error case, and set it to a fully initialized value + // returned by 'create'. + unsafe { &*ptr } + } + + /// If this lazy value has been initialized successfully, then return + /// that value. Otherwise return None immediately. This never attempts + /// to run initialization itself. + fn poll(&self) -> Option<&T> { + let ptr = self.data.load(Ordering::Acquire); + if ptr.is_null() { + return None; + } + // SAFETY: We just checked that the pointer is not null. Since it's + // not null, it must have been fully initialized by 'get' at some + // point. + Some(unsafe { &*ptr }) + } + } + + impl<T: fmt::Debug, F: Fn() -> T> fmt::Debug for Lazy<T, F> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Lazy").field("data", &self.poll()).finish() + } + } + + impl<T, F> Drop for Lazy<T, F> { + fn drop(&mut self) { + let ptr = *self.data.get_mut(); + if !ptr.is_null() { + // SAFETY: We just checked that 'ptr' is not null. And since + // we have exclusive access, there are no races to worry about. + drop(unsafe { Box::from_raw(ptr) }); + } + } + } +} + +#[cfg(not(feature = "alloc"))] +mod lazy { + use core::{ + cell::Cell, + fmt, + mem::MaybeUninit, + panic::{RefUnwindSafe, UnwindSafe}, + sync::atomic::{AtomicU8, Ordering}, + }; + + /// Our 'Lazy' value can be in one of three states: + /// + /// * INIT is where it starts, and also ends up back here if the + /// 'create' routine panics. + /// * BUSY is where it sits while initialization is running in exactly + /// one thread. + /// * DONE is where it sits after 'create' has completed and 'data' has + /// been fully initialized. + const LAZY_STATE_INIT: u8 = 0; + const LAZY_STATE_BUSY: u8 = 1; + const LAZY_STATE_DONE: u8 = 2; + + /// A non-alloc non-std lazy initialized value. 
+ /// + /// This guarantees initialization only happens once, but uses a spinlock + /// to block in the case of simultaneous access. Blocking occurs so that + /// one thread waits while another thread initializes the value. + /// + /// I would much rather have the semantics of the 'alloc' Lazy type above. + /// Namely, that we might run the initialization function more than once, + /// but we never otherwise block. However, I don't know how to do that in + /// a non-alloc non-std context. + pub(super) struct Lazy<T, F> { + state: AtomicU8, + create: Cell<Option<F>>, + data: Cell<MaybeUninit<T>>, + } + + // SAFETY: So long as T and &T (and F and &F) can themselves be safely + // shared among threads, so too can a Lazy<T, _>. Namely, the Lazy API only + // permits accessing a &T and initialization is free of data races. So if T + // is thread safe, then so too is Lazy<T, _>. + unsafe impl<T: Send + Sync, F: Send + Sync> Sync for Lazy<T, F> {} + // A reference to a Lazy is unwind safe because we specifically take + // precautions to poison all accesses to a Lazy if the caller-provided + // 'create' function panics. + impl<T: UnwindSafe, F: UnwindSafe + RefUnwindSafe> RefUnwindSafe + for Lazy<T, F> + { + } + + impl<T, F> Lazy<T, F> { + /// Create a new non-alloc non-std lazy value that is initialized + /// exactly once on first use using the given function. + pub(super) const fn new(create: F) -> Lazy<T, F> { + Lazy { + state: AtomicU8::new(LAZY_STATE_INIT), + create: Cell::new(Some(create)), + data: Cell::new(MaybeUninit::uninit()), + } + } + } + + impl<T, F: FnOnce() -> T> Lazy<T, F> { + /// Get the underlying lazy value. If it hasn't been initialized + /// yet, then either initialize it or block until some other thread + /// initializes it. If the 'create' function given to Lazy::new panics + /// (even in another thread), then this panics too. + pub(super) fn get(&self) -> &T { + // This is effectively a spinlock. We loop until we enter a DONE + // state, and if possible, initialize it ourselves. The only way + // we exit the loop is if 'create' panics, we initialize 'data' or + // some other thread initializes 'data'. + // + // Yes, I have read spinlocks considered harmful[1]. And that + // article is why this spinlock is only active when 'alloc' isn't + // enabled. I did this because I don't think there is really + // another choice without 'alloc', other than not providing this at + // all. But I think that's a big bummer. + // + // [1]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html + while self.state.load(Ordering::Acquire) != LAZY_STATE_DONE { + // Check if we're the first ones to get here. If so, we'll be + // the ones who initialize. + let result = self.state.compare_exchange( + LAZY_STATE_INIT, + LAZY_STATE_BUSY, + Ordering::AcqRel, + Ordering::Acquire, + ); + // This means we saw the INIT state and nobody else can. So we + // must take responsibility for initializing. And by virtue of + // observing INIT, we have also told anyone else trying to + // get here that we are BUSY. If someone else sees BUSY, then + // they will spin until we finish initialization. + if let Ok(_) = result { + // Since we are guaranteed to be the only ones here, we + // know that 'create' is there... Unless someone else got + // here before us and 'create' panicked. In which case, + // 'self.create' is now 'None' and we forward the panic + // to the caller. (i.e., We implement poisoning.) 
+ // + // SAFETY: Our use of 'self.state' guarantees that we are + // the only thread executing this line, and thus there are + // no races. + let create = unsafe { + (*self.create.as_ptr()).take().expect( + "Lazy's create function panicked, \ + preventing initialization, + poisoning current thread", + ) + }; + let guard = Guard { state: &self.state }; + // SAFETY: Our use of 'self.state' guarantees that we are + // the only thread executing this line, and thus there are + // no races. + unsafe { + (*self.data.as_ptr()).as_mut_ptr().write(create()); + } + // All is well. 'self.create' ran successfully, so we + // forget the guard. + core::mem::forget(guard); + // Everything is initialized, so we can declare success. + self.state.store(LAZY_STATE_DONE, Ordering::Release); + break; + } + core::hint::spin_loop(); + } + // We only get here if data is fully initialized, and thus poll + // will always return something. + self.poll().unwrap() + } + + /// If this lazy value has been initialized successfully, then return + /// that value. Otherwise return None immediately. This never blocks. + fn poll(&self) -> Option<&T> { + if self.state.load(Ordering::Acquire) == LAZY_STATE_DONE { + // SAFETY: The DONE state only occurs when data has been fully + // initialized. + Some(unsafe { &*(*self.data.as_ptr()).as_ptr() }) + } else { + None + } + } + } + + impl<T: fmt::Debug, F: FnMut() -> T> fmt::Debug for Lazy<T, F> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Lazy") + .field("state", &self.state.load(Ordering::Acquire)) + .field("create", &"<closure>") + .field("data", &self.poll()) + .finish() + } + } + + impl<T, F> Drop for Lazy<T, F> { + fn drop(&mut self) { + if *self.state.get_mut() == LAZY_STATE_DONE { + // SAFETY: state is DONE if and only if data has been fully + // initialized. At which point, it is safe to drop. + unsafe { + // MSRV(1.60): Use assume_init_drop. The below is how + // assume_init_drop is implemented. + core::ptr::drop_in_place( + (*self.data.as_ptr()).as_mut_ptr(), + ) + } + } + } + } + + /// A guard that will reset a Lazy's state back to INIT when dropped. The + /// idea here is to 'forget' this guard on success. On failure (when a + /// panic occurs), the Drop impl runs and causes all in-progress and future + /// 'get' calls to panic. Without this guard, all in-progress and future + /// 'get' calls would spin forever. Crashing is much better than getting + /// stuck in an infinite loop. + struct Guard<'a> { + state: &'a AtomicU8, + } + + impl<'a> Drop for Guard<'a> { + fn drop(&mut self) { + // We force ourselves back into an INIT state. This will in turn + // cause any future 'get' calls to attempt calling 'self.create' + // again which will in turn panic because 'self.create' will now + // be 'None'. + self.state.store(LAZY_STATE_INIT, Ordering::Release); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn assert_send<T: Send>() {} + fn assert_sync<T: Sync>() {} + fn assert_unwind<T: core::panic::UnwindSafe>() {} + fn assert_refunwind<T: core::panic::RefUnwindSafe>() {} + + #[test] + fn oibits() { + assert_send::<Lazy<u64>>(); + assert_sync::<Lazy<u64>>(); + assert_unwind::<Lazy<u64>>(); + assert_refunwind::<Lazy<u64>>(); + } + + // This is a regression test because we used to rely on the inferred Sync + // impl for the Lazy type defined above (for 'alloc' mode). In the + // inferred impl, it only requires that T: Sync for Lazy<T>: Sync. 
But + // if we have that, we can actually make use of the fact that Lazy<T> drops + // T to create a value on one thread and drop it on another. This *should* + // require T: Send, but our missing bounds before let it sneak by. + // + // Basically, this test should not compile, so we... comment it out. We + // don't have a great way of testing compile-fail tests right now. + // + // See: https://github.com/BurntSushi/regex-automata/issues/30 + /* + #[test] + fn sync_not_send() { + #[allow(dead_code)] + fn inner<T: Sync + Default>() { + let lazy = Lazy::new(move || T::default()); + std::thread::scope(|scope| { + scope.spawn(|| { + Lazy::get(&lazy); // We create T in this thread + }); + }); + // And drop in this thread. + drop(lazy); + // So we have send a !Send type over threads. (with some more + // legwork, its possible to even sneak the value out of drop + // through thread local) + } + } + */ } diff --git a/vendor/regex-automata/src/util/look.rs b/vendor/regex-automata/src/util/look.rs new file mode 100644 index 000000000..aee31b34e --- /dev/null +++ b/vendor/regex-automata/src/util/look.rs @@ -0,0 +1,1748 @@ +/*! +Types and routines for working with look-around assertions. + +This module principally defines two types: + +* [`Look`] enumerates all of the assertions supported by this crate. +* [`LookSet`] provides a way to efficiently store a set of [`Look`] values. +* [`LookMatcher`] provides routines for checking whether a `Look` or a +`LookSet` matches at a particular position in a haystack. +*/ + +// LAMENTATION: Sadly, a lot of the API of `Look` and `LookSet` were basically +// copied verbatim from the regex-syntax crate. I would have no problems using +// the regex-syntax types and defining the matching routines (only found +// in this crate) as free functions, except the `Look` and `LookSet` types +// are used in lots of places. Including in places we expect to work when +// regex-syntax is *not* enabled, such as in the definition of the NFA itself. +// +// Thankfully the code we copy is pretty simple and there isn't much of it. +// Otherwise, the rest of this module deals with *matching* the assertions, +// which is not something that regex-syntax handles. + +use crate::util::{escape::DebugByte, utf8}; + +/// A look-around assertion. +/// +/// An assertion matches at a position between characters in a haystack. +/// Namely, it does not actually "consume" any input as most parts of a regular +/// expression do. Assertions are a way of stating that some property must be +/// true at a particular point during matching. +/// +/// For example, `(?m)^[a-z]+$` is a pattern that: +/// +/// * Scans the haystack for a position at which `(?m:^)` is satisfied. That +/// occurs at either the beginning of the haystack, or immediately following +/// a `\n` character. +/// * Looks for one or more occurrences of `[a-z]`. +/// * Once `[a-z]+` has matched as much as it can, an overall match is only +/// reported when `[a-z]+` stops just before a `\n`. +/// +/// So in this case, `abc` and `\nabc\n` match, but `\nabc1\n` does not. +/// +/// Assertions are also called "look-around," "look-behind" and "look-ahead." +/// Specifically, some assertions are look-behind (like `^`), other assertions +/// are look-ahead (like `$`) and yet other assertions are both look-ahead and +/// look-behind (like `\b`). +/// +/// # Assertions in an NFA +/// +/// An assertion in a [`thompson::NFA`](crate::nfa::thompson::NFA) can be +/// thought of as a conditional epsilon transition. 
That is, a matching engine +/// like the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) only permits +/// moving through conditional epsilon transitions when their condition +/// is satisfied at whatever position the `PikeVM` is currently at in the +/// haystack. +/// +/// How assertions are handled in a `DFA` is trickier, since a DFA does not +/// have epsilon transitions at all. In this case, they are compiled into the +/// automaton itself, at the expense of more states than what would be required +/// without an assertion. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Look { + /// Match the beginning of text. Specifically, this matches at the starting + /// position of the input. + Start = 1 << 0, + /// Match the end of text. Specifically, this matches at the ending + /// position of the input. + End = 1 << 1, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following a `\n` character. + StartLF = 1 << 2, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\n` character. + EndLF = 1 << 3, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following either a `\r` or `\n` character, but never after + /// a `\r` when a `\n` follows. + StartCRLF = 1 << 4, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` + /// precedes it. + EndCRLF = 1 << 5, + /// Match an ASCII-only word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + WordAscii = 1 << 6, + /// Match an ASCII-only negation of a word boundary. + WordAsciiNegate = 1 << 7, + /// Match a Unicode-aware word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + WordUnicode = 1 << 8, + /// Match a Unicode-aware negation of a word boundary. + WordUnicodeNegate = 1 << 9, +} + +impl Look { + /// Flip the look-around assertion to its equivalent for reverse searches. + /// For example, `StartLF` gets translated to `EndLF`. + /// + /// Some assertions, such as `WordUnicode`, remain the same since they + /// match the same positions regardless of the direction of the search. + #[inline] + pub const fn reversed(self) -> Look { + match self { + Look::Start => Look::End, + Look::End => Look::Start, + Look::StartLF => Look::EndLF, + Look::EndLF => Look::StartLF, + Look::StartCRLF => Look::EndCRLF, + Look::EndCRLF => Look::StartCRLF, + Look::WordAscii => Look::WordAscii, + Look::WordAsciiNegate => Look::WordAsciiNegate, + Look::WordUnicode => Look::WordUnicode, + Look::WordUnicodeNegate => Look::WordUnicodeNegate, + } + } + + /// Return the underlying representation of this look-around enumeration + /// as an integer. Giving the return value to the [`Look::from_repr`] + /// constructor is guaranteed to return the same look-around variant that + /// one started with within a semver compatible release of this crate. 
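+ ///
+ /// For example, a quick sketch of the round trip (the import path assumes
+ /// the public `util::look` module):
+ ///
+ /// ```
+ /// use regex_automata::util::look::Look;
+ ///
+ /// let repr = Look::WordAscii.as_repr();
+ /// assert_eq!(Some(Look::WordAscii), Look::from_repr(repr));
+ /// ```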
+ #[inline] + pub const fn as_repr(self) -> u16 { + // AFAIK, 'as' is the only way to zero-cost convert an int enum to an + // actual int. + self as u16 + } + + /// Given the underlying representation of a `Look` value, return the + /// corresponding `Look` value if the representation is valid. Otherwise + /// `None` is returned. + #[inline] + pub const fn from_repr(repr: u16) -> Option<Look> { + match repr { + 0b00_0000_0001 => Some(Look::Start), + 0b00_0000_0010 => Some(Look::End), + 0b00_0000_0100 => Some(Look::StartLF), + 0b00_0000_1000 => Some(Look::EndLF), + 0b00_0001_0000 => Some(Look::StartCRLF), + 0b00_0010_0000 => Some(Look::EndCRLF), + 0b00_0100_0000 => Some(Look::WordAscii), + 0b00_1000_0000 => Some(Look::WordAsciiNegate), + 0b01_0000_0000 => Some(Look::WordUnicode), + 0b10_0000_0000 => Some(Look::WordUnicodeNegate), + _ => None, + } + } + + /// Returns a convenient single codepoint representation of this + /// look-around assertion. Each assertion is guaranteed to be represented + /// by a distinct character. + /// + /// This is useful for succinctly representing a look-around assertion in + /// human friendly but succinct output intended for a programmer working on + /// regex internals. + #[inline] + pub const fn as_char(self) -> char { + match self { + Look::Start => 'A', + Look::End => 'z', + Look::StartLF => '^', + Look::EndLF => '$', + Look::StartCRLF => 'r', + Look::EndCRLF => 'R', + Look::WordAscii => 'b', + Look::WordAsciiNegate => 'B', + Look::WordUnicode => '𝛃', + Look::WordUnicodeNegate => '𝚩', + } + } +} + +/// LookSet is a memory-efficient set of look-around assertions. +/// +/// This is useful for efficiently tracking look-around assertions. For +/// example, a [`thompson::NFA`](crate::nfa::thompson::NFA) provides properties +/// that return `LookSet`s. +#[derive(Clone, Copy, Default, Eq, PartialEq)] +pub struct LookSet { + /// The underlying representation this set is exposed to make it possible + /// to store it somewhere efficiently. The representation is that + /// of a bitset, where each assertion occupies bit `i` where `i = + /// Look::as_repr()`. + /// + /// Note that users of this internal representation must permit the full + /// range of `u16` values to be represented. For example, even if the + /// current implementation only makes use of the 10 least significant bits, + /// it may use more bits in a future semver compatible release. + pub bits: u16, +} + +impl LookSet { + /// Create an empty set of look-around assertions. + #[inline] + pub fn empty() -> LookSet { + LookSet { bits: 0 } + } + + /// Create a full set of look-around assertions. + /// + /// This set contains all possible look-around assertions. + #[inline] + pub fn full() -> LookSet { + LookSet { bits: !0 } + } + + /// Create a look-around set containing the look-around assertion given. + /// + /// This is a convenience routine for creating an empty set and inserting + /// one look-around assertions. + #[inline] + pub fn singleton(look: Look) -> LookSet { + LookSet::empty().insert(look) + } + + /// Returns the total number of look-around assertions in this set. + #[inline] + pub fn len(self) -> usize { + // OK because max value always fits in a u8, which in turn always + // fits in a usize, regardless of target. + usize::try_from(self.bits.count_ones()).unwrap() + } + + /// Returns true if and only if this set is empty. + #[inline] + pub fn is_empty(self) -> bool { + self.len() == 0 + } + + /// Returns true if and only if the given look-around assertion is in this + /// set. 
+ #[inline] + pub fn contains(self, look: Look) -> bool { + self.bits & look.as_repr() != 0 + } + + /// Returns true if and only if this set contains any anchor assertions. + /// This includes both "start/end of haystack" and "start/end of line." + #[inline] + pub fn contains_anchor(&self) -> bool { + self.contains_anchor_haystack() || self.contains_anchor_line() + } + + /// Returns true if and only if this set contains any "start/end of + /// haystack" anchors. This doesn't include "start/end of line" anchors. + #[inline] + pub fn contains_anchor_haystack(&self) -> bool { + self.contains(Look::Start) || self.contains(Look::End) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors. This doesn't include "start/end of haystack" anchors. This + /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors. + #[inline] + pub fn contains_anchor_line(&self) -> bool { + self.contains(Look::StartLF) + || self.contains(Look::EndLF) + || self.contains(Look::StartCRLF) + || self.contains(Look::EndCRLF) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors that only treat `\n` as line terminators. This does not include + /// haystack anchors or CRLF aware line anchors. + #[inline] + pub fn contains_anchor_lf(&self) -> bool { + self.contains(Look::StartLF) || self.contains(Look::EndLF) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors that are CRLF-aware. This doesn't include "start/end of + /// haystack" or "start/end of line-feed" anchors. + #[inline] + pub fn contains_anchor_crlf(&self) -> bool { + self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF) + } + + /// Returns true if and only if this set contains any word boundary or + /// negated word boundary assertions. This includes both Unicode and ASCII + /// word boundaries. + #[inline] + pub fn contains_word(self) -> bool { + self.contains_word_unicode() || self.contains_word_ascii() + } + + /// Returns true if and only if this set contains any Unicode word boundary + /// or negated Unicode word boundary assertions. + #[inline] + pub fn contains_word_unicode(self) -> bool { + self.contains(Look::WordUnicode) + || self.contains(Look::WordUnicodeNegate) + } + + /// Returns true if and only if this set contains any ASCII word boundary + /// or negated ASCII word boundary assertions. + #[inline] + pub fn contains_word_ascii(self) -> bool { + self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) + } + + /// Returns an iterator over all of the look-around assertions in this set. + #[inline] + pub fn iter(self) -> LookSetIter { + LookSetIter { set: self } + } + + /// Return a new set that is equivalent to the original, but with the given + /// assertion added to it. If the assertion is already in the set, then the + /// returned set is equivalent to the original. + #[inline] + pub fn insert(self, look: Look) -> LookSet { + LookSet { bits: self.bits | look.as_repr() } + } + + /// Updates this set in place with the result of inserting the given + /// assertion into this set. + #[inline] + pub fn set_insert(&mut self, look: Look) { + *self = self.insert(look); + } + + /// Return a new set that is equivalent to the original, but with the given + /// assertion removed from it. If the assertion is not in the set, then the + /// returned set is equivalent to the original. 
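+ ///
+ /// For example, a small sketch showing `insert`, `contains` and `remove`
+ /// working together (the import path assumes the public `util::look`
+ /// module):
+ ///
+ /// ```
+ /// use regex_automata::util::look::{Look, LookSet};
+ ///
+ /// let set = LookSet::singleton(Look::StartLF).insert(Look::EndLF);
+ /// assert!(set.contains(Look::StartLF));
+ /// let set = set.remove(Look::StartLF);
+ /// assert!(!set.contains(Look::StartLF));
+ /// assert!(set.contains(Look::EndLF));
+ /// ```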
+    #[inline]
+    pub fn remove(self, look: Look) -> LookSet {
+        LookSet { bits: self.bits & !look.as_repr() }
+    }
+
+    /// Updates this set in place with the result of removing the given
+    /// assertion from this set.
+    #[inline]
+    pub fn set_remove(&mut self, look: Look) {
+        *self = self.remove(look);
+    }
+
+    /// Returns a new set that is the result of subtracting the given set from
+    /// this set.
+    #[inline]
+    pub fn subtract(self, other: LookSet) -> LookSet {
+        LookSet { bits: self.bits & !other.bits }
+    }
+
+    /// Updates this set in place with the result of subtracting the given set
+    /// from this set.
+    #[inline]
+    pub fn set_subtract(&mut self, other: LookSet) {
+        *self = self.subtract(other);
+    }
+
+    /// Returns a new set that is the union of this and the one given.
+    #[inline]
+    pub fn union(self, other: LookSet) -> LookSet {
+        LookSet { bits: self.bits | other.bits }
+    }
+
+    /// Updates this set in place with the result of unioning it with the one
+    /// given.
+    #[inline]
+    pub fn set_union(&mut self, other: LookSet) {
+        *self = self.union(other);
+    }
+
+    /// Returns a new set that is the intersection of this and the one given.
+    #[inline]
+    pub fn intersect(self, other: LookSet) -> LookSet {
+        LookSet { bits: self.bits & other.bits }
+    }
+
+    /// Updates this set in place with the result of intersecting it with the
+    /// one given.
+    #[inline]
+    pub fn set_intersect(&mut self, other: LookSet) {
+        *self = self.intersect(other);
+    }
+
+    /// Return a `LookSet` from the given slice by reading its first two bytes
+    /// as a native endian 16-bit integer.
+    ///
+    /// # Panics
+    ///
+    /// This panics if `slice.len() < 2`.
+    #[inline]
+    pub fn read_repr(slice: &[u8]) -> LookSet {
+        let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap());
+        LookSet { bits }
+    }
+
+    /// Write a `LookSet` as a native endian 16-bit integer to the beginning
+    /// of the slice given.
+    ///
+    /// # Panics
+    ///
+    /// This panics if `slice.len() < 2`.
+    #[inline]
+    pub fn write_repr(self, slice: &mut [u8]) {
+        let raw = self.bits.to_ne_bytes();
+        slice[0] = raw[0];
+        slice[1] = raw[1];
+    }
+
+    /// Checks that all assertions in this set can be matched.
+    ///
+    /// Some assertions, such as Unicode word boundaries, require optional (but
+    /// enabled by default) tables that may not be available. If there are
+    /// assertions in this set that require tables that are not available, then
+    /// this will return an error.
+    ///
+    /// Specifically, this returns an error when the
+    /// `unicode-word-boundary` feature is _not_ enabled _and_ this set
+    /// contains a Unicode word boundary assertion.
+    ///
+    /// It can be useful to use this on the result of
+    /// [`NFA::look_set_any`](crate::nfa::thompson::NFA::look_set_any)
+    /// when building a matcher engine to ensure methods like
+    /// [`LookMatcher::matches_set`] do not panic at search time.
+    pub fn available(self) -> Result<(), UnicodeWordBoundaryError> {
+        if self.contains_word_unicode() {
+            UnicodeWordBoundaryError::check()?;
+        }
+        Ok(())
+    }
+}
+
+impl core::fmt::Debug for LookSet {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        if self.is_empty() {
+            return write!(f, "∅");
+        }
+        for look in self.iter() {
+            write!(f, "{}", look.as_char())?;
+        }
+        Ok(())
+    }
+}
+
+/// An iterator over all look-around assertions in a [`LookSet`].
+///
+/// This iterator is created by [`LookSet::iter`].
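+///
+/// # Example
+///
+/// A quick sketch of walking the assertions in a set; it uses only the
+/// `LookSet`, `Look` and `LookSetIter` APIs defined in this module, and the
+/// iterator yields assertions in order of increasing bit position:
+///
+/// ```
+/// use regex_automata::util::look::{Look, LookSet};
+///
+/// let set = LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
+/// assert_eq!(2, set.iter().count());
+/// // Map each assertion to its single-character representation.
+/// let chars: String = set.iter().map(|look| look.as_char()).collect();
+/// assert_eq!("^𝛃", chars);
+/// ```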
+#[derive(Clone, Debug)] +pub struct LookSetIter { + set: LookSet, +} + +impl Iterator for LookSetIter { + type Item = Look; + + #[inline] + fn next(&mut self) -> Option<Look> { + if self.set.is_empty() { + return None; + } + // We'll never have more than u8::MAX distinct look-around assertions, + // so 'repr' will always fit into a u16. + let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(1 << repr)?; + self.set = self.set.remove(look); + Some(look) + } +} + +/// A matcher for look-around assertions. +/// +/// This matcher permits configuring aspects of how look-around assertions are +/// matched. +/// +/// # Example +/// +/// A `LookMatcher` can change the line terminator used for matching multi-line +/// anchors such as `(?m:^)` and `(?m:$)`. +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::{self, pikevm::PikeVM}, +/// util::look::LookMatcher, +/// Match, Input, +/// }; +/// +/// let mut lookm = LookMatcher::new(); +/// lookm.set_line_terminator(b'\x00'); +/// +/// let re = PikeVM::builder() +/// .thompson(thompson::Config::new().look_matcher(lookm)) +/// .build(r"(?m)^[a-z]+$")?; +/// let mut cache = re.create_cache(); +/// +/// // Multi-line assertions now use NUL as a terminator. +/// assert_eq!( +/// Some(Match::must(0, 1..4)), +/// re.find(&mut cache, b"\x00abc\x00"), +/// ); +/// // ... and \n is no longer recognized as a terminator. +/// assert_eq!( +/// None, +/// re.find(&mut cache, b"\nabc\n"), +/// ); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct LookMatcher { + lineterm: DebugByte, +} + +impl LookMatcher { + /// Creates a new default matcher for look-around assertions. + pub fn new() -> LookMatcher { + LookMatcher { lineterm: DebugByte(b'\n') } + } + + /// Sets the line terminator for use with `(?m:^)` and `(?m:$)`. + /// + /// Namely, instead of `^` matching after `\n` and `$` matching immediately + /// before a `\n`, this will cause it to match after and before the byte + /// given. + /// + /// It can occasionally be useful to use this to configure the line + /// terminator to the NUL byte when searching binary data. + /// + /// Note that this does not apply to CRLF-aware line anchors such as + /// `(?Rm:^)` and `(?Rm:$)`. CRLF-aware line anchors are hard-coded to + /// use `\r` and `\n`. + pub fn set_line_terminator(&mut self, byte: u8) -> &mut LookMatcher { + self.lineterm.0 = byte; + self + } + + /// Returns the line terminator that was configured for this matcher. + /// + /// If no line terminator was configured, then this returns `\n`. + /// + /// Note that the line terminator should only be used for matching `(?m:^)` + /// and `(?m:$)` assertions. It specifically should _not_ be used for + /// matching the CRLF aware assertions `(?Rm:^)` and `(?Rm:$)`. + pub fn get_line_terminator(&self) -> u8 { + self.lineterm.0 + } + + /// Returns true when the position `at` in `haystack` satisfies the given + /// look-around assertion. + /// + /// # Panics + /// + /// This panics when testing any Unicode word boundary assertion in this + /// set and when the Unicode word data is not available. Specifically, this + /// only occurs when the `unicode-word-boundary` feature is not enabled. + /// + /// Since it's generally expected that this routine is called inside of + /// a matching engine, callers should check the error condition when + /// building the matching engine. 
If there is a Unicode word boundary
+    /// in the matcher and the data isn't available, then the matcher should
+    /// fail to build.
+    ///
+    /// Callers can check the error condition with [`LookSet::available`].
+    ///
+    /// This also may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    #[inline]
+    pub fn matches(&self, look: Look, haystack: &[u8], at: usize) -> bool {
+        self.matches_inline(look, haystack, at)
+    }
+
+    /// Like `matches`, but forcefully inlined.
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    pub(crate) fn matches_inline(
+        &self,
+        look: Look,
+        haystack: &[u8],
+        at: usize,
+    ) -> bool {
+        match look {
+            Look::Start => self.is_start(haystack, at),
+            Look::End => self.is_end(haystack, at),
+            Look::StartLF => self.is_start_lf(haystack, at),
+            Look::EndLF => self.is_end_lf(haystack, at),
+            Look::StartCRLF => self.is_start_crlf(haystack, at),
+            Look::EndCRLF => self.is_end_crlf(haystack, at),
+            Look::WordAscii => self.is_word_ascii(haystack, at),
+            Look::WordAsciiNegate => self.is_word_ascii_negate(haystack, at),
+            Look::WordUnicode => self.is_word_unicode(haystack, at).unwrap(),
+            Look::WordUnicodeNegate => {
+                self.is_word_unicode_negate(haystack, at).unwrap()
+            }
+        }
+    }
+
+    /// Returns true when _all_ of the assertions in the given set match at the
+    /// given position in the haystack.
+    ///
+    /// # Panics
+    ///
+    /// This panics when testing any Unicode word boundary assertion in this
+    /// set and when the Unicode word data is not available. Specifically, this
+    /// only occurs when the `unicode-word-boundary` feature is not enabled.
+    ///
+    /// Since it's generally expected that this routine is called inside of
+    /// a matching engine, callers should check the error condition when
+    /// building the matching engine. If there is a Unicode word boundary
+    /// in the matcher and the data isn't available, then the matcher should
+    /// fail to build.
+    ///
+    /// Callers can check the error condition with [`LookSet::available`].
+    ///
+    /// This also may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    #[inline]
+    pub fn matches_set(
+        &self,
+        set: LookSet,
+        haystack: &[u8],
+        at: usize,
+    ) -> bool {
+        self.matches_set_inline(set, haystack, at)
+    }
+
+    /// Like `matches_set`, but forcefully inlined for perf.
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    pub(crate) fn matches_set_inline(
+        &self,
+        set: LookSet,
+        haystack: &[u8],
+        at: usize,
+    ) -> bool {
+        // This used to use LookSet::iter with Look::matches on each element,
+        // but that proved to be quite disastrous for perf. The manual "if
+        // the set has this assertion, check it" turns out to be quite a bit
+        // faster.
+ if set.contains(Look::Start) { + if !self.is_start(haystack, at) { + return false; + } + } + if set.contains(Look::End) { + if !self.is_end(haystack, at) { + return false; + } + } + if set.contains(Look::StartLF) { + if !self.is_start_lf(haystack, at) { + return false; + } + } + if set.contains(Look::EndLF) { + if !self.is_end_lf(haystack, at) { + return false; + } + } + if set.contains(Look::StartCRLF) { + if !self.is_start_crlf(haystack, at) { + return false; + } + } + if set.contains(Look::EndCRLF) { + if !self.is_end_crlf(haystack, at) { + return false; + } + } + if set.contains(Look::WordAscii) { + if !self.is_word_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordAsciiNegate) { + if !self.is_word_ascii_negate(haystack, at) { + return false; + } + } + if set.contains(Look::WordUnicode) { + if !self.is_word_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordUnicodeNegate) { + if !self.is_word_unicode_negate(haystack, at).unwrap() { + return false; + } + } + true + } + + /// Split up the given byte classes into equivalence classes in a way that + /// is consistent with this look-around assertion. + #[cfg(feature = "alloc")] + pub(crate) fn add_to_byteset( + &self, + look: Look, + set: &mut crate::util::alphabet::ByteClassSet, + ) { + match look { + Look::Start | Look::End => {} + Look::StartLF | Look::EndLF => { + set.set_range(self.lineterm.0, self.lineterm.0); + } + Look::StartCRLF | Look::EndCRLF => { + set.set_range(b'\r', b'\r'); + set.set_range(b'\n', b'\n'); + } + Look::WordAscii + | Look::WordAsciiNegate + | Look::WordUnicode + | Look::WordUnicodeNegate => { + // We need to mark all ranges of bytes whose pairs result in + // evaluating \b differently. This isn't technically correct + // for Unicode word boundaries, but DFAs can't handle those + // anyway, and thus, the byte classes don't need to either + // since they are themselves only used in DFAs. + // + // FIXME: It seems like the calls to 'set_range' here are + // completely invariant, which means we could just hard-code + // them here without needing to write a loop. And we only need + // to do this dance at most once per regex. + // + // FIXME: Is this correct for \B? + let iswb = utf8::is_word_byte; + // This unwrap is OK because we guard every use of 'asu8' with + // a check that the input is <= 255. + let asu8 = |b: u16| u8::try_from(b).unwrap(); + let mut b1: u16 = 0; + let mut b2: u16; + while b1 <= 255 { + b2 = b1 + 1; + while b2 <= 255 && iswb(asu8(b1)) == iswb(asu8(b2)) { + b2 += 1; + } + // The guards above guarantee that b2 can never get any + // bigger. + assert!(b2 <= 256); + // Subtracting 1 from b2 is always OK because it is always + // at least 1 greater than b1, and the assert above + // guarantees that the asu8 conversion will succeed. + set.set_range(asu8(b1), asu8(b2.checked_sub(1).unwrap())); + b1 = b2; + } + } + } + } + + /// Returns true when [`Look::Start`] is satisfied `at` the given position + /// in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_start(&self, _haystack: &[u8], at: usize) -> bool { + at == 0 + } + + /// Returns true when [`Look::End`] is satisfied `at` the given position in + /// `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. 
+ #[inline] + pub fn is_end(&self, haystack: &[u8], at: usize) -> bool { + at == haystack.len() + } + + /// Returns true when [`Look::StartLF`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_start_lf(&self, haystack: &[u8], at: usize) -> bool { + self.is_start(haystack, at) || haystack[at - 1] == self.lineterm.0 + } + + /// Returns true when [`Look::EndLF`] is satisfied `at` the given position + /// in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_end_lf(&self, haystack: &[u8], at: usize) -> bool { + self.is_end(haystack, at) || haystack[at] == self.lineterm.0 + } + + /// Returns true when [`Look::StartCRLF`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_start_crlf(&self, haystack: &[u8], at: usize) -> bool { + self.is_start(haystack, at) + || haystack[at - 1] == b'\n' + || (haystack[at - 1] == b'\r' + && (at >= haystack.len() || haystack[at] != b'\n')) + } + + /// Returns true when [`Look::EndCRLF`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_end_crlf(&self, haystack: &[u8], at: usize) -> bool { + self.is_end(haystack, at) + || haystack[at] == b'\r' + || (haystack[at] == b'\n' + && (at == 0 || haystack[at - 1] != b'\r')) + } + + /// Returns true when [`Look::WordAscii`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_word_ascii(&self, haystack: &[u8], at: usize) -> bool { + let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + word_before != word_after + } + + /// Returns true when [`Look::WordAsciiNegate`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_word_ascii_negate(&self, haystack: &[u8], at: usize) -> bool { + !self.is_word_ascii(haystack, at) + } + + /// Returns true when [`Look::WordUnicode`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. 
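+    ///
+    /// # Example
+    ///
+    /// A small sketch of typical use; it assumes the default
+    /// `unicode-word-boundary` feature is enabled so that the word data
+    /// tables are available:
+    ///
+    /// ```
+    /// use regex_automata::util::look::LookMatcher;
+    ///
+    /// let lookm = LookMatcher::new();
+    /// // A boundary exists at the edges of "abc", but not between 'a' and 'b'.
+    /// assert!(lookm.is_word_unicode(b"abc", 0).unwrap());
+    /// assert!(lookm.is_word_unicode(b"abc", 3).unwrap());
+    /// assert!(!lookm.is_word_unicode(b"abc", 1).unwrap());
+    /// ```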
+ #[inline] + pub fn is_word_unicode( + &self, + haystack: &[u8], + at: usize, + ) -> Result<bool, UnicodeWordBoundaryError> { + let word_before = is_word_char::rev(haystack, at)?; + let word_after = is_word_char::fwd(haystack, at)?; + Ok(word_before != word_after) + } + + /// Returns true when [`Look::WordUnicodeNegate`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. + #[inline] + pub fn is_word_unicode_negate( + &self, + haystack: &[u8], + at: usize, + ) -> Result<bool, UnicodeWordBoundaryError> { + // This is pretty subtle. Why do we need to do UTF-8 decoding here? + // Well... at time of writing, the is_word_char_{fwd,rev} routines will + // only return true if there is a valid UTF-8 encoding of a "word" + // codepoint, and false in every other case (including invalid UTF-8). + // This means that in regions of invalid UTF-8 (which might be a + // subset of valid UTF-8!), it would result in \B matching. While this + // would be questionable in the context of truly invalid UTF-8, it is + // *certainly* wrong to report match boundaries that split the encoding + // of a codepoint. So to work around this, we ensure that we can decode + // a codepoint on either side of `at`. If either direction fails, then + // we don't permit \B to match at all. + // + // Now, this isn't exactly optimal from a perf perspective. We could + // try and detect this in is_word_char::{fwd,rev}, but it's not clear + // if it's worth it. \B is, after all, rarely used. Even worse, + // is_word_char::{fwd,rev} could do its own UTF-8 decoding, and so this + // will wind up doing UTF-8 decoding twice. Owch. We could fix this + // with more code complexity, but it just doesn't feel worth it for \B. + // + // And in particular, we do *not* have to do this with \b, because \b + // *requires* that at least one side of `at` be a "word" codepoint, + // which in turn implies one side of `at` must be valid UTF-8. This in + // turn implies that \b can never split a valid UTF-8 encoding of a + // codepoint. In the case where one side of `at` is truly invalid UTF-8 + // and the other side IS a word codepoint, then we want \b to match + // since it represents a valid UTF-8 boundary. It also makes sense. For + // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'. + // + // Note also that this is not just '!is_word_unicode(..)' like it is + // for the ASCII case. For example, neither \b nor \B is satisfied + // within invalid UTF-8 sequences. + let word_before = at > 0 + && match utf8::decode_last(&haystack[..at]) { + None | Some(Err(_)) => return Ok(false), + Some(Ok(_)) => is_word_char::rev(haystack, at)?, + }; + let word_after = at < haystack.len() + && match utf8::decode(&haystack[at..]) { + None | Some(Err(_)) => return Ok(false), + Some(Ok(_)) => is_word_char::fwd(haystack, at)?, + }; + Ok(word_before == word_after) + } +} + +impl Default for LookMatcher { + fn default() -> LookMatcher { + LookMatcher::new() + } +} + +/// An error that occurs when the Unicode-aware `\w` class is unavailable. +/// +/// This error can occur when the data tables necessary for the Unicode aware +/// Perl character class `\w` are unavailable. 
The `\w` class is used to
+/// determine whether a codepoint is considered a word character or not when
+/// deciding whether a Unicode aware `\b` (or `\B`) matches at a particular
+/// position.
+///
+/// This error can only occur when the `unicode-word-boundary` feature is
+/// disabled.
+#[derive(Clone, Debug)]
+pub struct UnicodeWordBoundaryError(());
+
+impl UnicodeWordBoundaryError {
+    #[cfg(not(feature = "unicode-word-boundary"))]
+    pub(crate) fn new() -> UnicodeWordBoundaryError {
+        UnicodeWordBoundaryError(())
+    }
+
+    /// Returns an error if and only if Unicode word boundary data is
+    /// unavailable.
+    pub fn check() -> Result<(), UnicodeWordBoundaryError> {
+        is_word_char::check()
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for UnicodeWordBoundaryError {}
+
+impl core::fmt::Display for UnicodeWordBoundaryError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        write!(
+            f,
+            "Unicode-aware \\b and \\B are unavailable because the \
+             requisite data tables are missing, please enable the \
+             unicode-word-boundary feature"
+        )
+    }
+}
+
+// Below are FOUR different ways for checking whether a "word" codepoint
+// exists at a particular position in the haystack. The four different
+// approaches are, in order of preference:
+//
+// 1. Parse '\w', convert to an NFA, convert to a fully compiled DFA on the
+// first call, and then use that DFA for all subsequent calls.
+// 2. Do UTF-8 decoding and use regex_syntax::is_word_character if available.
+// 3. Do UTF-8 decoding and use our own 'perl_word' table.
+// 4. Return an error.
+//
+// The reason for all of these approaches is a combination of perf and
+// permitting one to build regex-automata without the Unicode data necessary
+// for handling Unicode-aware word boundaries. (In which case, '(?-u:\b)' would
+// still work.)
+//
+// The DFA approach is the fastest, but it requires the regex parser, the
+// NFA compiler, the DFA builder and the DFA search runtime. That's a lot to
+// bring in, but if it's available, it's (probably) the best we can do.
+//
+// Approaches (2) and (3) are effectively equivalent, but (2) reuses the
+// data in regex-syntax and avoids duplicating it in regex-automata.
+//
+// Finally, (4) unconditionally returns an error since the requisite data isn't
+// available anywhere.
+//
+// There are actually more approaches possible that we didn't implement. For
+// example, if the DFA builder is available but the syntax parser is not, we
+// could technically hand construct our own NFA from the 'perl_word' data
+// table. But to avoid some pretty hairy code duplication, we would in turn
+// need to pull the UTF-8 compiler out of the NFA compiler. Yikes.
+//
+// A possibly more sensible alternative is to use a lazy DFA when the full
+// DFA builder isn't available...
+//
+// Yet another choice would be to build the full DFA and then embed it into the
+// source. Then we'd only need to bring in the DFA search runtime, which is
+// considerably smaller than the DFA builder code. The problem here is that the
+// Debian people have spooked me[1] into avoiding cyclic dependencies. Namely,
+// we'd need to build regex-cli, which depends on regex-automata in order to
+// build some part of regex-automata. But to be honest, something like this has
+// to be allowed somehow? I just don't know what the right process is.
+//
+// There are perhaps other choices as well. Why did I stop at these 4? Because
+// I wanted to preserve my sanity.
I suspect I'll wind up adding the lazy DFA +// approach eventually, as the benefits of the DFA approach are somewhat +// compelling. The 'boundary-words-holmes' benchmark tests this: +// +// $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv +// +// Then I changed the code below so that the util/unicode_data/perl_word table +// was used and re-ran the benchmark: +// +// $ regex-cli bench measure -f boundary-words-holmes -e pikevm > table.csv +// +// And compared them: +// +// $ regex-cli bench diff dfa.csv table.csv +// benchmark engine dfa table +// --------- ------ --- ----- +// internal/count/boundary-words-holmes regex/automata/pikevm 18.6 MB/s 12.9 MB/s +// +// Which is a nice improvement. +// +// UPDATE: It turns out that it takes approximately 22ms to build the reverse +// DFA for \w. (And about 3ms for the forward DFA.) It's probably not much in +// the grand scheme things, but that is a significant latency cost. So I'm not +// sure that's a good idea. I then tried using a lazy DFA instead, and that +// eliminated the overhead, but since the lazy DFA requires mutable working +// memory, that requires introducing a 'Cache' for every simultaneous call. +// +// I ended up deciding for now to just keep the "UTF-8 decode and check the +// table." The DFA and lazy DFA approaches are still below, but commented out. +// +// [1]: https://github.com/BurntSushi/ucd-generate/issues/11 + +/* +/// A module that looks for word codepoints using lazy DFAs. +#[cfg(all( + feature = "unicode-word-boundary", + feature = "syntax", + feature = "unicode-perl", + feature = "hybrid" +))] +mod is_word_char { + use alloc::vec::Vec; + + use crate::{ + hybrid::dfa::{Cache, DFA}, + nfa::thompson::NFA, + util::{lazy::Lazy, pool::Pool, primitives::StateID}, + Anchored, Input, + }; + + pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { + Ok(()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn fwd( + haystack: &[u8], + mut at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + static WORD: Lazy<DFA> = Lazy::new(|| DFA::new(r"\w").unwrap()); + static CACHE: Lazy<Pool<Cache>> = + Lazy::new(|| Pool::new(|| WORD.create_cache())); + let dfa = Lazy::get(&WORD); + let mut cache = Lazy::get(&CACHE).get(); + let mut sid = dfa + .start_state_forward( + &mut cache, + &Input::new("").anchored(Anchored::Yes), + ) + .unwrap(); + while at < haystack.len() { + let byte = haystack[at]; + sid = dfa.next_state(&mut cache, sid, byte).unwrap(); + at += 1; + if sid.is_tagged() { + if sid.is_match() { + return Ok(true); + } else if sid.is_dead() { + return Ok(false); + } + } + } + Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn rev( + haystack: &[u8], + mut at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + static WORD: Lazy<DFA> = Lazy::new(|| { + DFA::builder() + .thompson(NFA::config().reverse(true)) + .build(r"\w") + .unwrap() + }); + static CACHE: Lazy<Pool<Cache>> = + Lazy::new(|| Pool::new(|| WORD.create_cache())); + let dfa = Lazy::get(&WORD); + let mut cache = Lazy::get(&CACHE).get(); + let mut sid = dfa + .start_state_reverse( + &mut cache, + &Input::new("").anchored(Anchored::Yes), + ) + .unwrap(); + while at > 0 { + at -= 1; + let byte = haystack[at]; + sid = dfa.next_state(&mut cache, sid, byte).unwrap(); + if sid.is_tagged() { + if sid.is_match() { + return Ok(true); + } else if sid.is_dead() { + return Ok(false); + } + } + } + 
Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match()) + } +} +*/ + +/* +/// A module that looks for word codepoints using fully compiled DFAs. +#[cfg(all( + feature = "unicode-word-boundary", + feature = "syntax", + feature = "unicode-perl", + feature = "dfa-build" +))] +mod is_word_char { + use alloc::vec::Vec; + + use crate::{ + dfa::{dense::DFA, Automaton, StartKind}, + nfa::thompson::NFA, + util::{lazy::Lazy, primitives::StateID}, + Anchored, Input, + }; + + pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { + Ok(()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn fwd( + haystack: &[u8], + mut at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| { + let dfa = DFA::builder() + .configure(DFA::config().start_kind(StartKind::Anchored)) + .build(r"\w") + .unwrap(); + // OK because our regex has no look-around. + let start_id = dfa.universal_start_state(Anchored::Yes).unwrap(); + (dfa, start_id) + }); + let &(ref dfa, mut sid) = Lazy::get(&WORD); + while at < haystack.len() { + let byte = haystack[at]; + sid = dfa.next_state(sid, byte); + at += 1; + if dfa.is_special_state(sid) { + if dfa.is_match_state(sid) { + return Ok(true); + } else if dfa.is_dead_state(sid) { + return Ok(false); + } + } + } + Ok(dfa.is_match_state(dfa.next_eoi_state(sid))) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn rev( + haystack: &[u8], + mut at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| { + let dfa = DFA::builder() + .configure(DFA::config().start_kind(StartKind::Anchored)) + // From ad hoc measurements, it looks like setting + // shrink==false is slightly faster than shrink==true. I kind + // of feel like this indicates that shrinking is probably a + // failure, although it can help in some cases. Sigh. + .thompson(NFA::config().reverse(true).shrink(false)) + .build(r"\w") + .unwrap(); + // OK because our regex has no look-around. + let start_id = dfa.universal_start_state(Anchored::Yes).unwrap(); + (dfa, start_id) + }); + let &(ref dfa, mut sid) = Lazy::get(&WORD); + while at > 0 { + at -= 1; + let byte = haystack[at]; + sid = dfa.next_state(sid, byte); + if dfa.is_special_state(sid) { + if dfa.is_match_state(sid) { + return Ok(true); + } else if dfa.is_dead_state(sid) { + return Ok(false); + } + } + } + Ok(dfa.is_match_state(dfa.next_eoi_state(sid))) + } +} +*/ + +/// A module that looks for word codepoints using regex-syntax's data tables. 
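+///
+/// As with the other `is_word_char` variants, `fwd(haystack, at)` reports
+/// whether a word codepoint is encoded starting at `at`, and
+/// `rev(haystack, at)` reports whether a word codepoint is encoded ending at
+/// `at`. Both report false on invalid UTF-8 or when there is no codepoint to
+/// decode.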
+#[cfg(all( + feature = "unicode-word-boundary", + feature = "syntax", + feature = "unicode-perl", +))] +mod is_word_char { + use regex_syntax::try_is_word_character; + + use crate::util::utf8; + + pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { + Ok(()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn fwd( + haystack: &[u8], + at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Ok(match utf8::decode(&haystack[at..]) { + None | Some(Err(_)) => false, + Some(Ok(ch)) => try_is_word_character(ch).expect( + "since unicode-word-boundary, syntax and unicode-perl \ + are all enabled, it is expected that \ + try_is_word_character succeeds", + ), + }) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn rev( + haystack: &[u8], + at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Ok(match utf8::decode_last(&haystack[..at]) { + None | Some(Err(_)) => false, + Some(Ok(ch)) => try_is_word_character(ch).expect( + "since unicode-word-boundary, syntax and unicode-perl \ + are all enabled, it is expected that \ + try_is_word_character succeeds", + ), + }) + } +} + +/// A module that looks for word codepoints using regex-automata's data tables +/// (which are only compiled when regex-syntax's tables aren't available). +/// +/// Note that the cfg should match the one in src/util/unicode_data/mod.rs for +/// perl_word. +#[cfg(all( + feature = "unicode-word-boundary", + not(all(feature = "syntax", feature = "unicode-perl")), +))] +mod is_word_char { + use crate::util::utf8; + + pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { + Ok(()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn fwd( + haystack: &[u8], + at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Ok(match utf8::decode(&haystack[at..]) { + None | Some(Err(_)) => false, + Some(Ok(ch)) => is_word_character(ch), + }) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn rev( + haystack: &[u8], + at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Ok(match utf8::decode_last(&haystack[..at]) { + None | Some(Err(_)) => false, + Some(Ok(ch)) => is_word_character(ch), + }) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_word_character(c: char) -> bool { + use crate::util::{unicode_data::perl_word::PERL_WORD, utf8}; + + // MSRV(1.59): Use 'u8::try_from(c)' instead. + if u8::try_from(u32::from(c)).map_or(false, utf8::is_word_byte) { + return true; + } + PERL_WORD + .binary_search_by(|&(start, end)| { + use core::cmp::Ordering; + + if start <= c && c <= end { + Ordering::Equal + } else if start > c { + Ordering::Greater + } else { + Ordering::Less + } + }) + .is_ok() + } +} + +/// A module that always returns an error if Unicode word boundaries are +/// disabled. When this feature is disabled, then regex-automata will not +/// include its own data tables even if regex-syntax is disabled. 
+#[cfg(not(feature = "unicode-word-boundary"))] +mod is_word_char { + pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { + Err(super::UnicodeWordBoundaryError::new()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn fwd( + _bytes: &[u8], + _at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Err(super::UnicodeWordBoundaryError::new()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn rev( + _bytes: &[u8], + _at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Err(super::UnicodeWordBoundaryError::new()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + macro_rules! testlook { + ($look:expr, $haystack:expr, $at:expr) => { + LookMatcher::default().matches($look, $haystack.as_bytes(), $at) + }; + } + + #[test] + fn look_matches_start_line() { + let look = Look::StartLF; + + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "\n", 0)); + assert!(testlook!(look, "\n", 1)); + assert!(testlook!(look, "a", 0)); + assert!(testlook!(look, "\na", 1)); + + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a\na", 1)); + } + + #[test] + fn look_matches_end_line() { + let look = Look::EndLF; + + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "\n", 1)); + assert!(testlook!(look, "\na", 0)); + assert!(testlook!(look, "\na", 2)); + assert!(testlook!(look, "a\na", 1)); + + assert!(!testlook!(look, "a", 0)); + assert!(!testlook!(look, "\na", 1)); + assert!(!testlook!(look, "a\na", 0)); + assert!(!testlook!(look, "a\na", 2)); + } + + #[test] + fn look_matches_start_text() { + let look = Look::Start; + + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "\n", 0)); + assert!(testlook!(look, "a", 0)); + + assert!(!testlook!(look, "\n", 1)); + assert!(!testlook!(look, "\na", 1)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a\na", 1)); + } + + #[test] + fn look_matches_end_text() { + let look = Look::End; + + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "\n", 1)); + assert!(testlook!(look, "\na", 2)); + + assert!(!testlook!(look, "\na", 0)); + assert!(!testlook!(look, "a\na", 1)); + assert!(!testlook!(look, "a", 0)); + assert!(!testlook!(look, "\na", 1)); + assert!(!testlook!(look, "a\na", 0)); + assert!(!testlook!(look, "a\na", 2)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_unicode() { + let look = Look::WordUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. 
+ assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_ascii() { + let look = Look::WordAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_unicode_negate() { + let look = Look::WordUnicodeNegate; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. 
+ assert!(testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + // These don't match because they could otherwise return an offset that + // splits the UTF-8 encoding of a codepoint. + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. These also don't + // match because they could otherwise return an offset that splits the + // UTF-8 encoding of a codepoint. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end + // of the haystack. So the "end" of the haystack isn't a word and 𐆀 + // isn't a word, thus, \B matches. + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_ascii_negate() { + let look = Look::WordAsciiNegate; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(testlook!(look, "𝛃", 1)); + assert!(testlook!(look, "𝛃", 2)); + assert!(testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. 
+ assert!(testlook!(look, "𝛃𐆀", 1)); + assert!(testlook!(look, "𝛃𐆀", 2)); + assert!(testlook!(look, "𝛃𐆀", 3)); + assert!(testlook!(look, "𝛃𐆀", 5)); + assert!(testlook!(look, "𝛃𐆀", 6)); + assert!(testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_set() { + let mut f = LookSet::default(); + assert!(!f.contains(Look::Start)); + assert!(!f.contains(Look::End)); + assert!(!f.contains(Look::StartLF)); + assert!(!f.contains(Look::EndLF)); + assert!(!f.contains(Look::WordUnicode)); + assert!(!f.contains(Look::WordUnicodeNegate)); + assert!(!f.contains(Look::WordAscii)); + assert!(!f.contains(Look::WordAsciiNegate)); + + f = f.insert(Look::Start); + assert!(f.contains(Look::Start)); + f = f.remove(Look::Start); + assert!(!f.contains(Look::Start)); + + f = f.insert(Look::End); + assert!(f.contains(Look::End)); + f = f.remove(Look::End); + assert!(!f.contains(Look::End)); + + f = f.insert(Look::StartLF); + assert!(f.contains(Look::StartLF)); + f = f.remove(Look::StartLF); + assert!(!f.contains(Look::StartLF)); + + f = f.insert(Look::EndLF); + assert!(f.contains(Look::EndLF)); + f = f.remove(Look::EndLF); + assert!(!f.contains(Look::EndLF)); + + f = f.insert(Look::StartCRLF); + assert!(f.contains(Look::StartCRLF)); + f = f.remove(Look::StartCRLF); + assert!(!f.contains(Look::StartCRLF)); + + f = f.insert(Look::EndCRLF); + assert!(f.contains(Look::EndCRLF)); + f = f.remove(Look::EndCRLF); + assert!(!f.contains(Look::EndCRLF)); + + f = f.insert(Look::WordUnicode); + assert!(f.contains(Look::WordUnicode)); + f = f.remove(Look::WordUnicode); + assert!(!f.contains(Look::WordUnicode)); + + f = f.insert(Look::WordUnicodeNegate); + assert!(f.contains(Look::WordUnicodeNegate)); + f = f.remove(Look::WordUnicodeNegate); + assert!(!f.contains(Look::WordUnicodeNegate)); + + f = f.insert(Look::WordAscii); + assert!(f.contains(Look::WordAscii)); + f = f.remove(Look::WordAscii); + assert!(!f.contains(Look::WordAscii)); + + f = f.insert(Look::WordAsciiNegate); + assert!(f.contains(Look::WordAsciiNegate)); + f = f.remove(Look::WordAsciiNegate); + assert!(!f.contains(Look::WordAsciiNegate)); + } + + #[test] + fn look_set_iter() { + let set = LookSet::empty(); + assert_eq!(0, set.iter().count()); + + let set = LookSet::full(); + assert_eq!(10, set.iter().count()); + + let set = + LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); + assert_eq!(2, set.iter().count()); + + let set = LookSet::empty().insert(Look::StartLF); + assert_eq!(1, set.iter().count()); + + let set = LookSet::empty().insert(Look::WordAsciiNegate); + assert_eq!(1, set.iter().count()); + } + + #[test] + #[cfg(feature = "alloc")] + fn look_set_debug() { + let res = alloc::format!("{:?}", LookSet::empty()); + assert_eq!("∅", res); + let res = alloc::format!("{:?}", LookSet::full()); + assert_eq!("Az^$rRbB𝛃𝚩", res); + } +} diff --git a/vendor/regex-automata/src/util/matchtypes.rs b/vendor/regex-automata/src/util/matchtypes.rs deleted file mode 100644 index de0fa65bf..000000000 --- a/vendor/regex-automata/src/util/matchtypes.rs +++ /dev/null @@ -1,356 +0,0 @@ -use crate::util::id::PatternID; - -/// The kind of match semantics to use for a DFA. -/// -/// The default match kind is `LeftmostFirst`. -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum MatchKind { - /// Report all possible matches. - All, - /// Report only the leftmost matches. When multiple leftmost matches exist, - /// report the match corresponding to the part of the regex that appears - /// first in the syntax. 
- LeftmostFirst, - /// Hints that destructuring should not be exhaustive. - /// - /// This enum may grow additional variants, so this makes sure clients - /// don't count on exhaustive matching. (Otherwise, adding a new variant - /// could break existing code.) - #[doc(hidden)] - __Nonexhaustive, - // There is prior art in RE2 that shows that we should be able to add - // LeftmostLongest too. The tricky part of it is supporting ungreedy - // repetitions. Instead of treating all NFA states as having equivalent - // priority (as in 'All') or treating all NFA states as having distinct - // priority based on order (as in 'LeftmostFirst'), we instead group NFA - // states into sets, and treat members of each set as having equivalent - // priority, but having greater priority than all following members - // of different sets. - // - // However, it's not clear whether it's really worth adding this. After - // all, leftmost-longest can be emulated when using literals by using - // leftmost-first and sorting the literals by length in descending order. - // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will - // always match `a` in `ab` when using leftmost-first, but leftmost-longest - // would match `ab`. -} - -impl MatchKind { - #[cfg(feature = "alloc")] - pub(crate) fn continue_past_first_match(&self) -> bool { - *self == MatchKind::All - } -} - -impl Default for MatchKind { - fn default() -> MatchKind { - MatchKind::LeftmostFirst - } -} - -/// A representation of a match reported by a regex engine. -/// -/// A match records the start and end offsets of the match in the haystack. -/// -/// Every match guarantees that `start <= end`. -#[derive(Clone, Debug, Eq, Hash, PartialEq)] -pub struct Match { - /// The start offset of the match, inclusive. - start: usize, - /// The end offset of the match, exclusive. - end: usize, -} - -impl Match { - /// Create a new match from a byte offset span. - /// - /// # Panics - /// - /// This panics if `end < start`. - #[inline] - pub fn new(start: usize, end: usize) -> Match { - assert!(start <= end); - Match { start, end } - } - - /// The starting position of the match. - #[inline] - pub fn start(&self) -> usize { - self.start - } - - /// The ending position of the match. - #[inline] - pub fn end(&self) -> usize { - self.end - } - - /// Returns the match location as a range. - #[inline] - pub fn range(&self) -> core::ops::Range<usize> { - self.start..self.end - } - - /// Returns true if and only if this match is empty. That is, when - /// `start() == end()`. - /// - /// An empty match can only be returned when the empty string was among - /// the patterns used to build the Aho-Corasick automaton. - #[inline] - pub fn is_empty(&self) -> bool { - self.start == self.end - } -} - -/// A representation of a match reported by a DFA. -/// -/// This is called a "half" match because it only includes the end location -/// (or start location for a reverse match) of a match. This corresponds to the -/// information that a single DFA scan can report. Getting the other half of -/// the match requires a second scan with a reversed DFA. -/// -/// A half match also includes the pattern that matched. The pattern is -/// identified by an ID, which corresponds to its position (starting from `0`) -/// relative to other patterns used to construct the corresponding DFA. If only -/// a single pattern is provided to the DFA, then all matches are guaranteed to -/// have a pattern ID of `0`. 
-#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] -pub struct HalfMatch { - /// The pattern ID. - pub(crate) pattern: PatternID, - /// The offset of the match. - /// - /// For forward searches, the offset is exclusive. For reverse searches, - /// the offset is inclusive. - pub(crate) offset: usize, -} - -impl HalfMatch { - /// Create a new half match from a pattern ID and a byte offset. - #[inline] - pub fn new(pattern: PatternID, offset: usize) -> HalfMatch { - HalfMatch { pattern, offset } - } - - /// Create a new half match from a pattern ID and a byte offset. - /// - /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a - /// [`PatternID`]. This panics if the given `usize` is not representable - /// as a `PatternID`. - #[inline] - pub fn must(pattern: usize, offset: usize) -> HalfMatch { - HalfMatch::new(PatternID::new(pattern).unwrap(), offset) - } - - /// Returns the ID of the pattern that matched. - /// - /// The ID of a pattern is derived from the position in which it was - /// originally inserted into the corresponding DFA. The first pattern has - /// identifier `0`, and each subsequent pattern is `1`, `2` and so on. - #[inline] - pub fn pattern(&self) -> PatternID { - self.pattern - } - - /// The position of the match. - /// - /// If this match was produced by a forward search, then the offset is - /// exclusive. If this match was produced by a reverse search, then the - /// offset is inclusive. - #[inline] - pub fn offset(&self) -> usize { - self.offset - } -} - -/// A representation of a multi match reported by a regex engine. -/// -/// A multi match has two essential pieces of information: the identifier of -/// the pattern that matched, along with the start and end offsets of the match -/// in the haystack. -/// -/// The pattern is identified by an ID, which corresponds to its position -/// (starting from `0`) relative to other patterns used to construct the -/// corresponding regex engine. If only a single pattern is provided, then all -/// multi matches are guaranteed to have a pattern ID of `0`. -/// -/// Every multi match guarantees that `start <= end`. -#[derive(Clone, Debug, Eq, Hash, PartialEq)] -pub struct MultiMatch { - /// The pattern ID. - pattern: PatternID, - /// The start offset of the match, inclusive. - start: usize, - /// The end offset of the match, exclusive. - end: usize, -} - -impl MultiMatch { - /// Create a new match from a pattern ID and a byte offset span. - /// - /// # Panics - /// - /// This panics if `end < start`. - #[inline] - pub fn new(pattern: PatternID, start: usize, end: usize) -> MultiMatch { - assert!(start <= end); - MultiMatch { pattern, start, end } - } - - /// Create a new match from a pattern ID and a byte offset span. - /// - /// This is like [`MultiMatch::new`], but accepts a `usize` instead of a - /// [`PatternID`]. This panics if the given `usize` is not representable - /// as a `PatternID`. - /// - /// # Panics - /// - /// This panics if `end < start` or if `pattern > PatternID::MAX`. - #[inline] - pub fn must(pattern: usize, start: usize, end: usize) -> MultiMatch { - MultiMatch::new(PatternID::new(pattern).unwrap(), start, end) - } - - /// Returns the ID of the pattern that matched. - /// - /// The ID of a pattern is derived from the position in which it was - /// originally inserted into the corresponding regex engine. The first - /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and - /// so on. 
- #[inline] - pub fn pattern(&self) -> PatternID { - self.pattern - } - - /// The starting position of the match. - #[inline] - pub fn start(&self) -> usize { - self.start - } - - /// The ending position of the match. - #[inline] - pub fn end(&self) -> usize { - self.end - } - - /// Returns the match location as a range. - #[inline] - pub fn range(&self) -> core::ops::Range<usize> { - self.start..self.end - } - - /// Returns true if and only if this match is empty. That is, when - /// `start() == end()`. - /// - /// An empty match can only be returned when the empty string was among - /// the patterns used to build the Aho-Corasick automaton. - #[inline] - pub fn is_empty(&self) -> bool { - self.start == self.end - } -} - -/// An error type indicating that a search stopped prematurely without finding -/// a match. -/// -/// This error type implies that one cannot assume that no matches occur, since -/// the search stopped before completing. -/// -/// Normally, when one searches for something, the response is either an -/// affirmative "it was found at this location" or a negative "not found at -/// all." However, in some cases, a regex engine can be configured to stop its -/// search before concluding whether a match exists or not. When this happens, -/// it may be important for the caller to know why the regex engine gave up and -/// where in the input it gave up at. This error type exposes the 'why' and the -/// 'where.' -/// -/// For example, the DFAs provided by this library generally cannot correctly -/// implement Unicode word boundaries. Instead, they provide an option to -/// eagerly support them on ASCII text (since Unicode word boundaries are -/// equivalent to ASCII word boundaries when searching ASCII text), but will -/// "give up" if a non-ASCII byte is seen. In such cases, one is usually -/// required to either report the failure to the caller (unergonomic) or -/// otherwise fall back to some other regex engine (ergonomic, but potentially -/// costly). -/// -/// More generally, some regex engines offer the ability for callers to specify -/// certain bytes that will trigger the regex engine to automatically quit if -/// they are seen. -/// -/// Still yet, there may be other reasons for a failed match. For example, -/// the hybrid DFA provided by this crate can be configured to give up if it -/// believes that it is not efficient. This in turn permits callers to choose a -/// different regex engine. -/// -/// # Advice -/// -/// While this form of error reporting adds complexity, it is generally -/// possible for callers to configure regex engines to never give up a search, -/// and thus never return an error. Indeed, the default configuration for every -/// regex engine in this crate is such that they will never stop searching -/// early. Therefore, the only way to get a match error is if the regex engine -/// is explicitly configured to do so. Options that enable this behavior -/// document the new error conditions they imply. -/// -/// Regex engines for which no errors are possible for any configuration will -/// return the normal `Option<Match>` and not use this error type at all. -/// -/// For example, regex engines in the `dfa` sub-module will only report -/// `MatchError::Quit` if instructed by either -/// [enabling Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary) -/// or by -/// [explicitly specifying one or more quit bytes](crate::dfa::dense::Config::quit). 
-#[derive(Clone, Debug, Eq, Hash, PartialEq)] -pub enum MatchError { - // Note that the first version of this type was called `SearchError` and it - // included a third `None` variant to indicate that the search completed - // and no match was found. However, this was problematic for iterator - // APIs where the `None` sentinel for stopping iteration corresponds - // precisely to the "match not found" case. The fact that the `None` - // variant was buried inside this type was in turn quite awkward. So - // instead, I removed the `None` variant, renamed the type and used - // `Result<Option<Match>, MatchError>` in non-iterator APIs instead of the - // conceptually simpler `Result<Match, MatchError>`. However, we "regain" - // ergonomics by only putting the more complex API in the `try_` variants - // ("fallible") of search methods. The infallible APIs will instead just - // return `Option<Match>` and panic on error. - /// The search saw a "quit" byte at which it was instructed to stop - /// searching. - Quit { - /// The "quit" byte that was observed that caused the search to stop. - byte: u8, - /// The offset at which the quit byte was observed. - offset: usize, - }, - /// The search, based on heuristics, determined that it would be better - /// to stop, typically to provide the caller an opportunity to use an - /// alternative regex engine. - /// - /// Currently, the only way for this to occur is via the lazy DFA and - /// only when it is configured to do so (it will not return this error by - /// default). - GaveUp { - /// The offset at which the search stopped. This corresponds to the - /// position immediately following the last byte scanned. - offset: usize, - }, -} - -#[cfg(feature = "std")] -impl std::error::Error for MatchError {} - -impl core::fmt::Display for MatchError { - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - match *self { - MatchError::Quit { byte, offset } => write!( - f, - "quit search after observing byte \\x{:02X} at offset {}", - byte, offset, - ), - MatchError::GaveUp { offset } => { - write!(f, "gave up searching at offset {}", offset) - } - } - } -} diff --git a/vendor/regex-automata/src/util/memchr.rs b/vendor/regex-automata/src/util/memchr.rs new file mode 100644 index 000000000..a2cbb0732 --- /dev/null +++ b/vendor/regex-automata/src/util/memchr.rs @@ -0,0 +1,93 @@ +/*! +This module defines simple wrapper routines for the memchr functions from the +`memchr` crate. Basically, when the `memchr` crate is available, we use it, +otherwise we use a naive implementation which is still pretty fast. 
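+
+For example (just a sketch; these wrappers are crate-internal, so the real
+call sites live elsewhere in this crate):
+
+```text
+memchr(b'z', b"abcdez")     == Some(5)   // leftmost occurrence
+memrchr(b'a', b"abcdea")    == Some(5)   // rightmost occurrence
+memchr2(b'x', b'c', b"abc") == Some(2)   // leftmost of either needle
+```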
+*/ + +pub(crate) use self::inner::*; + +#[cfg(feature = "perf-literal-substring")] +pub(super) mod inner { + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { + memchr::memchr(n1, haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + memchr::memchr2(n1, n2, haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], + ) -> Option<usize> { + memchr::memchr3(n1, n2, n3, haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { + memchr::memrchr(n1, haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + memchr::memrchr2(n1, n2, haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memrchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], + ) -> Option<usize> { + memchr::memrchr3(n1, n2, n3, haystack) + } +} + +#[cfg(not(feature = "perf-literal-substring"))] +pub(super) mod inner { + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().position(|&b| b == n1) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().position(|&b| b == n1 || b == n2) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], + ) -> Option<usize> { + haystack.iter().position(|&b| b == n1 || b == n2 || b == n3) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().rposition(|&b| b == n1) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().rposition(|&b| b == n1 || b == n2) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memrchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], + ) -> Option<usize> { + haystack.iter().rposition(|&b| b == n1 || b == n2 || b == n3) + } +} diff --git a/vendor/regex-automata/src/util/mod.rs b/vendor/regex-automata/src/util/mod.rs index 798507da2..bb739df1d 100644 --- a/vendor/regex-automata/src/util/mod.rs +++ b/vendor/regex-automata/src/util/mod.rs @@ -1,275 +1,57 @@ /*! -TODO +A collection of modules that provide APIs that are useful across many regex +engines. + +While one should explore the sub-modules directly to get a sense of what's +there, here are some highlights that tie the sub-modules to higher level +use cases: + +* `alphabet` contains APIs that are useful if you're doing low level things +with the DFAs in this crate. For example, implementing determinization or +walking its state graph directly. +* `captures` contains APIs for dealing with capture group matches and their +mapping to "slots" used inside an NFA graph. This is also where you can find +iterators over capture group names. +* `escape` contains types for pretty-printing raw byte slices as strings. +* `iter` contains API helpers for writing regex iterators. +* `lazy` contains a no-std and no-alloc variant of `lazy_static!` and +`once_cell`. 
+* `look` contains APIs for matching and configuring look-around assertions. +* `pool` provides a way to reuse mutable memory allocated in a thread safe +manner. +* `prefilter` provides APIs for building prefilters and using them in searches. +* `primitives` are what you might use if you're doing lower level work on +automata, such as walking an NFA state graph. +* `syntax` provides some higher level convenience functions for interacting +with the `regex-syntax` crate. +* `wire` is useful if you're working with DFA serialization. */ -use core::{ascii, fmt, str}; - -#[cfg(feature = "alloc")] -use alloc::vec::Vec; - pub mod alphabet; -pub(crate) mod bytes; #[cfg(feature = "alloc")] -pub(crate) mod determinize; -pub mod id; +pub mod captures; +pub mod escape; #[cfg(feature = "alloc")] -pub(crate) mod lazy; -pub(crate) mod matchtypes; +pub mod interpolate; +pub mod iter; +pub mod lazy; +pub mod look; +#[cfg(feature = "alloc")] +pub mod pool; pub mod prefilter; +pub mod primitives; +#[cfg(feature = "syntax")] +pub mod syntax; +pub mod wire; + +#[cfg(any(feature = "dfa-build", feature = "hybrid"))] +pub(crate) mod determinize; +pub(crate) mod empty; +pub(crate) mod int; +pub(crate) mod memchr; +pub(crate) mod search; #[cfg(feature = "alloc")] pub(crate) mod sparse_set; pub(crate) mod start; -#[cfg(feature = "alloc")] -pub(crate) mod syntax; - -/// The offset, in bytes, that a match is delayed by in the DFAs generated by -/// this crate. (This includes lazy DFAs.) -/// -/// The purpose of this delay is to support look-ahead such as \b (ASCII-only) -/// and $. In particular, both of these operators may require the -/// identification of the end of input in order to confirm a match. Not only -/// does this mean that all matches must therefore be delayed by a single byte, -/// but that a special EOI value is added to the alphabet of all DFAs. (Which -/// means that even though the alphabet of a DFA is typically all byte values, -/// the actual maximum alphabet size is 257 due to the extra EOI value.) -/// -/// Since we delay matches by only 1 byte, this can't fully support a -/// Unicode-aware \b operator, which requires multi-byte look-ahead. Indeed, -/// DFAs in this crate do not support it. (It's not as simple as just -/// increasing the match offset to do it---otherwise we would---but building -/// the full Unicode-aware word boundary detection into an automaton is quite -/// tricky.) -pub(crate) const MATCH_OFFSET: usize = 1; - -/// A type that wraps a single byte with a convenient fmt::Debug impl that -/// escapes the byte. -pub(crate) struct DebugByte(pub u8); - -impl fmt::Debug for DebugByte { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - // 10 bytes is enough to cover any output from ascii::escape_default. - let mut bytes = [0u8; 10]; - let mut len = 0; - for (i, mut b) in ascii::escape_default(self.0).enumerate() { - // capitalize \xab to \xAB - if i >= 2 && b'a' <= b && b <= b'f' { - b -= 32; - } - bytes[len] = b; - len += 1; - } - write!(f, "{}", str::from_utf8(&bytes[..len]).unwrap()) - } -} - -/// Returns the smallest possible index of the next valid UTF-8 sequence -/// starting after `i`. -/// -/// For all inputs, including invalid UTF-8 and any value of `i`, the return -/// value is guaranteed to be greater than `i`. -/// -/// Generally speaking, this should only be called on `text` when it is -/// permitted to assume that it is valid UTF-8 and where either `i >= -/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence. 
-#[inline(always)] -pub(crate) fn next_utf8(text: &[u8], i: usize) -> usize { - let b = match text.get(i) { - None => return i.checked_add(1).unwrap(), - Some(&b) => b, - }; - // For cases where we see an invalid UTF-8 byte, there isn't much we can do - // other than just start at the next byte. - let inc = utf8_len(b).unwrap_or(1); - i.checked_add(inc).unwrap() -} - -/// Returns true if and only if the given byte is considered a word character. -/// This only applies to ASCII. -/// -/// This was copied from regex-syntax so that we can use it to determine the -/// starting DFA state while searching without depending on regex-syntax. The -/// definition is never going to change, so there's no maintenance/bit-rot -/// hazard here. -#[inline(always)] -pub(crate) fn is_word_byte(b: u8) -> bool { - match b { - b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true, - _ => false, - } -} - -/// Decodes the next UTF-8 encoded codepoint from the given byte slice. -/// -/// If no valid encoding of a codepoint exists at the beginning of the given -/// byte slice, then the first byte is returned instead. -/// -/// This returns `None` if and only if `bytes` is empty. -#[inline(always)] -pub(crate) fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> { - if bytes.is_empty() { - return None; - } - let len = match utf8_len(bytes[0]) { - None => return Some(Err(bytes[0])), - Some(len) if len > bytes.len() => return Some(Err(bytes[0])), - Some(1) => return Some(Ok(bytes[0] as char)), - Some(len) => len, - }; - match str::from_utf8(&bytes[..len]) { - Ok(s) => Some(Ok(s.chars().next().unwrap())), - Err(_) => Some(Err(bytes[0])), - } -} - -/// Decodes the last UTF-8 encoded codepoint from the given byte slice. -/// -/// If no valid encoding of a codepoint exists at the end of the given byte -/// slice, then the last byte is returned instead. -/// -/// This returns `None` if and only if `bytes` is empty. -#[inline(always)] -pub(crate) fn decode_last_utf8(bytes: &[u8]) -> Option<Result<char, u8>> { - if bytes.is_empty() { - return None; - } - let mut start = bytes.len() - 1; - let limit = bytes.len().saturating_sub(4); - while start > limit && !is_leading_or_invalid_utf8_byte(bytes[start]) { - start -= 1; - } - match decode_utf8(&bytes[start..]) { - None => None, - Some(Ok(ch)) => Some(Ok(ch)), - Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])), - } -} - -/// Given a UTF-8 leading byte, this returns the total number of code units -/// in the following encoded codepoint. -/// -/// If the given byte is not a valid UTF-8 leading byte, then this returns -/// `None`. -#[inline(always)] -fn utf8_len(byte: u8) -> Option<usize> { - if byte <= 0x7F { - return Some(1); - } else if byte & 0b1100_0000 == 0b1000_0000 { - return None; - } else if byte <= 0b1101_1111 { - Some(2) - } else if byte <= 0b1110_1111 { - Some(3) - } else if byte <= 0b1111_0111 { - Some(4) - } else { - None - } -} - -/// Returns true if and only if the given byte is either a valid leading UTF-8 -/// byte, or is otherwise an invalid byte that can never appear anywhere in a -/// valid UTF-8 sequence. -#[inline(always)] -fn is_leading_or_invalid_utf8_byte(b: u8) -> bool { - // In the ASCII case, the most significant bit is never set. The leading - // byte of a 2/3/4-byte sequence always has the top two most significant - // bits set. 
For bytes that can never appear anywhere in valid UTF-8, this - // also returns true, since every such byte has its two most significant - // bits set: - // - // \xC0 :: 11000000 - // \xC1 :: 11000001 - // \xF5 :: 11110101 - // \xF6 :: 11110110 - // \xF7 :: 11110111 - // \xF8 :: 11111000 - // \xF9 :: 11111001 - // \xFA :: 11111010 - // \xFB :: 11111011 - // \xFC :: 11111100 - // \xFD :: 11111101 - // \xFE :: 11111110 - // \xFF :: 11111111 - (b & 0b1100_0000) != 0b1000_0000 -} - -#[cfg(feature = "alloc")] -#[inline(always)] -pub(crate) fn is_word_char_fwd(bytes: &[u8], mut at: usize) -> bool { - use core::{ptr, sync::atomic::AtomicPtr}; - - use crate::{ - dfa::{ - dense::{self, DFA}, - Automaton, - }, - util::lazy, - }; - - static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut()); - - let dfa = lazy::get_or_init(&WORD, || { - // TODO: Should we use a lazy DFA here instead? It does complicate - // things somewhat, since we then need a mutable cache, which probably - // means a thread local. - dense::Builder::new() - .configure(dense::Config::new().anchored(true)) - .build(r"\w") - .unwrap() - }); - // This is OK since '\w' contains no look-around. - let mut sid = dfa.universal_start_state(); - while at < bytes.len() { - let byte = bytes[at]; - sid = dfa.next_state(sid, byte); - at += 1; - if dfa.is_special_state(sid) { - if dfa.is_match_state(sid) { - return true; - } else if dfa.is_dead_state(sid) { - return false; - } - } - } - dfa.is_match_state(dfa.next_eoi_state(sid)) -} - -#[cfg(feature = "alloc")] -#[inline(always)] -pub(crate) fn is_word_char_rev(bytes: &[u8], mut at: usize) -> bool { - use core::{ptr, sync::atomic::AtomicPtr}; - - use crate::{ - dfa::{ - dense::{self, DFA}, - Automaton, - }, - nfa::thompson::NFA, - }; - - static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut()); - - let dfa = lazy::get_or_init(&WORD, || { - dense::Builder::new() - .configure(dense::Config::new().anchored(true)) - .thompson(NFA::config().reverse(true).shrink(true)) - .build(r"\w") - .unwrap() - }); - - // This is OK since '\w' contains no look-around. - let mut sid = dfa.universal_start_state(); - while at > 0 { - at -= 1; - let byte = bytes[at]; - sid = dfa.next_state(sid, byte); - if dfa.is_special_state(sid) { - if dfa.is_match_state(sid) { - return true; - } else if dfa.is_dead_state(sid) { - return false; - } - } - } - dfa.is_match_state(dfa.next_eoi_state(sid)) -} +pub(crate) mod unicode_data; +pub(crate) mod utf8; diff --git a/vendor/regex-automata/src/util/pool.rs b/vendor/regex-automata/src/util/pool.rs new file mode 100644 index 000000000..c03d7b013 --- /dev/null +++ b/vendor/regex-automata/src/util/pool.rs @@ -0,0 +1,1142 @@ +// This module provides a relatively simple thread-safe pool of reusable +// objects. For the most part, it's implemented by a stack represented by a +// Mutex<Vec<T>>. It has one small trick: because unlocking a mutex is somewhat +// costly, in the case where a pool is accessed by the first thread that tried +// to get a value, we bypass the mutex. Here are some benchmarks showing the +// difference. +// +// 2022-10-15: These benchmarks are from the old regex crate and they aren't +// easy to reproduce because some rely on older implementations of Pool that +// are no longer around. I've left the results here for posterity, but any +// enterprising individual should feel encouraged to re-litigate the way Pool +// works. I am not at all certain it is the best approach. 
+// +// 1) misc::anchored_literal_long_non_match 21 (18571 MB/s) +// 2) misc::anchored_literal_long_non_match 107 (3644 MB/s) +// 3) misc::anchored_literal_long_non_match 45 (8666 MB/s) +// 4) misc::anchored_literal_long_non_match 19 (20526 MB/s) +// +// (1) represents our baseline: the master branch at the time of writing when +// using the 'thread_local' crate to implement the pool below. +// +// (2) represents a naive pool implemented completely via Mutex<Vec<T>>. There +// is no special trick for bypassing the mutex. +// +// (3) is the same as (2), except it uses Mutex<Vec<Box<T>>>. It is twice as +// fast because a Box<T> is much smaller than the T we use with a Pool in this +// crate. So pushing and popping a Box<T> from a Vec is quite a bit faster +// than for T. +// +// (4) is the same as (3), but with the trick for bypassing the mutex in the +// case of the first-to-get thread. +// +// Why move off of thread_local? Even though (4) is a hair faster than (1) +// above, this was not the main goal. The main goal was to move off of +// thread_local and find a way to *simply* re-capture some of its speed for +// regex's specific case. So again, why move off of it? The *primary* reason is +// because of memory leaks. See https://github.com/rust-lang/regex/issues/362 +// for example. (Why do I want it to be simple? Well, I suppose what I mean is, +// "use as much safe code as possible to minimize risk and be as sure as I can +// be that it is correct.") +// +// My guess is that the thread_local design is probably not appropriate for +// regex since its memory usage scales to the number of active threads that +// have used a regex, where as the pool below scales to the number of threads +// that simultaneously use a regex. While neither case permits contraction, +// since we own the pool data structure below, we can add contraction if a +// clear use case pops up in the wild. More pressingly though, it seems that +// there are at least some use case patterns where one might have many threads +// sitting around that might have used a regex at one point. While thread_local +// does try to reuse space previously used by a thread that has since stopped, +// its maximal memory usage still scales with the total number of active +// threads. In contrast, the pool below scales with the total number of threads +// *simultaneously* using the pool. The hope is that this uses less memory +// overall. And if it doesn't, we can hopefully tune it somehow. +// +// It seems that these sort of conditions happen frequently +// in FFI inside of other more "managed" languages. This was +// mentioned in the issue linked above, and also mentioned here: +// https://github.com/BurntSushi/rure-go/issues/3. And in particular, users +// confirm that disabling the use of thread_local resolves the leak. +// +// There were other weaker reasons for moving off of thread_local as well. +// Namely, at the time, I was looking to reduce dependencies. And for something +// like regex, maintenance can be simpler when we own the full dependency tree. +// +// Note that I am not entirely happy with this pool. It has some subtle +// implementation details and is overall still observable (even with the +// thread owner optimization) in benchmarks. If someone wants to take a crack +// at building something better, please file an issue. Even if it means a +// different API. The API exposed by this pool is not the minimal thing that +// something like a 'Regex' actually needs. 
It could adapt to, for example, +// an API more like what is found in the 'thread_local' crate. However, we do +// really need to support the no-std alloc-only context, or else the regex +// crate wouldn't be able to support no-std alloc-only. However, I'm generally +// okay with making the alloc-only context slower (as it is here), although I +// do find it unfortunate. + +/*! +A thread safe memory pool. + +The principal type in this module is a [`Pool`]. It main use case is for +holding a thread safe collection of mutable scratch spaces (usually called +`Cache` in this crate) that regex engines need to execute a search. This then +permits sharing the same read-only regex object across multiple threads while +having a quick way of reusing scratch space in a thread safe way. This avoids +needing to re-create the scratch space for every search, which could wind up +being quite expensive. +*/ + +/// A thread safe pool that works in an `alloc`-only context. +/// +/// Getting a value out comes with a guard. When that guard is dropped, the +/// value is automatically put back in the pool. The guard provides both a +/// `Deref` and a `DerefMut` implementation for easy access to an underlying +/// `T`. +/// +/// A `Pool` impls `Sync` when `T` is `Send` (even if `T` is not `Sync`). This +/// is possible because a pool is guaranteed to provide a value to exactly one +/// thread at any time. +/// +/// Currently, a pool never contracts in size. Its size is proportional to the +/// maximum number of simultaneous uses. This may change in the future. +/// +/// A `Pool` is a particularly useful data structure for this crate because +/// many of the regex engines require a mutable "cache" in order to execute +/// a search. Since regexes themselves tend to be global, the problem is then: +/// how do you get a mutable cache to execute a search? You could: +/// +/// 1. Use a `thread_local!`, which requires the standard library and requires +/// that the regex pattern be statically known. +/// 2. Use a `Pool`. +/// 3. Make the cache an explicit dependency in your code and pass it around. +/// 4. Put the cache state in a `Mutex`, but this means only one search can +/// execute at a time. +/// 5. Create a new cache for every search. +/// +/// A `thread_local!` is perhaps the best choice if it works for your use case. +/// Putting the cache in a mutex or creating a new cache for every search are +/// perhaps the worst choices. Of the remaining two choices, whether you use +/// this `Pool` or thread through a cache explicitly in your code is a matter +/// of taste and depends on your code architecture. +/// +/// # Warning: may use a spin lock +/// +/// When this crate is compiled _without_ the `std` feature, then this type +/// may used a spin lock internally. This can have subtle effects that may +/// be undesirable. See [Spinlocks Considered Harmful][spinharm] for a more +/// thorough treatment of this topic. +/// +/// [spinharm]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html +/// +/// # Example +/// +/// This example shows how to share a single hybrid regex among multiple +/// threads, while also safely getting exclusive access to a hybrid's +/// [`Cache`](crate::hybrid::regex::Cache) without preventing other searches +/// from running while your thread uses the `Cache`. 
+/// +/// ``` +/// use regex_automata::{ +/// hybrid::regex::{Cache, Regex}, +/// util::{lazy::Lazy, pool::Pool}, +/// Match, +/// }; +/// +/// static RE: Lazy<Regex> = +/// Lazy::new(|| Regex::new("foo[0-9]+bar").unwrap()); +/// static CACHE: Lazy<Pool<Cache>> = +/// Lazy::new(|| Pool::new(|| RE.create_cache())); +/// +/// let expected = Some(Match::must(0, 3..14)); +/// assert_eq!(expected, RE.find(&mut CACHE.get(), b"zzzfoo12345barzzz")); +/// ``` +pub struct Pool<T, F = fn() -> T>(alloc::boxed::Box<inner::Pool<T, F>>); + +impl<T, F> Pool<T, F> { + /// Create a new pool. The given closure is used to create values in + /// the pool when necessary. + pub fn new(create: F) -> Pool<T, F> { + Pool(alloc::boxed::Box::new(inner::Pool::new(create))) + } +} + +impl<T: Send, F: Fn() -> T> Pool<T, F> { + /// Get a value from the pool. The caller is guaranteed to have + /// exclusive access to the given value. Namely, it is guaranteed that + /// this will never return a value that was returned by another call to + /// `get` but was not put back into the pool. + /// + /// When the guard goes out of scope and its destructor is called, then + /// it will automatically be put back into the pool. Alternatively, + /// [`PoolGuard::put`] may be used to explicitly put it back in the pool + /// without relying on its destructor. + /// + /// Note that there is no guarantee provided about which value in the + /// pool is returned. That is, calling get, dropping the guard (causing + /// the value to go back into the pool) and then calling get again is + /// *not* guaranteed to return the same value received in the first `get` + /// call. + pub fn get(&self) -> PoolGuard<'_, T, F> { + PoolGuard(self.0.get()) + } +} + +impl<T: core::fmt::Debug, F> core::fmt::Debug for Pool<T, F> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple("Pool").field(&self.0).finish() + } +} + +/// A guard that is returned when a caller requests a value from the pool. +/// +/// The purpose of the guard is to use RAII to automatically put the value +/// back in the pool once it's dropped. +pub struct PoolGuard<'a, T: Send, F: Fn() -> T>(inner::PoolGuard<'a, T, F>); + +impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { + /// Consumes this guard and puts it back into the pool. + /// + /// This circumvents the guard's `Drop` implementation. This can be useful + /// in circumstances where the automatic `Drop` results in poorer codegen, + /// such as calling non-inlined functions. + pub fn put(this: PoolGuard<'_, T, F>) { + inner::PoolGuard::put(this.0); + } +} + +impl<'a, T: Send, F: Fn() -> T> core::ops::Deref for PoolGuard<'a, T, F> { + type Target = T; + + fn deref(&self) -> &T { + self.0.value() + } +} + +impl<'a, T: Send, F: Fn() -> T> core::ops::DerefMut for PoolGuard<'a, T, F> { + fn deref_mut(&mut self) -> &mut T { + self.0.value_mut() + } +} + +impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug + for PoolGuard<'a, T, F> +{ + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple("PoolGuard").field(&self.0).finish() + } +} + +#[cfg(feature = "std")] +mod inner { + use core::{ + cell::UnsafeCell, + panic::{RefUnwindSafe, UnwindSafe}, + sync::atomic::{AtomicUsize, Ordering}, + }; + + use alloc::{boxed::Box, vec, vec::Vec}; + + use std::{sync::Mutex, thread_local}; + + /// An atomic counter used to allocate thread IDs. + /// + /// We specifically start our counter at 3 so that we can use the values + /// less than it as sentinels. 
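[Editor's note, not part of the diff: stepping back from the internals for a moment, a small caller-side sketch of the public `Pool`/`PoolGuard` API defined above, showing a guard being returned eagerly via `PoolGuard::put` instead of via `Drop`. The scratch buffer type and capacity are arbitrary.]

```
use regex_automata::util::pool::{Pool, PoolGuard};

fn main() {
    // A pool of reusable scratch buffers; the closure creates a fresh value
    // whenever no pooled value is available.
    let pool: Pool<Vec<u8>> = Pool::new(|| Vec::with_capacity(1024));

    // `get` hands out exclusive access; Deref/DerefMut expose the inner T.
    let mut buf = pool.get();
    buf.extend_from_slice(b"scratch space");
    buf.clear();

    // Return the value eagerly instead of waiting for the guard's Drop.
    PoolGuard::put(buf);
}
```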
+ static COUNTER: AtomicUsize = AtomicUsize::new(3); + + /// A thread ID indicating that there is no owner. This is the initial + /// state of a pool. Once a pool has an owner, there is no way to change + /// it. + static THREAD_ID_UNOWNED: usize = 0; + + /// A thread ID indicating that the special owner value is in use and not + /// available. This state is useful for avoiding a case where the owner + /// of a pool calls `get` before putting the result of a previous `get` + /// call back into the pool. + static THREAD_ID_INUSE: usize = 1; + + /// This sentinel is used to indicate that a guard has already been dropped + /// and should not be re-dropped. We use this because our drop code can be + /// called outside of Drop and thus there could be a bug in the internal + /// implementation that results in trying to put the same guard back into + /// the same pool multiple times, and *that* could result in UB if we + /// didn't mark the guard as already having been put back in the pool. + /// + /// So this isn't strictly necessary, but this let's us define some + /// routines as safe (like PoolGuard::put_imp) that we couldn't otherwise + /// do. + static THREAD_ID_DROPPED: usize = 2; + + /// The number of stacks we use inside of the pool. These are only used for + /// non-owners. That is, these represent the "slow" path. + /// + /// In the original implementation of this pool, we only used a single + /// stack. While this might be okay for a couple threads, the prevalence of + /// 32, 64 and even 128 core CPUs has made it untenable. The contention + /// such an environment introduces when threads are doing a lot of searches + /// on short haystacks (a not uncommon use case) is palpable and leads to + /// huge slowdowns. + /// + /// This constant reflects a change from using one stack to the number of + /// stacks that this constant is set to. The stack for a particular thread + /// is simply chosen by `thread_id % MAX_POOL_STACKS`. The idea behind + /// this setup is that there should be a good chance that accesses to the + /// pool will be distributed over several stacks instead of all of them + /// converging to one. + /// + /// This is not a particularly smart or dynamic strategy. Fixing this to a + /// specific number has at least two downsides. First is that it will help, + /// say, an 8 core CPU more than it will a 128 core CPU. (But, crucially, + /// it will still help the 128 core case.) Second is that this may wind + /// up being a little wasteful with respect to memory usage. Namely, if a + /// regex is used on one thread and then moved to another thread, then it + /// could result in creating a new copy of the data in the pool even though + /// only one is actually needed. + /// + /// And that memory usage bit is why this is set to 8 and not, say, 64. + /// Keeping it at 8 limits, to an extent, how much unnecessary memory can + /// be allocated. + /// + /// In an ideal world, we'd be able to have something like this: + /// + /// * Grow the number of stacks as the number of concurrent callers + /// increases. I spent a little time trying this, but even just adding an + /// atomic addition/subtraction for each pop/push for tracking concurrent + /// callers led to a big perf hit. Since even more work would seemingly be + /// required than just an addition/subtraction, I abandoned this approach. + /// * The maximum amount of memory used should scale with respect to the + /// number of concurrent callers and *not* the total number of existing + /// threads. 
This is primarily why the `thread_local` crate isn't used, as + /// as some environments spin up a lot of threads. This led to multiple + /// reports of extremely high memory usage (often described as memory + /// leaks). + /// * Even more ideally, the pool should contract in size. That is, it + /// should grow with bursts and then shrink. But this is a pretty thorny + /// issue to tackle and it might be better to just not. + /// * It would be nice to explore the use of, say, a lock-free stack + /// instead of using a mutex to guard a `Vec` that is ultimately just + /// treated as a stack. The main thing preventing me from exploring this + /// is the ABA problem. The `crossbeam` crate has tools for dealing with + /// this sort of problem (via its epoch based memory reclamation strategy), + /// but I can't justify bringing in all of `crossbeam` as a dependency of + /// `regex` for this. + /// + /// See this issue for more context and discussion: + /// https://github.com/rust-lang/regex/issues/934 + const MAX_POOL_STACKS: usize = 8; + + thread_local!( + /// A thread local used to assign an ID to a thread. + static THREAD_ID: usize = { + let next = COUNTER.fetch_add(1, Ordering::Relaxed); + // SAFETY: We cannot permit the reuse of thread IDs since reusing a + // thread ID might result in more than one thread "owning" a pool, + // and thus, permit accessing a mutable value from multiple threads + // simultaneously without synchronization. The intent of this panic + // is to be a sanity check. It is not expected that the thread ID + // space will actually be exhausted in practice. Even on a 32-bit + // system, it would require spawning 2^32 threads (although they + // wouldn't all need to run simultaneously, so it is in theory + // possible). + // + // This checks that the counter never wraps around, since atomic + // addition wraps around on overflow. + if next == 0 { + panic!("regex: thread ID allocation space exhausted"); + } + next + }; + ); + + /// This puts each stack in the pool below into its own cache line. This is + /// an absolutely critical optimization that tends to have the most impact + /// in high contention workloads. Without forcing each mutex protected + /// into its own cache line, high contention exacerbates the performance + /// problem by causing "false sharing." By putting each mutex in its own + /// cache-line, we avoid the false sharing problem and the affects of + /// contention are greatly reduced. + #[derive(Debug)] + #[repr(C, align(64))] + struct CacheLine<T>(T); + + /// A thread safe pool utilizing std-only features. + /// + /// The main difference between this and the simplistic alloc-only pool is + /// the use of std::sync::Mutex and an "owner thread" optimization that + /// makes accesses by the owner of a pool faster than all other threads. + /// This makes the common case of running a regex within a single thread + /// faster by avoiding mutex unlocking. + pub(super) struct Pool<T, F> { + /// A function to create more T values when stack is empty and a caller + /// has requested a T. + create: F, + /// Multiple stacks of T values to hand out. These are used when a Pool + /// is accessed by a thread that didn't create it. + /// + /// Conceptually this is `Mutex<Vec<Box<T>>>`, but sharded out to make + /// it scale better under high contention work-loads. We index into + /// this sequence via `thread_id % stacks.len()`. + stacks: Vec<CacheLine<Mutex<Vec<Box<T>>>>>, + /// The ID of the thread that owns this pool. 
The owner is the thread + /// that makes the first call to 'get'. When the owner calls 'get', it + /// gets 'owner_val' directly instead of returning a T from 'stack'. + /// See comments elsewhere for details, but this is intended to be an + /// optimization for the common case that makes getting a T faster. + /// + /// It is initialized to a value of zero (an impossible thread ID) as a + /// sentinel to indicate that it is unowned. + owner: AtomicUsize, + /// A value to return when the caller is in the same thread that + /// first called `Pool::get`. + /// + /// This is set to None when a Pool is first created, and set to Some + /// once the first thread calls Pool::get. + owner_val: UnsafeCell<Option<T>>, + } + + // SAFETY: Since we want to use a Pool from multiple threads simultaneously + // behind an Arc, we need for it to be Sync. In cases where T is sync, + // Pool<T> would be Sync. However, since we use a Pool to store mutable + // scratch space, we wind up using a T that has interior mutability and is + // thus itself not Sync. So what we *really* want is for our Pool<T> to by + // Sync even when T is not Sync (but is at least Send). + // + // The only non-sync aspect of a Pool is its 'owner_val' field, which is + // used to implement faster access to a pool value in the common case of + // a pool being accessed in the same thread in which it was created. The + // 'stack' field is also shared, but a Mutex<T> where T: Send is already + // Sync. So we only need to worry about 'owner_val'. + // + // The key is to guarantee that 'owner_val' can only ever be accessed from + // one thread. In our implementation below, we guarantee this by only + // returning the 'owner_val' when the ID of the current thread matches the + // ID of the thread that first called 'Pool::get'. Since this can only ever + // be one thread, it follows that only one thread can access 'owner_val' at + // any point in time. Thus, it is safe to declare that Pool<T> is Sync when + // T is Send. + // + // If there is a way to achieve our performance goals using safe code, then + // I would very much welcome a patch. As it stands, the implementation + // below tries to balance safety with performance. The case where a Regex + // is used from multiple threads simultaneously will suffer a bit since + // getting a value out of the pool will require unlocking a mutex. + // + // We require `F: Send + Sync` because we call `F` at any point on demand, + // potentially from multiple threads simultaneously. + unsafe impl<T: Send, F: Send + Sync> Sync for Pool<T, F> {} + + // If T is UnwindSafe, then since we provide exclusive access to any + // particular value in the pool, the pool should therefore also be + // considered UnwindSafe. + // + // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any + // point on demand, so it needs to be unwind safe on both dimensions for + // the entire Pool to be unwind safe. + impl<T: UnwindSafe, F: UnwindSafe + RefUnwindSafe> UnwindSafe for Pool<T, F> {} + + // If T is UnwindSafe, then since we provide exclusive access to any + // particular value in the pool, the pool should therefore also be + // considered RefUnwindSafe. + // + // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any + // point on demand, so it needs to be unwind safe on both dimensions for + // the entire Pool to be unwind safe. + impl<T: UnwindSafe, F: UnwindSafe + RefUnwindSafe> RefUnwindSafe + for Pool<T, F> + { + } + + impl<T, F> Pool<T, F> { + /// Create a new pool. 
The given closure is used to create values in + /// the pool when necessary. + pub(super) fn new(create: F) -> Pool<T, F> { + // MSRV(1.63): Mark this function as 'const'. I've arranged the + // code such that it should "just work." Then mark the public + // 'Pool::new' method as 'const' too. (The alloc-only Pool::new + // is already 'const', so that should "just work" too.) The only + // thing we're waiting for is Mutex::new to be const. + let mut stacks = Vec::with_capacity(MAX_POOL_STACKS); + for _ in 0..stacks.capacity() { + stacks.push(CacheLine(Mutex::new(vec![]))); + } + let owner = AtomicUsize::new(THREAD_ID_UNOWNED); + let owner_val = UnsafeCell::new(None); // init'd on first access + Pool { create, stacks, owner, owner_val } + } + } + + impl<T: Send, F: Fn() -> T> Pool<T, F> { + /// Get a value from the pool. This may block if another thread is also + /// attempting to retrieve a value from the pool. + pub(super) fn get(&self) -> PoolGuard<'_, T, F> { + // Our fast path checks if the caller is the thread that "owns" + // this pool. Or stated differently, whether it is the first thread + // that tried to extract a value from the pool. If it is, then we + // can return a T to the caller without going through a mutex. + // + // SAFETY: We must guarantee that only one thread gets access + // to this value. Since a thread is uniquely identified by the + // THREAD_ID thread local, it follows that if the caller's thread + // ID is equal to the owner, then only one thread may receive this + // value. This is also why we can get away with what looks like a + // racy load and a store. We know that if 'owner == caller', then + // only one thread can be here, so we don't need to worry about any + // other thread setting the owner to something else. + let caller = THREAD_ID.with(|id| *id); + let owner = self.owner.load(Ordering::Acquire); + if caller == owner { + // N.B. We could also do a CAS here instead of a load/store, + // but ad hoc benchmarking suggests it is slower. And a lot + // slower in the case where `get_slow` is common. + self.owner.store(THREAD_ID_INUSE, Ordering::Release); + return self.guard_owned(caller); + } + self.get_slow(caller, owner) + } + + /// This is the "slow" version that goes through a mutex to pop an + /// allocated value off a stack to return to the caller. (Or, if the + /// stack is empty, a new value is created.) + /// + /// If the pool has no owner, then this will set the owner. + #[cold] + fn get_slow( + &self, + caller: usize, + owner: usize, + ) -> PoolGuard<'_, T, F> { + if owner == THREAD_ID_UNOWNED { + // This sentinel means this pool is not yet owned. We try to + // atomically set the owner. If we do, then this thread becomes + // the owner and we can return a guard that represents the + // special T for the owner. + // + // Note that we set the owner to a different sentinel that + // indicates that the owned value is in use. The owner ID will + // get updated to the actual ID of this thread once the guard + // returned by this function is put back into the pool. + let res = self.owner.compare_exchange( + THREAD_ID_UNOWNED, + THREAD_ID_INUSE, + Ordering::AcqRel, + Ordering::Acquire, + ); + if res.is_ok() { + // SAFETY: A successful CAS above implies this thread is + // the owner and that this is the only such thread that + // can reach here. Thus, there is no data race. 
+ unsafe { + *self.owner_val.get() = Some((self.create)()); + } + return self.guard_owned(caller); + } + } + let stack_id = caller % self.stacks.len(); + // We try to acquire exclusive access to this thread's stack, and + // if so, grab a value from it if we can. We put this in a loop so + // that it's easy to tweak and experiment with a different number + // of tries. In the end, I couldn't see anything obviously better + // than one attempt in ad hoc testing. + for _ in 0..1 { + let mut stack = match self.stacks[stack_id].0.try_lock() { + Err(_) => continue, + Ok(stack) => stack, + }; + if let Some(value) = stack.pop() { + return self.guard_stack(value); + } + // Unlock the mutex guarding the stack before creating a fresh + // value since we no longer need the stack. + drop(stack); + let value = Box::new((self.create)()); + return self.guard_stack(value); + } + // We're only here if we could get access to our stack, so just + // create a new value. This seems like it could be wasteful, but + // waiting for exclusive access to a stack when there's high + // contention is brutal for perf. + self.guard_stack_transient(Box::new((self.create)())) + } + + /// Puts a value back into the pool. Callers don't need to call this. + /// Once the guard that's returned by 'get' is dropped, it is put back + /// into the pool automatically. + fn put_value(&self, value: Box<T>) { + let caller = THREAD_ID.with(|id| *id); + let stack_id = caller % self.stacks.len(); + // As with trying to pop a value from this thread's stack, we + // merely attempt to get access to push this value back on the + // stack. If there's too much contention, we just give up and throw + // the value away. + // + // Interestingly, in ad hoc benchmarking, it is beneficial to + // attempt to push the value back more than once, unlike when + // popping the value. I don't have a good theory for why this is. + // I guess if we drop too many values then that winds up forcing + // the pop operation to create new fresh values and thus leads to + // less reuse. There's definitely a balancing act here. + for _ in 0..10 { + let mut stack = match self.stacks[stack_id].0.try_lock() { + Err(_) => continue, + Ok(stack) => stack, + }; + stack.push(value); + return; + } + } + + /// Create a guard that represents the special owned T. + fn guard_owned(&self, caller: usize) -> PoolGuard<'_, T, F> { + PoolGuard { pool: self, value: Err(caller), discard: false } + } + + /// Create a guard that contains a value from the pool's stack. + fn guard_stack(&self, value: Box<T>) -> PoolGuard<'_, T, F> { + PoolGuard { pool: self, value: Ok(value), discard: false } + } + + /// Create a guard that contains a value from the pool's stack with an + /// instruction to throw away the value instead of putting it back + /// into the pool. + fn guard_stack_transient(&self, value: Box<T>) -> PoolGuard<'_, T, F> { + PoolGuard { pool: self, value: Ok(value), discard: true } + } + } + + impl<T: core::fmt::Debug, F> core::fmt::Debug for Pool<T, F> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("Pool") + .field("stacks", &self.stacks) + .field("owner", &self.owner) + .field("owner_val", &self.owner_val) + .finish() + } + } + + /// A guard that is returned when a caller requests a value from the pool. + pub(super) struct PoolGuard<'a, T: Send, F: Fn() -> T> { + /// The pool that this guard is attached to. + pool: &'a Pool<T, F>, + /// This is Err when the guard represents the special "owned" value. 
+ /// In which case, the value is retrieved from 'pool.owner_val'. And + /// in the special case of `Err(THREAD_ID_DROPPED)`, it means the + /// guard has been put back into the pool and should no longer be used. + value: Result<Box<T>, usize>, + /// When true, the value should be discarded instead of being pushed + /// back into the pool. We tend to use this under high contention, and + /// this allows us to avoid inflating the size of the pool. (Because + /// under contention, we tend to create more values instead of waiting + /// for access to a stack of existing values.) + discard: bool, + } + + impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { + /// Return the underlying value. + pub(super) fn value(&self) -> &T { + match self.value { + Ok(ref v) => &**v, + // SAFETY: This is safe because the only way a PoolGuard gets + // created for self.value=Err is when the current thread + // corresponds to the owning thread, of which there can only + // be one. Thus, we are guaranteed to be providing exclusive + // access here which makes this safe. + // + // Also, since 'owner_val' is guaranteed to be initialized + // before an owned PoolGuard is created, the unchecked unwrap + // is safe. + Err(id) => unsafe { + // This assert is *not* necessary for safety, since we + // should never be here if the guard had been put back into + // the pool. This is a sanity check to make sure we didn't + // break an internal invariant. + debug_assert_ne!(THREAD_ID_DROPPED, id); + (*self.pool.owner_val.get()).as_ref().unwrap_unchecked() + }, + } + } + + /// Return the underlying value as a mutable borrow. + pub(super) fn value_mut(&mut self) -> &mut T { + match self.value { + Ok(ref mut v) => &mut **v, + // SAFETY: This is safe because the only way a PoolGuard gets + // created for self.value=None is when the current thread + // corresponds to the owning thread, of which there can only + // be one. Thus, we are guaranteed to be providing exclusive + // access here which makes this safe. + // + // Also, since 'owner_val' is guaranteed to be initialized + // before an owned PoolGuard is created, the unwrap_unchecked + // is safe. + Err(id) => unsafe { + // This assert is *not* necessary for safety, since we + // should never be here if the guard had been put back into + // the pool. This is a sanity check to make sure we didn't + // break an internal invariant. + debug_assert_ne!(THREAD_ID_DROPPED, id); + (*self.pool.owner_val.get()).as_mut().unwrap_unchecked() + }, + } + } + + /// Consumes this guard and puts it back into the pool. + pub(super) fn put(this: PoolGuard<'_, T, F>) { + // Since this is effectively consuming the guard and putting the + // value back into the pool, there's no reason to run its Drop + // impl after doing this. I don't believe there is a correctness + // problem with doing so, but there's definitely a perf problem + // by redoing this work. So we avoid it. + let mut this = core::mem::ManuallyDrop::new(this); + this.put_imp(); + } + + /// Puts this guard back into the pool by only borrowing the guard as + /// mutable. This should be called at most once. + #[inline(always)] + fn put_imp(&mut self) { + match core::mem::replace(&mut self.value, Err(THREAD_ID_DROPPED)) { + Ok(value) => { + // If we were told to discard this value then don't bother + // trying to put it back into the pool. This occurs when + // the pop operation failed to acquire a lock and we + // decided to create a new value in lieu of contending for + // the lock. 
+ if self.discard { + return; + } + self.pool.put_value(value); + } + // If this guard has a value "owned" by the thread, then + // the Pool guarantees that this is the ONLY such guard. + // Therefore, in order to place it back into the pool and make + // it available, we need to change the owner back to the owning + // thread's ID. But note that we use the ID that was stored in + // the guard, since a guard can be moved to another thread and + // dropped. (A previous iteration of this code read from the + // THREAD_ID thread local, which uses the ID of the current + // thread which may not be the ID of the owning thread! This + // also avoids the TLS access, which is likely a hair faster.) + Err(owner) => { + // If we hit this point, it implies 'put_imp' has been + // called multiple times for the same guard which in turn + // corresponds to a bug in this implementation. + assert_ne!(THREAD_ID_DROPPED, owner); + self.pool.owner.store(owner, Ordering::Release); + } + } + } + } + + impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> { + fn drop(&mut self) { + self.put_imp(); + } + } + + impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug + for PoolGuard<'a, T, F> + { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_struct("PoolGuard") + .field("pool", &self.pool) + .field("value", &self.value) + .finish() + } + } +} + +// FUTURE: We should consider using Mara Bos's nearly-lock-free version of this +// here: https://gist.github.com/m-ou-se/5fdcbdf7dcf4585199ce2de697f367a4. +// +// One reason why I did things with a "mutex" below is that it isolates the +// safety concerns to just the Mutex, where as the safety of Mara's pool is a +// bit more sprawling. I also expect this code to not be used that much, and +// so is unlikely to get as much real world usage with which to test it. That +// means the "obviously correct" lever is an important one. +// +// The specific reason to use Mara's pool is that it is likely faster and also +// less likely to hit problems with spin-locks, although it is not completely +// impervious to them. +// +// The best solution to this problem, probably, is a truly lock free pool. That +// could be done with a lock free linked list. The issue is the ABA problem. It +// is difficult to avoid, and doing so is complex. BUT, the upshot of that is +// that if we had a truly lock free pool, then we could also use it above in +// the 'std' pool instead of a Mutex because it should be completely free the +// problems that come from spin-locks. +#[cfg(not(feature = "std"))] +mod inner { + use core::{ + cell::UnsafeCell, + panic::{RefUnwindSafe, UnwindSafe}, + sync::atomic::{AtomicBool, Ordering}, + }; + + use alloc::{boxed::Box, vec, vec::Vec}; + + /// A thread safe pool utilizing alloc-only features. + /// + /// Unlike the std version, it doesn't seem possible(?) to implement the + /// "thread owner" optimization because alloc-only doesn't have any concept + /// of threads. So the best we can do is just a normal stack. This will + /// increase latency in alloc-only environments. + pub(super) struct Pool<T, F> { + /// A stack of T values to hand out. These are used when a Pool is + /// accessed by a thread that didn't create it. + stack: Mutex<Vec<Box<T>>>, + /// A function to create more T values when stack is empty and a caller + /// has requested a T. 
+ create: F, + } + + // If T is UnwindSafe, then since we provide exclusive access to any + // particular value in the pool, it should therefore also be considered + // RefUnwindSafe. + impl<T: UnwindSafe, F: UnwindSafe> RefUnwindSafe for Pool<T, F> {} + + impl<T, F> Pool<T, F> { + /// Create a new pool. The given closure is used to create values in + /// the pool when necessary. + pub(super) const fn new(create: F) -> Pool<T, F> { + Pool { stack: Mutex::new(vec![]), create } + } + } + + impl<T: Send, F: Fn() -> T> Pool<T, F> { + /// Get a value from the pool. This may block if another thread is also + /// attempting to retrieve a value from the pool. + pub(super) fn get(&self) -> PoolGuard<'_, T, F> { + let mut stack = self.stack.lock(); + let value = match stack.pop() { + None => Box::new((self.create)()), + Some(value) => value, + }; + PoolGuard { pool: self, value: Some(value) } + } + + fn put(&self, guard: PoolGuard<'_, T, F>) { + let mut guard = core::mem::ManuallyDrop::new(guard); + if let Some(value) = guard.value.take() { + self.put_value(value); + } + } + + /// Puts a value back into the pool. Callers don't need to call this. + /// Once the guard that's returned by 'get' is dropped, it is put back + /// into the pool automatically. + fn put_value(&self, value: Box<T>) { + let mut stack = self.stack.lock(); + stack.push(value); + } + } + + impl<T: core::fmt::Debug, F> core::fmt::Debug for Pool<T, F> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("Pool").field("stack", &self.stack).finish() + } + } + + /// A guard that is returned when a caller requests a value from the pool. + pub(super) struct PoolGuard<'a, T: Send, F: Fn() -> T> { + /// The pool that this guard is attached to. + pool: &'a Pool<T, F>, + /// This is None after the guard has been put back into the pool. + value: Option<Box<T>>, + } + + impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { + /// Return the underlying value. + pub(super) fn value(&self) -> &T { + self.value.as_deref().unwrap() + } + + /// Return the underlying value as a mutable borrow. + pub(super) fn value_mut(&mut self) -> &mut T { + self.value.as_deref_mut().unwrap() + } + + /// Consumes this guard and puts it back into the pool. + pub(super) fn put(this: PoolGuard<'_, T, F>) { + // Since this is effectively consuming the guard and putting the + // value back into the pool, there's no reason to run its Drop + // impl after doing this. I don't believe there is a correctness + // problem with doing so, but there's definitely a perf problem + // by redoing this work. So we avoid it. + let mut this = core::mem::ManuallyDrop::new(this); + this.put_imp(); + } + + /// Puts this guard back into the pool by only borrowing the guard as + /// mutable. This should be called at most once. + #[inline(always)] + fn put_imp(&mut self) { + if let Some(value) = self.value.take() { + self.pool.put_value(value); + } + } + } + + impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> { + fn drop(&mut self) { + self.put_imp(); + } + } + + impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug + for PoolGuard<'a, T, F> + { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_struct("PoolGuard") + .field("pool", &self.pool) + .field("value", &self.value) + .finish() + } + } + + /// A spin-lock based mutex. Yes, I have read spinlocks cosnidered + /// harmful[1], and if there's a reasonable alternative choice, I'll + /// happily take it. 
+ /// + /// I suspect the most likely alternative here is a Treiber stack, but + /// implementing one correctly in a way that avoids the ABA problem looks + /// subtle enough that I'm not sure I want to attempt that. But otherwise, + /// we only need a mutex in order to implement our pool, so if there's + /// something simpler we can use that works for our `Pool` use case, then + /// that would be great. + /// + /// Note that this mutex does not do poisoning. + /// + /// [1]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html + #[derive(Debug)] + struct Mutex<T> { + locked: AtomicBool, + data: UnsafeCell<T>, + } + + // SAFETY: Since a Mutex guarantees exclusive access, as long as we can + // send it across threads, it must also be Sync. + unsafe impl<T: Send> Sync for Mutex<T> {} + + impl<T> Mutex<T> { + /// Create a new mutex for protecting access to the given value across + /// multiple threads simultaneously. + const fn new(value: T) -> Mutex<T> { + Mutex { + locked: AtomicBool::new(false), + data: UnsafeCell::new(value), + } + } + + /// Lock this mutex and return a guard providing exclusive access to + /// `T`. This blocks if some other thread has already locked this + /// mutex. + fn lock(&self) -> MutexGuard<'_, T> { + while self + .locked + .compare_exchange( + false, + true, + Ordering::AcqRel, + Ordering::Acquire, + ) + .is_err() + { + core::hint::spin_loop(); + } + // SAFETY: The only way we're here is if we successfully set + // 'locked' to true, which implies we must be the only thread here + // and thus have exclusive access to 'data'. + let data = unsafe { &mut *self.data.get() }; + MutexGuard { locked: &self.locked, data } + } + } + + /// A guard that derefs to &T and &mut T. When it's dropped, the lock is + /// released. + #[derive(Debug)] + struct MutexGuard<'a, T> { + locked: &'a AtomicBool, + data: &'a mut T, + } + + impl<'a, T> core::ops::Deref for MutexGuard<'a, T> { + type Target = T; + + fn deref(&self) -> &T { + self.data + } + } + + impl<'a, T> core::ops::DerefMut for MutexGuard<'a, T> { + fn deref_mut(&mut self) -> &mut T { + self.data + } + } + + impl<'a, T> Drop for MutexGuard<'a, T> { + fn drop(&mut self) { + // Drop means 'data' is no longer accessible, so we can unlock + // the mutex. + self.locked.store(false, Ordering::Release); + } + } +} + +#[cfg(test)] +mod tests { + use core::panic::{RefUnwindSafe, UnwindSafe}; + + use alloc::{boxed::Box, vec, vec::Vec}; + + use super::*; + + #[test] + fn oibits() { + fn assert_oitbits<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {} + assert_oitbits::<Pool<Vec<u32>>>(); + assert_oitbits::<Pool<core::cell::RefCell<Vec<u32>>>>(); + assert_oitbits::< + Pool< + Vec<u32>, + Box< + dyn Fn() -> Vec<u32> + + Send + + Sync + + UnwindSafe + + RefUnwindSafe, + >, + >, + >(); + } + + // Tests that Pool implements the "single owner" optimization. That is, the + // thread that first accesses the pool gets its own copy, while all other + // threads get distinct copies. 
+ #[cfg(feature = "std")] + #[test] + fn thread_owner_optimization() { + use std::{cell::RefCell, sync::Arc, vec}; + + let pool: Arc<Pool<RefCell<Vec<char>>>> = + Arc::new(Pool::new(|| RefCell::new(vec!['a']))); + pool.get().borrow_mut().push('x'); + + let pool1 = pool.clone(); + let t1 = std::thread::spawn(move || { + let guard = pool1.get(); + guard.borrow_mut().push('y'); + }); + + let pool2 = pool.clone(); + let t2 = std::thread::spawn(move || { + let guard = pool2.get(); + guard.borrow_mut().push('z'); + }); + + t1.join().unwrap(); + t2.join().unwrap(); + + // If we didn't implement the single owner optimization, then one of + // the threads above is likely to have mutated the [a, x] vec that + // we stuffed in the pool before spawning the threads. But since + // neither thread was first to access the pool, and because of the + // optimization, we should be guaranteed that neither thread mutates + // the special owned pool value. + // + // (Technically this is an implementation detail and not a contract of + // Pool's API.) + assert_eq!(vec!['a', 'x'], *pool.get().borrow()); + } + + // This tests that if the "owner" of a pool asks for two values, then it + // gets two distinct values and not the same one. This test failed in the + // course of developing the pool, which in turn resulted in UB because it + // permitted getting aliasing &mut borrows to the same place in memory. + #[test] + fn thread_owner_distinct() { + let pool = Pool::new(|| vec!['a']); + + { + let mut g1 = pool.get(); + let v1 = &mut *g1; + let mut g2 = pool.get(); + let v2 = &mut *g2; + v1.push('b'); + v2.push('c'); + assert_eq!(&mut vec!['a', 'b'], v1); + assert_eq!(&mut vec!['a', 'c'], v2); + } + // This isn't technically guaranteed, but we + // expect to now get the "owned" value (the first + // call to 'get()' above) now that it's back in + // the pool. + assert_eq!(&mut vec!['a', 'b'], &mut *pool.get()); + } + + // This tests that we can share a guard with another thread, mutate the + // underlying value and everything works. This failed in the course of + // developing a pool since the pool permitted 'get()' to return the same + // value to the owner thread, even before the previous value was put back + // into the pool. This in turn resulted in this test producing a data race. + #[cfg(feature = "std")] + #[test] + fn thread_owner_sync() { + let pool = Pool::new(|| vec!['a']); + { + let mut g1 = pool.get(); + let mut g2 = pool.get(); + std::thread::scope(|s| { + s.spawn(|| { + g1.push('b'); + }); + s.spawn(|| { + g2.push('c'); + }); + }); + + let v1 = &mut *g1; + let v2 = &mut *g2; + assert_eq!(&mut vec!['a', 'b'], v1); + assert_eq!(&mut vec!['a', 'c'], v2); + } + + // This isn't technically guaranteed, but we + // expect to now get the "owned" value (the first + // call to 'get()' above) now that it's back in + // the pool. + assert_eq!(&mut vec!['a', 'b'], &mut *pool.get()); + } + + // This tests that if we move a PoolGuard that is owned by the current + // thread to another thread and drop it, then the thread owner doesn't + // change. During development of the pool, this test failed because the + // PoolGuard assumed it was dropped in the same thread from which it was + // created, and thus used the current thread's ID as the owner, which could + // be different than the actual owner of the pool. + #[cfg(feature = "std")] + #[test] + fn thread_owner_send_drop() { + let pool = Pool::new(|| vec!['a']); + // Establishes this thread as the owner. 
+ { + pool.get().push('b'); + } + std::thread::scope(|s| { + // Sanity check that we get the same value back. + // (Not technically guaranteed.) + let mut g = pool.get(); + assert_eq!(&vec!['a', 'b'], &*g); + // Now push it to another thread and drop it. + s.spawn(move || { + g.push('c'); + }) + .join() + .unwrap(); + }); + // Now check that we're still the owner. This is not technically + // guaranteed by the API, but is true in practice given the thread + // owner optimization. + assert_eq!(&vec!['a', 'b', 'c'], &*pool.get()); + } +} diff --git a/vendor/regex-automata/src/util/prefilter.rs b/vendor/regex-automata/src/util/prefilter.rs deleted file mode 100644 index 5fe151524..000000000 --- a/vendor/regex-automata/src/util/prefilter.rs +++ /dev/null @@ -1,281 +0,0 @@ -use crate::Match; - -/// A candidate is the result of running a prefilter on a haystack at a -/// particular position. The result is one of no match, a confirmed match or -/// a possible match. -/// -/// When no match is returned, the prefilter is guaranteeing that no possible -/// match can be found in the haystack, and the caller may trust this. That is, -/// all correct prefilters must never report false negatives. -/// -/// In some cases, a prefilter can confirm a match very quickly, in which case, -/// the caller may use this to stop what it's doing and report the match. In -/// this case, prefilter implementations must never report a false positive. -/// In other cases, the prefilter can only report a potential match, in which -/// case the callers must attempt to confirm the match. In this case, prefilter -/// implementations are permitted to return false positives. -#[derive(Clone, Debug)] -pub enum Candidate { - /// The prefilter reports that no match is possible. Prefilter - /// implementations will never report false negatives. - None, - /// The prefilter reports that a match has been confirmed at the provided - /// byte offsets. When this variant is reported, the prefilter is - /// guaranteeing a match. No false positives are permitted. - Match(Match), - /// The prefilter reports that a match *may* start at the given position. - /// When this variant is reported, it may correspond to a false positive. - PossibleStartOfMatch(usize), -} - -impl Candidate { - /// Convert this candidate into an option. This is useful when callers do - /// not distinguish between true positives and false positives (i.e., the - /// caller must always confirm the match in order to update some other - /// state). - /// - /// The byte offset in the option returned corresponds to the starting - /// position of the possible match. - pub fn into_option(self) -> Option<usize> { - match self { - Candidate::None => None, - Candidate::Match(ref m) => Some(m.start()), - Candidate::PossibleStartOfMatch(start) => Some(start), - } - } -} - -/// A prefilter describes the behavior of fast literal scanners for quickly -/// skipping past bytes in the haystack that we know cannot possibly -/// participate in a match. -pub trait Prefilter: core::fmt::Debug { - /// Returns the next possible match candidate. This may yield false - /// positives, so callers must confirm a match starting at the position - /// returned. This, however, must never produce false negatives. That is, - /// this must, at minimum, return the starting position of the next match - /// in the given haystack after or at the given position. 
- fn next_candidate( - &self, - state: &mut State, - haystack: &[u8], - at: usize, - ) -> Candidate; - - /// Returns the approximate total amount of heap used by this prefilter, in - /// units of bytes. - fn heap_bytes(&self) -> usize; - - /// Returns true if and only if this prefilter may return false positives - /// via the `Candidate::PossibleStartOfMatch` variant. This is most useful - /// when false positives are not posssible (in which case, implementations - /// should return false), which may allow completely avoiding heavier regex - /// machinery when the prefilter can quickly confirm its own matches. - /// - /// By default, this returns true, which is conservative; it is always - /// correct to return `true`. Returning `false` here and reporting a false - /// positive will result in incorrect searches. - fn reports_false_positives(&self) -> bool { - true - } -} - -impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P { - #[inline] - fn next_candidate( - &self, - state: &mut State, - haystack: &[u8], - at: usize, - ) -> Candidate { - (**self).next_candidate(state, haystack, at) - } - - fn heap_bytes(&self) -> usize { - (**self).heap_bytes() - } - - fn reports_false_positives(&self) -> bool { - (**self).reports_false_positives() - } -} - -#[derive(Clone)] -pub struct Scanner<'p> { - prefilter: &'p dyn Prefilter, - state: State, -} - -impl<'p> Scanner<'p> { - pub fn new(prefilter: &'p dyn Prefilter) -> Scanner<'p> { - Scanner { prefilter, state: State::new() } - } - - pub(crate) fn is_effective(&mut self, at: usize) -> bool { - self.state.is_effective(at) - } - - pub(crate) fn reports_false_positives(&self) -> bool { - self.prefilter.reports_false_positives() - } - - pub(crate) fn next_candidate( - &mut self, - bytes: &[u8], - at: usize, - ) -> Candidate { - let cand = self.prefilter.next_candidate(&mut self.state, bytes, at); - match cand { - Candidate::None => { - self.state.update_skipped_bytes(bytes.len() - at); - } - Candidate::Match(ref m) => { - self.state.update_skipped_bytes(m.start() - at); - } - Candidate::PossibleStartOfMatch(i) => { - self.state.update_skipped_bytes(i - at); - } - } - cand - } -} - -impl<'p> core::fmt::Debug for Scanner<'p> { - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - f.debug_struct("Scanner").field("state", &self.state).finish() - } -} - -/// State tracks state associated with the effectiveness of a -/// prefilter. It is used to track how many bytes, on average, are skipped by -/// the prefilter. If this average dips below a certain threshold over time, -/// then the state renders the prefilter inert and stops using it. -/// -/// A prefilter state should be created for each search. (Where creating an -/// iterator via, e.g., `find_iter`, is treated as a single search.) -#[derive(Clone, Debug)] -pub struct State { - /// The number of skips that has been executed. - skips: usize, - /// The total number of bytes that have been skipped. - skipped: usize, - /// Once this heuristic has been deemed permanently ineffective, it will be - /// inert throughout the rest of its lifetime. This serves as a cheap way - /// to check inertness. - inert: bool, - /// The last (absolute) position at which a prefilter scanned to. - /// Prefilters can use this position to determine whether to re-scan or - /// not. - /// - /// Unlike other things that impact effectiveness, this is a fleeting - /// condition. 
That is, a prefilter can be considered ineffective if it is - /// at a position before `last_scan_at`, but can become effective again - /// once the search moves past `last_scan_at`. - /// - /// The utility of this is to both avoid additional overhead from calling - /// the prefilter and to avoid quadratic behavior. This ensures that a - /// prefilter will scan any particular byte at most once. (Note that some - /// prefilters, like the start-byte prefilter, do not need to use this - /// field at all, since it only looks for starting bytes.) - last_scan_at: usize, -} - -impl State { - /// The minimum number of skip attempts to try before considering whether - /// a prefilter is effective or not. - const MIN_SKIPS: usize = 40; - - /// The minimum amount of bytes that skipping must average. - /// - /// That is, after MIN_SKIPS have occurred, if the average number of bytes - /// skipped ever falls below MIN_AVG_SKIP, then the prefilter will be - /// rendered inert. - const MIN_AVG_SKIP: usize = 16; - - /// Create a fresh prefilter state. - pub fn new() -> State { - State { skips: 0, skipped: 0, inert: false, last_scan_at: 0 } - } - - /// Updates the position at which the last scan stopped. This may be - /// greater than the position of the last candidate reported. For example, - /// searching for the byte `z` in `abczdef` for the pattern `abcz` will - /// report a candidate at position `0`, but the end of its last scan will - /// be at position `3`. - /// - /// This position factors into the effectiveness of this prefilter. If the - /// current position is less than the last position at which a scan ended, - /// then the prefilter should not be re-run until the search moves past - /// that position. - /// - /// It is always correct to never update the last scan position. In fact, - /// it is also always correct to set the last scan position to an arbitrary - /// value. The key is setting it to a position in the future at which it - /// makes sense to restart the prefilter. - pub fn update_last_scan(&mut self, at: usize) { - if at > self.last_scan_at { - self.last_scan_at = at; - } - } - - /// Return true if and only if this state indicates that a prefilter is - /// still effective. If the prefilter is not effective, then this state - /// is rendered "inert." At which point, all subsequent calls to - /// `is_effective` on this state will return `false`. - /// - /// `at` should correspond to the current starting position of the search. - /// - /// Callers typically do not need to use this, as it represents the - /// default implementation of - /// [`Prefilter::is_effective`](trait.Prefilter.html#tymethod.is_effective). - fn is_effective(&mut self, at: usize) -> bool { - if self.inert { - return false; - } - if at < self.last_scan_at { - return false; - } - if self.skips < State::MIN_SKIPS { - return true; - } - - if self.skipped >= State::MIN_AVG_SKIP * self.skips { - return true; - } - - // We're inert. - self.inert = true; - false - } - - /// Update this state with the number of bytes skipped on the last - /// invocation of the prefilter. - fn update_skipped_bytes(&mut self, skipped: usize) { - self.skips += 1; - self.skipped += skipped; - } -} - -/// A `Prefilter` implementation that reports a possible match at every -/// position. -/// -/// This should generally not be used as an actual prefilter. It is only -/// useful when one needs to represent the absence of a prefilter in a generic -/// context. 
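The inertness heuristic above reduces to a little arithmetic, so a standalone sketch may be clearer than prose. It mirrors `is_effective` with `MIN_SKIPS` and `MIN_AVG_SKIP` inlined and deliberately leaves out the `inert` flag and the `last_scan_at` check.

```rust
// Standalone restatement of the effectiveness heuristic (constants inlined).
fn still_effective(skips: usize, skipped: usize) -> bool {
    if skips < 40 {
        // Fewer than MIN_SKIPS samples: keep trying the prefilter.
        return true;
    }
    // Must have skipped at least MIN_AVG_SKIP bytes per skip on average. For
    // example, after exactly 40 skips, fewer than 640 bytes skipped in total
    // renders the prefilter inert.
    skipped >= 16 * skips
}
```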
For example, a [`dfa::regex::Regex`](crate::dfa::regex::Regex) -/// uses this prefilter by default to indicate that no prefilter should be -/// used. -/// -/// A `None` prefilter value cannot be constructed. -#[derive(Clone, Debug)] -pub struct None { - _priv: (), -} - -impl Prefilter for None { - fn next_candidate(&self, _: &mut State, _: &[u8], at: usize) -> Candidate { - Candidate::PossibleStartOfMatch(at) - } - - fn heap_bytes(&self) -> usize { - 0 - } -} diff --git a/vendor/regex-automata/src/util/prefilter/aho_corasick.rs b/vendor/regex-automata/src/util/prefilter/aho_corasick.rs new file mode 100644 index 000000000..50cce827e --- /dev/null +++ b/vendor/regex-automata/src/util/prefilter/aho_corasick.rs @@ -0,0 +1,149 @@ +use crate::util::{ + prefilter::PrefilterI, + search::{MatchKind, Span}, +}; + +#[derive(Clone, Debug)] +pub(crate) struct AhoCorasick { + #[cfg(not(feature = "perf-literal-multisubstring"))] + _unused: (), + #[cfg(feature = "perf-literal-multisubstring")] + ac: aho_corasick::AhoCorasick, +} + +impl AhoCorasick { + pub(crate) fn new<B: AsRef<[u8]>>( + kind: MatchKind, + needles: &[B], + ) -> Option<AhoCorasick> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + None + } + #[cfg(feature = "perf-literal-multisubstring")] + { + // We used to use `aho_corasick::MatchKind::Standard` here when + // `kind` was `MatchKind::All`, but this is not correct. The + // "standard" Aho-Corasick match semantics are to report a match + // immediately as soon as it is seen, but `All` isn't like that. + // In particular, with "standard" semantics, given the needles + // "abc" and "b" and the haystack "abc," it would report a match + // at offset 1 before a match at offset 0. This is never what we + // want in the context of the regex engine, regardless of whether + // we have leftmost-first or 'all' semantics. Namely, we always + // want the leftmost match. + let ac_match_kind = match kind { + MatchKind::LeftmostFirst | MatchKind::All => { + aho_corasick::MatchKind::LeftmostFirst + } + }; + // This is kind of just an arbitrary number, but basically, if we + // have a small enough set of literals, then we try to use the VERY + // memory hungry DFA. Otherwise, we whimp out and use an NFA. The + // upshot is that the NFA is quite lean and decently fast. Faster + // than a naive Aho-Corasick NFA anyway. + let ac_kind = if needles.len() <= 500 { + aho_corasick::AhoCorasickKind::DFA + } else { + aho_corasick::AhoCorasickKind::ContiguousNFA + }; + let result = aho_corasick::AhoCorasick::builder() + .kind(Some(ac_kind)) + .match_kind(ac_match_kind) + .start_kind(aho_corasick::StartKind::Both) + // We try to handle all of the prefilter cases in the super + // module, and only use Aho-Corasick for the actual automaton. + // The aho-corasick crate does have some extra prefilters, + // namely, looking for rare bytes to feed to memchr{,2,3} + // instead of just the first byte. If we end up wanting + // those---and they are somewhat tricky to implement---then + // we could port them to this crate. + // + // The main reason for doing things this way is so we have a + // complete and easy to understand picture of which prefilters + // are available and how they work. Otherwise it seems too + // easy to get into a situation where we have a prefilter + // layered on top of prefilter, and that might have unintended + // consequences. 
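The standard versus leftmost-first distinction described above is easy to misread, so here is a hedged illustration that drives the aho-corasick crate directly with the "abc"/"b" needles from the comment; the offsets shown are the ones the two match kinds should report.

```rust
use aho_corasick::{AhoCorasick, MatchKind};

let needles = ["abc", "b"];
let standard = AhoCorasick::builder()
    .match_kind(MatchKind::Standard)
    .build(needles)
    .unwrap();
let leftmost = AhoCorasick::builder()
    .match_kind(MatchKind::LeftmostFirst)
    .build(needles)
    .unwrap();
// Standard semantics report the first match seen during the scan: "b" at 1..2.
let m = standard.find("abc").unwrap();
assert_eq!((1, 2), (m.start(), m.end()));
// Leftmost-first reports the leftmost starting match: "abc" at 0..3.
let m = leftmost.find("abc").unwrap();
assert_eq!((0, 3), (m.start(), m.end()));
```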
+ .prefilter(false) + .build(needles); + let ac = match result { + Ok(ac) => ac, + Err(_err) => { + debug!("aho-corasick prefilter failed to build: {}", _err); + return None; + } + }; + Some(AhoCorasick { ac }) + } + } +} + +impl PrefilterI for AhoCorasick { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + let input = + aho_corasick::Input::new(haystack).span(span.start..span.end); + self.ac + .find(input) + .map(|m| Span { start: m.start(), end: m.end() }) + } + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + let input = aho_corasick::Input::new(haystack) + .anchored(aho_corasick::Anchored::Yes) + .span(span.start..span.end); + self.ac + .find(input) + .map(|m| Span { start: m.start(), end: m.end() }) + } + } + + fn memory_usage(&self) -> usize { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + self.ac.memory_usage() + } + } + + fn is_fast(&self) -> bool { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + // Aho-Corasick is never considered "fast" because it's never + // going to be even close to an order of magnitude faster than the + // regex engine itself (assuming a DFA is used). In fact, it is + // usually slower. The magic of Aho-Corasick is that it can search + // a *large* number of literals with a relatively small amount of + // memory. The regex engines are far more wasteful. + // + // Aho-Corasick may be "fast" when the regex engine corresponds + // to, say, the PikeVM. That happens when the lazy DFA couldn't be + // built or used for some reason. But in these cases, the regex + // itself is likely quite big and we're probably hosed no matter + // what we do. (In this case, the best bet is for the caller to + // increase some of the memory limits on the hybrid cache capacity + // and hope that's enough.) 
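The `find`/`prefix` pair above is just the unanchored/anchored split in the aho-corasick crate's `Input`; the sketch below uses a made-up needle and haystack and builds with `StartKind::Both`, as the constructor above does, so both search modes are supported.

```rust
use aho_corasick::{AhoCorasick, Anchored, Input, StartKind};

let ac = AhoCorasick::builder()
    .start_kind(StartKind::Both)
    .build(["bc"])
    .unwrap();
// Unanchored, as in find(): "bc" may start anywhere in the span.
assert!(ac.find(Input::new("abc")).is_some());
// Anchored, as in prefix(): the match must begin at the span's start.
assert!(ac.find(Input::new("abc").anchored(Anchored::Yes)).is_none());
assert!(ac
    .find(Input::new("abc").span(1..3).anchored(Anchored::Yes))
    .is_some());
```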
+ false + } + } +} diff --git a/vendor/regex-automata/src/util/prefilter/byteset.rs b/vendor/regex-automata/src/util/prefilter/byteset.rs new file mode 100644 index 000000000..a669d6c9d --- /dev/null +++ b/vendor/regex-automata/src/util/prefilter/byteset.rs @@ -0,0 +1,58 @@ +use crate::util::{ + prefilter::PrefilterI, + search::{MatchKind, Span}, +}; + +#[derive(Clone, Debug)] +pub(crate) struct ByteSet([bool; 256]); + +impl ByteSet { + pub(crate) fn new<B: AsRef<[u8]>>( + _kind: MatchKind, + needles: &[B], + ) -> Option<ByteSet> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + None + } + #[cfg(feature = "perf-literal-multisubstring")] + { + let mut set = [false; 256]; + for needle in needles.iter() { + let needle = needle.as_ref(); + if needle.len() != 1 { + return None; + } + set[usize::from(needle[0])] = true; + } + Some(ByteSet(set)) + } + } +} + +impl PrefilterI for ByteSet { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + haystack[span].iter().position(|&b| self.0[usize::from(b)]).map(|i| { + let start = span.start + i; + let end = start + 1; + Span { start, end } + }) + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + let b = *haystack.get(span.start)?; + if self.0[usize::from(b)] { + Some(Span { start: span.start, end: span.start + 1 }) + } else { + None + } + } + + fn memory_usage(&self) -> usize { + 0 + } + + fn is_fast(&self) -> bool { + false + } +} diff --git a/vendor/regex-automata/src/util/prefilter/memchr.rs b/vendor/regex-automata/src/util/prefilter/memchr.rs new file mode 100644 index 000000000..3d44b8372 --- /dev/null +++ b/vendor/regex-automata/src/util/prefilter/memchr.rs @@ -0,0 +1,186 @@ +use crate::util::{ + prefilter::PrefilterI, + search::{MatchKind, Span}, +}; + +#[derive(Clone, Debug)] +pub(crate) struct Memchr(u8); + +impl Memchr { + pub(crate) fn new<B: AsRef<[u8]>>( + _kind: MatchKind, + needles: &[B], + ) -> Option<Memchr> { + #[cfg(not(feature = "perf-literal-substring"))] + { + None + } + #[cfg(feature = "perf-literal-substring")] + { + if needles.len() != 1 { + return None; + } + if needles[0].as_ref().len() != 1 { + return None; + } + Some(Memchr(needles[0].as_ref()[0])) + } + } +} + +impl PrefilterI for Memchr { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-substring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-substring")] + { + memchr::memchr(self.0, &haystack[span]).map(|i| { + let start = span.start + i; + let end = start + 1; + Span { start, end } + }) + } + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + let b = *haystack.get(span.start)?; + if self.0 == b { + Some(Span { start: span.start, end: span.start + 1 }) + } else { + None + } + } + + fn memory_usage(&self) -> usize { + 0 + } + + fn is_fast(&self) -> bool { + true + } +} + +#[derive(Clone, Debug)] +pub(crate) struct Memchr2(u8, u8); + +impl Memchr2 { + pub(crate) fn new<B: AsRef<[u8]>>( + _kind: MatchKind, + needles: &[B], + ) -> Option<Memchr2> { + #[cfg(not(feature = "perf-literal-substring"))] + { + None + } + #[cfg(feature = "perf-literal-substring")] + { + if needles.len() != 2 { + return None; + } + if !needles.iter().all(|n| n.as_ref().len() == 1) { + return None; + } + let b1 = needles[0].as_ref()[0]; + let b2 = needles[1].as_ref()[0]; + Some(Memchr2(b1, b2)) + } + } +} + +impl PrefilterI for Memchr2 { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-substring"))] + { + unreachable!() 
+ } + #[cfg(feature = "perf-literal-substring")] + { + memchr::memchr2(self.0, self.1, &haystack[span]).map(|i| { + let start = span.start + i; + let end = start + 1; + Span { start, end } + }) + } + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + let b = *haystack.get(span.start)?; + if self.0 == b || self.1 == b { + Some(Span { start: span.start, end: span.start + 1 }) + } else { + None + } + } + + fn memory_usage(&self) -> usize { + 0 + } + + fn is_fast(&self) -> bool { + true + } +} + +#[derive(Clone, Debug)] +pub(crate) struct Memchr3(u8, u8, u8); + +impl Memchr3 { + pub(crate) fn new<B: AsRef<[u8]>>( + _kind: MatchKind, + needles: &[B], + ) -> Option<Memchr3> { + #[cfg(not(feature = "perf-literal-substring"))] + { + None + } + #[cfg(feature = "perf-literal-substring")] + { + if needles.len() != 3 { + return None; + } + if !needles.iter().all(|n| n.as_ref().len() == 1) { + return None; + } + let b1 = needles[0].as_ref()[0]; + let b2 = needles[1].as_ref()[0]; + let b3 = needles[2].as_ref()[0]; + Some(Memchr3(b1, b2, b3)) + } + } +} + +impl PrefilterI for Memchr3 { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-substring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-substring")] + { + memchr::memchr3(self.0, self.1, self.2, &haystack[span]).map(|i| { + let start = span.start + i; + let end = start + 1; + Span { start, end } + }) + } + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + let b = *haystack.get(span.start)?; + if self.0 == b || self.1 == b || self.2 == b { + Some(Span { start: span.start, end: span.start + 1 }) + } else { + None + } + } + + fn memory_usage(&self) -> usize { + 0 + } + + fn is_fast(&self) -> bool { + true + } +} diff --git a/vendor/regex-automata/src/util/prefilter/memmem.rs b/vendor/regex-automata/src/util/prefilter/memmem.rs new file mode 100644 index 000000000..deea17bd9 --- /dev/null +++ b/vendor/regex-automata/src/util/prefilter/memmem.rs @@ -0,0 +1,88 @@ +use crate::util::{ + prefilter::PrefilterI, + search::{MatchKind, Span}, +}; + +#[derive(Clone, Debug)] +pub(crate) struct Memmem { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + _unused: (), + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + finder: memchr::memmem::Finder<'static>, +} + +impl Memmem { + pub(crate) fn new<B: AsRef<[u8]>>( + _kind: MatchKind, + needles: &[B], + ) -> Option<Memmem> { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + { + None + } + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + { + if needles.len() != 1 { + return None; + } + let needle = needles[0].as_ref(); + let finder = memchr::memmem::Finder::new(needle).into_owned(); + Some(Memmem { finder }) + } + } +} + +impl PrefilterI for Memmem { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + { + unreachable!() + } + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + { + self.finder.find(&haystack[span]).map(|i| { + let start = span.start + i; + let end = start + self.finder.needle().len(); + Span { start, end } + }) + } + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + { + unreachable!() + } + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + { + let needle = self.finder.needle(); + if 
haystack[span].starts_with(needle) { + Some(Span { end: span.start + needle.len(), ..span }) + } else { + None + } + } + } + + fn memory_usage(&self) -> usize { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + { + unreachable!() + } + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + { + self.finder.needle().len() + } + } + + fn is_fast(&self) -> bool { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + { + unreachable!() + } + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + { + true + } + } +} diff --git a/vendor/regex-automata/src/util/prefilter/mod.rs b/vendor/regex-automata/src/util/prefilter/mod.rs new file mode 100644 index 000000000..51fc92233 --- /dev/null +++ b/vendor/regex-automata/src/util/prefilter/mod.rs @@ -0,0 +1,696 @@ +/*! +Defines a prefilter for accelerating regex searches. + +A prefilter can be created by building a [`Prefilter`] value. + +A prefilter represents one of the most important optimizations available for +accelerating regex searches. The idea of a prefilter is to very quickly find +candidate locations in a haystack where a regex _could_ match. Once a candidate +is found, it is then intended for the regex engine to run at that position to +determine whether the candidate is a match or a false positive. + +In the aforementioned description of the prefilter optimization also lay its +demise. Namely, if a prefilter has a high false positive rate and it produces +lots of candidates, then a prefilter can overall make a regex search slower. +It can run more slowly because more time is spent ping-ponging between the +prefilter search and the regex engine attempting to confirm each candidate as +a match. This ping-ponging has overhead that adds up, and is exacerbated by +a high false positive rate. + +Nevertheless, the optimization is still generally worth performing in most +cases. Particularly given just how much throughput can be improved. (It is not +uncommon for prefilter optimizations to improve throughput by one or two orders +of magnitude.) + +Typically a prefilter is used to find occurrences of literal prefixes from a +regex pattern, but this isn't required. A prefilter can be used to look for +suffixes or even inner literals. + +Note that as of now, prefilters throw away information about which pattern +each literal comes from. In other words, when a prefilter finds a match, +there's no way to know which pattern (or patterns) it came from. Therefore, +in order to confirm a match, you'll have to check all of the patterns by +running the full regex engine. +*/ + +mod aho_corasick; +mod byteset; +mod memchr; +mod memmem; +mod teddy; + +use core::{ + borrow::Borrow, + fmt::Debug, + panic::{RefUnwindSafe, UnwindSafe}, +}; + +#[cfg(feature = "alloc")] +use alloc::sync::Arc; + +#[cfg(feature = "syntax")] +use regex_syntax::hir::{literal, Hir}; + +use crate::util::search::{MatchKind, Span}; + +pub(crate) use crate::util::prefilter::{ + aho_corasick::AhoCorasick, + byteset::ByteSet, + memchr::{Memchr, Memchr2, Memchr3}, + memmem::Memmem, + teddy::Teddy, +}; + +/// A prefilter for accelerating regex searches. +/// +/// If you already have your literals that you want to search with, +/// then the vanilla [`Prefilter::new`] constructor is for you. But +/// if you have an [`Hir`] value from the `regex-syntax` crate, then +/// [`Prefilter::from_hir_prefix`] might be more convenient. 
Namely, it uses +/// the [`regex-syntax::hir::literal`](regex_syntax::hir::literal) module to +/// extract literal prefixes for you, optimize them and then select and build a +/// prefilter matcher. +/// +/// A prefilter must have **zero false negatives**. However, by its very +/// nature, it may produce false positives. That is, a prefilter will never +/// skip over a position in the haystack that corresponds to a match of the +/// original regex pattern, but it *may* produce a match for a position +/// in the haystack that does *not* correspond to a match of the original +/// regex pattern. If you use either the [`Prefilter::from_hir_prefix`] or +/// [`Prefilter::from_hirs_prefix`] constructors, then this guarantee is +/// upheld for you automatically. This guarantee is not preserved if you use +/// [`Prefilter::new`] though, since it is up to the caller to provide correct +/// literal strings with respect to the original regex pattern. +/// +/// # Cloning +/// +/// It is an API guarantee that cloning a prefilter is cheap. That is, cloning +/// it will not duplicate whatever heap memory is used to represent the +/// underlying matcher. +/// +/// # Example +/// +/// This example shows how to attach a `Prefilter` to the +/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) in order to accelerate +/// searches. +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::pikevm::PikeVM, +/// util::prefilter::Prefilter, +/// Match, MatchKind, +/// }; +/// +/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Bruce "]) +/// .expect("a prefilter"); +/// let re = PikeVM::builder() +/// .configure(PikeVM::config().prefilter(Some(pre))) +/// .build(r"Bruce \w+")?; +/// let mut cache = re.create_cache(); +/// assert_eq!( +/// Some(Match::must(0, 6..23)), +/// re.find(&mut cache, "Hello Bruce Springsteen!"), +/// ); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// But note that if you get your prefilter incorrect, it could lead to an +/// incorrect result! +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::pikevm::PikeVM, +/// util::prefilter::Prefilter, +/// Match, MatchKind, +/// }; +/// +/// // This prefilter is wrong! +/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Patti "]) +/// .expect("a prefilter"); +/// let re = PikeVM::builder() +/// .configure(PikeVM::config().prefilter(Some(pre))) +/// .build(r"Bruce \w+")?; +/// let mut cache = re.create_cache(); +/// // We find no match even though the regex does match. +/// assert_eq!( +/// None, +/// re.find(&mut cache, "Hello Bruce Springsteen!"), +/// ); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Prefilter { + #[cfg(not(feature = "alloc"))] + _unused: (), + #[cfg(feature = "alloc")] + pre: Arc<dyn PrefilterI>, + #[cfg(feature = "alloc")] + is_fast: bool, +} + +impl Prefilter { + /// Create a new prefilter from a sequence of needles and a corresponding + /// match semantics. + /// + /// This may return `None` for a variety of reasons, for example, if + /// a suitable prefilter could not be constructed. That might occur + /// if they are unavailable (e.g., the `perf-literal-substring` and + /// `perf-literal-multisubstring` features aren't enabled), or it might + /// occur because of heuristics or other artifacts of how the prefilter + /// works. + /// + /// Note that if you have an [`Hir`] expression, it may be more convenient + /// to use [`Prefilter::from_hir_prefix`]. It will automatically handle the + /// task of extracting prefix literals for you. 
+ /// + /// # Example + /// + /// This example shows how match semantics can impact the matching + /// algorithm used by the prefilter. For this reason, it is important to + /// ensure that the match semantics given here are consistent with the + /// match semantics intended for the regular expression that the literals + /// were extracted from. + /// + /// ``` + /// use regex_automata::{ + /// util::{prefilter::Prefilter, syntax}, + /// MatchKind, Span, + /// }; + /// + /// let hay = "Hello samwise"; + /// + /// // With leftmost-first, we find 'samwise' here because it comes + /// // before 'sam' in the sequence we give it.. + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["samwise", "sam"]) + /// .expect("a prefilter"); + /// assert_eq!( + /// Some(Span::from(6..13)), + /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// // Still with leftmost-first but with the literals reverse, now 'sam' + /// // will match instead! + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["sam", "samwise"]) + /// .expect("a prefilter"); + /// assert_eq!( + /// Some(Span::from(6..9)), + /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new<B: AsRef<[u8]>>( + kind: MatchKind, + needles: &[B], + ) -> Option<Prefilter> { + Choice::new(kind, needles).and_then(Prefilter::from_choice) + } + + /// This turns a prefilter selection into a `Prefilter`. That is, in turns + /// the enum given into a trait object. + fn from_choice(choice: Choice) -> Option<Prefilter> { + #[cfg(not(feature = "alloc"))] + { + None + } + #[cfg(feature = "alloc")] + { + let pre: Arc<dyn PrefilterI> = match choice { + Choice::Memchr(p) => Arc::new(p), + Choice::Memchr2(p) => Arc::new(p), + Choice::Memchr3(p) => Arc::new(p), + Choice::Memmem(p) => Arc::new(p), + Choice::Teddy(p) => Arc::new(p), + Choice::ByteSet(p) => Arc::new(p), + Choice::AhoCorasick(p) => Arc::new(p), + }; + let is_fast = pre.is_fast(); + Some(Prefilter { pre, is_fast }) + } + } + + /// This attempts to extract prefixes from the given `Hir` expression for + /// the given match semantics, and if possible, builds a prefilter for + /// them. + /// + /// # Example + /// + /// This example shows how to build a prefilter directly from an [`Hir`] + /// expression, and use to find an occurrence of a prefix from the regex + /// pattern. + /// + /// ``` + /// use regex_automata::{ + /// util::{prefilter::Prefilter, syntax}, + /// MatchKind, Span, + /// }; + /// + /// let hir = syntax::parse(r"(Bruce|Patti) \w+")?; + /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) + /// .expect("a prefilter"); + /// let hay = "Hello Patti Scialfa!"; + /// assert_eq!( + /// Some(Span::from(6..12)), + /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn from_hir_prefix(kind: MatchKind, hir: &Hir) -> Option<Prefilter> { + Prefilter::from_hirs_prefix(kind, &[hir]) + } + + /// This attempts to extract prefixes from the given `Hir` expressions for + /// the given match semantics, and if possible, builds a prefilter for + /// them. + /// + /// Note that as of now, prefilters throw away information about which + /// pattern each literal comes from. In other words, when a prefilter finds + /// a match, there's no way to know which pattern (or patterns) it came + /// from. 
Therefore, in order to confirm a match, you'll have to check all + /// of the patterns by running the full regex engine. + /// + /// # Example + /// + /// This example shows how to build a prefilter directly from multiple + /// `Hir` expressions expression, and use it to find an occurrence of a + /// prefix from the regex patterns. + /// + /// ``` + /// use regex_automata::{ + /// util::{prefilter::Prefilter, syntax}, + /// MatchKind, Span, + /// }; + /// + /// let hirs = syntax::parse_many(&[ + /// r"(Bruce|Patti) \w+", + /// r"Mrs?\. Doubtfire", + /// ])?; + /// let pre = Prefilter::from_hirs_prefix(MatchKind::LeftmostFirst, &hirs) + /// .expect("a prefilter"); + /// let hay = "Hello Mrs. Doubtfire"; + /// assert_eq!( + /// Some(Span::from(6..20)), + /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn from_hirs_prefix<H: Borrow<Hir>>( + kind: MatchKind, + hirs: &[H], + ) -> Option<Prefilter> { + prefixes(kind, hirs) + .literals() + .and_then(|lits| Prefilter::new(kind, lits)) + } + + /// Run this prefilter on `haystack[span.start..end]` and return a matching + /// span if one exists. + /// + /// The span returned is guaranteed to have a start position greater than + /// or equal to the one given, and an end position less than or equal to + /// the one given. + /// + /// # Example + /// + /// This example shows how to build a prefilter directly from an [`Hir`] + /// expression, and use it to find an occurrence of a prefix from the regex + /// pattern. + /// + /// ``` + /// use regex_automata::{ + /// util::{prefilter::Prefilter, syntax}, + /// MatchKind, Span, + /// }; + /// + /// let hir = syntax::parse(r"Bruce \w+")?; + /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) + /// .expect("a prefilter"); + /// let hay = "Hello Bruce Springsteen!"; + /// assert_eq!( + /// Some(Span::from(6..12)), + /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "alloc"))] + { + unreachable!() + } + #[cfg(feature = "alloc")] + { + self.pre.find(haystack, span) + } + } + + /// Returns the span of a prefix of `haystack[span.start..span.end]` if + /// the prefilter matches. + /// + /// The span returned is guaranteed to have a start position equivalent to + /// the one given, and an end position less than or equal to the one given. + /// + /// # Example + /// + /// This example shows how to build a prefilter directly from an [`Hir`] + /// expression, and use it to find an occurrence of a prefix from the regex + /// pattern that begins at the start of a haystack only. + /// + /// ``` + /// use regex_automata::{ + /// util::{prefilter::Prefilter, syntax}, + /// MatchKind, Span, + /// }; + /// + /// let hir = syntax::parse(r"Bruce \w+")?; + /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) + /// .expect("a prefilter"); + /// let hay = "Hello Bruce Springsteen!"; + /// // Nothing is found here because 'Bruce' does + /// // not occur at the beginning of our search. + /// assert_eq!( + /// None, + /// pre.prefix(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// // But if we change where we start the search + /// // to begin where 'Bruce ' begins, then a + /// // match will be found. 
+ /// assert_eq!( + /// Some(Span::from(6..12)), + /// pre.prefix(hay.as_bytes(), Span::from(6..hay.len())), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "alloc"))] + { + unreachable!() + } + #[cfg(feature = "alloc")] + { + self.pre.prefix(haystack, span) + } + } + + /// Returns the heap memory, in bytes, used by the underlying prefilter. + #[inline] + pub fn memory_usage(&self) -> usize { + #[cfg(not(feature = "alloc"))] + { + unreachable!() + } + #[cfg(feature = "alloc")] + { + self.pre.memory_usage() + } + } + + /// Implementations might return true here if they believe themselves to + /// be "fast." The concept of "fast" is deliberately left vague, but in + /// practice this usually corresponds to whether it's believed that SIMD + /// will be used. + /// + /// Why do we care about this? Well, some prefilter tricks tend to come + /// with their own bits of overhead, and so might only make sense if we + /// know that a scan will be *much* faster than the regex engine itself. + /// Otherwise, the trick may not be worth doing. Whether something is + /// "much" faster than the regex engine generally boils down to whether + /// SIMD is used. (But not always. Even a SIMD matcher with a high false + /// positive rate can become quite slow.) + /// + /// Even if this returns true, it is still possible for the prefilter to + /// be "slow." Remember, prefilters are just heuristics. We can't really + /// *know* a prefilter will be fast without actually trying the prefilter. + /// (Which of course we cannot afford to do.) + #[inline] + pub(crate) fn is_fast(&self) -> bool { + #[cfg(not(feature = "alloc"))] + { + unreachable!() + } + #[cfg(feature = "alloc")] + { + self.is_fast + } + } +} + +/// A trait for abstracting over prefilters. Basically, a prefilter is +/// something that do an unanchored *and* an anchored search in a haystack +/// within a given span. +/// +/// This exists pretty much only so that we can use prefilters as a trait +/// object (which is what `Prefilter` is). If we ever move off of trait objects +/// and to an enum, then it's likely this trait could be removed. +pub(crate) trait PrefilterI: + Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static +{ + /// Run this prefilter on `haystack[span.start..end]` and return a matching + /// span if one exists. + /// + /// The span returned is guaranteed to have a start position greater than + /// or equal to the one given, and an end position less than or equal to + /// the one given. + fn find(&self, haystack: &[u8], span: Span) -> Option<Span>; + + /// Returns the span of a prefix of `haystack[span.start..span.end]` if + /// the prefilter matches. + /// + /// The span returned is guaranteed to have a start position equivalent to + /// the one given, and an end position less than or equal to the one given. + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span>; + + /// Returns the heap memory, in bytes, used by the underlying prefilter. + fn memory_usage(&self) -> usize; + + /// Implementations might return true here if they believe themselves to + /// be "fast." See [`Prefilter::is_fast`] for more details. 
+ fn is_fast(&self) -> bool; +} + +#[cfg(feature = "alloc")] +impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> { + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + (&**self).find(haystack, span) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + (&**self).prefix(haystack, span) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn memory_usage(&self) -> usize { + (&**self).memory_usage() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_fast(&self) -> bool { + (&**self).is_fast() + } +} + +/// A type that encapsulates the selection of a prefilter algorithm from a +/// sequence of needles. +/// +/// The existence of this type is a little tricky, because we don't (currently) +/// use it for performing a search. Instead, we really only consume it by +/// converting the underlying prefilter into a trait object, whether that be +/// `dyn PrefilterI` or `dyn Strategy` (for the meta regex engine). In order +/// to avoid re-copying the prefilter selection logic, we isolate it here, and +/// then force anything downstream that wants to convert it to a trait object +/// to do trivial case analysis on it. +/// +/// One wonders whether we *should* use an enum instead of a trait object. +/// At time of writing, I chose trait objects based on instinct because 1) I +/// knew I wasn't going to inline anything and 2) there would potentially be +/// many different choices. However, as of time of writing, I haven't actually +/// compared the trait object approach to the enum approach. That probably +/// should be litigated, but I ran out of steam. +/// +/// Note that if the `alloc` feature is disabled, then values of this type +/// are (and should) never be constructed. Also, in practice, for any of the +/// prefilters to be selected, you'll need at least one of the `perf-literal-*` +/// features enabled. +#[derive(Clone, Debug)] +pub(crate) enum Choice { + Memchr(Memchr), + Memchr2(Memchr2), + Memchr3(Memchr3), + Memmem(Memmem), + Teddy(Teddy), + ByteSet(ByteSet), + AhoCorasick(AhoCorasick), +} + +impl Choice { + /// Select what is believed to be the best prefilter algorithm for the + /// match semantics and sequence of needles given. + /// + /// This selection algorithm uses the needles as given without any + /// modification. For example, if `[bar]` is given, then this doesn't + /// try to select `memchr` for `b`. Instead, it would select `memmem` + /// for `bar`. If callers would want `memchr` selected for `[bar]`, then + /// callers should massages the literals themselves. That is, callers are + /// responsible for heuristics surrounding which sequence of literals is + /// best. + /// + /// What this selection algorithm does is attempt to use the fastest + /// prefilter that works for the literals given. So if `[a, b]`, is given, + /// then `memchr2` is selected. + /// + /// Of course, which prefilter is selected is also subject to what + /// is available. For example, if `alloc` isn't enabled, then + /// that limits which prefilters can be selected. Similarly, if + /// `perf-literal-substring` isn't enabled, then nothing from the `memchr` + /// crate can be returned. + pub(crate) fn new<B: AsRef<[u8]>>( + kind: MatchKind, + needles: &[B], + ) -> Option<Choice> { + // An empty set means the regex matches nothing, so no sense in + // building a prefilter. 
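As a quick illustration of the two degenerate cases rejected at the start of this routine, the public `Prefilter::new` shown earlier in this diff simply reports failure by returning `None` for them:

```rust
use regex_automata::{util::prefilter::Prefilter, MatchKind};

// An empty needle set can't match anything, and an empty-string needle would
// "match" at every position, so neither yields a prefilter.
assert!(Prefilter::new::<&str>(MatchKind::LeftmostFirst, &[]).is_none());
assert!(Prefilter::new(MatchKind::LeftmostFirst, &["foo", ""]).is_none());
```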
+ if needles.len() == 0 { + debug!("prefilter building failed: found empty set of literals"); + return None; + } + // If the regex can match the empty string, then the prefilter + // will by definition match at every position. This is obviously + // completely ineffective. + if needles.iter().any(|n| n.as_ref().is_empty()) { + debug!("prefilter building failed: literals match empty string"); + return None; + } + // BREADCRUMBS: Perhaps the literal optimizer should special case + // sequences of length two or three if the leading bytes of each are + // "rare"? Or perhaps, if there are two or three total possible leading + // bytes, regardless of the number of literals, and all are rare... + // Then well, perhaps we should use memchr2 or memchr3 in those cases? + if let Some(pre) = Memchr::new(kind, needles) { + debug!("prefilter built: memchr"); + return Some(Choice::Memchr(pre)); + } + if let Some(pre) = Memchr2::new(kind, needles) { + debug!("prefilter built: memchr2"); + return Some(Choice::Memchr2(pre)); + } + if let Some(pre) = Memchr3::new(kind, needles) { + debug!("prefilter built: memchr3"); + return Some(Choice::Memchr3(pre)); + } + if let Some(pre) = Memmem::new(kind, needles) { + debug!("prefilter built: memmem"); + return Some(Choice::Memmem(pre)); + } + if let Some(pre) = Teddy::new(kind, needles) { + debug!("prefilter built: teddy"); + return Some(Choice::Teddy(pre)); + } + if let Some(pre) = ByteSet::new(kind, needles) { + debug!("prefilter built: byteset"); + return Some(Choice::ByteSet(pre)); + } + if let Some(pre) = AhoCorasick::new(kind, needles) { + debug!("prefilter built: aho-corasick"); + return Some(Choice::AhoCorasick(pre)); + } + debug!("prefilter building failed: no strategy could be found"); + None + } +} + +/// Extracts all of the prefix literals from the given HIR expressions into a +/// single `Seq`. The literals in the sequence are ordered with respect to the +/// order of the given HIR expressions and consistent with the match semantics +/// given. +/// +/// The sequence returned is "optimized." That is, they may be shrunk or even +/// truncated according to heuristics with the intent of making them more +/// useful as a prefilter. (Which translates to both using faster algorithms +/// and minimizing the false positive rate.) +/// +/// Note that this erases any connection between the literals and which pattern +/// (or patterns) they came from. +/// +/// The match kind given must correspond to the match semantics of the regex +/// that is represented by the HIRs given. The match semantics may change the +/// literal sequence returned. +#[cfg(feature = "syntax")] +pub(crate) fn prefixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq +where + H: core::borrow::Borrow<Hir>, +{ + let mut extractor = literal::Extractor::new(); + extractor.kind(literal::ExtractKind::Prefix); + + let mut prefixes = literal::Seq::empty(); + for hir in hirs { + prefixes.union(&mut extractor.extract(hir.borrow())); + } + debug!( + "prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}", + prefixes.len(), + prefixes.is_exact(), + prefixes + ); + match kind { + MatchKind::All => { + prefixes.sort(); + prefixes.dedup(); + } + MatchKind::LeftmostFirst => { + prefixes.optimize_for_prefix_by_preference(); + } + } + debug!( + "prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}", + prefixes.len(), + prefixes.is_exact(), + prefixes + ); + prefixes +} + +/// Like `prefixes`, but for all suffixes of all matches for the given HIRs. 
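For readers who want to see the extraction pipeline outside of this helper, here is a hedged sketch driving regex-syntax's literal extractor directly, roughly what `prefixes` does for a single pattern under leftmost-first semantics; the pattern is only an example.

```rust
use regex_syntax::{hir::literal, Parser};

let hir = Parser::new().parse(r"(Bruce|Patti) \w+").unwrap();
let mut extractor = literal::Extractor::new();
extractor.kind(literal::ExtractKind::Prefix);
let mut seq = extractor.extract(&hir);
// Leftmost-first shrinks/reorders heuristically; MatchKind::All would instead
// sort and dedup the sequence.
seq.optimize_for_prefix_by_preference();
// One would expect "Bruce " and "Patti " to survive as the literal prefixes.
println!("{:?}", seq.literals());
```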
+#[cfg(feature = "syntax")] +pub(crate) fn suffixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq +where + H: core::borrow::Borrow<Hir>, +{ + let mut extractor = literal::Extractor::new(); + extractor.kind(literal::ExtractKind::Suffix); + + let mut suffixes = literal::Seq::empty(); + for hir in hirs { + suffixes.union(&mut extractor.extract(hir.borrow())); + } + debug!( + "suffixes (len={:?}, exact={:?}) extracted before optimization: {:?}", + suffixes.len(), + suffixes.is_exact(), + suffixes + ); + match kind { + MatchKind::All => { + suffixes.sort(); + suffixes.dedup(); + } + MatchKind::LeftmostFirst => { + suffixes.optimize_for_suffix_by_preference(); + } + } + debug!( + "suffixes (len={:?}, exact={:?}) extracted after optimization: {:?}", + suffixes.len(), + suffixes.is_exact(), + suffixes + ); + suffixes +} diff --git a/vendor/regex-automata/src/util/prefilter/teddy.rs b/vendor/regex-automata/src/util/prefilter/teddy.rs new file mode 100644 index 000000000..fc79f2b2f --- /dev/null +++ b/vendor/regex-automata/src/util/prefilter/teddy.rs @@ -0,0 +1,160 @@ +use crate::util::{ + prefilter::PrefilterI, + search::{MatchKind, Span}, +}; + +#[derive(Clone, Debug)] +pub(crate) struct Teddy { + #[cfg(not(feature = "perf-literal-multisubstring"))] + _unused: (), + /// The actual Teddy searcher. + /// + /// Technically, it's possible that Teddy doesn't actually get used, since + /// Teddy does require its haystack to at least be of a certain size + /// (usually around the size of whatever vector is being used, so ~16 + /// or ~32 bytes). For haystacks shorter than that, the implementation + /// currently uses Rabin-Karp. + #[cfg(feature = "perf-literal-multisubstring")] + searcher: aho_corasick::packed::Searcher, + /// When running an anchored search, the packed searcher can't handle it so + /// we defer to Aho-Corasick itself. Kind of sad, but changing the packed + /// searchers to support anchored search would be difficult at worst and + /// annoying at best. Since packed searchers only apply to small numbers of + /// literals, we content ourselves that this is not much of an added cost. + /// (That packed searchers only work with a small number of literals is + /// also why we use a DFA here. Otherwise, the memory usage of a DFA would + /// likely be unacceptable.) + #[cfg(feature = "perf-literal-multisubstring")] + anchored_ac: aho_corasick::dfa::DFA, + /// The length of the smallest literal we look for. + /// + /// We use this as a heuristic to figure out whether this will be "fast" or + /// not. Generally, the longer the better, because longer needles are more + /// discriminating and thus reduce false positive rate. + #[cfg(feature = "perf-literal-multisubstring")] + minimum_len: usize, +} + +impl Teddy { + pub(crate) fn new<B: AsRef<[u8]>>( + kind: MatchKind, + needles: &[B], + ) -> Option<Teddy> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + None + } + #[cfg(feature = "perf-literal-multisubstring")] + { + // We only really support leftmost-first semantics. In + // theory we could at least support leftmost-longest, as the + // aho-corasick crate does, but regex-automata doesn't know about + // leftmost-longest currently. + // + // And like the aho-corasick prefilter, if we're using `All` + // semantics, then we can still use leftmost semantics for a + // prefilter. (This might be a suspicious choice for the literal + // engine, which uses a prefilter as a regex engine directly, but + // that only happens when using leftmost-first semantics.) 
+ let (packed_match_kind, ac_match_kind) = match kind { + MatchKind::LeftmostFirst | MatchKind::All => ( + aho_corasick::packed::MatchKind::LeftmostFirst, + aho_corasick::MatchKind::LeftmostFirst, + ), + }; + let minimum_len = + needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0); + let packed = aho_corasick::packed::Config::new() + .match_kind(packed_match_kind) + .builder() + .extend(needles) + .build()?; + let anchored_ac = aho_corasick::dfa::DFA::builder() + .match_kind(ac_match_kind) + .start_kind(aho_corasick::StartKind::Anchored) + .prefilter(false) + .build(needles) + .ok()?; + Some(Teddy { searcher: packed, anchored_ac, minimum_len }) + } + } +} + +impl PrefilterI for Teddy { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + let ac_span = + aho_corasick::Span { start: span.start, end: span.end }; + self.searcher + .find_in(haystack, ac_span) + .map(|m| Span { start: m.start(), end: m.end() }) + } + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + use aho_corasick::automaton::Automaton; + let input = aho_corasick::Input::new(haystack) + .anchored(aho_corasick::Anchored::Yes) + .span(span.start..span.end); + self.anchored_ac + .try_find(&input) + // OK because we build the DFA with anchored support. + .expect("aho-corasick DFA should never fail") + .map(|m| Span { start: m.start(), end: m.end() }) + } + } + + fn memory_usage(&self) -> usize { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + use aho_corasick::automaton::Automaton; + self.searcher.memory_usage() + self.anchored_ac.memory_usage() + } + } + + fn is_fast(&self) -> bool { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + // Teddy is usually quite fast, but I have seen some cases where + // a large number of literals can overwhelm it and make it not so + // fast. We make an educated but conservative guess at a limit, at + // which point, we're not so comfortable thinking Teddy is "fast." + // + // Well... this used to incorporate a "limit" on the *number* + // of literals, but I have since changed it to a minimum on the + // *smallest* literal. Namely, when there is a very small literal + // (1 or 2 bytes), it is far more likely that it leads to a higher + // false positive rate. (Although, of course, not always. For + // example, 'zq' is likely to have a very low false positive rate.) + // But when we have 3 bytes, we have a really good chance of being + // quite discriminatory and thus fast. + // + // We may still want to add some kind of limit on the number of + // literals here, but keep in mind that Teddy already has its own + // somewhat small limit (64 at time of writing). The main issue + // here is that if 'is_fast' is false, it opens the door for the + // reverse inner optimization to kick in. We really only want to + // resort to the reverse inner optimization if we absolutely must. 
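The packed searcher configured above can also be driven on its own; the following sketch uses made-up needles, mirrors the same `Config`/`builder`/`extend`/`build` chain, and is only meant to show what the Teddy-backed searcher reports.

```rust
use aho_corasick::packed::{Config, MatchKind};

// `build` returns None when the needles aren't suitable for a packed
// searcher (for example, when there are too many of them).
let searcher = Config::new()
    .match_kind(MatchKind::LeftmostFirst)
    .builder()
    .extend(["Sherlock", "Moriarty"])
    .build()
    .unwrap();
let m = searcher.find("by Sherlock Holmes").unwrap();
assert_eq!((3, 11), (m.start(), m.end()));
```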
+ self.minimum_len >= 3 + } + } +} diff --git a/vendor/regex-automata/src/util/primitives.rs b/vendor/regex-automata/src/util/primitives.rs new file mode 100644 index 000000000..5c5d187b0 --- /dev/null +++ b/vendor/regex-automata/src/util/primitives.rs @@ -0,0 +1,776 @@ +/*! +Lower level primitive types that are useful in a variety of circumstances. + +# Overview + +This list represents the principle types in this module and briefly describes +when you might want to use them. + +* [`PatternID`] - A type that represents the identifier of a regex pattern. +This is probably the most widely used type in this module (which is why it's +also re-exported in the crate root). +* [`StateID`] - A type the represents the identifier of a finite automaton +state. This is used for both NFAs and DFAs, with the notable exception of +the hybrid NFA/DFA. (The hybrid NFA/DFA uses a special purpose "lazy" state +identifier.) +* [`SmallIndex`] - The internal representation of both a `PatternID` and a +`StateID`. Its purpose is to serve as a type that can index memory without +being as big as a `usize` on 64-bit targets. The main idea behind this type +is that there are many things in regex engines that will, in practice, never +overflow a 32-bit integer. (For example, like the number of patterns in a regex +or the number of states in an NFA.) Thus, a `SmallIndex` can be used to index +memory without peppering `as` casts everywhere. Moreover, it forces callers +to handle errors in the case where, somehow, the value would otherwise overflow +either a 32-bit integer or a `usize` (e.g., on 16-bit targets). +* [`NonMaxUsize`] - Represents a `usize` that cannot be `usize::MAX`. As a +result, `Option<NonMaxUsize>` has the same size in memory as a `usize`. This +useful, for example, when representing the offsets of submatches since it +reduces memory usage by a factor of 2. It is a legal optimization since Rust +guarantees that slices never have a length that exceeds `isize::MAX`. +*/ + +use core::num::NonZeroUsize; + +#[cfg(feature = "alloc")] +use alloc::vec::Vec; + +use crate::util::int::{Usize, U16, U32, U64}; + +/// A `usize` that can never be `usize::MAX`. +/// +/// This is similar to `core::num::NonZeroUsize`, but instead of not permitting +/// a zero value, this does not permit a max value. +/// +/// This is useful in certain contexts where one wants to optimize the memory +/// usage of things that contain match offsets. Namely, since Rust slices +/// are guaranteed to never have a length exceeding `isize::MAX`, we can use +/// `usize::MAX` as a sentinel to indicate that no match was found. Indeed, +/// types like `Option<NonMaxUsize>` have exactly the same size in memory as a +/// `usize`. +/// +/// This type is defined to be `repr(transparent)` for +/// `core::num::NonZeroUsize`, which is in turn defined to be +/// `repr(transparent)` for `usize`. +#[derive(Clone, Copy, Eq, Hash, PartialEq, PartialOrd, Ord)] +#[repr(transparent)] +pub struct NonMaxUsize(NonZeroUsize); + +impl NonMaxUsize { + /// Create a new `NonMaxUsize` from the given value. + /// + /// This returns `None` only when the given value is equal to `usize::MAX`. + #[inline] + pub fn new(value: usize) -> Option<NonMaxUsize> { + NonZeroUsize::new(value.wrapping_add(1)).map(NonMaxUsize) + } + + /// Return the underlying `usize` value. The returned value is guaranteed + /// to not equal `usize::MAX`. 
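A short demonstration of the size claim and the single excluded value, assuming `NonMaxUsize` is reachable at `util::primitives` as this module's docs indicate:

```rust
use core::mem::size_of;
use regex_automata::util::primitives::NonMaxUsize;

// Option<NonMaxUsize> costs no more than a plain usize, per the docs above.
assert_eq!(size_of::<usize>(), size_of::<Option<NonMaxUsize>>());
// Every value except usize::MAX is representable.
assert_eq!(Some(5), NonMaxUsize::new(5).map(|n| n.get()));
assert!(NonMaxUsize::new(usize::MAX).is_none());
```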
+ #[inline] + pub fn get(self) -> usize { + self.0.get().wrapping_sub(1) + } +} + +// We provide our own Debug impl because seeing the internal repr can be quite +// surprising if you aren't expecting it. e.g., 'NonMaxUsize(5)' vs just '5'. +impl core::fmt::Debug for NonMaxUsize { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{:?}", self.get()) + } +} + +/// A type that represents a "small" index. +/// +/// The main idea of this type is to provide something that can index memory, +/// but uses less memory than `usize` on 64-bit systems. Specifically, its +/// representation is always a `u32` and has `repr(transparent)` enabled. (So +/// it is safe to transmute between a `u32` and a `SmallIndex`.) +/// +/// A small index is typically useful in cases where there is no practical way +/// that the index will overflow a 32-bit integer. A good example of this is +/// an NFA state. If you could somehow build an NFA with `2^30` states, its +/// memory usage would be exorbitant and its runtime execution would be so +/// slow as to be completely worthless. Therefore, this crate generally deems +/// it acceptable to return an error if it would otherwise build an NFA that +/// requires a slice longer than what a 32-bit integer can index. In exchange, +/// we can use 32-bit indices instead of 64-bit indices in various places. +/// +/// This type ensures this by providing a constructor that will return an error +/// if its argument cannot fit into the type. This makes it much easier to +/// handle these sorts of boundary cases that are otherwise extremely subtle. +/// +/// On all targets, this type guarantees that its value will fit in a `u32`, +/// `i32`, `usize` and an `isize`. This means that on 16-bit targets, for +/// example, this type's maximum value will never overflow an `isize`, +/// which means it will never overflow a `i16` even though its internal +/// representation is still a `u32`. +/// +/// The purpose for making the type fit into even signed integer types like +/// `isize` is to guarantee that the difference between any two small indices +/// is itself also a small index. This is useful in certain contexts, e.g., +/// for delta encoding. +/// +/// # Other types +/// +/// The following types wrap `SmallIndex` to provide a more focused use case: +/// +/// * [`PatternID`] is for representing the identifiers of patterns. +/// * [`StateID`] is for representing the identifiers of states in finite +/// automata. It is used for both NFAs and DFAs. +/// +/// # Representation +/// +/// This type is always represented internally by a `u32` and is marked as +/// `repr(transparent)`. Thus, this type always has the same representation as +/// a `u32`. It is thus safe to transmute between a `u32` and a `SmallIndex`. +/// +/// # Indexing +/// +/// For convenience, callers may use a `SmallIndex` to index slices. +/// +/// # Safety +/// +/// While a `SmallIndex` is meant to guarantee that its value fits into `usize` +/// without using as much space as a `usize` on all targets, callers must +/// not rely on this property for safety. Callers may choose to rely on this +/// property for correctness however. For example, creating a `SmallIndex` with +/// an invalid value can be done in entirely safe code. This may in turn result +/// in panics or silent logical errors. +#[derive( + Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, +)] +#[repr(transparent)] +pub struct SmallIndex(u32); + +impl SmallIndex { + /// The maximum index value. 
+ #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] + pub const MAX: SmallIndex = + // FIXME: Use as_usize() once const functions in traits are stable. + SmallIndex::new_unchecked(core::i32::MAX as usize - 1); + + /// The maximum index value. + #[cfg(target_pointer_width = "16")] + pub const MAX: SmallIndex = + SmallIndex::new_unchecked(core::isize::MAX - 1); + + /// The total number of values that can be represented as a small index. + pub const LIMIT: usize = SmallIndex::MAX.as_usize() + 1; + + /// The zero index value. + pub const ZERO: SmallIndex = SmallIndex::new_unchecked(0); + + /// The number of bytes that a single small index uses in memory. + pub const SIZE: usize = core::mem::size_of::<SmallIndex>(); + + /// Create a new small index. + /// + /// If the given index exceeds [`SmallIndex::MAX`], then this returns + /// an error. + #[inline] + pub fn new(index: usize) -> Result<SmallIndex, SmallIndexError> { + SmallIndex::try_from(index) + } + + /// Create a new small index without checking whether the given value + /// exceeds [`SmallIndex::MAX`]. + /// + /// Using this routine with an invalid index value will result in + /// unspecified behavior, but *not* undefined behavior. In particular, an + /// invalid index value is likely to cause panics or possibly even silent + /// logical errors. + /// + /// Callers must never rely on a `SmallIndex` to be within a certain range + /// for memory safety. + #[inline] + pub const fn new_unchecked(index: usize) -> SmallIndex { + // FIXME: Use as_u32() once const functions in traits are stable. + SmallIndex(index as u32) + } + + /// Like [`SmallIndex::new`], but panics if the given index is not valid. + #[inline] + pub fn must(index: usize) -> SmallIndex { + SmallIndex::new(index).expect("invalid small index") + } + + /// Return this small index as a `usize`. This is guaranteed to never + /// overflow `usize`. + #[inline] + pub const fn as_usize(&self) -> usize { + // FIXME: Use as_usize() once const functions in traits are stable. + self.0 as usize + } + + /// Return this small index as a `u64`. This is guaranteed to never + /// overflow. + #[inline] + pub const fn as_u64(&self) -> u64 { + // FIXME: Use u64::from() once const functions in traits are stable. + self.0 as u64 + } + + /// Return the internal `u32` of this small index. This is guaranteed to + /// never overflow `u32`. + #[inline] + pub const fn as_u32(&self) -> u32 { + self.0 + } + + /// Return the internal `u32` of this small index represented as an `i32`. + /// This is guaranteed to never overflow an `i32`. + #[inline] + pub const fn as_i32(&self) -> i32 { + // This is OK because we guarantee that our max value is <= i32::MAX. + self.0 as i32 + } + + /// Returns one more than this small index as a usize. + /// + /// Since a small index has constraints on its maximum value, adding `1` to + /// it will always fit in a `usize`, `u32` and a `i32`. + #[inline] + pub fn one_more(&self) -> usize { + self.as_usize() + 1 + } + + /// Decode this small index from the bytes given using the native endian + /// byte order for the current target. + /// + /// If the decoded integer is not representable as a small index for the + /// current target, then this returns an error. 
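The native-endian round trip just described can be sketched as follows, again assuming `SmallIndex` remains reachable through the public `util::primitives` module:

```rust
use regex_automata::util::primitives::SmallIndex;

// A small index can be serialized to and from its raw `u32` bytes in
// native-endian order. Decoding validates the value against the maximum.
let idx = SmallIndex::must(1234);
let bytes = idx.to_ne_bytes();
assert_eq!(Ok(idx), SmallIndex::from_ne_bytes(bytes));

// A value beyond the maximum is rejected rather than silently accepted.
let too_big = u32::MAX.to_ne_bytes();
assert!(SmallIndex::from_ne_bytes(too_big).is_err());
```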
+ #[inline] + pub fn from_ne_bytes( + bytes: [u8; 4], + ) -> Result<SmallIndex, SmallIndexError> { + let id = u32::from_ne_bytes(bytes); + if id > SmallIndex::MAX.as_u32() { + return Err(SmallIndexError { attempted: u64::from(id) }); + } + Ok(SmallIndex::new_unchecked(id.as_usize())) + } + + /// Decode this small index from the bytes given using the native endian + /// byte order for the current target. + /// + /// This is analogous to [`SmallIndex::new_unchecked`] in that is does not + /// check whether the decoded integer is representable as a small index. + #[inline] + pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> SmallIndex { + SmallIndex::new_unchecked(u32::from_ne_bytes(bytes).as_usize()) + } + + /// Return the underlying small index integer as raw bytes in native endian + /// format. + #[inline] + pub fn to_ne_bytes(&self) -> [u8; 4] { + self.0.to_ne_bytes() + } +} + +impl<T> core::ops::Index<SmallIndex> for [T] { + type Output = T; + + #[inline] + fn index(&self, index: SmallIndex) -> &T { + &self[index.as_usize()] + } +} + +impl<T> core::ops::IndexMut<SmallIndex> for [T] { + #[inline] + fn index_mut(&mut self, index: SmallIndex) -> &mut T { + &mut self[index.as_usize()] + } +} + +#[cfg(feature = "alloc")] +impl<T> core::ops::Index<SmallIndex> for Vec<T> { + type Output = T; + + #[inline] + fn index(&self, index: SmallIndex) -> &T { + &self[index.as_usize()] + } +} + +#[cfg(feature = "alloc")] +impl<T> core::ops::IndexMut<SmallIndex> for Vec<T> { + #[inline] + fn index_mut(&mut self, index: SmallIndex) -> &mut T { + &mut self[index.as_usize()] + } +} + +impl From<u8> for SmallIndex { + fn from(index: u8) -> SmallIndex { + SmallIndex::new_unchecked(usize::from(index)) + } +} + +impl TryFrom<u16> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: u16) -> Result<SmallIndex, SmallIndexError> { + if u32::from(index) > SmallIndex::MAX.as_u32() { + return Err(SmallIndexError { attempted: u64::from(index) }); + } + Ok(SmallIndex::new_unchecked(index.as_usize())) + } +} + +impl TryFrom<u32> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: u32) -> Result<SmallIndex, SmallIndexError> { + if index > SmallIndex::MAX.as_u32() { + return Err(SmallIndexError { attempted: u64::from(index) }); + } + Ok(SmallIndex::new_unchecked(index.as_usize())) + } +} + +impl TryFrom<u64> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: u64) -> Result<SmallIndex, SmallIndexError> { + if index > SmallIndex::MAX.as_u64() { + return Err(SmallIndexError { attempted: index }); + } + Ok(SmallIndex::new_unchecked(index.as_usize())) + } +} + +impl TryFrom<usize> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: usize) -> Result<SmallIndex, SmallIndexError> { + if index > SmallIndex::MAX.as_usize() { + return Err(SmallIndexError { attempted: index.as_u64() }); + } + Ok(SmallIndex::new_unchecked(index)) + } +} + +#[cfg(test)] +impl quickcheck::Arbitrary for SmallIndex { + fn arbitrary(gen: &mut quickcheck::Gen) -> SmallIndex { + use core::cmp::max; + + let id = max(i32::MIN + 1, i32::arbitrary(gen)).abs(); + if id > SmallIndex::MAX.as_i32() { + SmallIndex::MAX + } else { + SmallIndex::new(usize::try_from(id).unwrap()).unwrap() + } + } +} + +/// This error occurs when a small index could not be constructed. +/// +/// This occurs when given an integer exceeding the maximum small index value. +/// +/// When the `std` feature is enabled, this implements the `Error` trait. 
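A sketch of how the fallible conversions and the `Index` impls above are meant to combine: validate once up front, then index without `as` casts:

```rust
use regex_automata::util::primitives::SmallIndex;

// Construction checks the value against `SmallIndex::MAX` up front...
let idx = SmallIndex::new(3).unwrap();
assert_eq!(3, idx.as_usize());
assert!(SmallIndex::try_from(u64::MAX).is_err());

// ...so the index can then be used directly on slices.
let table: &[u8] = &[7, 11, 13, 17, 19];
assert_eq!(17, table[idx]);
```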
+#[derive(Clone, Debug, Eq, PartialEq)] +pub struct SmallIndexError { + attempted: u64, +} + +impl SmallIndexError { + /// Returns the value that could not be converted to a small index. + pub fn attempted(&self) -> u64 { + self.attempted + } +} + +#[cfg(feature = "std")] +impl std::error::Error for SmallIndexError {} + +impl core::fmt::Display for SmallIndexError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!( + f, + "failed to create small index from {:?}, which exceeds {:?}", + self.attempted(), + SmallIndex::MAX, + ) + } +} + +#[derive(Clone, Debug)] +pub(crate) struct SmallIndexIter { + rng: core::ops::Range<usize>, +} + +impl Iterator for SmallIndexIter { + type Item = SmallIndex; + + fn next(&mut self) -> Option<SmallIndex> { + if self.rng.start >= self.rng.end { + return None; + } + let next_id = self.rng.start + 1; + let id = core::mem::replace(&mut self.rng.start, next_id); + // new_unchecked is OK since we asserted that the number of + // elements in this iterator will fit in an ID at construction. + Some(SmallIndex::new_unchecked(id)) + } +} + +macro_rules! index_type_impls { + ($name:ident, $err:ident, $iter:ident, $withiter:ident) => { + impl $name { + /// The maximum value. + pub const MAX: $name = $name(SmallIndex::MAX); + + /// The total number of values that can be represented. + pub const LIMIT: usize = SmallIndex::LIMIT; + + /// The zero value. + pub const ZERO: $name = $name(SmallIndex::ZERO); + + /// The number of bytes that a single value uses in memory. + pub const SIZE: usize = SmallIndex::SIZE; + + /// Create a new value that is represented by a "small index." + /// + /// If the given index exceeds the maximum allowed value, then this + /// returns an error. + #[inline] + pub fn new(value: usize) -> Result<$name, $err> { + SmallIndex::new(value).map($name).map_err($err) + } + + /// Create a new value without checking whether the given argument + /// exceeds the maximum. + /// + /// Using this routine with an invalid value will result in + /// unspecified behavior, but *not* undefined behavior. In + /// particular, an invalid ID value is likely to cause panics or + /// possibly even silent logical errors. + /// + /// Callers must never rely on this type to be within a certain + /// range for memory safety. + #[inline] + pub const fn new_unchecked(value: usize) -> $name { + $name(SmallIndex::new_unchecked(value)) + } + + /// Like `new`, but panics if the given value is not valid. + #[inline] + pub fn must(value: usize) -> $name { + $name::new(value).expect(concat!( + "invalid ", + stringify!($name), + " value" + )) + } + + /// Return the internal value as a `usize`. This is guaranteed to + /// never overflow `usize`. + #[inline] + pub const fn as_usize(&self) -> usize { + self.0.as_usize() + } + + /// Return the internal value as a `u64`. This is guaranteed to + /// never overflow. + #[inline] + pub const fn as_u64(&self) -> u64 { + self.0.as_u64() + } + + /// Return the internal value as a `u32`. This is guaranteed to + /// never overflow `u32`. + #[inline] + pub const fn as_u32(&self) -> u32 { + self.0.as_u32() + } + + /// Return the internal value as a i32`. This is guaranteed to + /// never overflow an `i32`. + #[inline] + pub const fn as_i32(&self) -> i32 { + self.0.as_i32() + } + + /// Returns one more than this value as a usize. + /// + /// Since values represented by a "small index" have constraints + /// on their maximum value, adding `1` to it will always fit in a + /// `usize`, `u32` and a `i32`. 
+ #[inline] + pub fn one_more(&self) -> usize { + self.0.one_more() + } + + /// Decode this value from the bytes given using the native endian + /// byte order for the current target. + /// + /// If the decoded integer is not representable as a small index + /// for the current target, then this returns an error. + #[inline] + pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<$name, $err> { + SmallIndex::from_ne_bytes(bytes).map($name).map_err($err) + } + + /// Decode this value from the bytes given using the native endian + /// byte order for the current target. + /// + /// This is analogous to `new_unchecked` in that is does not check + /// whether the decoded integer is representable as a small index. + #[inline] + pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> $name { + $name(SmallIndex::from_ne_bytes_unchecked(bytes)) + } + + /// Return the underlying integer as raw bytes in native endian + /// format. + #[inline] + pub fn to_ne_bytes(&self) -> [u8; 4] { + self.0.to_ne_bytes() + } + + /// Returns an iterator over all values from 0 up to and not + /// including the given length. + /// + /// If the given length exceeds this type's limit, then this + /// panics. + pub(crate) fn iter(len: usize) -> $iter { + $iter::new(len) + } + } + + // We write our own Debug impl so that we get things like PatternID(5) + // instead of PatternID(SmallIndex(5)). + impl core::fmt::Debug for $name { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple(stringify!($name)).field(&self.as_u32()).finish() + } + } + + impl<T> core::ops::Index<$name> for [T] { + type Output = T; + + #[inline] + fn index(&self, index: $name) -> &T { + &self[index.as_usize()] + } + } + + impl<T> core::ops::IndexMut<$name> for [T] { + #[inline] + fn index_mut(&mut self, index: $name) -> &mut T { + &mut self[index.as_usize()] + } + } + + #[cfg(feature = "alloc")] + impl<T> core::ops::Index<$name> for Vec<T> { + type Output = T; + + #[inline] + fn index(&self, index: $name) -> &T { + &self[index.as_usize()] + } + } + + #[cfg(feature = "alloc")] + impl<T> core::ops::IndexMut<$name> for Vec<T> { + #[inline] + fn index_mut(&mut self, index: $name) -> &mut T { + &mut self[index.as_usize()] + } + } + + impl From<u8> for $name { + fn from(value: u8) -> $name { + $name(SmallIndex::from(value)) + } + } + + impl TryFrom<u16> for $name { + type Error = $err; + + fn try_from(value: u16) -> Result<$name, $err> { + SmallIndex::try_from(value).map($name).map_err($err) + } + } + + impl TryFrom<u32> for $name { + type Error = $err; + + fn try_from(value: u32) -> Result<$name, $err> { + SmallIndex::try_from(value).map($name).map_err($err) + } + } + + impl TryFrom<u64> for $name { + type Error = $err; + + fn try_from(value: u64) -> Result<$name, $err> { + SmallIndex::try_from(value).map($name).map_err($err) + } + } + + impl TryFrom<usize> for $name { + type Error = $err; + + fn try_from(value: usize) -> Result<$name, $err> { + SmallIndex::try_from(value).map($name).map_err($err) + } + } + + #[cfg(test)] + impl quickcheck::Arbitrary for $name { + fn arbitrary(gen: &mut quickcheck::Gen) -> $name { + $name(SmallIndex::arbitrary(gen)) + } + } + + /// This error occurs when a value could not be constructed. + /// + /// This occurs when given an integer exceeding the maximum allowed + /// value. + /// + /// When the `std` feature is enabled, this implements the `Error` + /// trait. 
+ #[derive(Clone, Debug, Eq, PartialEq)] + pub struct $err(SmallIndexError); + + impl $err { + /// Returns the value that could not be converted to an ID. + pub fn attempted(&self) -> u64 { + self.0.attempted() + } + } + + #[cfg(feature = "std")] + impl std::error::Error for $err {} + + impl core::fmt::Display for $err { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!( + f, + "failed to create {} from {:?}, which exceeds {:?}", + stringify!($name), + self.attempted(), + $name::MAX, + ) + } + } + + #[derive(Clone, Debug)] + pub(crate) struct $iter(SmallIndexIter); + + impl $iter { + fn new(len: usize) -> $iter { + assert!( + len <= $name::LIMIT, + "cannot create iterator for {} when number of \ + elements exceed {:?}", + stringify!($name), + $name::LIMIT, + ); + $iter(SmallIndexIter { rng: 0..len }) + } + } + + impl Iterator for $iter { + type Item = $name; + + fn next(&mut self) -> Option<$name> { + self.0.next().map($name) + } + } + + /// An iterator adapter that is like std::iter::Enumerate, but attaches + /// small index values instead. It requires `ExactSizeIterator`. At + /// construction, it ensures that the index of each element in the + /// iterator is representable in the corresponding small index type. + #[derive(Clone, Debug)] + pub(crate) struct $withiter<I> { + it: I, + ids: $iter, + } + + impl<I: Iterator + ExactSizeIterator> $withiter<I> { + fn new(it: I) -> $withiter<I> { + let ids = $name::iter(it.len()); + $withiter { it, ids } + } + } + + impl<I: Iterator + ExactSizeIterator> Iterator for $withiter<I> { + type Item = ($name, I::Item); + + fn next(&mut self) -> Option<($name, I::Item)> { + let item = self.it.next()?; + // Number of elements in this iterator must match, according + // to contract of ExactSizeIterator. + let id = self.ids.next().unwrap(); + Some((id, item)) + } + } + }; +} + +/// The identifier of a regex pattern, represented by a [`SmallIndex`]. +/// +/// The identifier for a pattern corresponds to its relative position among +/// other patterns in a single finite state machine. Namely, when building +/// a multi-pattern regex engine, one must supply a sequence of patterns to +/// match. The position (starting at 0) of each pattern in that sequence +/// represents its identifier. This identifier is in turn used to identify and +/// report matches of that pattern in various APIs. +/// +/// See the [`SmallIndex`] type for more information about what it means for +/// a pattern ID to be a "small index." +/// +/// Note that this type is defined in the +/// [`util::primitives`](crate::util::primitives) module, but it is also +/// re-exported at the crate root due to how common it is. +#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +#[repr(transparent)] +pub struct PatternID(SmallIndex); + +/// The identifier of a finite automaton state, represented by a +/// [`SmallIndex`]. +/// +/// Most regex engines in this crate are built on top of finite automata. Each +/// state in a finite automaton defines transitions from its state to another. +/// Those transitions point to other states via their identifiers, i.e., a +/// `StateID`. Since finite automata tend to contain many transitions, it is +/// much more memory efficient to define state IDs as small indices. +/// +/// See the [`SmallIndex`] type for more information about what it means for +/// a state ID to be a "small index." 
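Both `PatternID` here and the `StateID` defined just below receive these constructors and conversions from the macro above. A brief sketch using `PatternID`, which is also re-exported at the crate root:

```rust
use regex_automata::PatternID;

// A pattern's ID is just its position among the patterns handed to a
// multi-pattern regex engine: the first pattern is 0, the next is 1, etc.
let second = PatternID::new(1).unwrap();
assert_eq!(1, second.as_usize());
assert_eq!(PatternID::ZERO, PatternID::must(0));

// Construction is fallible, so an out-of-range ID is caught immediately.
assert!(PatternID::new(usize::MAX).is_err());
```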
+#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +#[repr(transparent)] +pub struct StateID(SmallIndex); + +index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter); +index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter); + +/// A utility trait that defines a couple of adapters for making it convenient +/// to access indices as "small index" types. We require ExactSizeIterator so +/// that iterator construction can do a single check to make sure the index of +/// each element is representable by its small index type. +pub(crate) trait IteratorIndexExt: Iterator { + fn with_pattern_ids(self) -> WithPatternIDIter<Self> + where + Self: Sized + ExactSizeIterator, + { + WithPatternIDIter::new(self) + } + + fn with_state_ids(self) -> WithStateIDIter<Self> + where + Self: Sized + ExactSizeIterator, + { + WithStateIDIter::new(self) + } +} + +impl<I: Iterator> IteratorIndexExt for I {} diff --git a/vendor/regex-automata/src/util/search.rs b/vendor/regex-automata/src/util/search.rs new file mode 100644 index 000000000..39aec522b --- /dev/null +++ b/vendor/regex-automata/src/util/search.rs @@ -0,0 +1,1969 @@ +/*! +Types and routines that support the search APIs of most regex engines. + +This sub-module isn't exposed directly, but rather, its contents are exported +at the crate root due to the universality of most of the types and routines in +this module. +*/ + +use core::ops::{Range, RangeBounds}; + +use crate::util::{escape::DebugByte, primitives::PatternID, utf8}; + +/// The parameters for a regex search including the haystack to search. +/// +/// It turns out that regex searches have a few parameters, and in most cases, +/// those parameters have defaults that work in the vast majority of cases. +/// This `Input` type exists to make that common case seamnless while also +/// providing an avenue for changing the parameters of a search. In particular, +/// this type enables doing so without a combinatorial explosion of different +/// methods and/or superfluous parameters in the common cases. +/// +/// An `Input` permits configuring the following things: +/// +/// * Search only a substring of a haystack, while taking the broader context +/// into account for resolving look-around assertions. +/// * Indicating whether to search for all patterns in a regex, or to +/// only search for one pattern in particular. +/// * Whether to perform an anchored on unanchored search. +/// * Whether to report a match as early as possible. +/// +/// All of these parameters, except for the haystack, have sensible default +/// values. This means that the minimal search configuration is simply a call +/// to [`Input::new`] with your haystack. Setting any other parameter is +/// optional. +/// +/// Moreover, for any `H` that implements `AsRef<[u8]>`, there exists a +/// `From<H> for Input` implementation. This is useful because many of the +/// search APIs in this crate accept an `Into<Input>`. This means you can +/// provide string or byte strings to these routines directly, and they'll +/// automatically get converted into an `Input` for you. +/// +/// The lifetime parameter `'h` refers to the lifetime of the haystack. +/// +/// # Organization +/// +/// The API of `Input` is split into a few different parts: +/// +/// * A builder-like API that transforms a `Input` by value. Examples: +/// [`Input::span`] and [`Input::anchored`]. +/// * A setter API that permits mutating parameters in place. Examples: +/// [`Input::set_span`] and [`Input::set_anchored`]. 
+/// * A getter API that permits retrieving any of the search parameters. +/// Examples: [`Input::get_span`] and [`Input::get_anchored`]. +/// * A few convenience getter routines that don't conform to the above naming +/// pattern due to how common they are. Examples: [`Input::haystack`], +/// [`Input::start`] and [`Input::end`]. +/// * Miscellaneous predicates and other helper routines that are useful +/// in some contexts. Examples: [`Input::is_char_boundary`]. +/// +/// A `Input` exposes so much because it is meant to be used by both callers of +/// regex engines _and_ implementors of regex engines. A constraining factor is +/// that regex engines should accept a `&Input` as its lowest level API, which +/// means that implementors should only use the "getter" APIs of a `Input`. +/// +/// # Valid bounds and search termination +/// +/// An `Input` permits setting the bounds of a search via either +/// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or +/// else a panic will occur. Bounds are valid if and only if: +/// +/// * The bounds represent a valid range into the input's haystack. +/// * **or** the end bound is a valid ending bound for the haystack *and* +/// the start bound is exactly one greater than the start bound. +/// +/// In the latter case, [`Input::is_done`] will return true and indicates any +/// search receiving such an input should immediately return with no match. +/// +/// Note that while `Input` is used for reverse searches in this crate, the +/// `Input::is_done` predicate assumes a forward search. Because unsigned +/// offsets are used internally, there is no way to tell from only the offsets +/// whether a reverse search is done or not. +/// +/// # Regex engine support +/// +/// Any regex engine accepting an `Input` must support at least the following +/// things: +/// +/// * Searching a `&[u8]` for matches. +/// * Searching a substring of `&[u8]` for a match, such that any match +/// reported must appear entirely within that substring. +/// * For a forwards search, a match should never be reported when +/// [`Input::is_done`] returns true. (For reverse searches, termination should +/// be handled outside of `Input`.) +/// +/// Supporting other aspects of an `Input` are optional, but regex engines +/// should handle aspects they don't support gracefully. How this is done is +/// generally up to the regex engine. This crate generally treats unsupported +/// anchored modes as an error to report for example, but for simplicity, in +/// the meta regex engine, trying to search with an invalid pattern ID just +/// results in no match being reported. +#[derive(Clone)] +pub struct Input<'h> { + haystack: &'h [u8], + span: Span, + anchored: Anchored, + earliest: bool, +} + +impl<'h> Input<'h> { + /// Create a new search configuration for the given haystack. + #[inline] + pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> { + Input { + haystack: haystack.as_ref(), + span: Span { start: 0, end: haystack.as_ref().len() }, + anchored: Anchored::No, + earliest: false, + } + } + + /// Set the span for this search. + /// + /// This routine does not panic if the span given is not a valid range for + /// this search's haystack. If this search is run with an invalid range, + /// then the most likely outcome is that the actual search execution will + /// panic. + /// + /// This routine is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. 
To provide anything supported by range + /// syntax, use the [`Input::range`] method. + /// + /// The default span is the entire haystack. + /// + /// Note that [`Input::range`] overrides this method and vice versa. + /// + /// # Panics + /// + /// This panics if the given span does not correspond to valid bounds in + /// the haystack or the termination of a search. + /// + /// # Example + /// + /// This example shows how the span of the search can impact whether a + /// match is reported or not. This is particularly relevant for look-around + /// operators, which might take things outside of the span into account + /// when determining whether they match. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// Match, Input, + /// }; + /// + /// // Look for 'at', but as a distinct word. + /// let re = PikeVM::new(r"\bat\b")?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// // Our haystack contains 'at', but not as a distinct word. + /// let haystack = "batter"; + /// + /// // A standard search finds nothing, as expected. + /// let input = Input::new(haystack); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(None, caps.get_match()); + /// + /// // But if we wanted to search starting at position '1', we might + /// // slice the haystack. If we do this, it's impossible for the \b + /// // anchors to take the surrounding context into account! And thus, + /// // a match is produced. + /// let input = Input::new(&haystack[1..3]); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match()); + /// + /// // But if we specify the span of the search instead of slicing the + /// // haystack, then the regex engine can "see" outside of the span + /// // and resolve the anchors correctly. + /// let input = Input::new(haystack).span(1..3); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(None, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// This may seem a little ham-fisted, but this scenario tends to come up + /// if some other regex engine found the match span and now you need to + /// re-process that span to look for capturing groups. (e.g., Run a faster + /// DFA first, find a match, then run the PikeVM on just the match span to + /// resolve capturing groups.) In order to implement that sort of logic + /// correctly, you need to set the span on the search instead of slicing + /// the haystack directly. + /// + /// The other advantage of using this routine to specify the bounds of the + /// search is that the match offsets are still reported in terms of the + /// original haystack. For example, the second search in the example above + /// reported a match at position `0`, even though `at` starts at offset + /// `1` because we sliced the haystack. + #[inline] + pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> { + self.set_span(span); + self + } + + /// Like `Input::span`, but accepts any range instead. + /// + /// This routine does not panic if the range given is not a valid range for + /// this search's haystack. If this search is run with an invalid range, + /// then the most likely outcome is that the actual search execution will + /// panic. + /// + /// The default range is the entire haystack. + /// + /// Note that [`Input::span`] overrides this method and vice versa. 
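Taken together, the builder-style methods and the getter API that regex engines consume look roughly like this (no engine is run; only the configuration is exercised):

```rust
use regex_automata::{Anchored, Input};

// Configure a search of bytes 1..4 of the haystack, anchored, in
// "earliest" mode...
let input = Input::new("foobar")
    .span(1..4)
    .anchored(Anchored::Yes)
    .earliest(true);

// ...and read the configuration back the way a regex engine would.
assert_eq!(b"foobar", input.haystack());
assert_eq!(1..4, input.get_range());
assert_eq!(Anchored::Yes, input.get_anchored());
assert!(input.get_earliest());
```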
+ /// + /// # Panics + /// + /// This routine will panic if the given range could not be converted + /// to a valid [`Range`]. For example, this would panic when given + /// `0..=usize::MAX` since it cannot be represented using a half-open + /// interval in terms of `usize`. + /// + /// This also panics if the given range does not correspond to valid bounds + /// in the haystack or the termination of a search. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// + /// let input = Input::new("foobar").range(2..=4); + /// assert_eq!(2..5, input.get_range()); + /// ``` + #[inline] + pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> { + self.set_range(range); + self + } + + /// Sets the anchor mode of a search. + /// + /// When a search is anchored (so that's [`Anchored::Yes`] or + /// [`Anchored::Pattern`]), a match must begin at the start of a search. + /// When a search is not anchored (that's [`Anchored::No`]), regex engines + /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix + /// permits a match to appear anywhere. + /// + /// By default, the anchored mode is [`Anchored::No`]. + /// + /// **WARNING:** this is subtly different than using a `^` at the start of + /// your regex. A `^` forces a regex to match exclusively at the start of + /// a haystack, regardless of where you begin your search. In contrast, + /// anchoring a search will allow your regex to match anywhere in your + /// haystack, but the match must start at the beginning of a search. + /// + /// For example, consider the haystack `aba` and the following searches: + /// + /// 1. The regex `^a` is compiled with `Anchored::No` and searches `aba` + /// starting at position `2`. Since `^` requires the match to start at + /// the beginning of the haystack and `2 > 0`, no match is found. + /// 2. The regex `a` is compiled with `Anchored::Yes` and searches `aba` + /// starting at position `2`. This reports a match at `[2, 3]` since + /// the match starts where the search started. Since there is no `^`, + /// there is no requirement for the match to start at the beginning of + /// the haystack. + /// 3. The regex `a` is compiled with `Anchored::Yes` and searches `aba` + /// starting at position `1`. Since `b` corresponds to position `1` and + /// since the search is anchored, it finds no match. While the regex + /// matches at other positions, configuring the search to be anchored + /// requires that it only report a match that begins at the same offset + /// as the beginning of the search. + /// 4. The regex `a` is compiled with `Anchored::No` and searches `aba` + /// starting at position `1`. Since the search is not anchored and + /// the regex does not start with `^`, the search executes as if there + /// is a `(?s:.)*?` prefix that permits it to match anywhere. Thus, it + /// reports a match at `[2, 3]`. + /// + /// Note that the [`Anchored::Pattern`] mode is like `Anchored::Yes`, + /// except it only reports matches for a particular pattern. + /// + /// # Example + /// + /// This demonstrates the differences between an anchored search and + /// a pattern that begins with `^` (as described in the above warning + /// message). 
+ /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// Anchored, Match, Input, + /// }; + /// + /// let haystack = "aba"; + /// + /// let re = PikeVM::new(r"^a")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let input = Input::new(haystack).span(2..3).anchored(Anchored::No); + /// re.search(&mut cache, &input, &mut caps); + /// // No match is found because 2 is not the beginning of the haystack, + /// // which is what ^ requires. + /// assert_eq!(None, caps.get_match()); + /// + /// let re = PikeVM::new(r"a")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let input = Input::new(haystack).span(2..3).anchored(Anchored::Yes); + /// re.search(&mut cache, &input, &mut caps); + /// // An anchored search can still match anywhere in the haystack, it just + /// // must begin at the start of the search which is '2' in this case. + /// assert_eq!(Some(Match::must(0, 2..3)), caps.get_match()); + /// + /// let re = PikeVM::new(r"a")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let input = Input::new(haystack).span(1..3).anchored(Anchored::Yes); + /// re.search(&mut cache, &input, &mut caps); + /// // No match is found since we start searching at offset 1 which + /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match + /// // is found. + /// assert_eq!(None, caps.get_match()); + /// + /// let re = PikeVM::new(r"a")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let input = Input::new(haystack).span(1..3).anchored(Anchored::No); + /// re.search(&mut cache, &input, &mut caps); + /// // Since anchored=no, an implicit '(?s:.)*?' prefix was added to the + /// // pattern. Even though the search starts at 'b', the 'match anything' + /// // prefix allows the search to match 'a'. + /// let expected = Some(Match::must(0, 2..3)); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn anchored(mut self, mode: Anchored) -> Input<'h> { + self.set_anchored(mode); + self + } + + /// Whether to execute an "earliest" search or not. + /// + /// When running a non-overlapping search, an "earliest" search will return + /// the match location as early as possible. For example, given a pattern + /// of `foo[0-9]+` and a haystack of `foo12345`, a normal leftmost search + /// will return `foo12345` as a match. But an "earliest" search for regex + /// engines that support "earliest" semantics will return `foo1` as a + /// match, since as soon as the first digit following `foo` is seen, it is + /// known to have found a match. + /// + /// Note that "earliest" semantics generally depend on the regex engine. + /// Different regex engines may determine there is a match at different + /// points. So there is no guarantee that "earliest" matches will always + /// return the same offsets for all regex engines. The "earliest" notion + /// is really about when the particular regex engine determines there is + /// a match rather than a consistent semantic unto itself. This is often + /// useful for implementing "did a match occur or not" predicates, but + /// sometimes the offset is useful as well. + /// + /// This is disabled by default. + /// + /// # Example + /// + /// This example shows the difference between "earliest" searching and + /// normal searching. 
+ /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; + /// + /// let re = PikeVM::new(r"foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// // A normal search implements greediness like you expect. + /// let input = Input::new("foo12345"); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match()); + /// + /// // When 'earliest' is enabled and the regex engine supports + /// // it, the search will bail once it knows a match has been + /// // found. + /// let input = Input::new("foo12345").earliest(true); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 0..4)), caps.get_match()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn earliest(mut self, yes: bool) -> Input<'h> { + self.set_earliest(yes); + self + } + + /// Set the span for this search configuration. + /// + /// This is like the [`Input::span`] method, except this mutates the + /// span in place. + /// + /// This routine is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. + /// + /// # Panics + /// + /// This panics if the given span does not correspond to valid bounds in + /// the haystack or the termination of a search. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_span(2..4); + /// assert_eq!(2..4, input.get_range()); + /// ``` + #[inline] + pub fn set_span<S: Into<Span>>(&mut self, span: S) { + let span = span.into(); + assert!( + span.end <= self.haystack.len() + && span.start <= span.end.wrapping_add(1), + "invalid span {:?} for haystack of length {}", + span, + self.haystack.len(), + ); + self.span = span; + } + + /// Set the span for this search configuration given any range. + /// + /// This is like the [`Input::range`] method, except this mutates the + /// span in place. + /// + /// This routine does not panic if the range given is not a valid range for + /// this search's haystack. If this search is run with an invalid range, + /// then the most likely outcome is that the actual search execution will + /// panic. + /// + /// # Panics + /// + /// This routine will panic if the given range could not be converted + /// to a valid [`Range`]. For example, this would panic when given + /// `0..=usize::MAX` since it cannot be represented using a half-open + /// interval in terms of `usize`. + /// + /// This also panics if the given span does not correspond to valid bounds + /// in the haystack or the termination of a search. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_range(2..=4); + /// assert_eq!(2..5, input.get_range()); + /// ``` + #[inline] + pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) { + use core::ops::Bound; + + // It's a little weird to convert ranges into spans, and then spans + // back into ranges when we actually slice the haystack. Because + // of that process, we always represent everything as a half-open + // internal. Therefore, handling things like m..=n is a little awkward. + let start = match range.start_bound() { + Bound::Included(&i) => i, + // Can this case ever happen? Range syntax doesn't support it... 
+ Bound::Excluded(&i) => i.checked_add(1).unwrap(), + Bound::Unbounded => 0, + }; + let end = match range.end_bound() { + Bound::Included(&i) => i.checked_add(1).unwrap(), + Bound::Excluded(&i) => i, + Bound::Unbounded => self.haystack().len(), + }; + self.set_span(Span { start, end }); + } + + /// Set the starting offset for the span for this search configuration. + /// + /// This is a convenience routine for only mutating the start of a span + /// without having to set the entire span. + /// + /// # Panics + /// + /// This panics if the span resulting from the new start position does not + /// correspond to valid bounds in the haystack or the termination of a + /// search. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_start(5); + /// assert_eq!(5..6, input.get_range()); + /// ``` + #[inline] + pub fn set_start(&mut self, start: usize) { + self.set_span(Span { start, ..self.get_span() }); + } + + /// Set the ending offset for the span for this search configuration. + /// + /// This is a convenience routine for only mutating the end of a span + /// without having to set the entire span. + /// + /// # Panics + /// + /// This panics if the span resulting from the new end position does not + /// correspond to valid bounds in the haystack or the termination of a + /// search. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_end(5); + /// assert_eq!(0..5, input.get_range()); + /// ``` + #[inline] + pub fn set_end(&mut self, end: usize) { + self.set_span(Span { end, ..self.get_span() }); + } + + /// Set the anchor mode of a search. + /// + /// This is like [`Input::anchored`], except it mutates the search + /// configuration in place. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Anchored, Input, PatternID}; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(Anchored::No, input.get_anchored()); + /// + /// let pid = PatternID::must(5); + /// input.set_anchored(Anchored::Pattern(pid)); + /// assert_eq!(Anchored::Pattern(pid), input.get_anchored()); + /// ``` + #[inline] + pub fn set_anchored(&mut self, mode: Anchored) { + self.anchored = mode; + } + + /// Set whether the search should execute in "earliest" mode or not. + /// + /// This is like [`Input::earliest`], except it mutates the search + /// configuration in place. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert!(!input.get_earliest()); + /// input.set_earliest(true); + /// assert!(input.get_earliest()); + /// ``` + #[inline] + pub fn set_earliest(&mut self, yes: bool) { + self.earliest = yes; + } + + /// Return a borrow of the underlying haystack as a slice of bytes. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(b"foobar", input.haystack()); + /// ``` + #[inline] + pub fn haystack(&self) -> &[u8] { + self.haystack + } + + /// Return the start position of this search. + /// + /// This is a convenience routine for `search.get_span().start()`. + /// + /// When [`Input::is_done`] is `false`, this is guaranteed to return + /// an offset that is less than or equal to [`Input::end`]. Otherwise, + /// the offset is one greater than [`Input::end`]. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(0, input.start()); + /// + /// let input = Input::new("foobar").span(2..4); + /// assert_eq!(2, input.start()); + /// ``` + #[inline] + pub fn start(&self) -> usize { + self.get_span().start + } + + /// Return the end position of this search. + /// + /// This is a convenience routine for `search.get_span().end()`. + /// + /// This is guaranteed to return an offset that is a valid exclusive end + /// bound for this input's haystack. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(6, input.end()); + /// + /// let input = Input::new("foobar").span(2..4); + /// assert_eq!(4, input.end()); + /// ``` + #[inline] + pub fn end(&self) -> usize { + self.get_span().end + } + + /// Return the span for this search configuration. + /// + /// If one was not explicitly set, then the span corresponds to the entire + /// range of the haystack. + /// + /// When [`Input::is_done`] is `false`, the span returned is guaranteed + /// to correspond to valid bounds for this input's haystack. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Input, Span}; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(Span { start: 0, end: 6 }, input.get_span()); + /// ``` + #[inline] + pub fn get_span(&self) -> Span { + self.span + } + + /// Return the span as a range for this search configuration. + /// + /// If one was not explicitly set, then the span corresponds to the entire + /// range of the haystack. + /// + /// When [`Input::is_done`] is `false`, the range returned is guaranteed + /// to correspond to valid bounds for this input's haystack. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// ``` + #[inline] + pub fn get_range(&self) -> Range<usize> { + self.get_span().range() + } + + /// Return the anchored mode for this search configuration. + /// + /// If no anchored mode was set, then it defaults to [`Anchored::No`]. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Anchored, Input, PatternID}; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(Anchored::No, input.get_anchored()); + /// + /// let pid = PatternID::must(5); + /// input.set_anchored(Anchored::Pattern(pid)); + /// assert_eq!(Anchored::Pattern(pid), input.get_anchored()); + /// ``` + #[inline] + pub fn get_anchored(&self) -> Anchored { + self.anchored + } + + /// Return whether this search should execute in "earliest" mode. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert!(!input.get_earliest()); + /// ``` + #[inline] + pub fn get_earliest(&self) -> bool { + self.earliest + } + + /// Return true if and only if this search can never return any other + /// matches. + /// + /// This occurs when the start position of this search is greater than the + /// end position of the search. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert!(!input.is_done()); + /// input.set_start(6); + /// assert!(!input.is_done()); + /// input.set_start(7); + /// assert!(input.is_done()); + /// ``` + #[inline] + pub fn is_done(&self) -> bool { + self.get_span().start > self.get_span().end + } + + /// Returns true if and only if the given offset in this search's haystack + /// falls on a valid UTF-8 encoded codepoint boundary. + /// + /// If the haystack is not valid UTF-8, then the behavior of this routine + /// is unspecified. + /// + /// # Example + /// + /// This shows where codepoint boundaries do and don't exist in valid + /// UTF-8. + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("☃"); + /// assert!(input.is_char_boundary(0)); + /// assert!(!input.is_char_boundary(1)); + /// assert!(!input.is_char_boundary(2)); + /// assert!(input.is_char_boundary(3)); + /// assert!(!input.is_char_boundary(4)); + /// ``` + #[inline] + pub fn is_char_boundary(&self, offset: usize) -> bool { + utf8::is_boundary(self.haystack(), offset) + } +} + +impl<'h> core::fmt::Debug for Input<'h> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use crate::util::escape::DebugHaystack; + + f.debug_struct("Input") + .field("haystack", &DebugHaystack(self.haystack())) + .field("span", &self.span) + .field("anchored", &self.anchored) + .field("earliest", &self.earliest) + .finish() + } +} + +impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> { + fn from(haystack: &'h H) -> Input<'h> { + Input::new(haystack) + } +} + +/// A representation of a span reported by a regex engine. +/// +/// A span corresponds to the starting and ending _byte offsets_ of a +/// contiguous region of bytes. The starting offset is inclusive while the +/// ending offset is exclusive. That is, a span is a half-open interval. +/// +/// A span is used to report the offsets of a match, but it is also used to +/// convey which region of a haystack should be searched via routines like +/// [`Input::span`]. +/// +/// This is basically equivalent to a `std::ops::Range<usize>`, except this +/// type implements `Copy` which makes it more ergonomic to use in the context +/// of this crate. Like a range, this implements `Index` for `[u8]` and `str`, +/// and `IndexMut` for `[u8]`. For convenience, this also impls `From<Range>`, +/// which means things like `Span::from(5..10)` work. +#[derive(Clone, Copy, Eq, Hash, PartialEq)] +pub struct Span { + /// The start offset of the span, inclusive. + pub start: usize, + /// The end offset of the span, exclusive. + pub end: usize, +} + +impl Span { + /// Returns this span as a range. + #[inline] + pub fn range(&self) -> Range<usize> { + Range::from(*self) + } + + /// Returns true when this span is empty. That is, when `start >= end`. + #[inline] + pub fn is_empty(&self) -> bool { + self.start >= self.end + } + + /// Returns the length of this span. + /// + /// This returns `0` in precisely the cases that `is_empty` returns `true`. + #[inline] + pub fn len(&self) -> usize { + self.end.saturating_sub(self.start) + } + + /// Returns true when the given offset is contained within this span. + /// + /// Note that an empty span contains no offsets and will always return + /// false. 
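A brief sketch of how `Span` interoperates with plain ranges and with indexing, using only the impls described above:

```rust
use regex_automata::Span;

// A `Span` is a `Copy`-able half-open interval, convertible to and from a
// `Range<usize>`.
let span = Span::from(2..5);
assert_eq!(2..5, span.range());
assert_eq!(3, span.len());
assert!(!span.is_empty());

// Spans can index slices and strings directly...
assert_eq!("oba", &"foobar"[span]);

// ...and compare against plain ranges in either direction.
assert_eq!(span, 2..5);
assert_eq!(2..5, span);
```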
+ #[inline] + pub fn contains(&self, offset: usize) -> bool { + !self.is_empty() && self.start <= offset && offset <= self.end + } + + /// Returns a new span with `offset` added to this span's `start` and `end` + /// values. + #[inline] + pub fn offset(&self, offset: usize) -> Span { + Span { start: self.start + offset, end: self.end + offset } + } +} + +impl core::fmt::Debug for Span { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{}..{}", self.start, self.end) + } +} + +impl core::ops::Index<Span> for [u8] { + type Output = [u8]; + + #[inline] + fn index(&self, index: Span) -> &[u8] { + &self[index.range()] + } +} + +impl core::ops::IndexMut<Span> for [u8] { + #[inline] + fn index_mut(&mut self, index: Span) -> &mut [u8] { + &mut self[index.range()] + } +} + +impl core::ops::Index<Span> for str { + type Output = str; + + #[inline] + fn index(&self, index: Span) -> &str { + &self[index.range()] + } +} + +impl From<Range<usize>> for Span { + #[inline] + fn from(range: Range<usize>) -> Span { + Span { start: range.start, end: range.end } + } +} + +impl From<Span> for Range<usize> { + #[inline] + fn from(span: Span) -> Range<usize> { + Range { start: span.start, end: span.end } + } +} + +impl PartialEq<Range<usize>> for Span { + #[inline] + fn eq(&self, range: &Range<usize>) -> bool { + self.start == range.start && self.end == range.end + } +} + +impl PartialEq<Span> for Range<usize> { + #[inline] + fn eq(&self, span: &Span) -> bool { + self.start == span.start && self.end == span.end + } +} + +/// A representation of "half" of a match reported by a DFA. +/// +/// This is called a "half" match because it only includes the end location (or +/// start location for a reverse search) of a match. This corresponds to the +/// information that a single DFA scan can report. Getting the other half of +/// the match requires a second scan with a reversed DFA. +/// +/// A half match also includes the pattern that matched. The pattern is +/// identified by an ID, which corresponds to its position (starting from `0`) +/// relative to other patterns used to construct the corresponding DFA. If only +/// a single pattern is provided to the DFA, then all matches are guaranteed to +/// have a pattern ID of `0`. +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub struct HalfMatch { + /// The pattern ID. + pattern: PatternID, + /// The offset of the match. + /// + /// For forward searches, the offset is exclusive. For reverse searches, + /// the offset is inclusive. + offset: usize, +} + +impl HalfMatch { + /// Create a new half match from a pattern ID and a byte offset. + #[inline] + pub fn new(pattern: PatternID, offset: usize) -> HalfMatch { + HalfMatch { pattern, offset } + } + + /// Create a new half match from a pattern ID and a byte offset. + /// + /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a + /// [`PatternID`]. This panics if the given `usize` is not representable + /// as a `PatternID`. + #[inline] + pub fn must(pattern: usize, offset: usize) -> HalfMatch { + HalfMatch::new(PatternID::new(pattern).unwrap(), offset) + } + + /// Returns the ID of the pattern that matched. + /// + /// The ID of a pattern is derived from the position in which it was + /// originally inserted into the corresponding DFA. The first pattern has + /// identifier `0`, and each subsequent pattern is `1`, `2` and so on. + #[inline] + pub fn pattern(&self) -> PatternID { + self.pattern + } + + /// The position of the match. 
+ /// + /// If this match was produced by a forward search, then the offset is + /// exclusive. If this match was produced by a reverse search, then the + /// offset is inclusive. + #[inline] + pub fn offset(&self) -> usize { + self.offset + } +} + +/// A representation of a match reported by a regex engine. +/// +/// A match has two essential pieces of information: the [`PatternID`] that +/// matches, and the [`Span`] of the match in a haystack. +/// +/// The pattern is identified by an ID, which corresponds to its position +/// (starting from `0`) relative to other patterns used to construct the +/// corresponding regex engine. If only a single pattern is provided, then all +/// matches are guaranteed to have a pattern ID of `0`. +/// +/// Every match reported by a regex engine guarantees that its span has its +/// start offset as less than or equal to its end offset. +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub struct Match { + /// The pattern ID. + pattern: PatternID, + /// The underlying match span. + span: Span, +} + +impl Match { + /// Create a new match from a pattern ID and a span. + /// + /// This constructor is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. + /// + /// # Panics + /// + /// This panics if `end < start`. + /// + /// # Example + /// + /// This shows how to create a match for the first pattern in a regex + /// object using convenient range syntax. + /// + /// ``` + /// use regex_automata::{Match, PatternID}; + /// + /// let m = Match::new(PatternID::ZERO, 5..10); + /// assert_eq!(0, m.pattern().as_usize()); + /// assert_eq!(5, m.start()); + /// assert_eq!(10, m.end()); + /// ``` + #[inline] + pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match { + let span: Span = span.into(); + assert!(span.start <= span.end, "invalid match span"); + Match { pattern, span } + } + + /// Create a new match from a pattern ID and a byte offset span. + /// + /// This constructor is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. + /// + /// This is like [`Match::new`], but accepts a `usize` instead of a + /// [`PatternID`]. This panics if the given `usize` is not representable + /// as a `PatternID`. + /// + /// # Panics + /// + /// This panics if `end < start` or if `pattern > PatternID::MAX`. + /// + /// # Example + /// + /// This shows how to create a match for the third pattern in a regex + /// object using convenient range syntax. + /// + /// ``` + /// use regex_automata::Match; + /// + /// let m = Match::must(3, 5..10); + /// assert_eq!(3, m.pattern().as_usize()); + /// assert_eq!(5, m.start()); + /// assert_eq!(10, m.end()); + /// ``` + #[inline] + pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match { + Match::new(PatternID::must(pattern), span) + } + + /// Returns the ID of the pattern that matched. + /// + /// The ID of a pattern is derived from the position in which it was + /// originally inserted into the corresponding regex engine. The first + /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and + /// so on. + #[inline] + pub fn pattern(&self) -> PatternID { + self.pattern + } + + /// The starting position of the match. + /// + /// This is a convenience routine for `Match::span().start`. + #[inline] + pub fn start(&self) -> usize { + self.span().start + } + + /// The ending position of the match. 
+ /// + /// This is a convenience routine for `Match::span().end`. + #[inline] + pub fn end(&self) -> usize { + self.span().end + } + + /// Returns the match span as a range. + /// + /// This is a convenience routine for `Match::span().range()`. + #[inline] + pub fn range(&self) -> core::ops::Range<usize> { + self.span().range() + } + + /// Returns the span for this match. + #[inline] + pub fn span(&self) -> Span { + self.span + } + + /// Returns true when the span in this match is empty. + /// + /// An empty match can only be returned when the regex itself can match + /// the empty string. + #[inline] + pub fn is_empty(&self) -> bool { + self.span().is_empty() + } + + /// Returns the length of this match. + /// + /// This returns `0` in precisely the cases that `is_empty` returns `true`. + #[inline] + pub fn len(&self) -> usize { + self.span().len() + } +} + +/// A set of `PatternID`s. +/// +/// A set of pattern identifiers is useful for recording which patterns have +/// matched a particular haystack. A pattern set _only_ includes pattern +/// identifiers. It does not include offset information. +/// +/// # Example +/// +/// This shows basic usage of a set. +/// +/// ``` +/// use regex_automata::{PatternID, PatternSet}; +/// +/// let pid1 = PatternID::must(5); +/// let pid2 = PatternID::must(8); +/// // Create a new empty set. +/// let mut set = PatternSet::new(10); +/// // Insert pattern IDs. +/// set.insert(pid1); +/// set.insert(pid2); +/// // Test membership. +/// assert!(set.contains(pid1)); +/// assert!(set.contains(pid2)); +/// // Get all members. +/// assert_eq!( +/// vec![5, 8], +/// set.iter().map(|p| p.as_usize()).collect::<Vec<usize>>(), +/// ); +/// // Clear the set. +/// set.clear(); +/// // Test that it is indeed empty. +/// assert!(set.is_empty()); +/// ``` +#[cfg(feature = "alloc")] +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct PatternSet { + /// The number of patterns set to 'true' in this set. + len: usize, + /// A map from PatternID to boolean of whether a pattern matches or not. + /// + /// This should probably be a bitset, but it's probably unlikely to matter + /// much in practice. + /// + /// The main downside of this representation (and similarly for a bitset) + /// is that iteration scales with the capacity of the set instead of + /// the length of the set. This doesn't seem likely to be a problem in + /// practice. + /// + /// Another alternative is to just use a 'SparseSet' for this. It does use + /// more memory (quite a bit more), but that seems fine I think compared + /// to the memory being used by the regex engine. The real hiccup with + /// it is that it yields pattern IDs in the order they were inserted. + /// Which is actually kind of nice, but at the time of writing, pattern + /// IDs are yielded in ascending order in the regex crate RegexSet API. + /// If we did change to 'SparseSet', we could provide an additional + /// 'iter_match_order' iterator, but keep the ascending order one for + /// compatibility. + which: alloc::boxed::Box<[bool]>, +} + +#[cfg(feature = "alloc")] +impl PatternSet { + /// Create a new set of pattern identifiers with the given capacity. + /// + /// The given capacity typically corresponds to (at least) the number of + /// patterns in a compiled regex object. + /// + /// # Panics + /// + /// This panics if the given capacity exceeds [`PatternID::LIMIT`]. This is + /// impossible if you use the `pattern_len()` method as defined on any of + /// the regex engines in this crate. 
Namely, a regex will fail to build by + /// returning an error if the number of patterns given to it exceeds the + /// limit. Therefore, the number of patterns in a valid regex is always + /// a correct capacity to provide here. + pub fn new(capacity: usize) -> PatternSet { + assert!( + capacity <= PatternID::LIMIT, + "pattern set capacity exceeds limit of {}", + PatternID::LIMIT, + ); + PatternSet { + len: 0, + which: alloc::vec![false; capacity].into_boxed_slice(), + } + } + + /// Clear this set such that it contains no pattern IDs. + pub fn clear(&mut self) { + self.len = 0; + for matched in self.which.iter_mut() { + *matched = false; + } + } + + /// Return true if and only if the given pattern identifier is in this set. + pub fn contains(&self, pid: PatternID) -> bool { + pid.as_usize() < self.capacity() && self.which[pid] + } + + /// Insert the given pattern identifier into this set and return `true` if + /// the given pattern ID was not previously in this set. + /// + /// If the pattern identifier is already in this set, then this is a no-op. + /// + /// Use [`PatternSet::try_insert`] for a fallible version of this routine. + /// + /// # Panics + /// + /// This panics if this pattern set has insufficient capacity to + /// store the given pattern ID. + pub fn insert(&mut self, pid: PatternID) -> bool { + self.try_insert(pid) + .expect("PatternSet should have sufficient capacity") + } + + /// Insert the given pattern identifier into this set and return `true` if + /// the given pattern ID was not previously in this set. + /// + /// If the pattern identifier is already in this set, then this is a no-op. + /// + /// # Errors + /// + /// This returns an error if this pattern set has insufficient capacity to + /// store the given pattern ID. + pub fn try_insert( + &mut self, + pid: PatternID, + ) -> Result<bool, PatternSetInsertError> { + if pid.as_usize() >= self.capacity() { + return Err(PatternSetInsertError { + attempted: pid, + capacity: self.capacity(), + }); + } + if self.which[pid] { + return Ok(false); + } + self.len += 1; + self.which[pid] = true; + Ok(true) + } + + /* + // This is currently commented out because it is unused and it is unclear + // whether it's useful or not. What's the harm in having it? When, if + // we ever wanted to change our representation to a 'SparseSet', then + // supporting this method would be a bit tricky. So in order to keep some + // API evolution flexibility, we leave it out for now. + + /// Remove the given pattern identifier from this set. + /// + /// If the pattern identifier was not previously in this set, then this + /// does not change the set and returns `false`. + /// + /// # Panics + /// + /// This panics if `pid` exceeds the capacity of this set. + pub fn remove(&mut self, pid: PatternID) -> bool { + if !self.which[pid] { + return false; + } + self.len -= 1; + self.which[pid] = false; + true + } + */ + + /// Return true if and only if this set has no pattern identifiers in it. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Return true if and only if this set has the maximum number of pattern + /// identifiers in the set. This occurs precisely when `PatternSet::len() + /// == PatternSet::capacity()`. + /// + /// This particular property is useful to test because it may allow one to + /// stop a search earlier than you might otherwise. 
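A minimal sketch of the early-stopping idea discussed here. The `patterns_matching_at` function is a hypothetical stand-in for any routine that reports which patterns match at a given position; only the `PatternSet` API from this diff is used:

```
use regex_automata::{PatternID, PatternSet};

// Hypothetical stand-in for a real multi-pattern search at one position.
fn patterns_matching_at(pos: usize) -> Vec<PatternID> {
    if pos % 2 == 0 {
        vec![PatternID::ZERO]
    } else {
        vec![PatternID::must(1)]
    }
}

fn main() {
    // Capacity matches the (hypothetical) number of patterns: 2.
    let mut set = PatternSet::new(2);
    for pos in 0..100 {
        for pid in patterns_matching_at(pos) {
            set.insert(pid);
        }
        // Once every pattern has been seen, scanning further cannot add any
        // new pattern IDs, so the search can stop early.
        if set.is_full() {
            break;
        }
    }
    assert!(set.is_full());
    assert_eq!(
        vec![0, 1],
        set.iter().map(|p| p.as_usize()).collect::<Vec<usize>>(),
    );
}
```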
Namely, if a search is
+    /// only reporting which patterns match a haystack and if you know all of
+    /// the patterns match at a given point, then there's no new information
+    /// that can be learned by continuing the search. (Because a pattern set
+    /// does not keep track of offset information.)
+    pub fn is_full(&self) -> bool {
+        self.len() == self.capacity()
+    }
+
+    /// Returns the total number of pattern identifiers in this set.
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Returns the total number of pattern identifiers that may be stored
+    /// in this set.
+    ///
+    /// This is guaranteed to be less than or equal to [`PatternID::LIMIT`].
+    ///
+    /// Typically, the capacity of a pattern set matches the number of patterns
+    /// in a regex object with which you are searching.
+    pub fn capacity(&self) -> usize {
+        self.which.len()
+    }
+
+    /// Returns an iterator over all pattern identifiers in this set.
+    ///
+    /// The iterator yields pattern identifiers in ascending order, starting
+    /// at zero.
+    pub fn iter(&self) -> PatternSetIter<'_> {
+        PatternSetIter { it: self.which.iter().enumerate() }
+    }
+}
+
+/// An error that occurs when a `PatternID` failed to insert into a
+/// `PatternSet`.
+///
+/// An insert fails when the given `PatternID` exceeds the configured capacity
+/// of the `PatternSet`.
+///
+/// This error is created by the [`PatternSet::try_insert`] routine.
+#[cfg(feature = "alloc")]
+#[derive(Clone, Debug)]
+pub struct PatternSetInsertError {
+    attempted: PatternID,
+    capacity: usize,
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for PatternSetInsertError {}
+
+#[cfg(feature = "alloc")]
+impl core::fmt::Display for PatternSetInsertError {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(
+            f,
+            "failed to insert pattern ID {} into pattern set \
+             with insufficient capacity of {}",
+            self.attempted.as_usize(),
+            self.capacity,
+        )
+    }
+}
+
+/// An iterator over all pattern identifiers in a [`PatternSet`].
+///
+/// The lifetime parameter `'a` refers to the lifetime of the pattern set being
+/// iterated over.
+///
+/// This iterator is created by the [`PatternSet::iter`] method.
+#[cfg(feature = "alloc")]
+#[derive(Clone, Debug)]
+pub struct PatternSetIter<'a> {
+    it: core::iter::Enumerate<core::slice::Iter<'a, bool>>,
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> Iterator for PatternSetIter<'a> {
+    type Item = PatternID;
+
+    fn next(&mut self) -> Option<PatternID> {
+        while let Some((index, &yes)) = self.it.next() {
+            if yes {
+                // Only valid 'PatternID' values can be inserted into the set
+                // and construction of the set panics if the capacity would
+                // permit storing invalid pattern IDs. Thus, 'yes' is only true
+                // precisely when 'index' corresponds to a valid 'PatternID'.
+                return Some(PatternID::new_unchecked(index));
+            }
+        }
+        None
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.it.size_hint()
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> DoubleEndedIterator for PatternSetIter<'a> {
+    fn next_back(&mut self) -> Option<PatternID> {
+        while let Some((index, &yes)) = self.it.next_back() {
+            if yes {
+                // Only valid 'PatternID' values can be inserted into the set
+                // and construction of the set panics if the capacity would
+                // permit storing invalid pattern IDs. Thus, 'yes' is only true
+                // precisely when 'index' corresponds to a valid 'PatternID'.
+                return Some(PatternID::new_unchecked(index));
+            }
+        }
+        None
+    }
+}
+
+/// The type of anchored search to perform.
+///
+/// This is *almost* a boolean option.
That is, you can either do an unanchored
+/// search for any pattern in a regex, or you can do an anchored search for any
+/// pattern in a regex.
+///
+/// A third option exists that, assuming the regex engine supports it, permits
+/// you to do an anchored search for a specific pattern.
+///
+/// Note that there is no way to run an unanchored search for a specific
+/// pattern. If you need that, you'll need to build separate regexes for each
+/// pattern.
+///
+/// # Errors
+///
+/// If a regex engine does not support the anchored mode selected, then the
+/// regex engine will return an error. While any non-trivial regex engine
+/// should support at least one of the available anchored modes, there is no
+/// singular mode that is guaranteed to be universally supported. Some regex
+/// engines might only support unanchored searches (DFAs compiled without
+/// anchored starting states) and some regex engines might only support
+/// anchored searches (like the one-pass DFA).
+///
+/// The specific error returned is a [`MatchError`] with a
+/// [`MatchErrorKind::UnsupportedAnchored`] kind. The kind includes the
+/// `Anchored` value given that is unsupported.
+///
+/// Note that regex engines should report "no match" if, for example, an
+/// `Anchored::Pattern` is provided with an invalid pattern ID _but_ where
+/// anchored searches for a specific pattern are supported. This smooths out
+/// behavior such that it's possible to guarantee that an error never occurs
+/// based on how the regex engine is configured. All regex engines in this
+/// crate report "no match" when searching for an invalid pattern ID, but where
+/// searching for a valid pattern ID is otherwise supported.
+///
+/// # Example
+///
+/// This example shows how to use the various `Anchored` modes to run a
+/// search. We use the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM)
+/// because it supports all modes unconditionally. Some regex engines, like
+/// the [`onepass::DFA`](crate::dfa::onepass::DFA), cannot support unanchored
+/// searches.
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{
+///     nfa::thompson::pikevm::PikeVM,
+///     Anchored, Input, Match, PatternID,
+/// };
+///
+/// let re = PikeVM::new_many(&[
+///     r"Mrs. \w+",
+///     r"Miss \w+",
+///     r"Mr. \w+",
+///     r"Ms. \w+",
+/// ])?;
+/// let mut cache = re.create_cache();
+/// let hay = "Hello Mr. Springsteen!";
+///
+/// // The default is to do an unanchored search.
+/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay));
+/// // Explicitly ask for an unanchored search. Same as above.
+/// let input = Input::new(hay).anchored(Anchored::No);
+/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
+///
+/// // Now try an anchored search. Since the match doesn't start at the
+/// // beginning of the haystack, no match is found!
+/// let input = Input::new(hay).anchored(Anchored::Yes);
+/// assert_eq!(None, re.find(&mut cache, input));
+///
+/// // We can try an anchored search again, but move the location of where
+/// // we start the search. Note that the offsets reported are still in
+/// // terms of the overall haystack and not relative to where we started
+/// // the search.
+/// let input = Input::new(hay).anchored(Anchored::Yes).range(6..);
+/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
+///
+/// // Now try an anchored search for a specific pattern.
We specifically
+/// // choose a pattern that we know doesn't match to prove that the search
+/// // only looks for the pattern we provide.
+/// let input = Input::new(hay)
+///     .anchored(Anchored::Pattern(PatternID::must(1)))
+///     .range(6..);
+/// assert_eq!(None, re.find(&mut cache, input));
+///
+/// // But if we switch it to the pattern that we know matches, then we find
+/// // the match.
+/// let input = Input::new(hay)
+///     .anchored(Anchored::Pattern(PatternID::must(2)))
+///     .range(6..);
+/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum Anchored {
+    /// Run an unanchored search. This means a match may occur anywhere at or
+    /// after the start position of the search.
+    ///
+    /// This search can return a match for any pattern in the regex.
+    No,
+    /// Run an anchored search. This means that a match must begin at the
+    /// start position of the search.
+    ///
+    /// This search can return a match for any pattern in the regex.
+    Yes,
+    /// Run an anchored search for a specific pattern. This means that a match
+    /// must be for the given pattern and must begin at the start position of
+    /// the search.
+    Pattern(PatternID),
+}
+
+impl Anchored {
+    /// Returns true if and only if this anchor mode corresponds to any kind of
+    /// anchored search.
+    ///
+    /// # Example
+    ///
+    /// This example shows that both `Anchored::Yes` and `Anchored::Pattern`
+    /// are considered anchored searches.
+    ///
+    /// ```
+    /// use regex_automata::{Anchored, PatternID};
+    ///
+    /// assert!(!Anchored::No.is_anchored());
+    /// assert!(Anchored::Yes.is_anchored());
+    /// assert!(Anchored::Pattern(PatternID::ZERO).is_anchored());
+    /// ```
+    #[inline]
+    pub fn is_anchored(&self) -> bool {
+        matches!(*self, Anchored::Yes | Anchored::Pattern(_))
+    }
+
+    /// Returns the pattern ID associated with this configuration if it is an
+    /// anchored search for a specific pattern. Otherwise `None` is returned.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{Anchored, PatternID};
+    ///
+    /// assert_eq!(None, Anchored::No.pattern());
+    /// assert_eq!(None, Anchored::Yes.pattern());
+    ///
+    /// let pid = PatternID::must(5);
+    /// assert_eq!(Some(pid), Anchored::Pattern(pid).pattern());
+    /// ```
+    #[inline]
+    pub fn pattern(&self) -> Option<PatternID> {
+        match *self {
+            Anchored::Pattern(pid) => Some(pid),
+            _ => None,
+        }
+    }
+}
+
+/// The kind of match semantics to use for a regex pattern.
+///
+/// The default match kind is `LeftmostFirst`, and this corresponds to the
+/// match semantics used by most backtracking engines, such as Perl.
+///
+/// # Leftmost first or "preference order" match semantics
+///
+/// Leftmost-first semantics determine which match to report when there are
+/// multiple paths through a regex that match at the same position. The tie is
+/// essentially broken by how a backtracker would behave. For example, consider
+/// running the regex `foofoofoo|foofoo|foo` on the haystack `foofoo`. In this
+/// case, both the `foofoo` and `foo` branches match at position `0`. So should
+/// the end of the match be `3` or `6`?
+///
+/// A backtracker will conceptually work by trying `foofoofoo` and failing.
+/// Then it will try `foofoo`, find the match and stop there. Thus, the
+/// leftmost-first match position is `6`.
This is called "leftmost-first" or +/// "preference order" because the order of the branches as written in the +/// regex pattern is what determines how to break the tie. +/// +/// (Note that leftmost-longest match semantics, which break ties by always +/// taking the longest matching string, are not currently supported by this +/// crate. These match semantics tend to be found in POSIX regex engines.) +/// +/// This example shows how leftmost-first semantics work, and how it even +/// applies to multi-pattern regexes: +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::pikevm::PikeVM, +/// Match, +/// }; +/// +/// let re = PikeVM::new_many(&[ +/// r"foofoofoo", +/// r"foofoo", +/// r"foo", +/// ])?; +/// let mut cache = re.create_cache(); +/// let got: Vec<Match> = re.find_iter(&mut cache, "foofoo").collect(); +/// let expected = vec![Match::must(1, 0..6)]; +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # All matches +/// +/// The `All` match semantics report any and all matches, and generally will +/// attempt to match as much as possible. It doesn't respect any sort of match +/// priority at all, so things like non-greedy matching don't work in this +/// mode. +/// +/// The fact that non-greedy matching doesn't work generally makes most forms +/// of unanchored non-overlapping searches have unintuitive behavior. Namely, +/// unanchored searches behave as if there is a `(?s-u:.)*?` prefix at the +/// beginning of the pattern, which is specifically non-greedy. Since it will +/// be treated as greedy in `All` match semantics, this generally means that +/// it will first attempt to consume all of the haystack and is likely to wind +/// up skipping matches. +/// +/// Generally speaking, `All` should only be used in two circumstances: +/// +/// * When running an anchored search and there is a desire to match as much as +/// possible. For example, when building a reverse regex matcher to find the +/// start of a match after finding the end. In this case, the reverse search +/// is anchored to the end of the match found by the forward search. +/// * When running overlapping searches. Since `All` encodes all possible +/// matches, this is generally what you want for an overlapping search. If you +/// try to use leftmost-first in an overlapping search, it is likely to produce +/// counter-intuitive results since leftmost-first specifically excludes some +/// matches from its underlying finite state machine. +/// +/// This example demonstrates the counter-intuitive behavior of `All` semantics +/// when using a standard leftmost unanchored search: +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::pikevm::PikeVM, +/// Match, MatchKind, +/// }; +/// +/// let re = PikeVM::builder() +/// .configure(PikeVM::config().match_kind(MatchKind::All)) +/// .build("foo")?; +/// let hay = "first foo second foo wat"; +/// let mut cache = re.create_cache(); +/// let got: Vec<Match> = re.find_iter(&mut cache, hay).collect(); +/// // Notice that it completely skips the first 'foo'! +/// let expected = vec![Match::must(0, 17..20)]; +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// This second example shows how `All` semantics are useful for an overlapping +/// search. Note that we use lower level lazy DFA APIs here since the NFA +/// engines only currently support a very limited form of overlapping search. 
+/// +/// ``` +/// use regex_automata::{ +/// hybrid::dfa::{DFA, OverlappingState}, +/// HalfMatch, Input, MatchKind, +/// }; +/// +/// let re = DFA::builder() +/// // If we didn't set 'All' semantics here, then the regex would only +/// // match 'foo' at offset 3 and nothing else. Why? Because the state +/// // machine implements preference order and knows that the 'foofoo' and +/// // 'foofoofoo' branches can never match since 'foo' will always match +/// // when they match and take priority. +/// .configure(DFA::config().match_kind(MatchKind::All)) +/// .build(r"foo|foofoo|foofoofoo")?; +/// let mut cache = re.create_cache(); +/// let mut state = OverlappingState::start(); +/// let input = Input::new("foofoofoo"); +/// let mut got = vec![]; +/// loop { +/// re.try_search_overlapping_fwd(&mut cache, &input, &mut state)?; +/// let m = match state.get_match() { +/// None => break, +/// Some(m) => m, +/// }; +/// got.push(m); +/// } +/// let expected = vec![ +/// HalfMatch::must(0, 3), +/// HalfMatch::must(0, 6), +/// HalfMatch::must(0, 9), +/// ]; +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[non_exhaustive] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum MatchKind { + /// Report all possible matches. + All, + /// Report only the leftmost matches. When multiple leftmost matches exist, + /// report the match corresponding to the part of the regex that appears + /// first in the syntax. + LeftmostFirst, + // There is prior art in RE2 that shows that we should be able to add + // LeftmostLongest too. The tricky part of it is supporting ungreedy + // repetitions. Instead of treating all NFA states as having equivalent + // priority (as in 'All') or treating all NFA states as having distinct + // priority based on order (as in 'LeftmostFirst'), we instead group NFA + // states into sets, and treat members of each set as having equivalent + // priority, but having greater priority than all following members + // of different sets. + // + // However, it's not clear whether it's really worth adding this. After + // all, leftmost-longest can be emulated when using literals by using + // leftmost-first and sorting the literals by length in descending order. + // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will + // always match `a` in `ab` when using leftmost-first, but leftmost-longest + // would match `ab`. +} + +impl MatchKind { + #[cfg(feature = "alloc")] + pub(crate) fn continue_past_first_match(&self) -> bool { + *self == MatchKind::All + } +} + +impl Default for MatchKind { + fn default() -> MatchKind { + MatchKind::LeftmostFirst + } +} + +/// An error indicating that a search stopped before reporting whether a +/// match exists or not. +/// +/// To be very clear, this error type implies that one cannot assume that no +/// matches occur, since the search stopped before completing. That is, if +/// you're looking for information about where a search determined that no +/// match can occur, then this error type does *not* give you that. (Indeed, at +/// the time of writing, if you need such a thing, you have to write your own +/// search routine.) +/// +/// Normally, when one searches for something, the response is either an +/// affirmative "it was found at this location" or a negative "not found at +/// all." However, in some cases, a regex engine can be configured to stop its +/// search before concluding whether a match exists or not. 
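A minimal sketch of how a caller might react to such an early stop by falling back to a different engine. The `fast_search` and `slow_search` functions are hypothetical stand-ins; only the `MatchError` constructors defined later in this diff are used:

```
use regex_automata::{HalfMatch, MatchError};

// Hypothetical "fast" engine that may stop early with a MatchError.
fn fast_search(_haystack: &[u8]) -> Result<Option<HalfMatch>, MatchError> {
    // Pretend the engine observed a configured quit byte at offset 3.
    Err(MatchError::quit(0xFF, 3))
}

// Hypothetical slower engine that always produces a definitive answer.
fn slow_search(_haystack: &[u8]) -> Option<HalfMatch> {
    Some(HalfMatch::must(0, 5))
}

fn main() {
    let haystack = b"some haystack";
    let result = match fast_search(haystack) {
        Ok(result) => result,
        // The fast engine quit or gave up; fall back to the slower engine.
        Err(_) => slow_search(haystack),
    };
    assert_eq!(Some(HalfMatch::must(0, 5)), result);
}
```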
When this happens, +/// it may be important for the caller to know why the regex engine gave up and +/// where in the input it gave up at. This error type exposes the 'why' and the +/// 'where.' +/// +/// For example, the DFAs provided by this library generally cannot correctly +/// implement Unicode word boundaries. Instead, they provide an option to +/// eagerly support them on ASCII text (since Unicode word boundaries are +/// equivalent to ASCII word boundaries when searching ASCII text), but will +/// "give up" if a non-ASCII byte is seen. In such cases, one is usually +/// required to either report the failure to the caller (unergonomic) or +/// otherwise fall back to some other regex engine (ergonomic, but potentially +/// costly). +/// +/// More generally, some regex engines offer the ability for callers to specify +/// certain bytes that will trigger the regex engine to automatically quit if +/// they are seen. +/// +/// Still yet, there may be other reasons for a failed match. For example, +/// the hybrid DFA provided by this crate can be configured to give up if it +/// believes that it is not efficient. This in turn permits callers to choose a +/// different regex engine. +/// +/// (Note that DFAs are configured by default to never quit or give up in this +/// fashion. For example, by default, a DFA will fail to build if the regex +/// pattern contains a Unicode word boundary. One needs to opt into the "quit" +/// behavior via options, like +/// [`hybrid::dfa::Config::unicode_word_boundary`](crate::hybrid::dfa::Config::unicode_word_boundary).) +/// +/// There are a couple other ways a search +/// can fail. For example, when using the +/// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker) +/// with a haystack that is too long, or trying to run an unanchored search +/// with a [one-pass DFA](crate::dfa::onepass). +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct MatchError( + #[cfg(feature = "alloc")] alloc::boxed::Box<MatchErrorKind>, + #[cfg(not(feature = "alloc"))] MatchErrorKind, +); + +impl MatchError { + /// Create a new error value with the given kind. + /// + /// This is a more verbose version of the kind-specific constructors, + /// e.g., `MatchError::quit`. + pub fn new(kind: MatchErrorKind) -> MatchError { + #[cfg(feature = "alloc")] + { + MatchError(alloc::boxed::Box::new(kind)) + } + #[cfg(not(feature = "alloc"))] + { + MatchError(kind) + } + } + + /// Returns a reference to the underlying error kind. + pub fn kind(&self) -> &MatchErrorKind { + &self.0 + } + + /// Create a new "quit" error. The given `byte` corresponds to the value + /// that tripped a search's quit condition, and `offset` corresponds to the + /// location in the haystack at which the search quit. + /// + /// This is the same as calling `MatchError::new` with a + /// [`MatchErrorKind::Quit`] kind. + pub fn quit(byte: u8, offset: usize) -> MatchError { + MatchError::new(MatchErrorKind::Quit { byte, offset }) + } + + /// Create a new "gave up" error. The given `offset` corresponds to the + /// location in the haystack at which the search gave up. + /// + /// This is the same as calling `MatchError::new` with a + /// [`MatchErrorKind::GaveUp`] kind. + pub fn gave_up(offset: usize) -> MatchError { + MatchError::new(MatchErrorKind::GaveUp { offset }) + } + + /// Create a new "haystack too long" error. The given `len` corresponds to + /// the length of the haystack that was problematic. 
+ /// + /// This is the same as calling `MatchError::new` with a + /// [`MatchErrorKind::HaystackTooLong`] kind. + pub fn haystack_too_long(len: usize) -> MatchError { + MatchError::new(MatchErrorKind::HaystackTooLong { len }) + } + + /// Create a new "unsupported anchored" error. This occurs when the caller + /// requests a search with an anchor mode that is not supported by the + /// regex engine. + /// + /// This is the same as calling `MatchError::new` with a + /// [`MatchErrorKind::UnsupportedAnchored`] kind. + pub fn unsupported_anchored(mode: Anchored) -> MatchError { + MatchError::new(MatchErrorKind::UnsupportedAnchored { mode }) + } +} + +/// The underlying kind of a [`MatchError`]. +/// +/// This is a **non-exhaustive** enum. That means new variants may be added in +/// a semver-compatible release. +#[non_exhaustive] +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum MatchErrorKind { + /// The search saw a "quit" byte at which it was instructed to stop + /// searching. + Quit { + /// The "quit" byte that was observed that caused the search to stop. + byte: u8, + /// The offset at which the quit byte was observed. + offset: usize, + }, + /// The search, based on heuristics, determined that it would be better + /// to stop, typically to provide the caller an opportunity to use an + /// alternative regex engine. + /// + /// Currently, the only way for this to occur is via the lazy DFA and + /// only when it is configured to do so (it will not return this error by + /// default). + GaveUp { + /// The offset at which the search stopped. This corresponds to the + /// position immediately following the last byte scanned. + offset: usize, + }, + /// This error occurs if the haystack given to the regex engine was too + /// long to be searched. This occurs, for example, with regex engines + /// like the bounded backtracker that have a configurable fixed amount of + /// capacity that is tied to the length of the haystack. Anything beyond + /// that configured limit will result in an error at search time. + HaystackTooLong { + /// The length of the haystack that exceeded the limit. + len: usize, + }, + /// An error indicating that a particular type of anchored search was + /// requested, but that the regex engine does not support it. + /// + /// Note that this error should not be returned by a regex engine simply + /// because the pattern ID is invalid (i.e., equal to or exceeds the number + /// of patterns in the regex). In that case, the regex engine should report + /// a non-match. + UnsupportedAnchored { + /// The anchored mode given that is unsupported. 
+        mode: Anchored,
+    },
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for MatchError {}
+
+impl core::fmt::Display for MatchError {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        match *self.kind() {
+            MatchErrorKind::Quit { byte, offset } => write!(
+                f,
+                "quit search after observing byte {:?} at offset {}",
+                DebugByte(byte),
+                offset,
+            ),
+            MatchErrorKind::GaveUp { offset } => {
+                write!(f, "gave up searching at offset {}", offset)
+            }
+            MatchErrorKind::HaystackTooLong { len } => {
+                write!(f, "haystack of length {} is too long", len)
+            }
+            MatchErrorKind::UnsupportedAnchored { mode: Anchored::Yes } => {
+                write!(f, "anchored searches are not supported or enabled")
+            }
+            MatchErrorKind::UnsupportedAnchored { mode: Anchored::No } => {
+                write!(f, "unanchored searches are not supported or enabled")
+            }
+            MatchErrorKind::UnsupportedAnchored {
+                mode: Anchored::Pattern(pid),
+            } => {
+                write!(
+                    f,
+                    "anchored searches for a specific pattern ({}) are \
+                     not supported or enabled",
+                    pid.as_usize(),
+                )
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // We test that our 'MatchError' type is the size we expect. This isn't an
+    // API guarantee, but if the size increases, we really want to make sure we
+    // decide to do that intentionally. So this should be a speed bump. And in
+    // general, we should not increase the size without a very good reason.
+    //
+    // Why? Because low level search APIs return Result<.., MatchError>. When
+    // MatchError gets bigger, so too does the Result type.
+    //
+    // Now, when 'alloc' is enabled, we do box the error, which de-emphasizes
+    // the importance of keeping a small error type. But without 'alloc', we
+    // still want things to be small.
+    #[test]
+    fn match_error_size() {
+        let expected_size = if cfg!(feature = "alloc") {
+            core::mem::size_of::<usize>()
+        } else {
+            2 * core::mem::size_of::<usize>()
+        };
+        assert_eq!(expected_size, core::mem::size_of::<MatchError>());
+    }
+
+    // Same as above, but for the underlying match error kind.
+    #[cfg(target_pointer_width = "64")]
+    #[test]
+    fn match_error_kind_size() {
+        let expected_size = 2 * core::mem::size_of::<usize>();
+        assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>());
+    }
+
+    #[cfg(target_pointer_width = "32")]
+    #[test]
+    fn match_error_kind_size() {
+        let expected_size = 3 * core::mem::size_of::<usize>();
+        assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>());
+    }
+}
diff --git a/vendor/regex-automata/src/util/sparse_set.rs b/vendor/regex-automata/src/util/sparse_set.rs
index bf59e4469..cbaa0b6f4 100644
--- a/vendor/regex-automata/src/util/sparse_set.rs
+++ b/vendor/regex-automata/src/util/sparse_set.rs
@@ -1,6 +1,23 @@
-use alloc::{boxed::Box, vec, vec::Vec};
+/*!
+This module defines a sparse set data structure. Its most interesting
+properties are:

-use crate::util::id::StateID;

+* They preserve insertion order.
+* Set membership testing is done in constant time.
+* Set insertion is done in constant time.
+* Clearing the set is done in constant time.
+
+The cost for doing this is that the capacity of the set needs to be known up
+front, and the elements in the set are limited to state identifiers.
+
+These sets are principally used when traversing an NFA state graph. This
+happens at search time, for example, in the PikeVM. It also happens during DFA
+determinization.
+*/
+
+use alloc::{vec, vec::Vec};
+
+use crate::util::primitives::StateID;

/// A pair of sparse sets.
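A standalone sketch of the sparse-set technique described in the module docs above. It is not the crate's internal `SparseSet` (which is keyed by `StateID`), but it follows the same membership invariant documented on the `sparse` field below:

```
// Capacity must be known up front; ids must be < capacity.
struct SparseSet {
    len: usize,
    dense: Vec<usize>,
    sparse: Vec<usize>,
}

impl SparseSet {
    fn new(capacity: usize) -> SparseSet {
        SparseSet { len: 0, dense: vec![0; capacity], sparse: vec![0; capacity] }
    }

    // O(1) insertion that preserves insertion order in 'dense'.
    fn insert(&mut self, id: usize) -> bool {
        if self.contains(id) {
            return false;
        }
        self.dense[self.len] = id;
        self.sparse[id] = self.len;
        self.len += 1;
        true
    }

    // O(1) membership: an id is present iff sparse[id] < len and
    // dense[sparse[id]] == id, so stale entries are never misread.
    fn contains(&self, id: usize) -> bool {
        let i = self.sparse[id];
        i < self.len && self.dense[i] == id
    }

    // O(1) clear: resetting the length is enough.
    fn clear(&mut self) {
        self.len = 0;
    }

    // Iteration yields ids in the order they were inserted.
    fn iter(&self) -> impl Iterator<Item = usize> + '_ {
        self.dense[..self.len].iter().copied()
    }
}

fn main() {
    let mut set = SparseSet::new(10);
    assert!(set.insert(7));
    assert!(set.insert(2));
    assert!(!set.insert(7));
    assert!(set.contains(2) && !set.contains(3));
    assert_eq!(vec![7, 2], set.iter().collect::<Vec<usize>>());
    set.clear();
    assert!(!set.contains(7));
}
```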
/// @@ -79,7 +96,12 @@ pub(crate) struct SparseSet { /// Sparse maps ids to their location in dense. /// /// A state ID is in the set if and only if - /// sparse[id] < dense.len() && id == dense[sparse[id]]. + /// sparse[id] < len && id == dense[sparse[id]]. + /// + /// Note that these are indices into 'dense'. It's a little weird to use + /// StateID here, but we know our length can never exceed the bounds of + /// StateID (enforced by 'resize') and StateID will be at most 4 bytes + /// where as a usize is likely double that in most cases. sparse: Vec<StateID>, } @@ -146,9 +168,9 @@ impl SparseSet { /// /// This is marked as inline(always) since the compiler won't inline it /// otherwise, and it's a fairly hot piece of code in DFA determinization. - #[inline(always)] - pub(crate) fn insert(&mut self, value: StateID) -> bool { - if self.contains(value) { + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn insert(&mut self, id: StateID) -> bool { + if self.contains(id) { return false; } @@ -158,30 +180,22 @@ impl SparseSet { "{:?} exceeds capacity of {:?} when inserting {:?}", i, self.capacity(), - value, + id, ); // OK since i < self.capacity() and self.capacity() is guaranteed to // be <= StateID::LIMIT. - let id = StateID::new_unchecked(i); - self.dense[id] = value; - self.sparse[value] = id; + let index = StateID::new_unchecked(i); + self.dense[index] = id; + self.sparse[id] = index; self.len += 1; true } /// Returns true if and only if this set contains the given value. #[inline] - pub(crate) fn contains(&self, value: StateID) -> bool { - let i = self.sparse[value]; - i.as_usize() < self.len() && self.dense[i] == value - } - - /// Returns the ith inserted element from this set. - /// - /// Panics when i >= self.len(). - #[inline] - pub(crate) fn get(&self, i: usize) -> StateID { - self.dense[i] + pub(crate) fn contains(&self, id: StateID) -> bool { + let index = self.sparse[id]; + index.as_usize() < self.len() && self.dense[index] == id } /// Clear this set such that it has no members. @@ -190,16 +204,21 @@ impl SparseSet { self.len = 0; } + #[inline] + pub(crate) fn iter(&self) -> SparseSetIter<'_> { + SparseSetIter(self.dense[..self.len()].iter()) + } + /// Returns the heap memory usage, in bytes, used by this sparse set. 
#[inline] pub(crate) fn memory_usage(&self) -> usize { - 2 * self.dense.len() * StateID::SIZE + self.dense.len() * StateID::SIZE + self.sparse.len() * StateID::SIZE } } impl core::fmt::Debug for SparseSet { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - let elements: Vec<StateID> = self.into_iter().collect(); + let elements: Vec<StateID> = self.iter().collect(); f.debug_tuple("SparseSet").field(&elements).finish() } } @@ -210,20 +229,11 @@ impl core::fmt::Debug for SparseSet { #[derive(Debug)] pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>); -impl<'a> IntoIterator for &'a SparseSet { - type Item = StateID; - type IntoIter = SparseSetIter<'a>; - - fn into_iter(self) -> Self::IntoIter { - SparseSetIter(self.dense[..self.len()].iter()) - } -} - impl<'a> Iterator for SparseSetIter<'a> { type Item = StateID; - #[inline(always)] + #[cfg_attr(feature = "perf-inline", inline(always))] fn next(&mut self) -> Option<StateID> { - self.0.next().map(|value| *value) + self.0.next().map(|&id| id) } } diff --git a/vendor/regex-automata/src/util/start.rs b/vendor/regex-automata/src/util/start.rs index 3c756fc26..4e360d083 100644 --- a/vendor/regex-automata/src/util/start.rs +++ b/vendor/regex-automata/src/util/start.rs @@ -1,21 +1,186 @@ -/// Represents the four possible starting configurations of a DFA search. +/*! +Provides some helpers for dealing with start state configurations in DFAs. + +[`Start`] represents the possible starting configurations, while +[`StartByteMap`] represents a way to retrieve the `Start` configuration for a +given position in a haystack. +*/ + +use crate::util::{ + look::LookMatcher, + search::Input, + wire::{self, DeserializeError, SerializeError}, +}; + +/// A map from every possible byte value to its corresponding starting +/// configuration. /// -/// The starting configuration is determined by inspecting the the beginning of -/// the haystack (up to 1 byte). Ultimately, this along with a pattern ID (if -/// specified) is what selects the start state to use in a DFA. +/// This map is used in order to lookup the start configuration for a particular +/// position in a haystack. This start configuration is then used in +/// combination with things like the anchored mode and pattern ID to fully +/// determine the start state. /// -/// In a DFA that doesn't have starting states for each pattern, then it will -/// have a maximum of four DFA start states. If the DFA was compiled with start -/// states for each pattern, then it will have a maximum of four DFA start -/// states for searching for any pattern, and then another maximum of four DFA -/// start states for executing an anchored search for each pattern. +/// Generally speaking, this map is only used for fully compiled DFAs and lazy +/// DFAs. For NFAs (including the one-pass DFA), the start state is generally +/// selected by virtue of traversing the NFA state graph. DFAs do the same +/// thing, but at build time and not search time. (Well, technically the lazy +/// DFA does it at search time, but it does enough work to cache the full +/// result of the epsilon closure that the NFA engines tend to need to do.) +#[derive(Clone)] +pub(crate) struct StartByteMap { + map: [Start; 256], +} + +impl StartByteMap { + /// Create a new map from byte values to their corresponding starting + /// configurations. The map is determined, in part, by how look-around + /// assertions are matched via the matcher given. 
+ pub(crate) fn new(lookm: &LookMatcher) -> StartByteMap { + let mut map = [Start::NonWordByte; 256]; + map[usize::from(b'\n')] = Start::LineLF; + map[usize::from(b'\r')] = Start::LineCR; + map[usize::from(b'_')] = Start::WordByte; + + let mut byte = b'0'; + while byte <= b'9' { + map[usize::from(byte)] = Start::WordByte; + byte += 1; + } + byte = b'A'; + while byte <= b'Z' { + map[usize::from(byte)] = Start::WordByte; + byte += 1; + } + byte = b'a'; + while byte <= b'z' { + map[usize::from(byte)] = Start::WordByte; + byte += 1; + } + + let lineterm = lookm.get_line_terminator(); + // If our line terminator is normal, then it is already handled by + // the LineLF and LineCR configurations. But if it's weird, then we + // overwrite whatever was there before for that terminator with a + // special configuration. The trick here is that if the terminator + // is, say, a word byte like `a`, then callers seeing this start + // configuration need to account for that and build their DFA state as + // if it *also* came from a word byte. + if lineterm != b'\r' && lineterm != b'\n' { + map[usize::from(lineterm)] = Start::CustomLineTerminator; + } + StartByteMap { map } + } + + /// Return the forward starting configuration for the given `input`. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn fwd(&self, input: &Input) -> Start { + match input + .start() + .checked_sub(1) + .and_then(|i| input.haystack().get(i)) + { + None => Start::Text, + Some(&byte) => self.get(byte), + } + } + + /// Return the reverse starting configuration for the given `input`. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn rev(&self, input: &Input) -> Start { + match input.haystack().get(input.end()) { + None => Start::Text, + Some(&byte) => self.get(byte), + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn get(&self, byte: u8) -> Start { + self.map[usize::from(byte)] + } + + /// Deserializes a byte class map from the given slice. If the slice is of + /// insufficient length or otherwise contains an impossible mapping, then + /// an error is returned. Upon success, the number of bytes read along with + /// the map are returned. The number of bytes read is always a multiple of + /// 8. + pub(crate) fn from_bytes( + slice: &[u8], + ) -> Result<(StartByteMap, usize), DeserializeError> { + wire::check_slice_len(slice, 256, "start byte map")?; + let mut map = [Start::NonWordByte; 256]; + for (i, &repr) in slice[..256].iter().enumerate() { + map[i] = match Start::from_usize(usize::from(repr)) { + Some(start) => start, + None => { + return Err(DeserializeError::generic( + "found invalid starting configuration", + )) + } + }; + } + Ok((StartByteMap { map }, 256)) + } + + /// Writes this map to the given byte buffer. if the given buffer is too + /// small, then an error is returned. Upon success, the total number of + /// bytes written is returned. The number of bytes written is guaranteed to + /// be a multiple of 8. + pub(crate) fn write_to( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("start byte map")); + } + for (i, &start) in self.map.iter().enumerate() { + dst[i] = start.as_u8(); + } + Ok(nwrite) + } + + /// Returns the total number of bytes written by `write_to`. 
+    pub(crate) fn write_to_len(&self) -> usize {
+        256
+    }
+}
+
+impl core::fmt::Debug for StartByteMap {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        use crate::util::escape::DebugByte;
+
+        write!(f, "StartByteMap{{")?;
+        for byte in 0..=255 {
+            if byte > 0 {
+                write!(f, ", ")?;
+            }
+            let start = self.map[usize::from(byte)];
+            write!(f, "{:?} => {:?}", DebugByte(byte), start)?;
+        }
+        write!(f, "}}")?;
+        Ok(())
+    }
+}
+
+/// Represents the six possible starting configurations of a DFA search.
+///
+/// The starting configuration is determined by inspecting the beginning
+/// of the haystack (up to 1 byte). Ultimately, this along with a pattern ID
+/// (if specified) and the type of search (anchored or not) is what selects the
+/// start state to use in a DFA.
 ///
-/// This ends up being represented as a table in the DFA (whether lazy or fully
-/// built) where the stride of that table is 4, and each entry is an index into
-/// the state transition table. Note though that multiple entries in the table
-/// might point to the same state if the states would otherwise be equivalent.
-/// (This is guaranteed by DFA minimization and may even be accomplished by
-/// normal determinization, since it attempts to reuse equivalent states too.)
+/// As one example, if a DFA only supports unanchored searches and does not
+/// support anchored searches for each pattern, then it will have at most 6
+/// distinct start states. (Some start states may be reused if determinization
+/// can determine that they will be equivalent.) If the DFA supports both
+/// anchored and unanchored searches, then it will have a maximum of 12
+/// distinct start states. Finally, if the DFA also supports anchored searches
+/// for each pattern, then it can have up to `12 + (N * 6)` start states, where
+/// `N` is the number of patterns.
+///
+/// Handling each of these starting configurations in the context of DFA
+/// determinization can be *quite* tricky and subtle. But the code is small
+/// and can be found at `crate::util::determinize::set_lookbehind_from_start`.
 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
 pub(crate) enum Start {
     /// This occurs when the starting position is not any of the ones below.
@@ -28,7 +193,20 @@ pub(crate) enum Start {
     Text = 2,
     /// This occurs when the byte immediately preceding the start of the search
     /// is a line terminator. Specifically, `\n`.
-    Line = 3,
+    LineLF = 3,
+    /// This occurs when the byte immediately preceding the start of the search
+    /// is a line terminator. Specifically, `\r`.
+    LineCR = 4,
+    /// This occurs when a custom line terminator has been set via a
+    /// `LookMatcher`, and when that line terminator is neither a `\r` or a
+    /// `\n`.
+    ///
+    /// If the custom line terminator is a word byte, then this start
+    /// configuration is still selected. DFAs that implement word boundary
+    /// assertions will likely need to check whether the custom line terminator
+    /// is a word byte, in which case, it should behave as if the byte
+    /// satisfies `\b` in addition to multi-line anchors.
+    CustomLineTerminator = 5,
 }

 impl Start {
@@ -39,71 +217,90 @@ impl Start {
             0 => Some(Start::NonWordByte),
             1 => Some(Start::WordByte),
             2 => Some(Start::Text),
-            3 => Some(Start::Line),
+            3 => Some(Start::LineLF),
+            4 => Some(Start::LineCR),
+            5 => Some(Start::CustomLineTerminator),
             _ => None,
         }
     }

     /// Returns the total number of starting state configurations.
- pub(crate) fn count() -> usize { - 4 - } - - /// Returns the starting state configuration for the given search - /// parameters. If the given offset range is not valid, then this panics. - #[inline(always)] - pub(crate) fn from_position_fwd( - bytes: &[u8], - start: usize, - end: usize, - ) -> Start { - assert!( - bytes.get(start..end).is_some(), - "{}..{} is invalid", - start, - end - ); - if start == 0 { - Start::Text - } else if bytes[start - 1] == b'\n' { - Start::Line - } else if crate::util::is_word_byte(bytes[start - 1]) { - Start::WordByte - } else { - Start::NonWordByte - } + pub(crate) fn len() -> usize { + 6 } - /// Returns the starting state configuration for a reverse search with the - /// given search parameters. If the given offset range is not valid, then - /// this panics. - #[inline(always)] - pub(crate) fn from_position_rev( - bytes: &[u8], - start: usize, - end: usize, - ) -> Start { - assert!( - bytes.get(start..end).is_some(), - "{}..{} is invalid", - start, - end - ); - if end == bytes.len() { - Start::Text - } else if bytes[end] == b'\n' { - Start::Line - } else if crate::util::is_word_byte(bytes[end]) { - Start::WordByte - } else { - Start::NonWordByte - } + /// Return this starting configuration as `u8` integer. It is guaranteed to + /// be less than `Start::len()`. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn as_u8(&self) -> u8 { + // AFAIK, 'as' is the only way to zero-cost convert an int enum to an + // actual int. + *self as u8 } - /// Return this starting configuration as an integer. It is guaranteed to - /// be less than `Start::count()`. - #[inline(always)] + /// Return this starting configuration as a `usize` integer. It is + /// guaranteed to be less than `Start::len()`. + #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn as_usize(&self) -> usize { - *self as usize + usize::from(self.as_u8()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn start_fwd_done_range() { + let smap = StartByteMap::new(&LookMatcher::default()); + assert_eq!(Start::Text, smap.fwd(&Input::new("").range(1..0))); + } + + #[test] + fn start_rev_done_range() { + let smap = StartByteMap::new(&LookMatcher::default()); + assert_eq!(Start::Text, smap.rev(&Input::new("").range(1..0))); + } + + #[test] + fn start_fwd() { + let f = |haystack, start, end| { + let smap = StartByteMap::new(&LookMatcher::default()); + let input = &Input::new(haystack).range(start..end); + smap.fwd(input) + }; + + assert_eq!(Start::Text, f("", 0, 0)); + assert_eq!(Start::Text, f("abc", 0, 3)); + assert_eq!(Start::Text, f("\nabc", 0, 3)); + + assert_eq!(Start::LineLF, f("\nabc", 1, 3)); + + assert_eq!(Start::LineCR, f("\rabc", 1, 3)); + + assert_eq!(Start::WordByte, f("abc", 1, 3)); + + assert_eq!(Start::NonWordByte, f(" abc", 1, 3)); + } + + #[test] + fn start_rev() { + let f = |haystack, start, end| { + let smap = StartByteMap::new(&LookMatcher::default()); + let input = &Input::new(haystack).range(start..end); + smap.rev(input) + }; + + assert_eq!(Start::Text, f("", 0, 0)); + assert_eq!(Start::Text, f("abc", 0, 3)); + assert_eq!(Start::Text, f("abc\n", 0, 4)); + + assert_eq!(Start::LineLF, f("abc\nz", 0, 3)); + + assert_eq!(Start::LineCR, f("abc\rz", 0, 3)); + + assert_eq!(Start::WordByte, f("abc", 0, 2)); + + assert_eq!(Start::NonWordByte, f("abc ", 0, 3)); } } diff --git a/vendor/regex-automata/src/util/syntax.rs b/vendor/regex-automata/src/util/syntax.rs index 88beeee75..78e3cf9a1 100644 --- a/vendor/regex-automata/src/util/syntax.rs +++ 
b/vendor/regex-automata/src/util/syntax.rs
@@ -1,4 +1,132 @@
-use regex_syntax::ParserBuilder;
+/*!
+Utilities for dealing with the syntax of a regular expression.
+
+This module currently only exposes a [`Config`] type that
+itself represents a wrapper around the configuration for a
+[`regex-syntax::ParserBuilder`](regex_syntax::ParserBuilder). The purpose of
+this wrapper is to make configuring syntax options very similar to how other
+configuration is done throughout this crate. Namely, instead of duplicating
+syntax options across every builder (of which there are many), we instead
+create small config objects like this one that can be passed around and
+composed.
+*/
+
+use alloc::{vec, vec::Vec};
+
+use regex_syntax::{
+    ast,
+    hir::{self, Hir},
+    Error, ParserBuilder,
+};
+
+/// A convenience routine for parsing a pattern into an HIR value with the
+/// default configuration.
+///
+/// # Example
+///
+/// This shows how to parse a pattern into an HIR value:
+///
+/// ```
+/// use regex_automata::util::syntax;
+///
+/// let hir = syntax::parse(r"([a-z]+)|([0-9]+)")?;
+/// assert_eq!(Some(1), hir.properties().static_explicit_captures_len());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+pub fn parse(pattern: &str) -> Result<Hir, Error> {
+    parse_with(pattern, &Config::default())
+}
+
+/// A convenience routine for parsing many patterns into HIR values with the
+/// default configuration.
+///
+/// # Example
+///
+/// This shows how to parse many patterns into corresponding HIR values:
+///
+/// ```
+/// use {
+///     regex_automata::util::syntax,
+///     regex_syntax::hir::Properties,
+/// };
+///
+/// let hirs = syntax::parse_many(&[
+///     r"([a-z]+)|([0-9]+)",
+///     r"foo([A-Z]+)bar",
+/// ])?;
+/// let props = Properties::union(hirs.iter().map(|h| h.properties()));
+/// assert_eq!(Some(1), props.static_explicit_captures_len());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+pub fn parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error> {
+    parse_many_with(patterns, &Config::default())
+}
+
+/// A convenience routine for parsing a pattern into an HIR value using a
+/// `Config`.
+///
+/// # Example
+///
+/// This shows how to parse a pattern into an HIR value with a non-default
+/// configuration:
+///
+/// ```
+/// use regex_automata::util::syntax;
+///
+/// let hir = syntax::parse_with(
+///     r"^[a-z]+$",
+///     &syntax::Config::new().multi_line(true).crlf(true),
+/// )?;
+/// assert!(hir.properties().look_set().contains_anchor_crlf());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
pub fn parse_with(pattern: &str, config: &Config) -> Result<Hir, Error> {
+    let mut builder = ParserBuilder::new();
+    config.apply(&mut builder);
+    builder.build().parse(pattern)
+}
+
+/// A convenience routine for parsing many patterns into HIR values using a
+/// `Config`.
+///
+/// # Example
+///
+/// This shows how to parse many patterns into corresponding HIR values
+/// with a non-default configuration:
+///
+/// ```
+/// use {
+///     regex_automata::util::syntax,
+///     regex_syntax::hir::Properties,
+/// };
+///
+/// let patterns = &[
+///     r"([a-z]+)|([0-9]+)",
+///     r"\W",
+///     r"foo([A-Z]+)bar",
+/// ];
+/// let config = syntax::Config::new().unicode(false).utf8(false);
+/// let hirs = syntax::parse_many_with(patterns, &config)?;
+/// let props = Properties::union(hirs.iter().map(|h| h.properties()));
+/// assert!(!props.is_utf8());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+pub fn parse_many_with<P: AsRef<str>>(
+    patterns: &[P],
+    config: &Config,
+) -> Result<Vec<Hir>, Error> {
+    let mut builder = ParserBuilder::new();
+    config.apply(&mut builder);
+    let mut hirs = vec![];
+    for p in patterns.iter() {
+        hirs.push(builder.build().parse(p.as_ref())?);
+    }
+    Ok(hirs)
+}

 /// A common set of configuration options that apply to the syntax of a regex.
 ///
@@ -14,10 +142,12 @@ use regex_syntax::ParserBuilder;
 /// in this crate. Instead of re-defining them on every engine's builder, they
 /// are instead provided here as one cohesive unit.
 #[derive(Clone, Copy, Debug)]
-pub struct SyntaxConfig {
+pub struct Config {
     case_insensitive: bool,
     multi_line: bool,
     dot_matches_new_line: bool,
+    crlf: bool,
+    line_terminator: u8,
     swap_greed: bool,
     ignore_whitespace: bool,
     unicode: bool,
@@ -26,14 +156,16 @@ pub struct SyntaxConfig {
     octal: bool,
 }

-impl SyntaxConfig {
+impl Config {
     /// Return a new default syntax configuration.
-    pub fn new() -> SyntaxConfig {
+    pub fn new() -> Config {
         // These defaults match the ones used in regex-syntax.
-        SyntaxConfig {
+        Config {
             case_insensitive: false,
             multi_line: false,
             dot_matches_new_line: false,
+            crlf: false,
+            line_terminator: b'\n',
             swap_greed: false,
             ignore_whitespace: false,
             unicode: true,
@@ -51,7 +183,7 @@ impl SyntaxConfig {
     ///
     /// By default this is disabled. It may alternatively be selectively
     /// enabled in the regular expression itself via the `i` flag.
-    pub fn case_insensitive(mut self, yes: bool) -> SyntaxConfig {
+    pub fn case_insensitive(mut self, yes: bool) -> Config {
         self.case_insensitive = yes;
         self
     }
@@ -66,7 +198,7 @@ impl SyntaxConfig {
     ///
     /// By default this is disabled. It may alternatively be selectively
     /// enabled in the regular expression itself via the `m` flag.
-    pub fn multi_line(mut self, yes: bool) -> SyntaxConfig {
+    pub fn multi_line(mut self, yes: bool) -> Config {
         self.multi_line = yes;
         self
     }
@@ -77,7 +209,7 @@ impl SyntaxConfig {
     /// then `.` will match any character except for a new line character.
     ///
     /// Note that `.` is impacted by whether the "unicode" setting is enabled
-    /// or not. When Unicode is enabled (the defualt), `.` will match any UTF-8
+    /// or not. When Unicode is enabled (the default), `.` will match any UTF-8
     /// encoding of any Unicode scalar value (sans a new line, depending on
     /// whether this "dot matches new line" option is enabled). When Unicode
     /// mode is disabled, `.` will match any byte instead. Because of this,
@@ -87,11 +219,53 @@ impl SyntaxConfig {
     ///
     /// By default this is disabled. It may alternatively be selectively
     /// enabled in the regular expression itself via the `s` flag.
-    pub fn dot_matches_new_line(mut self, yes: bool) -> SyntaxConfig {
+    pub fn dot_matches_new_line(mut self, yes: bool) -> Config {
         self.dot_matches_new_line = yes;
         self
     }

+    /// Enable or disable the "CRLF mode" flag by default.
+ /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `R` flag. + /// + /// When CRLF mode is enabled, the following happens: + /// + /// * Unless `dot_matches_new_line` is enabled, `.` will match any character + /// except for `\r` and `\n`. + /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`, + /// `\r` and `\n` as line terminators. And in particular, neither will + /// match between a `\r` and a `\n`. + pub fn crlf(mut self, yes: bool) -> Config { + self.crlf = yes; + self + } + + /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. + /// + /// Namely, instead of `.` (by default) matching everything except for `\n`, + /// this will cause `.` to match everything except for the byte given. + /// + /// If `.` is used in a context where Unicode mode is enabled and this byte + /// isn't ASCII, then an error will be returned. When Unicode mode is + /// disabled, then any byte is permitted, but will return an error if UTF-8 + /// mode is enabled and it is a non-ASCII byte. + /// + /// In short, any ASCII value for a line terminator is always okay. But a + /// non-ASCII byte might result in an error depending on whether Unicode + /// mode or UTF-8 mode are enabled. + /// + /// Note that if `R` mode is enabled then it always takes precedence and + /// the line terminator will be treated as `\r` and `\n` simultaneously. + /// + /// Note also that this *doesn't* impact the look-around assertions + /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional + /// configuration in the regex engine itself. + pub fn line_terminator(mut self, byte: u8) -> Config { + self.line_terminator = byte; + self + } + /// Enable or disable the "swap greed" flag by default. /// /// When this is enabled, `.*` (for example) will become ungreedy and `.*?` @@ -99,7 +273,7 @@ impl SyntaxConfig { /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `U` flag. - pub fn swap_greed(mut self, yes: bool) -> SyntaxConfig { + pub fn swap_greed(mut self, yes: bool) -> Config { self.swap_greed = yes; self } @@ -112,7 +286,7 @@ impl SyntaxConfig { /// /// By default, this is disabled. It may be selectively enabled in the /// regular expression by using the `x` flag regardless of this setting. - pub fn ignore_whitespace(mut self, yes: bool) -> SyntaxConfig { + pub fn ignore_whitespace(mut self, yes: bool) -> Config { self.ignore_whitespace = yes; self } @@ -131,7 +305,7 @@ impl SyntaxConfig { /// time. This is especially noticeable if your regex contains character /// classes like `\w` that are impacted by whether Unicode is enabled or /// not. If Unicode is not necessary, you are encouraged to disable it. - pub fn unicode(mut self, yes: bool) -> SyntaxConfig { + pub fn unicode(mut self, yes: bool) -> Config { self.unicode = yes; self } @@ -139,7 +313,7 @@ impl SyntaxConfig { /// When disabled, the builder will permit the construction of a regular /// expression that may match invalid UTF-8. /// - /// For example, when [`SyntaxConfig::unicode`] is disabled, then + /// For example, when [`Config::unicode`] is disabled, then /// expressions like `[^a]` may match invalid UTF-8 since they can match /// any single byte that is not `a`. 
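A minimal sketch of the interaction described here between the `unicode`/`utf8` settings and byte-oriented classes, using only `parse`/`parse_with` and the `Properties::is_utf8` check that already appear in this diff:

```
use regex_automata::util::syntax;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // With Unicode and UTF-8 modes disabled, `[^a]` may match any single
    // byte other than `a`, including bytes that never occur in valid UTF-8.
    let config = syntax::Config::new().unicode(false).utf8(false);
    let hir = syntax::parse_with(r"[^a]", &config)?;
    assert!(!hir.properties().is_utf8());

    // With the default configuration, the same pattern is Unicode-aware and
    // the resulting HIR is guaranteed to match only valid UTF-8.
    assert!(syntax::parse(r"[^a]")?.properties().is_utf8());
    Ok(())
}
```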
By default, these sub-expressions /// are disallowed to avoid returning offsets that split a UTF-8 @@ -150,7 +324,7 @@ impl SyntaxConfig { /// When enabled (the default), the builder is guaranteed to produce a /// regex that will only ever match valid UTF-8 (otherwise, the builder /// will return an error). - pub fn utf8(mut self, yes: bool) -> SyntaxConfig { + pub fn utf8(mut self, yes: bool) -> Config { self.utf8 = yes; self } @@ -171,7 +345,7 @@ impl SyntaxConfig { /// if callers want to put a limit on the amount of heap space used, then /// they should impose a limit on the length, in bytes, of the concrete /// pattern string. In particular, this is viable since the parser will - /// limit itself to heap space proportional to the lenth of the pattern + /// limit itself to heap space proportional to the length of the pattern /// string. /// /// Note that a nest limit of `0` will return a nest limit error for most @@ -180,7 +354,7 @@ impl SyntaxConfig { /// in a nest depth of `1`. In general, a nest limit is not something that /// manifests in an obvious way in the concrete syntax, therefore, it /// should not be used in a granular way. - pub fn nest_limit(mut self, limit: u32) -> SyntaxConfig { + pub fn nest_limit(mut self, limit: u32) -> Config { self.nest_limit = limit; self } @@ -200,7 +374,7 @@ impl SyntaxConfig { /// message will explicitly mention that backreferences aren't supported. /// /// Octal syntax is disabled by default. - pub fn octal(mut self, yes: bool) -> SyntaxConfig { + pub fn octal(mut self, yes: bool) -> Config { self.octal = yes; self } @@ -225,6 +399,16 @@ impl SyntaxConfig { self.dot_matches_new_line } + /// Returns whether "CRLF" mode is enabled. + pub fn get_crlf(&self) -> bool { + self.crlf + } + + /// Returns the line terminator in this syntax configuration. + pub fn get_line_terminator(&self) -> u8 { + self.line_terminator + } + /// Returns whether "swap greed" mode is enabled. pub fn get_swap_greed(&self) -> bool { self.swap_greed @@ -257,16 +441,42 @@ impl SyntaxConfig { .case_insensitive(self.case_insensitive) .multi_line(self.multi_line) .dot_matches_new_line(self.dot_matches_new_line) + .crlf(self.crlf) + .line_terminator(self.line_terminator) .swap_greed(self.swap_greed) .ignore_whitespace(self.ignore_whitespace) - .allow_invalid_utf8(!self.utf8) + .utf8(self.utf8) + .nest_limit(self.nest_limit) + .octal(self.octal); + } + + /// Applies this configuration to the given AST parser. + pub(crate) fn apply_ast(&self, builder: &mut ast::parse::ParserBuilder) { + builder + .ignore_whitespace(self.ignore_whitespace) .nest_limit(self.nest_limit) .octal(self.octal); } + + /// Applies this configuration to the given AST-to-HIR translator. 
+ pub(crate) fn apply_hir( + &self, + builder: &mut hir::translate::TranslatorBuilder, + ) { + builder + .unicode(self.unicode) + .case_insensitive(self.case_insensitive) + .multi_line(self.multi_line) + .crlf(self.crlf) + .dot_matches_new_line(self.dot_matches_new_line) + .line_terminator(self.line_terminator) + .swap_greed(self.swap_greed) + .utf8(self.utf8); + } } -impl Default for SyntaxConfig { - fn default() -> SyntaxConfig { - SyntaxConfig::new() +impl Default for Config { + fn default() -> Config { + Config::new() } } diff --git a/vendor/regex-automata/src/util/unicode_data/mod.rs b/vendor/regex-automata/src/util/unicode_data/mod.rs new file mode 100644 index 000000000..fc7b1c738 --- /dev/null +++ b/vendor/regex-automata/src/util/unicode_data/mod.rs @@ -0,0 +1,17 @@ +// This cfg should match the one in src/util/look.rs that uses perl_word. +#[cfg(all( + // We have to explicitly want to support Unicode word boundaries. + feature = "unicode-word-boundary", + not(all( + // If we don't have regex-syntax at all, then we definitely need to + // bring our own \w data table. + feature = "syntax", + // If unicode-perl is enabled, then regex-syntax/unicode-perl is + // also enabled, which in turn means we can use regex-syntax's + // is_word_character routine (and thus use its data tables). But if + // unicode-perl is not enabled, even if syntax is, then we need to + // bring our own. + feature = "unicode-perl", + )), +))] +pub(crate) mod perl_word; diff --git a/vendor/regex-automata/src/util/unicode_data/perl_word.rs b/vendor/regex-automata/src/util/unicode_data/perl_word.rs new file mode 100644 index 000000000..74d62656f --- /dev/null +++ b/vendor/regex-automata/src/util/unicode_data/perl_word.rs @@ -0,0 +1,781 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate perl-word tmp/ucd-15.0.0/ --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.15 is available on crates.io. 
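The `PERL_WORD` table that follows is a sorted sequence of non-overlapping, inclusive codepoint ranges, so a membership test is just a binary search over the pairs (per the cfg comment above, `src/util/look.rs` is the consumer of this table). The helper below is a hypothetical illustration of that lookup, run against a tiny hand-written stand-in for the generated table:

```
// Hypothetical helper, not part of this diff: membership test over a sorted,
// non-overlapping table of inclusive ranges such as PERL_WORD.
fn in_ranges(ranges: &[(char, char)], c: char) -> bool {
    ranges
        .binary_search_by(|&(lo, hi)| {
            if c < lo {
                core::cmp::Ordering::Greater
            } else if c > hi {
                core::cmp::Ordering::Less
            } else {
                core::cmp::Ordering::Equal
            }
        })
        .is_ok()
}

fn main() {
    // A tiny stand-in for the full generated table below.
    let table = &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')];
    assert!(in_ranges(table, 'q'));
    assert!(in_ranges(table, '_'));
    assert!(!in_ranges(table, '-'));
}
```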
+ +pub const PERL_WORD: &'static [(char, char)] = &[ + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('\u{300}', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('\u{483}', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), + ('ՠ', 'ֈ'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('\u{610}', '\u{61a}'), + ('ؠ', '٩'), + ('ٮ', 'ۓ'), + ('ە', '\u{6dc}'), + ('\u{6df}', '\u{6e8}'), + ('\u{6ea}', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', '\u{74a}'), + ('ݍ', 'ޱ'), + ('߀', 'ߵ'), + ('ߺ', 'ߺ'), + ('\u{7fd}', '\u{7fd}'), + ('ࠀ', '\u{82d}'), + ('ࡀ', '\u{85b}'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('\u{898}', '\u{8e1}'), + ('\u{8e3}', '\u{963}'), + ('०', '९'), + ('ॱ', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('\u{9bc}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', 'ৎ'), + ('\u{9d7}', '\u{9d7}'), + ('ড়', 'ঢ়'), + ('য়', '\u{9e3}'), + ('০', 'ৱ'), + ('ৼ', 'ৼ'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ਃ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('੦', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('ૐ', 'ૐ'), + ('ૠ', '\u{ae3}'), + ('૦', '૯'), + ('ૹ', '\u{aff}'), + ('\u{b01}', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('\u{b3c}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', '\u{b63}'), + ('୦', '୯'), + ('ୱ', 'ୱ'), + ('\u{b82}', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('ௐ', 'ௐ'), + ('\u{bd7}', '\u{bd7}'), + ('௦', '௯'), + ('\u{c00}', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('\u{c3c}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', '\u{c63}'), + ('౦', '౯'), + ('ಀ', 'ಃ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('\u{cbc}', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('ೝ', 'ೞ'), + ('ೠ', '\u{ce3}'), + ('೦', '೯'), + ('ೱ', 'ೳ'), + ('\u{d00}', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', 'ൎ'), + ('ൔ', '\u{d57}'), + ('ൟ', '\u{d63}'), + ('൦', '൯'), + ('ൺ', 'ൿ'), + ('\u{d81}', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('෦', '෯'), + ('ෲ', 'ෳ'), + ('ก', '\u{e3a}'), + ('เ', '\u{e4e}'), + ('๐', '๙'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('\u{ec8}', '\u{ece}'), + ('໐', '໙'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('\u{f18}', '\u{f19}'), + ('༠', '༩'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('༾', 'ཇ'), + ('ཉ', 'ཬ'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f97}'), + 
('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('က', '၉'), + ('ၐ', '\u{109d}'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('\u{135d}', '\u{135f}'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', '᜕'), + ('ᜟ', '᜴'), + ('ᝀ', '\u{1753}'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('\u{1772}', '\u{1773}'), + ('ក', '\u{17d3}'), + ('ៗ', 'ៗ'), + ('ៜ', '\u{17dd}'), + ('០', '៩'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '᠙'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('᥆', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('᧐', '᧙'), + ('ᨀ', '\u{1a1b}'), + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '᪉'), + ('᪐', '᪙'), + ('ᪧ', 'ᪧ'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1b00}', 'ᭌ'), + ('᭐', '᭙'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '᯳'), + ('ᰀ', '\u{1c37}'), + ('᱀', '᱉'), + ('ᱍ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', 'ᳺ'), + ('ᴀ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('\u{200c}', '\u{200d}'), + ('‿', '⁀'), + ('⁔', '⁔'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('\u{20d0}', '\u{20f0}'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℯ', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⓐ', 'ⓩ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('\u{2d7f}', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('\u{2de0}', '\u{2dff}'), + ('ⸯ', 'ⸯ'), + ('々', '〇'), + ('〡', '\u{302f}'), + ('〱', '〵'), + ('〸', '〼'), + ('ぁ', 'ゖ'), + ('\u{3099}', '\u{309a}'), + ('ゝ', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘫ'), + ('Ꙁ', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('ꙿ', '\u{a6f1}'), + ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠧ'), + ('\u{a82c}', '\u{a82c}'), + ('ꡀ', 'ꡳ'), + ('ꢀ', '\u{a8c5}'), + ('꣐', '꣙'), + ('\u{a8e0}', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', '\u{a92d}'), + ('ꤰ', '꥓'), + ('ꥠ', 'ꥼ'), + ('\u{a980}', '꧀'), + ('ꧏ', '꧙'), + ('ꧠ', 'ꧾ'), + ('ꨀ', '\u{aa36}'), + ('ꩀ', 'ꩍ'), + ('꩐', '꩙'), + ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫯ'), + ('ꫲ', '\u{aaf6}'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꯪ'), + ('꯬', '\u{abed}'), + ('꯰', '꯹'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('︳', '︴'), + ('﹍', '﹏'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 
'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('\u{101fd}', '\u{101fd}'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('\u{102e0}', '\u{102e0}'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '\u{1037a}'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐐀', '𐒝'), + ('𐒠', '𐒩'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '\u{10ae6}'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '\u{10d27}'), + ('𐴰', '𐴹'), + ('𐺀', '𐺩'), + ('\u{10eab}', '\u{10eac}'), + ('𐺰', '𐺱'), + ('\u{10efd}', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '\u{10f50}'), + ('𐽰', '\u{10f85}'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀀', '\u{11046}'), + ('𑁦', '𑁵'), + ('\u{1107f}', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('𑃐', '𑃨'), + ('𑃰', '𑃹'), + ('\u{11100}', '\u{11134}'), + ('𑄶', '𑄿'), + ('𑅄', '𑅇'), + ('𑅐', '\u{11173}'), + ('𑅶', '𑅶'), + ('\u{11180}', '𑇄'), + ('\u{111c9}', '\u{111cc}'), + ('𑇎', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '\u{11237}'), + ('\u{1123e}', '\u{11241}'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '\u{112ea}'), + ('𑋰', '𑋹'), + ('\u{11300}', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('\u{1133b}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('𑍐', '𑍐'), + ('\u{11357}', '\u{11357}'), + ('𑍝', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑐀', '𑑊'), + ('𑑐', '𑑙'), + ('\u{1145e}', '𑑡'), + ('𑒀', '𑓅'), + ('𑓇', '𑓇'), + ('𑓐', '𑓙'), + ('𑖀', '\u{115b5}'), + ('𑖸', '\u{115c0}'), + ('𑗘', '\u{115dd}'), + ('𑘀', '\u{11640}'), + ('𑙄', '𑙄'), + ('𑙐', '𑙙'), + ('𑚀', '𑚸'), + ('𑛀', '𑛉'), + ('𑜀', '𑜚'), + ('\u{1171d}', '\u{1172b}'), + ('𑜰', '𑜹'), + ('𑝀', '𑝆'), + ('𑠀', '\u{1183a}'), + ('𑢠', '𑣩'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{11943}'), + ('𑥐', '𑥙'), + ('𑦠', '𑦧'), + ('𑦪', '\u{119d7}'), + ('\u{119da}', '𑧡'), + ('𑧣', '𑧤'), + ('𑨀', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('𑩐', '\u{11a99}'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '\u{11c36}'), + ('\u{11c38}', '𑱀'), + ('𑱐', '𑱙'), + ('𑱲', '𑲏'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('𑵐', '𑵙'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶘'), + ('𑶠', '𑶩'), + ('𑻠', '𑻶'), + ('\u{11f00}', '𑼐'), + ('𑼒', '\u{11f3a}'), + ('𑼾', '\u{11f42}'), + ('𑽐', '𑽙'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('\u{13440}', '\u{13455}'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩠', '𖩩'), + ('𖩰', '𖪾'), + ('𖫀', '𖫉'), + ('𖫐', '𖫭'), + ('\u{16af0}', '\u{16af4}'), + ('𖬀', '\u{16b36}'), + ('𖭀', '𖭃'), + ('𖭐', '𖭙'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + 
('\u{16f4f}', '𖾇'), + ('\u{16f8f}', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '\u{16fe4}'), + ('𖿰', '𖿱'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d169}'), + ('𝅭', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝟎', '𝟿'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), + ('𞄀', '𞄬'), + ('\u{1e130}', '𞄽'), + ('𞅀', '𞅉'), + ('𞅎', '𞅎'), + ('𞊐', '\u{1e2ae}'), + ('𞋀', '𞋹'), + ('𞓐', '𞓹'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('𞤀', '𞥋'), + ('𞥐', '𞥙'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('🄰', '🅉'), + ('🅐', '🅩'), + ('🅰', '🆉'), + ('🯰', '🯹'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), + ('\u{e0100}', '\u{e01ef}'), +]; diff --git a/vendor/regex-automata/src/util/utf8.rs b/vendor/regex-automata/src/util/utf8.rs new file mode 100644 index 000000000..91b27efe0 --- /dev/null +++ b/vendor/regex-automata/src/util/utf8.rs @@ -0,0 +1,196 @@ +/*! +Utilities for dealing with UTF-8. + +This module provides some UTF-8 related helper routines, including an +incremental decoder. +*/ + +/// Returns true if and only if the given byte is considered a word character. +/// This only applies to ASCII. +/// +/// This was copied from regex-syntax so that we can use it to determine the +/// starting DFA state while searching without depending on regex-syntax. The +/// definition is never going to change, so there's no maintenance/bit-rot +/// hazard here. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn is_word_byte(b: u8) -> bool { + const fn mkwordset() -> [bool; 256] { + // FIXME: Use as_usize() once const functions in traits are stable. 
+ let mut set = [false; 256]; + set[b'_' as usize] = true; + + let mut byte = b'0'; + while byte <= b'9' { + set[byte as usize] = true; + byte += 1; + } + byte = b'A'; + while byte <= b'Z' { + set[byte as usize] = true; + byte += 1; + } + byte = b'a'; + while byte <= b'z' { + set[byte as usize] = true; + byte += 1; + } + set + } + const WORD: [bool; 256] = mkwordset(); + WORD[b as usize] +} + +/// Decodes the next UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the beginning of the given +/// byte slice, then the first byte is returned instead. +/// +/// This returns `None` if and only if `bytes` is empty. +/// +/// This never panics. +/// +/// *WARNING*: This is not designed for performance. If you're looking for a +/// fast UTF-8 decoder, this is not it. If you feel like you need one in this +/// crate, then please file an issue and discuss your use case. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> { + if bytes.is_empty() { + return None; + } + let len = match len(bytes[0]) { + None => return Some(Err(bytes[0])), + Some(len) if len > bytes.len() => return Some(Err(bytes[0])), + Some(1) => return Some(Ok(char::from(bytes[0]))), + Some(len) => len, + }; + match core::str::from_utf8(&bytes[..len]) { + Ok(s) => Some(Ok(s.chars().next().unwrap())), + Err(_) => Some(Err(bytes[0])), + } +} + +/// Decodes the last UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the end of the given byte +/// slice, then the last byte is returned instead. +/// +/// This returns `None` if and only if `bytes` is empty. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> { + if bytes.is_empty() { + return None; + } + let mut start = bytes.len() - 1; + let limit = bytes.len().saturating_sub(4); + while start > limit && !is_leading_or_invalid_byte(bytes[start]) { + start -= 1; + } + match decode(&bytes[start..]) { + None => None, + Some(Ok(ch)) => Some(Ok(ch)), + Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])), + } +} + +/// Given a UTF-8 leading byte, this returns the total number of code units +/// in the following encoded codepoint. +/// +/// If the given byte is not a valid UTF-8 leading byte, then this returns +/// `None`. +#[cfg_attr(feature = "perf-inline", inline(always))] +fn len(byte: u8) -> Option<usize> { + if byte <= 0x7F { + return Some(1); + } else if byte & 0b1100_0000 == 0b1000_0000 { + return None; + } else if byte <= 0b1101_1111 { + Some(2) + } else if byte <= 0b1110_1111 { + Some(3) + } else if byte <= 0b1111_0111 { + Some(4) + } else { + None + } +} + +/// Returns true if and only if the given offset in the given bytes falls on a +/// valid UTF-8 encoded codepoint boundary. +/// +/// If `bytes` is not valid UTF-8, then the behavior of this routine is +/// unspecified. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool { + match bytes.get(i) { + // The position at the end of the bytes always represents an empty + // string, which is a valid boundary. But anything after that doesn't + // make much sense to call valid a boundary. 
+ None => i == bytes.len(), + // Other than ASCII (where the most significant bit is never set), + // valid starting bytes always have their most significant two bits + // set, where as continuation bytes never have their second most + // significant bit set. Therefore, this only returns true when bytes[i] + // corresponds to a byte that begins a valid UTF-8 encoding of a + // Unicode scalar value. + Some(&b) => b <= 0b0111_1111 || b >= 0b1100_0000, + } +} + +/// Returns true if and only if the given byte is either a valid leading UTF-8 +/// byte, or is otherwise an invalid byte that can never appear anywhere in a +/// valid UTF-8 sequence. +#[cfg_attr(feature = "perf-inline", inline(always))] +fn is_leading_or_invalid_byte(b: u8) -> bool { + // In the ASCII case, the most significant bit is never set. The leading + // byte of a 2/3/4-byte sequence always has the top two most significant + // bits set. For bytes that can never appear anywhere in valid UTF-8, this + // also returns true, since every such byte has its two most significant + // bits set: + // + // \xC0 :: 11000000 + // \xC1 :: 11000001 + // \xF5 :: 11110101 + // \xF6 :: 11110110 + // \xF7 :: 11110111 + // \xF8 :: 11111000 + // \xF9 :: 11111001 + // \xFA :: 11111010 + // \xFB :: 11111011 + // \xFC :: 11111100 + // \xFD :: 11111101 + // \xFE :: 11111110 + // \xFF :: 11111111 + (b & 0b1100_0000) != 0b1000_0000 +} + +/* +/// Returns the smallest possible index of the next valid UTF-8 sequence +/// starting after `i`. +/// +/// For all inputs, including invalid UTF-8 and any value of `i`, the return +/// value is guaranteed to be greater than `i`. (If there is no value greater +/// than `i` that fits in `usize`, then this panics.) +/// +/// Generally speaking, this should only be called on `text` when it is +/// permitted to assume that it is valid UTF-8 and where either `i >= +/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence. +/// +/// NOTE: This method was used in a previous conception of iterators where we +/// specifically tried to skip over empty matches that split a codepoint by +/// simply requiring that our next search begin at the beginning of codepoint. +/// But we ended up changing that technique to always advance by 1 byte and +/// then filter out matches that split a codepoint after-the-fact. Thus, we no +/// longer use this method. But I've kept it around in case we want to switch +/// back to this approach. Its guarantees are a little subtle, so I'd prefer +/// not to rebuild it from whole cloth. +pub(crate) fn next(text: &[u8], i: usize) -> usize { + let b = match text.get(i) { + None => return i.checked_add(1).unwrap(), + Some(&b) => b, + }; + // For cases where we see an invalid UTF-8 byte, there isn't much we can do + // other than just start at the next byte. + let inc = len(b).unwrap_or(1); + i.checked_add(inc).unwrap() +} +*/ diff --git a/vendor/regex-automata/src/util/bytes.rs b/vendor/regex-automata/src/util/wire.rs index 5877bb149..ecf4fd8c0 100644 --- a/vendor/regex-automata/src/util/bytes.rs +++ b/vendor/regex-automata/src/util/wire.rs @@ -1,3 +1,10 @@ +/*! +Types and routines that support the wire format of finite automata. + +Currently, this module just exports a few error types and some small helpers +for deserializing [dense DFAs](crate::dfa::dense::DFA) using correct alignment. +*/ + /* A collection of helper functions, types and traits for serializing automata. @@ -10,10 +17,10 @@ generally designed such that deserialization is cheap. 
More specifically, that deserialization can be done in constant time. (The idea being that you can embed it into your binary or mmap it, and then use it immediately.) -In order to achieve this, most of the structures in this crate use an in-memory -representation that very closely corresponds to its binary serialized form. -This pervades and complicates everything, and in some cases, requires dealing -with alignment and reasoning about safety. +In order to achieve this, the dense and sparse DFAs in this crate use an +in-memory representation that very closely corresponds to its binary serialized +form. This pervades and complicates everything, and in some cases, requires +dealing with alignment and reasoning about safety. This technique does have major advantages. In particular, it permits doing the potentially costly work of compiling a finite state machine in an offline @@ -43,7 +50,29 @@ use core::{ #[cfg(feature = "alloc")] use alloc::{vec, vec::Vec}; -use crate::util::id::{PatternID, PatternIDError, StateID, StateIDError}; +use crate::util::{ + int::Pointer, + primitives::{PatternID, PatternIDError, StateID, StateIDError}, +}; + +/// A hack to align a smaller type `B` with a bigger type `T`. +/// +/// The usual use of this is with `B = [u8]` and `T = u32`. That is, +/// it permits aligning a sequence of bytes on a 4-byte boundary. This +/// is useful in contexts where one wants to embed a serialized [dense +/// DFA](crate::dfa::dense::DFA) into a Rust program while guaranteeing the +/// alignment required for the DFA. +/// +/// See [`dense::DFA::from_bytes`](crate::dfa::dense::DFA::from_bytes) for an +/// example of how to use this type. +#[repr(C)] +#[derive(Debug)] +pub struct AlignAs<B: ?Sized, T> { + /// A zero-sized field indicating the alignment we want. + pub _align: [T; 0], + /// A possibly non-sized field containing a sequence of bytes. + pub bytes: B, +} /// An error that occurs when serializing an object from this crate.
/// @@ -117,7 +146,6 @@ enum DeserializeErrorKind { Generic { msg: &'static str }, BufferTooSmall { what: &'static str }, InvalidUsize { what: &'static str }, - InvalidVarint { what: &'static str }, VersionMismatch { expected: u32, found: u32 }, EndianMismatch { expected: u32, found: u32 }, AlignmentMismatch { alignment: usize, address: usize }, @@ -136,14 +164,10 @@ impl DeserializeError { DeserializeError(DeserializeErrorKind::BufferTooSmall { what }) } - pub(crate) fn invalid_usize(what: &'static str) -> DeserializeError { + fn invalid_usize(what: &'static str) -> DeserializeError { DeserializeError(DeserializeErrorKind::InvalidUsize { what }) } - fn invalid_varint(what: &'static str) -> DeserializeError { - DeserializeError(DeserializeErrorKind::InvalidVarint { what }) - } - fn version_mismatch(expected: u32, found: u32) -> DeserializeError { DeserializeError(DeserializeErrorKind::VersionMismatch { expected, @@ -176,7 +200,7 @@ impl DeserializeError { DeserializeError(DeserializeErrorKind::ArithmeticOverflow { what }) } - pub(crate) fn pattern_id_error( + fn pattern_id_error( err: PatternIDError, what: &'static str, ) -> DeserializeError { @@ -206,9 +230,6 @@ impl core::fmt::Display for DeserializeError { InvalidUsize { what } => { write!(f, "{} is too big to fit in a usize", what) } - InvalidVarint { what } => { - write!(f, "could not decode valid varint for {}", what) - } VersionMismatch { expected, found } => write!( f, "unsupported version: \ @@ -248,14 +269,63 @@ impl core::fmt::Display for DeserializeError { } } +/// Safely converts a `&[u32]` to `&[StateID]` with zero cost. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn u32s_to_state_ids(slice: &[u32]) -> &[StateID] { + // SAFETY: This is safe because StateID is defined to have the same memory + // representation as a u32 (it is repr(transparent)). While not every u32 + // is a "valid" StateID, callers are not permitted to rely on the validity + // of StateIDs for memory safety. It can only lead to logical errors. (This + // is why StateID::new_unchecked is safe.) + unsafe { + core::slice::from_raw_parts( + slice.as_ptr().cast::<StateID>(), + slice.len(), + ) + } +} + +/// Safely converts a `&mut [u32]` to `&mut [StateID]` with zero cost. +pub(crate) fn u32s_to_state_ids_mut(slice: &mut [u32]) -> &mut [StateID] { + // SAFETY: This is safe because StateID is defined to have the same memory + // representation as a u32 (it is repr(transparent)). While not every u32 + // is a "valid" StateID, callers are not permitted to rely on the validity + // of StateIDs for memory safety. It can only lead to logical errors. (This + // is why StateID::new_unchecked is safe.) + unsafe { + core::slice::from_raw_parts_mut( + slice.as_mut_ptr().cast::<StateID>(), + slice.len(), + ) + } +} + +/// Safely converts a `&[u32]` to `&[PatternID]` with zero cost. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn u32s_to_pattern_ids(slice: &[u32]) -> &[PatternID] { + // SAFETY: This is safe because PatternID is defined to have the same + // memory representation as a u32 (it is repr(transparent)). While not + // every u32 is a "valid" PatternID, callers are not permitted to rely + // on the validity of PatternIDs for memory safety. It can only lead to + // logical errors. (This is why PatternID::new_unchecked is safe.) + unsafe { + core::slice::from_raw_parts( + slice.as_ptr().cast::<PatternID>(), + slice.len(), + ) + } +} + /// Checks that the given slice has an alignment that matches `T`. 
/// /// This is useful for checking that a slice has an appropriate alignment /// before casting it to a &[T]. Note though that alignment is not itself /// sufficient to perform the cast for any `T`. -pub fn check_alignment<T>(slice: &[u8]) -> Result<(), DeserializeError> { +pub(crate) fn check_alignment<T>( + slice: &[u8], +) -> Result<(), DeserializeError> { let alignment = core::mem::align_of::<T>(); - let address = slice.as_ptr() as usize; + let address = slice.as_ptr().as_usize(); if address % alignment == 0 { return Ok(()); } @@ -271,7 +341,7 @@ pub fn check_alignment<T>(slice: &[u8]) -> Result<(), DeserializeError> { /// before the label. /// /// This returns the number of bytes read from the given slice. -pub fn skip_initial_padding(slice: &[u8]) -> usize { +pub(crate) fn skip_initial_padding(slice: &[u8]) -> usize { let mut nread = 0; while nread < 7 && nread < slice.len() && slice[nread] == 0 { nread += 1; @@ -296,33 +366,48 @@ pub fn skip_initial_padding(slice: &[u8]) -> usize { /// practice, we never need anything bigger in this crate, and so this function /// does some sanity asserts under the assumption of a max alignment of `8`. #[cfg(feature = "alloc")] -pub fn alloc_aligned_buffer<T>(size: usize) -> (Vec<u8>, usize) { - // FIXME: This is a kludge because there's no easy way to allocate a - // Vec<u8> with an alignment guaranteed to be greater than 1. We could - // create a Vec<u32>, but this cannot be safely transmuted to a Vec<u8> - // without concern, since reallocing or dropping the Vec<u8> is UB - // (different alignment than the initial allocation). We could define a - // wrapper type to manage this for us, but it seems like more machinery - // than it's worth. - let mut buf = vec![0; size]; +pub(crate) fn alloc_aligned_buffer<T>(size: usize) -> (Vec<u8>, usize) { + // NOTE: This is a kludge because there's no easy way to allocate a Vec<u8> + // with an alignment guaranteed to be greater than 1. We could create a + // Vec<u32>, but this cannot be safely transmuted to a Vec<u8> without + // concern, since reallocing or dropping the Vec<u8> is UB (different + // alignment than the initial allocation). We could define a wrapper type + // to manage this for us, but it seems like more machinery than it's worth. + let buf = vec![0; size]; let align = core::mem::align_of::<T>(); - let address = buf.as_ptr() as usize; + let address = buf.as_ptr().as_usize(); + if address % align == 0 { + return (buf, 0); + } + // Let's try this again. We have to create a totally new alloc with + // the maximum amount of bytes we might need. We can't just extend our + // pre-existing 'buf' because that might create a new alloc with a + // different alignment. + let extra = align - 1; + let mut buf = vec![0; size + extra]; + let address = buf.as_ptr().as_usize(); + // The code below handles the case where 'address' is aligned to T, so if + // we got lucky and 'address' is now aligned to T (when it previously + // wasn't), then we're done. if address % align == 0 { + buf.truncate(size); return (buf, 0); } - // It's not quite clear how to robustly test this code, since the allocator - // in my environment appears to always return addresses aligned to at - // least 8 bytes, even when the alignment requirement is smaller. A feeble - // attempt at ensuring correctness is provided with asserts. 
- let padding = ((address & !0b111).checked_add(8).unwrap()) + let padding = ((address & !(align - 1)).checked_add(align).unwrap()) .checked_sub(address) .unwrap(); assert!(padding <= 7, "padding of {} is bigger than 7", padding); - buf.extend(core::iter::repeat(0).take(padding)); + assert!( + padding <= extra, + "padding of {} is bigger than extra {} bytes", + padding, + extra + ); + buf.truncate(size + padding); assert_eq!(size + padding, buf.len()); assert_eq!( 0, - buf[padding..].as_ptr() as usize % align, + buf[padding..].as_ptr().as_usize() % align, "expected end of initial padding to be aligned to {}", align, ); @@ -332,12 +417,12 @@ pub fn alloc_aligned_buffer<T>(size: usize) -> (Vec<u8>, usize) { /// Reads a NUL terminated label starting at the beginning of the given slice. /// /// If a NUL terminated label could not be found, then an error is returned. -/// Similary, if a label is found but doesn't match the expected label, then +/// Similarly, if a label is found but doesn't match the expected label, then /// an error is returned. /// /// Upon success, the total number of bytes read (including padding bytes) is /// returned. -pub fn read_label( +pub(crate) fn read_label( slice: &[u8], expected_label: &'static str, ) -> Result<usize, DeserializeError> { @@ -376,7 +461,7 @@ pub fn read_label( /// /// Upon success, the total number of bytes written (including padding) is /// returned. -pub fn write_label( +pub(crate) fn write_label( label: &str, dst: &mut [u8], ) -> Result<usize, SerializeError> { @@ -396,7 +481,7 @@ pub fn write_label( /// for the given label. This panics if the given label contains a NUL byte or /// is longer than 255 bytes. (The size restriction exists so that searching /// for a label during deserialization can be done in small bounded space.) -pub fn write_label_len(label: &str) -> usize { +pub(crate) fn write_label_len(label: &str) -> usize { if label.len() > 255 { panic!("label must not be longer than 255 bytes"); } @@ -413,7 +498,9 @@ pub fn write_label_len(label: &str) -> usize { /// this returns an error. /// /// Upon success, the total number of bytes read is returned. -pub fn read_endianness_check(slice: &[u8]) -> Result<usize, DeserializeError> { +pub(crate) fn read_endianness_check( + slice: &[u8], +) -> Result<usize, DeserializeError> { let (n, nr) = try_read_u32(slice, "endianness check")?; assert_eq!(nr, write_endianness_check_len()); if n != 0xFEFF { @@ -429,7 +516,7 @@ pub fn read_endianness_check(slice: &[u8]) -> Result<usize, DeserializeError> { /// endianness is used. /// /// Upon success, the total number of bytes written is returned. -pub fn write_endianness_check<E: Endian>( +pub(crate) fn write_endianness_check<E: Endian>( dst: &mut [u8], ) -> Result<usize, SerializeError> { let nwrite = write_endianness_check_len(); @@ -441,7 +528,7 @@ pub fn write_endianness_check<E: Endian>( } /// Returns the number of bytes written by the endianness check. -pub fn write_endianness_check_len() -> usize { +pub(crate) fn write_endianness_check_len() -> usize { size_of::<u32>() } @@ -454,7 +541,7 @@ pub fn write_endianness_check_len() -> usize { /// N.B. Currently, we require that the version number is exactly equivalent. /// In the future, if we bump the version number without a semver bump, then /// we'll need to relax this a bit and support older versions. 
-pub fn read_version( +pub(crate) fn read_version( slice: &[u8], expected_version: u32, ) -> Result<usize, DeserializeError> { @@ -473,7 +560,7 @@ pub fn read_version( /// code supports the format of the serialized object. /// /// Upon success, the total number of bytes written is returned. -pub fn write_version<E: Endian>( +pub(crate) fn write_version<E: Endian>( version: u32, dst: &mut [u8], ) -> Result<usize, SerializeError> { @@ -486,7 +573,7 @@ pub fn write_version<E: Endian>( } /// Returns the number of bytes written by writing the version number. -pub fn write_version_len() -> usize { +pub(crate) fn write_version_len() -> usize { size_of::<u32>() } @@ -495,7 +582,7 @@ pub fn write_version_len() -> usize { /// ID limit for the current target, then this returns an error. /// /// Upon success, this also returns the number of bytes read. -pub fn read_pattern_id( +pub(crate) fn read_pattern_id( slice: &[u8], what: &'static str, ) -> Result<(PatternID, usize), DeserializeError> { @@ -511,7 +598,7 @@ pub fn read_pattern_id( /// to be a valid pattern ID. /// /// This also returns the number of bytes read. -pub fn read_pattern_id_unchecked(slice: &[u8]) -> (PatternID, usize) { +pub(crate) fn read_pattern_id_unchecked(slice: &[u8]) -> (PatternID, usize) { let pid = PatternID::from_ne_bytes_unchecked( slice[..PatternID::SIZE].try_into().unwrap(), ); @@ -522,7 +609,10 @@ pub fn read_pattern_id_unchecked(slice: &[u8]) -> (PatternID, usize) { /// using the specified endianness. The given slice must have length at least /// `PatternID::SIZE`, or else this panics. Upon success, the total number of /// bytes written is returned. -pub fn write_pattern_id<E: Endian>(pid: PatternID, dst: &mut [u8]) -> usize { +pub(crate) fn write_pattern_id<E: Endian>( + pid: PatternID, + dst: &mut [u8], +) -> usize { E::write_u32(pid.as_u32(), dst); PatternID::SIZE } @@ -532,7 +622,7 @@ pub fn write_pattern_id<E: Endian>(pid: PatternID, dst: &mut [u8]) -> usize { /// the current target, then this returns an error. /// /// Upon success, this also returns the number of bytes read. -pub fn try_read_state_id( +pub(crate) fn try_read_state_id( slice: &[u8], what: &'static str, ) -> Result<(StateID, usize), DeserializeError> { @@ -547,7 +637,7 @@ pub fn try_read_state_id( /// limit for the current target, then this returns an error. /// /// Upon success, this also returns the number of bytes read. -pub fn read_state_id( +pub(crate) fn read_state_id( slice: &[u8], what: &'static str, ) -> Result<(StateID, usize), DeserializeError> { @@ -563,7 +653,7 @@ pub fn read_state_id( /// to be a valid state ID. /// /// This also returns the number of bytes read. -pub fn read_state_id_unchecked(slice: &[u8]) -> (StateID, usize) { +pub(crate) fn read_state_id_unchecked(slice: &[u8]) -> (StateID, usize) { let sid = StateID::from_ne_bytes_unchecked( slice[..StateID::SIZE].try_into().unwrap(), ); @@ -574,7 +664,10 @@ pub fn read_state_id_unchecked(slice: &[u8]) -> (StateID, usize) { /// using the specified endianness. The given slice must have length at least /// `StateID::SIZE`, or else this panics. Upon success, the total number of /// bytes written is returned. -pub fn write_state_id<E: Endian>(sid: StateID, dst: &mut [u8]) -> usize { +pub(crate) fn write_state_id<E: Endian>( + sid: StateID, + dst: &mut [u8], +) -> usize { E::write_u32(sid.as_u32(), dst); StateID::SIZE } @@ -587,7 +680,7 @@ pub fn write_state_id<E: Endian>(sid: StateID, dst: &mut [u8]) -> usize { /// singular form. 
/// /// Upon success, this also returns the number of bytes read. -pub fn try_read_u16_as_usize( +pub(crate) fn try_read_u16_as_usize( slice: &[u8], what: &'static str, ) -> Result<(usize, usize), DeserializeError> { @@ -606,7 +699,7 @@ pub fn try_read_u16_as_usize( /// singular form. /// /// Upon success, this also returns the number of bytes read. -pub fn try_read_u32_as_usize( +pub(crate) fn try_read_u32_as_usize( slice: &[u8], what: &'static str, ) -> Result<(usize, usize), DeserializeError> { @@ -624,13 +717,11 @@ pub fn try_read_u32_as_usize( /// singular form. /// /// Upon success, this also returns the number of bytes read. -pub fn try_read_u16( +pub(crate) fn try_read_u16( slice: &[u8], what: &'static str, ) -> Result<(u16, usize), DeserializeError> { - if slice.len() < size_of::<u16>() { - return Err(DeserializeError::buffer_too_small(what)); - } + check_slice_len(slice, size_of::<u16>(), what)?; Ok((read_u16(slice), size_of::<u16>())) } @@ -641,23 +732,36 @@ pub fn try_read_u16( /// singular form. /// /// Upon success, this also returns the number of bytes read. -pub fn try_read_u32( +pub(crate) fn try_read_u32( slice: &[u8], what: &'static str, ) -> Result<(u32, usize), DeserializeError> { - if slice.len() < size_of::<u32>() { - return Err(DeserializeError::buffer_too_small(what)); - } + check_slice_len(slice, size_of::<u32>(), what)?; Ok((read_u32(slice), size_of::<u32>())) } +/// Try to read a u128 from the beginning of the given slice in native endian +/// format. If the slice has fewer than 16 bytes, then this returns an error. +/// The error message will include the `what` description of what is being +/// deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub(crate) fn try_read_u128( + slice: &[u8], + what: &'static str, +) -> Result<(u128, usize), DeserializeError> { + check_slice_len(slice, size_of::<u128>(), what)?; + Ok((read_u128(slice), size_of::<u128>())) +} + /// Read a u16 from the beginning of the given slice in native endian format. /// If the slice has fewer than 2 bytes, then this panics. /// /// Marked as inline to speed up sparse searching which decodes integers from /// its automaton at search time. -#[inline(always)] -pub fn read_u16(slice: &[u8]) -> u16 { +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn read_u16(slice: &[u8]) -> u16 { let bytes: [u8; 2] = slice[..size_of::<u16>()].try_into().unwrap(); u16::from_ne_bytes(bytes) } @@ -667,115 +771,23 @@ pub fn read_u16(slice: &[u8]) -> u16 { /// /// Marked as inline to speed up sparse searching which decodes integers from /// its automaton at search time. -#[inline(always)] -pub fn read_u32(slice: &[u8]) -> u32 { +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn read_u32(slice: &[u8]) -> u32 { let bytes: [u8; 4] = slice[..size_of::<u32>()].try_into().unwrap(); u32::from_ne_bytes(bytes) } -/// Read a u64 from the beginning of the given slice in native endian format. -/// If the slice has fewer than 8 bytes, then this panics. -/// -/// Marked as inline to speed up sparse searching which decodes integers from -/// its automaton at search time. -#[inline(always)] -pub fn read_u64(slice: &[u8]) -> u64 { - let bytes: [u8; 8] = slice[..size_of::<u64>()].try_into().unwrap(); - u64::from_ne_bytes(bytes) -} - -/// Write a variable sized integer and return the total number of bytes -/// written. 
If the slice was not big enough to contain the bytes, then this -/// returns an error including the "what" description in it. This does no -/// padding. -/// -/// See: https://developers.google.com/protocol-buffers/docs/encoding#varints -#[allow(dead_code)] -pub fn write_varu64( - mut n: u64, - what: &'static str, - dst: &mut [u8], -) -> Result<usize, SerializeError> { - let mut i = 0; - while n >= 0b1000_0000 { - if i >= dst.len() { - return Err(SerializeError::buffer_too_small(what)); - } - dst[i] = (n as u8) | 0b1000_0000; - n >>= 7; - i += 1; - } - if i >= dst.len() { - return Err(SerializeError::buffer_too_small(what)); - } - dst[i] = n as u8; - Ok(i + 1) -} - -/// Returns the total number of bytes that would be writen to encode n as a -/// variable sized integer. -/// -/// See: https://developers.google.com/protocol-buffers/docs/encoding#varints -#[allow(dead_code)] -pub fn write_varu64_len(mut n: u64) -> usize { - let mut i = 0; - while n >= 0b1000_0000 { - n >>= 7; - i += 1; - } - i + 1 -} - -/// Like read_varu64, but attempts to cast the result to usize. If the integer -/// cannot fit into a usize, then an error is returned. -#[allow(dead_code)] -pub fn read_varu64_as_usize( - slice: &[u8], - what: &'static str, -) -> Result<(usize, usize), DeserializeError> { - let (n, nread) = read_varu64(slice, what)?; - let n = usize::try_from(n) - .map_err(|_| DeserializeError::invalid_usize(what))?; - Ok((n, nread)) -} - -/// Reads a variable sized integer from the beginning of slice, and returns the -/// integer along with the total number of bytes read. If a valid variable -/// sized integer could not be found, then an error is returned that includes -/// the "what" description in it. -/// -/// https://developers.google.com/protocol-buffers/docs/encoding#varints -#[allow(dead_code)] -pub fn read_varu64( - slice: &[u8], - what: &'static str, -) -> Result<(u64, usize), DeserializeError> { - let mut n: u64 = 0; - let mut shift: u32 = 0; - // The biggest possible value is u64::MAX, which needs all 64 bits which - // requires 10 bytes (because 7 * 9 < 64). We use a limit to avoid reading - // an unnecessary number of bytes. - let limit = cmp::min(slice.len(), 10); - for (i, &b) in slice[..limit].iter().enumerate() { - if b < 0b1000_0000 { - return match (b as u64).checked_shl(shift) { - None => Err(DeserializeError::invalid_varint(what)), - Some(b) => Ok((n | b, i + 1)), - }; - } - match ((b as u64) & 0b0111_1111).checked_shl(shift) { - None => return Err(DeserializeError::invalid_varint(what)), - Some(b) => n |= b, - } - shift += 7; - } - Err(DeserializeError::invalid_varint(what)) +/// Read a u128 from the beginning of the given slice in native endian format. +/// If the slice has fewer than 16 bytes, then this panics. +pub(crate) fn read_u128(slice: &[u8]) -> u128 { + let bytes: [u8; 16] = slice[..size_of::<u128>()].try_into().unwrap(); + u128::from_ne_bytes(bytes) } /// Checks that the given slice has some minimal length. If it's smaller than /// the bound given, then a "buffer too small" error is returned with `what` /// describing what the buffer represents. -pub fn check_slice_len<T>( +pub(crate) fn check_slice_len<T>( slice: &[T], at_least_len: usize, what: &'static str, @@ -790,7 +802,7 @@ pub fn check_slice_len<T>( /// 'what' in the error message. /// /// This is useful when doing arithmetic with untrusted data. -pub fn mul( +pub(crate) fn mul( a: usize, b: usize, what: &'static str, @@ -805,7 +817,7 @@ pub fn mul( /// 'what' in the error message. 
/// /// This is useful when doing arithmetic with untrusted data. -pub fn add( +pub(crate) fn add( a: usize, b: usize, what: &'static str, @@ -820,7 +832,7 @@ pub fn add( /// 'what' in the error message. /// /// This is useful when doing arithmetic with untrusted data. -pub fn shl( +pub(crate) fn shl( a: usize, b: usize, what: &'static str, @@ -833,11 +845,18 @@ pub fn shl( } } +/// Returns the number of additional bytes required to add to the given length +/// in order to make the total length a multiple of 4. The return value is +/// always less than 4. +pub(crate) fn padding_len(non_padding_len: usize) -> usize { + (4 - (non_padding_len & 0b11)) & 0b11 +} + /// A simple trait for writing code generic over endianness. /// /// This is similar to what byteorder provides, but we only need a very small /// subset. -pub trait Endian { +pub(crate) trait Endian { /// Writes a u16 to the given destination buffer in a particular /// endianness. If the destination buffer has a length smaller than 2, then /// this panics. @@ -852,17 +871,22 @@ pub trait Endian { /// endianness. If the destination buffer has a length smaller than 8, then /// this panics. fn write_u64(n: u64, dst: &mut [u8]); + + /// Writes a u128 to the given destination buffer in a particular + /// endianness. If the destination buffer has a length smaller than 16, + /// then this panics. + fn write_u128(n: u128, dst: &mut [u8]); } /// Little endian writing. -pub enum LE {} +pub(crate) enum LE {} /// Big endian writing. -pub enum BE {} +pub(crate) enum BE {} #[cfg(target_endian = "little")] -pub type NE = LE; +pub(crate) type NE = LE; #[cfg(target_endian = "big")] -pub type NE = BE; +pub(crate) type NE = BE; impl Endian for LE { fn write_u16(n: u16, dst: &mut [u8]) { @@ -876,6 +900,10 @@ impl Endian for LE { fn write_u64(n: u64, dst: &mut [u8]) { dst[..8].copy_from_slice(&n.to_le_bytes()); } + + fn write_u128(n: u128, dst: &mut [u8]) { + dst[..16].copy_from_slice(&n.to_le_bytes()); + } } impl Endian for BE { @@ -890,13 +918,10 @@ impl Endian for BE { fn write_u64(n: u64, dst: &mut [u8]) { dst[..8].copy_from_slice(&n.to_be_bytes()); } -} -/// Returns the number of additional bytes required to add to the given length -/// in order to make the total length a multiple of 4. The return value is -/// always less than 4. -pub fn padding_len(non_padding_len: usize) -> usize { - (4 - (non_padding_len & 0b11)) & 0b11 + fn write_u128(n: u128, dst: &mut [u8]) { + dst[..16].copy_from_slice(&n.to_be_bytes()); + } } #[cfg(all(test, feature = "alloc"))] |
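As a quick sanity check of the `padding_len` arithmetic above (restated here as a standalone function, since the crate-internal one is `pub(crate)`), every length is padded up to the next multiple of 4 with at most 3 extra bytes:

```
// Mirrors the padding_len defined in the hunk above.
fn padding_len(non_padding_len: usize) -> usize {
    (4 - (non_padding_len & 0b11)) & 0b11
}

fn main() {
    assert_eq!(padding_len(0), 0);
    assert_eq!(padding_len(1), 3);
    assert_eq!(padding_len(2), 2);
    assert_eq!(padding_len(3), 1);
    assert_eq!(padding_len(4), 0);
    // Every padded length ends up 4-byte aligned.
    for n in 0..64usize {
        assert_eq!((n + padding_len(n)) % 4, 0);
    }
}
```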