From 5363f350887b1e5b5dd21a86f88c8af9d7fea6da Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:18:25 +0200 Subject: Merging upstream version 1.67.1+dfsg1. Signed-off-by: Daniel Baumann --- vendor/regex-automata/src/classes.rs | 271 ----------------------------------- 1 file changed, 271 deletions(-) delete mode 100644 vendor/regex-automata/src/classes.rs (limited to 'vendor/regex-automata/src/classes.rs') diff --git a/vendor/regex-automata/src/classes.rs b/vendor/regex-automata/src/classes.rs deleted file mode 100644 index 143908b3a..000000000 --- a/vendor/regex-automata/src/classes.rs +++ /dev/null @@ -1,271 +0,0 @@ -use core::fmt; - -/// A representation of byte oriented equivalence classes. -/// -/// This is used in a DFA to reduce the size of the transition table. This can -/// have a particularly large impact not only on the total size of a dense DFA, -/// but also on compile times. -#[derive(Clone, Copy)] -pub struct ByteClasses([u8; 256]); - -impl ByteClasses { - /// Creates a new set of equivalence classes where all bytes are mapped to - /// the same class. - pub fn empty() -> ByteClasses { - ByteClasses([0; 256]) - } - - /// Creates a new set of equivalence classes where each byte belongs to - /// its own equivalence class. - pub fn singletons() -> ByteClasses { - let mut classes = ByteClasses::empty(); - for i in 0..256 { - classes.set(i as u8, i as u8); - } - classes - } - - /// Copies the byte classes given. The given slice must have length 0 or - /// length 256. Slices of length 0 are treated as singletons (every byte - /// is its own class). - pub fn from_slice(slice: &[u8]) -> ByteClasses { - assert!(slice.is_empty() || slice.len() == 256); - - if slice.is_empty() { - ByteClasses::singletons() - } else { - let mut classes = ByteClasses::empty(); - for (b, &class) in slice.iter().enumerate() { - classes.set(b as u8, class); - } - classes - } - } - - /// Set the equivalence class for the given byte. - #[inline] - pub fn set(&mut self, byte: u8, class: u8) { - self.0[byte as usize] = class; - } - - /// Get the equivalence class for the given byte. - #[inline] - pub fn get(&self, byte: u8) -> u8 { - self.0[byte as usize] - } - - /// Get the equivalence class for the given byte while forcefully - /// eliding bounds checks. - #[inline] - pub unsafe fn get_unchecked(&self, byte: u8) -> u8 { - *self.0.get_unchecked(byte as usize) - } - - /// Return the total number of elements in the alphabet represented by - /// these equivalence classes. Equivalently, this returns the total number - /// of equivalence classes. - #[inline] - pub fn alphabet_len(&self) -> usize { - self.0[255] as usize + 1 - } - - /// Returns true if and only if every byte in this class maps to its own - /// equivalence class. Equivalently, there are 256 equivalence classes - /// and each class contains exactly one byte. - #[inline] - pub fn is_singleton(&self) -> bool { - self.alphabet_len() == 256 - } - - /// Returns an iterator over a sequence of representative bytes from each - /// equivalence class. Namely, this yields exactly N items, where N is - /// equivalent to the number of equivalence classes. Each item is an - /// arbitrary byte drawn from each equivalence class. - /// - /// This is useful when one is determinizing an NFA and the NFA's alphabet - /// hasn't been converted to equivalence classes yet. Picking an arbitrary - /// byte from each equivalence class then permits a full exploration of - /// the NFA instead of using every possible byte value. - #[cfg(feature = "std")] - pub fn representatives(&self) -> ByteClassRepresentatives { - ByteClassRepresentatives { classes: self, byte: 0, last_class: None } - } - - /// Returns all of the bytes in the given equivalence class. - /// - /// The second element in the tuple indicates the number of elements in - /// the array. - fn elements(&self, equiv: u8) -> ([u8; 256], usize) { - let (mut array, mut len) = ([0; 256], 0); - for b in 0..256 { - if self.get(b as u8) == equiv { - array[len] = b as u8; - len += 1; - } - } - (array, len) - } -} - -impl fmt::Debug for ByteClasses { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - if self.is_singleton() { - write!(f, "ByteClasses({{singletons}})") - } else { - write!(f, "ByteClasses(")?; - for equiv in 0..self.alphabet_len() { - let (members, len) = self.elements(equiv as u8); - write!(f, "{} => {:?}", equiv, &members[..len])?; - } - write!(f, ")") - } - } -} - -/// An iterator over representative bytes from each equivalence class. -#[cfg(feature = "std")] -#[derive(Debug)] -pub struct ByteClassRepresentatives<'a> { - classes: &'a ByteClasses, - byte: usize, - last_class: Option, -} - -#[cfg(feature = "std")] -impl<'a> Iterator for ByteClassRepresentatives<'a> { - type Item = u8; - - fn next(&mut self) -> Option { - while self.byte < 256 { - let byte = self.byte as u8; - let class = self.classes.get(byte); - self.byte += 1; - - if self.last_class != Some(class) { - self.last_class = Some(class); - return Some(byte); - } - } - None - } -} - -/// A byte class set keeps track of an *approximation* of equivalence classes -/// of bytes during NFA construction. That is, every byte in an equivalence -/// class cannot discriminate between a match and a non-match. -/// -/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the -/// same equivalence class because it never matters whether an `a` or a `b` is -/// seen, and no combination of `a`s and `b`s in the text can discriminate -/// a match. -/// -/// Note though that this does not compute the minimal set of equivalence -/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the -/// same equivalence class for the same reason that `a` and `b` are in the -/// same equivalence class in the aforementioned regex. However, in this -/// implementation, `a` and `c` are put into distinct equivalence classes. -/// The reason for this is implementation complexity. In the future, we should -/// endeavor to compute the minimal equivalence classes since they can have a -/// rather large impact on the size of the DFA. -/// -/// The representation here is 256 booleans, all initially set to false. Each -/// boolean maps to its corresponding byte based on position. A `true` value -/// indicates the end of an equivalence class, where its corresponding byte -/// and all of the bytes corresponding to all previous contiguous `false` -/// values are in the same equivalence class. -/// -/// This particular representation only permits contiguous ranges of bytes to -/// be in the same equivalence class, which means that we can never discover -/// the true minimal set of equivalence classes. -#[cfg(feature = "std")] -#[derive(Debug)] -pub struct ByteClassSet(Vec); - -#[cfg(feature = "std")] -impl ByteClassSet { - /// Create a new set of byte classes where all bytes are part of the same - /// equivalence class. - pub fn new() -> Self { - ByteClassSet(vec![false; 256]) - } - - /// Indicate the the range of byte given (inclusive) can discriminate a - /// match between it and all other bytes outside of the range. - pub fn set_range(&mut self, start: u8, end: u8) { - debug_assert!(start <= end); - if start > 0 { - self.0[start as usize - 1] = true; - } - self.0[end as usize] = true; - } - - /// Convert this boolean set to a map that maps all byte values to their - /// corresponding equivalence class. The last mapping indicates the largest - /// equivalence class identifier (which is never bigger than 255). - pub fn byte_classes(&self) -> ByteClasses { - let mut classes = ByteClasses::empty(); - let mut class = 0u8; - let mut i = 0; - loop { - classes.set(i as u8, class as u8); - if i >= 255 { - break; - } - if self.0[i] { - class = class.checked_add(1).unwrap(); - } - i += 1; - } - classes - } -} - -#[cfg(test)] -mod tests { - #[cfg(feature = "std")] - #[test] - fn byte_classes() { - use super::ByteClassSet; - - let mut set = ByteClassSet::new(); - set.set_range(b'a', b'z'); - - let classes = set.byte_classes(); - assert_eq!(classes.get(0), 0); - assert_eq!(classes.get(1), 0); - assert_eq!(classes.get(2), 0); - assert_eq!(classes.get(b'a' - 1), 0); - assert_eq!(classes.get(b'a'), 1); - assert_eq!(classes.get(b'm'), 1); - assert_eq!(classes.get(b'z'), 1); - assert_eq!(classes.get(b'z' + 1), 2); - assert_eq!(classes.get(254), 2); - assert_eq!(classes.get(255), 2); - - let mut set = ByteClassSet::new(); - set.set_range(0, 2); - set.set_range(4, 6); - let classes = set.byte_classes(); - assert_eq!(classes.get(0), 0); - assert_eq!(classes.get(1), 0); - assert_eq!(classes.get(2), 0); - assert_eq!(classes.get(3), 1); - assert_eq!(classes.get(4), 2); - assert_eq!(classes.get(5), 2); - assert_eq!(classes.get(6), 2); - assert_eq!(classes.get(7), 3); - assert_eq!(classes.get(255), 3); - } - - #[cfg(feature = "std")] - #[test] - fn full_byte_classes() { - use super::ByteClassSet; - - let mut set = ByteClassSet::new(); - for i in 0..256u16 { - set.set_range(i as u8, i as u8); - } - assert_eq!(set.byte_classes().alphabet_len(), 256); - } -} -- cgit v1.2.3