diff options
Diffstat (limited to 'third_party/rust/aho-corasick/src/util')
-rw-r--r-- | third_party/rust/aho-corasick/src/util/alphabet.rs | 409 | ||||
-rw-r--r-- | third_party/rust/aho-corasick/src/util/buffer.rs | 124 | ||||
-rw-r--r-- | third_party/rust/aho-corasick/src/util/byte_frequencies.rs | 258 | ||||
-rw-r--r-- | third_party/rust/aho-corasick/src/util/debug.rs | 26 | ||||
-rw-r--r-- | third_party/rust/aho-corasick/src/util/error.rs | 259 | ||||
-rw-r--r-- | third_party/rust/aho-corasick/src/util/int.rs | 284 | ||||
-rw-r--r-- | third_party/rust/aho-corasick/src/util/mod.rs | 12 | ||||
-rw-r--r-- | third_party/rust/aho-corasick/src/util/prefilter.rs | 924 | ||||
-rw-r--r-- | third_party/rust/aho-corasick/src/util/primitives.rs | 759 | ||||
-rw-r--r-- | third_party/rust/aho-corasick/src/util/remapper.rs | 214 | ||||
-rw-r--r-- | third_party/rust/aho-corasick/src/util/search.rs | 1148 | ||||
-rw-r--r-- | third_party/rust/aho-corasick/src/util/special.rs | 42 |
12 files changed, 4459 insertions, 0 deletions
diff --git a/third_party/rust/aho-corasick/src/util/alphabet.rs b/third_party/rust/aho-corasick/src/util/alphabet.rs new file mode 100644 index 0000000000..69724fa3ab --- /dev/null +++ b/third_party/rust/aho-corasick/src/util/alphabet.rs @@ -0,0 +1,409 @@ +use crate::util::int::Usize; + +/// A representation of byte oriented equivalence classes. +/// +/// This is used in finite state machines to reduce the size of the transition +/// table. This can have a particularly large impact not only on the total size +/// of an FSM, but also on FSM build times because it reduces the number of +/// transitions that need to be visited/set. +#[derive(Clone, Copy)] +pub(crate) struct ByteClasses([u8; 256]); + +impl ByteClasses { + /// Creates a new set of equivalence classes where all bytes are mapped to + /// the same class. + pub(crate) fn empty() -> ByteClasses { + ByteClasses([0; 256]) + } + + /// Creates a new set of equivalence classes where each byte belongs to + /// its own equivalence class. + pub(crate) fn singletons() -> ByteClasses { + let mut classes = ByteClasses::empty(); + for b in 0..=255 { + classes.set(b, b); + } + classes + } + + /// Set the equivalence class for the given byte. + #[inline] + pub(crate) fn set(&mut self, byte: u8, class: u8) { + self.0[usize::from(byte)] = class; + } + + /// Get the equivalence class for the given byte. + #[inline] + pub(crate) fn get(&self, byte: u8) -> u8 { + self.0[usize::from(byte)] + } + + /// Return the total number of elements in the alphabet represented by + /// these equivalence classes. Equivalently, this returns the total number + /// of equivalence classes. + #[inline] + pub(crate) fn alphabet_len(&self) -> usize { + // Add one since the number of equivalence classes is one bigger than + // the last one. + usize::from(self.0[255]) + 1 + } + + /// Returns the stride, as a base-2 exponent, required for these + /// equivalence classes. 
+ /// + /// The stride is always the smallest power of 2 that is greater than or + /// equal to the alphabet length. This is done so that converting between + /// state IDs and indices can be done with shifts alone, which is much + /// faster than integer division. The "stride2" is the exponent. i.e., + /// `2^stride2 = stride`. + pub(crate) fn stride2(&self) -> usize { + let zeros = self.alphabet_len().next_power_of_two().trailing_zeros(); + usize::try_from(zeros).unwrap() + } + + /// Returns the stride for these equivalence classes, which corresponds + /// to the smallest power of 2 greater than or equal to the number of + /// equivalence classes. + pub(crate) fn stride(&self) -> usize { + 1 << self.stride2() + } + + /// Returns true if and only if every byte in this class maps to its own + /// equivalence class. Equivalently, there are 257 equivalence classes + /// and each class contains exactly one byte (plus the special EOI class). + #[inline] + pub(crate) fn is_singleton(&self) -> bool { + self.alphabet_len() == 256 + } + + /// Returns an iterator over all equivalence classes in this set. + pub(crate) fn iter(&self) -> ByteClassIter { + ByteClassIter { it: 0..self.alphabet_len() } + } + + /// Returns an iterator of the bytes in the given equivalence class. + pub(crate) fn elements(&self, class: u8) -> ByteClassElements { + ByteClassElements { classes: self, class, bytes: 0..=255 } + } + + /// Returns an iterator of byte ranges in the given equivalence class. + /// + /// That is, a sequence of contiguous ranges are returned. Typically, every + /// class maps to a single contiguous range. 
+ fn element_ranges(&self, class: u8) -> ByteClassElementRanges { + ByteClassElementRanges { elements: self.elements(class), range: None } + } +} + +impl core::fmt::Debug for ByteClasses { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + if self.is_singleton() { + write!(f, "ByteClasses(<one-class-per-byte>)") + } else { + write!(f, "ByteClasses(")?; + for (i, class) in self.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{:?} => [", class)?; + for (start, end) in self.element_ranges(class) { + if start == end { + write!(f, "{:?}", start)?; + } else { + write!(f, "{:?}-{:?}", start, end)?; + } + } + write!(f, "]")?; + } + write!(f, ")") + } + } +} + +/// An iterator over each equivalence class. +#[derive(Debug)] +pub(crate) struct ByteClassIter { + it: core::ops::Range<usize>, +} + +impl Iterator for ByteClassIter { + type Item = u8; + + fn next(&mut self) -> Option<u8> { + self.it.next().map(|class| class.as_u8()) + } +} + +/// An iterator over all elements in a specific equivalence class. +#[derive(Debug)] +pub(crate) struct ByteClassElements<'a> { + classes: &'a ByteClasses, + class: u8, + bytes: core::ops::RangeInclusive<u8>, +} + +impl<'a> Iterator for ByteClassElements<'a> { + type Item = u8; + + fn next(&mut self) -> Option<u8> { + while let Some(byte) = self.bytes.next() { + if self.class == self.classes.get(byte) { + return Some(byte); + } + } + None + } +} + +/// An iterator over all elements in an equivalence class expressed as a +/// sequence of contiguous ranges. 
+#[derive(Debug)] +pub(crate) struct ByteClassElementRanges<'a> { + elements: ByteClassElements<'a>, + range: Option<(u8, u8)>, +} + +impl<'a> Iterator for ByteClassElementRanges<'a> { + type Item = (u8, u8); + + fn next(&mut self) -> Option<(u8, u8)> { + loop { + let element = match self.elements.next() { + None => return self.range.take(), + Some(element) => element, + }; + match self.range.take() { + None => { + self.range = Some((element, element)); + } + Some((start, end)) => { + if usize::from(end) + 1 != usize::from(element) { + self.range = Some((element, element)); + return Some((start, end)); + } + self.range = Some((start, element)); + } + } + } + } +} + +/// A partitioning of bytes into equivalence classes. +/// +/// A byte class set keeps track of an *approximation* of equivalence classes +/// of bytes during NFA construction. That is, every byte in an equivalence +/// class cannot discriminate between a match and a non-match. +/// +/// Note that this may not compute the minimal set of equivalence classes. +/// Basically, any byte in a pattern given to the noncontiguous NFA builder +/// will automatically be treated as its own equivalence class. All other +/// bytes---any byte not in any pattern---will be treated as their own +/// equivalence classes. In theory, all bytes not in any pattern should +/// be part of a single equivalence class, but in practice, we only treat +/// contiguous ranges of bytes as an equivalence class. So the number of +/// classes computed may be bigger than necessary. This usually doesn't make +/// much of a difference, and keeps the implementation simple. +#[derive(Clone, Debug)] +pub(crate) struct ByteClassSet(ByteSet); + +impl Default for ByteClassSet { + fn default() -> ByteClassSet { + ByteClassSet::empty() + } +} + +impl ByteClassSet { + /// Create a new set of byte classes where all bytes are part of the same + /// equivalence class. 
+ pub(crate) fn empty() -> Self { + ByteClassSet(ByteSet::empty()) + } + + /// Indicate the the range of byte given (inclusive) can discriminate a + /// match between it and all other bytes outside of the range. + pub(crate) fn set_range(&mut self, start: u8, end: u8) { + debug_assert!(start <= end); + if start > 0 { + self.0.add(start - 1); + } + self.0.add(end); + } + + /// Convert this boolean set to a map that maps all byte values to their + /// corresponding equivalence class. The last mapping indicates the largest + /// equivalence class identifier (which is never bigger than 255). + pub(crate) fn byte_classes(&self) -> ByteClasses { + let mut classes = ByteClasses::empty(); + let mut class = 0u8; + let mut b = 0u8; + loop { + classes.set(b, class); + if b == 255 { + break; + } + if self.0.contains(b) { + class = class.checked_add(1).unwrap(); + } + b = b.checked_add(1).unwrap(); + } + classes + } +} + +/// A simple set of bytes that is reasonably cheap to copy and allocation free. +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub(crate) struct ByteSet { + bits: BitSet, +} + +/// The representation of a byte set. Split out so that we can define a +/// convenient Debug impl for it while keeping "ByteSet" in the output. +#[derive(Clone, Copy, Default, Eq, PartialEq)] +struct BitSet([u128; 2]); + +impl ByteSet { + /// Create an empty set of bytes. + pub(crate) fn empty() -> ByteSet { + ByteSet { bits: BitSet([0; 2]) } + } + + /// Add a byte to this set. + /// + /// If the given byte already belongs to this set, then this is a no-op. + pub(crate) fn add(&mut self, byte: u8) { + let bucket = byte / 128; + let bit = byte % 128; + self.bits.0[usize::from(bucket)] |= 1 << bit; + } + + /// Return true if and only if the given byte is in this set. 
+ pub(crate) fn contains(&self, byte: u8) -> bool { + let bucket = byte / 128; + let bit = byte % 128; + self.bits.0[usize::from(bucket)] & (1 << bit) > 0 + } +} + +impl core::fmt::Debug for BitSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut fmtd = f.debug_set(); + for b in 0u8..=255 { + if (ByteSet { bits: *self }).contains(b) { + fmtd.entry(&b); + } + } + fmtd.finish() + } +} + +#[cfg(test)] +mod tests { + use alloc::{vec, vec::Vec}; + + use super::*; + + #[test] + fn byte_classes() { + let mut set = ByteClassSet::empty(); + set.set_range(b'a', b'z'); + + let classes = set.byte_classes(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(b'a' - 1), 0); + assert_eq!(classes.get(b'a'), 1); + assert_eq!(classes.get(b'm'), 1); + assert_eq!(classes.get(b'z'), 1); + assert_eq!(classes.get(b'z' + 1), 2); + assert_eq!(classes.get(254), 2); + assert_eq!(classes.get(255), 2); + + let mut set = ByteClassSet::empty(); + set.set_range(0, 2); + set.set_range(4, 6); + let classes = set.byte_classes(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(3), 1); + assert_eq!(classes.get(4), 2); + assert_eq!(classes.get(5), 2); + assert_eq!(classes.get(6), 2); + assert_eq!(classes.get(7), 3); + assert_eq!(classes.get(255), 3); + } + + #[test] + fn full_byte_classes() { + let mut set = ByteClassSet::empty(); + for b in 0u8..=255 { + set.set_range(b, b); + } + assert_eq!(set.byte_classes().alphabet_len(), 256); + } + + #[test] + fn elements_typical() { + let mut set = ByteClassSet::empty(); + set.set_range(b'b', b'd'); + set.set_range(b'g', b'm'); + set.set_range(b'z', b'z'); + let classes = set.byte_classes(); + // class 0: \x00-a + // class 1: b-d + // class 2: e-f + // class 3: g-m + // class 4: n-y + // class 5: z-z + // class 6: \x7B-\xFF + assert_eq!(classes.alphabet_len(), 7); + + let elements = 
classes.elements(0).collect::<Vec<_>>(); + assert_eq!(elements.len(), 98); + assert_eq!(elements[0], b'\x00'); + assert_eq!(elements[97], b'a'); + + let elements = classes.elements(1).collect::<Vec<_>>(); + assert_eq!(elements, vec![b'b', b'c', b'd'],); + + let elements = classes.elements(2).collect::<Vec<_>>(); + assert_eq!(elements, vec![b'e', b'f'],); + + let elements = classes.elements(3).collect::<Vec<_>>(); + assert_eq!(elements, vec![b'g', b'h', b'i', b'j', b'k', b'l', b'm',],); + + let elements = classes.elements(4).collect::<Vec<_>>(); + assert_eq!(elements.len(), 12); + assert_eq!(elements[0], b'n'); + assert_eq!(elements[11], b'y'); + + let elements = classes.elements(5).collect::<Vec<_>>(); + assert_eq!(elements, vec![b'z']); + + let elements = classes.elements(6).collect::<Vec<_>>(); + assert_eq!(elements.len(), 133); + assert_eq!(elements[0], b'\x7B'); + assert_eq!(elements[132], b'\xFF'); + } + + #[test] + fn elements_singletons() { + let classes = ByteClasses::singletons(); + assert_eq!(classes.alphabet_len(), 256); + + let elements = classes.elements(b'a').collect::<Vec<_>>(); + assert_eq!(elements, vec![b'a']); + } + + #[test] + fn elements_empty() { + let classes = ByteClasses::empty(); + assert_eq!(classes.alphabet_len(), 1); + + let elements = classes.elements(0).collect::<Vec<_>>(); + assert_eq!(elements.len(), 256); + assert_eq!(elements[0], b'\x00'); + assert_eq!(elements[255], b'\xFF'); + } +} diff --git a/third_party/rust/aho-corasick/src/util/buffer.rs b/third_party/rust/aho-corasick/src/util/buffer.rs new file mode 100644 index 0000000000..e9e982af58 --- /dev/null +++ b/third_party/rust/aho-corasick/src/util/buffer.rs @@ -0,0 +1,124 @@ +use alloc::{vec, vec::Vec}; + +/// The default buffer capacity that we use for the stream buffer. +const DEFAULT_BUFFER_CAPACITY: usize = 64 * (1 << 10); // 64 KB + +/// A fairly simple roll buffer for supporting stream searches. 
+/// +/// This buffer acts as a temporary place to store a fixed amount of data when +/// reading from a stream. Its central purpose is to allow "rolling" some +/// suffix of the data to the beginning of the buffer before refilling it with +/// more data from the stream. For example, let's say we are trying to match +/// "foobar" on a stream. When we report the match, we'd like to not only +/// report the correct offsets at which the match occurs, but also the matching +/// bytes themselves. So let's say our stream is a file with the following +/// contents: `test test foobar test test`. Now assume that we happen to read +/// the aforementioned file in two chunks: `test test foo` and `bar test test`. +/// Naively, it would not be possible to report a single contiguous `foobar` +/// match, but this roll buffer allows us to do that. Namely, after the second +/// read, the contents of the buffer should be `st foobar test test`, where the +/// search should ultimately resume immediately after `foo`. (The prefix `st ` +/// is included because the roll buffer saves N bytes at the end of the buffer, +/// where N is the maximum possible length of a match.) +/// +/// A lot of the logic for dealing with this is unfortunately split out between +/// this roll buffer and the `StreamChunkIter`. +/// +/// Note also that this buffer is not actually required to just report matches. +/// Because a `Match` is just some offsets. But it *is* required for supporting +/// things like `try_stream_replace_all` because that needs some mechanism for +/// knowing which bytes in the stream correspond to a match and which don't. So +/// when a match occurs across two `read` calls, *something* needs to retain +/// the bytes from the previous `read` call because you don't know before the +/// second read call whether a match exists or not. +#[derive(Debug)] +pub(crate) struct Buffer { + /// The raw buffer contents. This has a fixed size and never increases. 
+ buf: Vec<u8>, + /// The minimum size of the buffer, which is equivalent to the maximum + /// possible length of a match. This corresponds to the amount that we + /// roll + min: usize, + /// The end of the contents of this buffer. + end: usize, +} + +impl Buffer { + /// Create a new buffer for stream searching. The minimum buffer length + /// given should be the size of the maximum possible match length. + pub(crate) fn new(min_buffer_len: usize) -> Buffer { + let min = core::cmp::max(1, min_buffer_len); + // The minimum buffer amount is also the amount that we roll our + // buffer in order to support incremental searching. To this end, + // our actual capacity needs to be at least 1 byte bigger than our + // minimum amount, otherwise we won't have any overlap. In actuality, + // we want our buffer to be a bit bigger than that for performance + // reasons, so we set a lower bound of `8 * min`. + // + // TODO: It would be good to find a way to test the streaming + // implementation with the minimal buffer size. For now, we just + // uncomment out the next line and comment out the subsequent line. + // let capacity = 1 + min; + let capacity = core::cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY); + Buffer { buf: vec![0; capacity], min, end: 0 } + } + + /// Return the contents of this buffer. + #[inline] + pub(crate) fn buffer(&self) -> &[u8] { + &self.buf[..self.end] + } + + /// Return the minimum size of the buffer. The only way a buffer may be + /// smaller than this is if the stream itself contains less than the + /// minimum buffer amount. + #[inline] + pub(crate) fn min_buffer_len(&self) -> usize { + self.min + } + + /// Return all free capacity in this buffer. + fn free_buffer(&mut self) -> &mut [u8] { + &mut self.buf[self.end..] + } + + /// Refill the contents of this buffer by reading as much as possible into + /// this buffer's free capacity. If no more bytes could be read, then this + /// returns false. 
Otherwise, this reads until it has filled the buffer + /// past the minimum amount. + pub(crate) fn fill<R: std::io::Read>( + &mut self, + mut rdr: R, + ) -> std::io::Result<bool> { + let mut readany = false; + loop { + let readlen = rdr.read(self.free_buffer())?; + if readlen == 0 { + return Ok(readany); + } + readany = true; + self.end += readlen; + if self.buffer().len() >= self.min { + return Ok(true); + } + } + } + + /// Roll the contents of the buffer so that the suffix of this buffer is + /// moved to the front and all other contents are dropped. The size of the + /// suffix corresponds precisely to the minimum buffer length. + /// + /// This should only be called when the entire contents of this buffer have + /// been searched. + pub(crate) fn roll(&mut self) { + let roll_start = self + .end + .checked_sub(self.min) + .expect("buffer capacity should be bigger than minimum amount"); + let roll_end = roll_start + self.min; + + assert!(roll_end <= self.end); + self.buf.copy_within(roll_start..roll_end, 0); + self.end = self.min; + } +} diff --git a/third_party/rust/aho-corasick/src/util/byte_frequencies.rs b/third_party/rust/aho-corasick/src/util/byte_frequencies.rs new file mode 100644 index 0000000000..c313b629db --- /dev/null +++ b/third_party/rust/aho-corasick/src/util/byte_frequencies.rs @@ -0,0 +1,258 @@ +pub const BYTE_FREQUENCIES: [u8; 256] = [ + 55, // '\x00' + 52, // '\x01' + 51, // '\x02' + 50, // '\x03' + 49, // '\x04' + 48, // '\x05' + 47, // '\x06' + 46, // '\x07' + 45, // '\x08' + 103, // '\t' + 242, // '\n' + 66, // '\x0b' + 67, // '\x0c' + 229, // '\r' + 44, // '\x0e' + 43, // '\x0f' + 42, // '\x10' + 41, // '\x11' + 40, // '\x12' + 39, // '\x13' + 38, // '\x14' + 37, // '\x15' + 36, // '\x16' + 35, // '\x17' + 34, // '\x18' + 33, // '\x19' + 56, // '\x1a' + 32, // '\x1b' + 31, // '\x1c' + 30, // '\x1d' + 29, // '\x1e' + 28, // '\x1f' + 255, // ' ' + 148, // '!' 
+ 164, // '"' + 149, // '#' + 136, // '$' + 160, // '%' + 155, // '&' + 173, // "'" + 221, // '(' + 222, // ')' + 134, // '*' + 122, // '+' + 232, // ',' + 202, // '-' + 215, // '.' + 224, // '/' + 208, // '0' + 220, // '1' + 204, // '2' + 187, // '3' + 183, // '4' + 179, // '5' + 177, // '6' + 168, // '7' + 178, // '8' + 200, // '9' + 226, // ':' + 195, // ';' + 154, // '<' + 184, // '=' + 174, // '>' + 126, // '?' + 120, // '@' + 191, // 'A' + 157, // 'B' + 194, // 'C' + 170, // 'D' + 189, // 'E' + 162, // 'F' + 161, // 'G' + 150, // 'H' + 193, // 'I' + 142, // 'J' + 137, // 'K' + 171, // 'L' + 176, // 'M' + 185, // 'N' + 167, // 'O' + 186, // 'P' + 112, // 'Q' + 175, // 'R' + 192, // 'S' + 188, // 'T' + 156, // 'U' + 140, // 'V' + 143, // 'W' + 123, // 'X' + 133, // 'Y' + 128, // 'Z' + 147, // '[' + 138, // '\\' + 146, // ']' + 114, // '^' + 223, // '_' + 151, // '`' + 249, // 'a' + 216, // 'b' + 238, // 'c' + 236, // 'd' + 253, // 'e' + 227, // 'f' + 218, // 'g' + 230, // 'h' + 247, // 'i' + 135, // 'j' + 180, // 'k' + 241, // 'l' + 233, // 'm' + 246, // 'n' + 244, // 'o' + 231, // 'p' + 139, // 'q' + 245, // 'r' + 243, // 's' + 251, // 't' + 235, // 'u' + 201, // 'v' + 196, // 'w' + 240, // 'x' + 214, // 'y' + 152, // 'z' + 182, // '{' + 205, // '|' + 181, // '}' + 127, // '~' + 27, // '\x7f' + 212, // '\x80' + 211, // '\x81' + 210, // '\x82' + 213, // '\x83' + 228, // '\x84' + 197, // '\x85' + 169, // '\x86' + 159, // '\x87' + 131, // '\x88' + 172, // '\x89' + 105, // '\x8a' + 80, // '\x8b' + 98, // '\x8c' + 96, // '\x8d' + 97, // '\x8e' + 81, // '\x8f' + 207, // '\x90' + 145, // '\x91' + 116, // '\x92' + 115, // '\x93' + 144, // '\x94' + 130, // '\x95' + 153, // '\x96' + 121, // '\x97' + 107, // '\x98' + 132, // '\x99' + 109, // '\x9a' + 110, // '\x9b' + 124, // '\x9c' + 111, // '\x9d' + 82, // '\x9e' + 108, // '\x9f' + 118, // '\xa0' + 141, // '¡' + 113, // '¢' + 129, // '£' + 119, // '¤' + 125, // '¥' + 165, // '¦' + 117, // '§' + 92, // '¨' + 106, // '©' 
+ 83, // 'ª' + 72, // '«' + 99, // '¬' + 93, // '\xad' + 65, // '®' + 79, // '¯' + 166, // '°' + 237, // '±' + 163, // '²' + 199, // '³' + 190, // '´' + 225, // 'µ' + 209, // '¶' + 203, // '·' + 198, // '¸' + 217, // '¹' + 219, // 'º' + 206, // '»' + 234, // '¼' + 248, // '½' + 158, // '¾' + 239, // '¿' + 255, // 'À' + 255, // 'Á' + 255, // 'Â' + 255, // 'Ã' + 255, // 'Ä' + 255, // 'Å' + 255, // 'Æ' + 255, // 'Ç' + 255, // 'È' + 255, // 'É' + 255, // 'Ê' + 255, // 'Ë' + 255, // 'Ì' + 255, // 'Í' + 255, // 'Î' + 255, // 'Ï' + 255, // 'Ð' + 255, // 'Ñ' + 255, // 'Ò' + 255, // 'Ó' + 255, // 'Ô' + 255, // 'Õ' + 255, // 'Ö' + 255, // '×' + 255, // 'Ø' + 255, // 'Ù' + 255, // 'Ú' + 255, // 'Û' + 255, // 'Ü' + 255, // 'Ý' + 255, // 'Þ' + 255, // 'ß' + 255, // 'à' + 255, // 'á' + 255, // 'â' + 255, // 'ã' + 255, // 'ä' + 255, // 'å' + 255, // 'æ' + 255, // 'ç' + 255, // 'è' + 255, // 'é' + 255, // 'ê' + 255, // 'ë' + 255, // 'ì' + 255, // 'í' + 255, // 'î' + 255, // 'ï' + 255, // 'ð' + 255, // 'ñ' + 255, // 'ò' + 255, // 'ó' + 255, // 'ô' + 255, // 'õ' + 255, // 'ö' + 255, // '÷' + 255, // 'ø' + 255, // 'ù' + 255, // 'ú' + 255, // 'û' + 255, // 'ü' + 255, // 'ý' + 255, // 'þ' + 255, // 'ÿ' +]; diff --git a/third_party/rust/aho-corasick/src/util/debug.rs b/third_party/rust/aho-corasick/src/util/debug.rs new file mode 100644 index 0000000000..22b5f2231f --- /dev/null +++ b/third_party/rust/aho-corasick/src/util/debug.rs @@ -0,0 +1,26 @@ +/// A type that wraps a single byte with a convenient fmt::Debug impl that +/// escapes the byte. +pub(crate) struct DebugByte(pub(crate) u8); + +impl core::fmt::Debug for DebugByte { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + // Special case ASCII space. It's too hard to read otherwise, so + // put quotes around it. I sometimes wonder whether just '\x20' would + // be better... + if self.0 == b' ' { + return write!(f, "' '"); + } + // 10 bytes is enough to cover any output from ascii::escape_default. 
+ let mut bytes = [0u8; 10]; + let mut len = 0; + for (i, mut b) in core::ascii::escape_default(self.0).enumerate() { + // capitalize \xab to \xAB + if i >= 2 && b'a' <= b && b <= b'f' { + b -= 32; + } + bytes[len] = b; + len += 1; + } + write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap()) + } +} diff --git a/third_party/rust/aho-corasick/src/util/error.rs b/third_party/rust/aho-corasick/src/util/error.rs new file mode 100644 index 0000000000..326d04657b --- /dev/null +++ b/third_party/rust/aho-corasick/src/util/error.rs @@ -0,0 +1,259 @@ +use crate::util::{ + primitives::{PatternID, SmallIndex}, + search::MatchKind, +}; + +/// An error that occurred during the construction of an Aho-Corasick +/// automaton. +/// +/// Build errors occur when some kind of limit has been exceeded, either in the +/// number of states, the number of patterns of the length of a pattern. These +/// limits aren't part of the public API, but they should generally be large +/// enough to handle most use cases. +/// +/// When the `std` feature is enabled, this implements the `std::error::Error` +/// trait. +#[derive(Clone, Debug)] +pub struct BuildError { + kind: ErrorKind, +} + +/// The kind of error that occurred. +#[derive(Clone, Debug)] +enum ErrorKind { + /// An error that occurs when allocating a new state would result in an + /// identifier that exceeds the capacity of a `StateID`. + StateIDOverflow { + /// The maximum possible id. + max: u64, + /// The maximum ID requested. + requested_max: u64, + }, + /// An error that occurs when adding a pattern to an Aho-Corasick + /// automaton would result in an identifier that exceeds the capacity of a + /// `PatternID`. + PatternIDOverflow { + /// The maximum possible id. + max: u64, + /// The maximum ID requested. + requested_max: u64, + }, + /// Occurs when a pattern string is given to the Aho-Corasick constructor + /// that is too long. + PatternTooLong { + /// The ID of the pattern that was too long. 
+ pattern: PatternID, + /// The length that was too long. + len: usize, + }, +} + +impl BuildError { + pub(crate) fn state_id_overflow( + max: u64, + requested_max: u64, + ) -> BuildError { + BuildError { kind: ErrorKind::StateIDOverflow { max, requested_max } } + } + + pub(crate) fn pattern_id_overflow( + max: u64, + requested_max: u64, + ) -> BuildError { + BuildError { + kind: ErrorKind::PatternIDOverflow { max, requested_max }, + } + } + + pub(crate) fn pattern_too_long( + pattern: PatternID, + len: usize, + ) -> BuildError { + BuildError { kind: ErrorKind::PatternTooLong { pattern, len } } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for BuildError {} + +impl core::fmt::Display for BuildError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self.kind { + ErrorKind::StateIDOverflow { max, requested_max } => { + write!( + f, + "state identifier overflow: failed to create state ID \ + from {}, which exceeds the max of {}", + requested_max, max, + ) + } + ErrorKind::PatternIDOverflow { max, requested_max } => { + write!( + f, + "pattern identifier overflow: failed to create pattern ID \ + from {}, which exceeds the max of {}", + requested_max, max, + ) + } + ErrorKind::PatternTooLong { pattern, len } => { + write!( + f, + "pattern {} with length {} exceeds \ + the maximum pattern length of {}", + pattern.as_usize(), + len, + SmallIndex::MAX.as_usize(), + ) + } + } + } +} + +/// An error that occurred during an Aho-Corasick search. +/// +/// An error that occurs during a search is limited to some kind of +/// misconfiguration that resulted in an illegal call. Stated differently, +/// whether an error occurs is not dependent on the specific bytes in the +/// haystack. +/// +/// Examples of misconfiguration: +/// +/// * Executing a stream or overlapping search on a searcher that was built was +/// something other than [`MatchKind::Standard`](crate::MatchKind::Standard) +/// semantics. 
+/// * Requested an anchored or an unanchored search on a searcher that doesn't +/// support unanchored or anchored searches, respectively. +/// +/// When the `std` feature is enabled, this implements the `std::error::Error` +/// trait. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct MatchError(alloc::boxed::Box<MatchErrorKind>); + +impl MatchError { + /// Create a new error value with the given kind. + /// + /// This is a more verbose version of the kind-specific constructors, e.g., + /// `MatchError::unsupported_stream`. + pub fn new(kind: MatchErrorKind) -> MatchError { + MatchError(alloc::boxed::Box::new(kind)) + } + + /// Returns a reference to the underlying error kind. + pub fn kind(&self) -> &MatchErrorKind { + &self.0 + } + + /// Create a new "invalid anchored search" error. This occurs when the + /// caller requests an anchored search but where anchored searches aren't + /// supported. + /// + /// This is the same as calling `MatchError::new` with a + /// [`MatchErrorKind::InvalidInputAnchored`] kind. + pub fn invalid_input_anchored() -> MatchError { + MatchError::new(MatchErrorKind::InvalidInputAnchored) + } + + /// Create a new "invalid unanchored search" error. This occurs when the + /// caller requests an unanchored search but where unanchored searches + /// aren't supported. + /// + /// This is the same as calling `MatchError::new` with a + /// [`MatchErrorKind::InvalidInputUnanchored`] kind. + pub fn invalid_input_unanchored() -> MatchError { + MatchError::new(MatchErrorKind::InvalidInputUnanchored) + } + + /// Create a new "unsupported stream search" error. This occurs when the + /// caller requests a stream search while using an Aho-Corasick automaton + /// with a match kind other than [`MatchKind::Standard`]. + /// + /// The match kind given should be the match kind of the automaton. It + /// should never be `MatchKind::Standard`. 
+ pub fn unsupported_stream(got: MatchKind) -> MatchError { + MatchError::new(MatchErrorKind::UnsupportedStream { got }) + } + + /// Create a new "unsupported overlapping search" error. This occurs when + /// the caller requests an overlapping search while using an Aho-Corasick + /// automaton with a match kind other than [`MatchKind::Standard`]. + /// + /// The match kind given should be the match kind of the automaton. It + /// should never be `MatchKind::Standard`. + pub fn unsupported_overlapping(got: MatchKind) -> MatchError { + MatchError::new(MatchErrorKind::UnsupportedOverlapping { got }) + } + + /// Create a new "unsupported empty pattern" error. This occurs when the + /// caller requests a search for which matching an automaton that contains + /// an empty pattern string is not supported. + pub fn unsupported_empty() -> MatchError { + MatchError::new(MatchErrorKind::UnsupportedEmpty) + } +} + +/// The underlying kind of a [`MatchError`]. +/// +/// This is a **non-exhaustive** enum. That means new variants may be added in +/// a semver-compatible release. +#[non_exhaustive] +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum MatchErrorKind { + /// An error indicating that an anchored search was requested, but from a + /// searcher that was built without anchored support. + InvalidInputAnchored, + /// An error indicating that an unanchored search was requested, but from a + /// searcher that was built without unanchored support. + InvalidInputUnanchored, + /// An error indicating that a stream search was attempted on an + /// Aho-Corasick automaton with an unsupported `MatchKind`. + UnsupportedStream { + /// The match semantics for the automaton that was used. + got: MatchKind, + }, + /// An error indicating that an overlapping search was attempted on an + /// Aho-Corasick automaton with an unsupported `MatchKind`. + UnsupportedOverlapping { + /// The match semantics for the automaton that was used. 
+ got: MatchKind, + }, + /// An error indicating that the operation requested doesn't support + /// automatons that contain an empty pattern string. + UnsupportedEmpty, +} + +#[cfg(feature = "std")] +impl std::error::Error for MatchError {} + +impl core::fmt::Display for MatchError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + match *self.kind() { + MatchErrorKind::InvalidInputAnchored => { + write!(f, "anchored searches are not supported or enabled") + } + MatchErrorKind::InvalidInputUnanchored => { + write!(f, "unanchored searches are not supported or enabled") + } + MatchErrorKind::UnsupportedStream { got } => { + write!( + f, + "match kind {:?} does not support stream searching", + got, + ) + } + MatchErrorKind::UnsupportedOverlapping { got } => { + write!( + f, + "match kind {:?} does not support overlapping searches", + got, + ) + } + MatchErrorKind::UnsupportedEmpty => { + write!( + f, + "matching with an empty pattern string is not \ + supported for this operation", + ) + } + } + } +} diff --git a/third_party/rust/aho-corasick/src/util/int.rs b/third_party/rust/aho-corasick/src/util/int.rs new file mode 100644 index 0000000000..28ede7a47f --- /dev/null +++ b/third_party/rust/aho-corasick/src/util/int.rs @@ -0,0 +1,284 @@ +/*! +This module provides several integer oriented traits for converting between +both fixed size integers and integers whose size varies based on the target +(like `usize`). + +The main design principle for this module is to centralize all uses of `as`. +The thinking here is that `as` makes it very easy to perform accidental lossy +conversions, and if we centralize all its uses here under more descriptive +higher level operations, its use and correctness becomes easier to audit. + +This was copied mostly wholesale from `regex-automata`. + +NOTE: for simplicity, we don't take target pointer width into account here for +`usize` conversions. 
Since we currently only panic in debug mode, skipping the +check when it can be proven it isn't needed at compile time doesn't really +matter. Now, if we wind up wanting to do as many checks as possible in release +mode, then we would want to skip those when we know the conversions are always +non-lossy. +*/ + +pub(crate) trait U8 { + fn as_usize(self) -> usize; +} + +impl U8 for u8 { + fn as_usize(self) -> usize { + usize::from(self) + } +} + +pub(crate) trait U16 { + fn as_usize(self) -> usize; + fn low_u8(self) -> u8; + fn high_u8(self) -> u8; +} + +impl U16 for u16 { + fn as_usize(self) -> usize { + usize::from(self) + } + + fn low_u8(self) -> u8 { + self as u8 + } + + fn high_u8(self) -> u8 { + (self >> 8) as u8 + } +} + +pub(crate) trait U32 { + fn as_usize(self) -> usize; + fn low_u8(self) -> u8; + fn low_u16(self) -> u16; + fn high_u16(self) -> u16; +} + +impl U32 for u32 { + #[inline] + fn as_usize(self) -> usize { + #[cfg(debug_assertions)] + { + usize::try_from(self).expect("u32 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn low_u8(self) -> u8 { + self as u8 + } + + fn low_u16(self) -> u16 { + self as u16 + } + + fn high_u16(self) -> u16 { + (self >> 16) as u16 + } +} + +pub(crate) trait U64 { + fn as_usize(self) -> usize; + fn low_u8(self) -> u8; + fn low_u16(self) -> u16; + fn low_u32(self) -> u32; + fn high_u32(self) -> u32; +} + +impl U64 for u64 { + fn as_usize(self) -> usize { + #[cfg(debug_assertions)] + { + usize::try_from(self).expect("u64 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn low_u8(self) -> u8 { + self as u8 + } + + fn low_u16(self) -> u16 { + self as u16 + } + + fn low_u32(self) -> u32 { + self as u32 + } + + fn high_u32(self) -> u32 { + (self >> 32) as u32 + } +} + +pub(crate) trait I8 { + fn as_usize(self) -> usize; + fn to_bits(self) -> u8; + fn from_bits(n: u8) -> i8; +} + +impl I8 for i8 { + fn as_usize(self) -> usize { + 
#[cfg(debug_assertions)] + { + usize::try_from(self).expect("i8 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn to_bits(self) -> u8 { + self as u8 + } + + fn from_bits(n: u8) -> i8 { + n as i8 + } +} + +pub(crate) trait I32 { + fn as_usize(self) -> usize; + fn to_bits(self) -> u32; + fn from_bits(n: u32) -> i32; +} + +impl I32 for i32 { + fn as_usize(self) -> usize { + #[cfg(debug_assertions)] + { + usize::try_from(self).expect("i32 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn to_bits(self) -> u32 { + self as u32 + } + + fn from_bits(n: u32) -> i32 { + n as i32 + } +} + +pub(crate) trait I64 { + fn as_usize(self) -> usize; + fn to_bits(self) -> u64; + fn from_bits(n: u64) -> i64; +} + +impl I64 for i64 { + fn as_usize(self) -> usize { + #[cfg(debug_assertions)] + { + usize::try_from(self).expect("i64 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn to_bits(self) -> u64 { + self as u64 + } + + fn from_bits(n: u64) -> i64 { + n as i64 + } +} + +pub(crate) trait Usize { + fn as_u8(self) -> u8; + fn as_u16(self) -> u16; + fn as_u32(self) -> u32; + fn as_u64(self) -> u64; +} + +impl Usize for usize { + fn as_u8(self) -> u8 { + #[cfg(debug_assertions)] + { + u8::try_from(self).expect("usize overflowed u8") + } + #[cfg(not(debug_assertions))] + { + self as u8 + } + } + + fn as_u16(self) -> u16 { + #[cfg(debug_assertions)] + { + u16::try_from(self).expect("usize overflowed u16") + } + #[cfg(not(debug_assertions))] + { + self as u16 + } + } + + fn as_u32(self) -> u32 { + #[cfg(debug_assertions)] + { + u32::try_from(self).expect("usize overflowed u32") + } + #[cfg(not(debug_assertions))] + { + self as u32 + } + } + + fn as_u64(self) -> u64 { + #[cfg(debug_assertions)] + { + u64::try_from(self).expect("usize overflowed u64") + } + #[cfg(not(debug_assertions))] + { + self as u64 + } + } +} + +// Pointers aren't integers, but we convert pointers to 
integers to perform +// offset arithmetic in some places. (And no, we don't convert the integers +// back to pointers.) So add 'as_usize' conversions here too for completeness. +// +// These 'as' casts are actually okay because they're always non-lossy. But the +// idea here is to just try and remove as much 'as' as possible, particularly +// in this crate where we are being really paranoid about offsets and making +// sure we don't panic on inputs that might be untrusted. This way, the 'as' +// casts become easier to audit if they're all in one place, even when some of +// them are actually okay 100% of the time. + +pub(crate) trait Pointer { + fn as_usize(self) -> usize; +} + +impl<T> Pointer for *const T { + fn as_usize(self) -> usize { + self as usize + } +} + +pub(crate) trait PointerMut { + fn as_usize(self) -> usize; +} + +impl<T> PointerMut for *mut T { + fn as_usize(self) -> usize { + self as usize + } +} diff --git a/third_party/rust/aho-corasick/src/util/mod.rs b/third_party/rust/aho-corasick/src/util/mod.rs new file mode 100644 index 0000000000..f7a1ddd07b --- /dev/null +++ b/third_party/rust/aho-corasick/src/util/mod.rs @@ -0,0 +1,12 @@ +pub(crate) mod alphabet; +#[cfg(feature = "std")] +pub(crate) mod buffer; +pub(crate) mod byte_frequencies; +pub(crate) mod debug; +pub(crate) mod error; +pub(crate) mod int; +pub(crate) mod prefilter; +pub(crate) mod primitives; +pub(crate) mod remapper; +pub(crate) mod search; +pub(crate) mod special; diff --git a/third_party/rust/aho-corasick/src/util/prefilter.rs b/third_party/rust/aho-corasick/src/util/prefilter.rs new file mode 100644 index 0000000000..f5ddc75b7c --- /dev/null +++ b/third_party/rust/aho-corasick/src/util/prefilter.rs @@ -0,0 +1,924 @@ +use core::{ + cmp, + fmt::Debug, + panic::{RefUnwindSafe, UnwindSafe}, + u8, +}; + +use alloc::{sync::Arc, vec, vec::Vec}; + +use crate::{ + packed, + util::{ + alphabet::ByteSet, + search::{Match, MatchKind, Span}, + }, +}; + +/// A prefilter for accelerating a 
search. +/// +/// This crate uses prefilters in the core search implementations to accelerate +/// common cases. They typically only apply to cases where there are a small +/// number of patterns (less than 100 or so), but when they do, thoughput can +/// be boosted considerably, perhaps by an order of magnitude. When a prefilter +/// is active, it is used whenever a search enters an automaton's start state. +/// +/// Currently, prefilters cannot be constructed by +/// callers. A `Prefilter` can only be accessed via the +/// [`Automaton::prefilter`](crate::automaton::Automaton::prefilter) +/// method and used to execute a search. In other words, a prefilter can be +/// used to optimize your own search implementation if necessary, but cannot do +/// much else. If you have a use case for more APIs, please submit an issue. +#[derive(Clone, Debug)] +pub struct Prefilter { + finder: Arc<dyn PrefilterI>, + memory_usage: usize, +} + +impl Prefilter { + /// Execute a search in the haystack within the span given. If a match or + /// a possible match is returned, then it is guaranteed to occur within + /// the bounds of the span. + /// + /// If the span provided is invalid for the given haystack, then behavior + /// is unspecified. + #[inline] + pub fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + self.finder.find_in(haystack, span) + } + + #[inline] + pub(crate) fn memory_usage(&self) -> usize { + self.memory_usage + } +} + +/// A candidate is the result of running a prefilter on a haystack at a +/// particular position. +/// +/// The result is either no match, a confirmed match or a possible match. +/// +/// When no match is returned, the prefilter is guaranteeing that no possible +/// match can be found in the haystack, and the caller may trust this. That is, +/// all correct prefilters must never report false negatives. 
+/// +/// In some cases, a prefilter can confirm a match very quickly, in which case, +/// the caller may use this to stop what it's doing and report the match. In +/// this case, prefilter implementations must never report a false positive. +/// In other cases, the prefilter can only report a potential match, in which +/// case the callers must attempt to confirm the match. In this case, prefilter +/// implementations are permitted to return false positives. +#[derive(Clone, Debug)] +pub enum Candidate { + /// No match was found. Since false negatives are not possible, this means + /// the search can quit as it is guaranteed not to find another match. + None, + /// A confirmed match was found. Callers do not need to confirm it. + Match(Match), + /// The start of a possible match was found. Callers must confirm it before + /// reporting it as a match. + PossibleStartOfMatch(usize), +} + +impl Candidate { + /// Convert this candidate into an option. This is useful when callers + /// do not distinguish between true positives and false positives (i.e., + /// the caller must always confirm the match). + pub fn into_option(self) -> Option<usize> { + match self { + Candidate::None => None, + Candidate::Match(ref m) => Some(m.start()), + Candidate::PossibleStartOfMatch(start) => Some(start), + } + } +} + +/// A prefilter describes the behavior of fast literal scanners for quickly +/// skipping past bytes in the haystack that we know cannot possibly +/// participate in a match. +trait PrefilterI: + Send + Sync + RefUnwindSafe + UnwindSafe + Debug + 'static +{ + /// Returns the next possible match candidate. This may yield false + /// positives, so callers must confirm a match starting at the position + /// returned. This, however, must never produce false negatives. That is, + /// this must, at minimum, return the starting position of the next match + /// in the given haystack after or at the given position. 
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate; +} + +impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> { + #[inline(always)] + fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + (**self).find_in(haystack, span) + } +} + +/// A builder for constructing the best possible prefilter. When constructed, +/// this builder will heuristically select the best prefilter it can build, +/// if any, and discard the rest. +#[derive(Debug)] +pub(crate) struct Builder { + count: usize, + ascii_case_insensitive: bool, + start_bytes: StartBytesBuilder, + rare_bytes: RareBytesBuilder, + memmem: MemmemBuilder, + packed: Option<packed::Builder>, + // If we run across a condition that suggests we shouldn't use a prefilter + // at all (like an empty pattern), then disable prefilters entirely. + enabled: bool, +} + +impl Builder { + /// Create a new builder for constructing the best possible prefilter. + pub(crate) fn new(kind: MatchKind) -> Builder { + let pbuilder = kind + .as_packed() + .map(|kind| packed::Config::new().match_kind(kind).builder()); + Builder { + count: 0, + ascii_case_insensitive: false, + start_bytes: StartBytesBuilder::new(), + rare_bytes: RareBytesBuilder::new(), + memmem: MemmemBuilder::default(), + packed: pbuilder, + enabled: true, + } + } + + /// Enable ASCII case insensitivity. When set, byte strings added to this + /// builder will be interpreted without respect to ASCII case. + pub(crate) fn ascii_case_insensitive(mut self, yes: bool) -> Builder { + self.ascii_case_insensitive = yes; + self.start_bytes = self.start_bytes.ascii_case_insensitive(yes); + self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes); + self + } + + /// Return a prefilter suitable for quickly finding potential matches. + /// + /// All patterns added to an Aho-Corasick automaton should be added to this + /// builder before attempting to construct the prefilter. 
+ pub(crate) fn build(&self) -> Option<Prefilter> { + if !self.enabled { + debug!("prefilter not enabled, skipping"); + return None; + } + // If we only have one pattern, then deferring to memmem is always + // the best choice. This is kind of a weird case, because, well, why + // use Aho-Corasick if you only have one pattern? But maybe you don't + // know exactly how many patterns you'll get up front, and you need to + // support the option of multiple patterns. So instead of relying on + // the caller to branch and use memmem explicitly, we just do it for + // them. + if !self.ascii_case_insensitive { + if let Some(pre) = self.memmem.build() { + debug!("using memmem prefilter"); + return Some(pre); + } + } + let (packed, patlen, minlen) = if self.ascii_case_insensitive { + (None, usize::MAX, 0) + } else { + let patlen = self.packed.as_ref().map_or(usize::MAX, |p| p.len()); + let minlen = self.packed.as_ref().map_or(0, |p| p.minimum_len()); + let packed = + self.packed.as_ref().and_then(|b| b.build()).map(|s| { + let memory_usage = s.memory_usage(); + debug!( + "built packed prefilter (len: {}, \ + minimum pattern len: {}, memory usage: {}) \ + for consideration", + patlen, minlen, memory_usage, + ); + Prefilter { finder: Arc::new(Packed(s)), memory_usage } + }); + (packed, patlen, minlen) + }; + match (self.start_bytes.build(), self.rare_bytes.build()) { + // If we could build both start and rare prefilters, then there are + // a few cases in which we'd want to use the start-byte prefilter + // over the rare-byte prefilter, since the former has lower + // overhead. 
+ (prestart @ Some(_), prerare @ Some(_)) => { + debug!( + "both start (len={}, rank={}) and \ + rare (len={}, rank={}) byte prefilters \ + are available", + self.start_bytes.count, + self.start_bytes.rank_sum, + self.rare_bytes.count, + self.rare_bytes.rank_sum, + ); + if patlen <= 16 + && minlen >= 2 + && self.start_bytes.count >= 3 + && self.rare_bytes.count >= 3 + { + debug!( + "start and rare byte prefilters available, but \ + they're probably slower than packed so using \ + packed" + ); + return packed; + } + // If the start-byte prefilter can scan for a smaller number + // of bytes than the rare-byte prefilter, then it's probably + // faster. + let has_fewer_bytes = + self.start_bytes.count < self.rare_bytes.count; + // Otherwise, if the combined frequency rank of the detected + // bytes in the start-byte prefilter is "close" to the combined + // frequency rank of the rare-byte prefilter, then we pick + // the start-byte prefilter even if the rare-byte prefilter + // heuristically searches for rare bytes. This is because the + // rare-byte prefilter has higher constant costs, so we tend to + // prefer the start-byte prefilter when we can. 
+ let has_rarer_bytes = + self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50; + if has_fewer_bytes { + debug!( + "using start byte prefilter because it has fewer + bytes to search for than the rare byte prefilter", + ); + prestart + } else if has_rarer_bytes { + debug!( + "using start byte prefilter because its byte \ + frequency rank was determined to be \ + \"good enough\" relative to the rare byte prefilter \ + byte frequency rank", + ); + prestart + } else { + debug!("using rare byte prefilter"); + prerare + } + } + (prestart @ Some(_), None) => { + if patlen <= 16 && minlen >= 2 && self.start_bytes.count >= 3 { + debug!( + "start byte prefilter available, but \ + it's probably slower than packed so using \ + packed" + ); + return packed; + } + debug!( + "have start byte prefilter but not rare byte prefilter, \ + so using start byte prefilter", + ); + prestart + } + (None, prerare @ Some(_)) => { + if patlen <= 16 && minlen >= 2 && self.rare_bytes.count >= 3 { + debug!( + "rare byte prefilter available, but \ + it's probably slower than packed so using \ + packed" + ); + return packed; + } + debug!( + "have rare byte prefilter but not start byte prefilter, \ + so using rare byte prefilter", + ); + prerare + } + (None, None) if self.ascii_case_insensitive => { + debug!( + "no start or rare byte prefilter and ASCII case \ + insensitivity was enabled, so skipping prefilter", + ); + None + } + (None, None) => { + if packed.is_some() { + debug!("falling back to packed prefilter"); + } else { + debug!("no prefilter available"); + } + packed + } + } + } + + /// Add a literal string to this prefilter builder. 
+ pub(crate) fn add(&mut self, bytes: &[u8]) { + if bytes.is_empty() { + self.enabled = false; + } + if !self.enabled { + return; + } + self.count += 1; + self.start_bytes.add(bytes); + self.rare_bytes.add(bytes); + self.memmem.add(bytes); + if let Some(ref mut pbuilder) = self.packed { + pbuilder.add(bytes); + } + } +} + +/// A type that wraps a packed searcher and implements the `Prefilter` +/// interface. +#[derive(Clone, Debug)] +struct Packed(packed::Searcher); + +impl PrefilterI for Packed { + fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + self.0 + .find_in(&haystack, span) + .map_or(Candidate::None, Candidate::Match) + } +} + +/// A builder for constructing a prefilter that uses memmem. +#[derive(Debug, Default)] +struct MemmemBuilder { + /// The number of patterns that have been added. + count: usize, + /// The singular pattern to search for. This is only set when count==1. + one: Option<Vec<u8>>, +} + +impl MemmemBuilder { + fn build(&self) -> Option<Prefilter> { + #[cfg(all(feature = "std", feature = "perf-literal"))] + fn imp(builder: &MemmemBuilder) -> Option<Prefilter> { + let pattern = builder.one.as_ref()?; + assert_eq!(1, builder.count); + let finder = Arc::new(Memmem( + memchr::memmem::Finder::new(pattern).into_owned(), + )); + let memory_usage = pattern.len(); + Some(Prefilter { finder, memory_usage }) + } + + #[cfg(not(all(feature = "std", feature = "perf-literal")))] + fn imp(_: &MemmemBuilder) -> Option<Prefilter> { + None + } + + imp(self) + } + + fn add(&mut self, bytes: &[u8]) { + self.count += 1; + if self.count == 1 { + self.one = Some(bytes.to_vec()); + } else { + self.one = None; + } + } +} + +/// A type that wraps a SIMD accelerated single substring search from the +/// `memchr` crate for use as a prefilter. +/// +/// Currently, this prefilter is only active for Aho-Corasick searchers with +/// a single pattern. 
In theory, this could be extended to support searchers +/// that have a common prefix of more than one byte (for one byte, we would use +/// memchr), but it's not clear if it's worth it or not. +/// +/// Also, unfortunately, this currently also requires the 'std' feature to +/// be enabled. That's because memchr doesn't have a no-std-but-with-alloc +/// mode, and so APIs like Finder::into_owned aren't available when 'std' is +/// disabled. But there should be an 'alloc' feature that brings in APIs like +/// Finder::into_owned but doesn't use std-only features like runtime CPU +/// feature detection. +#[cfg(all(feature = "std", feature = "perf-literal"))] +#[derive(Clone, Debug)] +struct Memmem(memchr::memmem::Finder<'static>); + +#[cfg(all(feature = "std", feature = "perf-literal"))] +impl PrefilterI for Memmem { + fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + use crate::util::primitives::PatternID; + + self.0.find(&haystack[span]).map_or(Candidate::None, |i| { + let start = span.start + i; + let end = start + self.0.needle().len(); + // N.B. We can declare a match and use a fixed pattern ID here + // because a Memmem prefilter is only ever created for searchers + // with exactly one pattern. Thus, every match is always a match + // and it is always for the first and only pattern. + Candidate::Match(Match::new(PatternID::ZERO, start..end)) + }) + } +} + +/// A builder for constructing a rare byte prefilter. +/// +/// A rare byte prefilter attempts to pick out a small set of rare bytes that +/// occurr in the patterns, and then quickly scan to matches of those rare +/// bytes. +#[derive(Clone, Debug)] +struct RareBytesBuilder { + /// Whether this prefilter should account for ASCII case insensitivity or + /// not. + ascii_case_insensitive: bool, + /// A set of rare bytes, indexed by byte value. + rare_set: ByteSet, + /// A set of byte offsets associated with bytes in a pattern. 
An entry + /// corresponds to a particular bytes (its index) and is only non-zero if + /// the byte occurred at an offset greater than 0 in at least one pattern. + /// + /// If a byte's offset is not representable in 8 bits, then the rare bytes + /// prefilter becomes inert. + byte_offsets: RareByteOffsets, + /// Whether this is available as a prefilter or not. This can be set to + /// false during construction if a condition is seen that invalidates the + /// use of the rare-byte prefilter. + available: bool, + /// The number of bytes set to an active value in `byte_offsets`. + count: usize, + /// The sum of frequency ranks for the rare bytes detected. This is + /// intended to give a heuristic notion of how rare the bytes are. + rank_sum: u16, +} + +/// A set of byte offsets, keyed by byte. +#[derive(Clone, Copy)] +struct RareByteOffsets { + /// Each entry corresponds to the maximum offset of the corresponding + /// byte across all patterns seen. + set: [RareByteOffset; 256], +} + +impl RareByteOffsets { + /// Create a new empty set of rare byte offsets. + pub(crate) fn empty() -> RareByteOffsets { + RareByteOffsets { set: [RareByteOffset::default(); 256] } + } + + /// Add the given offset for the given byte to this set. If the offset is + /// greater than the existing offset, then it overwrites the previous + /// value and returns false. If there is no previous value set, then this + /// sets it and returns true. 
+ pub(crate) fn set(&mut self, byte: u8, off: RareByteOffset) { + self.set[byte as usize].max = + cmp::max(self.set[byte as usize].max, off.max); + } +} + +impl core::fmt::Debug for RareByteOffsets { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let mut offsets = vec![]; + for off in self.set.iter() { + if off.max > 0 { + offsets.push(off); + } + } + f.debug_struct("RareByteOffsets").field("set", &offsets).finish() + } +} + +/// Offsets associated with an occurrence of a "rare" byte in any of the +/// patterns used to construct a single Aho-Corasick automaton. +#[derive(Clone, Copy, Debug)] +struct RareByteOffset { + /// The maximum offset at which a particular byte occurs from the start + /// of any pattern. This is used as a shift amount. That is, when an + /// occurrence of this byte is found, the candidate position reported by + /// the prefilter is `position_of_byte - max`, such that the automaton + /// will begin its search at a position that is guaranteed to observe a + /// match. + /// + /// To avoid accidentally quadratic behavior, a prefilter is considered + /// ineffective when it is asked to start scanning from a position that it + /// has already scanned past. + /// + /// Using a `u8` here means that if we ever see a pattern that's longer + /// than 255 bytes, then the entire rare byte prefilter is disabled. + max: u8, +} + +impl Default for RareByteOffset { + fn default() -> RareByteOffset { + RareByteOffset { max: 0 } + } +} + +impl RareByteOffset { + /// Create a new rare byte offset. If the given offset is too big, then + /// None is returned. In that case, callers should render the rare bytes + /// prefilter inert. + fn new(max: usize) -> Option<RareByteOffset> { + if max > u8::MAX as usize { + None + } else { + Some(RareByteOffset { max: max as u8 }) + } + } +} + +impl RareBytesBuilder { + /// Create a new builder for constructing a rare byte prefilter. 
+ fn new() -> RareBytesBuilder { + RareBytesBuilder { + ascii_case_insensitive: false, + rare_set: ByteSet::empty(), + byte_offsets: RareByteOffsets::empty(), + available: true, + count: 0, + rank_sum: 0, + } + } + + /// Enable ASCII case insensitivity. When set, byte strings added to this + /// builder will be interpreted without respect to ASCII case. + fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder { + self.ascii_case_insensitive = yes; + self + } + + /// Build the rare bytes prefilter. + /// + /// If there are more than 3 distinct rare bytes found, or if heuristics + /// otherwise determine that this prefilter should not be used, then `None` + /// is returned. + fn build(&self) -> Option<Prefilter> { + #[cfg(feature = "perf-literal")] + fn imp(builder: &RareBytesBuilder) -> Option<Prefilter> { + if !builder.available || builder.count > 3 { + return None; + } + let (mut bytes, mut len) = ([0; 3], 0); + for b in 0..=255 { + if builder.rare_set.contains(b) { + bytes[len] = b as u8; + len += 1; + } + } + let finder: Arc<dyn PrefilterI> = match len { + 0 => return None, + 1 => Arc::new(RareBytesOne { + byte1: bytes[0], + offset: builder.byte_offsets.set[bytes[0] as usize], + }), + 2 => Arc::new(RareBytesTwo { + offsets: builder.byte_offsets, + byte1: bytes[0], + byte2: bytes[1], + }), + 3 => Arc::new(RareBytesThree { + offsets: builder.byte_offsets, + byte1: bytes[0], + byte2: bytes[1], + byte3: bytes[2], + }), + _ => unreachable!(), + }; + Some(Prefilter { finder, memory_usage: 0 }) + } + + #[cfg(not(feature = "perf-literal"))] + fn imp(_: &RareBytesBuilder) -> Option<Prefilter> { + None + } + + imp(self) + } + + /// Add a byte string to this builder. + /// + /// All patterns added to an Aho-Corasick automaton should be added to this + /// builder before attempting to construct the prefilter. + fn add(&mut self, bytes: &[u8]) { + // If we've already given up, then do nothing. 
+ if !self.available { + return; + } + // If we've already blown our budget, then don't waste time looking + // for more rare bytes. + if self.count > 3 { + self.available = false; + return; + } + // If the pattern is too long, then our offset table is bunk, so + // give up. + if bytes.len() >= 256 { + self.available = false; + return; + } + let mut rarest = match bytes.get(0) { + None => return, + Some(&b) => (b, freq_rank(b)), + }; + // The idea here is to look for the rarest byte in each pattern, and + // add that to our set. As a special exception, if we see a byte that + // we've already added, then we immediately stop and choose that byte, + // even if there's another rare byte in the pattern. This helps us + // apply the rare byte optimization in more cases by attempting to pick + // bytes that are in common between patterns. So for example, if we + // were searching for `Sherlock` and `lockjaw`, then this would pick + // `k` for both patterns, resulting in the use of `memchr` instead of + // `memchr2` for `k` and `j`. + let mut found = false; + for (pos, &b) in bytes.iter().enumerate() { + self.set_offset(pos, b); + if found { + continue; + } + if self.rare_set.contains(b) { + found = true; + continue; + } + let rank = freq_rank(b); + if rank < rarest.1 { + rarest = (b, rank); + } + } + if !found { + self.add_rare_byte(rarest.0); + } + } + + fn set_offset(&mut self, pos: usize, byte: u8) { + // This unwrap is OK because pos is never bigger than our max. 
+ let offset = RareByteOffset::new(pos).unwrap(); + self.byte_offsets.set(byte, offset); + if self.ascii_case_insensitive { + self.byte_offsets.set(opposite_ascii_case(byte), offset); + } + } + + fn add_rare_byte(&mut self, byte: u8) { + self.add_one_rare_byte(byte); + if self.ascii_case_insensitive { + self.add_one_rare_byte(opposite_ascii_case(byte)); + } + } + + fn add_one_rare_byte(&mut self, byte: u8) { + if !self.rare_set.contains(byte) { + self.rare_set.add(byte); + self.count += 1; + self.rank_sum += freq_rank(byte) as u16; + } + } +} + +/// A prefilter for scanning for a single "rare" byte. +#[cfg(feature = "perf-literal")] +#[derive(Clone, Debug)] +struct RareBytesOne { + byte1: u8, + offset: RareByteOffset, +} + +#[cfg(feature = "perf-literal")] +impl PrefilterI for RareBytesOne { + fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + memchr::memchr(self.byte1, &haystack[span]) + .map(|i| { + let pos = span.start + i; + cmp::max( + span.start, + pos.saturating_sub(usize::from(self.offset.max)), + ) + }) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } +} + +/// A prefilter for scanning for two "rare" bytes. +#[cfg(feature = "perf-literal")] +#[derive(Clone, Debug)] +struct RareBytesTwo { + offsets: RareByteOffsets, + byte1: u8, + byte2: u8, +} + +#[cfg(feature = "perf-literal")] +impl PrefilterI for RareBytesTwo { + fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + memchr::memchr2(self.byte1, self.byte2, &haystack[span]) + .map(|i| { + let pos = span.start + i; + let offset = self.offsets.set[usize::from(haystack[pos])].max; + cmp::max(span.start, pos.saturating_sub(usize::from(offset))) + }) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } +} + +/// A prefilter for scanning for three "rare" bytes. 
+#[cfg(feature = "perf-literal")] +#[derive(Clone, Debug)] +struct RareBytesThree { + offsets: RareByteOffsets, + byte1: u8, + byte2: u8, + byte3: u8, +} + +#[cfg(feature = "perf-literal")] +impl PrefilterI for RareBytesThree { + fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span]) + .map(|i| { + let pos = span.start + i; + let offset = self.offsets.set[usize::from(haystack[pos])].max; + cmp::max(span.start, pos.saturating_sub(usize::from(offset))) + }) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } +} + +/// A builder for constructing a starting byte prefilter. +/// +/// A starting byte prefilter is a simplistic prefilter that looks for possible +/// matches by reporting all positions corresponding to a particular byte. This +/// generally only takes affect when there are at most 3 distinct possible +/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two +/// distinct starting bytes (`f` and `b`), and this prefilter returns all +/// occurrences of either `f` or `b`. +/// +/// In some cases, a heuristic frequency analysis may determine that it would +/// be better not to use this prefilter even when there are 3 or fewer distinct +/// starting bytes. +#[derive(Clone, Debug)] +struct StartBytesBuilder { + /// Whether this prefilter should account for ASCII case insensitivity or + /// not. + ascii_case_insensitive: bool, + /// The set of starting bytes observed. + byteset: Vec<bool>, + /// The number of bytes set to true in `byteset`. + count: usize, + /// The sum of frequency ranks for the rare bytes detected. This is + /// intended to give a heuristic notion of how rare the bytes are. + rank_sum: u16, +} + +impl StartBytesBuilder { + /// Create a new builder for constructing a start byte prefilter. 
+ fn new() -> StartBytesBuilder { + StartBytesBuilder { + ascii_case_insensitive: false, + byteset: vec![false; 256], + count: 0, + rank_sum: 0, + } + } + + /// Enable ASCII case insensitivity. When set, byte strings added to this + /// builder will be interpreted without respect to ASCII case. + fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder { + self.ascii_case_insensitive = yes; + self + } + + /// Build the starting bytes prefilter. + /// + /// If there are more than 3 distinct starting bytes, or if heuristics + /// otherwise determine that this prefilter should not be used, then `None` + /// is returned. + fn build(&self) -> Option<Prefilter> { + #[cfg(feature = "perf-literal")] + fn imp(builder: &StartBytesBuilder) -> Option<Prefilter> { + if builder.count > 3 { + return None; + } + let (mut bytes, mut len) = ([0; 3], 0); + for b in 0..256 { + if !builder.byteset[b] { + continue; + } + // We don't handle non-ASCII bytes for now. Getting non-ASCII + // bytes right is trickier, since we generally don't want to put + // a leading UTF-8 code unit into a prefilter that isn't ASCII, + // since they can frequently. Instead, it would be better to use a + // continuation byte, but this requires more sophisticated analysis + // of the automaton and a richer prefilter API. + if b > 0x7F { + return None; + } + bytes[len] = b as u8; + len += 1; + } + let finder: Arc<dyn PrefilterI> = match len { + 0 => return None, + 1 => Arc::new(StartBytesOne { byte1: bytes[0] }), + 2 => Arc::new(StartBytesTwo { + byte1: bytes[0], + byte2: bytes[1], + }), + 3 => Arc::new(StartBytesThree { + byte1: bytes[0], + byte2: bytes[1], + byte3: bytes[2], + }), + _ => unreachable!(), + }; + Some(Prefilter { finder, memory_usage: 0 }) + } + + #[cfg(not(feature = "perf-literal"))] + fn imp(_: &StartBytesBuilder) -> Option<Prefilter> { + None + } + + imp(self) + } + + /// Add a byte string to this builder. 
+ /// + /// All patterns added to an Aho-Corasick automaton should be added to this + /// builder before attempting to construct the prefilter. + fn add(&mut self, bytes: &[u8]) { + if self.count > 3 { + return; + } + if let Some(&byte) = bytes.get(0) { + self.add_one_byte(byte); + if self.ascii_case_insensitive { + self.add_one_byte(opposite_ascii_case(byte)); + } + } + } + + fn add_one_byte(&mut self, byte: u8) { + if !self.byteset[byte as usize] { + self.byteset[byte as usize] = true; + self.count += 1; + self.rank_sum += freq_rank(byte) as u16; + } + } +} + +/// A prefilter for scanning for a single starting byte. +#[cfg(feature = "perf-literal")] +#[derive(Clone, Debug)] +struct StartBytesOne { + byte1: u8, +} + +#[cfg(feature = "perf-literal")] +impl PrefilterI for StartBytesOne { + fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + memchr::memchr(self.byte1, &haystack[span]) + .map(|i| span.start + i) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } +} + +/// A prefilter for scanning for two starting bytes. +#[cfg(feature = "perf-literal")] +#[derive(Clone, Debug)] +struct StartBytesTwo { + byte1: u8, + byte2: u8, +} + +#[cfg(feature = "perf-literal")] +impl PrefilterI for StartBytesTwo { + fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + memchr::memchr2(self.byte1, self.byte2, &haystack[span]) + .map(|i| span.start + i) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } +} + +/// A prefilter for scanning for three starting bytes. 
+#[cfg(feature = "perf-literal")] +#[derive(Clone, Debug)] +struct StartBytesThree { + byte1: u8, + byte2: u8, + byte3: u8, +} + +#[cfg(feature = "perf-literal")] +impl PrefilterI for StartBytesThree { + fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span]) + .map(|i| span.start + i) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } +} + +/// If the given byte is an ASCII letter, then return it in the opposite case. +/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns +/// `b'A'`. If a non-ASCII letter is given, then the given byte is returned. +pub(crate) fn opposite_ascii_case(b: u8) -> u8 { + if b'A' <= b && b <= b'Z' { + b.to_ascii_lowercase() + } else if b'a' <= b && b <= b'z' { + b.to_ascii_uppercase() + } else { + b + } +} + +/// Return the frequency rank of the given byte. The higher the rank, the more +/// common the byte (heuristically speaking). +fn freq_rank(b: u8) -> u8 { + use crate::util::byte_frequencies::BYTE_FREQUENCIES; + BYTE_FREQUENCIES[b as usize] +} diff --git a/third_party/rust/aho-corasick/src/util/primitives.rs b/third_party/rust/aho-corasick/src/util/primitives.rs new file mode 100644 index 0000000000..784d397171 --- /dev/null +++ b/third_party/rust/aho-corasick/src/util/primitives.rs @@ -0,0 +1,759 @@ +/*! +Lower level primitive types that are useful in a variety of circumstances. + +# Overview + +This list represents the principle types in this module and briefly describes +when you might want to use them. + +* [`PatternID`] - A type that represents the identifier of a regex pattern. +This is probably the most widely used type in this module (which is why it's +also re-exported in the crate root). +* [`StateID`] - A type the represents the identifier of a finite automaton +state. This is used for both NFAs and DFAs, with the notable exception of +the hybrid NFA/DFA. 
(The hybrid NFA/DFA uses a special purpose "lazy" state +identifier.) +* [`SmallIndex`] - The internal representation of both a `PatternID` and a +`StateID`. Its purpose is to serve as a type that can index memory without +being as big as a `usize` on 64-bit targets. The main idea behind this type +is that there are many things in regex engines that will, in practice, never +overflow a 32-bit integer. (For example, like the number of patterns in a regex +or the number of states in an NFA.) Thus, a `SmallIndex` can be used to index +memory without peppering `as` casts everywhere. Moreover, it forces callers +to handle errors in the case where, somehow, the value would otherwise overflow +either a 32-bit integer or a `usize` (e.g., on 16-bit targets). +*/ + +// The macro we use to define some types below adds methods that we don't +// use on some of the types. There isn't much, so we just squash the warning. +#![allow(dead_code)] + +use alloc::vec::Vec; + +use crate::util::int::{Usize, U16, U32, U64}; + +/// A type that represents a "small" index. +/// +/// The main idea of this type is to provide something that can index memory, +/// but uses less memory than `usize` on 64-bit systems. Specifically, its +/// representation is always a `u32` and has `repr(transparent)` enabled. (So +/// it is safe to transmute between a `u32` and a `SmallIndex`.) +/// +/// A small index is typically useful in cases where there is no practical way +/// that the index will overflow a 32-bit integer. A good example of this is +/// an NFA state. If you could somehow build an NFA with `2^30` states, its +/// memory usage would be exorbitant and its runtime execution would be so +/// slow as to be completely worthless. Therefore, this crate generally deems +/// it acceptable to return an error if it would otherwise build an NFA that +/// requires a slice longer than what a 32-bit integer can index. In exchange, +/// we can use 32-bit indices instead of 64-bit indices in various places. 
+/// +/// This type ensures this by providing a constructor that will return an error +/// if its argument cannot fit into the type. This makes it much easier to +/// handle these sorts of boundary cases that are otherwise extremely subtle. +/// +/// On all targets, this type guarantees that its value will fit in a `u32`, +/// `i32`, `usize` and an `isize`. This means that on 16-bit targets, for +/// example, this type's maximum value will never overflow an `isize`, +/// which means it will never overflow a `i16` even though its internal +/// representation is still a `u32`. +/// +/// The purpose for making the type fit into even signed integer types like +/// `isize` is to guarantee that the difference between any two small indices +/// is itself also a small index. This is useful in certain contexts, e.g., +/// for delta encoding. +/// +/// # Other types +/// +/// The following types wrap `SmallIndex` to provide a more focused use case: +/// +/// * [`PatternID`] is for representing the identifiers of patterns. +/// * [`StateID`] is for representing the identifiers of states in finite +/// automata. It is used for both NFAs and DFAs. +/// +/// # Representation +/// +/// This type is always represented internally by a `u32` and is marked as +/// `repr(transparent)`. Thus, this type always has the same representation as +/// a `u32`. It is thus safe to transmute between a `u32` and a `SmallIndex`. +/// +/// # Indexing +/// +/// For convenience, callers may use a `SmallIndex` to index slices. +/// +/// # Safety +/// +/// While a `SmallIndex` is meant to guarantee that its value fits into `usize` +/// without using as much space as a `usize` on all targets, callers must +/// not rely on this property for safety. Callers may choose to rely on this +/// property for correctness however. For example, creating a `SmallIndex` with +/// an invalid value can be done in entirely safe code. This may in turn result +/// in panics or silent logical errors. 
+#[derive( + Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, +)] +#[repr(transparent)] +pub(crate) struct SmallIndex(u32); + +impl SmallIndex { + /// The maximum index value. + #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] + pub const MAX: SmallIndex = + // FIXME: Use as_usize() once const functions in traits are stable. + SmallIndex::new_unchecked(core::i32::MAX as usize - 1); + + /// The maximum index value. + #[cfg(target_pointer_width = "16")] + pub const MAX: SmallIndex = + SmallIndex::new_unchecked(core::isize::MAX - 1); + + /// The total number of values that can be represented as a small index. + pub const LIMIT: usize = SmallIndex::MAX.as_usize() + 1; + + /// The zero index value. + pub const ZERO: SmallIndex = SmallIndex::new_unchecked(0); + + /// The number of bytes that a single small index uses in memory. + pub const SIZE: usize = core::mem::size_of::<SmallIndex>(); + + /// Create a new small index. + /// + /// If the given index exceeds [`SmallIndex::MAX`], then this returns + /// an error. + #[inline] + pub fn new(index: usize) -> Result<SmallIndex, SmallIndexError> { + SmallIndex::try_from(index) + } + + /// Create a new small index without checking whether the given value + /// exceeds [`SmallIndex::MAX`]. + /// + /// Using this routine with an invalid index value will result in + /// unspecified behavior, but *not* undefined behavior. In particular, an + /// invalid index value is likely to cause panics or possibly even silent + /// logical errors. + /// + /// Callers must never rely on a `SmallIndex` to be within a certain range + /// for memory safety. + #[inline] + pub const fn new_unchecked(index: usize) -> SmallIndex { + // FIXME: Use as_u32() once const functions in traits are stable. + SmallIndex::from_u32_unchecked(index as u32) + } + + /// Create a new small index from a `u32` without checking whether the + /// given value exceeds [`SmallIndex::MAX`]. 
+ /// + /// Using this routine with an invalid index value will result in + /// unspecified behavior, but *not* undefined behavior. In particular, an + /// invalid index value is likely to cause panics or possibly even silent + /// logical errors. + /// + /// Callers must never rely on a `SmallIndex` to be within a certain range + /// for memory safety. + #[inline] + pub const fn from_u32_unchecked(index: u32) -> SmallIndex { + SmallIndex(index) + } + + /// Like [`SmallIndex::new`], but panics if the given index is not valid. + #[inline] + pub fn must(index: usize) -> SmallIndex { + SmallIndex::new(index).expect("invalid small index") + } + + /// Return this small index as a `usize`. This is guaranteed to never + /// overflow `usize`. + #[inline] + pub const fn as_usize(&self) -> usize { + // FIXME: Use as_usize() once const functions in traits are stable. + self.0 as usize + } + + /// Return this small index as a `u64`. This is guaranteed to never + /// overflow. + #[inline] + pub const fn as_u64(&self) -> u64 { + // FIXME: Use u64::from() once const functions in traits are stable. + self.0 as u64 + } + + /// Return the internal `u32` of this small index. This is guaranteed to + /// never overflow `u32`. + #[inline] + pub const fn as_u32(&self) -> u32 { + self.0 + } + + /// Return the internal `u32` of this small index represented as an `i32`. + /// This is guaranteed to never overflow an `i32`. + #[inline] + pub const fn as_i32(&self) -> i32 { + // This is OK because we guarantee that our max value is <= i32::MAX. + self.0 as i32 + } + + /// Returns one more than this small index as a usize. + /// + /// Since a small index has constraints on its maximum value, adding `1` to + /// it will always fit in a `usize`, `isize`, `u32` and a `i32`. + #[inline] + pub fn one_more(&self) -> usize { + self.as_usize() + 1 + } + + /// Decode this small index from the bytes given using the native endian + /// byte order for the current target. 
+ /// + /// If the decoded integer is not representable as a small index for the + /// current target, then this returns an error. + #[inline] + pub fn from_ne_bytes( + bytes: [u8; 4], + ) -> Result<SmallIndex, SmallIndexError> { + let id = u32::from_ne_bytes(bytes); + if id > SmallIndex::MAX.as_u32() { + return Err(SmallIndexError { attempted: u64::from(id) }); + } + Ok(SmallIndex::new_unchecked(id.as_usize())) + } + + /// Decode this small index from the bytes given using the native endian + /// byte order for the current target. + /// + /// This is analogous to [`SmallIndex::new_unchecked`] in that is does not + /// check whether the decoded integer is representable as a small index. + #[inline] + pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> SmallIndex { + SmallIndex::new_unchecked(u32::from_ne_bytes(bytes).as_usize()) + } + + /// Return the underlying small index integer as raw bytes in native endian + /// format. + #[inline] + pub fn to_ne_bytes(&self) -> [u8; 4] { + self.0.to_ne_bytes() + } +} + +impl<T> core::ops::Index<SmallIndex> for [T] { + type Output = T; + + #[inline] + fn index(&self, index: SmallIndex) -> &T { + &self[index.as_usize()] + } +} + +impl<T> core::ops::IndexMut<SmallIndex> for [T] { + #[inline] + fn index_mut(&mut self, index: SmallIndex) -> &mut T { + &mut self[index.as_usize()] + } +} + +impl<T> core::ops::Index<SmallIndex> for Vec<T> { + type Output = T; + + #[inline] + fn index(&self, index: SmallIndex) -> &T { + &self[index.as_usize()] + } +} + +impl<T> core::ops::IndexMut<SmallIndex> for Vec<T> { + #[inline] + fn index_mut(&mut self, index: SmallIndex) -> &mut T { + &mut self[index.as_usize()] + } +} + +impl From<StateID> for SmallIndex { + fn from(sid: StateID) -> SmallIndex { + sid.0 + } +} + +impl From<PatternID> for SmallIndex { + fn from(pid: PatternID) -> SmallIndex { + pid.0 + } +} + +impl From<u8> for SmallIndex { + fn from(index: u8) -> SmallIndex { + SmallIndex::new_unchecked(usize::from(index)) + } +} + +impl 
TryFrom<u16> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: u16) -> Result<SmallIndex, SmallIndexError> { + if u32::from(index) > SmallIndex::MAX.as_u32() { + return Err(SmallIndexError { attempted: u64::from(index) }); + } + Ok(SmallIndex::new_unchecked(index.as_usize())) + } +} + +impl TryFrom<u32> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: u32) -> Result<SmallIndex, SmallIndexError> { + if index > SmallIndex::MAX.as_u32() { + return Err(SmallIndexError { attempted: u64::from(index) }); + } + Ok(SmallIndex::new_unchecked(index.as_usize())) + } +} + +impl TryFrom<u64> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: u64) -> Result<SmallIndex, SmallIndexError> { + if index > SmallIndex::MAX.as_u64() { + return Err(SmallIndexError { attempted: index }); + } + Ok(SmallIndex::new_unchecked(index.as_usize())) + } +} + +impl TryFrom<usize> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: usize) -> Result<SmallIndex, SmallIndexError> { + if index > SmallIndex::MAX.as_usize() { + return Err(SmallIndexError { attempted: index.as_u64() }); + } + Ok(SmallIndex::new_unchecked(index)) + } +} + +/// This error occurs when a small index could not be constructed. +/// +/// This occurs when given an integer exceeding the maximum small index value. +/// +/// When the `std` feature is enabled, this implements the `Error` trait. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct SmallIndexError { + attempted: u64, +} + +impl SmallIndexError { + /// Returns the value that could not be converted to a small index. 
+ pub fn attempted(&self) -> u64 { + self.attempted + } +} + +#[cfg(feature = "std")] +impl std::error::Error for SmallIndexError {} + +impl core::fmt::Display for SmallIndexError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!( + f, + "failed to create small index from {:?}, which exceeds {:?}", + self.attempted(), + SmallIndex::MAX, + ) + } +} + +#[derive(Clone, Debug)] +pub(crate) struct SmallIndexIter { + rng: core::ops::Range<usize>, +} + +impl Iterator for SmallIndexIter { + type Item = SmallIndex; + + fn next(&mut self) -> Option<SmallIndex> { + if self.rng.start >= self.rng.end { + return None; + } + let next_id = self.rng.start + 1; + let id = core::mem::replace(&mut self.rng.start, next_id); + // new_unchecked is OK since we asserted that the number of + // elements in this iterator will fit in an ID at construction. + Some(SmallIndex::new_unchecked(id)) + } +} + +macro_rules! index_type_impls { + ($name:ident, $err:ident, $iter:ident, $withiter:ident) => { + impl $name { + /// The maximum value. + pub const MAX: $name = $name(SmallIndex::MAX); + + /// The total number of values that can be represented. + pub const LIMIT: usize = SmallIndex::LIMIT; + + /// The zero value. + pub const ZERO: $name = $name(SmallIndex::ZERO); + + /// The number of bytes that a single value uses in memory. + pub const SIZE: usize = SmallIndex::SIZE; + + /// Create a new value that is represented by a "small index." + /// + /// If the given index exceeds the maximum allowed value, then this + /// returns an error. + #[inline] + pub fn new(value: usize) -> Result<$name, $err> { + SmallIndex::new(value).map($name).map_err($err) + } + + /// Create a new value without checking whether the given argument + /// exceeds the maximum. + /// + /// Using this routine with an invalid value will result in + /// unspecified behavior, but *not* undefined behavior. 
In + /// particular, an invalid ID value is likely to cause panics or + /// possibly even silent logical errors. + /// + /// Callers must never rely on this type to be within a certain + /// range for memory safety. + #[inline] + pub const fn new_unchecked(value: usize) -> $name { + $name(SmallIndex::new_unchecked(value)) + } + + /// Create a new value from a `u32` without checking whether the + /// given value exceeds the maximum. + /// + /// Using this routine with an invalid value will result in + /// unspecified behavior, but *not* undefined behavior. In + /// particular, an invalid ID value is likely to cause panics or + /// possibly even silent logical errors. + /// + /// Callers must never rely on this type to be within a certain + /// range for memory safety. + #[inline] + pub const fn from_u32_unchecked(index: u32) -> $name { + $name(SmallIndex::from_u32_unchecked(index)) + } + + /// Like `new`, but panics if the given value is not valid. + #[inline] + pub fn must(value: usize) -> $name { + $name::new(value).expect(concat!( + "invalid ", + stringify!($name), + " value" + )) + } + + /// Return the internal value as a `usize`. This is guaranteed to + /// never overflow `usize`. + #[inline] + pub const fn as_usize(&self) -> usize { + self.0.as_usize() + } + + /// Return the internal value as a `u64`. This is guaranteed to + /// never overflow. + #[inline] + pub const fn as_u64(&self) -> u64 { + self.0.as_u64() + } + + /// Return the internal value as a `u32`. This is guaranteed to + /// never overflow `u32`. + #[inline] + pub const fn as_u32(&self) -> u32 { + self.0.as_u32() + } + + /// Return the internal value as a `i32`. This is guaranteed to + /// never overflow an `i32`. + #[inline] + pub const fn as_i32(&self) -> i32 { + self.0.as_i32() + } + + /// Returns one more than this value as a usize. 
+ /// + /// Since values represented by a "small index" have constraints + /// on their maximum value, adding `1` to it will always fit in a + /// `usize`, `u32` and a `i32`. + #[inline] + pub fn one_more(&self) -> usize { + self.0.one_more() + } + + /// Decode this value from the bytes given using the native endian + /// byte order for the current target. + /// + /// If the decoded integer is not representable as a small index + /// for the current target, then this returns an error. + #[inline] + pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<$name, $err> { + SmallIndex::from_ne_bytes(bytes).map($name).map_err($err) + } + + /// Decode this value from the bytes given using the native endian + /// byte order for the current target. + /// + /// This is analogous to `new_unchecked` in that is does not check + /// whether the decoded integer is representable as a small index. + #[inline] + pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> $name { + $name(SmallIndex::from_ne_bytes_unchecked(bytes)) + } + + /// Return the underlying integer as raw bytes in native endian + /// format. + #[inline] + pub fn to_ne_bytes(&self) -> [u8; 4] { + self.0.to_ne_bytes() + } + + /// Returns an iterator over all values from 0 up to and not + /// including the given length. + /// + /// If the given length exceeds this type's limit, then this + /// panics. + pub(crate) fn iter(len: usize) -> $iter { + $iter::new(len) + } + } + + // We write our own Debug impl so that we get things like PatternID(5) + // instead of PatternID(SmallIndex(5)). 
+ impl core::fmt::Debug for $name { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple(stringify!($name)).field(&self.as_u32()).finish() + } + } + + impl<T> core::ops::Index<$name> for [T] { + type Output = T; + + #[inline] + fn index(&self, index: $name) -> &T { + &self[index.as_usize()] + } + } + + impl<T> core::ops::IndexMut<$name> for [T] { + #[inline] + fn index_mut(&mut self, index: $name) -> &mut T { + &mut self[index.as_usize()] + } + } + + impl<T> core::ops::Index<$name> for Vec<T> { + type Output = T; + + #[inline] + fn index(&self, index: $name) -> &T { + &self[index.as_usize()] + } + } + + impl<T> core::ops::IndexMut<$name> for Vec<T> { + #[inline] + fn index_mut(&mut self, index: $name) -> &mut T { + &mut self[index.as_usize()] + } + } + + impl From<SmallIndex> for $name { + fn from(index: SmallIndex) -> $name { + $name(index) + } + } + + impl From<u8> for $name { + fn from(value: u8) -> $name { + $name(SmallIndex::from(value)) + } + } + + impl TryFrom<u16> for $name { + type Error = $err; + + fn try_from(value: u16) -> Result<$name, $err> { + SmallIndex::try_from(value).map($name).map_err($err) + } + } + + impl TryFrom<u32> for $name { + type Error = $err; + + fn try_from(value: u32) -> Result<$name, $err> { + SmallIndex::try_from(value).map($name).map_err($err) + } + } + + impl TryFrom<u64> for $name { + type Error = $err; + + fn try_from(value: u64) -> Result<$name, $err> { + SmallIndex::try_from(value).map($name).map_err($err) + } + } + + impl TryFrom<usize> for $name { + type Error = $err; + + fn try_from(value: usize) -> Result<$name, $err> { + SmallIndex::try_from(value).map($name).map_err($err) + } + } + + /// This error occurs when an ID could not be constructed. + /// + /// This occurs when given an integer exceeding the maximum allowed + /// value. + /// + /// When the `std` feature is enabled, this implements the `Error` + /// trait. 
+ #[derive(Clone, Debug, Eq, PartialEq)] + pub struct $err(SmallIndexError); + + impl $err { + /// Returns the value that could not be converted to an ID. + pub fn attempted(&self) -> u64 { + self.0.attempted() + } + } + + #[cfg(feature = "std")] + impl std::error::Error for $err {} + + impl core::fmt::Display for $err { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!( + f, + "failed to create {} from {:?}, which exceeds {:?}", + stringify!($name), + self.attempted(), + $name::MAX, + ) + } + } + + #[derive(Clone, Debug)] + pub(crate) struct $iter(SmallIndexIter); + + impl $iter { + fn new(len: usize) -> $iter { + assert!( + len <= $name::LIMIT, + "cannot create iterator for {} when number of \ + elements exceed {:?}", + stringify!($name), + $name::LIMIT, + ); + $iter(SmallIndexIter { rng: 0..len }) + } + } + + impl Iterator for $iter { + type Item = $name; + + fn next(&mut self) -> Option<$name> { + self.0.next().map($name) + } + } + + /// An iterator adapter that is like std::iter::Enumerate, but attaches + /// small index values instead. It requires `ExactSizeIterator`. At + /// construction, it ensures that the index of each element in the + /// iterator is representable in the corresponding small index type. + #[derive(Clone, Debug)] + pub(crate) struct $withiter<I> { + it: I, + ids: $iter, + } + + impl<I: Iterator + ExactSizeIterator> $withiter<I> { + fn new(it: I) -> $withiter<I> { + let ids = $name::iter(it.len()); + $withiter { it, ids } + } + } + + impl<I: Iterator + ExactSizeIterator> Iterator for $withiter<I> { + type Item = ($name, I::Item); + + fn next(&mut self) -> Option<($name, I::Item)> { + let item = self.it.next()?; + // Number of elements in this iterator must match, according + // to contract of ExactSizeIterator. + let id = self.ids.next().unwrap(); + Some((id, item)) + } + } + }; +} + +/// The identifier of a pattern in an Aho-Corasick automaton. 
+/// +/// It is represented by a `u32` even on 64-bit systems in order to conserve +/// space. Namely, on all targets, this type guarantees that its value will +/// fit in a `u32`, `i32`, `usize` and an `isize`. This means that on 16-bit +/// targets, for example, this type's maximum value will never overflow an +/// `isize`, which means it will never overflow a `i16` even though its +/// internal representation is still a `u32`. +/// +/// # Safety +/// +/// While a `PatternID` is meant to guarantee that its value fits into `usize` +/// without using as much space as a `usize` on all targets, callers must +/// not rely on this property for safety. Callers may choose to rely on this +/// property for correctness however. For example, creating a `StateID` with an +/// invalid value can be done in entirely safe code. This may in turn result in +/// panics or silent logical errors. +#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +#[repr(transparent)] +pub struct PatternID(SmallIndex); + +/// The identifier of a finite automaton state. +/// +/// It is represented by a `u32` even on 64-bit systems in order to conserve +/// space. Namely, on all targets, this type guarantees that its value will +/// fit in a `u32`, `i32`, `usize` and an `isize`. This means that on 16-bit +/// targets, for example, this type's maximum value will never overflow an +/// `isize`, which means it will never overflow a `i16` even though its +/// internal representation is still a `u32`. +/// +/// # Safety +/// +/// While a `StateID` is meant to guarantee that its value fits into `usize` +/// without using as much space as a `usize` on all targets, callers must +/// not rely on this property for safety. Callers may choose to rely on this +/// property for correctness however. For example, creating a `StateID` with an +/// invalid value can be done in entirely safe code. This may in turn result in +/// panics or silent logical errors. 
+#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +#[repr(transparent)] +pub struct StateID(SmallIndex); + +index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter); +index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter); + +/// A utility trait that defines a couple of adapters for making it convenient +/// to access indices as "small index" types. We require ExactSizeIterator so +/// that iterator construction can do a single check to make sure the index of +/// each element is representable by its small index type. +pub(crate) trait IteratorIndexExt: Iterator { + fn with_pattern_ids(self) -> WithPatternIDIter<Self> + where + Self: Sized + ExactSizeIterator, + { + WithPatternIDIter::new(self) + } + + fn with_state_ids(self) -> WithStateIDIter<Self> + where + Self: Sized + ExactSizeIterator, + { + WithStateIDIter::new(self) + } +} + +impl<I: Iterator> IteratorIndexExt for I {} diff --git a/third_party/rust/aho-corasick/src/util/remapper.rs b/third_party/rust/aho-corasick/src/util/remapper.rs new file mode 100644 index 0000000000..7c47a082cd --- /dev/null +++ b/third_party/rust/aho-corasick/src/util/remapper.rs @@ -0,0 +1,214 @@ +use alloc::vec::Vec; + +use crate::{nfa::noncontiguous, util::primitives::StateID}; + +/// Remappable is a tightly coupled abstraction that facilitates remapping +/// state identifiers in DFAs. +/// +/// The main idea behind remapping state IDs is that DFAs often need to check +/// if a certain state is a "special" state of some kind (like a match state) +/// during a search. Since this is extremely perf critical code, we want this +/// check to be as fast as possible. Partitioning state IDs into, for example, +/// into "non-match" and "match" states means one can tell if a state is a +/// match state via a simple comparison of the state ID. +/// +/// The issue is that during the DFA construction process, it's not +/// particularly easy to partition the states. 
Instead, the simplest thing is +/// to often just do a pass over all of the states and shuffle them into their +/// desired partitionings. To do that, we need a mechanism for swapping states. +/// Hence, this abstraction. +/// +/// Normally, for such little code, I would just duplicate it. But this is a +/// key optimization and the implementation is a bit subtle. So the abstraction +/// is basically a ham-fisted attempt at DRY. The only place we use this is in +/// the dense and one-pass DFAs. +/// +/// See also src/dfa/special.rs for a more detailed explanation of how dense +/// DFAs are partitioned. +pub(crate) trait Remappable: core::fmt::Debug { + /// Return the total number of states. + fn state_len(&self) -> usize; + + /// Swap the states pointed to by the given IDs. The underlying finite + /// state machine should be mutated such that all of the transitions in + /// `id1` are now in the memory region where the transitions for `id2` + /// were, and all of the transitions in `id2` are now in the memory region + /// where the transitions for `id1` were. + /// + /// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`. + /// + /// It is expected that, after calling this, the underlying state machine + /// will be left in an inconsistent state, since any other transitions + /// pointing to, e.g., `id1` need to be updated to point to `id2`, since + /// that's where `id1` moved to. + /// + /// In order to "fix" the underlying inconsistent state, a `Remapper` + /// should be used to guarantee that `remap` is called at the appropriate + /// time. + fn swap_states(&mut self, id1: StateID, id2: StateID); + + /// This must remap every single state ID in the underlying value according + /// to the function given. For example, in a DFA, this should remap every + /// transition and every starting state ID. 
+ fn remap(&mut self, map: impl Fn(StateID) -> StateID); +} + +/// Remapper is an abstraction the manages the remapping of state IDs in a +/// finite state machine. This is useful when one wants to shuffle states into +/// different positions in the machine. +/// +/// One of the key complexities this manages is the ability to correctly move +/// one state multiple times. +/// +/// Once shuffling is complete, `remap` must be called, which will rewrite +/// all pertinent transitions to updated state IDs. Neglecting to call `remap` +/// will almost certainly result in a corrupt machine. +#[derive(Debug)] +pub(crate) struct Remapper { + /// A map from the index of a state to its pre-multiplied identifier. + /// + /// When a state is swapped with another, then their corresponding + /// locations in this map are also swapped. Thus, its new position will + /// still point to its old pre-multiplied StateID. + /// + /// While there is a bit more to it, this then allows us to rewrite the + /// state IDs in a DFA's transition table in a single pass. This is done + /// by iterating over every ID in this map, then iterating over each + /// transition for the state at that ID and re-mapping the transition from + /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position + /// in this map where `old_id` *started*, and set it to where it ended up + /// after all swaps have been completed. + map: Vec<StateID>, + /// A way to map indices to state IDs (and back). + idx: IndexMapper, +} + +impl Remapper { + /// Create a new remapper from the given remappable implementation. The + /// remapper can then be used to swap states. The remappable value given + /// here must the same one given to `swap` and `remap`. + /// + /// The given stride should be the stride of the transition table expressed + /// as a power of 2. This stride is used to map between state IDs and state + /// indices. 
If state IDs and state indices are equivalent, then provide + /// a `stride2` of `0`, which acts as an identity. + pub(crate) fn new(r: &impl Remappable, stride2: usize) -> Remapper { + let idx = IndexMapper { stride2 }; + let map = (0..r.state_len()).map(|i| idx.to_state_id(i)).collect(); + Remapper { map, idx } + } + + /// Swap two states. Once this is called, callers must follow through to + /// call `remap`, or else it's possible for the underlying remappable + /// value to be in a corrupt state. + pub(crate) fn swap( + &mut self, + r: &mut impl Remappable, + id1: StateID, + id2: StateID, + ) { + if id1 == id2 { + return; + } + r.swap_states(id1, id2); + self.map.swap(self.idx.to_index(id1), self.idx.to_index(id2)); + } + + /// Complete the remapping process by rewriting all state IDs in the + /// remappable value according to the swaps performed. + pub(crate) fn remap(mut self, r: &mut impl Remappable) { + // Update the map to account for states that have been swapped + // multiple times. For example, if (A, C) and (C, G) are swapped, then + // transitions previously pointing to A should now point to G. But if + // we don't update our map, they will erroneously be set to C. All we + // do is follow the swaps in our map until we see our original state + // ID. + // + // The intuition here is to think about how changes are made to the + // map: only through pairwise swaps. That means that starting at any + // given state, it is always possible to find the loop back to that + // state by following the swaps represented in the map (which might be + // 0 swaps). + // + // We are also careful to clone the map before starting in order to + // freeze it. We use the frozen map to find our loops, since we need to + // update our map as well. Without freezing it, our updates could break + // the loops referenced above and produce incorrect results. 
+ let oldmap = self.map.clone(); + for i in 0..r.state_len() { + let cur_id = self.idx.to_state_id(i); + let mut new_id = oldmap[i]; + if cur_id == new_id { + continue; + } + loop { + let id = oldmap[self.idx.to_index(new_id)]; + if cur_id == id { + self.map[i] = new_id; + break; + } + new_id = id; + } + } + r.remap(|sid| self.map[self.idx.to_index(sid)]); + } +} + +/// A simple type for mapping between state indices and state IDs. +/// +/// The reason why this exists is because state IDs are "premultiplied" in a +/// DFA. That is, in order to get to the transitions for a particular state, +/// one need only use the state ID as-is, instead of having to multiply it by +/// transition table's stride. +/// +/// The downside of this is that it's inconvenient to map between state IDs +/// using a dense map, e.g., Vec<StateID>. That's because state IDs look like +/// `0`, `stride`, `2*stride`, `3*stride`, etc., instead of `0`, `1`, `2`, `3`, +/// etc. +/// +/// Since our state IDs are premultiplied, we can convert back-and-forth +/// between IDs and indices by simply unmultiplying the IDs and multiplying the +/// indices. +/// +/// Note that for a sparse NFA, state IDs and indices are equivalent. In this +/// case, we set the stride of the index mapped to be `0`, which acts as an +/// identity. +#[derive(Debug)] +struct IndexMapper { + /// The power of 2 corresponding to the stride of the corresponding + /// transition table. 'id >> stride2' de-multiplies an ID while 'index << + /// stride2' pre-multiplies an index to an ID. + stride2: usize, +} + +impl IndexMapper { + /// Convert a state ID to a state index. + fn to_index(&self, id: StateID) -> usize { + id.as_usize() >> self.stride2 + } + + /// Convert a state index to a state ID. + fn to_state_id(&self, index: usize) -> StateID { + // CORRECTNESS: If the given index is not valid, then it is not + // required for this to panic or return a valid state ID. 
We'll "just" + // wind up with panics or silent logic errors at some other point. But + // this is OK because if Remappable::state_len is correct and so is + // 'to_index', then all inputs to 'to_state_id' should be valid indices + // and thus transform into valid state IDs. + StateID::new_unchecked(index << self.stride2) + } +} + +impl Remappable for noncontiguous::NFA { + fn state_len(&self) -> usize { + noncontiguous::NFA::states(self).len() + } + + fn swap_states(&mut self, id1: StateID, id2: StateID) { + noncontiguous::NFA::swap_states(self, id1, id2) + } + + fn remap(&mut self, map: impl Fn(StateID) -> StateID) { + noncontiguous::NFA::remap(self, map) + } +} diff --git a/third_party/rust/aho-corasick/src/util/search.rs b/third_party/rust/aho-corasick/src/util/search.rs new file mode 100644 index 0000000000..59b7035e1f --- /dev/null +++ b/third_party/rust/aho-corasick/src/util/search.rs @@ -0,0 +1,1148 @@ +use core::ops::{Range, RangeBounds}; + +use crate::util::primitives::PatternID; + +/// The configuration and the haystack to use for an Aho-Corasick search. +/// +/// When executing a search, there are a few parameters one might want to +/// configure: +/// +/// * The haystack to search, provided to the [`Input::new`] constructor. This +/// is the only required parameter. +/// * The span _within_ the haystack to limit a search to. (The default +/// is the entire haystack.) This is configured via [`Input::span`] or +/// [`Input::range`]. +/// * Whether to run an unanchored (matches can occur anywhere after the +/// start of the search) or anchored (matches can only occur beginning at +/// the start of the search) search. Unanchored search is the default. This is +/// configured via [`Input::anchored`]. +/// * Whether to quit the search as soon as a match has been found, regardless +/// of the [`MatchKind`] that the searcher was built with. This is configured +/// via [`Input::earliest`]. 
+/// +/// For most cases, the defaults for all optional parameters are appropriate. +/// The utility of this type is that it keeps the default or common case simple +/// while permitting tweaking parameters in more niche use cases while reusing +/// the same search APIs. +/// +/// # Valid bounds and search termination +/// +/// An `Input` permits setting the bounds of a search via either +/// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or +/// else a panic will occur. Bounds are valid if and only if: +/// +/// * The bounds represent a valid range into the input's haystack. +/// * **or** the end bound is a valid ending bound for the haystack *and* +/// the start bound is exactly one greater than the end bound. +/// +/// In the latter case, [`Input::is_done`] will return true and indicates any +/// search receiving such an input should immediately return with no match. +/// +/// Other than representing "search is complete," the `Input::span` and +/// `Input::range` APIs are never necessary. Instead, callers can slice the +/// haystack instead, e.g., with `&haystack[start..end]`. With that said, they +/// can be more convenient than slicing because the match positions reported +/// when using `Input::span` or `Input::range` are in terms of the original +/// haystack. If you instead use `&haystack[start..end]`, then you'll need to +/// add `start` to any match position returned in order for it to be a correct +/// index into `haystack`. +/// +/// # Example: `&str` and `&[u8]` automatically convert to an `Input` +/// +/// There is a `From<&T> for Input` implementation for all `T: AsRef<[u8]>`. +/// Additionally, the [`AhoCorasick`](crate::AhoCorasick) search APIs accept +/// a `Into<Input>`. These two things combined together mean you can provide +/// things like `&str` and `&[u8]` to search APIs when the defaults are +/// suitable, but also an `Input` when they're not. 
For example: +/// +/// ``` +/// use aho_corasick::{AhoCorasick, Anchored, Input, Match, StartKind}; +/// +/// // Build a searcher that supports both unanchored and anchored modes. +/// let ac = AhoCorasick::builder() +/// .start_kind(StartKind::Both) +/// .build(&["abcd", "b"]) +/// .unwrap(); +/// let haystack = "abcd"; +/// +/// // A search using default parameters is unanchored. With standard +/// // semantics, this finds `b` first. +/// assert_eq!( +/// Some(Match::must(1, 1..2)), +/// ac.find(haystack), +/// ); +/// // Using the same 'find' routine, we can provide an 'Input' explicitly +/// // that is configured to do an anchored search. Since 'b' doesn't start +/// // at the beginning of the search, it is not reported as a match. +/// assert_eq!( +/// Some(Match::must(0, 0..4)), +/// ac.find(Input::new(haystack).anchored(Anchored::Yes)), +/// ); +/// ``` +#[derive(Clone)] +pub struct Input<'h> { + haystack: &'h [u8], + span: Span, + anchored: Anchored, + earliest: bool, +} + +impl<'h> Input<'h> { + /// Create a new search configuration for the given haystack. + #[inline] + pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> { + Input { + haystack: haystack.as_ref(), + span: Span { start: 0, end: haystack.as_ref().len() }, + anchored: Anchored::No, + earliest: false, + } + } + + /// Set the span for this search. + /// + /// This routine is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. To provide anything supported by range + /// syntax, use the [`Input::range`] method. + /// + /// The default span is the entire haystack. + /// + /// Note that [`Input::range`] overrides this method and vice versa. + /// + /// # Panics + /// + /// This panics if the given span does not correspond to valid bounds in + /// the haystack or the termination of a search. 
+ /// + /// # Example + /// + /// This example shows how the span of the search can impact whether a + /// match is reported or not. + /// + /// ``` + /// use aho_corasick::{AhoCorasick, Input, MatchKind}; + /// + /// let patterns = &["b", "abcd", "abc"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns) + /// .unwrap(); + /// let input = Input::new(haystack).span(0..3); + /// let mat = ac.try_find(input)?.expect("should have a match"); + /// // Without the span stopping the search early, 'abcd' would be reported + /// // because it is the correct leftmost-first match. + /// assert_eq!("abc", &haystack[mat.span()]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> { + self.set_span(span); + self + } + + /// Like `Input::span`, but accepts any range instead. + /// + /// The default range is the entire haystack. + /// + /// Note that [`Input::span`] overrides this method and vice versa. + /// + /// # Panics + /// + /// This routine will panic if the given range could not be converted + /// to a valid [`Range`]. For example, this would panic when given + /// `0..=usize::MAX` since it cannot be represented using a half-open + /// interval in terms of `usize`. + /// + /// This routine also panics if the given range does not correspond to + /// valid bounds in the haystack or the termination of a search. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// + /// let input = Input::new("foobar").range(2..=4); + /// assert_eq!(2..5, input.get_range()); + /// ``` + #[inline] + pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> { + self.set_range(range); + self + } + + /// Sets the anchor mode of a search. 
+ /// + /// When a search is anchored (via [`Anchored::Yes`]), a match must begin + /// at the start of a search. When a search is not anchored (that's + /// [`Anchored::No`]), searchers will look for a match anywhere in the + /// haystack. + /// + /// By default, the anchored mode is [`Anchored::No`]. + /// + /// # Support for anchored searches + /// + /// Anchored or unanchored searches might not always be available, + /// depending on the type of searcher used and its configuration: + /// + /// * [`noncontiguous::NFA`](crate::nfa::noncontiguous::NFA) always + /// supports both unanchored and anchored searches. + /// * [`contiguous::NFA`](crate::nfa::contiguous::NFA) always supports both + /// unanchored and anchored searches. + /// * [`dfa::DFA`](crate::dfa::DFA) supports only unanchored + /// searches by default. + /// [`dfa::Builder::start_kind`](crate::dfa::Builder::start_kind) can + /// be used to change the default to supporting both kinds of searches + /// or even just anchored searches. + /// * [`AhoCorasick`](crate::AhoCorasick) inherits the same setup as a + /// `DFA`. Namely, it only supports unanchored searches by default, but + /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind) + /// can change this. + /// + /// If you try to execute a search using a `try_` ("fallible") method with + /// an unsupported anchor mode, then an error will be returned. For calls + /// to infallible search methods, a panic will result. + /// + /// # Example + /// + /// This demonstrates the differences between an anchored search and + /// an unanchored search. Notice that we build our `AhoCorasick` searcher + /// with [`StartKind::Both`] so that it supports both unanchored and + /// anchored searches simultaneously. 
+ /// + /// ``` + /// use aho_corasick::{ + /// AhoCorasick, Anchored, Input, MatchKind, StartKind, + /// }; + /// + /// let patterns = &["bcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasick::builder() + /// .start_kind(StartKind::Both) + /// .build(patterns) + /// .unwrap(); + /// + /// // Note that 'Anchored::No' is the default, so it doesn't need to + /// // be explicitly specified here. + /// let input = Input::new(haystack); + /// let mat = ac.try_find(input)?.expect("should have a match"); + /// assert_eq!("bcd", &haystack[mat.span()]); + /// + /// // While 'bcd' occurs in the haystack, it does not begin where our + /// // search begins, so no match is found. + /// let input = Input::new(haystack).anchored(Anchored::Yes); + /// assert_eq!(None, ac.try_find(input)?); + /// + /// // However, if we start our search where 'bcd' starts, then we will + /// // find a match. + /// let input = Input::new(haystack).range(1..).anchored(Anchored::Yes); + /// let mat = ac.try_find(input)?.expect("should have a match"); + /// assert_eq!("bcd", &haystack[mat.span()]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn anchored(mut self, mode: Anchored) -> Input<'h> { + self.set_anchored(mode); + self + } + + /// Whether to execute an "earliest" search or not. + /// + /// When running a non-overlapping search, an "earliest" search will + /// return the match location as early as possible. For example, given + /// the patterns `abc` and `b`, and a haystack of `abc`, a normal + /// leftmost-first search will return `abc` as a match. But an "earliest" + /// search will return as soon as it is known that a match occurs, which + /// happens once `b` is seen. + /// + /// Note that when using [`MatchKind::Standard`], the "earliest" option + /// has no effect since standard semantics are already "earliest." 
Note + /// also that this has no effect in overlapping searches, since overlapping + /// searches also use standard semantics and report all possible matches. + /// + /// This is disabled by default. + /// + /// # Example + /// + /// This example shows the difference between "earliest" searching and + /// normal leftmost searching. + /// + /// ``` + /// use aho_corasick::{AhoCorasick, Anchored, Input, MatchKind, StartKind}; + /// + /// let patterns = &["abc", "b"]; + /// let haystack = "abc"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns) + /// .unwrap(); + /// + /// // The normal leftmost-first match. + /// let input = Input::new(haystack); + /// let mat = ac.try_find(input)?.expect("should have a match"); + /// assert_eq!("abc", &haystack[mat.span()]); + /// + /// // The "earliest" possible match, even if it isn't leftmost-first. + /// let input = Input::new(haystack).earliest(true); + /// let mat = ac.try_find(input)?.expect("should have a match"); + /// assert_eq!("b", &haystack[mat.span()]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn earliest(mut self, yes: bool) -> Input<'h> { + self.set_earliest(yes); + self + } + + /// Set the span for this search configuration. + /// + /// This is like the [`Input::span`] method, except this mutates the + /// span in place. + /// + /// This routine is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. + /// + /// # Panics + /// + /// This panics if the given span does not correspond to valid bounds in + /// the haystack or the termination of a search. 
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use aho_corasick::Input;
+    ///
+    /// let mut input = Input::new("foobar");
+    /// assert_eq!(0..6, input.get_range());
+    /// input.set_span(2..4);
+    /// assert_eq!(2..4, input.get_range());
+    /// ```
+    #[inline]
+    pub fn set_span<S: Into<Span>>(&mut self, span: S) {
+        let span = span.into();
+        assert!(
+            span.end <= self.haystack.len()
+                && span.start <= span.end.wrapping_add(1),
+            "invalid span {:?} for haystack of length {}",
+            span,
+            self.haystack.len(),
+        );
+        self.span = span;
+    }
+
+    /// Set the span for this search configuration given any range.
+    ///
+    /// This is like the [`Input::range`] method, except this mutates the
+    /// span in place.
+    ///
+    /// # Panics
+    ///
+    /// This routine will panic if the given range could not be converted
+    /// to a valid [`Range`]. For example, this would panic when given
+    /// `0..=usize::MAX` since it cannot be represented using a half-open
+    /// interval in terms of `usize`.
+    ///
+    /// This routine also panics if the given range does not correspond to
+    /// valid bounds in the haystack or the termination of a search.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use aho_corasick::Input;
+    ///
+    /// let mut input = Input::new("foobar");
+    /// assert_eq!(0..6, input.get_range());
+    /// input.set_range(2..=4);
+    /// assert_eq!(2..5, input.get_range());
+    /// ```
+    #[inline]
+    pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) {
+        use core::ops::Bound;
+
+        // It's a little weird to convert ranges into spans, and then spans
+        // back into ranges when we actually slice the haystack. Because
+        // of that process, we always represent everything as a half-open
+        // interval. Therefore, handling things like m..=n is a little awkward.
+        let start = match range.start_bound() {
+            Bound::Included(&i) => i,
+            // Can this case ever happen? Range syntax doesn't support it...
+ Bound::Excluded(&i) => i.checked_add(1).unwrap(), + Bound::Unbounded => 0, + }; + let end = match range.end_bound() { + Bound::Included(&i) => i.checked_add(1).unwrap(), + Bound::Excluded(&i) => i, + Bound::Unbounded => self.haystack().len(), + }; + self.set_span(Span { start, end }); + } + + /// Set the starting offset for the span for this search configuration. + /// + /// This is a convenience routine for only mutating the start of a span + /// without having to set the entire span. + /// + /// # Panics + /// + /// This panics if the given span does not correspond to valid bounds in + /// the haystack or the termination of a search. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_start(5); + /// assert_eq!(5..6, input.get_range()); + /// ``` + #[inline] + pub fn set_start(&mut self, start: usize) { + self.set_span(Span { start, ..self.get_span() }); + } + + /// Set the ending offset for the span for this search configuration. + /// + /// This is a convenience routine for only mutating the end of a span + /// without having to set the entire span. + /// + /// # Panics + /// + /// This panics if the given span does not correspond to valid bounds in + /// the haystack or the termination of a search. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_end(5); + /// assert_eq!(0..5, input.get_range()); + /// ``` + #[inline] + pub fn set_end(&mut self, end: usize) { + self.set_span(Span { end, ..self.get_span() }); + } + + /// Set the anchor mode of a search. + /// + /// This is like [`Input::anchored`], except it mutates the search + /// configuration in place. 
+ /// + /// # Example + /// + /// ``` + /// use aho_corasick::{Anchored, Input}; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(Anchored::No, input.get_anchored()); + /// + /// input.set_anchored(Anchored::Yes); + /// assert_eq!(Anchored::Yes, input.get_anchored()); + /// ``` + #[inline] + pub fn set_anchored(&mut self, mode: Anchored) { + self.anchored = mode; + } + + /// Set whether the search should execute in "earliest" mode or not. + /// + /// This is like [`Input::earliest`], except it mutates the search + /// configuration in place. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert!(!input.get_earliest()); + /// input.set_earliest(true); + /// assert!(input.get_earliest()); + /// ``` + #[inline] + pub fn set_earliest(&mut self, yes: bool) { + self.earliest = yes; + } + + /// Return a borrow of the underlying haystack as a slice of bytes. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(b"foobar", input.haystack()); + /// ``` + #[inline] + pub fn haystack(&self) -> &[u8] { + self.haystack + } + + /// Return the start position of this search. + /// + /// This is a convenience routine for `search.get_span().start()`. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(0, input.start()); + /// + /// let input = Input::new("foobar").span(2..4); + /// assert_eq!(2, input.start()); + /// ``` + #[inline] + pub fn start(&self) -> usize { + self.get_span().start + } + + /// Return the end position of this search. + /// + /// This is a convenience routine for `search.get_span().end()`. 
+ /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(6, input.end()); + /// + /// let input = Input::new("foobar").span(2..4); + /// assert_eq!(4, input.end()); + /// ``` + #[inline] + pub fn end(&self) -> usize { + self.get_span().end + } + + /// Return the span for this search configuration. + /// + /// If one was not explicitly set, then the span corresponds to the entire + /// range of the haystack. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::{Input, Span}; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(Span { start: 0, end: 6 }, input.get_span()); + /// ``` + #[inline] + pub fn get_span(&self) -> Span { + self.span + } + + /// Return the span as a range for this search configuration. + /// + /// If one was not explicitly set, then the span corresponds to the entire + /// range of the haystack. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// ``` + #[inline] + pub fn get_range(&self) -> Range<usize> { + self.get_span().range() + } + + /// Return the anchored mode for this search configuration. + /// + /// If no anchored mode was set, then it defaults to [`Anchored::No`]. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::{Anchored, Input}; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(Anchored::No, input.get_anchored()); + /// + /// input.set_anchored(Anchored::Yes); + /// assert_eq!(Anchored::Yes, input.get_anchored()); + /// ``` + #[inline] + pub fn get_anchored(&self) -> Anchored { + self.anchored + } + + /// Return whether this search should execute in "earliest" mode. 
+ /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let input = Input::new("foobar"); + /// assert!(!input.get_earliest()); + /// ``` + #[inline] + pub fn get_earliest(&self) -> bool { + self.earliest + } + + /// Return true if this input has been exhausted, which in turn means all + /// subsequent searches will return no matches. + /// + /// This occurs precisely when the start position of this search is greater + /// than the end position of the search. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert!(!input.is_done()); + /// input.set_start(6); + /// assert!(!input.is_done()); + /// input.set_start(7); + /// assert!(input.is_done()); + /// ``` + #[inline] + pub fn is_done(&self) -> bool { + self.get_span().start > self.get_span().end + } +} + +impl<'h> core::fmt::Debug for Input<'h> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut fmter = f.debug_struct("Input"); + match core::str::from_utf8(self.haystack()) { + Ok(nice) => fmter.field("haystack", &nice), + Err(_) => fmter.field("haystack", &self.haystack()), + } + .field("span", &self.span) + .field("anchored", &self.anchored) + .field("earliest", &self.earliest) + .finish() + } +} + +impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> { + #[inline] + fn from(haystack: &'h H) -> Input<'h> { + Input::new(haystack) + } +} + +/// A representation of a range in a haystack. +/// +/// A span corresponds to the starting and ending _byte offsets_ of a +/// contiguous region of bytes. The starting offset is inclusive while the +/// ending offset is exclusive. That is, a span is a half-open interval. +/// +/// A span is used to report the offsets of a match, but it is also used to +/// convey which region of a haystack should be searched via routines like +/// [`Input::span`]. 
+/// +/// This is basically equivalent to a `std::ops::Range<usize>`, except this +/// type implements `Copy` which makes it more ergonomic to use in the context +/// of this crate. Indeed, `Span` exists only because `Range<usize>` does +/// not implement `Copy`. Like a range, this implements `Index` for `[u8]` +/// and `str`, and `IndexMut` for `[u8]`. For convenience, this also impls +/// `From<Range>`, which means things like `Span::from(5..10)` work. +/// +/// There are no constraints on the values of a span. It is, for example, legal +/// to create a span where `start > end`. +#[derive(Clone, Copy, Eq, Hash, PartialEq)] +pub struct Span { + /// The start offset of the span, inclusive. + pub start: usize, + /// The end offset of the span, exclusive. + pub end: usize, +} + +impl Span { + /// Returns this span as a range. + #[inline] + pub fn range(&self) -> Range<usize> { + Range::from(*self) + } + + /// Returns true when this span is empty. That is, when `start >= end`. + #[inline] + pub fn is_empty(&self) -> bool { + self.start >= self.end + } + + /// Returns the length of this span. + /// + /// This returns `0` in precisely the cases that `is_empty` returns `true`. + #[inline] + pub fn len(&self) -> usize { + self.end.saturating_sub(self.start) + } + + /// Returns true when the given offset is contained within this span. + /// + /// Note that an empty span contains no offsets and will always return + /// false. + #[inline] + pub fn contains(&self, offset: usize) -> bool { + !self.is_empty() && self.start <= offset && offset <= self.end + } + + /// Returns a new span with `offset` added to this span's `start` and `end` + /// values. 
+ #[inline] + pub fn offset(&self, offset: usize) -> Span { + Span { start: self.start + offset, end: self.end + offset } + } +} + +impl core::fmt::Debug for Span { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{}..{}", self.start, self.end) + } +} + +impl core::ops::Index<Span> for [u8] { + type Output = [u8]; + + #[inline] + fn index(&self, index: Span) -> &[u8] { + &self[index.range()] + } +} + +impl core::ops::IndexMut<Span> for [u8] { + #[inline] + fn index_mut(&mut self, index: Span) -> &mut [u8] { + &mut self[index.range()] + } +} + +impl core::ops::Index<Span> for str { + type Output = str; + + #[inline] + fn index(&self, index: Span) -> &str { + &self[index.range()] + } +} + +impl From<Range<usize>> for Span { + #[inline] + fn from(range: Range<usize>) -> Span { + Span { start: range.start, end: range.end } + } +} + +impl From<Span> for Range<usize> { + #[inline] + fn from(span: Span) -> Range<usize> { + Range { start: span.start, end: span.end } + } +} + +impl PartialEq<Range<usize>> for Span { + #[inline] + fn eq(&self, range: &Range<usize>) -> bool { + self.start == range.start && self.end == range.end + } +} + +impl PartialEq<Span> for Range<usize> { + #[inline] + fn eq(&self, span: &Span) -> bool { + self.start == span.start && self.end == span.end + } +} + +/// The type of anchored search to perform. +/// +/// If an Aho-Corasick searcher does not support the anchored mode selected, +/// then the search will return an error or panic, depending on whether a +/// fallible or an infallible routine was called. +#[non_exhaustive] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Anchored { + /// Run an unanchored search. This means a match may occur anywhere at or + /// after the start position of the search up until the end position of the + /// search. + No, + /// Run an anchored search. This means that a match must begin at the start + /// position of the search and end before the end position of the search. 
+ Yes, +} + +impl Anchored { + /// Returns true if and only if this anchor mode corresponds to an anchored + /// search. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Anchored; + /// + /// assert!(!Anchored::No.is_anchored()); + /// assert!(Anchored::Yes.is_anchored()); + /// ``` + #[inline] + pub fn is_anchored(&self) -> bool { + matches!(*self, Anchored::Yes) + } +} + +/// A representation of a match reported by an Aho-Corasick searcher. +/// +/// A match has two essential pieces of information: the [`PatternID`] that +/// matches, and the [`Span`] of the match in a haystack. +/// +/// The pattern is identified by an ID, which corresponds to its position +/// (starting from `0`) relative to other patterns used to construct the +/// corresponding searcher. If only a single pattern is provided, then all +/// matches are guaranteed to have a pattern ID of `0`. +/// +/// Every match reported by a searcher guarantees that its span has its start +/// offset as less than or equal to its end offset. +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub struct Match { + /// The pattern ID. + pattern: PatternID, + /// The underlying match span. + span: Span, +} + +impl Match { + /// Create a new match from a pattern ID and a span. + /// + /// This constructor is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. + /// + /// # Panics + /// + /// This panics if `end < start`. + /// + /// # Example + /// + /// This shows how to create a match for the first pattern in an + /// Aho-Corasick searcher using convenient range syntax. 
+ /// + /// ``` + /// use aho_corasick::{Match, PatternID}; + /// + /// let m = Match::new(PatternID::ZERO, 5..10); + /// assert_eq!(0, m.pattern().as_usize()); + /// assert_eq!(5, m.start()); + /// assert_eq!(10, m.end()); + /// ``` + #[inline] + pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match { + let span = span.into(); + assert!(span.start <= span.end, "invalid match span"); + Match { pattern, span } + } + + /// Create a new match from a pattern ID and a byte offset span. + /// + /// This constructor is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. + /// + /// This is like [`Match::new`], but accepts a `usize` instead of a + /// [`PatternID`]. This panics if the given `usize` is not representable + /// as a `PatternID`. + /// + /// # Panics + /// + /// This panics if `end < start` or if `pattern > PatternID::MAX`. + /// + /// # Example + /// + /// This shows how to create a match for the third pattern in an + /// Aho-Corasick searcher using convenient range syntax. + /// + /// ``` + /// use aho_corasick::Match; + /// + /// let m = Match::must(3, 5..10); + /// assert_eq!(3, m.pattern().as_usize()); + /// assert_eq!(5, m.start()); + /// assert_eq!(10, m.end()); + /// ``` + #[inline] + pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match { + Match::new(PatternID::must(pattern), span) + } + + /// Returns the ID of the pattern that matched. + /// + /// The ID of a pattern is derived from the position in which it was + /// originally inserted into the corresponding searcher. The first pattern + /// has identifier `0`, and each subsequent pattern is `1`, `2` and so on. + #[inline] + pub fn pattern(&self) -> PatternID { + self.pattern + } + + /// The starting position of the match. + /// + /// This is a convenience routine for `Match::span().start`. 
+ #[inline] + pub fn start(&self) -> usize { + self.span().start + } + + /// The ending position of the match. + /// + /// This is a convenience routine for `Match::span().end`. + #[inline] + pub fn end(&self) -> usize { + self.span().end + } + + /// Returns the match span as a range. + /// + /// This is a convenience routine for `Match::span().range()`. + #[inline] + pub fn range(&self) -> core::ops::Range<usize> { + self.span().range() + } + + /// Returns the span for this match. + #[inline] + pub fn span(&self) -> Span { + self.span + } + + /// Returns true when the span in this match is empty. + /// + /// An empty match can only be returned when empty pattern is in the + /// Aho-Corasick searcher. + #[inline] + pub fn is_empty(&self) -> bool { + self.span().is_empty() + } + + /// Returns the length of this match. + /// + /// This returns `0` in precisely the cases that `is_empty` returns `true`. + #[inline] + pub fn len(&self) -> usize { + self.span().len() + } + + /// Returns a new match with `offset` added to its span's `start` and `end` + /// values. + #[inline] + pub fn offset(&self, offset: usize) -> Match { + Match { + pattern: self.pattern, + span: Span { + start: self.start() + offset, + end: self.end() + offset, + }, + } + } +} + +/// A knob for controlling the match semantics of an Aho-Corasick automaton. +/// +/// There are two generally different ways that Aho-Corasick automatons can +/// report matches. The first way is the "standard" approach that results from +/// implementing most textbook explanations of Aho-Corasick. The second way is +/// to report only the leftmost non-overlapping matches. The leftmost approach +/// is in turn split into two different ways of resolving ambiguous matches: +/// leftmost-first and leftmost-longest. +/// +/// The `Standard` match kind is the default and is the only one that supports +/// overlapping matches and stream searching. 
(Trying to find overlapping or +/// streaming matches using leftmost match semantics will result in an error in +/// fallible APIs and a panic when using infallible APIs.) The `Standard` match +/// kind will report matches as they are seen. When searching for overlapping +/// matches, then all possible matches are reported. When searching for +/// non-overlapping matches, the first match seen is reported. For example, for +/// non-overlapping matches, given the patterns `abcd` and `b` and the haystack +/// `abcdef`, only a match for `b` is reported since it is detected first. The +/// `abcd` match is never reported since it overlaps with the `b` match. +/// +/// In contrast, the leftmost match kind always prefers the leftmost match +/// among all possible matches. Given the same example as above with `abcd` and +/// `b` as patterns and `abcdef` as the haystack, the leftmost match is `abcd` +/// since it begins before the `b` match, even though the `b` match is detected +/// before the `abcd` match. In this case, the `b` match is not reported at all +/// since it overlaps with the `abcd` match. +/// +/// The difference between leftmost-first and leftmost-longest is in how they +/// resolve ambiguous matches when there are multiple leftmost matches to +/// choose from. Leftmost-first always chooses the pattern that was provided +/// earliest, whereas leftmost-longest always chooses the longest matching +/// pattern. For example, given the patterns `a` and `ab` and the subject +/// string `ab`, the leftmost-first match is `a` but the leftmost-longest match +/// is `ab`. Conversely, if the patterns were given in reverse order, i.e., +/// `ab` and `a`, then both the leftmost-first and leftmost-longest matches +/// would be `ab`. Stated differently, the leftmost-first match depends on the +/// order in which the patterns were given to the Aho-Corasick automaton. 
+/// Because of that, when leftmost-first matching is used, if a pattern `A` +/// that appears before a pattern `B` is a prefix of `B`, then it is impossible +/// to ever observe a match of `B`. +/// +/// If you're not sure which match kind to pick, then stick with the standard +/// kind, which is the default. In particular, if you need overlapping or +/// streaming matches, then you _must_ use the standard kind. The leftmost +/// kinds are useful in specific circumstances. For example, leftmost-first can +/// be very useful as a way to implement match priority based on the order of +/// patterns given and leftmost-longest can be useful for dictionary searching +/// such that only the longest matching words are reported. +/// +/// # Relationship with regular expression alternations +/// +/// Understanding match semantics can be a little tricky, and one easy way +/// to conceptualize non-overlapping matches from an Aho-Corasick automaton +/// is to think about them as a simple alternation of literals in a regular +/// expression. For example, let's say we wanted to match the strings +/// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It +/// turns out that regular expression engines have two different ways of +/// matching this alternation. The first way, leftmost-longest, is commonly +/// found in POSIX compatible implementations of regular expressions (such as +/// `grep`). The second way, leftmost-first, is commonly found in backtracking +/// implementations such as Perl. (Some regex engines, such as RE2 and Rust's +/// regex engine do not use backtracking, but still implement leftmost-first +/// semantics in an effort to match the behavior of dominant backtracking +/// regex engines such as those found in Perl, Ruby, Python, Javascript and +/// PHP.) 
+/// +/// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex +/// will match `Samwise` because it is the longest possible match, but a +/// Perl-like regex will match `Sam` since it appears earlier in the +/// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine +/// will never match `Samwise` since `Sam` will always have higher priority. +/// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to +/// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is +/// still the longest match, but it also appears earlier than `Sam`. +/// +/// The "standard" match semantics of Aho-Corasick generally don't correspond +/// to the match semantics of any large group of regex implementations, so +/// there's no direct analogy that can be made here. Standard match semantics +/// are generally useful for overlapping matches, or if you just want to see +/// matches as they are detected. +/// +/// The main conclusion to draw from this section is that the match semantics +/// can be tweaked to precisely match either Perl-like regex alternations or +/// POSIX regex alternations. +#[non_exhaustive] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum MatchKind { + /// Use standard match semantics, which support overlapping matches. When + /// used with non-overlapping matches, matches are reported as they are + /// seen. + Standard, + /// Use leftmost-first match semantics, which reports leftmost matches. + /// When there are multiple possible leftmost matches, the match + /// corresponding to the pattern that appeared earlier when constructing + /// the automaton is reported. + /// + /// This does **not** support overlapping matches or stream searching. If + /// this match kind is used, attempting to find overlapping matches or + /// stream matches will fail. + LeftmostFirst, + /// Use leftmost-longest match semantics, which reports leftmost matches. 
+ /// When there are multiple possible leftmost matches, the longest match + /// is chosen. + /// + /// This does **not** support overlapping matches or stream searching. If + /// this match kind is used, attempting to find overlapping matches or + /// stream matches will fail. + LeftmostLongest, +} + +/// The default match kind is `MatchKind::Standard`. +impl Default for MatchKind { + fn default() -> MatchKind { + MatchKind::Standard + } +} + +impl MatchKind { + #[inline] + pub(crate) fn is_standard(&self) -> bool { + matches!(*self, MatchKind::Standard) + } + + #[inline] + pub(crate) fn is_leftmost(&self) -> bool { + matches!(*self, MatchKind::LeftmostFirst | MatchKind::LeftmostLongest) + } + + #[inline] + pub(crate) fn is_leftmost_first(&self) -> bool { + matches!(*self, MatchKind::LeftmostFirst) + } + + /// Convert this match kind into a packed match kind. If this match kind + /// corresponds to standard semantics, then this returns None, since + /// packed searching does not support standard semantics. + #[inline] + pub(crate) fn as_packed(&self) -> Option<crate::packed::MatchKind> { + match *self { + MatchKind::Standard => None, + MatchKind::LeftmostFirst => { + Some(crate::packed::MatchKind::LeftmostFirst) + } + MatchKind::LeftmostLongest => { + Some(crate::packed::MatchKind::LeftmostLongest) + } + } + } +} + +/// The kind of anchored starting configurations to support in an Aho-Corasick +/// searcher. +/// +/// Depending on which searcher is used internally by +/// [`AhoCorasick`](crate::AhoCorasick), supporting both unanchored +/// and anchored searches can be quite costly. For this reason, +/// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind) +/// can be used to configure whether your searcher supports unanchored, +/// anchored or both kinds of searches. +/// +/// This searcher configuration knob works in concert with the search time +/// configuration [`Input::anchored`]. 
Namely, if one requests an unsupported +/// anchored mode, then the search will either panic or return an error, +/// depending on whether you're using infallible or fallible APIs, respectively. +/// +/// `AhoCorasick` by default only supports unanchored searches. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum StartKind { + /// Support both anchored and unanchored searches. + Both, + /// Support only unanchored searches. Requesting an anchored search will + /// return an error in fallible APIs and panic in infallible APIs. + Unanchored, + /// Support only anchored searches. Requesting an unanchored search will + /// return an error in fallible APIs and panic in infallible APIs. + Anchored, +} + +impl Default for StartKind { + fn default() -> StartKind { + StartKind::Unanchored + } +} diff --git a/third_party/rust/aho-corasick/src/util/special.rs b/third_party/rust/aho-corasick/src/util/special.rs new file mode 100644 index 0000000000..beeba40c89 --- /dev/null +++ b/third_party/rust/aho-corasick/src/util/special.rs @@ -0,0 +1,42 @@ +use crate::util::primitives::StateID; + +/// A collection of sentinel state IDs for Aho-Corasick automata. +/// +/// This specifically enables the technique by which we determine which states +/// are dead, matches or start states. Namely, by arranging states in a +/// particular order, we can determine the type of a state simply by looking at +/// its ID. +#[derive(Clone, Debug)] +pub(crate) struct Special { + /// The maximum ID of all the "special" states. This corresponds either to + /// start_anchored_id when a prefilter is active or max_match_id when a + /// prefilter is not active. The idea here is that if there is no prefilter, + /// then there is no point in treating start states as special. + pub(crate) max_special_id: StateID, + /// The maximum ID of all the match states. Any state ID bigger than this + /// is guaranteed to be a non-match ID. 
+ /// + /// It is possible and legal for max_match_id to be equal to + /// start_anchored_id, which occurs precisely in the case where the empty + /// string is a pattern that was added to the underlying automaton. + pub(crate) max_match_id: StateID, + /// The state ID of the start state used for unanchored searches. + pub(crate) start_unanchored_id: StateID, + /// The state ID of the start state used for anchored searches. This is + /// always start_unanchored_id+1. + pub(crate) start_anchored_id: StateID, +} + +impl Special { + /// Create a new set of "special" state IDs with all IDs initialized to + /// zero. The general idea here is that they will be updated and set to + /// correct values later. + pub(crate) fn zero() -> Special { + Special { + max_special_id: StateID::ZERO, + max_match_id: StateID::ZERO, + start_unanchored_id: StateID::ZERO, + start_anchored_id: StateID::ZERO, + } + } +} |