Diffstat (limited to 'third_party/rust/aho-corasick/src/util')
-rw-r--r--  third_party/rust/aho-corasick/src/util/alphabet.rs          409
-rw-r--r--  third_party/rust/aho-corasick/src/util/buffer.rs            124
-rw-r--r--  third_party/rust/aho-corasick/src/util/byte_frequencies.rs  258
-rw-r--r--  third_party/rust/aho-corasick/src/util/debug.rs              26
-rw-r--r--  third_party/rust/aho-corasick/src/util/error.rs             259
-rw-r--r--  third_party/rust/aho-corasick/src/util/int.rs               284
-rw-r--r--  third_party/rust/aho-corasick/src/util/mod.rs                12
-rw-r--r--  third_party/rust/aho-corasick/src/util/prefilter.rs         924
-rw-r--r--  third_party/rust/aho-corasick/src/util/primitives.rs        759
-rw-r--r--  third_party/rust/aho-corasick/src/util/remapper.rs          214
-rw-r--r--  third_party/rust/aho-corasick/src/util/search.rs           1148
-rw-r--r--  third_party/rust/aho-corasick/src/util/special.rs            42
12 files changed, 4459 insertions, 0 deletions
diff --git a/third_party/rust/aho-corasick/src/util/alphabet.rs b/third_party/rust/aho-corasick/src/util/alphabet.rs
new file mode 100644
index 0000000000..69724fa3ab
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/alphabet.rs
@@ -0,0 +1,409 @@
+use crate::util::int::Usize;
+
+/// A representation of byte oriented equivalence classes.
+///
+/// This is used in finite state machines to reduce the size of the transition
+/// table. This can have a particularly large impact not only on the total size
+/// of an FSM, but also on FSM build times because it reduces the number of
+/// transitions that need to be visited/set.
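+///
+/// For example, if every pattern only contains the bytes `a` and `b`, then
+/// a small number of classes suffice: one for `a`, one for `b` and one for
+/// each contiguous range of bytes that appears in no pattern. Each state
+/// then needs only that many transitions instead of 256.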
+#[derive(Clone, Copy)]
+pub(crate) struct ByteClasses([u8; 256]);
+
+impl ByteClasses {
+ /// Creates a new set of equivalence classes where all bytes are mapped to
+ /// the same class.
+ pub(crate) fn empty() -> ByteClasses {
+ ByteClasses([0; 256])
+ }
+
+ /// Creates a new set of equivalence classes where each byte belongs to
+ /// its own equivalence class.
+ pub(crate) fn singletons() -> ByteClasses {
+ let mut classes = ByteClasses::empty();
+ for b in 0..=255 {
+ classes.set(b, b);
+ }
+ classes
+ }
+
+ /// Set the equivalence class for the given byte.
+ #[inline]
+ pub(crate) fn set(&mut self, byte: u8, class: u8) {
+ self.0[usize::from(byte)] = class;
+ }
+
+ /// Get the equivalence class for the given byte.
+ #[inline]
+ pub(crate) fn get(&self, byte: u8) -> u8 {
+ self.0[usize::from(byte)]
+ }
+
+ /// Return the total number of elements in the alphabet represented by
+ /// these equivalence classes. Equivalently, this returns the total number
+ /// of equivalence classes.
+ #[inline]
+ pub(crate) fn alphabet_len(&self) -> usize {
+ // Add one since the number of equivalence classes is one more than
+ // the largest class identifier, which is always assigned to byte 255.
+ usize::from(self.0[255]) + 1
+ }
+
+ /// Returns the stride, as a base-2 exponent, required for these
+ /// equivalence classes.
+ ///
+ /// The stride is always the smallest power of 2 that is greater than or
+ /// equal to the alphabet length. This is done so that converting between
+ /// state IDs and indices can be done with shifts alone, which is much
+ /// faster than integer division. The "stride2" is the exponent. i.e.,
+ /// `2^stride2 = stride`.
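+ ///
+ /// For example, an alphabet length of 5 yields a stride of 8 and a
+ /// `stride2` of 3, since `2^3 = 8` is the smallest power of 2 that is
+ /// greater than or equal to 5. Multiplying a state index by the stride
+ /// then becomes a shift left by 3.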
+ pub(crate) fn stride2(&self) -> usize {
+ let zeros = self.alphabet_len().next_power_of_two().trailing_zeros();
+ usize::try_from(zeros).unwrap()
+ }
+
+ /// Returns the stride for these equivalence classes, which corresponds
+ /// to the smallest power of 2 greater than or equal to the number of
+ /// equivalence classes.
+ pub(crate) fn stride(&self) -> usize {
+ 1 << self.stride2()
+ }
+
+ /// Returns true if and only if every byte maps to its own equivalence
+ /// class. Equivalently, there are 256 equivalence classes, each containing
+ /// exactly one byte.
+ #[inline]
+ pub(crate) fn is_singleton(&self) -> bool {
+ self.alphabet_len() == 256
+ }
+
+ /// Returns an iterator over all equivalence classes in this set.
+ pub(crate) fn iter(&self) -> ByteClassIter {
+ ByteClassIter { it: 0..self.alphabet_len() }
+ }
+
+ /// Returns an iterator of the bytes in the given equivalence class.
+ pub(crate) fn elements(&self, class: u8) -> ByteClassElements {
+ ByteClassElements { classes: self, class, bytes: 0..=255 }
+ }
+
+ /// Returns an iterator of byte ranges in the given equivalence class.
+ ///
+ /// That is, a sequence of contiguous ranges is returned. Typically, every
+ /// class maps to a single contiguous range.
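+ ///
+ /// For example, if a class contains exactly the bytes `a` through `z`,
+ /// then this yields the single range `(b'a', b'z')`.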
+ fn element_ranges(&self, class: u8) -> ByteClassElementRanges {
+ ByteClassElementRanges { elements: self.elements(class), range: None }
+ }
+}
+
+impl core::fmt::Debug for ByteClasses {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ if self.is_singleton() {
+ write!(f, "ByteClasses(<one-class-per-byte>)")
+ } else {
+ write!(f, "ByteClasses(")?;
+ for (i, class) in self.iter().enumerate() {
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ write!(f, "{:?} => [", class)?;
+ for (start, end) in self.element_ranges(class) {
+ if start == end {
+ write!(f, "{:?}", start)?;
+ } else {
+ write!(f, "{:?}-{:?}", start, end)?;
+ }
+ }
+ write!(f, "]")?;
+ }
+ write!(f, ")")
+ }
+ }
+}
+
+/// An iterator over each equivalence class.
+#[derive(Debug)]
+pub(crate) struct ByteClassIter {
+ it: core::ops::Range<usize>,
+}
+
+impl Iterator for ByteClassIter {
+ type Item = u8;
+
+ fn next(&mut self) -> Option<u8> {
+ self.it.next().map(|class| class.as_u8())
+ }
+}
+
+/// An iterator over all elements in a specific equivalence class.
+#[derive(Debug)]
+pub(crate) struct ByteClassElements<'a> {
+ classes: &'a ByteClasses,
+ class: u8,
+ bytes: core::ops::RangeInclusive<u8>,
+}
+
+impl<'a> Iterator for ByteClassElements<'a> {
+ type Item = u8;
+
+ fn next(&mut self) -> Option<u8> {
+ while let Some(byte) = self.bytes.next() {
+ if self.class == self.classes.get(byte) {
+ return Some(byte);
+ }
+ }
+ None
+ }
+}
+
+/// An iterator over all elements in an equivalence class expressed as a
+/// sequence of contiguous ranges.
+#[derive(Debug)]
+pub(crate) struct ByteClassElementRanges<'a> {
+ elements: ByteClassElements<'a>,
+ range: Option<(u8, u8)>,
+}
+
+impl<'a> Iterator for ByteClassElementRanges<'a> {
+ type Item = (u8, u8);
+
+ fn next(&mut self) -> Option<(u8, u8)> {
+ loop {
+ let element = match self.elements.next() {
+ None => return self.range.take(),
+ Some(element) => element,
+ };
+ match self.range.take() {
+ None => {
+ self.range = Some((element, element));
+ }
+ Some((start, end)) => {
+ if usize::from(end) + 1 != usize::from(element) {
+ self.range = Some((element, element));
+ return Some((start, end));
+ }
+ self.range = Some((start, element));
+ }
+ }
+ }
+ }
+}
+
+/// A partitioning of bytes into equivalence classes.
+///
+/// A byte class set keeps track of an *approximation* of equivalence classes
+/// of bytes during NFA construction. That is, every byte in an equivalence
+/// class cannot discriminate between a match and a non-match.
+///
+/// Note that this may not compute the minimal set of equivalence classes.
+/// Basically, any byte in a pattern given to the noncontiguous NFA builder
+/// will automatically be treated as its own equivalence class. All other
+/// bytes---any byte not in any pattern---will be treated as their own
+/// equivalence classes. In theory, all bytes not in any pattern should
+/// be part of a single equivalence class, but in practice, we only treat
+/// contiguous ranges of bytes as an equivalence class. So the number of
+/// classes computed may be bigger than necessary. This usually doesn't make
+/// much of a difference, and keeps the implementation simple.
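+///
+/// For example, a pattern containing only the bytes `a` and `c` produces
+/// one class each for `a` and `c`, plus one class each for the contiguous
+/// ranges below `a`, between `a` and `c` (just `b`) and above `c`.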
+#[derive(Clone, Debug)]
+pub(crate) struct ByteClassSet(ByteSet);
+
+impl Default for ByteClassSet {
+ fn default() -> ByteClassSet {
+ ByteClassSet::empty()
+ }
+}
+
+impl ByteClassSet {
+ /// Create a new set of byte classes where all bytes are part of the same
+ /// equivalence class.
+ pub(crate) fn empty() -> Self {
+ ByteClassSet(ByteSet::empty())
+ }
+
+ /// Indicate that the given inclusive range of bytes can discriminate a
+ /// match between it and all other bytes outside of the range.
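+ ///
+ /// For example, `set_range(b'a', b'z')` records `b'a' - 1` and `b'z'` as
+ /// class boundaries, which splits the byte space into three classes: the
+ /// bytes below `a`, the range `a-z` itself and the bytes above `z`.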
+ pub(crate) fn set_range(&mut self, start: u8, end: u8) {
+ debug_assert!(start <= end);
+ if start > 0 {
+ self.0.add(start - 1);
+ }
+ self.0.add(end);
+ }
+
+ /// Convert this boolean set to a map that maps all byte values to their
+ /// corresponding equivalence class. The last mapping indicates the largest
+ /// equivalence class identifier (which is never bigger than 255).
+ pub(crate) fn byte_classes(&self) -> ByteClasses {
+ let mut classes = ByteClasses::empty();
+ let mut class = 0u8;
+ let mut b = 0u8;
+ loop {
+ classes.set(b, class);
+ if b == 255 {
+ break;
+ }
+ if self.0.contains(b) {
+ class = class.checked_add(1).unwrap();
+ }
+ b = b.checked_add(1).unwrap();
+ }
+ classes
+ }
+}
+
+/// A simple set of bytes that is reasonably cheap to copy and allocation free.
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
+pub(crate) struct ByteSet {
+ bits: BitSet,
+}
+
+/// The representation of a byte set. Split out so that we can define a
+/// convenient Debug impl for it while keeping "ByteSet" in the output.
+#[derive(Clone, Copy, Default, Eq, PartialEq)]
+struct BitSet([u128; 2]);
+
+impl ByteSet {
+ /// Create an empty set of bytes.
+ pub(crate) fn empty() -> ByteSet {
+ ByteSet { bits: BitSet([0; 2]) }
+ }
+
+ /// Add a byte to this set.
+ ///
+ /// If the given byte already belongs to this set, then this is a no-op.
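+ ///
+ /// Each byte maps to a single bit in one of two 128-bit words: for
+ /// example, byte `200` lands in bucket `1` (`200 / 128`) at bit `72`
+ /// (`200 % 128`).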
+ pub(crate) fn add(&mut self, byte: u8) {
+ let bucket = byte / 128;
+ let bit = byte % 128;
+ self.bits.0[usize::from(bucket)] |= 1 << bit;
+ }
+
+ /// Return true if and only if the given byte is in this set.
+ pub(crate) fn contains(&self, byte: u8) -> bool {
+ let bucket = byte / 128;
+ let bit = byte % 128;
+ self.bits.0[usize::from(bucket)] & (1 << bit) > 0
+ }
+}
+
+impl core::fmt::Debug for BitSet {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut fmtd = f.debug_set();
+ for b in 0u8..=255 {
+ if (ByteSet { bits: *self }).contains(b) {
+ fmtd.entry(&b);
+ }
+ }
+ fmtd.finish()
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use alloc::{vec, vec::Vec};
+
+ use super::*;
+
+ #[test]
+ fn byte_classes() {
+ let mut set = ByteClassSet::empty();
+ set.set_range(b'a', b'z');
+
+ let classes = set.byte_classes();
+ assert_eq!(classes.get(0), 0);
+ assert_eq!(classes.get(1), 0);
+ assert_eq!(classes.get(2), 0);
+ assert_eq!(classes.get(b'a' - 1), 0);
+ assert_eq!(classes.get(b'a'), 1);
+ assert_eq!(classes.get(b'm'), 1);
+ assert_eq!(classes.get(b'z'), 1);
+ assert_eq!(classes.get(b'z' + 1), 2);
+ assert_eq!(classes.get(254), 2);
+ assert_eq!(classes.get(255), 2);
+
+ let mut set = ByteClassSet::empty();
+ set.set_range(0, 2);
+ set.set_range(4, 6);
+ let classes = set.byte_classes();
+ assert_eq!(classes.get(0), 0);
+ assert_eq!(classes.get(1), 0);
+ assert_eq!(classes.get(2), 0);
+ assert_eq!(classes.get(3), 1);
+ assert_eq!(classes.get(4), 2);
+ assert_eq!(classes.get(5), 2);
+ assert_eq!(classes.get(6), 2);
+ assert_eq!(classes.get(7), 3);
+ assert_eq!(classes.get(255), 3);
+ }
+
+ #[test]
+ fn full_byte_classes() {
+ let mut set = ByteClassSet::empty();
+ for b in 0u8..=255 {
+ set.set_range(b, b);
+ }
+ assert_eq!(set.byte_classes().alphabet_len(), 256);
+ }
+
+ #[test]
+ fn elements_typical() {
+ let mut set = ByteClassSet::empty();
+ set.set_range(b'b', b'd');
+ set.set_range(b'g', b'm');
+ set.set_range(b'z', b'z');
+ let classes = set.byte_classes();
+ // class 0: \x00-a
+ // class 1: b-d
+ // class 2: e-f
+ // class 3: g-m
+ // class 4: n-y
+ // class 5: z-z
+ // class 6: \x7B-\xFF
+ assert_eq!(classes.alphabet_len(), 7);
+
+ let elements = classes.elements(0).collect::<Vec<_>>();
+ assert_eq!(elements.len(), 98);
+ assert_eq!(elements[0], b'\x00');
+ assert_eq!(elements[97], b'a');
+
+ let elements = classes.elements(1).collect::<Vec<_>>();
+ assert_eq!(elements, vec![b'b', b'c', b'd'],);
+
+ let elements = classes.elements(2).collect::<Vec<_>>();
+ assert_eq!(elements, vec![b'e', b'f'],);
+
+ let elements = classes.elements(3).collect::<Vec<_>>();
+ assert_eq!(elements, vec![b'g', b'h', b'i', b'j', b'k', b'l', b'm',],);
+
+ let elements = classes.elements(4).collect::<Vec<_>>();
+ assert_eq!(elements.len(), 12);
+ assert_eq!(elements[0], b'n');
+ assert_eq!(elements[11], b'y');
+
+ let elements = classes.elements(5).collect::<Vec<_>>();
+ assert_eq!(elements, vec![b'z']);
+
+ let elements = classes.elements(6).collect::<Vec<_>>();
+ assert_eq!(elements.len(), 133);
+ assert_eq!(elements[0], b'\x7B');
+ assert_eq!(elements[132], b'\xFF');
+ }
+
+ #[test]
+ fn elements_singletons() {
+ let classes = ByteClasses::singletons();
+ assert_eq!(classes.alphabet_len(), 256);
+
+ let elements = classes.elements(b'a').collect::<Vec<_>>();
+ assert_eq!(elements, vec![b'a']);
+ }
+
+ #[test]
+ fn elements_empty() {
+ let classes = ByteClasses::empty();
+ assert_eq!(classes.alphabet_len(), 1);
+
+ let elements = classes.elements(0).collect::<Vec<_>>();
+ assert_eq!(elements.len(), 256);
+ assert_eq!(elements[0], b'\x00');
+ assert_eq!(elements[255], b'\xFF');
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/util/buffer.rs b/third_party/rust/aho-corasick/src/util/buffer.rs
new file mode 100644
index 0000000000..e9e982af58
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/buffer.rs
@@ -0,0 +1,124 @@
+use alloc::{vec, vec::Vec};
+
+/// The default buffer capacity that we use for the stream buffer.
+const DEFAULT_BUFFER_CAPACITY: usize = 64 * (1 << 10); // 64 KB
+
+/// A fairly simple roll buffer for supporting stream searches.
+///
+/// This buffer acts as a temporary place to store a fixed amount of data when
+/// reading from a stream. Its central purpose is to allow "rolling" some
+/// suffix of the data to the beginning of the buffer before refilling it with
+/// more data from the stream. For example, let's say we are trying to match
+/// "foobar" on a stream. When we report the match, we'd like to not only
+/// report the correct offsets at which the match occurs, but also the matching
+/// bytes themselves. So let's say our stream is a file with the following
+/// contents: `test test foobar test test`. Now assume that we happen to read
+/// the aforementioned file in two chunks: `test test foo` and `bar test test`.
+/// Naively, it would not be possible to report a single contiguous `foobar`
+/// match, but this roll buffer allows us to do that. Namely, after the second
+/// read, the contents of the buffer should be `st foobar test test`, where the
+/// search should ultimately resume immediately after `foo`. (The prefix `st `
+/// is included because the roll buffer saves N bytes at the end of the buffer,
+/// where N is the maximum possible length of a match.)
+///
+/// A lot of the logic for dealing with this is unfortunately split out between
+/// this roll buffer and the `StreamChunkIter`.
+///
+/// Note also that this buffer is not actually required to just report matches.
+/// Because a `Match` is just some offsets. But it *is* required for supporting
+/// things like `try_stream_replace_all` because that needs some mechanism for
+/// knowing which bytes in the stream correspond to a match and which don't. So
+/// when a match occurs across two `read` calls, *something* needs to retain
+/// the bytes from the previous `read` call because you don't know before the
+/// second read call whether a match exists or not.
+#[derive(Debug)]
+pub(crate) struct Buffer {
+ /// The raw buffer contents. This has a fixed size and never increases.
+ buf: Vec<u8>,
+ /// The minimum size of the buffer, which is equivalent to the maximum
+ /// possible length of a match. This corresponds to the amount that we
+ /// roll the buffer by: the last `min` bytes are kept when rolling.
+ min: usize,
+ /// The end of the contents of this buffer.
+ end: usize,
+}
+
+impl Buffer {
+ /// Create a new buffer for stream searching. The minimum buffer length
+ /// given should be the maximum possible match length.
+ pub(crate) fn new(min_buffer_len: usize) -> Buffer {
+ let min = core::cmp::max(1, min_buffer_len);
+ // The minimum buffer amount is also the amount that we roll our
+ // buffer in order to support incremental searching. To this end,
+ // our actual capacity needs to be at least 1 byte bigger than our
+ // minimum amount, otherwise we won't have any overlap. In actuality,
+ // we want our buffer to be a bit bigger than that for performance
+ // reasons, so we set a lower bound of `8 * min`.
+ //
+ // TODO: It would be good to find a way to test the streaming
+ // implementation with the minimal buffer size. For now, one can just
+ // uncomment the next line and comment out the subsequent one.
+ // let capacity = 1 + min;
+ let capacity = core::cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY);
+ Buffer { buf: vec![0; capacity], min, end: 0 }
+ }
+
+ /// Return the contents of this buffer.
+ #[inline]
+ pub(crate) fn buffer(&self) -> &[u8] {
+ &self.buf[..self.end]
+ }
+
+ /// Return the minimum size of the buffer. The only way a buffer may be
+ /// smaller than this is if the stream itself contains less than the
+ /// minimum buffer amount.
+ #[inline]
+ pub(crate) fn min_buffer_len(&self) -> usize {
+ self.min
+ }
+
+ /// Return all free capacity in this buffer.
+ fn free_buffer(&mut self) -> &mut [u8] {
+ &mut self.buf[self.end..]
+ }
+
+ /// Refill the contents of this buffer by reading as much as possible into
+ /// this buffer's free capacity. If no more bytes could be read, then this
+ /// returns false. Otherwise, this reads until it has filled the buffer
+ /// to at least the minimum amount.
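+ ///
+ /// A sketch of the intended driver loop, assuming a caller-defined
+ /// `search` routine and maximum pattern length:
+ ///
+ /// ```ignore
+ /// let mut buf = Buffer::new(max_pattern_len);
+ /// while buf.fill(&mut rdr)? {
+ ///     search(buf.buffer());
+ ///     // Keep the last `min_buffer_len()` bytes so that a match spanning
+ ///     // two reads is still seen contiguously on the next iteration.
+ ///     buf.roll();
+ /// }
+ /// ```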
+ pub(crate) fn fill<R: std::io::Read>(
+ &mut self,
+ mut rdr: R,
+ ) -> std::io::Result<bool> {
+ let mut readany = false;
+ loop {
+ let readlen = rdr.read(self.free_buffer())?;
+ if readlen == 0 {
+ return Ok(readany);
+ }
+ readany = true;
+ self.end += readlen;
+ if self.buffer().len() >= self.min {
+ return Ok(true);
+ }
+ }
+ }
+
+ /// Roll the contents of the buffer so that the suffix of this buffer is
+ /// moved to the front and all other contents are dropped. The size of the
+ /// suffix corresponds precisely to the minimum buffer length.
+ ///
+ /// This should only be called when the entire contents of this buffer have
+ /// been searched.
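+ ///
+ /// For example, with `min = 6` and `end = 20`, the bytes at `14..20` are
+ /// copied to `0..6` and `end` is reset to `6`.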
+ pub(crate) fn roll(&mut self) {
+ let roll_start = self
+ .end
+ .checked_sub(self.min)
+ .expect("buffer capacity should be bigger than minimum amount");
+ let roll_end = roll_start + self.min;
+
+ assert!(roll_end <= self.end);
+ self.buf.copy_within(roll_start..roll_end, 0);
+ self.end = self.min;
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/util/byte_frequencies.rs b/third_party/rust/aho-corasick/src/util/byte_frequencies.rs
new file mode 100644
index 0000000000..c313b629db
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/byte_frequencies.rs
@@ -0,0 +1,258 @@
+pub const BYTE_FREQUENCIES: [u8; 256] = [
+ 55, // '\x00'
+ 52, // '\x01'
+ 51, // '\x02'
+ 50, // '\x03'
+ 49, // '\x04'
+ 48, // '\x05'
+ 47, // '\x06'
+ 46, // '\x07'
+ 45, // '\x08'
+ 103, // '\t'
+ 242, // '\n'
+ 66, // '\x0b'
+ 67, // '\x0c'
+ 229, // '\r'
+ 44, // '\x0e'
+ 43, // '\x0f'
+ 42, // '\x10'
+ 41, // '\x11'
+ 40, // '\x12'
+ 39, // '\x13'
+ 38, // '\x14'
+ 37, // '\x15'
+ 36, // '\x16'
+ 35, // '\x17'
+ 34, // '\x18'
+ 33, // '\x19'
+ 56, // '\x1a'
+ 32, // '\x1b'
+ 31, // '\x1c'
+ 30, // '\x1d'
+ 29, // '\x1e'
+ 28, // '\x1f'
+ 255, // ' '
+ 148, // '!'
+ 164, // '"'
+ 149, // '#'
+ 136, // '$'
+ 160, // '%'
+ 155, // '&'
+ 173, // "'"
+ 221, // '('
+ 222, // ')'
+ 134, // '*'
+ 122, // '+'
+ 232, // ','
+ 202, // '-'
+ 215, // '.'
+ 224, // '/'
+ 208, // '0'
+ 220, // '1'
+ 204, // '2'
+ 187, // '3'
+ 183, // '4'
+ 179, // '5'
+ 177, // '6'
+ 168, // '7'
+ 178, // '8'
+ 200, // '9'
+ 226, // ':'
+ 195, // ';'
+ 154, // '<'
+ 184, // '='
+ 174, // '>'
+ 126, // '?'
+ 120, // '@'
+ 191, // 'A'
+ 157, // 'B'
+ 194, // 'C'
+ 170, // 'D'
+ 189, // 'E'
+ 162, // 'F'
+ 161, // 'G'
+ 150, // 'H'
+ 193, // 'I'
+ 142, // 'J'
+ 137, // 'K'
+ 171, // 'L'
+ 176, // 'M'
+ 185, // 'N'
+ 167, // 'O'
+ 186, // 'P'
+ 112, // 'Q'
+ 175, // 'R'
+ 192, // 'S'
+ 188, // 'T'
+ 156, // 'U'
+ 140, // 'V'
+ 143, // 'W'
+ 123, // 'X'
+ 133, // 'Y'
+ 128, // 'Z'
+ 147, // '['
+ 138, // '\\'
+ 146, // ']'
+ 114, // '^'
+ 223, // '_'
+ 151, // '`'
+ 249, // 'a'
+ 216, // 'b'
+ 238, // 'c'
+ 236, // 'd'
+ 253, // 'e'
+ 227, // 'f'
+ 218, // 'g'
+ 230, // 'h'
+ 247, // 'i'
+ 135, // 'j'
+ 180, // 'k'
+ 241, // 'l'
+ 233, // 'm'
+ 246, // 'n'
+ 244, // 'o'
+ 231, // 'p'
+ 139, // 'q'
+ 245, // 'r'
+ 243, // 's'
+ 251, // 't'
+ 235, // 'u'
+ 201, // 'v'
+ 196, // 'w'
+ 240, // 'x'
+ 214, // 'y'
+ 152, // 'z'
+ 182, // '{'
+ 205, // '|'
+ 181, // '}'
+ 127, // '~'
+ 27, // '\x7f'
+ 212, // '\x80'
+ 211, // '\x81'
+ 210, // '\x82'
+ 213, // '\x83'
+ 228, // '\x84'
+ 197, // '\x85'
+ 169, // '\x86'
+ 159, // '\x87'
+ 131, // '\x88'
+ 172, // '\x89'
+ 105, // '\x8a'
+ 80, // '\x8b'
+ 98, // '\x8c'
+ 96, // '\x8d'
+ 97, // '\x8e'
+ 81, // '\x8f'
+ 207, // '\x90'
+ 145, // '\x91'
+ 116, // '\x92'
+ 115, // '\x93'
+ 144, // '\x94'
+ 130, // '\x95'
+ 153, // '\x96'
+ 121, // '\x97'
+ 107, // '\x98'
+ 132, // '\x99'
+ 109, // '\x9a'
+ 110, // '\x9b'
+ 124, // '\x9c'
+ 111, // '\x9d'
+ 82, // '\x9e'
+ 108, // '\x9f'
+ 118, // '\xa0'
+ 141, // '¡'
+ 113, // '¢'
+ 129, // '£'
+ 119, // '¤'
+ 125, // '¥'
+ 165, // '¦'
+ 117, // '§'
+ 92, // '¨'
+ 106, // '©'
+ 83, // 'ª'
+ 72, // '«'
+ 99, // '¬'
+ 93, // '\xad'
+ 65, // '®'
+ 79, // '¯'
+ 166, // '°'
+ 237, // '±'
+ 163, // '²'
+ 199, // '³'
+ 190, // '´'
+ 225, // 'µ'
+ 209, // '¶'
+ 203, // '·'
+ 198, // '¸'
+ 217, // '¹'
+ 219, // 'º'
+ 206, // '»'
+ 234, // '¼'
+ 248, // '½'
+ 158, // '¾'
+ 239, // '¿'
+ 255, // 'À'
+ 255, // 'Á'
+ 255, // 'Â'
+ 255, // 'Ã'
+ 255, // 'Ä'
+ 255, // 'Å'
+ 255, // 'Æ'
+ 255, // 'Ç'
+ 255, // 'È'
+ 255, // 'É'
+ 255, // 'Ê'
+ 255, // 'Ë'
+ 255, // 'Ì'
+ 255, // 'Í'
+ 255, // 'Î'
+ 255, // 'Ï'
+ 255, // 'Ð'
+ 255, // 'Ñ'
+ 255, // 'Ò'
+ 255, // 'Ó'
+ 255, // 'Ô'
+ 255, // 'Õ'
+ 255, // 'Ö'
+ 255, // '×'
+ 255, // 'Ø'
+ 255, // 'Ù'
+ 255, // 'Ú'
+ 255, // 'Û'
+ 255, // 'Ü'
+ 255, // 'Ý'
+ 255, // 'Þ'
+ 255, // 'ß'
+ 255, // 'à'
+ 255, // 'á'
+ 255, // 'â'
+ 255, // 'ã'
+ 255, // 'ä'
+ 255, // 'å'
+ 255, // 'æ'
+ 255, // 'ç'
+ 255, // 'è'
+ 255, // 'é'
+ 255, // 'ê'
+ 255, // 'ë'
+ 255, // 'ì'
+ 255, // 'í'
+ 255, // 'î'
+ 255, // 'ï'
+ 255, // 'ð'
+ 255, // 'ñ'
+ 255, // 'ò'
+ 255, // 'ó'
+ 255, // 'ô'
+ 255, // 'õ'
+ 255, // 'ö'
+ 255, // '÷'
+ 255, // 'ø'
+ 255, // 'ù'
+ 255, // 'ú'
+ 255, // 'û'
+ 255, // 'ü'
+ 255, // 'ý'
+ 255, // 'þ'
+ 255, // 'ÿ'
+];
diff --git a/third_party/rust/aho-corasick/src/util/debug.rs b/third_party/rust/aho-corasick/src/util/debug.rs
new file mode 100644
index 0000000000..22b5f2231f
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/debug.rs
@@ -0,0 +1,26 @@
+/// A type that wraps a single byte with a convenient fmt::Debug impl that
+/// escapes the byte.
+pub(crate) struct DebugByte(pub(crate) u8);
+
+impl core::fmt::Debug for DebugByte {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ // Special case ASCII space. It's too hard to read otherwise, so
+ // put quotes around it. I sometimes wonder whether just '\x20' would
+ // be better...
+ if self.0 == b' ' {
+ return write!(f, "' '");
+ }
+ // 10 bytes is enough to cover any output from ascii::escape_default.
+ let mut bytes = [0u8; 10];
+ let mut len = 0;
+ for (i, mut b) in core::ascii::escape_default(self.0).enumerate() {
+ // capitalize \xab to \xAB
+ if i >= 2 && b'a' <= b && b <= b'f' {
+ b -= 32;
+ }
+ bytes[len] = b;
+ len += 1;
+ }
+ write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap())
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/util/error.rs b/third_party/rust/aho-corasick/src/util/error.rs
new file mode 100644
index 0000000000..326d04657b
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/error.rs
@@ -0,0 +1,259 @@
+use crate::util::{
+ primitives::{PatternID, SmallIndex},
+ search::MatchKind,
+};
+
+/// An error that occurred during the construction of an Aho-Corasick
+/// automaton.
+///
+/// Build errors occur when some kind of limit has been exceeded, either in the
+/// number of states, the number of patterns or the length of a pattern. These
+/// limits aren't part of the public API, but they should generally be large
+/// enough to handle most use cases.
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
+#[derive(Clone, Debug)]
+pub struct BuildError {
+ kind: ErrorKind,
+}
+
+/// The kind of error that occurred.
+#[derive(Clone, Debug)]
+enum ErrorKind {
+ /// An error that occurs when allocating a new state would result in an
+ /// identifier that exceeds the capacity of a `StateID`.
+ StateIDOverflow {
+ /// The maximum possible id.
+ max: u64,
+ /// The maximum ID requested.
+ requested_max: u64,
+ },
+ /// An error that occurs when adding a pattern to an Aho-Corasick
+ /// automaton would result in an identifier that exceeds the capacity of a
+ /// `PatternID`.
+ PatternIDOverflow {
+ /// The maximum possible id.
+ max: u64,
+ /// The maximum ID requested.
+ requested_max: u64,
+ },
+ /// Occurs when a pattern string is given to the Aho-Corasick constructor
+ /// that is too long.
+ PatternTooLong {
+ /// The ID of the pattern that was too long.
+ pattern: PatternID,
+ /// The length that was too long.
+ len: usize,
+ },
+}
+
+impl BuildError {
+ pub(crate) fn state_id_overflow(
+ max: u64,
+ requested_max: u64,
+ ) -> BuildError {
+ BuildError { kind: ErrorKind::StateIDOverflow { max, requested_max } }
+ }
+
+ pub(crate) fn pattern_id_overflow(
+ max: u64,
+ requested_max: u64,
+ ) -> BuildError {
+ BuildError {
+ kind: ErrorKind::PatternIDOverflow { max, requested_max },
+ }
+ }
+
+ pub(crate) fn pattern_too_long(
+ pattern: PatternID,
+ len: usize,
+ ) -> BuildError {
+ BuildError { kind: ErrorKind::PatternTooLong { pattern, len } }
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for BuildError {}
+
+impl core::fmt::Display for BuildError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ match self.kind {
+ ErrorKind::StateIDOverflow { max, requested_max } => {
+ write!(
+ f,
+ "state identifier overflow: failed to create state ID \
+ from {}, which exceeds the max of {}",
+ requested_max, max,
+ )
+ }
+ ErrorKind::PatternIDOverflow { max, requested_max } => {
+ write!(
+ f,
+ "pattern identifier overflow: failed to create pattern ID \
+ from {}, which exceeds the max of {}",
+ requested_max, max,
+ )
+ }
+ ErrorKind::PatternTooLong { pattern, len } => {
+ write!(
+ f,
+ "pattern {} with length {} exceeds \
+ the maximum pattern length of {}",
+ pattern.as_usize(),
+ len,
+ SmallIndex::MAX.as_usize(),
+ )
+ }
+ }
+ }
+}
+
+/// An error that occurred during an Aho-Corasick search.
+///
+/// An error that occurs during a search is limited to some kind of
+/// misconfiguration that resulted in an illegal call. Stated differently,
+/// whether an error occurs is not dependent on the specific bytes in the
+/// haystack.
+///
+/// Examples of misconfiguration:
+///
+/// * Executing a stream or overlapping search on a searcher that was built with
+/// something other than [`MatchKind::Standard`](crate::MatchKind::Standard)
+/// semantics.
+/// * Requested an anchored or an unanchored search on a searcher that doesn't
+/// support unanchored or anchored searches, respectively.
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct MatchError(alloc::boxed::Box<MatchErrorKind>);
+
+impl MatchError {
+ /// Create a new error value with the given kind.
+ ///
+ /// This is a more verbose version of the kind-specific constructors, e.g.,
+ /// `MatchError::unsupported_stream`.
+ pub fn new(kind: MatchErrorKind) -> MatchError {
+ MatchError(alloc::boxed::Box::new(kind))
+ }
+
+ /// Returns a reference to the underlying error kind.
+ pub fn kind(&self) -> &MatchErrorKind {
+ &self.0
+ }
+
+ /// Create a new "invalid anchored search" error. This occurs when the
+ /// caller requests an anchored search but where anchored searches aren't
+ /// supported.
+ ///
+ /// This is the same as calling `MatchError::new` with a
+ /// [`MatchErrorKind::InvalidInputAnchored`] kind.
+ pub fn invalid_input_anchored() -> MatchError {
+ MatchError::new(MatchErrorKind::InvalidInputAnchored)
+ }
+
+ /// Create a new "invalid unanchored search" error. This occurs when the
+ /// caller requests an unanchored search but where unanchored searches
+ /// aren't supported.
+ ///
+ /// This is the same as calling `MatchError::new` with a
+ /// [`MatchErrorKind::InvalidInputUnanchored`] kind.
+ pub fn invalid_input_unanchored() -> MatchError {
+ MatchError::new(MatchErrorKind::InvalidInputUnanchored)
+ }
+
+ /// Create a new "unsupported stream search" error. This occurs when the
+ /// caller requests a stream search while using an Aho-Corasick automaton
+ /// with a match kind other than [`MatchKind::Standard`].
+ ///
+ /// The match kind given should be the match kind of the automaton. It
+ /// should never be `MatchKind::Standard`.
+ pub fn unsupported_stream(got: MatchKind) -> MatchError {
+ MatchError::new(MatchErrorKind::UnsupportedStream { got })
+ }
+
+ /// Create a new "unsupported overlapping search" error. This occurs when
+ /// the caller requests an overlapping search while using an Aho-Corasick
+ /// automaton with a match kind other than [`MatchKind::Standard`].
+ ///
+ /// The match kind given should be the match kind of the automaton. It
+ /// should never be `MatchKind::Standard`.
+ pub fn unsupported_overlapping(got: MatchKind) -> MatchError {
+ MatchError::new(MatchErrorKind::UnsupportedOverlapping { got })
+ }
+
+ /// Create a new "unsupported empty pattern" error. This occurs when the
+ /// caller requests a search for which matching an automaton that contains
+ /// an empty pattern string is not supported.
+ pub fn unsupported_empty() -> MatchError {
+ MatchError::new(MatchErrorKind::UnsupportedEmpty)
+ }
+}
+
+/// The underlying kind of a [`MatchError`].
+///
+/// This is a **non-exhaustive** enum. That means new variants may be added in
+/// a semver-compatible release.
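+///
+/// Consequently, exhaustive matching on this enum requires a wildcard arm,
+/// as in this sketch:
+///
+/// ```ignore
+/// use aho_corasick::MatchErrorKind;
+///
+/// fn describe(kind: &MatchErrorKind) -> &'static str {
+///     match *kind {
+///         MatchErrorKind::UnsupportedEmpty => "empty patterns unsupported",
+///         // New variants may be added, so a `_` arm is always required.
+///         _ => "some other search misconfiguration",
+///     }
+/// }
+/// ```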
+#[non_exhaustive]
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum MatchErrorKind {
+ /// An error indicating that an anchored search was requested, but from a
+ /// searcher that was built without anchored support.
+ InvalidInputAnchored,
+ /// An error indicating that an unanchored search was requested, but from a
+ /// searcher that was built without unanchored support.
+ InvalidInputUnanchored,
+ /// An error indicating that a stream search was attempted on an
+ /// Aho-Corasick automaton with an unsupported `MatchKind`.
+ UnsupportedStream {
+ /// The match semantics for the automaton that was used.
+ got: MatchKind,
+ },
+ /// An error indicating that an overlapping search was attempted on an
+ /// Aho-Corasick automaton with an unsupported `MatchKind`.
+ UnsupportedOverlapping {
+ /// The match semantics for the automaton that was used.
+ got: MatchKind,
+ },
+ /// An error indicating that the operation requested doesn't support
+ /// automatons that contain an empty pattern string.
+ UnsupportedEmpty,
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for MatchError {}
+
+impl core::fmt::Display for MatchError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ match *self.kind() {
+ MatchErrorKind::InvalidInputAnchored => {
+ write!(f, "anchored searches are not supported or enabled")
+ }
+ MatchErrorKind::InvalidInputUnanchored => {
+ write!(f, "unanchored searches are not supported or enabled")
+ }
+ MatchErrorKind::UnsupportedStream { got } => {
+ write!(
+ f,
+ "match kind {:?} does not support stream searching",
+ got,
+ )
+ }
+ MatchErrorKind::UnsupportedOverlapping { got } => {
+ write!(
+ f,
+ "match kind {:?} does not support overlapping searches",
+ got,
+ )
+ }
+ MatchErrorKind::UnsupportedEmpty => {
+ write!(
+ f,
+ "matching with an empty pattern string is not \
+ supported for this operation",
+ )
+ }
+ }
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/util/int.rs b/third_party/rust/aho-corasick/src/util/int.rs
new file mode 100644
index 0000000000..28ede7a47f
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/int.rs
@@ -0,0 +1,284 @@
+/*!
+This module provides several integer oriented traits for converting between
+both fixed size integers and integers whose size varies based on the target
+(like `usize`).
+
+The main design principle for this module is to centralize all uses of `as`.
+The thinking here is that `as` makes it very easy to perform accidental lossy
+conversions, and if we centralize all its uses here under more descriptive
+higher level operations, its use and correctness becomes easier to audit.
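+
+For example, a sketch of the intended difference in failure modes:
+
+```ignore
+let n: u32 = 5;
+// `n as usize` is silently lossy on targets where usize is smaller than
+// u32, while the trait method panics in debug builds on a lossy conversion:
+let i = n.as_usize();
+```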
+
+This was copied mostly wholesale from `regex-automata`.
+
+NOTE: for simplicity, we don't take target pointer width into account here for
+`usize` conversions. Since we currently only panic in debug mode, skipping the
+check when it can be proven it isn't needed at compile time doesn't really
+matter. Now, if we wind up wanting to do as many checks as possible in release
+mode, then we would want to skip those when we know the conversions are always
+non-lossy.
+*/
+
+pub(crate) trait U8 {
+ fn as_usize(self) -> usize;
+}
+
+impl U8 for u8 {
+ fn as_usize(self) -> usize {
+ usize::from(self)
+ }
+}
+
+pub(crate) trait U16 {
+ fn as_usize(self) -> usize;
+ fn low_u8(self) -> u8;
+ fn high_u8(self) -> u8;
+}
+
+impl U16 for u16 {
+ fn as_usize(self) -> usize {
+ usize::from(self)
+ }
+
+ fn low_u8(self) -> u8 {
+ self as u8
+ }
+
+ fn high_u8(self) -> u8 {
+ (self >> 8) as u8
+ }
+}
+
+pub(crate) trait U32 {
+ fn as_usize(self) -> usize;
+ fn low_u8(self) -> u8;
+ fn low_u16(self) -> u16;
+ fn high_u16(self) -> u16;
+}
+
+impl U32 for u32 {
+ #[inline]
+ fn as_usize(self) -> usize {
+ #[cfg(debug_assertions)]
+ {
+ usize::try_from(self).expect("u32 overflowed usize")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as usize
+ }
+ }
+
+ fn low_u8(self) -> u8 {
+ self as u8
+ }
+
+ fn low_u16(self) -> u16 {
+ self as u16
+ }
+
+ fn high_u16(self) -> u16 {
+ (self >> 16) as u16
+ }
+}
+
+pub(crate) trait U64 {
+ fn as_usize(self) -> usize;
+ fn low_u8(self) -> u8;
+ fn low_u16(self) -> u16;
+ fn low_u32(self) -> u32;
+ fn high_u32(self) -> u32;
+}
+
+impl U64 for u64 {
+ fn as_usize(self) -> usize {
+ #[cfg(debug_assertions)]
+ {
+ usize::try_from(self).expect("u64 overflowed usize")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as usize
+ }
+ }
+
+ fn low_u8(self) -> u8 {
+ self as u8
+ }
+
+ fn low_u16(self) -> u16 {
+ self as u16
+ }
+
+ fn low_u32(self) -> u32 {
+ self as u32
+ }
+
+ fn high_u32(self) -> u32 {
+ (self >> 32) as u32
+ }
+}
+
+pub(crate) trait I8 {
+ fn as_usize(self) -> usize;
+ fn to_bits(self) -> u8;
+ fn from_bits(n: u8) -> i8;
+}
+
+impl I8 for i8 {
+ fn as_usize(self) -> usize {
+ #[cfg(debug_assertions)]
+ {
+ usize::try_from(self).expect("i8 overflowed usize")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as usize
+ }
+ }
+
+ fn to_bits(self) -> u8 {
+ self as u8
+ }
+
+ fn from_bits(n: u8) -> i8 {
+ n as i8
+ }
+}
+
+pub(crate) trait I32 {
+ fn as_usize(self) -> usize;
+ fn to_bits(self) -> u32;
+ fn from_bits(n: u32) -> i32;
+}
+
+impl I32 for i32 {
+ fn as_usize(self) -> usize {
+ #[cfg(debug_assertions)]
+ {
+ usize::try_from(self).expect("i32 overflowed usize")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as usize
+ }
+ }
+
+ fn to_bits(self) -> u32 {
+ self as u32
+ }
+
+ fn from_bits(n: u32) -> i32 {
+ n as i32
+ }
+}
+
+pub(crate) trait I64 {
+ fn as_usize(self) -> usize;
+ fn to_bits(self) -> u64;
+ fn from_bits(n: u64) -> i64;
+}
+
+impl I64 for i64 {
+ fn as_usize(self) -> usize {
+ #[cfg(debug_assertions)]
+ {
+ usize::try_from(self).expect("i64 overflowed usize")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as usize
+ }
+ }
+
+ fn to_bits(self) -> u64 {
+ self as u64
+ }
+
+ fn from_bits(n: u64) -> i64 {
+ n as i64
+ }
+}
+
+pub(crate) trait Usize {
+ fn as_u8(self) -> u8;
+ fn as_u16(self) -> u16;
+ fn as_u32(self) -> u32;
+ fn as_u64(self) -> u64;
+}
+
+impl Usize for usize {
+ fn as_u8(self) -> u8 {
+ #[cfg(debug_assertions)]
+ {
+ u8::try_from(self).expect("usize overflowed u8")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as u8
+ }
+ }
+
+ fn as_u16(self) -> u16 {
+ #[cfg(debug_assertions)]
+ {
+ u16::try_from(self).expect("usize overflowed u16")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as u16
+ }
+ }
+
+ fn as_u32(self) -> u32 {
+ #[cfg(debug_assertions)]
+ {
+ u32::try_from(self).expect("usize overflowed u32")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as u32
+ }
+ }
+
+ fn as_u64(self) -> u64 {
+ #[cfg(debug_assertions)]
+ {
+ u64::try_from(self).expect("usize overflowed u64")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as u64
+ }
+ }
+}
+
+// Pointers aren't integers, but we convert pointers to integers to perform
+// offset arithmetic in some places. (And no, we don't convert the integers
+// back to pointers.) So add 'as_usize' conversions here too for completeness.
+//
+// These 'as' casts are actually okay because they're always non-lossy. But the
+// idea here is to just try and remove as much 'as' as possible, particularly
+// in this crate where we are being really paranoid about offsets and making
+// sure we don't panic on inputs that might be untrusted. This way, the 'as'
+// casts become easier to audit if they're all in one place, even when some of
+// them are actually okay 100% of the time.
+
+pub(crate) trait Pointer {
+ fn as_usize(self) -> usize;
+}
+
+impl<T> Pointer for *const T {
+ fn as_usize(self) -> usize {
+ self as usize
+ }
+}
+
+pub(crate) trait PointerMut {
+ fn as_usize(self) -> usize;
+}
+
+impl<T> PointerMut for *mut T {
+ fn as_usize(self) -> usize {
+ self as usize
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/util/mod.rs b/third_party/rust/aho-corasick/src/util/mod.rs
new file mode 100644
index 0000000000..f7a1ddd07b
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/mod.rs
@@ -0,0 +1,12 @@
+pub(crate) mod alphabet;
+#[cfg(feature = "std")]
+pub(crate) mod buffer;
+pub(crate) mod byte_frequencies;
+pub(crate) mod debug;
+pub(crate) mod error;
+pub(crate) mod int;
+pub(crate) mod prefilter;
+pub(crate) mod primitives;
+pub(crate) mod remapper;
+pub(crate) mod search;
+pub(crate) mod special;
diff --git a/third_party/rust/aho-corasick/src/util/prefilter.rs b/third_party/rust/aho-corasick/src/util/prefilter.rs
new file mode 100644
index 0000000000..f5ddc75b7c
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/prefilter.rs
@@ -0,0 +1,924 @@
+use core::{
+ cmp,
+ fmt::Debug,
+ panic::{RefUnwindSafe, UnwindSafe},
+ u8,
+};
+
+use alloc::{sync::Arc, vec, vec::Vec};
+
+use crate::{
+ packed,
+ util::{
+ alphabet::ByteSet,
+ search::{Match, MatchKind, Span},
+ },
+};
+
+/// A prefilter for accelerating a search.
+///
+/// This crate uses prefilters in the core search implementations to accelerate
+/// common cases. They typically only apply to cases where there are a small
+/// number of patterns (less than 100 or so), but when they do, throughput can
+/// be boosted considerably, perhaps by an order of magnitude. When a prefilter
+/// is active, it is used whenever a search enters an automaton's start state.
+///
+/// Currently, prefilters cannot be constructed by
+/// callers. A `Prefilter` can only be accessed via the
+/// [`Automaton::prefilter`](crate::automaton::Automaton::prefilter)
+/// method and used to execute a search. In other words, a prefilter can be
+/// used to optimize your own search implementation if necessary, but cannot do
+/// much else. If you have a use case for more APIs, please submit an issue.
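+///
+/// A sketch of how a custom search loop might consult a prefilter, assuming
+/// a caller-defined `confirm` step that resumes the automaton:
+///
+/// ```ignore
+/// if let Some(pre) = aut.prefilter() {
+///     match pre.find_in(haystack, span) {
+///         // No match is possible anywhere in `span`, so stop early.
+///         Candidate::None => return None,
+///         // The prefilter confirmed this match on its own.
+///         Candidate::Match(m) => return Some(m),
+///         // Only a candidate position; the automaton must confirm it.
+///         Candidate::PossibleStartOfMatch(at) => {
+///             return confirm(aut, haystack, at);
+///         }
+///     }
+/// }
+/// ```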
+#[derive(Clone, Debug)]
+pub struct Prefilter {
+ finder: Arc<dyn PrefilterI>,
+ memory_usage: usize,
+}
+
+impl Prefilter {
+ /// Execute a search in the haystack within the span given. If a match or
+ /// a possible match is returned, then it is guaranteed to occur within
+ /// the bounds of the span.
+ ///
+ /// If the span provided is invalid for the given haystack, then behavior
+ /// is unspecified.
+ #[inline]
+ pub fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ self.finder.find_in(haystack, span)
+ }
+
+ #[inline]
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.memory_usage
+ }
+}
+
+/// A candidate is the result of running a prefilter on a haystack at a
+/// particular position.
+///
+/// The result is either no match, a confirmed match or a possible match.
+///
+/// When no match is returned, the prefilter is guaranteeing that no possible
+/// match can be found in the haystack, and the caller may trust this. That is,
+/// all correct prefilters must never report false negatives.
+///
+/// In some cases, a prefilter can confirm a match very quickly, in which case,
+/// the caller may use this to stop what it's doing and report the match. In
+/// this case, prefilter implementations must never report a false positive.
+/// In other cases, the prefilter can only report a potential match, in which
+/// case the callers must attempt to confirm the match. In this case, prefilter
+/// implementations are permitted to return false positives.
+#[derive(Clone, Debug)]
+pub enum Candidate {
+ /// No match was found. Since false negatives are not possible, this means
+ /// the search can quit as it is guaranteed not to find another match.
+ None,
+ /// A confirmed match was found. Callers do not need to confirm it.
+ Match(Match),
+ /// The start of a possible match was found. Callers must confirm it before
+ /// reporting it as a match.
+ PossibleStartOfMatch(usize),
+}
+
+impl Candidate {
+ /// Convert this candidate into an option. This is useful when callers
+ /// do not distinguish between true positives and false positives (i.e.,
+ /// the caller must always confirm the match).
+ pub fn into_option(self) -> Option<usize> {
+ match self {
+ Candidate::None => None,
+ Candidate::Match(ref m) => Some(m.start()),
+ Candidate::PossibleStartOfMatch(start) => Some(start),
+ }
+ }
+}
+
+/// A prefilter describes the behavior of fast literal scanners for quickly
+/// skipping past bytes in the haystack that we know cannot possibly
+/// participate in a match.
+trait PrefilterI:
+ Send + Sync + RefUnwindSafe + UnwindSafe + Debug + 'static
+{
+ /// Returns the next possible match candidate. This may yield false
+ /// positives, so callers must confirm a match starting at the position
+ /// returned. This, however, must never produce false negatives. That is,
+ /// this must, at minimum, return the starting position of the next match
+ /// in the given haystack after or at the given position.
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate;
+}
+
+impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> {
+ #[inline(always)]
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ (**self).find_in(haystack, span)
+ }
+}
+
+/// A builder for constructing the best possible prefilter. When constructed,
+/// this builder will heuristically select the best prefilter it can build,
+/// if any, and discard the rest.
+#[derive(Debug)]
+pub(crate) struct Builder {
+ count: usize,
+ ascii_case_insensitive: bool,
+ start_bytes: StartBytesBuilder,
+ rare_bytes: RareBytesBuilder,
+ memmem: MemmemBuilder,
+ packed: Option<packed::Builder>,
+ // If we run across a condition that suggests we shouldn't use a prefilter
+ // at all (like an empty pattern), then disable prefilters entirely.
+ enabled: bool,
+}
+
+impl Builder {
+ /// Create a new builder for constructing the best possible prefilter.
+ pub(crate) fn new(kind: MatchKind) -> Builder {
+ let pbuilder = kind
+ .as_packed()
+ .map(|kind| packed::Config::new().match_kind(kind).builder());
+ Builder {
+ count: 0,
+ ascii_case_insensitive: false,
+ start_bytes: StartBytesBuilder::new(),
+ rare_bytes: RareBytesBuilder::new(),
+ memmem: MemmemBuilder::default(),
+ packed: pbuilder,
+ enabled: true,
+ }
+ }
+
+ /// Enable ASCII case insensitivity. When set, byte strings added to this
+ /// builder will be interpreted without respect to ASCII case.
+ pub(crate) fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
+ self.ascii_case_insensitive = yes;
+ self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
+ self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
+ self
+ }
+
+ /// Return a prefilter suitable for quickly finding potential matches.
+ ///
+ /// All patterns added to an Aho-Corasick automaton should be added to this
+ /// builder before attempting to construct the prefilter.
+ pub(crate) fn build(&self) -> Option<Prefilter> {
+ if !self.enabled {
+ debug!("prefilter not enabled, skipping");
+ return None;
+ }
+ // If we only have one pattern, then deferring to memmem is always
+ // the best choice. This is kind of a weird case, because, well, why
+ // use Aho-Corasick if you only have one pattern? But maybe you don't
+ // know exactly how many patterns you'll get up front, and you need to
+ // support the option of multiple patterns. So instead of relying on
+ // the caller to branch and use memmem explicitly, we just do it for
+ // them.
+ if !self.ascii_case_insensitive {
+ if let Some(pre) = self.memmem.build() {
+ debug!("using memmem prefilter");
+ return Some(pre);
+ }
+ }
+ let (packed, patlen, minlen) = if self.ascii_case_insensitive {
+ (None, usize::MAX, 0)
+ } else {
+ let patlen = self.packed.as_ref().map_or(usize::MAX, |p| p.len());
+ let minlen = self.packed.as_ref().map_or(0, |p| p.minimum_len());
+ let packed =
+ self.packed.as_ref().and_then(|b| b.build()).map(|s| {
+ let memory_usage = s.memory_usage();
+ debug!(
+ "built packed prefilter (len: {}, \
+ minimum pattern len: {}, memory usage: {}) \
+ for consideration",
+ patlen, minlen, memory_usage,
+ );
+ Prefilter { finder: Arc::new(Packed(s)), memory_usage }
+ });
+ (packed, patlen, minlen)
+ };
+ match (self.start_bytes.build(), self.rare_bytes.build()) {
+ // If we could build both start and rare prefilters, then there are
+ // a few cases in which we'd want to use the start-byte prefilter
+ // over the rare-byte prefilter, since the former has lower
+ // overhead.
+ (prestart @ Some(_), prerare @ Some(_)) => {
+ debug!(
+ "both start (len={}, rank={}) and \
+ rare (len={}, rank={}) byte prefilters \
+ are available",
+ self.start_bytes.count,
+ self.start_bytes.rank_sum,
+ self.rare_bytes.count,
+ self.rare_bytes.rank_sum,
+ );
+ if patlen <= 16
+ && minlen >= 2
+ && self.start_bytes.count >= 3
+ && self.rare_bytes.count >= 3
+ {
+ debug!(
+ "start and rare byte prefilters available, but \
+ they're probably slower than packed so using \
+ packed"
+ );
+ return packed;
+ }
+ // If the start-byte prefilter can scan for a smaller number
+ // of bytes than the rare-byte prefilter, then it's probably
+ // faster.
+ let has_fewer_bytes =
+ self.start_bytes.count < self.rare_bytes.count;
+ // Otherwise, if the combined frequency rank of the detected
+ // bytes in the start-byte prefilter is "close" to the combined
+ // frequency rank of the rare-byte prefilter, then we pick
+ // the start-byte prefilter even if the rare-byte prefilter
+ // heuristically searches for rare bytes. This is because the
+ // rare-byte prefilter has higher constant costs, so we tend to
+ // prefer the start-byte prefilter when we can.
+ let has_rarer_bytes =
+ self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50;
+ if has_fewer_bytes {
+ debug!(
+ "using start byte prefilter because it has fewer
+ bytes to search for than the rare byte prefilter",
+ );
+ prestart
+ } else if has_rarer_bytes {
+ debug!(
+ "using start byte prefilter because its byte \
+ frequency rank was determined to be \
+ \"good enough\" relative to the rare byte prefilter \
+ byte frequency rank",
+ );
+ prestart
+ } else {
+ debug!("using rare byte prefilter");
+ prerare
+ }
+ }
+ (prestart @ Some(_), None) => {
+ if patlen <= 16 && minlen >= 2 && self.start_bytes.count >= 3 {
+ debug!(
+ "start byte prefilter available, but \
+ it's probably slower than packed so using \
+ packed"
+ );
+ return packed;
+ }
+ debug!(
+ "have start byte prefilter but not rare byte prefilter, \
+ so using start byte prefilter",
+ );
+ prestart
+ }
+ (None, prerare @ Some(_)) => {
+ if patlen <= 16 && minlen >= 2 && self.rare_bytes.count >= 3 {
+ debug!(
+ "rare byte prefilter available, but \
+ it's probably slower than packed so using \
+ packed"
+ );
+ return packed;
+ }
+ debug!(
+ "have rare byte prefilter but not start byte prefilter, \
+ so using rare byte prefilter",
+ );
+ prerare
+ }
+ (None, None) if self.ascii_case_insensitive => {
+ debug!(
+ "no start or rare byte prefilter and ASCII case \
+ insensitivity was enabled, so skipping prefilter",
+ );
+ None
+ }
+ (None, None) => {
+ if packed.is_some() {
+ debug!("falling back to packed prefilter");
+ } else {
+ debug!("no prefilter available");
+ }
+ packed
+ }
+ }
+ }
+
+ /// Add a literal string to this prefilter builder.
+ pub(crate) fn add(&mut self, bytes: &[u8]) {
+ if bytes.is_empty() {
+ self.enabled = false;
+ }
+ if !self.enabled {
+ return;
+ }
+ self.count += 1;
+ self.start_bytes.add(bytes);
+ self.rare_bytes.add(bytes);
+ self.memmem.add(bytes);
+ if let Some(ref mut pbuilder) = self.packed {
+ pbuilder.add(bytes);
+ }
+ }
+}
+
+/// A type that wraps a packed searcher and implements the `Prefilter`
+/// interface.
+#[derive(Clone, Debug)]
+struct Packed(packed::Searcher);
+
+impl PrefilterI for Packed {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ self.0
+ .find_in(&haystack, span)
+ .map_or(Candidate::None, Candidate::Match)
+ }
+}
+
+/// A builder for constructing a prefilter that uses memmem.
+#[derive(Debug, Default)]
+struct MemmemBuilder {
+ /// The number of patterns that have been added.
+ count: usize,
+ /// The singular pattern to search for. This is only set when count==1.
+ one: Option<Vec<u8>>,
+}
+
+impl MemmemBuilder {
+ fn build(&self) -> Option<Prefilter> {
+ #[cfg(all(feature = "std", feature = "perf-literal"))]
+ fn imp(builder: &MemmemBuilder) -> Option<Prefilter> {
+ let pattern = builder.one.as_ref()?;
+ assert_eq!(1, builder.count);
+ let finder = Arc::new(Memmem(
+ memchr::memmem::Finder::new(pattern).into_owned(),
+ ));
+ let memory_usage = pattern.len();
+ Some(Prefilter { finder, memory_usage })
+ }
+
+ #[cfg(not(all(feature = "std", feature = "perf-literal")))]
+ fn imp(_: &MemmemBuilder) -> Option<Prefilter> {
+ None
+ }
+
+ imp(self)
+ }
+
+ fn add(&mut self, bytes: &[u8]) {
+ self.count += 1;
+ if self.count == 1 {
+ self.one = Some(bytes.to_vec());
+ } else {
+ self.one = None;
+ }
+ }
+}
+
+/// A type that wraps a SIMD accelerated single substring search from the
+/// `memchr` crate for use as a prefilter.
+///
+/// Currently, this prefilter is only active for Aho-Corasick searchers with
+/// a single pattern. In theory, this could be extended to support searchers
+/// that have a common prefix of more than one byte (for one byte, we would use
+/// memchr), but it's not clear if it's worth it or not.
+///
+/// Also, unfortunately, this currently also requires the 'std' feature to
+/// be enabled. That's because memchr doesn't have a no-std-but-with-alloc
+/// mode, and so APIs like Finder::into_owned aren't available when 'std' is
+/// disabled. But there should be an 'alloc' feature that brings in APIs like
+/// Finder::into_owned but doesn't use std-only features like runtime CPU
+/// feature detection.
+#[cfg(all(feature = "std", feature = "perf-literal"))]
+#[derive(Clone, Debug)]
+struct Memmem(memchr::memmem::Finder<'static>);
+
+#[cfg(all(feature = "std", feature = "perf-literal"))]
+impl PrefilterI for Memmem {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ use crate::util::primitives::PatternID;
+
+ self.0.find(&haystack[span]).map_or(Candidate::None, |i| {
+ let start = span.start + i;
+ let end = start + self.0.needle().len();
+ // N.B. We can declare a match and use a fixed pattern ID here
+ // because a Memmem prefilter is only ever created for searchers
+ // with exactly one pattern. Thus, every match is always a match
+ // and it is always for the first and only pattern.
+ Candidate::Match(Match::new(PatternID::ZERO, start..end))
+ })
+ }
+}
+
+/// A builder for constructing a rare byte prefilter.
+///
+/// A rare byte prefilter attempts to pick out a small set of rare bytes that
+/// occur in the patterns, and then quickly scan for occurrences of those rare
+/// bytes.
+#[derive(Clone, Debug)]
+struct RareBytesBuilder {
+ /// Whether this prefilter should account for ASCII case insensitivity or
+ /// not.
+ ascii_case_insensitive: bool,
+ /// A set of rare bytes, indexed by byte value.
+ rare_set: ByteSet,
+ /// A set of byte offsets associated with bytes in a pattern. An entry
+ /// corresponds to a particular byte (its index) and is only non-zero if
+ /// the byte occurred at an offset greater than 0 in at least one pattern.
+ ///
+ /// If a byte's offset is not representable in 8 bits, then the rare bytes
+ /// prefilter becomes inert.
+ byte_offsets: RareByteOffsets,
+ /// Whether this is available as a prefilter or not. This can be set to
+ /// false during construction if a condition is seen that invalidates the
+ /// use of the rare-byte prefilter.
+ available: bool,
+ /// The number of bytes set to an active value in `byte_offsets`.
+ count: usize,
+ /// The sum of frequency ranks for the rare bytes detected. This is
+ /// intended to give a heuristic notion of how rare the bytes are.
+ rank_sum: u16,
+}
+
+/// A set of byte offsets, keyed by byte.
+#[derive(Clone, Copy)]
+struct RareByteOffsets {
+ /// Each entry corresponds to the maximum offset of the corresponding
+ /// byte across all patterns seen.
+ set: [RareByteOffset; 256],
+}
+
+impl RareByteOffsets {
+ /// Create a new empty set of rare byte offsets.
+ pub(crate) fn empty() -> RareByteOffsets {
+ RareByteOffsets { set: [RareByteOffset::default(); 256] }
+ }
+
+ /// Add the given offset for the given byte to this set. If the given
+ /// offset is greater than the existing offset for that byte, then it
+ /// overwrites the previous value; otherwise the existing (larger) offset
+ /// is kept.
+ pub(crate) fn set(&mut self, byte: u8, off: RareByteOffset) {
+ self.set[byte as usize].max =
+ cmp::max(self.set[byte as usize].max, off.max);
+ }
+}
+
+impl core::fmt::Debug for RareByteOffsets {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ let mut offsets = vec![];
+ for off in self.set.iter() {
+ if off.max > 0 {
+ offsets.push(off);
+ }
+ }
+ f.debug_struct("RareByteOffsets").field("set", &offsets).finish()
+ }
+}
+
+/// Offsets associated with an occurrence of a "rare" byte in any of the
+/// patterns used to construct a single Aho-Corasick automaton.
+#[derive(Clone, Copy, Debug)]
+struct RareByteOffset {
+ /// The maximum offset at which a particular byte occurs from the start
+ /// of any pattern. This is used as a shift amount. That is, when an
+ /// occurrence of this byte is found, the candidate position reported by
+ /// the prefilter is `position_of_byte - max`, such that the automaton
+ /// will begin its search at a position that is guaranteed to observe a
+ /// match.
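+ ///
+ /// For example, if `z` never occurs past offset `5` in any pattern, then
+ /// `max` is `5`, and finding `z` at haystack position `i` reports a
+ /// candidate starting at `i - 5`.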
+ ///
+ /// To avoid accidentally quadratic behavior, a prefilter is considered
+ /// ineffective when it is asked to start scanning from a position that it
+ /// has already scanned past.
+ ///
+ /// Using a `u8` here means that if we ever see a pattern that's longer
+ /// than 255 bytes, then the entire rare byte prefilter is disabled.
+ max: u8,
+}
+
+impl Default for RareByteOffset {
+ fn default() -> RareByteOffset {
+ RareByteOffset { max: 0 }
+ }
+}
+
+impl RareByteOffset {
+ /// Create a new rare byte offset. If the given offset is too big, then
+ /// None is returned. In that case, callers should render the rare bytes
+ /// prefilter inert.
+ fn new(max: usize) -> Option<RareByteOffset> {
+ if max > u8::MAX as usize {
+ None
+ } else {
+ Some(RareByteOffset { max: max as u8 })
+ }
+ }
+}
+
+impl RareBytesBuilder {
+ /// Create a new builder for constructing a rare byte prefilter.
+ fn new() -> RareBytesBuilder {
+ RareBytesBuilder {
+ ascii_case_insensitive: false,
+ rare_set: ByteSet::empty(),
+ byte_offsets: RareByteOffsets::empty(),
+ available: true,
+ count: 0,
+ rank_sum: 0,
+ }
+ }
+
+ /// Enable ASCII case insensitivity. When set, byte strings added to this
+ /// builder will be interpreted without respect to ASCII case.
+ fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
+ self.ascii_case_insensitive = yes;
+ self
+ }
+
+ /// Build the rare bytes prefilter.
+ ///
+ /// If there are more than 3 distinct rare bytes found, or if heuristics
+ /// otherwise determine that this prefilter should not be used, then `None`
+ /// is returned.
+ fn build(&self) -> Option<Prefilter> {
+ #[cfg(feature = "perf-literal")]
+ fn imp(builder: &RareBytesBuilder) -> Option<Prefilter> {
+ if !builder.available || builder.count > 3 {
+ return None;
+ }
+ let (mut bytes, mut len) = ([0; 3], 0);
+ for b in 0..=255 {
+ if builder.rare_set.contains(b) {
+ bytes[len] = b as u8;
+ len += 1;
+ }
+ }
+ let finder: Arc<dyn PrefilterI> = match len {
+ 0 => return None,
+ 1 => Arc::new(RareBytesOne {
+ byte1: bytes[0],
+ offset: builder.byte_offsets.set[bytes[0] as usize],
+ }),
+ 2 => Arc::new(RareBytesTwo {
+ offsets: builder.byte_offsets,
+ byte1: bytes[0],
+ byte2: bytes[1],
+ }),
+ 3 => Arc::new(RareBytesThree {
+ offsets: builder.byte_offsets,
+ byte1: bytes[0],
+ byte2: bytes[1],
+ byte3: bytes[2],
+ }),
+ _ => unreachable!(),
+ };
+ Some(Prefilter { finder, memory_usage: 0 })
+ }
+
+ #[cfg(not(feature = "perf-literal"))]
+ fn imp(_: &RareBytesBuilder) -> Option<Prefilter> {
+ None
+ }
+
+ imp(self)
+ }
+
+ /// Add a byte string to this builder.
+ ///
+ /// All patterns added to an Aho-Corasick automaton should be added to this
+ /// builder before attempting to construct the prefilter.
+ fn add(&mut self, bytes: &[u8]) {
+ // If we've already given up, then do nothing.
+ if !self.available {
+ return;
+ }
+ // If we've already blown our budget, then don't waste time looking
+ // for more rare bytes.
+ if self.count > 3 {
+ self.available = false;
+ return;
+ }
+ // If the pattern is too long, then our offset table is bunk, so
+ // give up.
+ if bytes.len() >= 256 {
+ self.available = false;
+ return;
+ }
+ let mut rarest = match bytes.get(0) {
+ None => return,
+ Some(&b) => (b, freq_rank(b)),
+ };
+ // The idea here is to look for the rarest byte in each pattern, and
+ // add that to our set. As a special exception, if we see a byte that
+ // we've already added, then we immediately stop and choose that byte,
+ // even if there's another rare byte in the pattern. This helps us
+ // apply the rare byte optimization in more cases by attempting to pick
+ // bytes that are in common between patterns. So for example, if we
+ // were searching for `Sherlock` and `lockjaw`, then this would pick
+ // `k` for both patterns, resulting in the use of `memchr` instead of
+ // `memchr2` for `k` and `j`.
+ let mut found = false;
+ for (pos, &b) in bytes.iter().enumerate() {
+ self.set_offset(pos, b);
+ if found {
+ continue;
+ }
+ if self.rare_set.contains(b) {
+ found = true;
+ continue;
+ }
+ let rank = freq_rank(b);
+ if rank < rarest.1 {
+ rarest = (b, rank);
+ }
+ }
+ if !found {
+ self.add_rare_byte(rarest.0);
+ }
+ }
+
+ fn set_offset(&mut self, pos: usize, byte: u8) {
+ // This unwrap is OK because pos is never bigger than our max.
+ let offset = RareByteOffset::new(pos).unwrap();
+ self.byte_offsets.set(byte, offset);
+ if self.ascii_case_insensitive {
+ self.byte_offsets.set(opposite_ascii_case(byte), offset);
+ }
+ }
+
+ fn add_rare_byte(&mut self, byte: u8) {
+ self.add_one_rare_byte(byte);
+ if self.ascii_case_insensitive {
+ self.add_one_rare_byte(opposite_ascii_case(byte));
+ }
+ }
+
+ fn add_one_rare_byte(&mut self, byte: u8) {
+ if !self.rare_set.contains(byte) {
+ self.rare_set.add(byte);
+ self.count += 1;
+ self.rank_sum += freq_rank(byte) as u16;
+ }
+ }
+}
+
+/// A prefilter for scanning for a single "rare" byte.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct RareBytesOne {
+ byte1: u8,
+ offset: RareByteOffset,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for RareBytesOne {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr(self.byte1, &haystack[span])
+ .map(|i| {
+ let pos = span.start + i;
+ cmp::max(
+ span.start,
+ pos.saturating_sub(usize::from(self.offset.max)),
+ )
+ })
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
+
+/// A prefilter for scanning for two "rare" bytes.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct RareBytesTwo {
+ offsets: RareByteOffsets,
+ byte1: u8,
+ byte2: u8,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for RareBytesTwo {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr2(self.byte1, self.byte2, &haystack[span])
+ .map(|i| {
+ let pos = span.start + i;
+ let offset = self.offsets.set[usize::from(haystack[pos])].max;
+ cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
+ })
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
+
+/// A prefilter for scanning for three "rare" bytes.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct RareBytesThree {
+ offsets: RareByteOffsets,
+ byte1: u8,
+ byte2: u8,
+ byte3: u8,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for RareBytesThree {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
+ .map(|i| {
+ let pos = span.start + i;
+ let offset = self.offsets.set[usize::from(haystack[pos])].max;
+ cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
+ })
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
+
+/// A builder for constructing a starting byte prefilter.
+///
+/// A starting byte prefilter is a simplistic prefilter that looks for possible
+/// matches by reporting all positions corresponding to a particular byte. This
+/// generally only takes effect when there are at most 3 distinct possible
+/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
+/// distinct starting bytes (`f` and `b`), and this prefilter returns all
+/// occurrences of either `f` or `b`.
+///
+/// In some cases, a heuristic frequency analysis may determine that it would
+/// be better not to use this prefilter even when there are 3 or fewer distinct
+/// starting bytes.
+#[derive(Clone, Debug)]
+struct StartBytesBuilder {
+ /// Whether this prefilter should account for ASCII case insensitivity or
+ /// not.
+ ascii_case_insensitive: bool,
+ /// The set of starting bytes observed.
+ byteset: Vec<bool>,
+ /// The number of bytes set to true in `byteset`.
+ count: usize,
+ /// The sum of frequency ranks for the rare bytes detected. This is
+ /// intended to give a heuristic notion of how rare the bytes are.
+ rank_sum: u16,
+}
+
+impl StartBytesBuilder {
+ /// Create a new builder for constructing a start byte prefilter.
+ fn new() -> StartBytesBuilder {
+ StartBytesBuilder {
+ ascii_case_insensitive: false,
+ byteset: vec![false; 256],
+ count: 0,
+ rank_sum: 0,
+ }
+ }
+
+ /// Enable ASCII case insensitivity. When set, byte strings added to this
+ /// builder will be interpreted without respect to ASCII case.
+ fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
+ self.ascii_case_insensitive = yes;
+ self
+ }
+
+ /// Build the starting bytes prefilter.
+ ///
+ /// If there are more than 3 distinct starting bytes, or if heuristics
+ /// otherwise determine that this prefilter should not be used, then `None`
+ /// is returned.
+ fn build(&self) -> Option<Prefilter> {
+ #[cfg(feature = "perf-literal")]
+ fn imp(builder: &StartBytesBuilder) -> Option<Prefilter> {
+ if builder.count > 3 {
+ return None;
+ }
+ let (mut bytes, mut len) = ([0; 3], 0);
+ for b in 0..256 {
+ if !builder.byteset[b] {
+ continue;
+ }
+ // We don't handle non-ASCII bytes for now. Getting non-ASCII
+ // bytes right is trickier, since we generally don't want to put
+ // a leading UTF-8 code unit into a prefilter that isn't ASCII,
+ // since such code units can occur quite frequently. Instead, it
+ // would be better to use a continuation byte, but this requires
+ // more sophisticated analysis of the automaton and a richer
+ // prefilter API.
+ if b > 0x7F {
+ return None;
+ }
+ bytes[len] = b as u8;
+ len += 1;
+ }
+ let finder: Arc<dyn PrefilterI> = match len {
+ 0 => return None,
+ 1 => Arc::new(StartBytesOne { byte1: bytes[0] }),
+ 2 => Arc::new(StartBytesTwo {
+ byte1: bytes[0],
+ byte2: bytes[1],
+ }),
+ 3 => Arc::new(StartBytesThree {
+ byte1: bytes[0],
+ byte2: bytes[1],
+ byte3: bytes[2],
+ }),
+ _ => unreachable!(),
+ };
+ Some(Prefilter { finder, memory_usage: 0 })
+ }
+
+ #[cfg(not(feature = "perf-literal"))]
+ fn imp(_: &StartBytesBuilder) -> Option<Prefilter> {
+ None
+ }
+
+ imp(self)
+ }
+
+ /// Add a byte string to this builder.
+ ///
+ /// All patterns added to an Aho-Corasick automaton should be added to this
+ /// builder before attempting to construct the prefilter.
+ fn add(&mut self, bytes: &[u8]) {
+ if self.count > 3 {
+ return;
+ }
+ if let Some(&byte) = bytes.get(0) {
+ self.add_one_byte(byte);
+ if self.ascii_case_insensitive {
+ self.add_one_byte(opposite_ascii_case(byte));
+ }
+ }
+ }
+
+ fn add_one_byte(&mut self, byte: u8) {
+ if !self.byteset[byte as usize] {
+ self.byteset[byte as usize] = true;
+ self.count += 1;
+ self.rank_sum += freq_rank(byte) as u16;
+ }
+ }
+}
+
+/// A prefilter for scanning for a single starting byte.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct StartBytesOne {
+ byte1: u8,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for StartBytesOne {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr(self.byte1, &haystack[span])
+ .map(|i| span.start + i)
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
+
+/// A prefilter for scanning for two starting bytes.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct StartBytesTwo {
+ byte1: u8,
+ byte2: u8,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for StartBytesTwo {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr2(self.byte1, self.byte2, &haystack[span])
+ .map(|i| span.start + i)
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
+
+/// A prefilter for scanning for three starting bytes.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct StartBytesThree {
+ byte1: u8,
+ byte2: u8,
+ byte3: u8,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for StartBytesThree {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
+ .map(|i| span.start + i)
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
+
+/// If the given byte is an ASCII letter, then return it in the opposite case.
+/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
+/// `b'A'`. If a non-ASCII letter is given, then the given byte is returned.
+pub(crate) fn opposite_ascii_case(b: u8) -> u8 {
+ if b'A' <= b && b <= b'Z' {
+ b.to_ascii_lowercase()
+ } else if b'a' <= b && b <= b'z' {
+ b.to_ascii_uppercase()
+ } else {
+ b
+ }
+}
+
+/// Return the frequency rank of the given byte. The higher the rank, the more
+/// common the byte (heuristically speaking).
+fn freq_rank(b: u8) -> u8 {
+ use crate::util::byte_frequencies::BYTE_FREQUENCIES;
+ BYTE_FREQUENCIES[b as usize]
+}
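+
+// A minimal sanity-check sketch for `opposite_ascii_case` (a hypothetical
+// test module; the assertions follow directly from the definition above):
+#[cfg(test)]
+mod opposite_case_tests {
+ use super::opposite_ascii_case;
+
+ #[test]
+ fn flips_ascii_letters_and_leaves_the_rest() {
+ assert_eq!(b'a', opposite_ascii_case(b'A'));
+ assert_eq!(b'Z', opposite_ascii_case(b'z'));
+ assert_eq!(b'9', opposite_ascii_case(b'9'));
+ }
+}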
diff --git a/third_party/rust/aho-corasick/src/util/primitives.rs b/third_party/rust/aho-corasick/src/util/primitives.rs
new file mode 100644
index 0000000000..784d397171
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/primitives.rs
@@ -0,0 +1,759 @@
+/*!
+Lower level primitive types that are useful in a variety of circumstances.
+
+# Overview
+
+This list represents the principal types in this module and briefly describes
+when you might want to use them.
+
+* [`PatternID`] - A type that represents the identifier of a regex pattern.
+This is probably the most widely used type in this module (which is why it's
+also re-exported in the crate root).
+* [`StateID`] - A type that represents the identifier of a finite automaton
+state. This is used for both NFAs and DFAs, with the notable exception of
+the hybrid NFA/DFA. (The hybrid NFA/DFA uses a special purpose "lazy" state
+identifier.)
+* [`SmallIndex`] - The internal representation of both a `PatternID` and a
+`StateID`. Its purpose is to serve as a type that can index memory without
+being as big as a `usize` on 64-bit targets. The main idea behind this type
+is that there are many things in regex engines that will, in practice, never
+overflow a 32-bit integer. (For example, the number of patterns in a regex
+or the number of states in an NFA.) Thus, a `SmallIndex` can be used to index
+memory without peppering `as` casts everywhere. Moreover, it forces callers
+to handle errors in the case where, somehow, the value would otherwise overflow
+either a 32-bit integer or a `usize` (e.g., on 16-bit targets).
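+
+As a brief illustration of that fallible construction (`PatternID` is public
+and re-exported at the crate root, so it can be shown as a doctest):
+
+```
+use aho_corasick::PatternID;
+
+// In-range values construct successfully...
+let pid = PatternID::new(5).unwrap();
+assert_eq!(5, pid.as_usize());
+// ...while values past the maximum are rejected with an error instead of
+// being silently truncated.
+assert!(PatternID::new(PatternID::LIMIT).is_err());
+```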
+*/
+
+// The macro we use to define some types below adds methods that we don't
+// use on some of the types. There isn't much, so we just squash the warning.
+#![allow(dead_code)]
+
+use alloc::vec::Vec;
+
+use crate::util::int::{Usize, U16, U32, U64};
+
+/// A type that represents a "small" index.
+///
+/// The main idea of this type is to provide something that can index memory,
+/// but uses less memory than `usize` on 64-bit systems. Specifically, its
+/// representation is always a `u32` and has `repr(transparent)` enabled. (So
+/// it is safe to transmute between a `u32` and a `SmallIndex`.)
+///
+/// A small index is typically useful in cases where there is no practical way
+/// that the index will overflow a 32-bit integer. A good example of this is
+/// an NFA state. If you could somehow build an NFA with `2^30` states, its
+/// memory usage would be exorbitant and its runtime execution would be so
+/// slow as to be completely worthless. Therefore, this crate generally deems
+/// it acceptable to return an error if it would otherwise build an NFA that
+/// requires a slice longer than what a 32-bit integer can index. In exchange,
+/// we can use 32-bit indices instead of 64-bit indices in various places.
+///
+/// This type ensures this by providing a constructor that will return an error
+/// if its argument cannot fit into the type. This makes it much easier to
+/// handle these sorts of boundary cases that are otherwise extremely subtle.
+///
+/// On all targets, this type guarantees that its value will fit in a `u32`,
+/// `i32`, `usize` and an `isize`. This means that on 16-bit targets, for
+/// example, this type's maximum value will never overflow an `isize`,
+/// which means it will never overflow an `i16` even though its internal
+/// representation is still a `u32`.
+///
+/// The purpose for making the type fit into even signed integer types like
+/// `isize` is to guarantee that the difference between any two small indices
+/// is itself also a small index. This is useful in certain contexts, e.g.,
+/// for delta encoding.
+///
+/// # Other types
+///
+/// The following types wrap `SmallIndex` to provide a more focused use case:
+///
+/// * [`PatternID`] is for representing the identifiers of patterns.
+/// * [`StateID`] is for representing the identifiers of states in finite
+/// automata. It is used for both NFAs and DFAs.
+///
+/// # Representation
+///
+/// This type is always represented internally by a `u32` and is marked as
+/// `repr(transparent)`. Thus, this type always has the same representation as
+/// a `u32`. It is thus safe to transmute between a `u32` and a `SmallIndex`.
+///
+/// # Indexing
+///
+/// For convenience, callers may use a `SmallIndex` to index slices.
+///
+/// # Safety
+///
+/// While a `SmallIndex` is meant to guarantee that its value fits into `usize`
+/// without using as much space as a `usize` on all targets, callers must
+/// not rely on this property for safety. Callers may choose to rely on this
+/// property for correctness however. For example, creating a `SmallIndex` with
+/// an invalid value can be done in entirely safe code. This may in turn result
+/// in panics or silent logical errors.
+#[derive(
+ Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
+)]
+#[repr(transparent)]
+pub(crate) struct SmallIndex(u32);
+
+impl SmallIndex {
+ /// The maximum index value.
+ #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+ pub const MAX: SmallIndex =
+ // FIXME: Use as_usize() once const functions in traits are stable.
+ SmallIndex::new_unchecked(core::i32::MAX as usize - 1);
+
+ /// The maximum index value.
+ #[cfg(target_pointer_width = "16")]
+ pub const MAX: SmallIndex =
+ SmallIndex::new_unchecked(core::isize::MAX - 1);
+
+ /// The total number of values that can be represented as a small index.
+ pub const LIMIT: usize = SmallIndex::MAX.as_usize() + 1;
+
+ /// The zero index value.
+ pub const ZERO: SmallIndex = SmallIndex::new_unchecked(0);
+
+ /// The number of bytes that a single small index uses in memory.
+ pub const SIZE: usize = core::mem::size_of::<SmallIndex>();
+
+ /// Create a new small index.
+ ///
+ /// If the given index exceeds [`SmallIndex::MAX`], then this returns
+ /// an error.
+ #[inline]
+ pub fn new(index: usize) -> Result<SmallIndex, SmallIndexError> {
+ SmallIndex::try_from(index)
+ }
+
+ /// Create a new small index without checking whether the given value
+ /// exceeds [`SmallIndex::MAX`].
+ ///
+ /// Using this routine with an invalid index value will result in
+ /// unspecified behavior, but *not* undefined behavior. In particular, an
+ /// invalid index value is likely to cause panics or possibly even silent
+ /// logical errors.
+ ///
+ /// Callers must never rely on a `SmallIndex` to be within a certain range
+ /// for memory safety.
+ #[inline]
+ pub const fn new_unchecked(index: usize) -> SmallIndex {
+ // FIXME: Use as_u32() once const functions in traits are stable.
+ SmallIndex::from_u32_unchecked(index as u32)
+ }
+
+ /// Create a new small index from a `u32` without checking whether the
+ /// given value exceeds [`SmallIndex::MAX`].
+ ///
+ /// Using this routine with an invalid index value will result in
+ /// unspecified behavior, but *not* undefined behavior. In particular, an
+ /// invalid index value is likely to cause panics or possibly even silent
+ /// logical errors.
+ ///
+ /// Callers must never rely on a `SmallIndex` to be within a certain range
+ /// for memory safety.
+ #[inline]
+ pub const fn from_u32_unchecked(index: u32) -> SmallIndex {
+ SmallIndex(index)
+ }
+
+ /// Like [`SmallIndex::new`], but panics if the given index is not valid.
+ #[inline]
+ pub fn must(index: usize) -> SmallIndex {
+ SmallIndex::new(index).expect("invalid small index")
+ }
+
+ /// Return this small index as a `usize`. This is guaranteed to never
+ /// overflow `usize`.
+ #[inline]
+ pub const fn as_usize(&self) -> usize {
+ // FIXME: Use as_usize() once const functions in traits are stable.
+ self.0 as usize
+ }
+
+ /// Return this small index as a `u64`. This is guaranteed to never
+ /// overflow.
+ #[inline]
+ pub const fn as_u64(&self) -> u64 {
+ // FIXME: Use u64::from() once const functions in traits are stable.
+ self.0 as u64
+ }
+
+ /// Return the internal `u32` of this small index. This is guaranteed to
+ /// never overflow `u32`.
+ #[inline]
+ pub const fn as_u32(&self) -> u32 {
+ self.0
+ }
+
+ /// Return the internal `u32` of this small index represented as an `i32`.
+ /// This is guaranteed to never overflow an `i32`.
+ #[inline]
+ pub const fn as_i32(&self) -> i32 {
+ // This is OK because we guarantee that our max value is <= i32::MAX.
+ self.0 as i32
+ }
+
+ /// Returns one more than this small index as a usize.
+ ///
+ /// Since a small index has constraints on its maximum value, adding `1` to
+ /// it will always fit in a `usize`, an `isize`, a `u32` and an `i32`.
+ #[inline]
+ pub fn one_more(&self) -> usize {
+ self.as_usize() + 1
+ }
+
+ /// Decode this small index from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+ /// If the decoded integer is not representable as a small index for the
+ /// current target, then this returns an error.
+ #[inline]
+ pub fn from_ne_bytes(
+ bytes: [u8; 4],
+ ) -> Result<SmallIndex, SmallIndexError> {
+ let id = u32::from_ne_bytes(bytes);
+ if id > SmallIndex::MAX.as_u32() {
+ return Err(SmallIndexError { attempted: u64::from(id) });
+ }
+ Ok(SmallIndex::new_unchecked(id.as_usize()))
+ }
+
+ /// Decode this small index from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+ /// This is analogous to [`SmallIndex::new_unchecked`] in that it does not
+ /// check whether the decoded integer is representable as a small index.
+ #[inline]
+ pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> SmallIndex {
+ SmallIndex::new_unchecked(u32::from_ne_bytes(bytes).as_usize())
+ }
+
+ /// Return the underlying small index integer as raw bytes in native endian
+ /// format.
+ #[inline]
+ pub fn to_ne_bytes(&self) -> [u8; 4] {
+ self.0.to_ne_bytes()
+ }
+}
+
+impl<T> core::ops::Index<SmallIndex> for [T] {
+ type Output = T;
+
+ #[inline]
+ fn index(&self, index: SmallIndex) -> &T {
+ &self[index.as_usize()]
+ }
+}
+
+impl<T> core::ops::IndexMut<SmallIndex> for [T] {
+ #[inline]
+ fn index_mut(&mut self, index: SmallIndex) -> &mut T {
+ &mut self[index.as_usize()]
+ }
+}
+
+impl<T> core::ops::Index<SmallIndex> for Vec<T> {
+ type Output = T;
+
+ #[inline]
+ fn index(&self, index: SmallIndex) -> &T {
+ &self[index.as_usize()]
+ }
+}
+
+impl<T> core::ops::IndexMut<SmallIndex> for Vec<T> {
+ #[inline]
+ fn index_mut(&mut self, index: SmallIndex) -> &mut T {
+ &mut self[index.as_usize()]
+ }
+}
+
+impl From<StateID> for SmallIndex {
+ fn from(sid: StateID) -> SmallIndex {
+ sid.0
+ }
+}
+
+impl From<PatternID> for SmallIndex {
+ fn from(pid: PatternID) -> SmallIndex {
+ pid.0
+ }
+}
+
+impl From<u8> for SmallIndex {
+ fn from(index: u8) -> SmallIndex {
+ SmallIndex::new_unchecked(usize::from(index))
+ }
+}
+
+impl TryFrom<u16> for SmallIndex {
+ type Error = SmallIndexError;
+
+ fn try_from(index: u16) -> Result<SmallIndex, SmallIndexError> {
+ if u32::from(index) > SmallIndex::MAX.as_u32() {
+ return Err(SmallIndexError { attempted: u64::from(index) });
+ }
+ Ok(SmallIndex::new_unchecked(index.as_usize()))
+ }
+}
+
+impl TryFrom<u32> for SmallIndex {
+ type Error = SmallIndexError;
+
+ fn try_from(index: u32) -> Result<SmallIndex, SmallIndexError> {
+ if index > SmallIndex::MAX.as_u32() {
+ return Err(SmallIndexError { attempted: u64::from(index) });
+ }
+ Ok(SmallIndex::new_unchecked(index.as_usize()))
+ }
+}
+
+impl TryFrom<u64> for SmallIndex {
+ type Error = SmallIndexError;
+
+ fn try_from(index: u64) -> Result<SmallIndex, SmallIndexError> {
+ if index > SmallIndex::MAX.as_u64() {
+ return Err(SmallIndexError { attempted: index });
+ }
+ Ok(SmallIndex::new_unchecked(index.as_usize()))
+ }
+}
+
+impl TryFrom<usize> for SmallIndex {
+ type Error = SmallIndexError;
+
+ fn try_from(index: usize) -> Result<SmallIndex, SmallIndexError> {
+ if index > SmallIndex::MAX.as_usize() {
+ return Err(SmallIndexError { attempted: index.as_u64() });
+ }
+ Ok(SmallIndex::new_unchecked(index))
+ }
+}
+
+/// This error occurs when a small index could not be constructed.
+///
+/// This occurs when given an integer exceeding the maximum small index value.
+///
+/// When the `std` feature is enabled, this implements the `Error` trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct SmallIndexError {
+ attempted: u64,
+}
+
+impl SmallIndexError {
+ /// Returns the value that could not be converted to a small index.
+ pub fn attempted(&self) -> u64 {
+ self.attempted
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for SmallIndexError {}
+
+impl core::fmt::Display for SmallIndexError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "failed to create small index from {:?}, which exceeds {:?}",
+ self.attempted(),
+ SmallIndex::MAX,
+ )
+ }
+}
+
+#[derive(Clone, Debug)]
+pub(crate) struct SmallIndexIter {
+ rng: core::ops::Range<usize>,
+}
+
+impl Iterator for SmallIndexIter {
+ type Item = SmallIndex;
+
+ fn next(&mut self) -> Option<SmallIndex> {
+ if self.rng.start >= self.rng.end {
+ return None;
+ }
+ let next_id = self.rng.start + 1;
+ let id = core::mem::replace(&mut self.rng.start, next_id);
+ // new_unchecked is OK since we asserted that the number of
+ // elements in this iterator will fit in an ID at construction.
+ Some(SmallIndex::new_unchecked(id))
+ }
+}
+
+macro_rules! index_type_impls {
+ ($name:ident, $err:ident, $iter:ident, $withiter:ident) => {
+ impl $name {
+ /// The maximum value.
+ pub const MAX: $name = $name(SmallIndex::MAX);
+
+ /// The total number of values that can be represented.
+ pub const LIMIT: usize = SmallIndex::LIMIT;
+
+ /// The zero value.
+ pub const ZERO: $name = $name(SmallIndex::ZERO);
+
+ /// The number of bytes that a single value uses in memory.
+ pub const SIZE: usize = SmallIndex::SIZE;
+
+ /// Create a new value that is represented by a "small index."
+ ///
+ /// If the given index exceeds the maximum allowed value, then this
+ /// returns an error.
+ #[inline]
+ pub fn new(value: usize) -> Result<$name, $err> {
+ SmallIndex::new(value).map($name).map_err($err)
+ }
+
+ /// Create a new value without checking whether the given argument
+ /// exceeds the maximum.
+ ///
+ /// Using this routine with an invalid value will result in
+ /// unspecified behavior, but *not* undefined behavior. In
+ /// particular, an invalid ID value is likely to cause panics or
+ /// possibly even silent logical errors.
+ ///
+ /// Callers must never rely on this type to be within a certain
+ /// range for memory safety.
+ #[inline]
+ pub const fn new_unchecked(value: usize) -> $name {
+ $name(SmallIndex::new_unchecked(value))
+ }
+
+ /// Create a new value from a `u32` without checking whether the
+ /// given value exceeds the maximum.
+ ///
+ /// Using this routine with an invalid value will result in
+ /// unspecified behavior, but *not* undefined behavior. In
+ /// particular, an invalid ID value is likely to cause panics or
+ /// possibly even silent logical errors.
+ ///
+ /// Callers must never rely on this type to be within a certain
+ /// range for memory safety.
+ #[inline]
+ pub const fn from_u32_unchecked(index: u32) -> $name {
+ $name(SmallIndex::from_u32_unchecked(index))
+ }
+
+ /// Like `new`, but panics if the given value is not valid.
+ #[inline]
+ pub fn must(value: usize) -> $name {
+ $name::new(value).expect(concat!(
+ "invalid ",
+ stringify!($name),
+ " value"
+ ))
+ }
+
+ /// Return the internal value as a `usize`. This is guaranteed to
+ /// never overflow `usize`.
+ #[inline]
+ pub const fn as_usize(&self) -> usize {
+ self.0.as_usize()
+ }
+
+ /// Return the internal value as a `u64`. This is guaranteed to
+ /// never overflow.
+ #[inline]
+ pub const fn as_u64(&self) -> u64 {
+ self.0.as_u64()
+ }
+
+ /// Return the internal value as a `u32`. This is guaranteed to
+ /// never overflow `u32`.
+ #[inline]
+ pub const fn as_u32(&self) -> u32 {
+ self.0.as_u32()
+ }
+
+ /// Return the internal value as a `i32`. This is guaranteed to
+ /// never overflow an `i32`.
+ #[inline]
+ pub const fn as_i32(&self) -> i32 {
+ self.0.as_i32()
+ }
+
+ /// Returns one more than this value as a usize.
+ ///
+ /// Since values represented by a "small index" have constraints
+ /// on their maximum value, adding `1` to it will always fit in a
+ /// `usize`, a `u32` and an `i32`.
+ #[inline]
+ pub fn one_more(&self) -> usize {
+ self.0.one_more()
+ }
+
+ /// Decode this value from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+ /// If the decoded integer is not representable as a small index
+ /// for the current target, then this returns an error.
+ #[inline]
+ pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<$name, $err> {
+ SmallIndex::from_ne_bytes(bytes).map($name).map_err($err)
+ }
+
+ /// Decode this value from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+ /// This is analogous to `new_unchecked` in that it does not check
+ /// whether the decoded integer is representable as a small index.
+ #[inline]
+ pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> $name {
+ $name(SmallIndex::from_ne_bytes_unchecked(bytes))
+ }
+
+ /// Return the underlying integer as raw bytes in native endian
+ /// format.
+ #[inline]
+ pub fn to_ne_bytes(&self) -> [u8; 4] {
+ self.0.to_ne_bytes()
+ }
+
+ /// Returns an iterator over all values from 0 up to and not
+ /// including the given length.
+ ///
+ /// If the given length exceeds this type's limit, then this
+ /// panics.
+ pub(crate) fn iter(len: usize) -> $iter {
+ $iter::new(len)
+ }
+ }
+
+ // We write our own Debug impl so that we get things like PatternID(5)
+ // instead of PatternID(SmallIndex(5)).
+ impl core::fmt::Debug for $name {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_tuple(stringify!($name)).field(&self.as_u32()).finish()
+ }
+ }
+
+ impl<T> core::ops::Index<$name> for [T] {
+ type Output = T;
+
+ #[inline]
+ fn index(&self, index: $name) -> &T {
+ &self[index.as_usize()]
+ }
+ }
+
+ impl<T> core::ops::IndexMut<$name> for [T] {
+ #[inline]
+ fn index_mut(&mut self, index: $name) -> &mut T {
+ &mut self[index.as_usize()]
+ }
+ }
+
+ impl<T> core::ops::Index<$name> for Vec<T> {
+ type Output = T;
+
+ #[inline]
+ fn index(&self, index: $name) -> &T {
+ &self[index.as_usize()]
+ }
+ }
+
+ impl<T> core::ops::IndexMut<$name> for Vec<T> {
+ #[inline]
+ fn index_mut(&mut self, index: $name) -> &mut T {
+ &mut self[index.as_usize()]
+ }
+ }
+
+ impl From<SmallIndex> for $name {
+ fn from(index: SmallIndex) -> $name {
+ $name(index)
+ }
+ }
+
+ impl From<u8> for $name {
+ fn from(value: u8) -> $name {
+ $name(SmallIndex::from(value))
+ }
+ }
+
+ impl TryFrom<u16> for $name {
+ type Error = $err;
+
+ fn try_from(value: u16) -> Result<$name, $err> {
+ SmallIndex::try_from(value).map($name).map_err($err)
+ }
+ }
+
+ impl TryFrom<u32> for $name {
+ type Error = $err;
+
+ fn try_from(value: u32) -> Result<$name, $err> {
+ SmallIndex::try_from(value).map($name).map_err($err)
+ }
+ }
+
+ impl TryFrom<u64> for $name {
+ type Error = $err;
+
+ fn try_from(value: u64) -> Result<$name, $err> {
+ SmallIndex::try_from(value).map($name).map_err($err)
+ }
+ }
+
+ impl TryFrom<usize> for $name {
+ type Error = $err;
+
+ fn try_from(value: usize) -> Result<$name, $err> {
+ SmallIndex::try_from(value).map($name).map_err($err)
+ }
+ }
+
+ /// This error occurs when an ID could not be constructed.
+ ///
+ /// This occurs when given an integer exceeding the maximum allowed
+ /// value.
+ ///
+ /// When the `std` feature is enabled, this implements the `Error`
+ /// trait.
+ #[derive(Clone, Debug, Eq, PartialEq)]
+ pub struct $err(SmallIndexError);
+
+ impl $err {
+ /// Returns the value that could not be converted to an ID.
+ pub fn attempted(&self) -> u64 {
+ self.0.attempted()
+ }
+ }
+
+ #[cfg(feature = "std")]
+ impl std::error::Error for $err {}
+
+ impl core::fmt::Display for $err {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "failed to create {} from {:?}, which exceeds {:?}",
+ stringify!($name),
+ self.attempted(),
+ $name::MAX,
+ )
+ }
+ }
+
+ #[derive(Clone, Debug)]
+ pub(crate) struct $iter(SmallIndexIter);
+
+ impl $iter {
+ fn new(len: usize) -> $iter {
+ assert!(
+ len <= $name::LIMIT,
+ "cannot create iterator for {} when number of \
+ elements exceeds {:?}",
+ stringify!($name),
+ $name::LIMIT,
+ );
+ $iter(SmallIndexIter { rng: 0..len })
+ }
+ }
+
+ impl Iterator for $iter {
+ type Item = $name;
+
+ fn next(&mut self) -> Option<$name> {
+ self.0.next().map($name)
+ }
+ }
+
+ /// An iterator adapter that is like std::iter::Enumerate, but attaches
+ /// small index values instead. It requires `ExactSizeIterator`. At
+ /// construction, it ensures that the index of each element in the
+ /// iterator is representable in the corresponding small index type.
+ #[derive(Clone, Debug)]
+ pub(crate) struct $withiter<I> {
+ it: I,
+ ids: $iter,
+ }
+
+ impl<I: Iterator + ExactSizeIterator> $withiter<I> {
+ fn new(it: I) -> $withiter<I> {
+ let ids = $name::iter(it.len());
+ $withiter { it, ids }
+ }
+ }
+
+ impl<I: Iterator + ExactSizeIterator> Iterator for $withiter<I> {
+ type Item = ($name, I::Item);
+
+ fn next(&mut self) -> Option<($name, I::Item)> {
+ let item = self.it.next()?;
+ // Number of elements in this iterator must match, according
+ // to contract of ExactSizeIterator.
+ let id = self.ids.next().unwrap();
+ Some((id, item))
+ }
+ }
+ };
+}
+
+/// The identifier of a pattern in an Aho-Corasick automaton.
+///
+/// It is represented by a `u32` even on 64-bit systems in order to conserve
+/// space. Namely, on all targets, this type guarantees that its value will
+/// fit in a `u32`, `i32`, `usize` and an `isize`. This means that on 16-bit
+/// targets, for example, this type's maximum value will never overflow an
+/// `isize`, which means it will never overflow an `i16` even though its
+/// internal representation is still a `u32`.
+///
+/// # Safety
+///
+/// While a `PatternID` is meant to guarantee that its value fits into `usize`
+/// without using as much space as a `usize` on all targets, callers must
+/// not rely on this property for safety. Callers may choose to rely on this
+/// property for correctness however. For example, creating a `PatternID` with
+/// an invalid value can be done in entirely safe code. This may in turn
+/// result in panics or silent logical errors.
+#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
+#[repr(transparent)]
+pub struct PatternID(SmallIndex);
+
+/// The identifier of a finite automaton state.
+///
+/// It is represented by a `u32` even on 64-bit systems in order to conserve
+/// space. Namely, on all targets, this type guarantees that its value will
+/// fit in a `u32`, `i32`, `usize` and an `isize`. This means that on 16-bit
+/// targets, for example, this type's maximum value will never overflow an
+/// `isize`, which means it will never overflow an `i16` even though its
+/// internal representation is still a `u32`.
+///
+/// # Safety
+///
+/// While a `StateID` is meant to guarantee that its value fits into `usize`
+/// without using as much space as a `usize` on all targets, callers must
+/// not rely on this property for safety. Callers may choose to rely on this
+/// property for correctness however. For example, creating a `StateID` with an
+/// invalid value can be done in entirely safe code. This may in turn result in
+/// panics or silent logical errors.
+#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
+#[repr(transparent)]
+pub struct StateID(SmallIndex);
+
+index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter);
+index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter);
+
+/// A utility trait that defines a couple of adapters for making it convenient
+/// to access indices as "small index" types. We require ExactSizeIterator so
+/// that iterator construction can do a single check to make sure the index of
+/// each element is representable by its small index type.
+pub(crate) trait IteratorIndexExt: Iterator {
+ fn with_pattern_ids(self) -> WithPatternIDIter<Self>
+ where
+ Self: Sized + ExactSizeIterator,
+ {
+ WithPatternIDIter::new(self)
+ }
+
+ fn with_state_ids(self) -> WithStateIDIter<Self>
+ where
+ Self: Sized + ExactSizeIterator,
+ {
+ WithStateIDIter::new(self)
+ }
+}
+
+impl<I: Iterator> IteratorIndexExt for I {}
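+
+// An illustrative (hypothetical) use of the adapters above: attach a
+// PatternID to each pattern, with the bounds check done once when the
+// iterator is constructed rather than once per element.
+//
+//     for (pid, pattern) in patterns.iter().with_pattern_ids() {
+//         // 'pid' is the PatternID for 'pattern's position in 'patterns'.
+//     }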
diff --git a/third_party/rust/aho-corasick/src/util/remapper.rs b/third_party/rust/aho-corasick/src/util/remapper.rs
new file mode 100644
index 0000000000..7c47a082cd
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/remapper.rs
@@ -0,0 +1,214 @@
+use alloc::vec::Vec;
+
+use crate::{nfa::noncontiguous, util::primitives::StateID};
+
+/// Remappable is a tightly coupled abstraction that facilitates remapping
+/// state identifiers in DFAs.
+///
+/// The main idea behind remapping state IDs is that DFAs often need to check
+/// if a certain state is a "special" state of some kind (like a match state)
+/// during a search. Since this is extremely perf critical code, we want this
+/// check to be as fast as possible. Partitioning state IDs, for example,
+/// into "non-match" and "match" states means one can tell if a state is a
+/// match state via a simple comparison of the state ID.
+///
+/// The issue is that during the DFA construction process, it's not
+/// particularly easy to partition the states. Instead, the simplest thing is
+/// to often just do a pass over all of the states and shuffle them into their
+/// desired partitionings. To do that, we need a mechanism for swapping states.
+/// Hence, this abstraction.
+///
+/// Normally, for such little code, I would just duplicate it. But this is a
+/// key optimization and the implementation is a bit subtle. So the abstraction
+/// is basically a ham-fisted attempt at DRY. In this crate, the only
+/// implementation of `Remappable` is for the noncontiguous NFA below.
+///
+/// See also src/dfa/special.rs for a more detailed explanation of how dense
+/// DFAs are partitioned.
+pub(crate) trait Remappable: core::fmt::Debug {
+ /// Return the total number of states.
+ fn state_len(&self) -> usize;
+
+ /// Swap the states pointed to by the given IDs. The underlying finite
+ /// state machine should be mutated such that all of the transitions in
+ /// `id1` are now in the memory region where the transitions for `id2`
+ /// were, and all of the transitions in `id2` are now in the memory region
+ /// where the transitions for `id1` were.
+ ///
+ /// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`.
+ ///
+ /// It is expected that, after calling this, the underlying state machine
+ /// will be left in an inconsistent state, since any other transitions
+ /// pointing to, e.g., `id1` need to be updated to point to `id2`, since
+ /// that's where `id1` moved to.
+ ///
+ /// In order to "fix" the underlying inconsistent state, a `Remapper`
+ /// should be used to guarantee that `remap` is called at the appropriate
+ /// time.
+ fn swap_states(&mut self, id1: StateID, id2: StateID);
+
+ /// This must remap every single state ID in the underlying value according
+ /// to the function given. For example, in a DFA, this should remap every
+ /// transition and every starting state ID.
+ fn remap(&mut self, map: impl Fn(StateID) -> StateID);
+}
+
+/// Remapper is an abstraction that manages the remapping of state IDs in a
+/// finite state machine. This is useful when one wants to shuffle states into
+/// different positions in the machine.
+///
+/// One of the key complexities this manages is the ability to correctly move
+/// one state multiple times.
+///
+/// Once shuffling is complete, `remap` must be called, which will rewrite
+/// all pertinent transitions to updated state IDs. Neglecting to call `remap`
+/// will almost certainly result in a corrupt machine.
+#[derive(Debug)]
+pub(crate) struct Remapper {
+ /// A map from the index of a state to its pre-multiplied identifier.
+ ///
+ /// When a state is swapped with another, then their corresponding
+ /// locations in this map are also swapped. Thus, its new position will
+ /// still point to its old pre-multiplied StateID.
+ ///
+ /// While there is a bit more to it, this then allows us to rewrite the
+ /// state IDs in a DFA's transition table in a single pass. This is done
+ /// by iterating over every ID in this map, then iterating over each
+ /// transition for the state at that ID and re-mapping the transition from
+ /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position
+ /// in this map where `old_id` *started*, and set it to where it ended up
+ /// after all swaps have been completed.
+ map: Vec<StateID>,
+ /// A way to map indices to state IDs (and back).
+ idx: IndexMapper,
+}
+
+impl Remapper {
+ /// Create a new remapper from the given remappable implementation. The
+ /// remapper can then be used to swap states. The remappable value given
+ /// here must be the same one given to `swap` and `remap`.
+ ///
+ /// The given stride should be the stride of the transition table expressed
+ /// as a power of 2. This stride is used to map between state IDs and state
+ /// indices. If state IDs and state indices are equivalent, then provide
+ /// a `stride2` of `0`, which acts as an identity.
+ pub(crate) fn new(r: &impl Remappable, stride2: usize) -> Remapper {
+ let idx = IndexMapper { stride2 };
+ let map = (0..r.state_len()).map(|i| idx.to_state_id(i)).collect();
+ Remapper { map, idx }
+ }
+
+ /// Swap two states. Once this is called, callers must follow through to
+ /// call `remap`, or else it's possible for the underlying remappable
+ /// value to be in a corrupt state.
+ pub(crate) fn swap(
+ &mut self,
+ r: &mut impl Remappable,
+ id1: StateID,
+ id2: StateID,
+ ) {
+ if id1 == id2 {
+ return;
+ }
+ r.swap_states(id1, id2);
+ self.map.swap(self.idx.to_index(id1), self.idx.to_index(id2));
+ }
+
+ /// Complete the remapping process by rewriting all state IDs in the
+ /// remappable value according to the swaps performed.
+ pub(crate) fn remap(mut self, r: &mut impl Remappable) {
+ // Update the map to account for states that have been swapped
+ // multiple times. For example, if (A, C) and (C, G) are swapped, then
+ // transitions previously pointing to A should now point to G. But if
+ // we don't update our map, they will erroneously be set to C. All we
+ // do is follow the swaps in our map until we see our original state
+ // ID.
+ //
+ // The intuition here is to think about how changes are made to the
+ // map: only through pairwise swaps. That means that starting at any
+ // given state, it is always possible to find the loop back to that
+ // state by following the swaps represented in the map (which might be
+ // 0 swaps).
+ //
+ // We are also careful to clone the map before starting in order to
+ // freeze it. We use the frozen map to find our loops, since we need to
+ // update our map as well. Without freezing it, our updates could break
+ // the loops referenced above and produce incorrect results.
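+ //
+ // A concrete (hypothetical) trace with stride2 = 0: let A, C and G be
+ // the states at indices 0, 1 and 2, so the map starts as [A, C, G].
+ // swap(A, C) yields [C, A, G], and swap(C, G) then yields [C, G, A].
+ // Following the loop for index 0: oldmap[0] = C, oldmap[1] = G and
+ // oldmap[2] = A, which is the state we started with. So transitions
+ // that pointed to A are rewritten to G, just as the example above
+ // requires.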
+ let oldmap = self.map.clone();
+ for i in 0..r.state_len() {
+ let cur_id = self.idx.to_state_id(i);
+ let mut new_id = oldmap[i];
+ if cur_id == new_id {
+ continue;
+ }
+ loop {
+ let id = oldmap[self.idx.to_index(new_id)];
+ if cur_id == id {
+ self.map[i] = new_id;
+ break;
+ }
+ new_id = id;
+ }
+ }
+ r.remap(|sid| self.map[self.idx.to_index(sid)]);
+ }
+}
+
+/// A simple type for mapping between state indices and state IDs.
+///
+/// The reason why this exists is because state IDs are "premultiplied" in a
+/// DFA. That is, in order to get to the transitions for a particular state,
+/// one need only use the state ID as-is, instead of having to multiply it by
+/// the transition table's stride.
+///
+/// The downside of this is that it's inconvenient to map between state IDs
+/// using a dense map, e.g., Vec<StateID>. That's because state IDs look like
+/// `0`, `stride`, `2*stride`, `3*stride`, etc., instead of `0`, `1`, `2`, `3`,
+/// etc.
+///
+/// Since our state IDs are premultiplied, we can convert back-and-forth
+/// between IDs and indices by simply unmultiplying the IDs and multiplying the
+/// indices.
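+///
+/// For example (with hypothetical numbers): given a stride of 4
+/// (`stride2 = 2`), index `3` premultiplies to state ID `3 << 2 = 12`, and
+/// state ID `12` demultiplies back to index `12 >> 2 = 3`.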
+///
+/// Note that for a sparse NFA, state IDs and indices are equivalent. In this
+/// case, we set the stride of the index mapper to be `0`, which acts as an
+/// identity.
+#[derive(Debug)]
+struct IndexMapper {
+ /// The power of 2 corresponding to the stride of the corresponding
+ /// transition table. 'id >> stride2' de-multiplies an ID while 'index <<
+ /// stride2' pre-multiplies an index to an ID.
+ stride2: usize,
+}
+
+impl IndexMapper {
+ /// Convert a state ID to a state index.
+ fn to_index(&self, id: StateID) -> usize {
+ id.as_usize() >> self.stride2
+ }
+
+ /// Convert a state index to a state ID.
+ fn to_state_id(&self, index: usize) -> StateID {
+ // CORRECTNESS: If the given index is not valid, then it is not
+ // required for this to panic or return a valid state ID. We'll "just"
+ // wind up with panics or silent logic errors at some other point. But
+ // this is OK because if Remappable::state_len is correct and so is
+ // 'to_index', then all inputs to 'to_state_id' should be valid indices
+ // and thus transform into valid state IDs.
+ StateID::new_unchecked(index << self.stride2)
+ }
+}
+
+impl Remappable for noncontiguous::NFA {
+ fn state_len(&self) -> usize {
+ noncontiguous::NFA::states(self).len()
+ }
+
+ fn swap_states(&mut self, id1: StateID, id2: StateID) {
+ noncontiguous::NFA::swap_states(self, id1, id2)
+ }
+
+ fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
+ noncontiguous::NFA::remap(self, map)
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/util/search.rs b/third_party/rust/aho-corasick/src/util/search.rs
new file mode 100644
index 0000000000..59b7035e1f
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/search.rs
@@ -0,0 +1,1148 @@
+use core::ops::{Range, RangeBounds};
+
+use crate::util::primitives::PatternID;
+
+/// The configuration and the haystack to use for an Aho-Corasick search.
+///
+/// When executing a search, there are a few parameters one might want to
+/// configure:
+///
+/// * The haystack to search, provided to the [`Input::new`] constructor. This
+/// is the only required parameter.
+/// * The span _within_ the haystack to limit a search to. (The default
+/// is the entire haystack.) This is configured via [`Input::span`] or
+/// [`Input::range`].
+/// * Whether to run an unanchored (matches can occur anywhere after the
+/// start of the search) or anchored (matches can only occur beginning at
+/// the start of the search) search. Unanchored search is the default. This is
+/// configured via [`Input::anchored`].
+/// * Whether to quit the search as soon as a match has been found, regardless
+/// of the [`MatchKind`] that the searcher was built with. This is configured
+/// via [`Input::earliest`].
+///
+/// For most cases, the defaults for all optional parameters are appropriate.
+/// The utility of this type is that it keeps the default or common case simple
+/// while permitting tweaking parameters in more niche use cases while reusing
+/// the same search APIs.
+///
+/// # Valid bounds and search termination
+///
+/// An `Input` permits setting the bounds of a search via either
+/// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or
+/// else a panic will occur. Bounds are valid if and only if:
+///
+/// * The bounds represent a valid range into the input's haystack.
+/// * **or** the end bound is a valid ending bound for the haystack *and*
+/// the start bound is exactly one greater than the end bound.
+///
+/// In the latter case, [`Input::is_done`] will return true and indicates any
+/// search receiving such an input should immediately return with no match.
+///
+/// Other than representing "search is complete," the `Input::span` and
+/// `Input::range` APIs are never necessary. Instead, callers can slice the
+/// haystack, e.g., with `&haystack[start..end]`. With that said, they
+/// can be more convenient than slicing because the match positions reported
+/// when using `Input::span` or `Input::range` are in terms of the original
+/// haystack. If you instead use `&haystack[start..end]`, then you'll need to
+/// add `start` to any match position returned in order for it to be a correct
+/// index into `haystack`.
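+///
+/// The following sketch contrasts the two approaches (it assumes only the
+/// crate's public `AhoCorasick`, `Input` and `Match` APIs):
+///
+/// ```
+/// use aho_corasick::{AhoCorasick, Input};
+///
+/// let ac = AhoCorasick::new(&["bar"]).unwrap();
+/// let haystack = "foobar";
+///
+/// // Searching a sub-span reports the match in terms of the full haystack.
+/// let mat = ac.find(Input::new(haystack).span(3..6)).unwrap();
+/// assert_eq!(3..6, mat.range());
+///
+/// // Slicing reports the match relative to the start of the slice.
+/// let mat = ac.find(&haystack[3..6]).unwrap();
+/// assert_eq!(0..3, mat.range());
+/// ```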
+///
+/// # Example: `&str` and `&[u8]` automatically convert to an `Input`
+///
+/// There is a `From<&T> for Input` implementation for all `T: AsRef<[u8]>`.
+/// Additionally, the [`AhoCorasick`](crate::AhoCorasick) search APIs accept
+/// a `Into<Input>`. These two things combined together mean you can provide
+/// things like `&str` and `&[u8]` to search APIs when the defaults are
+/// suitable, but also an `Input` when they're not. For example:
+///
+/// ```
+/// use aho_corasick::{AhoCorasick, Anchored, Input, Match, StartKind};
+///
+/// // Build a searcher that supports both unanchored and anchored modes.
+/// let ac = AhoCorasick::builder()
+/// .start_kind(StartKind::Both)
+/// .build(&["abcd", "b"])
+/// .unwrap();
+/// let haystack = "abcd";
+///
+/// // A search using default parameters is unanchored. With standard
+/// // semantics, this finds `b` first.
+/// assert_eq!(
+/// Some(Match::must(1, 1..2)),
+/// ac.find(haystack),
+/// );
+/// // Using the same 'find' routine, we can provide an 'Input' explicitly
+/// // that is configured to do an anchored search. Since 'b' doesn't start
+/// // at the beginning of the search, it is not reported as a match.
+/// assert_eq!(
+/// Some(Match::must(0, 0..4)),
+/// ac.find(Input::new(haystack).anchored(Anchored::Yes)),
+/// );
+/// ```
+#[derive(Clone)]
+pub struct Input<'h> {
+ haystack: &'h [u8],
+ span: Span,
+ anchored: Anchored,
+ earliest: bool,
+}
+
+impl<'h> Input<'h> {
+ /// Create a new search configuration for the given haystack.
+ #[inline]
+ pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> {
+ Input {
+ haystack: haystack.as_ref(),
+ span: Span { start: 0, end: haystack.as_ref().len() },
+ anchored: Anchored::No,
+ earliest: false,
+ }
+ }
+
+ /// Set the span for this search.
+ ///
+ /// This routine is generic over how a span is provided. While
+ /// a [`Span`] may be given directly, one may also provide a
+ /// `std::ops::Range<usize>`. To provide anything supported by range
+ /// syntax, use the [`Input::range`] method.
+ ///
+ /// The default span is the entire haystack.
+ ///
+ /// Note that [`Input::range`] overrides this method and vice versa.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the given span does not correspond to valid bounds in
+ /// the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// This example shows how the span of the search can impact whether a
+ /// match is reported or not.
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Input, MatchKind};
+ ///
+ /// let patterns = &["b", "abcd", "abc"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let input = Input::new(haystack).span(0..3);
+ /// let mat = ac.try_find(input)?.expect("should have a match");
+ /// // Without the span stopping the search early, 'abcd' would be reported
+ /// // because it is the correct leftmost-first match.
+ /// assert_eq!("abc", &haystack[mat.span()]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> {
+ self.set_span(span);
+ self
+ }
+
+ /// Like `Input::span`, but accepts any range instead.
+ ///
+ /// The default range is the entire haystack.
+ ///
+ /// Note that [`Input::span`] overrides this method and vice versa.
+ ///
+ /// # Panics
+ ///
+ /// This routine will panic if the given range could not be converted
+ /// to a valid [`Range`]. For example, this would panic when given
+ /// `0..=usize::MAX` since it cannot be represented using a half-open
+ /// interval in terms of `usize`.
+ ///
+ /// This routine also panics if the given range does not correspond to
+ /// valid bounds in the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ ///
+ /// let input = Input::new("foobar").range(2..=4);
+ /// assert_eq!(2..5, input.get_range());
+ /// ```
+ #[inline]
+ pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> {
+ self.set_range(range);
+ self
+ }
+
+ /// Sets the anchor mode of a search.
+ ///
+ /// When a search is anchored (via [`Anchored::Yes`]), a match must begin
+ /// at the start of a search. When a search is not anchored (that's
+ /// [`Anchored::No`]), searchers will look for a match anywhere in the
+ /// haystack.
+ ///
+ /// By default, the anchored mode is [`Anchored::No`].
+ ///
+ /// # Support for anchored searches
+ ///
+ /// Anchored or unanchored searches might not always be available,
+ /// depending on the type of searcher used and its configuration:
+ ///
+ /// * [`noncontiguous::NFA`](crate::nfa::noncontiguous::NFA) always
+ /// supports both unanchored and anchored searches.
+ /// * [`contiguous::NFA`](crate::nfa::contiguous::NFA) always supports both
+ /// unanchored and anchored searches.
+ /// * [`dfa::DFA`](crate::dfa::DFA) supports only unanchored
+ /// searches by default.
+ /// [`dfa::Builder::start_kind`](crate::dfa::Builder::start_kind) can
+ /// be used to change the default to supporting both kinds of searches
+ /// or even just anchored searches.
+ /// * [`AhoCorasick`](crate::AhoCorasick) inherits the same setup as a
+ /// `DFA`. Namely, it only supports unanchored searches by default, but
+ /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind)
+ /// can change this.
+ ///
+ /// If you try to execute a search using a `try_` ("fallible") method with
+ /// an unsupported anchor mode, then an error will be returned. For calls
+ /// to infallible search methods, a panic will result.
+ ///
+ /// # Example
+ ///
+ /// This demonstrates the differences between an anchored search and
+ /// an unanchored search. Notice that we build our `AhoCorasick` searcher
+ /// with [`StartKind::Both`] so that it supports both unanchored and
+ /// anchored searches simultaneously.
+ ///
+ /// ```
+ /// use aho_corasick::{
+ /// AhoCorasick, Anchored, Input, MatchKind, StartKind,
+ /// };
+ ///
+ /// let patterns = &["bcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .start_kind(StartKind::Both)
+ /// .build(patterns)
+ /// .unwrap();
+ ///
+ /// // Note that 'Anchored::No' is the default, so it doesn't need to
+ /// // be explicitly specified here.
+ /// let input = Input::new(haystack);
+ /// let mat = ac.try_find(input)?.expect("should have a match");
+ /// assert_eq!("bcd", &haystack[mat.span()]);
+ ///
+ /// // While 'bcd' occurs in the haystack, it does not begin where our
+ /// // search begins, so no match is found.
+ /// let input = Input::new(haystack).anchored(Anchored::Yes);
+ /// assert_eq!(None, ac.try_find(input)?);
+ ///
+ /// // However, if we start our search where 'bcd' starts, then we will
+ /// // find a match.
+ /// let input = Input::new(haystack).range(1..).anchored(Anchored::Yes);
+ /// let mat = ac.try_find(input)?.expect("should have a match");
+ /// assert_eq!("bcd", &haystack[mat.span()]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn anchored(mut self, mode: Anchored) -> Input<'h> {
+ self.set_anchored(mode);
+ self
+ }
+
+ /// Whether to execute an "earliest" search or not.
+ ///
+ /// When running a non-overlapping search, an "earliest" search will
+ /// return the match location as early as possible. For example, given
+ /// the patterns `abc` and `b`, and a haystack of `abc`, a normal
+ /// leftmost-first search will return `abc` as a match. But an "earliest"
+ /// search will return as soon as it is known that a match occurs, which
+ /// happens once `b` is seen.
+ ///
+ /// Note that when using [`MatchKind::Standard`], the "earliest" option
+ /// has no effect since standard semantics are already "earliest." Note
+ /// also that this has no effect in overlapping searches, since overlapping
+ /// searches also use standard semantics and report all possible matches.
+ ///
+ /// This is disabled by default.
+ ///
+ /// # Example
+ ///
+ /// This example shows the difference between "earliest" searching and
+ /// normal leftmost searching.
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Anchored, Input, MatchKind, StartKind};
+ ///
+ /// let patterns = &["abc", "b"];
+ /// let haystack = "abc";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ ///
+ /// // The normal leftmost-first match.
+ /// let input = Input::new(haystack);
+ /// let mat = ac.try_find(input)?.expect("should have a match");
+ /// assert_eq!("abc", &haystack[mat.span()]);
+ ///
+ /// // The "earliest" possible match, even if it isn't leftmost-first.
+ /// let input = Input::new(haystack).earliest(true);
+ /// let mat = ac.try_find(input)?.expect("should have a match");
+ /// assert_eq!("b", &haystack[mat.span()]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn earliest(mut self, yes: bool) -> Input<'h> {
+ self.set_earliest(yes);
+ self
+ }
+
+ /// Set the span for this search configuration.
+ ///
+ /// This is like the [`Input::span`] method, except this mutates the
+ /// span in place.
+ ///
+ /// This routine is generic over how a span is provided. While
+ /// a [`Span`] may be given directly, one may also provide a
+ /// `std::ops::Range<usize>`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the given span does not correspond to valid bounds in
+ /// the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// input.set_span(2..4);
+ /// assert_eq!(2..4, input.get_range());
+ /// ```
+ #[inline]
+ pub fn set_span<S: Into<Span>>(&mut self, span: S) {
+ let span = span.into();
+ assert!(
+ span.end <= self.haystack.len()
+ && span.start <= span.end.wrapping_add(1),
+ "invalid span {:?} for haystack of length {}",
+ span,
+ self.haystack.len(),
+ );
+ self.span = span;
+ }
+
+ /// Set the span for this search configuration given any range.
+ ///
+ /// This is like the [`Input::range`] method, except this mutates the
+ /// span in place.
+ ///
+ /// # Panics
+ ///
+ /// This routine will panic if the given range could not be converted
+ /// to a valid [`Range`]. For example, this would panic when given
+ /// `0..=usize::MAX` since it cannot be represented using a half-open
+ /// interval in terms of `usize`.
+ ///
+ /// This routine also panics if the given range does not correspond to
+ /// valid bounds in the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// input.set_range(2..=4);
+ /// assert_eq!(2..5, input.get_range());
+ /// ```
+ #[inline]
+ pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) {
+ use core::ops::Bound;
+
+ // It's a little weird to convert ranges into spans, and then spans
+ // back into ranges when we actually slice the haystack. Because
+ // of that process, we always represent everything as a half-open
+ // interval. Therefore, handling things like m..=n is a little awkward.
+ let start = match range.start_bound() {
+ Bound::Included(&i) => i,
+ // Can this case ever happen? Range syntax doesn't support it...
+ Bound::Excluded(&i) => i.checked_add(1).unwrap(),
+ Bound::Unbounded => 0,
+ };
+ let end = match range.end_bound() {
+ Bound::Included(&i) => i.checked_add(1).unwrap(),
+ Bound::Excluded(&i) => i,
+ Bound::Unbounded => self.haystack().len(),
+ };
+ self.set_span(Span { start, end });
+ }
+
+ /// Set the starting offset for the span for this search configuration.
+ ///
+ /// This is a convenience routine for only mutating the start of a span
+ /// without having to set the entire span.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the given span does not correspond to valid bounds in
+ /// the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// input.set_start(5);
+ /// assert_eq!(5..6, input.get_range());
+ /// ```
+ #[inline]
+ pub fn set_start(&mut self, start: usize) {
+ self.set_span(Span { start, ..self.get_span() });
+ }
+
+ /// Set the ending offset for the span for this search configuration.
+ ///
+ /// This is a convenience routine for only mutating the end of a span
+ /// without having to set the entire span.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the given span does not correspond to valid bounds in
+ /// the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// input.set_end(5);
+ /// assert_eq!(0..5, input.get_range());
+ /// ```
+ #[inline]
+ pub fn set_end(&mut self, end: usize) {
+ self.set_span(Span { end, ..self.get_span() });
+ }
+
+ /// Set the anchor mode of a search.
+ ///
+ /// This is like [`Input::anchored`], except it mutates the search
+ /// configuration in place.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::{Anchored, Input};
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(Anchored::No, input.get_anchored());
+ ///
+ /// input.set_anchored(Anchored::Yes);
+ /// assert_eq!(Anchored::Yes, input.get_anchored());
+ /// ```
+ #[inline]
+ pub fn set_anchored(&mut self, mode: Anchored) {
+ self.anchored = mode;
+ }
+
+ /// Set whether the search should execute in "earliest" mode or not.
+ ///
+ /// This is like [`Input::earliest`], except it mutates the search
+ /// configuration in place.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert!(!input.get_earliest());
+ /// input.set_earliest(true);
+ /// assert!(input.get_earliest());
+ /// ```
+ #[inline]
+ pub fn set_earliest(&mut self, yes: bool) {
+ self.earliest = yes;
+ }
+
+ /// Return a borrow of the underlying haystack as a slice of bytes.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(b"foobar", input.haystack());
+ /// ```
+ #[inline]
+ pub fn haystack(&self) -> &[u8] {
+ self.haystack
+ }
+
+ /// Return the start position of this search.
+ ///
+ /// This is a convenience routine for `search.get_span().start`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(0, input.start());
+ ///
+ /// let input = Input::new("foobar").span(2..4);
+ /// assert_eq!(2, input.start());
+ /// ```
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.get_span().start
+ }
+
+ /// Return the end position of this search.
+ ///
+ /// This is a convenience routine for `search.get_span().end`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(6, input.end());
+ ///
+ /// let input = Input::new("foobar").span(2..4);
+ /// assert_eq!(4, input.end());
+ /// ```
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.get_span().end
+ }
+
+ /// Return the span for this search configuration.
+ ///
+ /// If one was not explicitly set, then the span corresponds to the entire
+ /// range of the haystack.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::{Input, Span};
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(Span { start: 0, end: 6 }, input.get_span());
+ /// ```
+ #[inline]
+ pub fn get_span(&self) -> Span {
+ self.span
+ }
+
+ /// Return the span as a range for this search configuration.
+ ///
+ /// If one was not explicitly set, then the span corresponds to the entire
+ /// range of the haystack.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// ```
+ #[inline]
+ pub fn get_range(&self) -> Range<usize> {
+ self.get_span().range()
+ }
+
+ /// Return the anchored mode for this search configuration.
+ ///
+ /// If no anchored mode was set, then it defaults to [`Anchored::No`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::{Anchored, Input};
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(Anchored::No, input.get_anchored());
+ ///
+ /// input.set_anchored(Anchored::Yes);
+ /// assert_eq!(Anchored::Yes, input.get_anchored());
+ /// ```
+ #[inline]
+ pub fn get_anchored(&self) -> Anchored {
+ self.anchored
+ }
+
+ /// Return whether this search should execute in "earliest" mode.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert!(!input.get_earliest());
+ /// ```
+ #[inline]
+ pub fn get_earliest(&self) -> bool {
+ self.earliest
+ }
+
+ /// Return true if this input has been exhausted, which in turn means all
+ /// subsequent searches will return no matches.
+ ///
+ /// This occurs precisely when the start position of this search is greater
+ /// than the end position of the search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert!(!input.is_done());
+ /// input.set_start(6);
+ /// assert!(!input.is_done());
+ /// input.set_start(7);
+ /// assert!(input.is_done());
+ /// ```
+ #[inline]
+ pub fn is_done(&self) -> bool {
+ self.get_span().start > self.get_span().end
+ }
+}
+
+impl<'h> core::fmt::Debug for Input<'h> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut fmter = f.debug_struct("Input");
+ match core::str::from_utf8(self.haystack()) {
+ Ok(nice) => fmter.field("haystack", &nice),
+ Err(_) => fmter.field("haystack", &self.haystack()),
+ }
+ .field("span", &self.span)
+ .field("anchored", &self.anchored)
+ .field("earliest", &self.earliest)
+ .finish()
+ }
+}
+
+impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> {
+ #[inline]
+ fn from(haystack: &'h H) -> Input<'h> {
+ Input::new(haystack)
+ }
+}
+
+/// A representation of a range in a haystack.
+///
+/// A span corresponds to the starting and ending _byte offsets_ of a
+/// contiguous region of bytes. The starting offset is inclusive while the
+/// ending offset is exclusive. That is, a span is a half-open interval.
+///
+/// A span is used to report the offsets of a match, but it is also used to
+/// convey which region of a haystack should be searched via routines like
+/// [`Input::span`].
+///
+/// This is basically equivalent to a `std::ops::Range<usize>`, except this
+/// type implements `Copy` which makes it more ergonomic to use in the context
+/// of this crate. Indeed, `Span` exists only because `Range<usize>` does
+/// not implement `Copy`. Like a range, this implements `Index` for `[u8]`
+/// and `str`, and `IndexMut` for `[u8]`. For convenience, this also impls
+/// `From<Range>`, which means things like `Span::from(5..10)` work.
+///
+/// There are no constraints on the values of a span. It is, for example, legal
+/// to create a span where `start > end`.
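+ ///
+ /// # Example
+ ///
+ /// A brief illustrative sketch of the `From<Range>` and `Index` impls
+ /// described above (editorial; only APIs defined in this file are used):
+ ///
+ /// ```
+ /// use aho_corasick::Span;
+ ///
+ /// let span = Span::from(5..10);
+ /// assert_eq!(5, span.start);
+ /// assert_eq!(10, span.end);
+ ///
+ /// let haystack = "0123456789";
+ /// assert_eq!("56789", &haystack[span]);
+ /// ```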
+#[derive(Clone, Copy, Eq, Hash, PartialEq)]
+pub struct Span {
+ /// The start offset of the span, inclusive.
+ pub start: usize,
+ /// The end offset of the span, exclusive.
+ pub end: usize,
+}
+
+impl Span {
+ /// Returns this span as a range.
+ #[inline]
+ pub fn range(&self) -> Range<usize> {
+ Range::from(*self)
+ }
+
+ /// Returns true when this span is empty. That is, when `start >= end`.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.start >= self.end
+ }
+
+ /// Returns the length of this span.
+ ///
+ /// This returns `0` in precisely the cases that `is_empty` returns `true`.
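+ ///
+ /// # Example
+ ///
+ /// A small editorial sketch of the relationship with `is_empty` stated
+ /// above:
+ ///
+ /// ```
+ /// use aho_corasick::Span;
+ ///
+ /// assert_eq!(3, Span::from(2..5).len());
+ /// // An empty span has length 0...
+ /// assert_eq!(0, Span::from(5..5).len());
+ /// // ...as does a span with start > end, which is legal to construct.
+ /// assert_eq!(0, Span { start: 6, end: 5 }.len());
+ /// ```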
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.end.saturating_sub(self.start)
+ }
+
+ /// Returns true when the given offset is contained within this span.
+ ///
+ /// Note that an empty span contains no offsets and will always return
+ /// false.
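+ ///
+ /// # Example
+ ///
+ /// An editorial sketch of the behavior described above:
+ ///
+ /// ```
+ /// use aho_corasick::Span;
+ ///
+ /// let span = Span::from(2..5);
+ /// assert!(span.contains(2));
+ /// assert!(span.contains(4));
+ /// assert!(!span.contains(6));
+ ///
+ /// // An empty span contains no offsets.
+ /// assert!(!Span::from(2..2).contains(2));
+ /// ```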
+ #[inline]
+ pub fn contains(&self, offset: usize) -> bool {
+ !self.is_empty() && self.start <= offset && offset <= self.end
+ }
+
+ /// Returns a new span with `offset` added to this span's `start` and `end`
+ /// values.
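+ ///
+ /// # Example
+ ///
+ /// A short editorial sketch:
+ ///
+ /// ```
+ /// use aho_corasick::Span;
+ ///
+ /// let span = Span::from(2..4).offset(10);
+ /// assert_eq!(12..14, span.range());
+ /// ```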
+ #[inline]
+ pub fn offset(&self, offset: usize) -> Span {
+ Span { start: self.start + offset, end: self.end + offset }
+ }
+}
+
+impl core::fmt::Debug for Span {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "{}..{}", self.start, self.end)
+ }
+}
+
+impl core::ops::Index<Span> for [u8] {
+ type Output = [u8];
+
+ #[inline]
+ fn index(&self, index: Span) -> &[u8] {
+ &self[index.range()]
+ }
+}
+
+impl core::ops::IndexMut<Span> for [u8] {
+ #[inline]
+ fn index_mut(&mut self, index: Span) -> &mut [u8] {
+ &mut self[index.range()]
+ }
+}
+
+impl core::ops::Index<Span> for str {
+ type Output = str;
+
+ #[inline]
+ fn index(&self, index: Span) -> &str {
+ &self[index.range()]
+ }
+}
+
+impl From<Range<usize>> for Span {
+ #[inline]
+ fn from(range: Range<usize>) -> Span {
+ Span { start: range.start, end: range.end }
+ }
+}
+
+impl From<Span> for Range<usize> {
+ #[inline]
+ fn from(span: Span) -> Range<usize> {
+ Range { start: span.start, end: span.end }
+ }
+}
+
+impl PartialEq<Range<usize>> for Span {
+ #[inline]
+ fn eq(&self, range: &Range<usize>) -> bool {
+ self.start == range.start && self.end == range.end
+ }
+}
+
+impl PartialEq<Span> for Range<usize> {
+ #[inline]
+ fn eq(&self, span: &Span) -> bool {
+ self.start == span.start && self.end == span.end
+ }
+}
+
+/// The type of anchored search to perform.
+///
+/// If an Aho-Corasick searcher does not support the anchored mode selected,
+/// then the search will return an error or panic, depending on whether a
+/// fallible or an infallible routine was called.
+#[non_exhaustive]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum Anchored {
+ /// Run an unanchored search. This means a match may occur anywhere at or
+ /// after the start position of the search up until the end position of the
+ /// search.
+ No,
+ /// Run an anchored search. This means that a match must begin at the start
+ /// position of the search and end before the end position of the search.
+ Yes,
+}
+
+impl Anchored {
+ /// Returns true if and only if this anchor mode corresponds to an anchored
+ /// search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Anchored;
+ ///
+ /// assert!(!Anchored::No.is_anchored());
+ /// assert!(Anchored::Yes.is_anchored());
+ /// ```
+ #[inline]
+ pub fn is_anchored(&self) -> bool {
+ matches!(*self, Anchored::Yes)
+ }
+}
+
+/// A representation of a match reported by an Aho-Corasick searcher.
+///
+/// A match has two essential pieces of information: the [`PatternID`] that
+/// matches, and the [`Span`] of the match in a haystack.
+///
+/// The pattern is identified by an ID, which corresponds to its position
+/// (starting from `0`) relative to other patterns used to construct the
+/// corresponding searcher. If only a single pattern is provided, then all
+/// matches are guaranteed to have a pattern ID of `0`.
+///
+ /// Every match reported by a searcher guarantees that its span's start
+ /// offset is less than or equal to its end offset.
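+ ///
+ /// # Example
+ ///
+ /// A sketch showing both pieces of a reported match, assuming the crate's
+ /// top-level `AhoCorasick::new` constructor:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Input};
+ ///
+ /// let ac = AhoCorasick::new(&["b"]).unwrap();
+ /// let mat = ac.try_find(Input::new("abc"))?.expect("should have a match");
+ /// assert_eq!(0, mat.pattern().as_usize());
+ /// assert_eq!(1..2, mat.range());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```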
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub struct Match {
+ /// The pattern ID.
+ pattern: PatternID,
+ /// The underlying match span.
+ span: Span,
+}
+
+impl Match {
+ /// Create a new match from a pattern ID and a span.
+ ///
+ /// This constructor is generic over how a span is provided. While
+ /// a [`Span`] may be given directly, one may also provide a
+ /// `std::ops::Range<usize>`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `end < start`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to create a match for the first pattern in an
+ /// Aho-Corasick searcher using convenient range syntax.
+ ///
+ /// ```
+ /// use aho_corasick::{Match, PatternID};
+ ///
+ /// let m = Match::new(PatternID::ZERO, 5..10);
+ /// assert_eq!(0, m.pattern().as_usize());
+ /// assert_eq!(5, m.start());
+ /// assert_eq!(10, m.end());
+ /// ```
+ #[inline]
+ pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match {
+ let span = span.into();
+ assert!(span.start <= span.end, "invalid match span");
+ Match { pattern, span }
+ }
+
+ /// Create a new match from a pattern ID and a byte offset span.
+ ///
+ /// This constructor is generic over how a span is provided. While
+ /// a [`Span`] may be given directly, one may also provide a
+ /// `std::ops::Range<usize>`.
+ ///
+ /// This is like [`Match::new`], but accepts a `usize` instead of a
+ /// [`PatternID`]. This panics if the given `usize` is not representable
+ /// as a `PatternID`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `end < start` or if `pattern > PatternID::MAX`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to create a match for the pattern with ID `3` in an
+ /// Aho-Corasick searcher using convenient range syntax.
+ ///
+ /// ```
+ /// use aho_corasick::Match;
+ ///
+ /// let m = Match::must(3, 5..10);
+ /// assert_eq!(3, m.pattern().as_usize());
+ /// assert_eq!(5, m.start());
+ /// assert_eq!(10, m.end());
+ /// ```
+ #[inline]
+ pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match {
+ Match::new(PatternID::must(pattern), span)
+ }
+
+ /// Returns the ID of the pattern that matched.
+ ///
+ /// The ID of a pattern is derived from the position in which it was
+ /// originally inserted into the corresponding searcher. The first pattern
+ /// has identifier `0`, and each subsequent pattern is `1`, `2` and so on.
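+ ///
+ /// # Example
+ ///
+ /// A short editorial sketch using [`Match::must`]:
+ ///
+ /// ```
+ /// use aho_corasick::Match;
+ ///
+ /// // The pattern inserted second gets ID 1.
+ /// let m = Match::must(1, 0..3);
+ /// assert_eq!(1, m.pattern().as_usize());
+ /// ```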
+ #[inline]
+ pub fn pattern(&self) -> PatternID {
+ self.pattern
+ }
+
+ /// The starting position of the match.
+ ///
+ /// This is a convenience routine for `Match::span().start`.
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.span().start
+ }
+
+ /// The ending position of the match.
+ ///
+ /// This is a convenience routine for `Match::span().end`.
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.span().end
+ }
+
+ /// Returns the match span as a range.
+ ///
+ /// This is a convenience routine for `Match::span().range()`.
+ #[inline]
+ pub fn range(&self) -> core::ops::Range<usize> {
+ self.span().range()
+ }
+
+ /// Returns the span for this match.
+ #[inline]
+ pub fn span(&self) -> Span {
+ self.span
+ }
+
+ /// Returns true when the span in this match is empty.
+ ///
+ /// An empty match can only be returned when an empty pattern is in the
+ /// Aho-Corasick searcher.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.span().is_empty()
+ }
+
+ /// Returns the length of this match.
+ ///
+ /// This returns `0` in precisely the cases that `is_empty` returns `true`.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.span().len()
+ }
+
+ /// Returns a new match with `offset` added to its span's `start` and `end`
+ /// values.
+ #[inline]
+ pub fn offset(&self, offset: usize) -> Match {
+ Match {
+ pattern: self.pattern,
+ span: Span {
+ start: self.start() + offset,
+ end: self.end() + offset,
+ },
+ }
+ }
+}
+
+/// A knob for controlling the match semantics of an Aho-Corasick automaton.
+///
+/// There are two generally different ways that Aho-Corasick automatons can
+/// report matches. The first way is the "standard" approach that results from
+/// implementing most textbook explanations of Aho-Corasick. The second way is
+/// to report only the leftmost non-overlapping matches. The leftmost approach
+/// is in turn split into two different ways of resolving ambiguous matches:
+/// leftmost-first and leftmost-longest.
+///
+/// The `Standard` match kind is the default and is the only one that supports
+/// overlapping matches and stream searching. (Trying to find overlapping or
+/// streaming matches using leftmost match semantics will result in an error in
+ /// fallible APIs and a panic when using infallible APIs.) The `Standard` match
+/// kind will report matches as they are seen. When searching for overlapping
+/// matches, then all possible matches are reported. When searching for
+/// non-overlapping matches, the first match seen is reported. For example, for
+/// non-overlapping matches, given the patterns `abcd` and `b` and the haystack
+/// `abcdef`, only a match for `b` is reported since it is detected first. The
+/// `abcd` match is never reported since it overlaps with the `b` match.
+///
+/// In contrast, the leftmost match kind always prefers the leftmost match
+/// among all possible matches. Given the same example as above with `abcd` and
+/// `b` as patterns and `abcdef` as the haystack, the leftmost match is `abcd`
+/// since it begins before the `b` match, even though the `b` match is detected
+/// before the `abcd` match. In this case, the `b` match is not reported at all
+/// since it overlaps with the `abcd` match.
+///
+/// The difference between leftmost-first and leftmost-longest is in how they
+/// resolve ambiguous matches when there are multiple leftmost matches to
+/// choose from. Leftmost-first always chooses the pattern that was provided
+ /// earliest, whereas leftmost-longest always chooses the longest matching
+ /// pattern. For example, given the patterns `a` and `ab` and the haystack
+ /// `ab`, the leftmost-first match is `a` but the leftmost-longest match
+/// is `ab`. Conversely, if the patterns were given in reverse order, i.e.,
+/// `ab` and `a`, then both the leftmost-first and leftmost-longest matches
+/// would be `ab`. Stated differently, the leftmost-first match depends on the
+/// order in which the patterns were given to the Aho-Corasick automaton.
+/// Because of that, when leftmost-first matching is used, if a pattern `A`
+/// that appears before a pattern `B` is a prefix of `B`, then it is impossible
+/// to ever observe a match of `B`.
+///
+/// If you're not sure which match kind to pick, then stick with the standard
+/// kind, which is the default. In particular, if you need overlapping or
+/// streaming matches, then you _must_ use the standard kind. The leftmost
+/// kinds are useful in specific circumstances. For example, leftmost-first can
+/// be very useful as a way to implement match priority based on the order of
+/// patterns given and leftmost-longest can be useful for dictionary searching
+/// such that only the longest matching words are reported.
+///
+/// # Relationship with regular expression alternations
+///
+/// Understanding match semantics can be a little tricky, and one easy way
+/// to conceptualize non-overlapping matches from an Aho-Corasick automaton
+/// is to think about them as a simple alternation of literals in a regular
+/// expression. For example, let's say we wanted to match the strings
+/// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It
+/// turns out that regular expression engines have two different ways of
+/// matching this alternation. The first way, leftmost-longest, is commonly
+/// found in POSIX compatible implementations of regular expressions (such as
+/// `grep`). The second way, leftmost-first, is commonly found in backtracking
+/// implementations such as Perl. (Some regex engines, such as RE2 and Rust's
+ /// regex engine, do not use backtracking, but still implement leftmost-first
+/// semantics in an effort to match the behavior of dominant backtracking
+ /// regex engines such as those found in Perl, Ruby, Python, JavaScript and
+/// PHP.)
+///
+/// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex
+/// will match `Samwise` because it is the longest possible match, but a
+/// Perl-like regex will match `Sam` since it appears earlier in the
+/// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine
+/// will never match `Samwise` since `Sam` will always have higher priority.
+/// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to
+/// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is
+ /// still the longest match, but it also appears earlier than `Sam`.
+///
+/// The "standard" match semantics of Aho-Corasick generally don't correspond
+/// to the match semantics of any large group of regex implementations, so
+/// there's no direct analogy that can be made here. Standard match semantics
+/// are generally useful for overlapping matches, or if you just want to see
+/// matches as they are detected.
+///
+/// The main conclusion to draw from this section is that the match semantics
+/// can be tweaked to precisely match either Perl-like regex alternations or
+/// POSIX regex alternations.
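+ ///
+ /// # Example
+ ///
+ /// A sketch of the `Sam`/`Samwise` discussion above, using this crate's
+ /// builder and fallible search APIs:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Input, MatchKind};
+ ///
+ /// let patterns = &["Sam", "Samwise"];
+ /// let haystack = "Samwise";
+ ///
+ /// // Leftmost-first prefers the pattern given earliest: 'Sam'.
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let mat = ac.try_find(Input::new(haystack))?.expect("should have a match");
+ /// assert_eq!("Sam", &haystack[mat.span()]);
+ ///
+ /// // Leftmost-longest prefers the longest match: 'Samwise'.
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostLongest)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let mat = ac.try_find(Input::new(haystack))?.expect("should have a match");
+ /// assert_eq!("Samwise", &haystack[mat.span()]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```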
+#[non_exhaustive]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum MatchKind {
+ /// Use standard match semantics, which support overlapping matches. When
+ /// used with non-overlapping matches, matches are reported as they are
+ /// seen.
+ Standard,
+ /// Use leftmost-first match semantics, which reports leftmost matches.
+ /// When there are multiple possible leftmost matches, the match
+ /// corresponding to the pattern that appeared earlier when constructing
+ /// the automaton is reported.
+ ///
+ /// This does **not** support overlapping matches or stream searching. If
+ /// this match kind is used, attempting to find overlapping matches or
+ /// stream matches will fail.
+ LeftmostFirst,
+ /// Use leftmost-longest match semantics, which reports leftmost matches.
+ /// When there are multiple possible leftmost matches, the longest match
+ /// is chosen.
+ ///
+ /// This does **not** support overlapping matches or stream searching. If
+ /// this match kind is used, attempting to find overlapping matches or
+ /// stream matches will fail.
+ LeftmostLongest,
+}
+
+/// The default match kind is `MatchKind::Standard`.
+impl Default for MatchKind {
+ fn default() -> MatchKind {
+ MatchKind::Standard
+ }
+}
+
+impl MatchKind {
+ #[inline]
+ pub(crate) fn is_standard(&self) -> bool {
+ matches!(*self, MatchKind::Standard)
+ }
+
+ #[inline]
+ pub(crate) fn is_leftmost(&self) -> bool {
+ matches!(*self, MatchKind::LeftmostFirst | MatchKind::LeftmostLongest)
+ }
+
+ #[inline]
+ pub(crate) fn is_leftmost_first(&self) -> bool {
+ matches!(*self, MatchKind::LeftmostFirst)
+ }
+
+ /// Convert this match kind into a packed match kind. If this match kind
+ /// corresponds to standard semantics, then this returns None, since
+ /// packed searching does not support standard semantics.
+ #[inline]
+ pub(crate) fn as_packed(&self) -> Option<crate::packed::MatchKind> {
+ match *self {
+ MatchKind::Standard => None,
+ MatchKind::LeftmostFirst => {
+ Some(crate::packed::MatchKind::LeftmostFirst)
+ }
+ MatchKind::LeftmostLongest => {
+ Some(crate::packed::MatchKind::LeftmostLongest)
+ }
+ }
+ }
+}
+
+/// The kind of anchored starting configurations to support in an Aho-Corasick
+/// searcher.
+///
+/// Depending on which searcher is used internally by
+/// [`AhoCorasick`](crate::AhoCorasick), supporting both unanchored
+/// and anchored searches can be quite costly. For this reason,
+/// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind)
+/// can be used to configure whether your searcher supports unanchored,
+/// anchored or both kinds of searches.
+///
+/// This searcher configuration knob works in concert with the search time
+/// configuration [`Input::anchored`]. Namely, if one requests an unsupported
+/// anchored mode, then the search will either panic or return an error,
+ /// depending on whether you're using infallible or fallible APIs, respectively.
+///
+/// `AhoCorasick` by default only supports unanchored searches.
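+ ///
+ /// # Example
+ ///
+ /// A sketch of the interaction described above: a searcher built to
+ /// support only anchored searches rejects unanchored requests through the
+ /// fallible API:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Anchored, Input, StartKind};
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .start_kind(StartKind::Anchored)
+ /// .build(&["abc"])
+ /// .unwrap();
+ ///
+ /// // Anchored searches are supported...
+ /// let input = Input::new("abc").anchored(Anchored::Yes);
+ /// assert!(ac.try_find(input)?.is_some());
+ ///
+ /// // ...but an unanchored search returns an error.
+ /// let input = Input::new("abc").anchored(Anchored::No);
+ /// assert!(ac.try_find(input).is_err());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```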
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum StartKind {
+ /// Support both anchored and unanchored searches.
+ Both,
+ /// Support only unanchored searches. Requesting an anchored search will
+ /// return an error in fallible APIs and panic in infallible APIs.
+ Unanchored,
+ /// Support only anchored searches. Requesting an unanchored search will
+ /// return an error in fallible APIs and panic in infallible APIs.
+ Anchored,
+}
+
+impl Default for StartKind {
+ fn default() -> StartKind {
+ StartKind::Unanchored
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/util/special.rs b/third_party/rust/aho-corasick/src/util/special.rs
new file mode 100644
index 0000000000..beeba40c89
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/special.rs
@@ -0,0 +1,42 @@
+use crate::util::primitives::StateID;
+
+/// A collection of sentinel state IDs for Aho-Corasick automata.
+///
+/// This specifically enables the technique by which we determine which states
+ /// are dead, match or start states. Namely, by arranging states in a
+/// particular order, we can determine the type of a state simply by looking at
+/// its ID.
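+ ///
+ /// As a rough sketch of how this enables cheap classification (an
+ /// editorial illustration; the real predicates live on the automaton
+ /// implementations, not on this struct):
+ ///
+ /// ```ignore
+ /// // Any ID above max_special_id is an ordinary state, so most states
+ /// // are classified with a single comparison.
+ /// fn is_special(special: &Special, id: StateID) -> bool {
+ ///     id <= special.max_special_id
+ /// }
+ ///
+ /// // Any ID above max_match_id is guaranteed to be a non-match state.
+ /// fn is_non_match(special: &Special, id: StateID) -> bool {
+ ///     id > special.max_match_id
+ /// }
+ /// ```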
+#[derive(Clone, Debug)]
+pub(crate) struct Special {
+ /// The maximum ID of all the "special" states. This corresponds either to
+ /// start_anchored_id when a prefilter is active and max_match_id when a
+ /// prefilter is not active. The idea here is that if there is no prefilter,
+ /// then there is no point in treating start states as special.
+ pub(crate) max_special_id: StateID,
+ /// The maximum ID of all the match states. Any state ID bigger than this
+ /// is guaranteed to be a non-match ID.
+ ///
+ /// It is possible and legal for max_match_id to be equal to
+ /// start_anchored_id, which occurs precisely in the case where the empty
+ /// string is a pattern that was added to the underlying automaton.
+ pub(crate) max_match_id: StateID,
+ /// The state ID of the start state used for unanchored searches.
+ pub(crate) start_unanchored_id: StateID,
+ /// The state ID of the start state used for anchored searches. This is
+ /// always start_unanchored_id+1.
+ pub(crate) start_anchored_id: StateID,
+}
+
+impl Special {
+ /// Create a new set of "special" state IDs with all IDs initialized to
+ /// zero. The general idea here is that they will be updated and set to
+ /// correct values later.
+ pub(crate) fn zero() -> Special {
+ Special {
+ max_special_id: StateID::ZERO,
+ max_match_id: StateID::ZERO,
+ start_unanchored_id: StateID::ZERO,
+ start_anchored_id: StateID::ZERO,
+ }
+ }
+}