author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
commit    26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree      f435a8308119effd964b339f76abb83a57c29483 /third_party/rust/aho-corasick/src/util/prefilter.rs
parent    Initial commit. (diff)
download  firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz
          firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
Adding upstream version 124.0.1. (upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/aho-corasick/src/util/prefilter.rs')
-rw-r--r--  third_party/rust/aho-corasick/src/util/prefilter.rs  924
1 file changed, 924 insertions, 0 deletions
diff --git a/third_party/rust/aho-corasick/src/util/prefilter.rs b/third_party/rust/aho-corasick/src/util/prefilter.rs
new file mode 100644
index 0000000000..f5ddc75b7c
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/prefilter.rs
@@ -0,0 +1,924 @@
+use core::{
+ cmp,
+ fmt::Debug,
+ panic::{RefUnwindSafe, UnwindSafe},
+ u8,
+};
+
+use alloc::{sync::Arc, vec, vec::Vec};
+
+use crate::{
+ packed,
+ util::{
+ alphabet::ByteSet,
+ search::{Match, MatchKind, Span},
+ },
+};
+
+/// A prefilter for accelerating a search.
+///
+/// This crate uses prefilters in the core search implementations to accelerate
+/// common cases. They typically only apply to cases where there are a small
+/// number of patterns (less than 100 or so), but when they do, throughput can
+/// be boosted considerably, perhaps by an order of magnitude. When a prefilter
+/// is active, it is used whenever a search enters an automaton's start state.
+///
+/// Currently, prefilters cannot be constructed by
+/// callers. A `Prefilter` can only be accessed via the
+/// [`Automaton::prefilter`](crate::automaton::Automaton::prefilter)
+/// method and used to execute a search. In other words, a prefilter can be
+/// used to optimize your own search implementation if necessary, but cannot do
+/// much else. If you have a use case for more APIs, please submit an issue.
+#[derive(Clone, Debug)]
+pub struct Prefilter {
+ finder: Arc<dyn PrefilterI>,
+ memory_usage: usize,
+}
+
+impl Prefilter {
+ /// Execute a search in the haystack within the span given. If a match or
+ /// a possible match is returned, then it is guaranteed to occur within
+ /// the bounds of the span.
+ ///
+ /// If the span provided is invalid for the given haystack, then behavior
+ /// is unspecified.
+ #[inline]
+ pub fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ self.finder.find_in(haystack, span)
+ }
+
+ #[inline]
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.memory_usage
+ }
+}
+
+/// A candidate is the result of running a prefilter on a haystack at a
+/// particular position.
+///
+/// The result is either no match, a confirmed match or a possible match.
+///
+/// When no match is returned, the prefilter is guaranteeing that no possible
+/// match can be found in the haystack, and the caller may trust this. That is,
+/// all correct prefilters must never report false negatives.
+///
+/// In some cases, a prefilter can confirm a match very quickly, in which case,
+/// the caller may use this to stop what it's doing and report the match. In
+/// this case, prefilter implementations must never report a false positive.
+/// In other cases, the prefilter can only report a potential match, in which
+/// case the callers must attempt to confirm the match. In this case, prefilter
+/// implementations are permitted to return false positives.
+#[derive(Clone, Debug)]
+pub enum Candidate {
+ /// No match was found. Since false negatives are not possible, this means
+ /// the search can quit as it is guaranteed not to find another match.
+ None,
+ /// A confirmed match was found. Callers do not need to confirm it.
+ Match(Match),
+ /// The start of a possible match was found. Callers must confirm it before
+ /// reporting it as a match.
+ PossibleStartOfMatch(usize),
+}
+
+impl Candidate {
+ /// Convert this candidate into an option. This is useful when callers
+ /// do not distinguish between true positives and false positives (i.e.,
+ /// the caller must always confirm the match).
+ pub fn into_option(self) -> Option<usize> {
+ match self {
+ Candidate::None => None,
+ Candidate::Match(ref m) => Some(m.start()),
+ Candidate::PossibleStartOfMatch(start) => Some(start),
+ }
+ }
+}
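+
+// A minimal sketch of how a caller might act on the three `Candidate`
+// variants returned by `Prefilter::find_in`. The `confirm_at` closure is
+// hypothetical and stands in for whatever automaton-driven confirmation the
+// caller performs; it is not part of this crate's API.
+//
+//     fn next_candidate(
+//         pre: &Prefilter,
+//         haystack: &[u8],
+//         span: Span,
+//         confirm_at: impl Fn(usize) -> Option<Match>,
+//     ) -> Option<Match> {
+//         match pre.find_in(haystack, span) {
+//             // Guaranteed no match anywhere in `span`: stop searching.
+//             Candidate::None => None,
+//             // Already confirmed (e.g. by the memmem prefilter below).
+//             Candidate::Match(m) => Some(m),
+//             // Only a possible start: it must be confirmed before being
+//             // reported as a match.
+//             Candidate::PossibleStartOfMatch(start) => confirm_at(start),
+//         }
+//     }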
+
+/// A prefilter describes the behavior of fast literal scanners for quickly
+/// skipping past bytes in the haystack that we know cannot possibly
+/// participate in a match.
+trait PrefilterI:
+ Send + Sync + RefUnwindSafe + UnwindSafe + Debug + 'static
+{
+ /// Returns the next possible match candidate. This may yield false
+ /// positives, so callers must confirm a match starting at the position
+ /// returned. This, however, must never produce false negatives. That is,
+ /// this must, at minimum, return the starting position of the next match
+ /// in the given haystack after or at the given position.
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate;
+}
+
+impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> {
+ #[inline(always)]
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ (**self).find_in(haystack, span)
+ }
+}
+
+/// A builder for constructing the best possible prefilter. When constructed,
+/// this builder will heuristically select the best prefilter it can build,
+/// if any, and discard the rest.
+#[derive(Debug)]
+pub(crate) struct Builder {
+ count: usize,
+ ascii_case_insensitive: bool,
+ start_bytes: StartBytesBuilder,
+ rare_bytes: RareBytesBuilder,
+ memmem: MemmemBuilder,
+ packed: Option<packed::Builder>,
+ // If we run across a condition that suggests we shouldn't use a prefilter
+ // at all (like an empty pattern), then disable prefilters entirely.
+ enabled: bool,
+}
+
+impl Builder {
+ /// Create a new builder for constructing the best possible prefilter.
+ pub(crate) fn new(kind: MatchKind) -> Builder {
+ let pbuilder = kind
+ .as_packed()
+ .map(|kind| packed::Config::new().match_kind(kind).builder());
+ Builder {
+ count: 0,
+ ascii_case_insensitive: false,
+ start_bytes: StartBytesBuilder::new(),
+ rare_bytes: RareBytesBuilder::new(),
+ memmem: MemmemBuilder::default(),
+ packed: pbuilder,
+ enabled: true,
+ }
+ }
+
+ /// Enable ASCII case insensitivity. When set, byte strings added to this
+ /// builder will be interpreted without respect to ASCII case.
+ pub(crate) fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
+ self.ascii_case_insensitive = yes;
+ self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
+ self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
+ self
+ }
+
+ /// Return a prefilter suitable for quickly finding potential matches.
+ ///
+ /// All patterns added to an Aho-Corasick automaton should be added to this
+ /// builder before attempting to construct the prefilter.
+ pub(crate) fn build(&self) -> Option<Prefilter> {
+ if !self.enabled {
+ debug!("prefilter not enabled, skipping");
+ return None;
+ }
+ // If we only have one pattern, then deferring to memmem is always
+ // the best choice. This is kind of a weird case, because, well, why
+ // use Aho-Corasick if you only have one pattern? But maybe you don't
+ // know exactly how many patterns you'll get up front, and you need to
+ // support the option of multiple patterns. So instead of relying on
+ // the caller to branch and use memmem explicitly, we just do it for
+ // them.
+ if !self.ascii_case_insensitive {
+ if let Some(pre) = self.memmem.build() {
+ debug!("using memmem prefilter");
+ return Some(pre);
+ }
+ }
+ let (packed, patlen, minlen) = if self.ascii_case_insensitive {
+ (None, usize::MAX, 0)
+ } else {
+ let patlen = self.packed.as_ref().map_or(usize::MAX, |p| p.len());
+ let minlen = self.packed.as_ref().map_or(0, |p| p.minimum_len());
+ let packed =
+ self.packed.as_ref().and_then(|b| b.build()).map(|s| {
+ let memory_usage = s.memory_usage();
+ debug!(
+ "built packed prefilter (len: {}, \
+ minimum pattern len: {}, memory usage: {}) \
+ for consideration",
+ patlen, minlen, memory_usage,
+ );
+ Prefilter { finder: Arc::new(Packed(s)), memory_usage }
+ });
+ (packed, patlen, minlen)
+ };
+ match (self.start_bytes.build(), self.rare_bytes.build()) {
+ // If we could build both start and rare prefilters, then there are
+ // a few cases in which we'd want to use the start-byte prefilter
+ // over the rare-byte prefilter, since the former has lower
+ // overhead.
+ (prestart @ Some(_), prerare @ Some(_)) => {
+ debug!(
+ "both start (len={}, rank={}) and \
+ rare (len={}, rank={}) byte prefilters \
+ are available",
+ self.start_bytes.count,
+ self.start_bytes.rank_sum,
+ self.rare_bytes.count,
+ self.rare_bytes.rank_sum,
+ );
+ if patlen <= 16
+ && minlen >= 2
+ && self.start_bytes.count >= 3
+ && self.rare_bytes.count >= 3
+ {
+ debug!(
+ "start and rare byte prefilters available, but \
+ they're probably slower than packed so using \
+ packed"
+ );
+ return packed;
+ }
+ // If the start-byte prefilter can scan for a smaller number
+ // of bytes than the rare-byte prefilter, then it's probably
+ // faster.
+ let has_fewer_bytes =
+ self.start_bytes.count < self.rare_bytes.count;
+ // Otherwise, if the combined frequency rank of the detected
+ // bytes in the start-byte prefilter is "close" to the combined
+ // frequency rank of the rare-byte prefilter, then we pick
+ // the start-byte prefilter even if the rare-byte prefilter
+ // heuristically searches for rare bytes. This is because the
+ // rare-byte prefilter has higher constant costs, so we tend to
+ // prefer the start-byte prefilter when we can.
+ let has_rarer_bytes =
+ self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50;
+ if has_fewer_bytes {
+ debug!(
+ "using start byte prefilter because it has fewer
+ bytes to search for than the rare byte prefilter",
+ );
+ prestart
+ } else if has_rarer_bytes {
+ debug!(
+ "using start byte prefilter because its byte \
+ frequency rank was determined to be \
+ \"good enough\" relative to the rare byte prefilter \
+ byte frequency rank",
+ );
+ prestart
+ } else {
+ debug!("using rare byte prefilter");
+ prerare
+ }
+ }
+ (prestart @ Some(_), None) => {
+ if patlen <= 16 && minlen >= 2 && self.start_bytes.count >= 3 {
+ debug!(
+ "start byte prefilter available, but \
+ it's probably slower than packed so using \
+ packed"
+ );
+ return packed;
+ }
+ debug!(
+ "have start byte prefilter but not rare byte prefilter, \
+ so using start byte prefilter",
+ );
+ prestart
+ }
+ (None, prerare @ Some(_)) => {
+ if patlen <= 16 && minlen >= 2 && self.rare_bytes.count >= 3 {
+ debug!(
+ "rare byte prefilter available, but \
+ it's probably slower than packed so using \
+ packed"
+ );
+ return packed;
+ }
+ debug!(
+ "have rare byte prefilter but not start byte prefilter, \
+ so using rare byte prefilter",
+ );
+ prerare
+ }
+ (None, None) if self.ascii_case_insensitive => {
+ debug!(
+ "no start or rare byte prefilter and ASCII case \
+ insensitivity was enabled, so skipping prefilter",
+ );
+ None
+ }
+ (None, None) => {
+ if packed.is_some() {
+ debug!("falling back to packed prefilter");
+ } else {
+ debug!("no prefilter available");
+ }
+ packed
+ }
+ }
+ }
+
+ /// Add a literal string to this prefilter builder.
+ pub(crate) fn add(&mut self, bytes: &[u8]) {
+ if bytes.is_empty() {
+ self.enabled = false;
+ }
+ if !self.enabled {
+ return;
+ }
+ self.count += 1;
+ self.start_bytes.add(bytes);
+ self.rare_bytes.add(bytes);
+ self.memmem.add(bytes);
+ if let Some(ref mut pbuilder) = self.packed {
+ pbuilder.add(bytes);
+ }
+ }
+}
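+
+// A minimal sketch of how this builder is intended to be driven from inside
+// the crate: every pattern is added before `build` is called, and `build`
+// returns `None` whenever no prefilter heuristic applies. The free function
+// below is illustrative only and does not exist in this crate.
+//
+//     fn build_prefilter<'a, I: IntoIterator<Item = &'a [u8]>>(
+//         kind: MatchKind,
+//         patterns: I,
+//     ) -> Option<Prefilter> {
+//         let mut builder = Builder::new(kind);
+//         for pat in patterns {
+//             builder.add(pat);
+//         }
+//         builder.build()
+//     }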
+
+/// A type that wraps a packed searcher and implements the `Prefilter`
+/// interface.
+#[derive(Clone, Debug)]
+struct Packed(packed::Searcher);
+
+impl PrefilterI for Packed {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ self.0
+ .find_in(&haystack, span)
+ .map_or(Candidate::None, Candidate::Match)
+ }
+}
+
+/// A builder for constructing a prefilter that uses memmem.
+#[derive(Debug, Default)]
+struct MemmemBuilder {
+ /// The number of patterns that have been added.
+ count: usize,
+ /// The singular pattern to search for. This is only set when count==1.
+ one: Option<Vec<u8>>,
+}
+
+impl MemmemBuilder {
+ fn build(&self) -> Option<Prefilter> {
+ #[cfg(all(feature = "std", feature = "perf-literal"))]
+ fn imp(builder: &MemmemBuilder) -> Option<Prefilter> {
+ let pattern = builder.one.as_ref()?;
+ assert_eq!(1, builder.count);
+ let finder = Arc::new(Memmem(
+ memchr::memmem::Finder::new(pattern).into_owned(),
+ ));
+ let memory_usage = pattern.len();
+ Some(Prefilter { finder, memory_usage })
+ }
+
+ #[cfg(not(all(feature = "std", feature = "perf-literal")))]
+ fn imp(_: &MemmemBuilder) -> Option<Prefilter> {
+ None
+ }
+
+ imp(self)
+ }
+
+ fn add(&mut self, bytes: &[u8]) {
+ self.count += 1;
+ if self.count == 1 {
+ self.one = Some(bytes.to_vec());
+ } else {
+ self.one = None;
+ }
+ }
+}
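+
+// A small sketch of the intended behavior: a memmem prefilter is only
+// produced when exactly one pattern was added (and, per the cfg in `build`,
+// only when the `std` and `perf-literal` features are enabled).
+//
+//     let mut b = MemmemBuilder::default();
+//     b.add(b"needle");
+//     assert!(b.build().is_some()); // one pattern: defer to memmem
+//     b.add(b"other");
+//     assert!(b.build().is_none()); // more than one pattern: memmem is skipped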
+
+/// A type that wraps a SIMD accelerated single substring search from the
+/// `memchr` crate for use as a prefilter.
+///
+/// Currently, this prefilter is only active for Aho-Corasick searchers with
+/// a single pattern. In theory, this could be extended to support searchers
+/// that have a common prefix of more than one byte (for one byte, we would use
+/// memchr), but it's not clear if it's worth it or not.
+///
+/// Also, unfortunately, this currently also requires the 'std' feature to
+/// be enabled. That's because memchr doesn't have a no-std-but-with-alloc
+/// mode, and so APIs like Finder::into_owned aren't available when 'std' is
+/// disabled. But there should be an 'alloc' feature that brings in APIs like
+/// Finder::into_owned but doesn't use std-only features like runtime CPU
+/// feature detection.
+#[cfg(all(feature = "std", feature = "perf-literal"))]
+#[derive(Clone, Debug)]
+struct Memmem(memchr::memmem::Finder<'static>);
+
+#[cfg(all(feature = "std", feature = "perf-literal"))]
+impl PrefilterI for Memmem {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ use crate::util::primitives::PatternID;
+
+ self.0.find(&haystack[span]).map_or(Candidate::None, |i| {
+ let start = span.start + i;
+ let end = start + self.0.needle().len();
+ // N.B. We can declare a match and use a fixed pattern ID here
+ // because a Memmem prefilter is only ever created for searchers
+ // with exactly one pattern. Thus, every match is always a match
+ // and it is always for the first and only pattern.
+ Candidate::Match(Match::new(PatternID::ZERO, start..end))
+ })
+ }
+}
+
+/// A builder for constructing a rare byte prefilter.
+///
+/// A rare byte prefilter attempts to pick out a small set of rare bytes that
+/// occur in the patterns, and then quickly scan for matches of those rare
+/// bytes.
+#[derive(Clone, Debug)]
+struct RareBytesBuilder {
+ /// Whether this prefilter should account for ASCII case insensitivity or
+ /// not.
+ ascii_case_insensitive: bool,
+ /// A set of rare bytes, indexed by byte value.
+ rare_set: ByteSet,
+ /// A set of byte offsets associated with bytes in a pattern. An entry
+ /// corresponds to a particular byte (its index) and is only non-zero if
+ /// the byte occurred at an offset greater than 0 in at least one pattern.
+ ///
+ /// If a byte's offset is not representable in 8 bits, then the rare bytes
+ /// prefilter becomes inert.
+ byte_offsets: RareByteOffsets,
+ /// Whether this is available as a prefilter or not. This can be set to
+ /// false during construction if a condition is seen that invalidates the
+ /// use of the rare-byte prefilter.
+ available: bool,
+ /// The number of bytes set to an active value in `byte_offsets`.
+ count: usize,
+ /// The sum of frequency ranks for the rare bytes detected. This is
+ /// intended to give a heuristic notion of how rare the bytes are.
+ rank_sum: u16,
+}
+
+/// A set of byte offsets, keyed by byte.
+#[derive(Clone, Copy)]
+struct RareByteOffsets {
+ /// Each entry corresponds to the maximum offset of the corresponding
+ /// byte across all patterns seen.
+ set: [RareByteOffset; 256],
+}
+
+impl RareByteOffsets {
+ /// Create a new empty set of rare byte offsets.
+ pub(crate) fn empty() -> RareByteOffsets {
+ RareByteOffsets { set: [RareByteOffset::default(); 256] }
+ }
+
+ /// Add the given offset for the given byte to this set. The offset
+ /// recorded for the byte is the maximum of the existing offset and the
+ /// given offset, so smaller offsets never overwrite larger ones.
+ pub(crate) fn set(&mut self, byte: u8, off: RareByteOffset) {
+ self.set[byte as usize].max =
+ cmp::max(self.set[byte as usize].max, off.max);
+ }
+}
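+
+// A brief sketch of the "largest offset wins" behavior of `set`:
+//
+//     let mut offsets = RareByteOffsets::empty();
+//     offsets.set(b'z', RareByteOffset::new(3).unwrap());
+//     offsets.set(b'z', RareByteOffset::new(1).unwrap());
+//     assert_eq!(3, offsets.set[usize::from(b'z')].max); // the max is kept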
+
+impl core::fmt::Debug for RareByteOffsets {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ let mut offsets = vec![];
+ for off in self.set.iter() {
+ if off.max > 0 {
+ offsets.push(off);
+ }
+ }
+ f.debug_struct("RareByteOffsets").field("set", &offsets).finish()
+ }
+}
+
+/// Offsets associated with an occurrence of a "rare" byte in any of the
+/// patterns used to construct a single Aho-Corasick automaton.
+#[derive(Clone, Copy, Debug)]
+struct RareByteOffset {
+ /// The maximum offset at which a particular byte occurs from the start
+ /// of any pattern. This is used as a shift amount. That is, when an
+ /// occurrence of this byte is found, the candidate position reported by
+ /// the prefilter is `position_of_byte - max`, such that the automaton
+ /// will begin its search at a position that is guaranteed to observe a
+ /// match.
+ ///
+ /// To avoid accidentally quadratic behavior, a prefilter is considered
+ /// ineffective when it is asked to start scanning from a position that it
+ /// has already scanned past.
+ ///
+ /// Using a `u8` here means that if we ever see a pattern that's longer
+ /// than 255 bytes, then the entire rare byte prefilter is disabled.
+ max: u8,
+}
+
+impl Default for RareByteOffset {
+ fn default() -> RareByteOffset {
+ RareByteOffset { max: 0 }
+ }
+}
+
+impl RareByteOffset {
+ /// Create a new rare byte offset. If the given offset is too big, then
+ /// None is returned. In that case, callers should render the rare bytes
+ /// prefilter inert.
+ fn new(max: usize) -> Option<RareByteOffset> {
+ if max > u8::MAX as usize {
+ None
+ } else {
+ Some(RareByteOffset { max: max as u8 })
+ }
+ }
+}
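+
+// For example, offsets up to 255 are representable, while anything larger
+// signals that the rare byte prefilter should be disabled:
+//
+//     assert!(RareByteOffset::new(255).is_some());
+//     assert!(RareByteOffset::new(256).is_none());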
+
+impl RareBytesBuilder {
+ /// Create a new builder for constructing a rare byte prefilter.
+ fn new() -> RareBytesBuilder {
+ RareBytesBuilder {
+ ascii_case_insensitive: false,
+ rare_set: ByteSet::empty(),
+ byte_offsets: RareByteOffsets::empty(),
+ available: true,
+ count: 0,
+ rank_sum: 0,
+ }
+ }
+
+ /// Enable ASCII case insensitivity. When set, byte strings added to this
+ /// builder will be interpreted without respect to ASCII case.
+ fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
+ self.ascii_case_insensitive = yes;
+ self
+ }
+
+ /// Build the rare bytes prefilter.
+ ///
+ /// If there are more than 3 distinct rare bytes found, or if heuristics
+ /// otherwise determine that this prefilter should not be used, then `None`
+ /// is returned.
+ fn build(&self) -> Option<Prefilter> {
+ #[cfg(feature = "perf-literal")]
+ fn imp(builder: &RareBytesBuilder) -> Option<Prefilter> {
+ if !builder.available || builder.count > 3 {
+ return None;
+ }
+ let (mut bytes, mut len) = ([0; 3], 0);
+ for b in 0..=255 {
+ if builder.rare_set.contains(b) {
+ bytes[len] = b as u8;
+ len += 1;
+ }
+ }
+ let finder: Arc<dyn PrefilterI> = match len {
+ 0 => return None,
+ 1 => Arc::new(RareBytesOne {
+ byte1: bytes[0],
+ offset: builder.byte_offsets.set[bytes[0] as usize],
+ }),
+ 2 => Arc::new(RareBytesTwo {
+ offsets: builder.byte_offsets,
+ byte1: bytes[0],
+ byte2: bytes[1],
+ }),
+ 3 => Arc::new(RareBytesThree {
+ offsets: builder.byte_offsets,
+ byte1: bytes[0],
+ byte2: bytes[1],
+ byte3: bytes[2],
+ }),
+ _ => unreachable!(),
+ };
+ Some(Prefilter { finder, memory_usage: 0 })
+ }
+
+ #[cfg(not(feature = "perf-literal"))]
+ fn imp(_: &RareBytesBuilder) -> Option<Prefilter> {
+ None
+ }
+
+ imp(self)
+ }
+
+ /// Add a byte string to this builder.
+ ///
+ /// All patterns added to an Aho-Corasick automaton should be added to this
+ /// builder before attempting to construct the prefilter.
+ fn add(&mut self, bytes: &[u8]) {
+ // If we've already given up, then do nothing.
+ if !self.available {
+ return;
+ }
+ // If we've already blown our budget, then don't waste time looking
+ // for more rare bytes.
+ if self.count > 3 {
+ self.available = false;
+ return;
+ }
+ // If the pattern is too long, then our offset table is bunk, so
+ // give up.
+ if bytes.len() >= 256 {
+ self.available = false;
+ return;
+ }
+ let mut rarest = match bytes.get(0) {
+ None => return,
+ Some(&b) => (b, freq_rank(b)),
+ };
+ // The idea here is to look for the rarest byte in each pattern, and
+ // add that to our set. As a special exception, if we see a byte that
+ // we've already added, then we immediately stop and choose that byte,
+ // even if there's another rare byte in the pattern. This helps us
+ // apply the rare byte optimization in more cases by attempting to pick
+ // bytes that are in common between patterns. So for example, if we
+ // were searching for `Sherlock` and `lockjaw`, then this would pick
+ // `k` for both patterns, resulting in the use of `memchr` instead of
+ // `memchr2` for `k` and `j`.
+ let mut found = false;
+ for (pos, &b) in bytes.iter().enumerate() {
+ self.set_offset(pos, b);
+ if found {
+ continue;
+ }
+ if self.rare_set.contains(b) {
+ found = true;
+ continue;
+ }
+ let rank = freq_rank(b);
+ if rank < rarest.1 {
+ rarest = (b, rank);
+ }
+ }
+ if !found {
+ self.add_rare_byte(rarest.0);
+ }
+ }
+
+ fn set_offset(&mut self, pos: usize, byte: u8) {
+ // This unwrap is OK because `add` rejects patterns of 256 bytes or
+ // more, so `pos` always fits in a `u8`.
+ let offset = RareByteOffset::new(pos).unwrap();
+ self.byte_offsets.set(byte, offset);
+ if self.ascii_case_insensitive {
+ self.byte_offsets.set(opposite_ascii_case(byte), offset);
+ }
+ }
+
+ fn add_rare_byte(&mut self, byte: u8) {
+ self.add_one_rare_byte(byte);
+ if self.ascii_case_insensitive {
+ self.add_one_rare_byte(opposite_ascii_case(byte));
+ }
+ }
+
+ fn add_one_rare_byte(&mut self, byte: u8) {
+ if !self.rare_set.contains(byte) {
+ self.rare_set.add(byte);
+ self.count += 1;
+ self.rank_sum += freq_rank(byte) as u16;
+ }
+ }
+}
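+
+// A sketch of the selection strategy described in `add`, reusing its
+// `Sherlock`/`lockjaw` example: `k` is chosen as the rare byte for the first
+// pattern and is then re-used when it is seen in the second, so only one
+// rare byte ends up in the set and the built prefilter can use `memchr`
+// instead of `memchr2`.
+//
+//     let mut b = RareBytesBuilder::new();
+//     b.add(b"Sherlock");
+//     b.add(b"lockjaw");
+//     assert_eq!(1, b.count);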
+
+/// A prefilter for scanning for a single "rare" byte.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct RareBytesOne {
+ byte1: u8,
+ offset: RareByteOffset,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for RareBytesOne {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr(self.byte1, &haystack[span])
+ .map(|i| {
+ let pos = span.start + i;
+ cmp::max(
+ span.start,
+ pos.saturating_sub(usize::from(self.offset.max)),
+ )
+ })
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
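+
+// A worked example of the shift above: if the rare byte is `k` and its
+// maximum recorded offset is 7 (as for `Sherlock`, where `k` is the 8th
+// byte), then finding `k` at haystack position 20 within a span starting at
+// 0 reports the candidate position 20 - 7 = 13, the earliest point at which
+// a match containing that `k` could begin. The `cmp::max` with `span.start`
+// merely prevents the shift from escaping the caller's span.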
+
+/// A prefilter for scanning for two "rare" bytes.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct RareBytesTwo {
+ offsets: RareByteOffsets,
+ byte1: u8,
+ byte2: u8,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for RareBytesTwo {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr2(self.byte1, self.byte2, &haystack[span])
+ .map(|i| {
+ let pos = span.start + i;
+ let offset = self.offsets.set[usize::from(haystack[pos])].max;
+ cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
+ })
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
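+
+// Unlike the single-byte case, the shift here depends on which rare byte was
+// actually found. With hypothetical offsets of 7 for `byte1` and 4 for
+// `byte2`, a hit on `byte2` at position 20 reports candidate 20 - 4 = 16,
+// while a hit on `byte1` at the same position would report 20 - 7 = 13.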
+
+/// A prefilter for scanning for three "rare" bytes.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct RareBytesThree {
+ offsets: RareByteOffsets,
+ byte1: u8,
+ byte2: u8,
+ byte3: u8,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for RareBytesThree {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
+ .map(|i| {
+ let pos = span.start + i;
+ let offset = self.offsets.set[usize::from(haystack[pos])].max;
+ cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
+ })
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
+
+/// A builder for constructing a starting byte prefilter.
+///
+/// A starting byte prefilter is a simplistic prefilter that looks for possible
+/// matches by reporting all positions corresponding to a particular byte. This
+/// generally only takes effect when there are at most 3 distinct possible
+/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
+/// distinct starting bytes (`f` and `b`), and this prefilter returns all
+/// occurrences of either `f` or `b`.
+///
+/// In some cases, a heuristic frequency analysis may determine that it would
+/// be better not to use this prefilter even when there are 3 or fewer distinct
+/// starting bytes.
+#[derive(Clone, Debug)]
+struct StartBytesBuilder {
+ /// Whether this prefilter should account for ASCII case insensitivity or
+ /// not.
+ ascii_case_insensitive: bool,
+ /// The set of starting bytes observed.
+ byteset: Vec<bool>,
+ /// The number of bytes set to true in `byteset`.
+ count: usize,
+ /// The sum of frequency ranks for the rare bytes detected. This is
+ /// intended to give a heuristic notion of how rare the bytes are.
+ rank_sum: u16,
+}
+
+impl StartBytesBuilder {
+ /// Create a new builder for constructing a start byte prefilter.
+ fn new() -> StartBytesBuilder {
+ StartBytesBuilder {
+ ascii_case_insensitive: false,
+ byteset: vec![false; 256],
+ count: 0,
+ rank_sum: 0,
+ }
+ }
+
+ /// Enable ASCII case insensitivity. When set, byte strings added to this
+ /// builder will be interpreted without respect to ASCII case.
+ fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
+ self.ascii_case_insensitive = yes;
+ self
+ }
+
+ /// Build the starting bytes prefilter.
+ ///
+ /// If there are more than 3 distinct starting bytes, or if heuristics
+ /// otherwise determine that this prefilter should not be used, then `None`
+ /// is returned.
+ fn build(&self) -> Option<Prefilter> {
+ #[cfg(feature = "perf-literal")]
+ fn imp(builder: &StartBytesBuilder) -> Option<Prefilter> {
+ if builder.count > 3 {
+ return None;
+ }
+ let (mut bytes, mut len) = ([0; 3], 0);
+ for b in 0..256 {
+ if !builder.byteset[b] {
+ continue;
+ }
+ // We don't handle non-ASCII bytes for now. Getting non-ASCII
+ // bytes right is trickier, since we generally don't want to put
+ // a leading UTF-8 code unit (which is never ASCII) into a prefilter,
+ // since such bytes can occur frequently in a haystack. Instead, it
+ // would be better to use a continuation byte, but this requires more
+ // sophisticated analysis of the automaton and a richer prefilter API.
+ if b > 0x7F {
+ return None;
+ }
+ bytes[len] = b as u8;
+ len += 1;
+ }
+ let finder: Arc<dyn PrefilterI> = match len {
+ 0 => return None,
+ 1 => Arc::new(StartBytesOne { byte1: bytes[0] }),
+ 2 => Arc::new(StartBytesTwo {
+ byte1: bytes[0],
+ byte2: bytes[1],
+ }),
+ 3 => Arc::new(StartBytesThree {
+ byte1: bytes[0],
+ byte2: bytes[1],
+ byte3: bytes[2],
+ }),
+ _ => unreachable!(),
+ };
+ Some(Prefilter { finder, memory_usage: 0 })
+ }
+
+ #[cfg(not(feature = "perf-literal"))]
+ fn imp(_: &StartBytesBuilder) -> Option<Prefilter> {
+ None
+ }
+
+ imp(self)
+ }
+
+ /// Add a byte string to this builder.
+ ///
+ /// All patterns added to an Aho-Corasick automaton should be added to this
+ /// builder before attempting to construct the prefilter.
+ fn add(&mut self, bytes: &[u8]) {
+ if self.count > 3 {
+ return;
+ }
+ if let Some(&byte) = bytes.get(0) {
+ self.add_one_byte(byte);
+ if self.ascii_case_insensitive {
+ self.add_one_byte(opposite_ascii_case(byte));
+ }
+ }
+ }
+
+ fn add_one_byte(&mut self, byte: u8) {
+ if !self.byteset[byte as usize] {
+ self.byteset[byte as usize] = true;
+ self.count += 1;
+ self.rank_sum += freq_rank(byte) as u16;
+ }
+ }
+}
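+
+// A sketch of the `foo`/`bar`/`baz` example from the type-level docs: the
+// three patterns contribute only two distinct starting bytes, so (heuristics
+// permitting) the built prefilter scans for either `f` or `b`.
+//
+//     let mut b = StartBytesBuilder::new();
+//     b.add(b"foo");
+//     b.add(b"bar");
+//     b.add(b"baz");
+//     assert_eq!(2, b.count);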
+
+/// A prefilter for scanning for a single starting byte.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct StartBytesOne {
+ byte1: u8,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for StartBytesOne {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr(self.byte1, &haystack[span])
+ .map(|i| span.start + i)
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
+
+/// A prefilter for scanning for two starting bytes.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct StartBytesTwo {
+ byte1: u8,
+ byte2: u8,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for StartBytesTwo {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr2(self.byte1, self.byte2, &haystack[span])
+ .map(|i| span.start + i)
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
+
+/// A prefilter for scanning for three starting bytes.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct StartBytesThree {
+ byte1: u8,
+ byte2: u8,
+ byte3: u8,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for StartBytesThree {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
+ .map(|i| span.start + i)
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
+
+/// If the given byte is an ASCII letter, then return it in the opposite case.
+/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
+/// `b'A'`. If the given byte is not an ASCII letter, it is returned unchanged.
+pub(crate) fn opposite_ascii_case(b: u8) -> u8 {
+ if b'A' <= b && b <= b'Z' {
+ b.to_ascii_lowercase()
+ } else if b'a' <= b && b <= b'z' {
+ b.to_ascii_uppercase()
+ } else {
+ b
+ }
+}
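+
+// For example:
+//
+//     assert_eq!(b'a', opposite_ascii_case(b'A'));
+//     assert_eq!(b'A', opposite_ascii_case(b'a'));
+//     assert_eq!(b'3', opposite_ascii_case(b'3')); // not a letter: unchanged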
+
+/// Return the frequency rank of the given byte. The higher the rank, the more
+/// common the byte (heuristically speaking).
+fn freq_rank(b: u8) -> u8 {
+ use crate::util::byte_frequencies::BYTE_FREQUENCIES;
+ BYTE_FREQUENCIES[b as usize]
+}