summaryrefslogtreecommitdiffstats
path: root/third_party/rust/aho-corasick/src/packed/api.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/aho-corasick/src/packed/api.rs')
-rw-r--r--third_party/rust/aho-corasick/src/packed/api.rs687
1 files changed, 687 insertions, 0 deletions
diff --git a/third_party/rust/aho-corasick/src/packed/api.rs b/third_party/rust/aho-corasick/src/packed/api.rs
new file mode 100644
index 0000000000..44f0bc9be3
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/packed/api.rs
@@ -0,0 +1,687 @@
+use alloc::sync::Arc;
+
+use crate::{
+ packed::{pattern::Patterns, rabinkarp::RabinKarp, teddy},
+ util::search::{Match, Span},
+};
+
+/// This is a limit placed on the total number of patterns we're willing to try
+/// and match at once. As more sophisticated algorithms are added, this number
+/// may be increased.
+const PATTERN_LIMIT: usize = 128;
+
+/// A knob for controlling the match semantics of a packed multiple string
+/// searcher.
+///
+/// This differs from the [`MatchKind`](crate::MatchKind) type in the top-level
+/// crate module in that it doesn't support "standard" match semantics,
+/// and instead only supports leftmost-first or leftmost-longest. Namely,
+/// "standard" semantics cannot be easily supported by packed searchers.
+///
+/// For more information on the distinction between leftmost-first and
+/// leftmost-longest, see the docs on the top-level `MatchKind` type.
+///
+/// Unlike the top-level `MatchKind` type, the default match semantics for this
+/// type are leftmost-first.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+#[non_exhaustive]
+pub enum MatchKind {
+ /// Use leftmost-first match semantics, which reports leftmost matches.
+ /// When there are multiple possible leftmost matches, the match
+ /// corresponding to the pattern that appeared earlier when constructing
+ /// the automaton is reported.
+ ///
+ /// This is the default.
+ LeftmostFirst,
+ /// Use leftmost-longest match semantics, which reports leftmost matches.
+ /// When there are multiple possible leftmost matches, the longest match
+ /// is chosen.
+ LeftmostLongest,
+}
+
+impl Default for MatchKind {
+ fn default() -> MatchKind {
+ MatchKind::LeftmostFirst
+ }
+}
+
+/// The configuration for a packed multiple pattern searcher.
+///
+/// The configuration is currently limited only to being able to select the
+/// match semantics (leftmost-first or leftmost-longest) of a searcher. In the
+/// future, more knobs may be made available.
+///
+/// A configuration produces a [`packed::Builder`](Builder), which in turn can
+/// be used to construct a [`packed::Searcher`](Searcher) for searching.
+///
+/// # Example
+///
+/// This example shows how to use leftmost-longest semantics instead of the
+/// default (leftmost-first).
+///
+/// ```
+/// use aho_corasick::{packed::{Config, MatchKind}, PatternID};
+///
+/// # fn example() -> Option<()> {
+/// let searcher = Config::new()
+/// .match_kind(MatchKind::LeftmostLongest)
+/// .builder()
+/// .add("foo")
+/// .add("foobar")
+/// .build()?;
+/// let matches: Vec<PatternID> = searcher
+/// .find_iter("foobar")
+/// .map(|mat| mat.pattern())
+/// .collect();
+/// assert_eq!(vec![PatternID::must(1)], matches);
+/// # Some(()) }
+/// # if cfg!(all(feature = "std", any(
+/// # target_arch = "x86_64", target_arch = "aarch64",
+/// # ))) {
+/// # example().unwrap()
+/// # } else {
+/// # assert!(example().is_none());
+/// # }
+/// ```
+#[derive(Clone, Debug)]
+pub struct Config {
+ kind: MatchKind,
+ force: Option<ForceAlgorithm>,
+ only_teddy_fat: Option<bool>,
+ only_teddy_256bit: Option<bool>,
+ heuristic_pattern_limits: bool,
+}
+
+/// An internal option for forcing the use of a particular packed algorithm.
+///
+/// When an algorithm is forced, if a searcher could not be constructed for it,
+/// then no searcher will be returned even if an alternative algorithm would
+/// work.
+#[derive(Clone, Debug)]
+enum ForceAlgorithm {
+ Teddy,
+ RabinKarp,
+}
+
+impl Default for Config {
+ fn default() -> Config {
+ Config::new()
+ }
+}
+
+impl Config {
+ /// Create a new default configuration. A default configuration uses
+ /// leftmost-first match semantics.
+ pub fn new() -> Config {
+ Config {
+ kind: MatchKind::LeftmostFirst,
+ force: None,
+ only_teddy_fat: None,
+ only_teddy_256bit: None,
+ heuristic_pattern_limits: true,
+ }
+ }
+
+ /// Create a packed builder from this configuration. The builder can be
+ /// used to accumulate patterns and create a [`Searcher`] from them.
+ pub fn builder(&self) -> Builder {
+ Builder::from_config(self.clone())
+ }
+
+ /// Set the match semantics for this configuration.
+ pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
+ self.kind = kind;
+ self
+ }
+
+ /// An undocumented method for forcing the use of the Teddy algorithm.
+ ///
+ /// This is only exposed for more precise testing and benchmarks. Callers
+ /// should not use it as it is not part of the API stability guarantees of
+ /// this crate.
+ #[doc(hidden)]
+ pub fn only_teddy(&mut self, yes: bool) -> &mut Config {
+ if yes {
+ self.force = Some(ForceAlgorithm::Teddy);
+ } else {
+ self.force = None;
+ }
+ self
+ }
+
+ /// An undocumented method for forcing the use of the Fat Teddy algorithm.
+ ///
+ /// This is only exposed for more precise testing and benchmarks. Callers
+ /// should not use it as it is not part of the API stability guarantees of
+ /// this crate.
+ #[doc(hidden)]
+ pub fn only_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config {
+ self.only_teddy_fat = yes;
+ self
+ }
+
+ /// An undocumented method for forcing the use of SSE (`Some(false)`) or
+ /// AVX (`Some(true)`) algorithms.
+ ///
+ /// This is only exposed for more precise testing and benchmarks. Callers
+ /// should not use it as it is not part of the API stability guarantees of
+ /// this crate.
+ #[doc(hidden)]
+ pub fn only_teddy_256bit(&mut self, yes: Option<bool>) -> &mut Config {
+ self.only_teddy_256bit = yes;
+ self
+ }
+
+ /// An undocumented method for forcing the use of the Rabin-Karp algorithm.
+ ///
+ /// This is only exposed for more precise testing and benchmarks. Callers
+ /// should not use it as it is not part of the API stability guarantees of
+ /// this crate.
+ #[doc(hidden)]
+ pub fn only_rabin_karp(&mut self, yes: bool) -> &mut Config {
+ if yes {
+ self.force = Some(ForceAlgorithm::RabinKarp);
+ } else {
+ self.force = None;
+ }
+ self
+ }
+
+ /// Request that heuristic limitations on the number of patterns be
+ /// employed. This useful to disable for benchmarking where one wants to
+ /// explore how Teddy performs on large number of patterns even if the
+ /// heuristics would otherwise refuse construction.
+ ///
+ /// This is enabled by default.
+ pub fn heuristic_pattern_limits(&mut self, yes: bool) -> &mut Config {
+ self.heuristic_pattern_limits = yes;
+ self
+ }
+}
+
+/// A builder for constructing a packed searcher from a collection of patterns.
+///
+/// # Example
+///
+/// This example shows how to use a builder to construct a searcher. By
+/// default, leftmost-first match semantics are used.
+///
+/// ```
+/// use aho_corasick::{packed::{Builder, MatchKind}, PatternID};
+///
+/// # fn example() -> Option<()> {
+/// let searcher = Builder::new()
+/// .add("foobar")
+/// .add("foo")
+/// .build()?;
+/// let matches: Vec<PatternID> = searcher
+/// .find_iter("foobar")
+/// .map(|mat| mat.pattern())
+/// .collect();
+/// assert_eq!(vec![PatternID::ZERO], matches);
+/// # Some(()) }
+/// # if cfg!(all(feature = "std", any(
+/// # target_arch = "x86_64", target_arch = "aarch64",
+/// # ))) {
+/// # example().unwrap()
+/// # } else {
+/// # assert!(example().is_none());
+/// # }
+/// ```
+#[derive(Clone, Debug)]
+pub struct Builder {
+ /// The configuration of this builder and subsequent matcher.
+ config: Config,
+ /// Set to true if the builder detects that a matcher cannot be built.
+ inert: bool,
+ /// The patterns provided by the caller.
+ patterns: Patterns,
+}
+
+impl Builder {
+ /// Create a new builder for constructing a multi-pattern searcher. This
+ /// constructor uses the default configuration.
+ pub fn new() -> Builder {
+ Builder::from_config(Config::new())
+ }
+
+ fn from_config(config: Config) -> Builder {
+ Builder { config, inert: false, patterns: Patterns::new() }
+ }
+
+ /// Build a searcher from the patterns added to this builder so far.
+ pub fn build(&self) -> Option<Searcher> {
+ if self.inert || self.patterns.is_empty() {
+ return None;
+ }
+ let mut patterns = self.patterns.clone();
+ patterns.set_match_kind(self.config.kind);
+ let patterns = Arc::new(patterns);
+ let rabinkarp = RabinKarp::new(&patterns);
+ // Effectively, we only want to return a searcher if we can use Teddy,
+ // since Teddy is our only fast packed searcher at the moment.
+ // Rabin-Karp is only used when searching haystacks smaller than what
+ // Teddy can support. Thus, the only way to get a Rabin-Karp searcher
+ // is to force it using undocumented APIs (for tests/benchmarks).
+ let (search_kind, minimum_len) = match self.config.force {
+ None | Some(ForceAlgorithm::Teddy) => {
+ debug!("trying to build Teddy packed matcher");
+ let teddy = match self.build_teddy(Arc::clone(&patterns)) {
+ None => return None,
+ Some(teddy) => teddy,
+ };
+ let minimum_len = teddy.minimum_len();
+ (SearchKind::Teddy(teddy), minimum_len)
+ }
+ Some(ForceAlgorithm::RabinKarp) => {
+ debug!("using Rabin-Karp packed matcher");
+ (SearchKind::RabinKarp, 0)
+ }
+ };
+ Some(Searcher { patterns, rabinkarp, search_kind, minimum_len })
+ }
+
+ fn build_teddy(&self, patterns: Arc<Patterns>) -> Option<teddy::Searcher> {
+ teddy::Builder::new()
+ .only_256bit(self.config.only_teddy_256bit)
+ .only_fat(self.config.only_teddy_fat)
+ .heuristic_pattern_limits(self.config.heuristic_pattern_limits)
+ .build(patterns)
+ }
+
+ /// Add the given pattern to this set to match.
+ ///
+ /// The order in which patterns are added is significant. Namely, when
+ /// using leftmost-first match semantics, then when multiple patterns can
+ /// match at a particular location, the pattern that was added first is
+ /// used as the match.
+ ///
+ /// If the number of patterns added exceeds the amount supported by packed
+ /// searchers, then the builder will stop accumulating patterns and render
+ /// itself inert. At this point, constructing a searcher will always return
+ /// `None`.
+ pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder {
+ if self.inert {
+ return self;
+ } else if self.patterns.len() >= PATTERN_LIMIT {
+ self.inert = true;
+ self.patterns.reset();
+ return self;
+ }
+ // Just in case PATTERN_LIMIT increases beyond u16::MAX.
+ assert!(self.patterns.len() <= core::u16::MAX as usize);
+
+ let pattern = pattern.as_ref();
+ if pattern.is_empty() {
+ self.inert = true;
+ self.patterns.reset();
+ return self;
+ }
+ self.patterns.add(pattern);
+ self
+ }
+
+ /// Add the given iterator of patterns to this set to match.
+ ///
+ /// The iterator must yield elements that can be converted into a `&[u8]`.
+ ///
+ /// The order in which patterns are added is significant. Namely, when
+ /// using leftmost-first match semantics, then when multiple patterns can
+ /// match at a particular location, the pattern that was added first is
+ /// used as the match.
+ ///
+ /// If the number of patterns added exceeds the amount supported by packed
+ /// searchers, then the builder will stop accumulating patterns and render
+ /// itself inert. At this point, constructing a searcher will always return
+ /// `None`.
+ pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ for p in patterns {
+ self.add(p);
+ }
+ self
+ }
+
+ /// Returns the number of patterns added to this builder.
+ pub fn len(&self) -> usize {
+ self.patterns.len()
+ }
+
+ /// Returns the length, in bytes, of the shortest pattern added.
+ pub fn minimum_len(&self) -> usize {
+ self.patterns.minimum_len()
+ }
+}
+
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder::new()
+ }
+}
+
+/// A packed searcher for quickly finding occurrences of multiple patterns.
+///
+/// If callers need more flexible construction, or if one wants to change the
+/// match semantics (either leftmost-first or leftmost-longest), then one can
+/// use the [`Config`] and/or [`Builder`] types for more fine grained control.
+///
+/// # Example
+///
+/// This example shows how to create a searcher from an iterator of patterns.
+/// By default, leftmost-first match semantics are used.
+///
+/// ```
+/// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
+///
+/// # fn example() -> Option<()> {
+/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+/// let matches: Vec<PatternID> = searcher
+/// .find_iter("foobar")
+/// .map(|mat| mat.pattern())
+/// .collect();
+/// assert_eq!(vec![PatternID::ZERO], matches);
+/// # Some(()) }
+/// # if cfg!(all(feature = "std", any(
+/// # target_arch = "x86_64", target_arch = "aarch64",
+/// # ))) {
+/// # example().unwrap()
+/// # } else {
+/// # assert!(example().is_none());
+/// # }
+/// ```
+#[derive(Clone, Debug)]
+pub struct Searcher {
+ patterns: Arc<Patterns>,
+ rabinkarp: RabinKarp,
+ search_kind: SearchKind,
+ minimum_len: usize,
+}
+
+#[derive(Clone, Debug)]
+enum SearchKind {
+ Teddy(teddy::Searcher),
+ RabinKarp,
+}
+
+impl Searcher {
+ /// A convenience function for constructing a searcher from an iterator
+ /// of things that can be converted to a `&[u8]`.
+ ///
+ /// If a searcher could not be constructed (either because of an
+ /// unsupported CPU or because there are too many patterns), then `None`
+ /// is returned.
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// let matches: Vec<PatternID> = searcher
+ /// .find_iter("foobar")
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![PatternID::ZERO], matches);
+ /// # Some(()) }
+ /// # if cfg!(all(feature = "std", any(
+ /// # target_arch = "x86_64", target_arch = "aarch64",
+ /// # ))) {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ pub fn new<I, P>(patterns: I) -> Option<Searcher>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ Builder::new().extend(patterns).build()
+ }
+
+ /// A convenience function for calling `Config::new()`.
+ ///
+ /// This is useful for avoiding an additional import.
+ pub fn config() -> Config {
+ Config::new()
+ }
+
+ /// A convenience function for calling `Builder::new()`.
+ ///
+ /// This is useful for avoiding an additional import.
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+
+ /// Return the first occurrence of any of the patterns in this searcher,
+ /// according to its match semantics, in the given haystack. The `Match`
+ /// returned will include the identifier of the pattern that matched, which
+ /// corresponds to the index of the pattern (starting from `0`) in which it
+ /// was added.
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// let mat = searcher.find("foobar")?;
+ /// assert_eq!(PatternID::ZERO, mat.pattern());
+ /// assert_eq!(0, mat.start());
+ /// assert_eq!(6, mat.end());
+ /// # Some(()) }
+ /// # if cfg!(all(feature = "std", any(
+ /// # target_arch = "x86_64", target_arch = "aarch64",
+ /// # ))) {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ #[inline]
+ pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
+ let haystack = haystack.as_ref();
+ self.find_in(haystack, Span::from(0..haystack.len()))
+ }
+
+ /// Return the first occurrence of any of the patterns in this searcher,
+ /// according to its match semantics, in the given haystack starting from
+ /// the given position.
+ ///
+ /// The `Match` returned will include the identifier of the pattern that
+ /// matched, which corresponds to the index of the pattern (starting from
+ /// `0`) in which it was added. The offsets in the `Match` will be relative
+ /// to the start of `haystack` (and not `at`).
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID, Span};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let haystack = "foofoobar";
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// let mat = searcher.find_in(haystack, Span::from(3..haystack.len()))?;
+ /// assert_eq!(PatternID::ZERO, mat.pattern());
+ /// assert_eq!(3, mat.start());
+ /// assert_eq!(9, mat.end());
+ /// # Some(()) }
+ /// # if cfg!(all(feature = "std", any(
+ /// # target_arch = "x86_64", target_arch = "aarch64",
+ /// # ))) {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ #[inline]
+ pub fn find_in<B: AsRef<[u8]>>(
+ &self,
+ haystack: B,
+ span: Span,
+ ) -> Option<Match> {
+ let haystack = haystack.as_ref();
+ match self.search_kind {
+ SearchKind::Teddy(ref teddy) => {
+ if haystack[span].len() < teddy.minimum_len() {
+ return self.find_in_slow(haystack, span);
+ }
+ teddy.find(&haystack[..span.end], span.start)
+ }
+ SearchKind::RabinKarp => {
+ self.rabinkarp.find_at(&haystack[..span.end], span.start)
+ }
+ }
+ }
+
+ /// Return an iterator of non-overlapping occurrences of the patterns in
+ /// this searcher, according to its match semantics, in the given haystack.
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// let matches: Vec<PatternID> = searcher
+ /// .find_iter("foobar fooba foofoo")
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![
+ /// PatternID::must(0),
+ /// PatternID::must(1),
+ /// PatternID::must(1),
+ /// PatternID::must(1),
+ /// ], matches);
+ /// # Some(()) }
+ /// # if cfg!(all(feature = "std", any(
+ /// # target_arch = "x86_64", target_arch = "aarch64",
+ /// # ))) {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ #[inline]
+ pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
+ &'a self,
+ haystack: &'b B,
+ ) -> FindIter<'a, 'b> {
+ let haystack = haystack.as_ref();
+ let span = Span::from(0..haystack.len());
+ FindIter { searcher: self, haystack, span }
+ }
+
+ /// Returns the match kind used by this packed searcher.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::packed::{MatchKind, Searcher};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// // leftmost-first is the default.
+ /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind());
+ /// # Some(()) }
+ /// # if cfg!(all(feature = "std", any(
+ /// # target_arch = "x86_64", target_arch = "aarch64",
+ /// # ))) {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ #[inline]
+ pub fn match_kind(&self) -> &MatchKind {
+ self.patterns.match_kind()
+ }
+
+ /// Returns the minimum length of a haystack that is required in order for
+ /// packed searching to be effective.
+ ///
+ /// In some cases, the underlying packed searcher may not be able to search
+ /// very short haystacks. When that occurs, the implementation will defer
+ /// to a slower non-packed searcher (which is still generally faster than
+ /// Aho-Corasick for a small number of patterns). However, callers may
+ /// want to avoid ever using the slower variant, which one can do by
+ /// never passing a haystack shorter than the minimum length returned by
+ /// this method.
+ #[inline]
+ pub fn minimum_len(&self) -> usize {
+ self.minimum_len
+ }
+
+ /// Returns the approximate total amount of heap used by this searcher, in
+ /// units of bytes.
+ #[inline]
+ pub fn memory_usage(&self) -> usize {
+ self.patterns.memory_usage()
+ + self.rabinkarp.memory_usage()
+ + self.search_kind.memory_usage()
+ }
+
+ /// Use a slow (non-packed) searcher.
+ ///
+ /// This is useful when a packed searcher could be constructed, but could
+ /// not be used to search a specific haystack. For example, if Teddy was
+ /// built but the haystack is smaller than ~34 bytes, then Teddy might not
+ /// be able to run.
+ fn find_in_slow(&self, haystack: &[u8], span: Span) -> Option<Match> {
+ self.rabinkarp.find_at(&haystack[..span.end], span.start)
+ }
+}
+
+impl SearchKind {
+ fn memory_usage(&self) -> usize {
+ match *self {
+ SearchKind::Teddy(ref ted) => ted.memory_usage(),
+ SearchKind::RabinKarp => 0,
+ }
+ }
+}
+
+/// An iterator over non-overlapping matches from a packed searcher.
+///
+/// The lifetime `'s` refers to the lifetime of the underlying [`Searcher`],
+/// while the lifetime `'h` refers to the lifetime of the haystack being
+/// searched.
+#[derive(Debug)]
+pub struct FindIter<'s, 'h> {
+ searcher: &'s Searcher,
+ haystack: &'h [u8],
+ span: Span,
+}
+
+impl<'s, 'h> Iterator for FindIter<'s, 'h> {
+ type Item = Match;
+
+ fn next(&mut self) -> Option<Match> {
+ if self.span.start > self.span.end {
+ return None;
+ }
+ match self.searcher.find_in(&self.haystack, self.span) {
+ None => None,
+ Some(m) => {
+ self.span.start = m.end();
+ Some(m)
+ }
+ }
+ }
+}