diff options
Diffstat (limited to 'third_party/rust/aho-corasick/src/lib.rs')
-rw-r--r-- | third_party/rust/aho-corasick/src/lib.rs | 326 |
1 files changed, 326 insertions, 0 deletions
diff --git a/third_party/rust/aho-corasick/src/lib.rs b/third_party/rust/aho-corasick/src/lib.rs new file mode 100644 index 0000000000..20e8b81115 --- /dev/null +++ b/third_party/rust/aho-corasick/src/lib.rs @@ -0,0 +1,326 @@ +/*! +A library for finding occurrences of many patterns at once. This library +provides multiple pattern search principally through an implementation of the +[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm), +which builds a fast finite state machine for executing searches in linear time. + +Additionally, this library provides a number of configuration options for +building the automaton that permit controlling the space versus time trade +off. Other features include simple ASCII case insensitive matching, finding +overlapping matches, replacements, searching streams and even searching and +replacing text in streams. + +Finally, unlike most other Aho-Corasick implementations, this one +supports enabling [leftmost-first](MatchKind::LeftmostFirst) or +[leftmost-longest](MatchKind::LeftmostLongest) match semantics, using a +(seemingly) novel alternative construction algorithm. For more details on what +match semantics means, see the [`MatchKind`] type. + +# Overview + +This section gives a brief overview of the primary types in this crate: + +* [`AhoCorasick`] is the primary type and represents an Aho-Corasick automaton. +This is the type you use to execute searches. +* [`AhoCorasickBuilder`] can be used to build an Aho-Corasick automaton, and +supports configuring a number of options. +* [`Match`] represents a single match reported by an Aho-Corasick automaton. +Each match has two pieces of information: the pattern that matched and the +start and end byte offsets corresponding to the position in the haystack at +which it matched. + +# Example: basic searching + +This example shows how to search for occurrences of multiple patterns +simultaneously. Each match includes the pattern that matched along with the +byte offsets of the match. + +``` +use aho_corasick::{AhoCorasick, PatternID}; + +let patterns = &["apple", "maple", "Snapple"]; +let haystack = "Nobody likes maple in their apple flavored Snapple."; + +let ac = AhoCorasick::new(patterns).unwrap(); +let mut matches = vec![]; +for mat in ac.find_iter(haystack) { + matches.push((mat.pattern(), mat.start(), mat.end())); +} +assert_eq!(matches, vec![ + (PatternID::must(1), 13, 18), + (PatternID::must(0), 28, 33), + (PatternID::must(2), 43, 50), +]); +``` + +# Example: case insensitivity + +This is like the previous example, but matches `Snapple` case insensitively +using `AhoCorasickBuilder`: + +``` +use aho_corasick::{AhoCorasick, PatternID}; + +let patterns = &["apple", "maple", "snapple"]; +let haystack = "Nobody likes maple in their apple flavored Snapple."; + +let ac = AhoCorasick::builder() + .ascii_case_insensitive(true) + .build(patterns) + .unwrap(); +let mut matches = vec![]; +for mat in ac.find_iter(haystack) { + matches.push((mat.pattern(), mat.start(), mat.end())); +} +assert_eq!(matches, vec![ + (PatternID::must(1), 13, 18), + (PatternID::must(0), 28, 33), + (PatternID::must(2), 43, 50), +]); +``` + +# Example: replacing matches in a stream + +This example shows how to execute a search and replace on a stream without +loading the entire stream into memory first. + +``` +# #[cfg(feature = "std")] { +use aho_corasick::AhoCorasick; + +# fn example() -> Result<(), std::io::Error> { +let patterns = &["fox", "brown", "quick"]; +let replace_with = &["sloth", "grey", "slow"]; + +// In a real example, these might be `std::fs::File`s instead. All you need to +// do is supply a pair of `std::io::Read` and `std::io::Write` implementations. +let rdr = "The quick brown fox."; +let mut wtr = vec![]; + +let ac = AhoCorasick::new(patterns).unwrap(); +ac.try_stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?; +assert_eq!(b"The slow grey sloth.".to_vec(), wtr); +# Ok(()) }; example().unwrap() +# } +``` + +# Example: finding the leftmost first match + +In the textbook description of Aho-Corasick, its formulation is typically +structured such that it reports all possible matches, even when they overlap +with another. In many cases, overlapping matches may not be desired, such as +the case of finding all successive non-overlapping matches like you might with +a standard regular expression. + +Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do +this doesn't always work in the expected way, since it will report matches as +soon as they are seen. For example, consider matching the regex `Samwise|Sam` +against the text `Samwise`. Most regex engines (that are Perl-like, or +non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick +algorithm modified for reporting non-overlapping matches will report `Sam`. + +A novel contribution of this library is the ability to change the match +semantics of Aho-Corasick (without additional search time overhead) such that +`Samwise` is reported instead. For example, here's the standard approach: + +``` +use aho_corasick::AhoCorasick; + +let patterns = &["Samwise", "Sam"]; +let haystack = "Samwise"; + +let ac = AhoCorasick::new(patterns).unwrap(); +let mat = ac.find(haystack).expect("should have a match"); +assert_eq!("Sam", &haystack[mat.start()..mat.end()]); +``` + +And now here's the leftmost-first version, which matches how a Perl-like +regex will work: + +``` +use aho_corasick::{AhoCorasick, MatchKind}; + +let patterns = &["Samwise", "Sam"]; +let haystack = "Samwise"; + +let ac = AhoCorasick::builder() + .match_kind(MatchKind::LeftmostFirst) + .build(patterns) + .unwrap(); +let mat = ac.find(haystack).expect("should have a match"); +assert_eq!("Samwise", &haystack[mat.start()..mat.end()]); +``` + +In addition to leftmost-first semantics, this library also supports +leftmost-longest semantics, which match the POSIX behavior of a regular +expression alternation. See [`MatchKind`] for more details. + +# Prefilters + +While an Aho-Corasick automaton can perform admirably when compared to more +naive solutions, it is generally slower than more specialized algorithms that +are accelerated using vector instructions such as SIMD. + +For that reason, this library will internally use a "prefilter" to attempt +to accelerate searches when possible. Currently, this library has several +different algorithms it might use depending on the patterns provided. Once the +number of patterns gets too big, prefilters are no longer used. + +While a prefilter is generally good to have on by default since it works +well in the common case, it can lead to less predictable or even sub-optimal +performance in some cases. For that reason, prefilters can be explicitly +disabled via [`AhoCorasickBuilder::prefilter`]. + +# Lower level APIs + +This crate also provides several sub-modules that collectively expose many of +the implementation details of the main [`AhoCorasick`] type. Most users of this +library can completely ignore the submodules and their contents, but if you +needed finer grained control, some parts of them may be useful to you. Here is +a brief overview of each and why you might want to use them: + +* The [`packed`] sub-module contains a lower level API for using fast +vectorized routines for finding a small number of patterns in a haystack. +You might want to use this API when you want to completely side-step using +Aho-Corasick automata. Otherwise, the fast vectorized routines are used +automatically as prefilters for `AhoCorasick` searches whenever possible. +* The [`automaton`] sub-module provides a lower level finite state +machine interface that the various Aho-Corasick implementations in +this crate implement. This sub-module's main contribution is the +[`Automaton`](automaton::Automaton) trait, which permits manually walking the +state transitions of an Aho-Corasick automaton. +* The [`dfa`] and [`nfa`] sub-modules provide DFA and NFA implementations of +the aforementioned `Automaton` trait. The main reason one might want to use +these sub-modules is to get access to a type that implements the `Automaton` +trait. (The top-level `AhoCorasick` type does not implement the `Automaton` +trait.) + +As mentioned above, if you aren't sure whether you need these sub-modules, +you should be able to safely ignore them and just focus on the [`AhoCorasick`] +type. + +# Crate features + +This crate exposes a few features for controlling dependency usage and whether +this crate can be used without the standard library. + +* **std** - + Enables support for the standard library. This feature is enabled by + default. When disabled, only `core` and `alloc` are used. At an API + level, enabling `std` enables `std::error::Error` trait impls for the + various error types, and higher level stream search routines such as + [`AhoCorasick::try_stream_find_iter`]. But the `std` feature is also required + to enable vectorized prefilters. Prefilters can greatly accelerate searches, + but generally only apply when the number of patterns is small (less than + ~100). +* **perf-literal** - + Enables support for literal prefilters that use vectorized routines from + external crates. This feature is enabled by default. If you're only using + Aho-Corasick for large numbers of patterns or otherwise can abide lower + throughput when searching with a small number of patterns, then it is + reasonable to disable this feature. +* **logging** - + Enables a dependency on the `log` crate and emits messages to aide in + diagnostics. This feature is disabled by default. +*/ + +#![no_std] +#![deny(missing_docs)] +#![deny(rustdoc::broken_intra_doc_links)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + +extern crate alloc; +#[cfg(any(test, feature = "std"))] +extern crate std; + +#[cfg(doctest)] +doc_comment::doctest!("../README.md"); + +#[cfg(feature = "std")] +pub use crate::ahocorasick::StreamFindIter; +pub use crate::{ + ahocorasick::{ + AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, FindIter, + FindOverlappingIter, + }, + util::{ + error::{BuildError, MatchError, MatchErrorKind}, + primitives::{PatternID, PatternIDError}, + search::{Anchored, Input, Match, MatchKind, Span, StartKind}, + }, +}; + +#[macro_use] +mod macros; + +mod ahocorasick; +pub mod automaton; +pub mod dfa; +pub mod nfa; +pub mod packed; +#[cfg(test)] +mod tests; +// I wrote out the module for implementing fst::Automaton only to later realize +// that this would make fst a public dependency and fst is not at 1.0 yet. I +// decided to just keep the code in tree, but build it only during tests. +// +// TODO: I think I've changed my mind again. I'm considering pushing it out +// into either a separate crate or into 'fst' directly as an optional feature. +// #[cfg(test)] +// #[allow(dead_code)] +// mod transducer; +pub(crate) mod util; + +#[cfg(test)] +mod testoibits { + use std::panic::{RefUnwindSafe, UnwindSafe}; + + use super::*; + + fn assert_all<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {} + + #[test] + fn oibits_main() { + assert_all::<AhoCorasick>(); + assert_all::<AhoCorasickBuilder>(); + assert_all::<AhoCorasickKind>(); + assert_all::<FindIter>(); + assert_all::<FindOverlappingIter>(); + + assert_all::<BuildError>(); + assert_all::<MatchError>(); + assert_all::<MatchErrorKind>(); + + assert_all::<Anchored>(); + assert_all::<Input>(); + assert_all::<Match>(); + assert_all::<MatchKind>(); + assert_all::<Span>(); + assert_all::<StartKind>(); + } + + #[test] + fn oibits_automaton() { + use crate::{automaton, dfa::DFA}; + + assert_all::<automaton::FindIter<DFA>>(); + assert_all::<automaton::FindOverlappingIter<DFA>>(); + #[cfg(feature = "std")] + assert_all::<automaton::StreamFindIter<DFA, std::io::Stdin>>(); + assert_all::<automaton::OverlappingState>(); + + assert_all::<automaton::Prefilter>(); + assert_all::<automaton::Candidate>(); + } + + #[test] + fn oibits_packed() { + use crate::packed; + + assert_all::<packed::Config>(); + assert_all::<packed::Builder>(); + assert_all::<packed::Searcher>(); + assert_all::<packed::FindIter>(); + assert_all::<packed::MatchKind>(); + } +} |