use crate::ahocorasick::MatchKind; use crate::prefilter::{self, Candidate, Prefilter, PrefilterState}; use crate::state_id::{dead_id, fail_id, StateID}; use crate::Match; // NOTE: This trait essentially started as a copy of the same trait from from // regex-automata, with some wording changed since we use this trait for // NFAs in addition to DFAs in this crate. Additionally, we do not export // this trait. It's only used internally to reduce code duplication. The // regex-automata crate needs to expose it because its Regex type is generic // over implementations of this trait. In this crate, we encapsulate everything // behind the AhoCorasick type. // // This trait is a bit of a mess, but it's not quite clear how to fix it. // Basically, there are several competing concerns: // // * We need performance, so everything effectively needs to get monomorphized. // * There are several variations on searching Aho-Corasick automatons: // overlapping, standard and leftmost. Overlapping and standard are somewhat // combined together below, but there is no real way to combine standard with // leftmost. Namely, leftmost requires continuing a search even after a match // is found, in order to correctly disambiguate a match. // * On top of that, *sometimes* callers want to know which state the automaton // is in after searching. This is principally useful for overlapping and // stream searches. However, when callers don't care about this, we really // do not want to be forced to compute it, since it sometimes requires extra // work. Thus, there are effectively two copies of leftmost searching: one // for tracking the state ID and one that doesn't. We should ideally do the // same for standard searching, but my sanity stopped me. // SAFETY RATIONALE: Previously, the code below went to some length to remove // all bounds checks. This generally produced tighter assembly and lead to // 20-50% improvements in micro-benchmarks on corpora made up of random // characters. This somewhat makes sense, since the branch predictor is going // to be at its worse on random text. // // However, using the aho-corasick-debug tool and manually benchmarking // different inputs, the code *with* bounds checks actually wound up being // slightly faster: // // $ cat input // Sherlock Holmes // John Watson // Professor Moriarty // Irene Adler // Mary Watson // // $ aho-corasick-debug-safe \ // input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa // pattern read time: 32.824µs // automaton build time: 444.687µs // automaton heap usage: 72392 bytes // match count: 639 // count time: 1.809961702s // // $ aho-corasick-debug-master \ // input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa // pattern read time: 31.425µs // automaton build time: 317.434µs // automaton heap usage: 72392 bytes // match count: 639 // count time: 2.059157705s // // I was able to reproduce this result on two different machines (an i5 and // an i7). Therefore, we go the route of safe code for now. /// A trait describing the interface of an Aho-Corasick finite state machine. /// /// Every automaton has exactly one fail state, one dead state and exactly one /// start state. Generally, these correspond to the first, second and third /// states, respectively. The dead state is always treated as a sentinel. That /// is, no correct Aho-Corasick automaton will ever transition into the fail /// state. The dead state, however, can be transitioned into, but only when /// leftmost-first or leftmost-longest match semantics are enabled and only /// when at least one match has been observed. /// /// Every automaton also has one or more match states, such that /// `Automaton::is_match_state(id)` returns `true` if and only if `id` /// corresponds to a match state. pub trait Automaton { /// The representation used for state identifiers in this automaton. /// /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`. type ID: StateID; /// The type of matching that should be done. fn match_kind(&self) -> &MatchKind; /// Returns true if and only if this automaton uses anchored searches. fn anchored(&self) -> bool; /// An optional prefilter for quickly skipping to the next candidate match. /// A prefilter must report at least every match, although it may report /// positions that do not correspond to a match. That is, it must not allow /// false negatives, but can allow false positives. /// /// Currently, a prefilter only runs when the automaton is in the start /// state. That is, the position reported by a prefilter should always /// correspond to the start of a potential match. fn prefilter(&self) -> Option<&dyn Prefilter>; /// Return the identifier of this automaton's start state. fn start_state(&self) -> Self::ID; /// Returns true if and only if the given state identifier refers to a /// valid state. fn is_valid(&self, id: Self::ID) -> bool; /// Returns true if and only if the given identifier corresponds to a match /// state. /// /// The state ID given must be valid, or else implementors may panic. fn is_match_state(&self, id: Self::ID) -> bool; /// Returns true if and only if the given identifier corresponds to a state /// that is either the dead state or a match state. /// /// Depending on the implementation of the automaton, this routine can /// be used to save a branch in the core matching loop. Nevertheless, /// `is_match_state(id) || id == dead_id()` is always a valid /// implementation. Indeed, this is the default implementation. /// /// The state ID given must be valid, or else implementors may panic. fn is_match_or_dead_state(&self, id: Self::ID) -> bool { id == dead_id() || self.is_match_state(id) } /// If the given state is a match state, return the match corresponding /// to the given match index. `end` must be the ending position of the /// detected match. If no match exists or if `match_index` exceeds the /// number of matches in this state, then `None` is returned. /// /// The state ID given must be valid, or else implementors may panic. /// /// If the given state ID is correct and if the `match_index` is less than /// the number of matches for that state, then this is guaranteed to return /// a match. fn get_match( &self, id: Self::ID, match_index: usize, end: usize, ) -> Option; /// Returns the number of matches for the given state. If the given state /// is not a match state, then this returns 0. /// /// The state ID given must be valid, or else implementors must panic. fn match_count(&self, id: Self::ID) -> usize; /// Given the current state that this automaton is in and the next input /// byte, this method returns the identifier of the next state. The /// identifier returned must always be valid and may never correspond to /// the fail state. The returned identifier may, however, point to the /// dead state. /// /// This is not safe so that implementors may look up the next state /// without memory safety checks such as bounds checks. As such, callers /// must ensure that the given identifier corresponds to a valid automaton /// state. Implementors must, in turn, ensure that this routine is safe for /// all valid state identifiers and for all possible `u8` values. fn next_state(&self, current: Self::ID, input: u8) -> Self::ID; /// Like next_state, but debug_asserts that the underlying /// implementation never returns a `fail_id()` for the next state. fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID { let next = self.next_state(current, input); // We should never see a transition to the failure state. debug_assert!( next != fail_id(), "automaton should never return fail_id for next state" ); next } /// Execute a search using standard match semantics. /// /// This can be used even when the automaton was constructed with leftmost /// match semantics when you want to find the earliest possible match. This /// can also be used as part of an overlapping search implementation. /// /// N.B. This does not report a match if `state_id` is given as a matching /// state. As such, this should not be used directly. #[inline(always)] fn standard_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, ) -> Option { if let Some(pre) = self.prefilter() { self.standard_find_at_imp( prestate, Some(pre), haystack, at, state_id, ) } else { self.standard_find_at_imp(prestate, None, haystack, at, state_id) } } // It's important for this to always be inlined. Namely, its only caller // is standard_find_at, and the inlining should remove the case analysis // for prefilter scanning when there is no prefilter available. #[inline(always)] fn standard_find_at_imp( &self, prestate: &mut PrefilterState, prefilter: Option<&dyn Prefilter>, haystack: &[u8], mut at: usize, state_id: &mut Self::ID, ) -> Option { while at < haystack.len() { if let Some(pre) = prefilter { if prestate.is_effective(at) && *state_id == self.start_state() { let c = prefilter::next(prestate, pre, haystack, at) .into_option(); match c { None => return None, Some(i) => { at = i; } } } } // CORRECTNESS: next_state is correct for all possible u8 values, // so the only thing we're concerned about is the validity of // `state_id`. `state_id` either comes from the caller (in which // case, we assume it is correct), or it comes from the return // value of next_state, which is guaranteed to be correct. *state_id = self.next_state_no_fail(*state_id, haystack[at]); at += 1; // This routine always quits immediately after seeing a // match, and since dead states can only come after seeing // a match, seeing a dead state here is impossible. (Unless // we have an anchored automaton, in which case, dead states // are used to stop a search.) debug_assert!( *state_id != dead_id() || self.anchored(), "standard find should never see a dead state" ); if self.is_match_or_dead_state(*state_id) { return if *state_id == dead_id() { None } else { self.get_match(*state_id, 0, at) }; } } None } /// Execute a search using leftmost (either first or longest) match /// semantics. /// /// The principle difference between searching with standard semantics and /// searching with leftmost semantics is that leftmost searching will /// continue searching even after a match has been found. Once a match /// is found, the search does not stop until either the haystack has been /// exhausted or a dead state is observed in the automaton. (Dead states /// only exist in automatons constructed with leftmost semantics.) That is, /// we rely on the construction of the automaton to tell us when to quit. #[inline(never)] fn leftmost_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, ) -> Option { if let Some(pre) = self.prefilter() { self.leftmost_find_at_imp( prestate, Some(pre), haystack, at, state_id, ) } else { self.leftmost_find_at_imp(prestate, None, haystack, at, state_id) } } // It's important for this to always be inlined. Namely, its only caller // is leftmost_find_at, and the inlining should remove the case analysis // for prefilter scanning when there is no prefilter available. #[inline(always)] fn leftmost_find_at_imp( &self, prestate: &mut PrefilterState, prefilter: Option<&dyn Prefilter>, haystack: &[u8], mut at: usize, state_id: &mut Self::ID, ) -> Option { debug_assert!(self.match_kind().is_leftmost()); if self.anchored() && at > 0 && *state_id == self.start_state() { return None; } let mut last_match = self.get_match(*state_id, 0, at); while at < haystack.len() { if let Some(pre) = prefilter { if prestate.is_effective(at) && *state_id == self.start_state() { let c = prefilter::next(prestate, pre, haystack, at) .into_option(); match c { None => return None, Some(i) => { at = i; } } } } // CORRECTNESS: next_state is correct for all possible u8 values, // so the only thing we're concerned about is the validity of // `state_id`. `state_id` either comes from the caller (in which // case, we assume it is correct), or it comes from the return // value of next_state, which is guaranteed to be correct. *state_id = self.next_state_no_fail(*state_id, haystack[at]); at += 1; if self.is_match_or_dead_state(*state_id) { if *state_id == dead_id() { // The only way to enter into a dead state is if a match // has been found, so we assert as much. This is different // from normal automata, where you might enter a dead state // if you know a subsequent match will never be found // (regardless of whether a match has already been found). // For Aho-Corasick, it is built so that we can match at // any position, so the possibility of a match always // exists. // // (Unless we have an anchored automaton, in which case, // dead states are used to stop a search.) debug_assert!( last_match.is_some() || self.anchored(), "dead state should only be seen after match" ); return last_match; } last_match = self.get_match(*state_id, 0, at); } } last_match } /// This is like leftmost_find_at, but does not need to track a caller /// provided state id. In other words, the only output of this routine is a /// match, if one exists. /// /// It is regrettable that we need to effectively copy a chunk of /// implementation twice, but when we don't need to track the state ID, we /// can allow the prefilter to report matches immediately without having /// to re-confirm them with the automaton. The re-confirmation step is /// necessary in leftmost_find_at because tracing through the automaton is /// the only way to correctly set the state ID. (Perhaps an alternative /// would be to keep a map from pattern ID to matching state ID, but that /// complicates the code and still doesn't permit us to defer to the /// prefilter entirely when possible.) /// /// I did try a few things to avoid the code duplication here, but nothing /// optimized as well as this approach. (In microbenchmarks, there was /// about a 25% difference.) #[inline(never)] fn leftmost_find_at_no_state( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Option { if let Some(pre) = self.prefilter() { self.leftmost_find_at_no_state_imp( prestate, Some(pre), haystack, at, ) } else { self.leftmost_find_at_no_state_imp(prestate, None, haystack, at) } } // It's important for this to always be inlined. Namely, its only caller // is leftmost_find_at_no_state, and the inlining should remove the case // analysis for prefilter scanning when there is no prefilter available. #[inline(always)] fn leftmost_find_at_no_state_imp( &self, prestate: &mut PrefilterState, prefilter: Option<&dyn Prefilter>, haystack: &[u8], mut at: usize, ) -> Option { debug_assert!(self.match_kind().is_leftmost()); if self.anchored() && at > 0 { return None; } // If our prefilter handles confirmation of matches 100% of the // time, and since we don't need to track state IDs, we can avoid // Aho-Corasick completely. if let Some(pre) = prefilter { // We should never have a prefilter during an anchored search. debug_assert!(!self.anchored()); if !pre.reports_false_positives() { return match pre.next_candidate(prestate, haystack, at) { Candidate::None => None, Candidate::Match(m) => Some(m), Candidate::PossibleStartOfMatch(_) => unreachable!(), }; } } let mut state_id = self.start_state(); let mut last_match = self.get_match(state_id, 0, at); while at < haystack.len() { if let Some(pre) = prefilter { if prestate.is_effective(at) && state_id == self.start_state() { match prefilter::next(prestate, pre, haystack, at) { Candidate::None => return None, // Since we aren't tracking a state ID, we can // quit early once we know we have a match. Candidate::Match(m) => return Some(m), Candidate::PossibleStartOfMatch(i) => { at = i; } } } } // CORRECTNESS: next_state is correct for all possible u8 values, // so the only thing we're concerned about is the validity of // `state_id`. `state_id` either comes from the caller (in which // case, we assume it is correct), or it comes from the return // value of next_state, which is guaranteed to be correct. state_id = self.next_state_no_fail(state_id, haystack[at]); at += 1; if self.is_match_or_dead_state(state_id) { if state_id == dead_id() { // The only way to enter into a dead state is if a // match has been found, so we assert as much. This // is different from normal automata, where you might // enter a dead state if you know a subsequent match // will never be found (regardless of whether a match // has already been found). For Aho-Corasick, it is // built so that we can match at any position, so the // possibility of a match always exists. // // (Unless we have an anchored automaton, in which // case, dead states are used to stop a search.) debug_assert!( last_match.is_some() || self.anchored(), "dead state should only be seen after match" ); return last_match; } last_match = self.get_match(state_id, 0, at); } } last_match } /// Execute an overlapping search. /// /// When executing an overlapping match, the previous state ID in addition /// to the previous match index should be given. If there are more matches /// at the given state, then the match is reported and the given index is /// incremented. #[inline(always)] fn overlapping_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, match_index: &mut usize, ) -> Option { if self.anchored() && at > 0 && *state_id == self.start_state() { return None; } let match_count = self.match_count(*state_id); if *match_index < match_count { // This is guaranteed to return a match since // match_index < match_count. let result = self.get_match(*state_id, *match_index, at); debug_assert!(result.is_some(), "must be a match"); *match_index += 1; return result; } *match_index = 0; match self.standard_find_at(prestate, haystack, at, state_id) { None => None, Some(m) => { *match_index = 1; Some(m) } } } /// Return the earliest match found. This returns as soon as we know that /// we have a match. As such, this does not necessarily correspond to the /// leftmost starting match, but rather, the leftmost position at which a /// match ends. #[inline(always)] fn earliest_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, ) -> Option { if *state_id == self.start_state() { if self.anchored() && at > 0 { return None; } if let Some(m) = self.get_match(*state_id, 0, at) { return Some(m); } } self.standard_find_at(prestate, haystack, at, state_id) } /// A convenience function for finding the next match according to the /// match semantics of this automaton. For standard match semantics, this /// finds the earliest match. Otherwise, the leftmost match is found. #[inline(always)] fn find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, ) -> Option { match *self.match_kind() { MatchKind::Standard => { self.earliest_find_at(prestate, haystack, at, state_id) } MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => { self.leftmost_find_at(prestate, haystack, at, state_id) } MatchKind::__Nonexhaustive => unreachable!(), } } /// Like find_at, but does not track state identifiers. This permits some /// optimizations when a prefilter that confirms its own matches is /// present. #[inline(always)] fn find_at_no_state( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Option { match *self.match_kind() { MatchKind::Standard => { let mut state = self.start_state(); self.earliest_find_at(prestate, haystack, at, &mut state) } MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => { self.leftmost_find_at_no_state(prestate, haystack, at) } MatchKind::__Nonexhaustive => unreachable!(), } } }