summaryrefslogtreecommitdiffstats
path: root/third_party/rust/aho-corasick/src/dfa.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/aho-corasick/src/dfa.rs')
-rw-r--r--third_party/rust/aho-corasick/src/dfa.rs814
1 files changed, 814 insertions, 0 deletions
diff --git a/third_party/rust/aho-corasick/src/dfa.rs b/third_party/rust/aho-corasick/src/dfa.rs
new file mode 100644
index 0000000000..f0370a6168
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/dfa.rs
@@ -0,0 +1,814 @@
+/*!
+Provides direct access to a DFA implementation of Aho-Corasick.
+
+This is a low-level API that generally only needs to be used in niche
+circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
+instead of a DFA directly. Using an `DFA` directly is typically only necessary
+when one needs access to the [`Automaton`] trait implementation.
+*/
+
+use alloc::{vec, vec::Vec};
+
+use crate::{
+ automaton::Automaton,
+ nfa::noncontiguous,
+ util::{
+ alphabet::ByteClasses,
+ error::{BuildError, MatchError},
+ int::{Usize, U32},
+ prefilter::Prefilter,
+ primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
+ search::{Anchored, MatchKind, StartKind},
+ special::Special,
+ },
+};
+
+/// A DFA implementation of Aho-Corasick.
+///
+/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
+/// this type directly. Using a `DFA` directly is typically only necessary when
+/// one needs access to the [`Automaton`] trait implementation.
+///
+/// This DFA can only be built by first constructing a [`noncontiguous::NFA`].
+/// Both [`DFA::new`] and [`Builder::build`] do this for you automatically, but
+/// [`Builder::build_from_noncontiguous`] permits doing it explicitly.
+///
+/// A DFA provides the best possible search performance (in this crate) via two
+/// mechanisms:
+///
+/// * All states use a dense representation for their transitions.
+/// * All failure transitions are pre-computed such that they are never
+/// explicitly handled at search time.
+///
+/// These two facts combined mean that every state transition is performed
+/// using a constant number of instructions. However, this comes at
+/// great cost. The memory usage of a DFA can be quite exorbitant.
+/// It is potentially multiple orders of magnitude greater than a
+/// [`contiguous::NFA`](crate::nfa::contiguous::NFA) for example. In exchange,
+/// a DFA will typically have better search speed than a `contiguous::NFA`, but
+/// not by orders of magnitude.
+///
+/// Unless you have a small number of patterns or memory usage is not a concern
+/// and search performance is critical, a DFA is usually not the best choice.
+///
+/// Moreover, unlike the NFAs in this crate, it is costly for a DFA to
+/// support for anchored and unanchored search configurations. Namely,
+/// since failure transitions are pre-computed, supporting both anchored
+/// and unanchored searches requires a duplication of the transition table,
+/// making the memory usage of such a DFA ever bigger. (The NFAs in this crate
+/// unconditionally support both anchored and unanchored searches because there
+/// is essentially no added cost for doing so.) It is for this reason that
+/// a DFA's support for anchored and unanchored searches can be configured
+/// via [`Builder::start_kind`]. By default, a DFA only supports unanchored
+/// searches.
+///
+/// # Example
+///
+/// This example shows how to build an `DFA` directly and use it to execute
+/// [`Automaton::try_find`]:
+///
+/// ```
+/// use aho_corasick::{
+/// automaton::Automaton,
+/// dfa::DFA,
+/// Input, Match,
+/// };
+///
+/// let patterns = &["b", "abc", "abcd"];
+/// let haystack = "abcd";
+///
+/// let nfa = DFA::new(patterns).unwrap();
+/// assert_eq!(
+/// Some(Match::must(0, 1..2)),
+/// nfa.try_find(&Input::new(haystack))?,
+/// );
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// It is also possible to implement your own version of `try_find`. See the
+/// [`Automaton`] documentation for an example.
+#[derive(Clone)]
+pub struct DFA {
+ /// The DFA transition table. IDs in this table are pre-multiplied. So
+ /// instead of the IDs being 0, 1, 2, 3, ..., they are 0*stride, 1*stride,
+ /// 2*stride, 3*stride, ...
+ trans: Vec<StateID>,
+ /// The matches for every match state in this DFA. This is first indexed by
+ /// state index (so that's `sid >> stride2`) and then by order in which the
+ /// matches are meant to occur.
+ matches: Vec<Vec<PatternID>>,
+ /// The amount of heap memory used, in bytes, by the inner Vecs of
+ /// 'matches'.
+ matches_memory_usage: usize,
+ /// The length of each pattern. This is used to compute the start offset
+ /// of a match.
+ pattern_lens: Vec<SmallIndex>,
+ /// A prefilter for accelerating searches, if one exists.
+ prefilter: Option<Prefilter>,
+ /// The match semantics built into this DFA.
+ match_kind: MatchKind,
+ /// The total number of states in this DFA.
+ state_len: usize,
+ /// The alphabet size, or total number of equivalence classes, for this
+ /// DFA. Note that the actual number of transitions in each state is
+ /// stride=2^stride2, where stride is the smallest power of 2 greater than
+ /// or equal to alphabet_len. We do things this way so that we can use
+ /// bitshifting to go from a state ID to an index into 'matches'.
+ alphabet_len: usize,
+ /// The exponent with a base 2, such that stride=2^stride2. Given a state
+ /// index 'i', its state identifier is 'i << stride2'. Given a state
+ /// identifier 'sid', its state index is 'sid >> stride2'.
+ stride2: usize,
+ /// The equivalence classes for this DFA. All transitions are defined on
+ /// equivalence classes and not on the 256 distinct byte values.
+ byte_classes: ByteClasses,
+ /// The length of the shortest pattern in this automaton.
+ min_pattern_len: usize,
+ /// The length of the longest pattern in this automaton.
+ max_pattern_len: usize,
+ /// The information required to deduce which states are "special" in this
+ /// DFA.
+ special: Special,
+}
+
+impl DFA {
+ /// Create a new Aho-Corasick DFA using the default configuration.
+ ///
+ /// Use a [`Builder`] if you want to change the configuration.
+ pub fn new<I, P>(patterns: I) -> Result<DFA, BuildError>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ DFA::builder().build(patterns)
+ }
+
+ /// A convenience method for returning a new Aho-Corasick DFA builder.
+ ///
+ /// This usually permits one to just import the `DFA` type.
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+}
+
+impl DFA {
+ /// A sentinel state ID indicating that a search should stop once it has
+ /// entered this state. When a search stops, it returns a match if one has
+ /// been found, otherwise no match. A DFA always has an actual dead state
+ /// at this ID.
+ ///
+ /// N.B. DFAs, unlike NFAs, do not have any notion of a FAIL state.
+ /// Namely, the whole point of a DFA is that the FAIL state is completely
+ /// compiled away. That is, DFA construction involves pre-computing the
+ /// failure transitions everywhere, such that failure transitions are no
+ /// longer used at search time. This, combined with its uniformly dense
+ /// representation, are the two most important factors in why it's faster
+ /// than the NFAs in this crate.
+ const DEAD: StateID = StateID::new_unchecked(0);
+
+ /// Adds the given pattern IDs as matches to the given state and also
+ /// records the added memory usage.
+ fn set_matches(
+ &mut self,
+ sid: StateID,
+ pids: impl Iterator<Item = PatternID>,
+ ) {
+ let index = (sid.as_usize() >> self.stride2).checked_sub(2).unwrap();
+ let mut at_least_one = false;
+ for pid in pids {
+ self.matches[index].push(pid);
+ self.matches_memory_usage += PatternID::SIZE;
+ at_least_one = true;
+ }
+ assert!(at_least_one, "match state must have non-empty pids");
+ }
+}
+
+// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
+// returns a valid state ID given a valid state ID. We otherwise claim that
+// all other methods are correct as well.
+unsafe impl Automaton for DFA {
+ #[inline(always)]
+ fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
+ // Either of the start state IDs can be DEAD, in which case, support
+ // for that type of search is not provided by this DFA. Which start
+ // state IDs are inactive depends on the 'StartKind' configuration at
+ // DFA construction time.
+ match anchored {
+ Anchored::No => {
+ let start = self.special.start_unanchored_id;
+ if start == DFA::DEAD {
+ Err(MatchError::invalid_input_unanchored())
+ } else {
+ Ok(start)
+ }
+ }
+ Anchored::Yes => {
+ let start = self.special.start_anchored_id;
+ if start == DFA::DEAD {
+ Err(MatchError::invalid_input_anchored())
+ } else {
+ Ok(start)
+ }
+ }
+ }
+ }
+
+ #[inline(always)]
+ fn next_state(
+ &self,
+ _anchored: Anchored,
+ sid: StateID,
+ byte: u8,
+ ) -> StateID {
+ let class = self.byte_classes.get(byte);
+ self.trans[(sid.as_u32() + u32::from(class)).as_usize()]
+ }
+
+ #[inline(always)]
+ fn is_special(&self, sid: StateID) -> bool {
+ sid <= self.special.max_special_id
+ }
+
+ #[inline(always)]
+ fn is_dead(&self, sid: StateID) -> bool {
+ sid == DFA::DEAD
+ }
+
+ #[inline(always)]
+ fn is_match(&self, sid: StateID) -> bool {
+ !self.is_dead(sid) && sid <= self.special.max_match_id
+ }
+
+ #[inline(always)]
+ fn is_start(&self, sid: StateID) -> bool {
+ sid == self.special.start_unanchored_id
+ || sid == self.special.start_anchored_id
+ }
+
+ #[inline(always)]
+ fn match_kind(&self) -> MatchKind {
+ self.match_kind
+ }
+
+ #[inline(always)]
+ fn patterns_len(&self) -> usize {
+ self.pattern_lens.len()
+ }
+
+ #[inline(always)]
+ fn pattern_len(&self, pid: PatternID) -> usize {
+ self.pattern_lens[pid].as_usize()
+ }
+
+ #[inline(always)]
+ fn min_pattern_len(&self) -> usize {
+ self.min_pattern_len
+ }
+
+ #[inline(always)]
+ fn max_pattern_len(&self) -> usize {
+ self.max_pattern_len
+ }
+
+ #[inline(always)]
+ fn match_len(&self, sid: StateID) -> usize {
+ debug_assert!(self.is_match(sid));
+ let offset = (sid.as_usize() >> self.stride2) - 2;
+ self.matches[offset].len()
+ }
+
+ #[inline(always)]
+ fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
+ debug_assert!(self.is_match(sid));
+ let offset = (sid.as_usize() >> self.stride2) - 2;
+ self.matches[offset][index]
+ }
+
+ #[inline(always)]
+ fn memory_usage(&self) -> usize {
+ use core::mem::size_of;
+
+ (self.trans.len() * size_of::<u32>())
+ + (self.matches.len() * size_of::<Vec<PatternID>>())
+ + self.matches_memory_usage
+ + (self.pattern_lens.len() * size_of::<SmallIndex>())
+ + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
+ }
+
+ #[inline(always)]
+ fn prefilter(&self) -> Option<&Prefilter> {
+ self.prefilter.as_ref()
+ }
+}
+
+impl core::fmt::Debug for DFA {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ use crate::{
+ automaton::{fmt_state_indicator, sparse_transitions},
+ util::debug::DebugByte,
+ };
+
+ writeln!(f, "dfa::DFA(")?;
+ for index in 0..self.state_len {
+ let sid = StateID::new_unchecked(index << self.stride2);
+ // While we do currently include the FAIL state in the transition
+ // table (to simplify construction), it is never actually used. It
+ // poses problems with the code below because it gets treated as
+ // a match state incidentally when it is, of course, not. So we
+ // special case it. The fail state is always the first state after
+ // the dead state.
+ //
+ // If the construction is changed to remove the fail state (it
+ // probably should be), then this special case should be updated.
+ if index == 1 {
+ writeln!(f, "F {:06}:", sid.as_usize())?;
+ continue;
+ }
+ fmt_state_indicator(f, self, sid)?;
+ write!(f, "{:06}: ", sid.as_usize())?;
+
+ let it = (0..self.byte_classes.alphabet_len()).map(|class| {
+ (class.as_u8(), self.trans[sid.as_usize() + class])
+ });
+ for (i, (start, end, next)) in sparse_transitions(it).enumerate() {
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ if start == end {
+ write!(
+ f,
+ "{:?} => {:?}",
+ DebugByte(start),
+ next.as_usize()
+ )?;
+ } else {
+ write!(
+ f,
+ "{:?}-{:?} => {:?}",
+ DebugByte(start),
+ DebugByte(end),
+ next.as_usize()
+ )?;
+ }
+ }
+ write!(f, "\n")?;
+ if self.is_match(sid) {
+ write!(f, " matches: ")?;
+ for i in 0..self.match_len(sid) {
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ let pid = self.match_pattern(sid, i);
+ write!(f, "{}", pid.as_usize())?;
+ }
+ write!(f, "\n")?;
+ }
+ }
+ writeln!(f, "match kind: {:?}", self.match_kind)?;
+ writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
+ writeln!(f, "state length: {:?}", self.state_len)?;
+ writeln!(f, "pattern length: {:?}", self.patterns_len())?;
+ writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
+ writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
+ writeln!(f, "alphabet length: {:?}", self.alphabet_len)?;
+ writeln!(f, "stride: {:?}", 1 << self.stride2)?;
+ writeln!(f, "byte classes: {:?}", self.byte_classes)?;
+ writeln!(f, "memory usage: {:?}", self.memory_usage())?;
+ writeln!(f, ")")?;
+ Ok(())
+ }
+}
+
+/// A builder for configuring an Aho-Corasick DFA.
+///
+/// This builder has a subset of the options available to a
+/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
+/// their behavior is identical.
+#[derive(Clone, Debug)]
+pub struct Builder {
+ noncontiguous: noncontiguous::Builder,
+ start_kind: StartKind,
+ byte_classes: bool,
+}
+
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder {
+ noncontiguous: noncontiguous::Builder::new(),
+ start_kind: StartKind::Unanchored,
+ byte_classes: true,
+ }
+ }
+}
+
+impl Builder {
+ /// Create a new builder for configuring an Aho-Corasick DFA.
+ pub fn new() -> Builder {
+ Builder::default()
+ }
+
+ /// Build an Aho-Corasick DFA from the given iterator of patterns.
+ ///
+ /// A builder may be reused to create more DFAs.
+ pub fn build<I, P>(&self, patterns: I) -> Result<DFA, BuildError>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ let nnfa = self.noncontiguous.build(patterns)?;
+ self.build_from_noncontiguous(&nnfa)
+ }
+
+ /// Build an Aho-Corasick DFA from the given noncontiguous NFA.
+ ///
+ /// Note that when this method is used, only the `start_kind` and
+ /// `byte_classes` settings on this builder are respected. The other
+ /// settings only apply to the initial construction of the Aho-Corasick
+ /// automaton. Since using this method requires that initial construction
+ /// has already completed, all settings impacting only initial construction
+ /// are no longer relevant.
+ pub fn build_from_noncontiguous(
+ &self,
+ nnfa: &noncontiguous::NFA,
+ ) -> Result<DFA, BuildError> {
+ debug!("building DFA");
+ let byte_classes = if self.byte_classes {
+ nnfa.byte_classes().clone()
+ } else {
+ ByteClasses::singletons()
+ };
+ let state_len = match self.start_kind {
+ StartKind::Unanchored | StartKind::Anchored => nnfa.states().len(),
+ StartKind::Both => {
+ // These unwraps are OK because we know that the number of
+ // NFA states is < StateID::LIMIT which is in turn less than
+ // i32::MAX. Thus, there is always room to multiply by 2.
+ // Finally, the number of states is always at least 4 in the
+ // NFA (DEAD, FAIL, START-UNANCHORED, START-ANCHORED), so the
+ // subtraction of 4 is okay.
+ //
+ // Note that we subtract 4 because the "anchored" part of
+ // the DFA duplicates the unanchored part (without failure
+ // transitions), but reuses the DEAD, FAIL and START states.
+ nnfa.states()
+ .len()
+ .checked_mul(2)
+ .unwrap()
+ .checked_sub(4)
+ .unwrap()
+ }
+ };
+ let trans_len =
+ match state_len.checked_shl(byte_classes.stride2().as_u32()) {
+ Some(trans_len) => trans_len,
+ None => {
+ return Err(BuildError::state_id_overflow(
+ StateID::MAX.as_u64(),
+ usize::MAX.as_u64(),
+ ))
+ }
+ };
+ StateID::new(trans_len.checked_sub(byte_classes.stride()).unwrap())
+ .map_err(|e| {
+ BuildError::state_id_overflow(
+ StateID::MAX.as_u64(),
+ e.attempted(),
+ )
+ })?;
+ let num_match_states = match self.start_kind {
+ StartKind::Unanchored | StartKind::Anchored => {
+ nnfa.special().max_match_id.as_usize().checked_sub(1).unwrap()
+ }
+ StartKind::Both => nnfa
+ .special()
+ .max_match_id
+ .as_usize()
+ .checked_sub(1)
+ .unwrap()
+ .checked_mul(2)
+ .unwrap(),
+ };
+ let mut dfa = DFA {
+ trans: vec![DFA::DEAD; trans_len],
+ matches: vec![vec![]; num_match_states],
+ matches_memory_usage: 0,
+ pattern_lens: nnfa.pattern_lens_raw().to_vec(),
+ prefilter: nnfa.prefilter().map(|p| p.clone()),
+ match_kind: nnfa.match_kind(),
+ state_len,
+ alphabet_len: byte_classes.alphabet_len(),
+ stride2: byte_classes.stride2(),
+ byte_classes,
+ min_pattern_len: nnfa.min_pattern_len(),
+ max_pattern_len: nnfa.max_pattern_len(),
+ // The special state IDs are set later.
+ special: Special::zero(),
+ };
+ match self.start_kind {
+ StartKind::Both => {
+ self.finish_build_both_starts(nnfa, &mut dfa);
+ }
+ StartKind::Unanchored => {
+ self.finish_build_one_start(Anchored::No, nnfa, &mut dfa);
+ }
+ StartKind::Anchored => {
+ self.finish_build_one_start(Anchored::Yes, nnfa, &mut dfa)
+ }
+ }
+ debug!(
+ "DFA built, <states: {:?}, size: {:?}, \
+ alphabet len: {:?}, stride: {:?}>",
+ dfa.state_len,
+ dfa.memory_usage(),
+ dfa.byte_classes.alphabet_len(),
+ dfa.byte_classes.stride(),
+ );
+ // The vectors can grow ~twice as big during construction because a
+ // Vec amortizes growth. But here, let's shrink things back down to
+ // what we actually need since we're never going to add more to it.
+ dfa.trans.shrink_to_fit();
+ dfa.pattern_lens.shrink_to_fit();
+ dfa.matches.shrink_to_fit();
+ // TODO: We might also want to shrink each Vec inside of `dfa.matches`,
+ // or even better, convert it to one contiguous allocation. But I think
+ // I went with nested allocs for good reason (can't remember), so this
+ // may be tricky to do. I decided not to shrink them here because it
+ // might require a fair bit of work to do. It's unclear whether it's
+ // worth it.
+ Ok(dfa)
+ }
+
+ /// Finishes building a DFA for either unanchored or anchored searches,
+ /// but NOT both.
+ fn finish_build_one_start(
+ &self,
+ anchored: Anchored,
+ nnfa: &noncontiguous::NFA,
+ dfa: &mut DFA,
+ ) {
+ // This function always succeeds because we check above that all of the
+ // states in the NFA can be mapped to DFA state IDs.
+ let stride2 = dfa.stride2;
+ let old2new = |oldsid: StateID| {
+ StateID::new_unchecked(oldsid.as_usize() << stride2)
+ };
+ for (oldsid, state) in nnfa.states().iter().with_state_ids() {
+ let newsid = old2new(oldsid);
+ if state.is_match() {
+ dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
+ }
+ sparse_iter(
+ nnfa,
+ oldsid,
+ &dfa.byte_classes,
+ |byte, class, mut oldnextsid| {
+ if oldnextsid == noncontiguous::NFA::FAIL {
+ if anchored.is_anchored() {
+ oldnextsid = noncontiguous::NFA::DEAD;
+ } else {
+ oldnextsid = nnfa.next_state(
+ Anchored::No,
+ state.fail(),
+ byte,
+ );
+ }
+ }
+ dfa.trans[newsid.as_usize() + usize::from(class)] =
+ old2new(oldnextsid);
+ },
+ );
+ }
+ // Now that we've remapped all the IDs in our states, all that's left
+ // is remapping the special state IDs.
+ let old = nnfa.special();
+ let new = &mut dfa.special;
+ new.max_special_id = old2new(old.max_special_id);
+ new.max_match_id = old2new(old.max_match_id);
+ if anchored.is_anchored() {
+ new.start_unanchored_id = DFA::DEAD;
+ new.start_anchored_id = old2new(old.start_anchored_id);
+ } else {
+ new.start_unanchored_id = old2new(old.start_unanchored_id);
+ new.start_anchored_id = DFA::DEAD;
+ }
+ }
+
+ /// Finishes building a DFA that supports BOTH unanchored and anchored
+ /// searches. It works by inter-leaving unanchored states with anchored
+ /// states in the same transition table. This way, we avoid needing to
+ /// re-shuffle states afterward to ensure that our states still look like
+ /// DEAD, MATCH, ..., START-UNANCHORED, START-ANCHORED, NON-MATCH, ...
+ ///
+ /// Honestly this is pretty inscrutable... Simplifications are most
+ /// welcome.
+ fn finish_build_both_starts(
+ &self,
+ nnfa: &noncontiguous::NFA,
+ dfa: &mut DFA,
+ ) {
+ let stride2 = dfa.stride2;
+ let stride = 1 << stride2;
+ let mut remap_unanchored = vec![DFA::DEAD; nnfa.states().len()];
+ let mut remap_anchored = vec![DFA::DEAD; nnfa.states().len()];
+ let mut is_anchored = vec![false; dfa.state_len];
+ let mut newsid = DFA::DEAD;
+ let next_dfa_id =
+ |sid: StateID| StateID::new_unchecked(sid.as_usize() + stride);
+ for (oldsid, state) in nnfa.states().iter().with_state_ids() {
+ if oldsid == noncontiguous::NFA::DEAD
+ || oldsid == noncontiguous::NFA::FAIL
+ {
+ remap_unanchored[oldsid] = newsid;
+ remap_anchored[oldsid] = newsid;
+ newsid = next_dfa_id(newsid);
+ } else if oldsid == nnfa.special().start_unanchored_id
+ || oldsid == nnfa.special().start_anchored_id
+ {
+ if oldsid == nnfa.special().start_unanchored_id {
+ remap_unanchored[oldsid] = newsid;
+ remap_anchored[oldsid] = DFA::DEAD;
+ } else {
+ remap_unanchored[oldsid] = DFA::DEAD;
+ remap_anchored[oldsid] = newsid;
+ is_anchored[newsid.as_usize() >> stride2] = true;
+ }
+ if state.is_match() {
+ dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
+ }
+ sparse_iter(
+ nnfa,
+ oldsid,
+ &dfa.byte_classes,
+ |_, class, oldnextsid| {
+ let class = usize::from(class);
+ if oldnextsid == noncontiguous::NFA::FAIL {
+ dfa.trans[newsid.as_usize() + class] = DFA::DEAD;
+ } else {
+ dfa.trans[newsid.as_usize() + class] = oldnextsid;
+ }
+ },
+ );
+ newsid = next_dfa_id(newsid);
+ } else {
+ let unewsid = newsid;
+ newsid = next_dfa_id(newsid);
+ let anewsid = newsid;
+ newsid = next_dfa_id(newsid);
+
+ remap_unanchored[oldsid] = unewsid;
+ remap_anchored[oldsid] = anewsid;
+ is_anchored[anewsid.as_usize() >> stride2] = true;
+ if state.is_match() {
+ dfa.set_matches(unewsid, nnfa.iter_matches(oldsid));
+ dfa.set_matches(anewsid, nnfa.iter_matches(oldsid));
+ }
+ sparse_iter(
+ nnfa,
+ oldsid,
+ &dfa.byte_classes,
+ |byte, class, oldnextsid| {
+ let class = usize::from(class);
+ if oldnextsid == noncontiguous::NFA::FAIL {
+ dfa.trans[unewsid.as_usize() + class] = nnfa
+ .next_state(Anchored::No, state.fail(), byte);
+ } else {
+ dfa.trans[unewsid.as_usize() + class] = oldnextsid;
+ dfa.trans[anewsid.as_usize() + class] = oldnextsid;
+ }
+ },
+ );
+ }
+ }
+ for i in 0..dfa.state_len {
+ let sid = i << stride2;
+ if is_anchored[i] {
+ for next in dfa.trans[sid..][..stride].iter_mut() {
+ *next = remap_anchored[*next];
+ }
+ } else {
+ for next in dfa.trans[sid..][..stride].iter_mut() {
+ *next = remap_unanchored[*next];
+ }
+ }
+ }
+ // Now that we've remapped all the IDs in our states, all that's left
+ // is remapping the special state IDs.
+ let old = nnfa.special();
+ let new = &mut dfa.special;
+ new.max_special_id = remap_anchored[old.max_special_id];
+ new.max_match_id = remap_anchored[old.max_match_id];
+ new.start_unanchored_id = remap_unanchored[old.start_unanchored_id];
+ new.start_anchored_id = remap_anchored[old.start_anchored_id];
+ }
+
+ /// Set the desired match semantics.
+ ///
+ /// This only applies when using [`Builder::build`] and not
+ /// [`Builder::build_from_noncontiguous`].
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
+ /// for more documentation and examples.
+ pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
+ self.noncontiguous.match_kind(kind);
+ self
+ }
+
+ /// Enable ASCII-aware case insensitive matching.
+ ///
+ /// This only applies when using [`Builder::build`] and not
+ /// [`Builder::build_from_noncontiguous`].
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
+ /// for more documentation and examples.
+ pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
+ self.noncontiguous.ascii_case_insensitive(yes);
+ self
+ }
+
+ /// Enable heuristic prefilter optimizations.
+ ///
+ /// This only applies when using [`Builder::build`] and not
+ /// [`Builder::build_from_noncontiguous`].
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
+ /// for more documentation and examples.
+ pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
+ self.noncontiguous.prefilter(yes);
+ self
+ }
+
+ /// Sets the starting state configuration for the automaton.
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind)
+ /// for more documentation and examples.
+ pub fn start_kind(&mut self, kind: StartKind) -> &mut Builder {
+ self.start_kind = kind;
+ self
+ }
+
+ /// A debug setting for whether to attempt to shrink the size of the
+ /// automaton's alphabet or not.
+ ///
+ /// This should never be enabled unless you're debugging an automaton.
+ /// Namely, disabling byte classes makes transitions easier to reason
+ /// about, since they use the actual bytes instead of equivalence classes.
+ /// Disabling this confers no performance benefit at search time.
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes)
+ /// for more documentation and examples.
+ pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
+ self.byte_classes = yes;
+ self
+ }
+}
+
+/// Iterate over all possible equivalence class transitions in this state.
+/// The closure is called for all transitions with a distinct equivalence
+/// class, even those not explicitly represented in this sparse state. For
+/// any implicitly defined transitions, the given closure is called with
+/// the fail state ID.
+///
+/// The closure is guaranteed to be called precisely
+/// `byte_classes.alphabet_len()` times, once for every possible class in
+/// ascending order.
+fn sparse_iter<F: FnMut(u8, u8, StateID)>(
+ nnfa: &noncontiguous::NFA,
+ oldsid: StateID,
+ classes: &ByteClasses,
+ mut f: F,
+) {
+ let mut prev_class = None;
+ let mut byte = 0usize;
+ for t in nnfa.iter_trans(oldsid) {
+ while byte < usize::from(t.byte()) {
+ let rep = byte.as_u8();
+ let class = classes.get(rep);
+ byte += 1;
+ if prev_class != Some(class) {
+ f(rep, class, noncontiguous::NFA::FAIL);
+ prev_class = Some(class);
+ }
+ }
+ let rep = t.byte();
+ let class = classes.get(rep);
+ byte += 1;
+ if prev_class != Some(class) {
+ f(rep, class, t.next());
+ prev_class = Some(class);
+ }
+ }
+ for b in byte..=255 {
+ let rep = b.as_u8();
+ let class = classes.get(rep);
+ if prev_class != Some(class) {
+ f(rep, class, noncontiguous::NFA::FAIL);
+ prev_class = Some(class);
+ }
+ }
+}