use crate::{ dfa::DEAD, util::{ primitives::StateID, wire::{self, DeserializeError, Endian, SerializeError}, }, }; macro_rules! err { ($msg:expr) => { return Err(DeserializeError::generic($msg)); }; } // Special represents the identifiers in a DFA that correspond to "special" // states. If a state is one or more of the following, then it is considered // special: // // * dead - A non-matching state where all outgoing transitions lead back to // itself. There is only one of these, regardless of whether minimization // has run. The dead state always has an ID of 0. i.e., It is always the // first state in a DFA. // * quit - A state that is entered whenever a byte is seen that should cause // a DFA to give up and stop searching. This results in a MatchError::quit // error being returned at search time. The default configuration for a DFA // has no quit bytes, which means this state is unreachable by default, // although it is always present for reasons of implementation simplicity. // This state is only reachable when the caller configures the DFA to quit // on certain bytes. There is always exactly one of these states and it // is always the second state. (Its actual ID depends on the size of the // alphabet in dense DFAs, since state IDs are premultiplied in order to // allow them to be used directly as indices into the transition table.) // * match - An accepting state, i.e., indicative of a match. There may be // zero or more of these states. // * accelerated - A state where all of its outgoing transitions, except a // few, loop back to itself. These states are candidates for acceleration // via memchr during search. There may be zero or more of these states. // * start - A non-matching state that indicates where the automaton should // start during a search. There is always at least one starting state and // all are guaranteed to be non-match states. (A start state cannot be a // match state because the DFAs in this crate delay all matches by one byte. // So every search that finds a match must move through one transition to // some other match state, even when searching an empty string.) // // These are not mutually exclusive categories. Namely, the following // overlappings can occur: // // * {dead, start} - If a DFA can never lead to a match and it is minimized, // then it will typically compile to something where all starting IDs point // to the DFA's dead state. // * {match, accelerated} - It is possible for a match state to have the // majority of its transitions loop back to itself, which means it's // possible for a match state to be accelerated. // * {start, accelerated} - Similarly, it is possible for a start state to be // accelerated. Note that it is possible for an accelerated state to be // neither a match or a start state. Also note that just because both match // and start states overlap with accelerated states does not mean that // match and start states overlap with each other. In fact, they are // guaranteed not to overlap. // // As a special mention, every DFA always has a dead and a quit state, even // though from the perspective of the DFA, they are equivalent. (Indeed, // minimization special cases them to ensure they don't get merged.) The // purpose of keeping them distinct is to use the quit state as a sentinel to // distguish between whether a search finished successfully without finding // anything or whether it gave up before finishing. // // So the main problem we want to solve here is the *fast* detection of whether // a state is special or not. And we also want to do this while storing as // little extra data as possible. AND we want to be able to quickly determine // which categories a state falls into above if it is special. // // We achieve this by essentially shuffling all special states to the beginning // of a DFA. That is, all special states appear before every other non-special // state. By representing special states this way, we can determine whether a // state is special or not by a single comparison, where special.max is the // identifier of the last special state in the DFA: // // if current_state <= special.max: // ... do something with special state // // The only thing left to do is to determine what kind of special state // it is. Because what we do next depends on that. Since special states // are typically rare, we can afford to do a bit more extra work, but we'd // still like this to be as fast as possible. The trick we employ here is to // continue shuffling states even within the special state range. Such that // one contiguous region corresponds to match states, another for start states // and then an overlapping range for accelerated states. At a high level, our // special state detection might look like this (for leftmost searching, where // we continue searching even after seeing a match): // // byte = input[offset] // current_state = next_state(current_state, byte) // offset += 1 // if current_state <= special.max: // if current_state == 0: // # We can never leave a dead state, so this always marks the // # end of our search. // return last_match // if current_state == special.quit_id: // # A quit state means we give up. If he DFA has no quit state, // # then special.quit_id == 0 == dead, which is handled by the // # conditional above. // return Err(MatchError::quit { byte, offset: offset - 1 }) // if special.min_match <= current_state <= special.max_match: // last_match = Some(offset) // if special.min_accel <= current_state <= special.max_accel: // offset = accelerate(input, offset) // last_match = Some(offset) // elif special.min_start <= current_state <= special.max_start: // offset = prefilter.find(input, offset) // if special.min_accel <= current_state <= special.max_accel: // offset = accelerate(input, offset) // elif special.min_accel <= current_state <= special.max_accel: // offset = accelerate(input, offset) // // There are some small details left out of the logic above. For example, // in order to accelerate a state, we need to know which bytes to search for. // This in turn implies some extra data we need to store in the DFA. To keep // things compact, we would ideally only store // // N = special.max_accel - special.min_accel + 1 // // items. But state IDs are premultiplied, which means they are not contiguous. // So in order to take a state ID and index an array of accelerated structures, // we need to do: // // i = (state_id - special.min_accel) / stride // // (N.B. 'stride' is always a power of 2, so the above can be implemented via // '(state_id - special.min_accel) >> stride2', where 'stride2' is x in // 2^x=stride.) // // Moreover, some of these specialty categories may be empty. For example, // DFAs are not required to have any match states or any accelerated states. // In that case, the lower and upper bounds are both set to 0 (the dead state // ID) and the first `current_state == 0` check subsumes cases where the // ranges are empty. // // Loop unrolling, if applicable, has also been left out of the logic above. // // Graphically, the ranges look like this, where asterisks indicate ranges // that can be empty. Each 'x' is a state. // // quit // dead| // || // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx // | | | | start | | // | |-------------| |-------| | // | match* | | | | // | | | | | // | |----------| | | // | accel* | | // | | | // | | | // |----------------------------|------------------------ // special non-special* #[derive(Clone, Copy, Debug)] pub(crate) struct Special { /// The identifier of the last special state in a DFA. A state is special /// if and only if its identifier is less than or equal to `max`. pub(crate) max: StateID, /// The identifier of the quit state in a DFA. (There is no analogous field /// for the dead state since the dead state's ID is always zero, regardless /// of state ID size.) pub(crate) quit_id: StateID, /// The identifier of the first match state. pub(crate) min_match: StateID, /// The identifier of the last match state. pub(crate) max_match: StateID, /// The identifier of the first accelerated state. pub(crate) min_accel: StateID, /// The identifier of the last accelerated state. pub(crate) max_accel: StateID, /// The identifier of the first start state. pub(crate) min_start: StateID, /// The identifier of the last start state. pub(crate) max_start: StateID, } impl Special { /// Creates a new set of special ranges for a DFA. All ranges are initially /// set to only contain the dead state. This is interpreted as an empty /// range. #[cfg(feature = "dfa-build")] pub(crate) fn new() -> Special { Special { max: DEAD, quit_id: DEAD, min_match: DEAD, max_match: DEAD, min_accel: DEAD, max_accel: DEAD, min_start: DEAD, max_start: DEAD, } } /// Remaps all of the special state identifiers using the function given. #[cfg(feature = "dfa-build")] pub(crate) fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special { Special { max: map(self.max), quit_id: map(self.quit_id), min_match: map(self.min_match), max_match: map(self.max_match), min_accel: map(self.min_accel), max_accel: map(self.max_accel), min_start: map(self.min_start), max_start: map(self.max_start), } } /// Deserialize the given bytes into special state ranges. If the slice /// given is not big enough, then this returns an error. Similarly, if /// any of the expected invariants around special state ranges aren't /// upheld, an error is returned. Note that this does not guarantee that /// the information returned is correct. /// /// Upon success, this returns the number of bytes read in addition to the /// special state IDs themselves. pub(crate) fn from_bytes( mut slice: &[u8], ) -> Result<(Special, usize), DeserializeError> { wire::check_slice_len(slice, 8 * StateID::SIZE, "special states")?; let mut nread = 0; let mut read_id = |what| -> Result { let (id, nr) = wire::try_read_state_id(slice, what)?; nread += nr; slice = &slice[StateID::SIZE..]; Ok(id) }; let max = read_id("special max id")?; let quit_id = read_id("special quit id")?; let min_match = read_id("special min match id")?; let max_match = read_id("special max match id")?; let min_accel = read_id("special min accel id")?; let max_accel = read_id("special max accel id")?; let min_start = read_id("special min start id")?; let max_start = read_id("special max start id")?; let special = Special { max, quit_id, min_match, max_match, min_accel, max_accel, min_start, max_start, }; special.validate()?; assert_eq!(nread, special.write_to_len()); Ok((special, nread)) } /// Validate that the information describing special states satisfies /// all known invariants. pub(crate) fn validate(&self) -> Result<(), DeserializeError> { // Check that both ends of the range are DEAD or neither are. if self.min_match == DEAD && self.max_match != DEAD { err!("min_match is DEAD, but max_match is not"); } if self.min_match != DEAD && self.max_match == DEAD { err!("max_match is DEAD, but min_match is not"); } if self.min_accel == DEAD && self.max_accel != DEAD { err!("min_accel is DEAD, but max_accel is not"); } if self.min_accel != DEAD && self.max_accel == DEAD { err!("max_accel is DEAD, but min_accel is not"); } if self.min_start == DEAD && self.max_start != DEAD { err!("min_start is DEAD, but max_start is not"); } if self.min_start != DEAD && self.max_start == DEAD { err!("max_start is DEAD, but min_start is not"); } // Check that ranges are well formed. if self.min_match > self.max_match { err!("min_match should not be greater than max_match"); } if self.min_accel > self.max_accel { err!("min_accel should not be greater than max_accel"); } if self.min_start > self.max_start { err!("min_start should not be greater than max_start"); } // Check that ranges are ordered with respect to one another. if self.matches() && self.quit_id >= self.min_match { err!("quit_id should not be greater than min_match"); } if self.accels() && self.quit_id >= self.min_accel { err!("quit_id should not be greater than min_accel"); } if self.starts() && self.quit_id >= self.min_start { err!("quit_id should not be greater than min_start"); } if self.matches() && self.accels() && self.min_accel < self.min_match { err!("min_match should not be greater than min_accel"); } if self.matches() && self.starts() && self.min_start < self.min_match { err!("min_match should not be greater than min_start"); } if self.accels() && self.starts() && self.min_start < self.min_accel { err!("min_accel should not be greater than min_start"); } // Check that max is at least as big as everything else. if self.max < self.quit_id { err!("quit_id should not be greater than max"); } if self.max < self.max_match { err!("max_match should not be greater than max"); } if self.max < self.max_accel { err!("max_accel should not be greater than max"); } if self.max < self.max_start { err!("max_start should not be greater than max"); } Ok(()) } /// Validate that the special state information is compatible with the /// given state len. pub(crate) fn validate_state_len( &self, len: usize, stride2: usize, ) -> Result<(), DeserializeError> { // We assume that 'validate' has already passed, so we know that 'max' // is truly the max. So all we need to check is that the max state ID // is less than the state ID len. The max legal value here is len-1, // which occurs when there are no non-special states. if (self.max.as_usize() >> stride2) >= len { err!("max should not be greater than or equal to state length"); } Ok(()) } /// Write the IDs and ranges for special states to the given byte buffer. /// The buffer given must have enough room to store all data, otherwise /// this will return an error. The number of bytes written is returned /// on success. The number of bytes written is guaranteed to be a multiple /// of 8. pub(crate) fn write_to( &self, dst: &mut [u8], ) -> Result { use crate::util::wire::write_state_id as write; if dst.len() < self.write_to_len() { return Err(SerializeError::buffer_too_small("special state ids")); } let mut nwrite = 0; nwrite += write::(self.max, &mut dst[nwrite..]); nwrite += write::(self.quit_id, &mut dst[nwrite..]); nwrite += write::(self.min_match, &mut dst[nwrite..]); nwrite += write::(self.max_match, &mut dst[nwrite..]); nwrite += write::(self.min_accel, &mut dst[nwrite..]); nwrite += write::(self.max_accel, &mut dst[nwrite..]); nwrite += write::(self.min_start, &mut dst[nwrite..]); nwrite += write::(self.max_start, &mut dst[nwrite..]); assert_eq!( self.write_to_len(), nwrite, "expected to write certain number of bytes", ); assert_eq!( nwrite % 8, 0, "expected to write multiple of 8 bytes for special states", ); Ok(nwrite) } /// Returns the total number of bytes written by `write_to`. pub(crate) fn write_to_len(&self) -> usize { 8 * StateID::SIZE } /// Sets the maximum special state ID based on the current values. This /// should be used once all possible state IDs are set. #[cfg(feature = "dfa-build")] pub(crate) fn set_max(&mut self) { use core::cmp::max; self.max = max( self.quit_id, max(self.max_match, max(self.max_accel, self.max_start)), ); } /// Sets the maximum special state ID such that starting states are not /// considered "special." This also marks the min/max starting states as /// DEAD such that 'is_start_state' always returns false, even if the state /// is actually a starting state. /// /// This is useful when there is no prefilter set. It will avoid /// ping-ponging between the hot path in the DFA search code and the start /// state handling code, which is typically only useful for executing a /// prefilter. #[cfg(feature = "dfa-build")] pub(crate) fn set_no_special_start_states(&mut self) { use core::cmp::max; self.max = max(self.quit_id, max(self.max_match, self.max_accel)); self.min_start = DEAD; self.max_start = DEAD; } /// Returns true if and only if the given state ID is a special state. #[inline] pub(crate) fn is_special_state(&self, id: StateID) -> bool { id <= self.max } /// Returns true if and only if the given state ID is a dead state. #[inline] pub(crate) fn is_dead_state(&self, id: StateID) -> bool { id == DEAD } /// Returns true if and only if the given state ID is a quit state. #[inline] pub(crate) fn is_quit_state(&self, id: StateID) -> bool { !self.is_dead_state(id) && self.quit_id == id } /// Returns true if and only if the given state ID is a match state. #[inline] pub(crate) fn is_match_state(&self, id: StateID) -> bool { !self.is_dead_state(id) && self.min_match <= id && id <= self.max_match } /// Returns true if and only if the given state ID is an accel state. #[inline] pub(crate) fn is_accel_state(&self, id: StateID) -> bool { !self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel } /// Returns true if and only if the given state ID is a start state. #[inline] pub(crate) fn is_start_state(&self, id: StateID) -> bool { !self.is_dead_state(id) && self.min_start <= id && id <= self.max_start } /// Returns the total number of match states for a dense table based DFA. #[inline] pub(crate) fn match_len(&self, stride: usize) -> usize { if self.matches() { (self.max_match.as_usize() - self.min_match.as_usize() + stride) / stride } else { 0 } } /// Returns true if and only if there is at least one match state. #[inline] pub(crate) fn matches(&self) -> bool { self.min_match != DEAD } /// Returns the total number of accel states. #[cfg(feature = "dfa-build")] pub(crate) fn accel_len(&self, stride: usize) -> usize { if self.accels() { (self.max_accel.as_usize() - self.min_accel.as_usize() + stride) / stride } else { 0 } } /// Returns true if and only if there is at least one accel state. #[inline] pub(crate) fn accels(&self) -> bool { self.min_accel != DEAD } /// Returns true if and only if there is at least one start state. #[inline] pub(crate) fn starts(&self) -> bool { self.min_start != DEAD } }