From c23a457e72abe608715ac76f076f47dc42af07a5 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Thu, 30 May 2024 20:31:44 +0200 Subject: Merging upstream version 1.74.1+dfsg1. Signed-off-by: Daniel Baumann --- vendor/regex-automata/src/dfa/sparse.rs | 1279 ++++++++++++++++++++----------- 1 file changed, 826 insertions(+), 453 deletions(-) (limited to 'vendor/regex-automata/src/dfa/sparse.rs') diff --git a/vendor/regex-automata/src/dfa/sparse.rs b/vendor/regex-automata/src/dfa/sparse.rs index 346606987..5d8ec2340 100644 --- a/vendor/regex-automata/src/dfa/sparse.rs +++ b/vendor/regex-automata/src/dfa/sparse.rs @@ -14,7 +14,7 @@ example, this configures a sparse DFA to do an overlapping search: ``` use regex_automata::{ dfa::{Automaton, OverlappingState, dense}, - HalfMatch, MatchKind, + HalfMatch, Input, MatchKind, }; let dense_re = dense::Builder::new() @@ -23,25 +23,21 @@ let dense_re = dense::Builder::new() let sparse_re = dense_re.to_sparse()?; // Setup our haystack and initial start state. -let haystack = b"Samwise"; +let input = Input::new("Samwise"); let mut state = OverlappingState::start(); // First, 'Sam' will match. -let end1 = sparse_re.find_overlapping_fwd_at( - None, None, haystack, 0, haystack.len(), &mut state, -)?; -assert_eq!(end1, Some(HalfMatch::must(0, 3))); +sparse_re.try_search_overlapping_fwd(&input, &mut state)?; +assert_eq!(Some(HalfMatch::must(0, 3)), state.get_match()); // And now 'Samwise' will match. -let end2 = sparse_re.find_overlapping_fwd_at( - None, None, haystack, 3, haystack.len(), &mut state, -)?; -assert_eq!(end2, Some(HalfMatch::must(0, 7))); +sparse_re.try_search_overlapping_fwd(&input, &mut state)?; +assert_eq!(Some(HalfMatch::must(0, 7)), state.get_match()); # Ok::<(), Box>(()) ``` */ -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] use core::iter; use core::{ convert::{TryFrom, TryInto}, @@ -49,23 +45,27 @@ use core::{ mem::size_of, }; -#[cfg(feature = "alloc")] -use alloc::{collections::BTreeSet, vec, vec::Vec}; +#[cfg(feature = "dfa-build")] +use alloc::{vec, vec::Vec}; -#[cfg(feature = "alloc")] -use crate::dfa::{dense, error::Error}; +#[cfg(feature = "dfa-build")] +use crate::dfa::dense::{self, BuildError}; use crate::{ dfa::{ automaton::{fmt_state_indicator, Automaton}, + dense::Flags, special::Special, - DEAD, + StartKind, DEAD, }, util::{ - alphabet::ByteClasses, - bytes::{self, DeserializeError, Endian, SerializeError}, - id::{PatternID, StateID}, - start::Start, - DebugByte, + alphabet::{ByteClasses, ByteSet}, + escape::DebugByte, + int::{Pointer, Usize, U16, U32}, + prefilter::Prefilter, + primitives::{PatternID, StateID}, + search::{Anchored, Input, MatchError}, + start::{Start, StartByteMap}, + wire::{self, DeserializeError, Endian, SerializeError}, }, }; @@ -107,14 +107,11 @@ const VERSION: u32 = 2; /// for searching. For example: /// /// ``` -/// use regex_automata::{ -/// dfa::{Automaton, sparse::DFA}, -/// HalfMatch, -/// }; +/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// let dfa = DFA::new("foo[0-9]+")?; -/// let expected = HalfMatch::must(0, 8); -/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); +/// let expected = Some(HalfMatch::must(0, 8)); +/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` #[derive(Clone)] @@ -130,12 +127,15 @@ pub struct DFA { // // That is, a lot of the complexity is pushed down into how each state // itself is represented. 
- trans: Transitions, - starts: StartTable, + tt: Transitions, + st: StartTable, special: Special, + pre: Option, + quitset: ByteSet, + flags: Flags, } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl DFA> { /// Parse the given regular expression using a default configuration and /// return the corresponding sparse DFA. @@ -149,18 +149,16 @@ impl DFA> { /// # Example /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input}; /// /// let dfa = sparse::DFA::new("foo[0-9]+bar")?; /// - /// let expected = HalfMatch::must(0, 11); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?); + /// let expected = Some(HalfMatch::must(0, 11)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?); /// # Ok::<(), Box>(()) /// ``` - pub fn new(pattern: &str) -> Result>, Error> { + #[cfg(feature = "syntax")] + pub fn new(pattern: &str) -> Result>, BuildError> { dense::Builder::new() .build(pattern) .and_then(|dense| dense.to_sparse()) @@ -178,26 +176,24 @@ impl DFA> { /// # Example /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input}; /// /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?; - /// let expected = HalfMatch::must(1, 3); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?); + /// let expected = Some(HalfMatch::must(1, 3)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?); /// # Ok::<(), Box>(()) /// ``` + #[cfg(feature = "syntax")] pub fn new_many>( patterns: &[P], - ) -> Result>, Error> { + ) -> Result>, BuildError> { dense::Builder::new() .build_many(patterns) .and_then(|dense| dense.to_sparse()) } } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl DFA> { /// Create a new DFA that matches every input. /// @@ -206,17 +202,17 @@ impl DFA> { /// ``` /// use regex_automata::{ /// dfa::{Automaton, sparse}, - /// HalfMatch, + /// HalfMatch, Input, /// }; /// /// let dfa = sparse::DFA::always_match()?; /// - /// let expected = HalfMatch::must(0, 0); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?); + /// let expected = Some(HalfMatch::must(0, 0)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?); /// # Ok::<(), Box>(()) /// ``` - pub fn always_match() -> Result>, Error> { + pub fn always_match() -> Result>, BuildError> { dense::DFA::always_match()?.to_sparse() } @@ -225,21 +221,21 @@ impl DFA> { /// # Example /// /// ``` - /// use regex_automata::dfa::{Automaton, sparse}; + /// use regex_automata::{dfa::{Automaton, sparse}, Input}; /// /// let dfa = sparse::DFA::never_match()?; - /// assert_eq!(None, dfa.find_leftmost_fwd(b"")?); - /// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?); + /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?); + /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?); /// # Ok::<(), Box>(()) /// ``` - pub fn never_match() -> Result>, Error> { + pub fn never_match() -> Result>, BuildError> { dense::DFA::never_match()?.to_sparse() } /// The implementation for constructing a sparse DFA from a dense DFA. 
pub(crate) fn from_dense>( dfa: &dense::DFA, - ) -> Result>, Error> { + ) -> Result>, BuildError> { // In order to build the transition table, we need to be able to write // state identifiers for each of the "next" transitions in each state. // Our state identifiers correspond to the byte offset in the @@ -249,35 +245,35 @@ impl DFA> { // of the transition table happens in two passes. // // In the first pass, we fill out the shell of each state, which - // includes the transition count, the input byte ranges and zero-filled - // space for the transitions and accelerators, if present. In this - // first pass, we also build up a map from the state identifier index - // of the dense DFA to the state identifier in this sparse DFA. + // includes the transition length, the input byte ranges and + // zero-filled space for the transitions and accelerators, if present. + // In this first pass, we also build up a map from the state identifier + // index of the dense DFA to the state identifier in this sparse DFA. // // In the second pass, we fill in the transitions based on the map // built in the first pass. // The capacity given here reflects a minimum. (Well, the true minimum // is likely even bigger, but hopefully this saves a few reallocs.) - let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_count()); + let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_len()); // This maps state indices from the dense DFA to StateIDs in the sparse // DFA. We build out this map on the first pass, and then use it in the // second pass to back-fill our transitions. - let mut remap: Vec = vec![DEAD; dfa.state_count()]; + let mut remap: Vec = vec![DEAD; dfa.state_len()]; for state in dfa.states() { let pos = sparse.len(); - remap[dfa.to_index(state.id())] = - StateID::new(pos).map_err(|_| Error::too_many_states())?; - // zero-filled space for the transition count + remap[dfa.to_index(state.id())] = StateID::new(pos) + .map_err(|_| BuildError::too_many_states())?; + // zero-filled space for the transition length sparse.push(0); sparse.push(0); - let mut transition_count = 0; + let mut transition_len = 0; for (unit1, unit2, _) in state.sparse_transitions() { match (unit1.as_u8(), unit2.as_u8()) { (Some(b1), Some(b2)) => { - transition_count += 1; + transition_len += 1; sparse.push(b1); sparse.push(b2); } @@ -298,40 +294,40 @@ impl DFA> { // N.B. The loop above is not guaranteed to yield the EOI // transition, since it may point to a DEAD state. By putting // it here, we always write the EOI transition, and thus - // guarantee that our transition count is >0. Why do we always + // guarantee that our transition length is >0. Why do we always // need the EOI transition? Because in order to implement // Automaton::next_eoi_state, this lets us just ask for the last // transition. There are probably other/better ways to do this. - transition_count += 1; + transition_len += 1; sparse.push(0); sparse.push(0); - // Check some assumptions about transition count. + // Check some assumptions about transition length. assert_ne!( - transition_count, 0, - "transition count should be non-zero", + transition_len, 0, + "transition length should be non-zero", ); assert!( - transition_count <= 257, - "expected transition count {} to be <= 257", - transition_count, + transition_len <= 257, + "expected transition length {} to be <= 257", + transition_len, ); - // Fill in the transition count. - // Since transition count is always <= 257, we use the most + // Fill in the transition length. 
+ // Since transition length is always <= 257, we use the most // significant bit to indicate whether this is a match state or // not. let ntrans = if dfa.is_match_state(state.id()) { - transition_count | (1 << 15) + transition_len | (1 << 15) } else { - transition_count + transition_len }; - bytes::NE::write_u16(ntrans, &mut sparse[pos..]); + wire::NE::write_u16(ntrans, &mut sparse[pos..]); // zero-fill the actual transitions. - // Unwraps are OK since transition_count <= 257 and our minimum + // Unwraps are OK since transition_length <= 257 and our minimum // support usize size is 16-bits. - let zeros = usize::try_from(transition_count) + let zeros = usize::try_from(transition_len) .unwrap() .checked_mul(StateID::SIZE) .unwrap(); @@ -355,18 +351,18 @@ impl DFA> { sparse.extend(iter::repeat(0).take(zeros)); // Now write the length prefix. - bytes::NE::write_u32( + wire::NE::write_u32( // Will never fail since u32::MAX is invalid pattern ID. // Thus, the number of pattern IDs is representable by a // u32. - plen.try_into().expect("pattern ID count fits in u32"), + plen.try_into().expect("pattern ID length fits in u32"), &mut sparse[pos..], ); pos += size_of::(); // Now write the pattern IDs. for &pid in dfa.pattern_id_slice(state.id()) { - pos += bytes::write_pattern_id::( + pos += wire::write_pattern_id::( pid, &mut sparse[pos..], ); @@ -384,28 +380,31 @@ impl DFA> { } let mut new = DFA { - trans: Transitions { + tt: Transitions { sparse, classes: dfa.byte_classes().clone(), - count: dfa.state_count(), - patterns: dfa.pattern_count(), + state_len: dfa.state_len(), + pattern_len: dfa.pattern_len(), }, - starts: StartTable::from_dense_dfa(dfa, &remap)?, + st: StartTable::from_dense_dfa(dfa, &remap)?, special: dfa.special().remap(|id| remap[dfa.to_index(id)]), + pre: dfa.get_prefilter().map(|p| p.clone()), + quitset: dfa.quitset().clone(), + flags: dfa.flags().clone(), }; // And here's our second pass. Iterate over all of the dense states // again, and update the transitions in each of the states in the // sparse DFA. for old_state in dfa.states() { let new_id = remap[dfa.to_index(old_state.id())]; - let mut new_state = new.trans.state_mut(new_id); + let mut new_state = new.tt.state_mut(new_id); let sparse = old_state.sparse_transitions(); for (i, (_, _, next)) in sparse.enumerate() { let next = remap[dfa.to_index(next)]; new_state.set_next_at(i, next); } } - trace!( + debug!( "created sparse DFA, memory usage: {} (dense memory usage: {})", new.memory_usage(), dfa.memory_usage(), @@ -419,9 +418,12 @@ impl> DFA { /// DFA returned always uses `&[u8]` for its transitions. pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> { DFA { - trans: self.trans.as_ref(), - starts: self.starts.as_ref(), + tt: self.tt.as_ref(), + st: self.st.as_ref(), special: self.special, + pre: self.pre.clone(), + quitset: self.quitset, + flags: self.flags, } } @@ -431,36 +433,67 @@ impl> DFA { /// Effectively, this returns a sparse DFA whose transitions live on the /// heap. #[cfg(feature = "alloc")] - pub fn to_owned(&self) -> DFA> { + pub fn to_owned(&self) -> DFA> { DFA { - trans: self.trans.to_owned(), - starts: self.starts.to_owned(), + tt: self.tt.to_owned(), + st: self.st.to_owned(), special: self.special, + pre: self.pre.clone(), + quitset: self.quitset, + flags: self.flags, } } - /// Returns the memory usage, in bytes, of this DFA. + /// Returns the starting state configuration for this DFA. /// - /// The memory usage is computed based on the number of bytes used to - /// represent this DFA. 
- /// - /// This does **not** include the stack size used up by this DFA. To - /// compute that, use `std::mem::size_of::()`. - pub fn memory_usage(&self) -> usize { - self.trans.memory_usage() + self.starts.memory_usage() + /// The default is [`StartKind::Both`], which means the DFA supports both + /// unanchored and anchored searches. However, this can generally lead to + /// bigger DFAs. Therefore, a DFA might be compiled with support for just + /// unanchored or anchored searches. In that case, running a search with + /// an unsupported configuration will panic. + pub fn start_kind(&self) -> StartKind { + self.st.kind } /// Returns true only if this DFA has starting states for each pattern. /// /// When a DFA has starting states for each pattern, then a search with the /// DFA can be configured to only look for anchored matches of a specific - /// pattern. Specifically, APIs like [`Automaton::find_earliest_fwd_at`] - /// can accept a non-None `pattern_id` if and only if this method returns - /// true. Otherwise, calling `find_earliest_fwd_at` will panic. + /// pattern. Specifically, APIs like [`Automaton::try_search_fwd`] can + /// accept a [`Anchored::Pattern`] if and only if this method returns true. + /// Otherwise, an error will be returned. /// /// Note that if the DFA is empty, this always returns false. - pub fn has_starts_for_each_pattern(&self) -> bool { - self.starts.patterns > 0 + pub fn starts_for_each_pattern(&self) -> bool { + self.st.pattern_len.is_some() + } + + /// Returns the equivalence classes that make up the alphabet for this DFA. + /// + /// Unless [`dense::Config::byte_classes`] was disabled, it is possible + /// that multiple distinct bytes are grouped into the same equivalence + /// class if it is impossible for them to discriminate between a match and + /// a non-match. This has the effect of reducing the overall alphabet size + /// and in turn potentially substantially reducing the size of the DFA's + /// transition table. + /// + /// The downside of using equivalence classes like this is that every state + /// transition will automatically use this map to convert an arbitrary + /// byte to its corresponding equivalence class. In practice this has a + /// negligible impact on performance. + pub fn byte_classes(&self) -> &ByteClasses { + &self.tt.classes + } + + /// Returns the memory usage, in bytes, of this DFA. + /// + /// The memory usage is computed based on the number of bytes used to + /// represent this DFA. + /// + /// This does **not** include the stack size used up by this DFA. To + /// compute that, use `std::mem::size_of::()`. + pub fn memory_usage(&self) -> usize { + self.tt.memory_usage() + self.st.memory_usage() } } @@ -488,10 +521,7 @@ impl> DFA { /// This example shows how to serialize and deserialize a DFA: /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; @@ -503,13 +533,13 @@ impl> DFA { /// // ignore it. 
/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] pub fn to_bytes_little_endian(&self) -> Vec { - self.to_bytes::() + self.to_bytes::() } /// Serialize this DFA as raw bytes to a `Vec` in big endian @@ -533,10 +563,7 @@ impl> DFA { /// This example shows how to serialize and deserialize a DFA: /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; @@ -548,13 +575,13 @@ impl> DFA { /// // ignore it. /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] pub fn to_bytes_big_endian(&self) -> Vec { - self.to_bytes::() + self.to_bytes::() } /// Serialize this DFA as raw bytes to a `Vec` in native endian @@ -587,10 +614,7 @@ impl> DFA { /// This example shows how to serialize and deserialize a DFA: /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; @@ -600,18 +624,18 @@ impl> DFA { /// // ignore it. /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] pub fn to_bytes_native_endian(&self) -> Vec { - self.to_bytes::() + self.to_bytes::() } /// The implementation of the public `to_bytes` serialization methods, /// which is generic over endianness. - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] fn to_bytes(&self) -> Vec { let mut buf = vec![0; self.write_to_len()]; // This should always succeed since the only possible serialization @@ -645,10 +669,7 @@ impl> DFA { /// dynamic memory allocation. /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. 
/// let original_dfa = DFA::new("foo[0-9]+")?; @@ -660,15 +681,15 @@ impl> DFA { /// let written = original_dfa.write_to_native_endian(&mut buf)?; /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` pub fn write_to_little_endian( &self, dst: &mut [u8], ) -> Result { - self.write_to::(dst) + self.write_to::(dst) } /// Serialize this DFA as raw bytes to the given slice, in big endian @@ -695,10 +716,7 @@ impl> DFA { /// dynamic memory allocation. /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; @@ -710,15 +728,15 @@ impl> DFA { /// let written = original_dfa.write_to_native_endian(&mut buf)?; /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` pub fn write_to_big_endian( &self, dst: &mut [u8], ) -> Result { - self.write_to::(dst) + self.write_to::(dst) } /// Serialize this DFA as raw bytes to the given slice, in native endian @@ -754,10 +772,7 @@ impl> DFA { /// dynamic memory allocation. /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; @@ -767,15 +782,15 @@ impl> DFA { /// let written = original_dfa.write_to_native_endian(&mut buf)?; /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` pub fn write_to_native_endian( &self, dst: &mut [u8], ) -> Result { - self.write_to::(dst) + self.write_to::(dst) } /// The implementation of the public `write_to` serialization methods, @@ -785,17 +800,19 @@ impl> DFA { dst: &mut [u8], ) -> Result { let mut nw = 0; - nw += bytes::write_label(LABEL, &mut dst[nw..])?; - nw += bytes::write_endianness_check::(&mut dst[nw..])?; - nw += bytes::write_version::(VERSION, &mut dst[nw..])?; + nw += wire::write_label(LABEL, &mut dst[nw..])?; + nw += wire::write_endianness_check::(&mut dst[nw..])?; + nw += wire::write_version::(VERSION, &mut dst[nw..])?; nw += { // Currently unused, intended for future flexibility E::write_u32(0, &mut dst[nw..]); size_of::() }; - nw += self.trans.write_to::(&mut dst[nw..])?; - nw += self.starts.write_to::(&mut dst[nw..])?; + nw += self.flags.write_to::(&mut dst[nw..])?; + nw += self.tt.write_to::(&mut dst[nw..])?; + nw += self.st.write_to::(&mut dst[nw..])?; nw += self.special.write_to::(&mut dst[nw..])?; + nw += self.quitset.write_to::(&mut dst[nw..])?; Ok(nw) } @@ -817,10 +834,7 @@ impl> DFA { /// a sparse DFA. 
/// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; @@ -829,18 +843,20 @@ impl> DFA { /// let written = original_dfa.write_to_native_endian(&mut buf)?; /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` pub fn write_to_len(&self) -> usize { - bytes::write_label_len(LABEL) - + bytes::write_endianness_check_len() - + bytes::write_version_len() + wire::write_label_len(LABEL) + + wire::write_endianness_check_len() + + wire::write_version_len() + size_of::() // unused, intended for future flexibility - + self.trans.write_to_len() - + self.starts.write_to_len() + + self.flags.write_to_len() + + self.tt.write_to_len() + + self.st.write_to_len() + self.special.write_to_len() + + self.quitset.write_to_len() } } @@ -901,17 +917,14 @@ impl<'a> DFA<&'a [u8]> { /// and then use it for searching. /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// let initial = DFA::new("foo[0-9]+")?; /// let bytes = initial.to_bytes_native_endian(); /// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` /// @@ -927,7 +940,7 @@ impl<'a> DFA<&'a [u8]> { /// a file: /// /// ```no_run - /// use regex_automata::dfa::{Automaton, sparse::DFA}; + /// use regex_automata::dfa::sparse::DFA; /// /// let dfa = DFA::new("foo[0-9]+")?; /// @@ -943,23 +956,22 @@ impl<'a> DFA<&'a [u8]> { /// /// And now the second part is embedding the DFA into the compiled program /// and deserializing it at runtime on first use. We use conditional - /// compilation to choose the correct endianness. As mentioned above, we - /// do not need to employ any special tricks to ensure a proper alignment, - /// since a sparse DFA has no alignment requirements. + /// compilation to choose the correct endianness. We do not need to employ + /// any special tricks to ensure a proper alignment, since a sparse DFA has + /// no alignment requirements. /// /// ```no_run /// use regex_automata::{ - /// dfa::{Automaton, sparse}, - /// HalfMatch, + /// dfa::{Automaton, sparse::DFA}, + /// util::lazy::Lazy, + /// HalfMatch, Input, /// }; /// - /// type DFA = sparse::DFA<&'static [u8]>; - /// - /// fn get_foo() -> &'static DFA { - /// use std::cell::Cell; - /// use std::mem::MaybeUninit; - /// use std::sync::Once; - /// + /// // This crate provides its own "lazy" type, kind of like + /// // lazy_static! or once_cell::sync::Lazy. But it works in no-alloc + /// // no-std environments and let's us write this using completely + /// // safe code. + /// static RE: Lazy> = Lazy::new(|| { /// # const _: &str = stringify! 
{ /// #[cfg(target_endian = "big")] /// static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa"); @@ -968,33 +980,13 @@ impl<'a> DFA<&'a [u8]> { /// # }; /// # static BYTES: &[u8] = b""; /// - /// struct Lazy(Cell>); - /// // SAFETY: This is safe because DFA impls Sync. - /// unsafe impl Sync for Lazy {} - /// - /// static INIT: Once = Once::new(); - /// static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit())); - /// - /// INIT.call_once(|| { - /// let (dfa, _) = DFA::from_bytes(BYTES) - /// .expect("serialized DFA should be valid"); - /// // SAFETY: This is guaranteed to only execute once, and all - /// // we do with the pointer is write the DFA to it. - /// unsafe { - /// (*DFA.0.as_ptr()).as_mut_ptr().write(dfa); - /// } - /// }); - /// // SAFETY: DFA is guaranteed to by initialized via INIT and is - /// // stored in static memory. - /// unsafe { - /// let dfa = (*DFA.0.as_ptr()).as_ptr(); - /// std::mem::transmute::<*const DFA, &'static DFA>(dfa) - /// } - /// } - /// - /// let dfa = get_foo(); - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345")); + /// let (dfa, _) = DFA::from_bytes(BYTES) + /// .expect("serialized DFA should be valid"); + /// dfa + /// }); + /// + /// let expected = Ok(Some(HalfMatch::must(0, 8))); + /// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345"))); /// ``` /// /// Alternatively, consider using @@ -1009,8 +1001,8 @@ impl<'a> DFA<&'a [u8]> { // (by trying to decode every state) and start state ID list below. If // either validation fails, then we return an error. let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; - dfa.trans.validate()?; - dfa.starts.validate(&dfa.trans)?; + dfa.tt.validate(&dfa.special)?; + dfa.st.validate(&dfa.special, &dfa.tt)?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, // so it has already been validated. Ok((dfa, nread)) @@ -1029,23 +1021,20 @@ impl<'a> DFA<&'a [u8]> { /// /// # Safety /// - /// This routine is unsafe because it permits callers to provide + /// This routine is not safe because it permits callers to provide /// arbitrary transitions with possibly incorrect state identifiers. While /// the various serialization routines will never return an incorrect - /// DFA, there is no guarantee that the bytes provided here - /// are correct. While `from_bytes_unchecked` will still do several forms - /// of basic validation, this routine does not check that the transitions - /// themselves are correct. Given an incorrect transition table, it is - /// possible for the search routines to access out-of-bounds memory because - /// of explicit bounds check elision. + /// DFA, there is no guarantee that the bytes provided here are correct. + /// While `from_bytes_unchecked` will still do several forms of basic + /// validation, this routine does not check that the transitions themselves + /// are correct. Given an incorrect transition table, it is possible for + /// the search routines to access out-of-bounds memory because of explicit + /// bounds check elision. /// /// # Example /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// let initial = DFA::new("foo[0-9]+")?; /// let bytes = initial.to_bytes_native_endian(); @@ -1053,8 +1042,8 @@ impl<'a> DFA<&'a [u8]> { /// // directly from a compatible serialization routine. 
/// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 }; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` pub unsafe fn from_bytes_unchecked( @@ -1062,56 +1051,70 @@ impl<'a> DFA<&'a [u8]> { ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> { let mut nr = 0; - nr += bytes::read_label(&slice[nr..], LABEL)?; - nr += bytes::read_endianness_check(&slice[nr..])?; - nr += bytes::read_version(&slice[nr..], VERSION)?; + nr += wire::read_label(&slice[nr..], LABEL)?; + nr += wire::read_endianness_check(&slice[nr..])?; + nr += wire::read_version(&slice[nr..], VERSION)?; - let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?; + let _unused = wire::try_read_u32(&slice[nr..], "unused space")?; nr += size_of::(); - let (trans, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?; + let (flags, nread) = Flags::from_bytes(&slice[nr..])?; + nr += nread; + + let (tt, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?; nr += nread; - let (starts, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?; + let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?; nr += nread; let (special, nread) = Special::from_bytes(&slice[nr..])?; nr += nread; - if special.max.as_usize() >= trans.sparse().len() { + if special.max.as_usize() >= tt.sparse().len() { return Err(DeserializeError::generic( "max should not be greater than or equal to sparse bytes", )); } - Ok((DFA { trans, starts, special }, nr)) + let (quitset, nread) = ByteSet::from_bytes(&slice[nr..])?; + nr += nread; + + // Prefilters don't support serialization, so they're always absent. + let pre = None; + Ok((DFA { tt, st, special, pre, quitset, flags }, nr)) } } impl> fmt::Debug for DFA { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { writeln!(f, "sparse::DFA(")?; - for state in self.trans.states() { + for state in self.tt.states() { fmt_state_indicator(f, self, state.id())?; - writeln!(f, "{:06?}: {:?}", state.id(), state)?; + writeln!(f, "{:06?}: {:?}", state.id().as_usize(), state)?; } writeln!(f, "")?; - for (i, (start_id, sty, pid)) in self.starts.iter().enumerate() { - if i % self.starts.stride == 0 { - match pid { - None => writeln!(f, "START-GROUP(ALL)")?, - Some(pid) => { - writeln!(f, "START_GROUP(pattern: {:?})", pid)? - } + for (i, (start_id, anchored, sty)) in self.st.iter().enumerate() { + if i % self.st.stride == 0 { + match anchored { + Anchored::No => writeln!(f, "START-GROUP(unanchored)")?, + Anchored::Yes => writeln!(f, "START-GROUP(anchored)")?, + Anchored::Pattern(pid) => writeln!( + f, + "START_GROUP(pattern: {:?})", + pid.as_usize() + )?, } } writeln!(f, " {:?} => {:06?}", sty, start_id.as_usize())?; } - writeln!(f, "state count: {:?}", self.trans.count)?; + writeln!(f, "state length: {:?}", self.tt.state_len)?; + writeln!(f, "pattern length: {:?}", self.pattern_len())?; + writeln!(f, "flags: {:?}", self.flags)?; writeln!(f, ")")?; Ok(()) } } +// SAFETY: We assert that our implementation of each method is correct. unsafe impl> Automaton for DFA { #[inline] fn is_special_state(&self, id: StateID) -> bool { @@ -1145,10 +1148,10 @@ unsafe impl> Automaton for DFA { // This is marked as inline to help dramatically boost sparse searching, // which decodes each state it enters to follow the next transition. 
- #[inline(always)] + #[cfg_attr(feature = "perf-inline", inline(always))] fn next_state(&self, current: StateID, input: u8) -> StateID { - let input = self.trans.classes.get(input); - self.trans.state(current).next(input) + let input = self.tt.classes.get(input); + self.tt.state(current).next(input) } #[inline] @@ -1162,17 +1165,17 @@ unsafe impl> Automaton for DFA { #[inline] fn next_eoi_state(&self, current: StateID) -> StateID { - self.trans.state(current).next_eoi() + self.tt.state(current).next_eoi() } #[inline] - fn pattern_count(&self) -> usize { - self.trans.patterns + fn pattern_len(&self) -> usize { + self.tt.pattern_len } #[inline] - fn match_count(&self, id: StateID) -> usize { - self.trans.state(id).pattern_count() + fn match_len(&self, id: StateID) -> usize { + self.tt.state(id).pattern_len() } #[inline] @@ -1182,39 +1185,76 @@ unsafe impl> Automaton for DFA { // that finds the pattern ID from the state machine, which requires // a bit of slicing/pointer-chasing. This optimization tends to only // matter when matches are frequent. - if self.trans.patterns == 1 { + if self.tt.pattern_len == 1 { return PatternID::ZERO; } - self.trans.state(id).pattern_id(match_index) + self.tt.state(id).pattern_id(match_index) + } + + #[inline] + fn has_empty(&self) -> bool { + self.flags.has_empty + } + + #[inline] + fn is_utf8(&self) -> bool { + self.flags.is_utf8 + } + + #[inline] + fn is_always_start_anchored(&self) -> bool { + self.flags.is_always_start_anchored } #[inline] fn start_state_forward( &self, - pattern_id: Option, - bytes: &[u8], - start: usize, - end: usize, - ) -> StateID { - let index = Start::from_position_fwd(bytes, start, end); - self.starts.start(index, pattern_id) + input: &Input<'_>, + ) -> Result { + if !self.quitset.is_empty() && input.start() > 0 { + let offset = input.start() - 1; + let byte = input.haystack()[offset]; + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, offset)); + } + } + let start = self.st.start_map.fwd(&input); + self.st.start(input, start) } #[inline] fn start_state_reverse( &self, - pattern_id: Option, - bytes: &[u8], - start: usize, - end: usize, - ) -> StateID { - let index = Start::from_position_rev(bytes, start, end); - self.starts.start(index, pattern_id) + input: &Input<'_>, + ) -> Result { + if !self.quitset.is_empty() && input.end() < input.haystack().len() { + let offset = input.end(); + let byte = input.haystack()[offset]; + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, offset)); + } + } + let start = self.st.start_map.rev(&input); + self.st.start(input, start) + } + + #[inline] + fn universal_start_state(&self, mode: Anchored) -> Option { + match mode { + Anchored::No => self.st.universal_start_unanchored, + Anchored::Yes => self.st.universal_start_anchored, + Anchored::Pattern(_) => None, + } } #[inline] fn accelerator(&self, id: StateID) -> &[u8] { - self.trans.state(id).accelerator() + self.tt.state(id).accelerator() + } + + #[inline] + fn get_prefilter(&self) -> Option<&Prefilter> { + self.pre.as_ref() } } @@ -1278,43 +1318,38 @@ struct Transitions { /// least one state---the dead state---even the empty DFA. In particular, /// the dead state always has ID 0 and is correspondingly always the first /// state. The dead state is never a match state. - count: usize, + state_len: usize, /// The total number of unique patterns represented by these match states. 
- patterns: usize, + pattern_len: usize, } impl<'a> Transitions<&'a [u8]> { unsafe fn from_bytes_unchecked( mut slice: &'a [u8], ) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> { - let slice_start = slice.as_ptr() as usize; + let slice_start = slice.as_ptr().as_usize(); - let (state_count, nr) = - bytes::try_read_u32_as_usize(&slice, "state count")?; + let (state_len, nr) = + wire::try_read_u32_as_usize(&slice, "state length")?; slice = &slice[nr..]; - let (pattern_count, nr) = - bytes::try_read_u32_as_usize(&slice, "pattern count")?; + let (pattern_len, nr) = + wire::try_read_u32_as_usize(&slice, "pattern length")?; slice = &slice[nr..]; let (classes, nr) = ByteClasses::from_bytes(&slice)?; slice = &slice[nr..]; let (len, nr) = - bytes::try_read_u32_as_usize(&slice, "sparse transitions length")?; + wire::try_read_u32_as_usize(&slice, "sparse transitions length")?; slice = &slice[nr..]; - bytes::check_slice_len(slice, len, "sparse states byte length")?; + wire::check_slice_len(slice, len, "sparse states byte length")?; let sparse = &slice[..len]; slice = &slice[len..]; - let trans = Transitions { - sparse, - classes, - count: state_count, - patterns: pattern_count, - }; - Ok((trans, slice.as_ptr() as usize - slice_start)) + let trans = Transitions { sparse, classes, state_len, pattern_len }; + Ok((trans, slice.as_ptr().as_usize() - slice_start)) } } @@ -1334,12 +1369,12 @@ impl> Transitions { } dst = &mut dst[..nwrite]; - // write state count - E::write_u32(u32::try_from(self.count).unwrap(), dst); + // write state length + E::write_u32(u32::try_from(self.state_len).unwrap(), dst); dst = &mut dst[size_of::()..]; - // write pattern count - E::write_u32(u32::try_from(self.patterns).unwrap(), dst); + // write pattern length + E::write_u32(u32::try_from(self.pattern_len).unwrap(), dst); dst = &mut dst[size_of::()..]; // write byte class map @@ -1351,15 +1386,22 @@ impl> Transitions { dst = &mut dst[size_of::()..]; // write actual transitions - dst.copy_from_slice(self.sparse()); + let mut id = DEAD; + while id.as_usize() < self.sparse().len() { + let state = self.state(id); + let n = state.write_to::(&mut dst)?; + dst = &mut dst[n..]; + // The next ID is the offset immediately following `state`. + id = StateID::new(id.as_usize() + state.write_to_len()).unwrap(); + } Ok(nwrite) } /// Returns the number of bytes the serialized form of this transition /// table will use. fn write_to_len(&self) -> usize { - size_of::() // state count - + size_of::() // pattern count + size_of::() // state length + + size_of::() // pattern length + self.classes.write_to_len() + size_of::() // sparse transitions length + self.sparse().len() @@ -1369,7 +1411,7 @@ impl> Transitions { /// /// That is, every state ID can be used to correctly index a state in this /// table. - fn validate(&self) -> Result<(), DeserializeError> { + fn validate(&self, sp: &Special) -> Result<(), DeserializeError> { // In order to validate everything, we not only need to make sure we // can decode every state, but that every transition in every state // points to a valid state. There are many duplicative transitions, so @@ -1381,10 +1423,22 @@ impl> Transitions { // whether doing something more clever is worth it just yet. If you're // profiling this code and need it to run faster, please file an issue. // + // OK, so we also use this to record the set of valid state IDs. Since + // it is possible for a transition to point to an invalid state ID that + // still (somehow) deserializes to a valid state. 
So we need to make + // sure our transitions are limited to actually correct state IDs. + // The problem is, I'm not sure how to do this verification step in + // no-std no-alloc mode. I think we'd *have* to store the set of valid + // state IDs in the DFA itself. For now, we don't do this verification + // in no-std no-alloc mode. The worst thing that can happen is an + // incorrect result. But no panics or memory safety problems should + // result. Because we still do validate that the state itself is + // "valid" in the sense that everything it points to actually exists. + // // ---AG struct Seen { #[cfg(feature = "alloc")] - set: BTreeSet, + set: alloc::collections::BTreeSet, #[cfg(not(feature = "alloc"))] set: core::marker::PhantomData, } @@ -1392,7 +1446,7 @@ impl> Transitions { #[cfg(feature = "alloc")] impl Seen { fn new() -> Seen { - Seen { set: BTreeSet::new() } + Seen { set: alloc::collections::BTreeSet::new() } } fn insert(&mut self, id: StateID) { self.set.insert(id); @@ -1416,38 +1470,78 @@ impl> Transitions { let mut verified: Seen = Seen::new(); // We need to make sure that we decode the correct number of states. // Otherwise, an empty set of transitions would validate even if the - // recorded state count is non-empty. - let mut count = 0; + // recorded state length is non-empty. + let mut len = 0; // We can't use the self.states() iterator because it assumes the state // encodings are valid. It could panic if they aren't. let mut id = DEAD; while id.as_usize() < self.sparse().len() { - let state = self.try_state(id)?; + // Before we even decode the state, we check that the ID itself + // is well formed. That is, if it's a special state then it must + // actually be a quit, dead, accel, match or start state. + if sp.is_special_state(id) { + let is_actually_special = sp.is_dead_state(id) + || sp.is_quit_state(id) + || sp.is_match_state(id) + || sp.is_start_state(id) + || sp.is_accel_state(id); + if !is_actually_special { + // This is kind of a cryptic error message... + return Err(DeserializeError::generic( + "found sparse state tagged as special but \ + wasn't actually special", + )); + } + } + let state = self.try_state(sp, id)?; verified.insert(id); // The next ID should be the offset immediately following `state`. - id = StateID::new(bytes::add( + id = StateID::new(wire::add( id.as_usize(), - state.bytes_len(), + state.write_to_len(), "next state ID offset", )?) .map_err(|err| { DeserializeError::state_id_error(err, "next state ID offset") })?; - count += 1; - - // Now check that all transitions in this state are correct. + len += 1; + } + // Now that we've checked that all top-level states are correct and + // importantly, collected a set of valid state IDs, we have all the + // information we need to check that all transitions are correct too. + // + // Note that we can't use `valid_ids` to iterate because it will + // be empty in no-std no-alloc contexts. (And yes, that means our + // verification isn't quite as good.) We can use `self.states()` + // though at least, since we know that all states can at least be + // decoded and traversed correctly. + for state in self.states() { + // Check that all transitions in this state are correct. for i in 0..state.ntrans { let to = state.next_at(i); - if verified.contains(&to) { - continue; + // For no-alloc, we just check that the state can decode. 
It is + // technically possible that the state ID could still point to + // a non-existent state even if it decodes (fuzzing proved this + // to be true), but it shouldn't result in any memory unsafety + // or panics in non-debug mode. + #[cfg(not(feature = "alloc"))] + { + let _ = self.try_state(sp, to)?; + } + #[cfg(feature = "alloc")] + { + if !verified.contains(&to) { + return Err(DeserializeError::generic( + "found transition that points to a \ + non-existent state", + )); + } } - let _ = self.try_state(to)?; - verified.insert(id); } } - if count != self.count { + if len != self.state_len { return Err(DeserializeError::generic( - "mismatching sparse state count", + "mismatching sparse state length", )); } Ok(()) @@ -1458,19 +1552,19 @@ impl> Transitions { Transitions { sparse: self.sparse(), classes: self.classes.clone(), - count: self.count, - patterns: self.patterns, + state_len: self.state_len, + pattern_len: self.pattern_len, } } /// Converts these transitions to an owned value. #[cfg(feature = "alloc")] - fn to_owned(&self) -> Transitions> { + fn to_owned(&self) -> Transitions> { Transitions { sparse: self.sparse().to_vec(), classes: self.classes.clone(), - count: self.count, - patterns: self.patterns, + state_len: self.state_len, + pattern_len: self.pattern_len, } } @@ -1483,10 +1577,10 @@ impl> Transitions { /// functions involved are also inlined, which should hopefully eliminate /// a lot of the extraneous decoding that is never needed just to follow /// the next transition. - #[inline(always)] + #[cfg_attr(feature = "perf-inline", inline(always))] fn state(&self, id: StateID) -> State<'_> { let mut state = &self.sparse()[id.as_usize()..]; - let mut ntrans = bytes::read_u16(&state) as usize; + let mut ntrans = wire::read_u16(&state).as_usize(); let is_match = (1 << 15) & ntrans != 0; ntrans &= !(1 << 15); state = &state[2..]; @@ -1494,13 +1588,13 @@ impl> Transitions { let (input_ranges, state) = state.split_at(ntrans * 2); let (next, state) = state.split_at(ntrans * StateID::SIZE); let (pattern_ids, state) = if is_match { - let npats = bytes::read_u32(&state) as usize; + let npats = wire::read_u32(&state).as_usize(); state[4..].split_at(npats * 4) } else { (&[][..], state) }; - let accel_len = state[0] as usize; + let accel_len = usize::from(state[0]); let accel = &state[1..accel_len + 1]; State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel } } @@ -1513,27 +1607,44 @@ impl> Transitions { /// all of its data is consistent. It does not verify that its state ID /// transitions point to valid states themselves, nor does it verify that /// every pattern ID is valid. - fn try_state(&self, id: StateID) -> Result, DeserializeError> { + fn try_state( + &self, + sp: &Special, + id: StateID, + ) -> Result, DeserializeError> { if id.as_usize() > self.sparse().len() { - return Err(DeserializeError::generic("invalid sparse state ID")); + return Err(DeserializeError::generic( + "invalid caller provided sparse state ID", + )); } let mut state = &self.sparse()[id.as_usize()..]; // Encoding format starts with a u16 that stores the total number of // transitions in this state. 
let (mut ntrans, _) = - bytes::try_read_u16_as_usize(state, "state transition count")?; + wire::try_read_u16_as_usize(state, "state transition length")?; let is_match = ((1 << 15) & ntrans) != 0; ntrans &= !(1 << 15); state = &state[2..]; if ntrans > 257 || ntrans == 0 { - return Err(DeserializeError::generic("invalid transition count")); + return Err(DeserializeError::generic( + "invalid transition length", + )); + } + if is_match && !sp.is_match_state(id) { + return Err(DeserializeError::generic( + "state marked as match but not in match ID range", + )); + } else if !is_match && sp.is_match_state(id) { + return Err(DeserializeError::generic( + "state in match ID range but not marked as match state", + )); } // Each transition has two pieces: an inclusive range of bytes on which // it is defined, and the state ID that those bytes transition to. The // pairs come first, followed by a corresponding sequence of state IDs. let input_ranges_len = ntrans.checked_mul(2).unwrap(); - bytes::check_slice_len(state, input_ranges_len, "sparse byte pairs")?; + wire::check_slice_len(state, input_ranges_len, "sparse byte pairs")?; let (input_ranges, state) = state.split_at(input_ranges_len); // Every range should be of the form A-B, where A<=B. for pair in input_ranges.chunks(2) { @@ -1549,13 +1660,13 @@ impl> Transitions { let next_len = ntrans .checked_mul(self.id_len()) .expect("state size * #trans should always fit in a usize"); - bytes::check_slice_len(state, next_len, "sparse trans state IDs")?; + wire::check_slice_len(state, next_len, "sparse trans state IDs")?; let (next, state) = state.split_at(next_len); // We can at least verify that every state ID is in bounds. for idbytes in next.chunks(self.id_len()) { let (id, _) = - bytes::read_state_id(idbytes, "sparse state ID in try_state")?; - bytes::check_slice_len( + wire::read_state_id(idbytes, "sparse state ID in try_state")?; + wire::check_slice_len( self.sparse(), id.as_usize(), "invalid sparse state ID", @@ -1567,19 +1678,24 @@ impl> Transitions { // encoded 32-bit integers. 
let (pattern_ids, state) = if is_match { let (npats, nr) = - bytes::try_read_u32_as_usize(state, "pattern ID count")?; + wire::try_read_u32_as_usize(state, "pattern ID length")?; let state = &state[nr..]; + if npats == 0 { + return Err(DeserializeError::generic( + "state marked as a match, but has no pattern IDs", + )); + } let pattern_ids_len = - bytes::mul(npats, 4, "sparse pattern ID byte length")?; - bytes::check_slice_len( + wire::mul(npats, 4, "sparse pattern ID byte length")?; + wire::check_slice_len( state, pattern_ids_len, "sparse pattern IDs", )?; let (pattern_ids, state) = state.split_at(pattern_ids_len); for patbytes in pattern_ids.chunks(PatternID::SIZE) { - bytes::read_pattern_id( + wire::read_pattern_id( patbytes, "sparse pattern ID in try_state", )?; @@ -1597,21 +1713,30 @@ impl> Transitions { if state.is_empty() { return Err(DeserializeError::generic("no accelerator length")); } - let (accel_len, state) = (state[0] as usize, &state[1..]); + let (accel_len, state) = (usize::from(state[0]), &state[1..]); if accel_len > 3 { return Err(DeserializeError::generic( "sparse invalid accelerator length", )); + } else if accel_len == 0 && sp.is_accel_state(id) { + return Err(DeserializeError::generic( + "got no accelerators in state, but in accelerator ID range", + )); + } else if accel_len > 0 && !sp.is_accel_state(id) { + return Err(DeserializeError::generic( + "state in accelerator ID range, but has no accelerators", + )); } - bytes::check_slice_len( + + wire::check_slice_len( state, accel_len, "sparse corrupt accelerator length", )?; let (accel, _) = (&state[..accel_len], &state[accel_len..]); - Ok(State { + let state = State { id, is_match, ntrans, @@ -1619,7 +1744,13 @@ impl> Transitions { next, pattern_ids, accel, - }) + }; + if sp.is_quit_state(state.next_at(state.ntrans - 1)) { + return Err(DeserializeError::generic( + "state with EOI transition to quit state is illegal", + )); + } + Ok(state) } /// Return an iterator over all of the states in this DFA. @@ -1648,13 +1779,13 @@ impl> Transitions { } } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl> Transitions { /// Return a convenient mutable representation of the given state. /// This panics if the state is invalid. fn state_mut(&mut self, id: StateID) -> StateMut<'_> { let mut state = &mut self.sparse_mut()[id.as_usize()..]; - let mut ntrans = bytes::read_u16(&state) as usize; + let mut ntrans = wire::read_u16(&state).as_usize(); let is_match = (1 << 15) & ntrans != 0; ntrans &= !(1 << 15); state = &mut state[2..]; @@ -1662,13 +1793,13 @@ impl> Transitions { let (input_ranges, state) = state.split_at_mut(ntrans * 2); let (next, state) = state.split_at_mut(ntrans * StateID::SIZE); let (pattern_ids, state) = if is_match { - let npats = bytes::read_u32(&state) as usize; + let npats = wire::read_u32(&state).as_usize(); state[4..].split_at_mut(npats * 4) } else { (&mut [][..], state) }; - let accel_len = state[0] as usize; + let accel_len = usize::from(state[0]); let accel = &mut state[1..accel_len + 1]; StateMut { id, @@ -1702,53 +1833,85 @@ struct StartTable { /// In practice, T is either Vec or &[u8] and has no alignment /// requirements. /// - /// The first `stride` (currently always 4) entries always correspond to - /// the start states for the entire DFA. After that, there are - /// `stride * patterns` state IDs, where `patterns` may be zero in the - /// case of a DFA with no patterns or in the case where the DFA was built - /// without enabling starting states for each pattern. 
+ /// The first `2 * stride` (currently always 8) entries always correspond + /// to the starts states for the entire DFA, with the first 4 entries being + /// for unanchored searches and the second 4 entries being for anchored + /// searches. To keep things simple, we always use 8 entries even if the + /// `StartKind` is not both. + /// + /// After that, there are `stride * patterns` state IDs, where `patterns` + /// may be zero in the case of a DFA with no patterns or in the case where + /// the DFA was built without enabling starting states for each pattern. table: T, + /// The starting state configuration supported. When 'both', both + /// unanchored and anchored searches work. When 'unanchored', anchored + /// searches panic. When 'anchored', unanchored searches panic. + kind: StartKind, + /// The start state configuration for every possible byte. + start_map: StartByteMap, /// The number of starting state IDs per pattern. stride: usize, /// The total number of patterns for which starting states are encoded. - /// This may be zero for non-empty DFAs when the DFA was built without - /// start states for each pattern. - patterns: usize, + /// This is `None` for DFAs that were built without start states for each + /// pattern. Thus, one cannot use this field to say how many patterns + /// are in the DFA in all cases. It is specific to how many patterns are + /// represented in this start table. + pattern_len: Option, + /// The universal starting state for unanchored searches. This is only + /// present when the DFA supports unanchored searches and when all starting + /// state IDs for an unanchored search are equivalent. + universal_start_unanchored: Option, + /// The universal starting state for anchored searches. This is only + /// present when the DFA supports anchored searches and when all starting + /// state IDs for an anchored search are equivalent. + universal_start_anchored: Option, } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl StartTable> { - fn new(patterns: usize) -> StartTable> { - let stride = Start::count(); + fn new>( + dfa: &dense::DFA, + pattern_len: Option, + ) -> StartTable> { + let stride = Start::len(); // This is OK since the only way we're here is if a dense DFA could be // constructed successfully, which uses the same space. let len = stride - .checked_mul(patterns) + .checked_mul(pattern_len.unwrap_or(0)) .unwrap() - .checked_add(stride) + .checked_add(stride.checked_mul(2).unwrap()) .unwrap() .checked_mul(StateID::SIZE) .unwrap(); - StartTable { table: vec![0; len], stride, patterns } + StartTable { + table: vec![0; len], + kind: dfa.start_kind(), + start_map: dfa.start_map().clone(), + stride, + pattern_len, + universal_start_unanchored: dfa + .universal_start_state(Anchored::No), + universal_start_anchored: dfa.universal_start_state(Anchored::Yes), + } } fn from_dense_dfa>( dfa: &dense::DFA, remap: &[StateID], - ) -> Result>, Error> { + ) -> Result>, BuildError> { // Unless the DFA has start states compiled for each pattern, then // as far as the starting state table is concerned, there are zero // patterns to account for. It will instead only store starting states // for the entire DFA. 
- let start_pattern_count = if dfa.has_starts_for_each_pattern() { - dfa.pattern_count() + let start_pattern_len = if dfa.starts_for_each_pattern() { + Some(dfa.pattern_len()) } else { - 0 + None }; - let mut sl = StartTable::new(start_pattern_count); - for (old_start_id, sty, pid) in dfa.starts() { + let mut sl = StartTable::new(dfa, start_pattern_len); + for (old_start_id, anchored, sty) in dfa.starts() { let new_start_id = remap[dfa.to_index(old_start_id)]; - sl.set_start(sty, pid, new_start_id); + sl.set_start(anchored, sty, new_start_id); } Ok(sl) } @@ -1758,53 +1921,98 @@ impl<'a> StartTable<&'a [u8]> { unsafe fn from_bytes_unchecked( mut slice: &'a [u8], ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> { - let slice_start = slice.as_ptr() as usize; + let slice_start = slice.as_ptr().as_usize(); - let (stride, nr) = - bytes::try_read_u32_as_usize(slice, "sparse start table stride")?; + let (kind, nr) = StartKind::from_bytes(slice)?; slice = &slice[nr..]; - let (patterns, nr) = bytes::try_read_u32_as_usize( - slice, - "sparse start table patterns", - )?; + let (start_map, nr) = StartByteMap::from_bytes(slice)?; slice = &slice[nr..]; - if stride != Start::count() { + let (stride, nr) = + wire::try_read_u32_as_usize(slice, "sparse start table stride")?; + slice = &slice[nr..]; + if stride != Start::len() { return Err(DeserializeError::generic( "invalid sparse starting table stride", )); } - if patterns > PatternID::LIMIT { + + let (maybe_pattern_len, nr) = + wire::try_read_u32_as_usize(slice, "sparse start table patterns")?; + slice = &slice[nr..]; + let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX { + None + } else { + Some(maybe_pattern_len) + }; + if pattern_len.map_or(false, |len| len > PatternID::LIMIT) { return Err(DeserializeError::generic( "sparse invalid number of patterns", )); } - let pattern_table_size = - bytes::mul(stride, patterns, "sparse invalid pattern count")?; + + let (universal_unanchored, nr) = + wire::try_read_u32(slice, "universal unanchored start")?; + slice = &slice[nr..]; + let universal_start_unanchored = if universal_unanchored == u32::MAX { + None + } else { + Some(StateID::try_from(universal_unanchored).map_err(|e| { + DeserializeError::state_id_error( + e, + "universal unanchored start", + ) + })?) + }; + + let (universal_anchored, nr) = + wire::try_read_u32(slice, "universal anchored start")?; + slice = &slice[nr..]; + let universal_start_anchored = if universal_anchored == u32::MAX { + None + } else { + Some(StateID::try_from(universal_anchored).map_err(|e| { + DeserializeError::state_id_error(e, "universal anchored start") + })?) + }; + + let pattern_table_size = wire::mul( + stride, + pattern_len.unwrap_or(0), + "sparse invalid pattern length", + )?; // Our start states always start with a single stride of start states // for the entire automaton which permit it to match any pattern. What // follows it are an optional set of start states for each pattern. 
- let start_state_count = bytes::add( - stride, + let start_state_len = wire::add( + wire::mul(2, stride, "start state stride too big")?, pattern_table_size, "sparse invalid 'any' pattern starts size", )?; - let table_bytes_len = bytes::mul( - start_state_count, + let table_bytes_len = wire::mul( + start_state_len, StateID::SIZE, "sparse pattern table bytes length", )?; - bytes::check_slice_len( + wire::check_slice_len( slice, table_bytes_len, "sparse start ID table", )?; - let table_bytes = &slice[..table_bytes_len]; + let table = &slice[..table_bytes_len]; slice = &slice[table_bytes_len..]; - let sl = StartTable { table: table_bytes, stride, patterns }; - Ok((sl, slice.as_ptr() as usize - slice_start)) + let sl = StartTable { + table, + kind, + start_map, + stride, + pattern_len, + universal_start_unanchored, + universal_start_anchored, + }; + Ok((sl, slice.as_ptr().as_usize() - slice_start)) } } @@ -1821,22 +2029,51 @@ impl> StartTable { } dst = &mut dst[..nwrite]; + // write start kind + let nw = self.kind.write_to::(dst)?; + dst = &mut dst[nw..]; + // write start byte map + let nw = self.start_map.write_to(dst)?; + dst = &mut dst[nw..]; // write stride E::write_u32(u32::try_from(self.stride).unwrap(), dst); dst = &mut dst[size_of::()..]; - // write pattern count - E::write_u32(u32::try_from(self.patterns).unwrap(), dst); + // write pattern length + E::write_u32( + u32::try_from(self.pattern_len.unwrap_or(0xFFFF_FFFF)).unwrap(), + dst, + ); + dst = &mut dst[size_of::()..]; + // write universal start unanchored state id, u32::MAX if absent + E::write_u32( + self.universal_start_unanchored + .map_or(u32::MAX, |sid| sid.as_u32()), + dst, + ); + dst = &mut dst[size_of::()..]; + // write universal start anchored state id, u32::MAX if absent + E::write_u32( + self.universal_start_anchored.map_or(u32::MAX, |sid| sid.as_u32()), + dst, + ); dst = &mut dst[size_of::()..]; // write start IDs - dst.copy_from_slice(self.table()); + for (sid, _, _) in self.iter() { + E::write_u32(sid.as_u32(), dst); + dst = &mut dst[StateID::SIZE..]; + } Ok(nwrite) } /// Returns the number of bytes the serialized form of this transition /// table will use. fn write_to_len(&self) -> usize { - size_of::() // stride + self.kind.write_to_len() + + self.start_map.write_to_len() + + size_of::() // stride + size_of::() // # patterns + + size_of::() // universal unanchored start + + size_of::() // universal anchored start + self.table().len() } @@ -1846,10 +2083,29 @@ impl> StartTable { /// state in the DFA's sparse transitions. fn validate( &self, + sp: &Special, trans: &Transitions, ) -> Result<(), DeserializeError> { for (id, _, _) in self.iter() { - let _ = trans.try_state(id)?; + if sp.is_match_state(id) { + return Err(DeserializeError::generic( + "start states cannot be match states", + )); + } + // Confirm that the start state points to a valid state. + let state = trans.try_state(sp, id)?; + // And like for the transition table, confirm that the transitions + // on all start states themselves point to a valid state. + // + // It'd probably be better to integrate this validation with the + // transition table, or otherwise store a sorted sequence of all + // valid state IDs in the sparse DFA itself. That way, we could + // check that every pointer to a state corresponds precisely to a + // correct and valid state. 
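// A standalone sketch (illustrative only, not taken from the upstream source)
// of how a slot in the start table deserialized above is addressed: one
// stride of unanchored starts, then one stride of anchored starts, then one
// stride per pattern. `Mode` is a stand-in for the crate's `Anchored` type,
// and bounds checks are omitted for brevity.
enum Mode {
    Unanchored,
    Anchored,
    Pattern(usize),
}

fn start_slot(stride: usize, start_index: usize, mode: Mode) -> usize {
    match mode {
        Mode::Unanchored => start_index,
        Mode::Anchored => stride + start_index,
        Mode::Pattern(pid) => (2 * stride) + (stride * pid) + start_index,
    }
}
// For example, with a stride of 4, start type 1 and pattern 2:
// start_slot(4, 1, Mode::Pattern(2)) == 8 + 8 + 1 == 17, and the byte offset
// of that slot is 17 * StateID::SIZE.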
+ for i in 0..state.ntrans { + let to = state.next_at(i); + let _ = trans.try_state(sp, to)?; + } } Ok(()) } @@ -1858,18 +2114,26 @@ impl> StartTable { fn as_ref(&self) -> StartTable<&'_ [u8]> { StartTable { table: self.table(), + kind: self.kind, + start_map: self.start_map.clone(), stride: self.stride, - patterns: self.patterns, + pattern_len: self.pattern_len, + universal_start_unanchored: self.universal_start_unanchored, + universal_start_anchored: self.universal_start_anchored, } } /// Converts this start list to an owned value. #[cfg(feature = "alloc")] - fn to_owned(&self) -> StartTable> { + fn to_owned(&self) -> StartTable> { StartTable { table: self.table().to_vec(), + kind: self.kind, + start_map: self.start_map.clone(), stride: self.stride, - patterns: self.patterns, + pattern_len: self.pattern_len, + universal_start_unanchored: self.universal_start_unanchored, + universal_start_anchored: self.universal_start_anchored, } } @@ -1879,26 +2143,45 @@ impl> StartTable { /// starting state for the given pattern is returned. If this start table /// does not have individual starting states for each pattern, then this /// panics. - fn start(&self, index: Start, pattern_id: Option) -> StateID { - let start_index = index.as_usize(); - let index = match pattern_id { - None => start_index, - Some(pid) => { - let pid = pid.as_usize(); - assert!(pid < self.patterns, "invalid pattern ID {:?}", pid); - self.stride - .checked_mul(pid) - .unwrap() - .checked_add(self.stride) - .unwrap() - .checked_add(start_index) - .unwrap() + fn start( + &self, + input: &Input<'_>, + start: Start, + ) -> Result { + let start_index = start.as_usize(); + let mode = input.get_anchored(); + let index = match mode { + Anchored::No => { + if !self.kind.has_unanchored() { + return Err(MatchError::unsupported_anchored(mode)); + } + start_index + } + Anchored::Yes => { + if !self.kind.has_anchored() { + return Err(MatchError::unsupported_anchored(mode)); + } + self.stride + start_index + } + Anchored::Pattern(pid) => { + let len = match self.pattern_len { + None => { + return Err(MatchError::unsupported_anchored(mode)) + } + Some(len) => len, + }; + if pid.as_usize() >= len { + return Ok(DEAD); + } + (2 * self.stride) + + (self.stride * pid.as_usize()) + + start_index } }; let start = index * StateID::SIZE; // This OK since we're allowed to assume that the start table contains // valid StateIDs. - bytes::read_state_id_unchecked(&self.table()[start..]).0 + Ok(wire::read_state_id_unchecked(&self.table()[start..]).0) } /// Return an iterator over all start IDs in this table. @@ -1924,27 +2207,26 @@ impl> StartTable { } } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl> StartTable { /// Set the start state for the given index and pattern. /// /// If the pattern ID or state ID are not valid, then this will panic. 
- fn set_start( - &mut self, - index: Start, - pattern_id: Option, - id: StateID, - ) { - let start_index = index.as_usize(); - let index = match pattern_id { - None => start_index, - Some(pid) => { + fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) { + let start_index = start.as_usize(); + let index = match anchored { + Anchored::No => start_index, + Anchored::Yes => self.stride + start_index, + Anchored::Pattern(pid) => { let pid = pid.as_usize(); - assert!(pid < self.patterns, "invalid pattern ID {:?}", pid); + let len = self + .pattern_len + .expect("start states for each pattern enabled"); + assert!(pid < len, "invalid pattern ID {:?}", pid); self.stride .checked_mul(pid) .unwrap() - .checked_add(self.stride) + .checked_add(self.stride.checked_mul(2).unwrap()) .unwrap() .checked_add(start_index) .unwrap() @@ -1952,7 +2234,7 @@ impl> StartTable { }; let start = index * StateID::SIZE; let end = start + StateID::SIZE; - bytes::write_state_id::( + wire::write_state_id::( id, &mut self.table.as_mut()[start..end], ); @@ -1966,9 +2248,9 @@ struct StartStateIter<'a, T> { } impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> { - type Item = (StateID, Start, Option); + type Item = (StateID, Anchored, Start); - fn next(&mut self) -> Option<(StateID, Start, Option)> { + fn next(&mut self) -> Option<(StateID, Anchored, Start)> { let i = self.i; if i >= self.st.len() { return None; @@ -1978,18 +2260,13 @@ impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> { // This unwrap is okay since the stride of any DFA must always match // the number of start state types. let start_type = Start::from_usize(i % self.st.stride).unwrap(); - let pid = if i < self.st.stride { - // This means we don't have start states for each pattern. - None + let anchored = if i < self.st.stride { + Anchored::No + } else if i < (2 * self.st.stride) { + Anchored::Yes } else { - // These unwraps are OK since we may assume our table and stride - // is correct. - let pid = i - .checked_sub(self.st.stride) - .unwrap() - .checked_div(self.st.stride) - .unwrap(); - Some(PatternID::new(pid).unwrap()) + let pid = (i - (2 * self.st.stride)) / self.st.stride; + Anchored::Pattern(PatternID::new(pid).unwrap()) }; let start = i * StateID::SIZE; let end = start + StateID::SIZE; @@ -1997,7 +2274,7 @@ impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> { // This is OK since we're allowed to assume that any IDs in this start // table are correct and valid for this DFA. let id = StateID::from_ne_bytes_unchecked(bytes); - Some((id, start_type, pid)) + Some((id, anchored, start_type)) } } @@ -2024,7 +2301,7 @@ impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> { return None; } let state = self.trans.state(StateID::new_unchecked(self.id)); - self.id = self.id + state.bytes_len(); + self.id = self.id + state.write_to_len(); Some(state) } } @@ -2071,7 +2348,7 @@ impl<'a> State<'a> { /// /// This is marked as inline to help dramatically boost sparse searching, /// which decodes each state it enters to follow the next transition. - #[inline(always)] + #[cfg_attr(feature = "perf-inline", inline(always))] fn next(&self, input: u8) -> StateID { // This straight linear search was observed to be much better than // binary search on ASCII haystacks, likely because a binary search @@ -2120,19 +2397,66 @@ impl<'a> State<'a> { /// is invalid, then this panics. 
fn pattern_id(&self, match_index: usize) -> PatternID { let start = match_index * PatternID::SIZE; - bytes::read_pattern_id_unchecked(&self.pattern_ids[start..]).0 + wire::read_pattern_id_unchecked(&self.pattern_ids[start..]).0 } /// Returns the total number of pattern IDs for this state. This is always /// zero when `is_match` is false. - fn pattern_count(&self) -> usize { + fn pattern_len(&self) -> usize { assert_eq!(0, self.pattern_ids.len() % 4); self.pattern_ids.len() / 4 } + /// Return an accelerator for this state. + fn accelerator(&self) -> &'a [u8] { + self.accel + } + + /// Write the raw representation of this state to the given buffer using + /// the given endianness. + fn write_to( + &self, + mut dst: &mut [u8], + ) -> Result { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small( + "sparse state transitions", + )); + } + + let ntrans = + if self.is_match { self.ntrans | (1 << 15) } else { self.ntrans }; + E::write_u16(u16::try_from(ntrans).unwrap(), dst); + dst = &mut dst[size_of::()..]; + + dst[..self.input_ranges.len()].copy_from_slice(self.input_ranges); + dst = &mut dst[self.input_ranges.len()..]; + + for i in 0..self.ntrans { + E::write_u32(self.next_at(i).as_u32(), dst); + dst = &mut dst[StateID::SIZE..]; + } + + if self.is_match { + E::write_u32(u32::try_from(self.pattern_len()).unwrap(), dst); + dst = &mut dst[size_of::()..]; + for i in 0..self.pattern_len() { + let pid = self.pattern_id(i); + E::write_u32(pid.as_u32(), dst); + dst = &mut dst[PatternID::SIZE..]; + } + } + + dst[0] = u8::try_from(self.accel.len()).unwrap(); + dst[1..][..self.accel.len()].copy_from_slice(self.accel); + + Ok(nwrite) + } + /// Return the total number of bytes that this state consumes in its /// encoded form. - fn bytes_len(&self) -> usize { + fn write_to_len(&self) -> usize { let mut len = 2 + (self.ntrans * 2) + (self.ntrans * StateID::SIZE) @@ -2142,11 +2466,6 @@ impl<'a> State<'a> { } len } - - /// Return an accelerator for this state. - fn accelerator(&self) -> &'a [u8] { - self.accel - } } impl<'a> fmt::Debug for State<'a> { @@ -2163,14 +2482,14 @@ impl<'a> fmt::Debug for State<'a> { } let (start, end) = self.range(i); if start == end { - write!(f, "{:?} => {:?}", DebugByte(start), next)?; + write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())?; } else { write!( f, "{:?}-{:?} => {:?}", DebugByte(start), DebugByte(end), - next, + next.as_usize(), )?; } printed = true; @@ -2180,7 +2499,7 @@ impl<'a> fmt::Debug for State<'a> { if printed { write!(f, ", ")?; } - write!(f, "EOI => {:?}", eoi)?; + write!(f, "EOI => {:?}", eoi.as_usize())?; } Ok(()) } @@ -2188,7 +2507,7 @@ impl<'a> fmt::Debug for State<'a> { /// A representation of a mutable sparse DFA state that can be cheaply /// materialized from a state identifier. -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] struct StateMut<'a> { /// The identifier of this state. id: StateID, @@ -2216,17 +2535,17 @@ struct StateMut<'a> { accel: &'a mut [u8], } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl<'a> StateMut<'a> { /// Sets the ith transition to the given state. 
fn set_next_at(&mut self, i: usize, next: StateID) { let start = i * StateID::SIZE; let end = start + StateID::SIZE; - bytes::write_state_id::(next, &mut self.next[start..end]); + wire::write_state_id::(next, &mut self.next[start..end]); } } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl<'a> fmt::Debug for StateMut<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let state = State { @@ -2242,6 +2561,7 @@ impl<'a> fmt::Debug for StateMut<'a> { } } +/* /// A binary search routine specialized specifically to a sparse DFA state's /// transitions. Specifically, the transitions are defined as a set of pairs /// of input bytes that delineate an inclusive range of bytes. If the input @@ -2261,8 +2581,7 @@ impl<'a> fmt::Debug for StateMut<'a> { /// guaranteed to be safe and is thus UB (since I don't think the in-memory /// representation of `(u8, u8)` has been nailed down). One could define a /// repr(C) type, but the casting doesn't seem justified. -#[allow(dead_code)] -#[inline(always)] +#[cfg_attr(feature = "perf-inline", inline(always))] fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option { debug_assert!(ranges.len() % 2 == 0, "ranges must have even length"); debug_assert!(ranges.len() <= 512, "ranges should be short"); @@ -2281,3 +2600,57 @@ fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option { } None } +*/ + +#[cfg(all(test, feature = "syntax", feature = "dfa-build"))] +mod tests { + use crate::{ + dfa::{dense::DFA, Automaton}, + nfa::thompson, + Input, MatchError, + }; + + // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs. + #[test] + fn heuristic_unicode_forward() { + let dfa = DFA::builder() + .configure(DFA::config().unicode_word_boundary(true)) + .thompson(thompson::Config::new().reverse(true)) + .build(r"\b[0-9]+\b") + .unwrap() + .to_sparse() + .unwrap(); + + let input = Input::new("β123").range(2..); + let expected = MatchError::quit(0xB2, 1); + let got = dfa.try_search_fwd(&input); + assert_eq!(Err(expected), got); + + let input = Input::new("123β").range(..3); + let expected = MatchError::quit(0xCE, 3); + let got = dfa.try_search_fwd(&input); + assert_eq!(Err(expected), got); + } + + // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs. + #[test] + fn heuristic_unicode_reverse() { + let dfa = DFA::builder() + .configure(DFA::config().unicode_word_boundary(true)) + .thompson(thompson::Config::new().reverse(true)) + .build(r"\b[0-9]+\b") + .unwrap() + .to_sparse() + .unwrap(); + + let input = Input::new("β123").range(2..); + let expected = MatchError::quit(0xB2, 1); + let got = dfa.try_search_rev(&input); + assert_eq!(Err(expected), got); + + let input = Input::new("123β").range(..3); + let expected = MatchError::quit(0xCE, 3); + let got = dfa.try_search_rev(&input); + assert_eq!(Err(expected), got); + } +} -- cgit v1.2.3
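// A closing standalone sketch (illustrative only, not taken from the upstream
// source) of the per-state wire format written by `State::write_to` earlier
// in this patch, expressed as a size computation. The parameter names are
// stand-ins for the corresponding `State` fields, and state/pattern IDs are
// assumed to serialize as 4 bytes each.
fn sparse_state_encoded_len(
    ntrans: usize,
    npatterns: usize,
    naccel: usize,
    is_match: bool,
) -> usize {
    // pattern count (u32) plus the pattern IDs, present only for match states
    let match_part = if is_match { 4 + (npatterns * 4) } else { 0 };
    2                   // u16 transition count, with bit 15 flagging a match state
        + (ntrans * 2)  // inclusive (start, end) byte range per transition
        + (ntrans * 4)  // one state ID per transition
        + match_part
        + 1             // accelerator length byte
        + naccel        // accelerator bytes
}
// For example, a non-match state with 3 transitions and no accelerator bytes
// encodes to 2 + 6 + 12 + 0 + 1 + 0 = 21 bytes.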