diff options
Diffstat (limited to 'vendor/regex-automata/src/util/start.rs')
-rw-r--r-- | vendor/regex-automata/src/util/start.rs | 341 |
1 files changed, 269 insertions, 72 deletions
diff --git a/vendor/regex-automata/src/util/start.rs b/vendor/regex-automata/src/util/start.rs index 3c756fc26..4e360d083 100644 --- a/vendor/regex-automata/src/util/start.rs +++ b/vendor/regex-automata/src/util/start.rs @@ -1,21 +1,186 @@ -/// Represents the four possible starting configurations of a DFA search. +/*! +Provides some helpers for dealing with start state configurations in DFAs. + +[`Start`] represents the possible starting configurations, while +[`StartByteMap`] represents a way to retrieve the `Start` configuration for a +given position in a haystack. +*/ + +use crate::util::{ + look::LookMatcher, + search::Input, + wire::{self, DeserializeError, SerializeError}, +}; + +/// A map from every possible byte value to its corresponding starting +/// configuration. /// -/// The starting configuration is determined by inspecting the the beginning of -/// the haystack (up to 1 byte). Ultimately, this along with a pattern ID (if -/// specified) is what selects the start state to use in a DFA. +/// This map is used in order to lookup the start configuration for a particular +/// position in a haystack. This start configuration is then used in +/// combination with things like the anchored mode and pattern ID to fully +/// determine the start state. /// -/// In a DFA that doesn't have starting states for each pattern, then it will -/// have a maximum of four DFA start states. If the DFA was compiled with start -/// states for each pattern, then it will have a maximum of four DFA start -/// states for searching for any pattern, and then another maximum of four DFA -/// start states for executing an anchored search for each pattern. +/// Generally speaking, this map is only used for fully compiled DFAs and lazy +/// DFAs. For NFAs (including the one-pass DFA), the start state is generally +/// selected by virtue of traversing the NFA state graph. DFAs do the same +/// thing, but at build time and not search time. (Well, technically the lazy +/// DFA does it at search time, but it does enough work to cache the full +/// result of the epsilon closure that the NFA engines tend to need to do.) +#[derive(Clone)] +pub(crate) struct StartByteMap { + map: [Start; 256], +} + +impl StartByteMap { + /// Create a new map from byte values to their corresponding starting + /// configurations. The map is determined, in part, by how look-around + /// assertions are matched via the matcher given. + pub(crate) fn new(lookm: &LookMatcher) -> StartByteMap { + let mut map = [Start::NonWordByte; 256]; + map[usize::from(b'\n')] = Start::LineLF; + map[usize::from(b'\r')] = Start::LineCR; + map[usize::from(b'_')] = Start::WordByte; + + let mut byte = b'0'; + while byte <= b'9' { + map[usize::from(byte)] = Start::WordByte; + byte += 1; + } + byte = b'A'; + while byte <= b'Z' { + map[usize::from(byte)] = Start::WordByte; + byte += 1; + } + byte = b'a'; + while byte <= b'z' { + map[usize::from(byte)] = Start::WordByte; + byte += 1; + } + + let lineterm = lookm.get_line_terminator(); + // If our line terminator is normal, then it is already handled by + // the LineLF and LineCR configurations. But if it's weird, then we + // overwrite whatever was there before for that terminator with a + // special configuration. The trick here is that if the terminator + // is, say, a word byte like `a`, then callers seeing this start + // configuration need to account for that and build their DFA state as + // if it *also* came from a word byte. + if lineterm != b'\r' && lineterm != b'\n' { + map[usize::from(lineterm)] = Start::CustomLineTerminator; + } + StartByteMap { map } + } + + /// Return the forward starting configuration for the given `input`. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn fwd(&self, input: &Input) -> Start { + match input + .start() + .checked_sub(1) + .and_then(|i| input.haystack().get(i)) + { + None => Start::Text, + Some(&byte) => self.get(byte), + } + } + + /// Return the reverse starting configuration for the given `input`. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn rev(&self, input: &Input) -> Start { + match input.haystack().get(input.end()) { + None => Start::Text, + Some(&byte) => self.get(byte), + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn get(&self, byte: u8) -> Start { + self.map[usize::from(byte)] + } + + /// Deserializes a byte class map from the given slice. If the slice is of + /// insufficient length or otherwise contains an impossible mapping, then + /// an error is returned. Upon success, the number of bytes read along with + /// the map are returned. The number of bytes read is always a multiple of + /// 8. + pub(crate) fn from_bytes( + slice: &[u8], + ) -> Result<(StartByteMap, usize), DeserializeError> { + wire::check_slice_len(slice, 256, "start byte map")?; + let mut map = [Start::NonWordByte; 256]; + for (i, &repr) in slice[..256].iter().enumerate() { + map[i] = match Start::from_usize(usize::from(repr)) { + Some(start) => start, + None => { + return Err(DeserializeError::generic( + "found invalid starting configuration", + )) + } + }; + } + Ok((StartByteMap { map }, 256)) + } + + /// Writes this map to the given byte buffer. if the given buffer is too + /// small, then an error is returned. Upon success, the total number of + /// bytes written is returned. The number of bytes written is guaranteed to + /// be a multiple of 8. + pub(crate) fn write_to( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("start byte map")); + } + for (i, &start) in self.map.iter().enumerate() { + dst[i] = start.as_u8(); + } + Ok(nwrite) + } + + /// Returns the total number of bytes written by `write_to`. + pub(crate) fn write_to_len(&self) -> usize { + 256 + } +} + +impl core::fmt::Debug for StartByteMap { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use crate::util::escape::DebugByte; + + write!(f, "StartByteMap{{")?; + for byte in 0..=255 { + if byte > 0 { + write!(f, ", ")?; + } + let start = self.map[usize::from(byte)]; + write!(f, "{:?} => {:?}", DebugByte(byte), start)?; + } + write!(f, "}}")?; + Ok(()) + } +} + +/// Represents the six possible starting configurations of a DFA search. +/// +/// The starting configuration is determined by inspecting the the beginning +/// of the haystack (up to 1 byte). Ultimately, this along with a pattern ID +/// (if specified) and the type of search (anchored or not) is what selects the +/// start state to use in a DFA. /// -/// This ends up being represented as a table in the DFA (whether lazy or fully -/// built) where the stride of that table is 4, and each entry is an index into -/// the state transition table. Note though that multiple entries in the table -/// might point to the same state if the states would otherwise be equivalent. -/// (This is guaranteed by DFA minimization and may even be accomplished by -/// normal determinization, since it attempts to reuse equivalent states too.) +/// As one example, if a DFA only supports unanchored searches and does not +/// support anchored searches for each pattern, then it will have at most 6 +/// distinct start states. (Some start states may be reused if determinization +/// can determine that they will be equivalent.) If the DFA supports both +/// anchored and unanchored searches, then it will have a maximum of 12 +/// distinct start states. Finally, if the DFA also supports anchored searches +/// for each pattern, then it can have up to `12 + (N * 6)` start states, where +/// `N` is the number of patterns. +/// +/// Handling each of these starting configurations in the context of DFA +/// determinization can be *quite* tricky and subtle. But the code is small +/// and can be found at `crate::util::determinize::set_lookbehind_from_start`. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub(crate) enum Start { /// This occurs when the starting position is not any of the ones below. @@ -28,7 +193,20 @@ pub(crate) enum Start { Text = 2, /// This occurs when the byte immediately preceding the start of the search /// is a line terminator. Specifically, `\n`. - Line = 3, + LineLF = 3, + /// This occurs when the byte immediately preceding the start of the search + /// is a line terminator. Specifically, `\r`. + LineCR = 4, + /// This occurs when a custom line terminator has been set via a + /// `LookMatcher`, and when that line terminator is neither a `\r` or a + /// `\n`. + /// + /// If the custom line terminator is a word byte, then this start + /// configuration is still selected. DFAs that implement word boundary + /// assertions will likely need to check whether the custom line terminator + /// is a word byte, in which case, it should behave as if the byte + /// satisfies `\b` in addition to multi-line anchors. + CustomLineTerminator = 5, } impl Start { @@ -39,71 +217,90 @@ impl Start { 0 => Some(Start::NonWordByte), 1 => Some(Start::WordByte), 2 => Some(Start::Text), - 3 => Some(Start::Line), + 3 => Some(Start::LineLF), + 4 => Some(Start::LineCR), + 5 => Some(Start::CustomLineTerminator), _ => None, } } /// Returns the total number of starting state configurations. - pub(crate) fn count() -> usize { - 4 - } - - /// Returns the starting state configuration for the given search - /// parameters. If the given offset range is not valid, then this panics. - #[inline(always)] - pub(crate) fn from_position_fwd( - bytes: &[u8], - start: usize, - end: usize, - ) -> Start { - assert!( - bytes.get(start..end).is_some(), - "{}..{} is invalid", - start, - end - ); - if start == 0 { - Start::Text - } else if bytes[start - 1] == b'\n' { - Start::Line - } else if crate::util::is_word_byte(bytes[start - 1]) { - Start::WordByte - } else { - Start::NonWordByte - } + pub(crate) fn len() -> usize { + 6 } - /// Returns the starting state configuration for a reverse search with the - /// given search parameters. If the given offset range is not valid, then - /// this panics. - #[inline(always)] - pub(crate) fn from_position_rev( - bytes: &[u8], - start: usize, - end: usize, - ) -> Start { - assert!( - bytes.get(start..end).is_some(), - "{}..{} is invalid", - start, - end - ); - if end == bytes.len() { - Start::Text - } else if bytes[end] == b'\n' { - Start::Line - } else if crate::util::is_word_byte(bytes[end]) { - Start::WordByte - } else { - Start::NonWordByte - } + /// Return this starting configuration as `u8` integer. It is guaranteed to + /// be less than `Start::len()`. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn as_u8(&self) -> u8 { + // AFAIK, 'as' is the only way to zero-cost convert an int enum to an + // actual int. + *self as u8 } - /// Return this starting configuration as an integer. It is guaranteed to - /// be less than `Start::count()`. - #[inline(always)] + /// Return this starting configuration as a `usize` integer. It is + /// guaranteed to be less than `Start::len()`. + #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn as_usize(&self) -> usize { - *self as usize + usize::from(self.as_u8()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn start_fwd_done_range() { + let smap = StartByteMap::new(&LookMatcher::default()); + assert_eq!(Start::Text, smap.fwd(&Input::new("").range(1..0))); + } + + #[test] + fn start_rev_done_range() { + let smap = StartByteMap::new(&LookMatcher::default()); + assert_eq!(Start::Text, smap.rev(&Input::new("").range(1..0))); + } + + #[test] + fn start_fwd() { + let f = |haystack, start, end| { + let smap = StartByteMap::new(&LookMatcher::default()); + let input = &Input::new(haystack).range(start..end); + smap.fwd(input) + }; + + assert_eq!(Start::Text, f("", 0, 0)); + assert_eq!(Start::Text, f("abc", 0, 3)); + assert_eq!(Start::Text, f("\nabc", 0, 3)); + + assert_eq!(Start::LineLF, f("\nabc", 1, 3)); + + assert_eq!(Start::LineCR, f("\rabc", 1, 3)); + + assert_eq!(Start::WordByte, f("abc", 1, 3)); + + assert_eq!(Start::NonWordByte, f(" abc", 1, 3)); + } + + #[test] + fn start_rev() { + let f = |haystack, start, end| { + let smap = StartByteMap::new(&LookMatcher::default()); + let input = &Input::new(haystack).range(start..end); + smap.rev(input) + }; + + assert_eq!(Start::Text, f("", 0, 0)); + assert_eq!(Start::Text, f("abc", 0, 3)); + assert_eq!(Start::Text, f("abc\n", 0, 4)); + + assert_eq!(Start::LineLF, f("abc\nz", 0, 3)); + + assert_eq!(Start::LineCR, f("abc\rz", 0, 3)); + + assert_eq!(Start::WordByte, f("abc", 0, 2)); + + assert_eq!(Start::NonWordByte, f("abc ", 0, 3)); } } |