1 files changed, 269 insertions, 72 deletions
diff --git a/vendor/regex-automata/src/util/start.rs b/vendor/regex-automata/src/util/start.rs
index 3c756fc26..4e360d083 100644
--- a/vendor/regex-automata/src/util/start.rs
+++ b/vendor/regex-automata/src/util/start.rs
@@ -1,21 +1,186 @@
-/// Represents the four possible starting configurations of a DFA search.
+/*!
+Provides some helpers for dealing with start state configurations in DFAs.
+
+[`Start`] represents the possible starting configurations, while
+[`StartByteMap`] represents a way to retrieve the `Start` configuration for a
+given position in a haystack.
+*/
+
+use crate::util::{
+    look::LookMatcher,
+    search::Input,
+    wire::{self, DeserializeError, SerializeError},
+};
+
+/// A map from every possible byte value to its corresponding starting
+/// configuration.
 ///
-/// The starting configuration is determined by inspecting the the beginning of
-/// the haystack (up to 1 byte). Ultimately, this along with a pattern ID (if
-/// specified) is what selects the start state to use in a DFA.
+/// This map is used in order to lookup the start configuration for a particular
+/// position in a haystack. This start configuration is then used in
+/// combination with things like the anchored mode and pattern ID to fully
+/// determine the start state.
 ///
-/// In a DFA that doesn't have starting states for each pattern, then it will
-/// have a maximum of four DFA start states. If the DFA was compiled with start
-/// states for each pattern, then it will have a maximum of four DFA start
-/// states for searching for any pattern, and then another maximum of four DFA
-/// start states for executing an anchored search for each pattern.
+/// Generally speaking, this map is only used for fully compiled DFAs and lazy
+/// DFAs. For NFAs (including the one-pass DFA), the start state is generally
+/// selected by virtue of traversing the NFA state graph. DFAs do the same
+/// thing, but at build time and not search time. (Well, technically the lazy
+/// DFA does it at search time, but it does enough work to cache the full
+/// result of the epsilon closure that the NFA engines tend to need to do.)
+#[derive(Clone)]
+pub(crate) struct StartByteMap {
+    map: [Start; 256],
+}
+
+impl StartByteMap {
+    /// Create a new map from byte values to their corresponding starting
+    /// configurations. The map is determined, in part, by how look-around
+    /// assertions are matched via the matcher given.
+    pub(crate) fn new(lookm: &LookMatcher) -> StartByteMap {
+        let mut map = [Start::NonWordByte; 256];
+        map[usize::from(b'\n')] = Start::LineLF;
+        map[usize::from(b'\r')] = Start::LineCR;
+        map[usize::from(b'_')] = Start::WordByte;
+
+        let mut byte = b'0';
+        while byte <= b'9' {
+            map[usize::from(byte)] = Start::WordByte;
+            byte += 1;
+        }
+        byte = b'A';
+        while byte <= b'Z' {
+            map[usize::from(byte)] = Start::WordByte;
+            byte += 1;
+        }
+        byte = b'a';
+        while byte <= b'z' {
+            map[usize::from(byte)] = Start::WordByte;
+            byte += 1;
+        }
+
+        let lineterm = lookm.get_line_terminator();
+        // If our line terminator is normal, then it is already handled by
+        // the LineLF and LineCR configurations. But if it's weird, then we
+        // overwrite whatever was there before for that terminator with a
+        // special configuration. The trick here is that if the terminator
+        // is, say, a word byte like `a`, then callers seeing this start
+        // configuration need to account for that and build their DFA state as
+        // if it *also* came from a word byte.
+        if lineterm != b'\r' && lineterm != b'\n' {
+            map[usize::from(lineterm)] = Start::CustomLineTerminator;
+        }
+        StartByteMap { map }
+    }
+
+    /// Return the forward starting configuration for the given `input`.
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    pub(crate) fn fwd(&self, input: &Input) -> Start {
+        match input
+            .start()
+            .checked_sub(1)
+            .and_then(|i| input.haystack().get(i))
+        {
+            None => Start::Text,
+            Some(&byte) => self.get(byte),
+        }
+    }
+
+    /// Return the reverse starting configuration for the given `input`.
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    pub(crate) fn rev(&self, input: &Input) -> Start {
+        match input.haystack().get(input.end()) {
+            None => Start::Text,
+            Some(&byte) => self.get(byte),
+        }
+    }
+
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn get(&self, byte: u8) -> Start {
+        self.map[usize::from(byte)]
+    }
+
+    /// Deserializes a byte class map from the given slice. If the slice is of
+    /// insufficient length or otherwise contains an impossible mapping, then
+    /// an error is returned. Upon success, the number of bytes read along with
+    /// the map are returned. The number of bytes read is always a multiple of
+    /// 8.
+    pub(crate) fn from_bytes(
+        slice: &[u8],
+    ) -> Result<(StartByteMap, usize), DeserializeError> {
+        wire::check_slice_len(slice, 256, "start byte map")?;
+        let mut map = [Start::NonWordByte; 256];
+        for (i, &repr) in slice[..256].iter().enumerate() {
+            map[i] = match Start::from_usize(usize::from(repr)) {
+                Some(start) => start,
+                None => {
+                    return Err(DeserializeError::generic(
+                        "found invalid starting configuration",
+                    ))
+                }
+            };
+        }
+        Ok((StartByteMap { map }, 256))
+    }
+
+    /// Writes this map to the given byte buffer. if the given buffer is too
+    /// small, then an error is returned. Upon success, the total number of
+    /// bytes written is returned. The number of bytes written is guaranteed to
+    /// be a multiple of 8.
+    pub(crate) fn write_to(
+        &self,
+        dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        let nwrite = self.write_to_len();
+        if dst.len() < nwrite {
+            return Err(SerializeError::buffer_too_small("start byte map"));
+        }
+        for (i, &start) in self.map.iter().enumerate() {
+            dst[i] = start.as_u8();
+        }
+        Ok(nwrite)
+    }
+
+    /// Returns the total number of bytes written by `write_to`.
+    pub(crate) fn write_to_len(&self) -> usize {
+        256
+    }
+}
+
+impl core::fmt::Debug for StartByteMap {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        use crate::util::escape::DebugByte;
+
+        write!(f, "StartByteMap{{")?;
+        for byte in 0..=255 {
+            if byte > 0 {
+                write!(f, ", ")?;
+            }
+            let start = self.map[usize::from(byte)];
+            write!(f, "{:?} => {:?}", DebugByte(byte), start)?;
+        }
+        write!(f, "}}")?;
+        Ok(())
+    }
+}
+
+/// Represents the six possible starting configurations of a DFA search.
+///
+/// The starting configuration is determined by inspecting the the beginning
+/// of the haystack (up to 1 byte). Ultimately, this along with a pattern ID
+/// (if specified) and the type of search (anchored or not) is what selects the
+/// start state to use in a DFA.
 ///
-/// This ends up being represented as a table in the DFA (whether lazy or fully
-/// built) where the stride of that table is 4, and each entry is an index into
-/// the state transition table. Note though that multiple entries in the table
-/// might point to the same state if the states would otherwise be equivalent.
-/// (This is guaranteed by DFA minimization and may even be accomplished by
-/// normal determinization, since it attempts to reuse equivalent states too.)
+/// As one example, if a DFA only supports unanchored searches and does not
+/// support anchored searches for each pattern, then it will have at most 6
+/// distinct start states. (Some start states may be reused if determinization
+/// can determine that they will be equivalent.) If the DFA supports both
+/// anchored and unanchored searches, then it will have a maximum of 12
+/// distinct start states. Finally, if the DFA also supports anchored searches
+/// for each pattern, then it can have up to `12 + (N * 6)` start states, where
+/// `N` is the number of patterns.
+///
+/// Handling each of these starting configurations in the context of DFA
+/// determinization can be *quite* tricky and subtle. But the code is small
+/// and can be found at `crate::util::determinize::set_lookbehind_from_start`.
 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
 pub(crate) enum Start {
     /// This occurs when the starting position is not any of the ones below.
@@ -28,7 +193,20 @@ pub(crate) enum Start {
     Text = 2,
     /// This occurs when the byte immediately preceding the start of the search
     /// is a line terminator. Specifically, `\n`.
-    Line = 3,
+    LineLF = 3,
+    /// This occurs when the byte immediately preceding the start of the search
+    /// is a line terminator. Specifically, `\r`.
+    LineCR = 4,
+    /// This occurs when a custom line terminator has been set via a
+    /// `LookMatcher`, and when that line terminator is neither a `\r` or a
+    /// `\n`.
+    ///
+    /// If the custom line terminator is a word byte, then this start
+    /// configuration is still selected. DFAs that implement word boundary
+    /// assertions will likely need to check whether the custom line terminator
+    /// is a word byte, in which case, it should behave as if the byte
+    /// satisfies `\b` in addition to multi-line anchors.
+    CustomLineTerminator = 5,
 }
 
 impl Start {
@@ -39,71 +217,90 @@ impl Start {
             0 => Some(Start::NonWordByte),
             1 => Some(Start::WordByte),
             2 => Some(Start::Text),
-            3 => Some(Start::Line),
+            3 => Some(Start::LineLF),
+            4 => Some(Start::LineCR),
+            5 => Some(Start::CustomLineTerminator),
             _ => None,
         }
     }
 
     /// Returns the total number of starting state configurations.
-    pub(crate) fn count() -> usize {
-        4
-    }
-
-    /// Returns the starting state configuration for the given search
-    /// parameters. If the given offset range is not valid, then this panics.
-    #[inline(always)]
-    pub(crate) fn from_position_fwd(
-        bytes: &[u8],
-        start: usize,
-        end: usize,
-    ) -> Start {
-        assert!(
-            bytes.get(start..end).is_some(),
-            "{}..{} is invalid",
-            start,
-            end
-        );
-        if start == 0 {
-            Start::Text
-        } else if bytes[start - 1] == b'\n' {
-            Start::Line
-        } else if crate::util::is_word_byte(bytes[start - 1]) {
-            Start::WordByte
-        } else {
-            Start::NonWordByte
-        }
+    pub(crate) fn len() -> usize {
+        6
     }
 
-    /// Returns the starting state configuration for a reverse search with the
-    /// given search parameters. If the given offset range is not valid, then
-    /// this panics.
-    #[inline(always)]
-    pub(crate) fn from_position_rev(
-        bytes: &[u8],
-        start: usize,
-        end: usize,
-    ) -> Start {
-        assert!(
-            bytes.get(start..end).is_some(),
-            "{}..{} is invalid",
-            start,
-            end
-        );
-        if end == bytes.len() {
-            Start::Text
-        } else if bytes[end] == b'\n' {
-            Start::Line
-        } else if crate::util::is_word_byte(bytes[end]) {
-            Start::WordByte
-        } else {
-            Start::NonWordByte
-        }
+    /// Return this starting configuration as `u8` integer. It is guaranteed to
+    /// be less than `Start::len()`.
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    pub(crate) fn as_u8(&self) -> u8 {
+        // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
+        // actual int.
+        *self as u8
     }
 
-    /// Return this starting configuration as an integer. It is guaranteed to
-    /// be less than `Start::count()`.
-    #[inline(always)]
+    /// Return this starting configuration as a `usize` integer. It is
+    /// guaranteed to be less than `Start::len()`.
+    #[cfg_attr(feature = "perf-inline", inline(always))]
     pub(crate) fn as_usize(&self) -> usize {
-        *self as usize
+        usize::from(self.as_u8())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn start_fwd_done_range() {
+        let smap = StartByteMap::new(&LookMatcher::default());
+        assert_eq!(Start::Text, smap.fwd(&Input::new("").range(1..0)));
+    }
+
+    #[test]
+    fn start_rev_done_range() {
+        let smap = StartByteMap::new(&LookMatcher::default());
+        assert_eq!(Start::Text, smap.rev(&Input::new("").range(1..0)));
+    }
+
+    #[test]
+    fn start_fwd() {
+        let f = |haystack, start, end| {
+            let smap = StartByteMap::new(&LookMatcher::default());
+            let input = &Input::new(haystack).range(start..end);
+            smap.fwd(input)
+        };
+
+        assert_eq!(Start::Text, f("", 0, 0));
+        assert_eq!(Start::Text, f("abc", 0, 3));
+        assert_eq!(Start::Text, f("\nabc", 0, 3));
+
+        assert_eq!(Start::LineLF, f("\nabc", 1, 3));
+
+        assert_eq!(Start::LineCR, f("\rabc", 1, 3));
+
+        assert_eq!(Start::WordByte, f("abc", 1, 3));
+
+        assert_eq!(Start::NonWordByte, f(" abc", 1, 3));
+    }
+
+    #[test]
+    fn start_rev() {
+        let f = |haystack, start, end| {
+            let smap = StartByteMap::new(&LookMatcher::default());
+            let input = &Input::new(haystack).range(start..end);
+            smap.rev(input)
+        };
+
+        assert_eq!(Start::Text, f("", 0, 0));
+        assert_eq!(Start::Text, f("abc", 0, 3));
+        assert_eq!(Start::Text, f("abc\n", 0, 4));
+
+        assert_eq!(Start::LineLF, f("abc\nz", 0, 3));
+
+        assert_eq!(Start::LineCR, f("abc\rz", 0, 3));
+
+        assert_eq!(Start::WordByte, f("abc", 0, 2));
+
+        assert_eq!(Start::NonWordByte, f("abc ", 0, 3));
     }
 }