diff options
Diffstat (limited to 'vendor/regex-automata/src/hybrid')
-rw-r--r-- | vendor/regex-automata/src/hybrid/dfa.rs | 205 | ||||
-rw-r--r-- | vendor/regex-automata/src/hybrid/error.rs | 115 | ||||
-rw-r--r-- | vendor/regex-automata/src/hybrid/mod.rs | 2 | ||||
-rw-r--r-- | vendor/regex-automata/src/hybrid/regex.rs | 2 | ||||
-rw-r--r-- | vendor/regex-automata/src/hybrid/search.rs | 10 |
5 files changed, 237 insertions, 97 deletions
diff --git a/vendor/regex-automata/src/hybrid/dfa.rs b/vendor/regex-automata/src/hybrid/dfa.rs index 67261c1a3..bd9179b19 100644 --- a/vendor/regex-automata/src/hybrid/dfa.rs +++ b/vendor/regex-automata/src/hybrid/dfa.rs @@ -13,7 +13,7 @@ use alloc::vec::Vec; use crate::{ hybrid::{ - error::{BuildError, CacheError}, + error::{BuildError, CacheError, StartError}, id::{LazyStateID, LazyStateIDError}, search, }, @@ -28,7 +28,7 @@ use crate::{ Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet, }, sparse_set::SparseSets, - start::{Start, StartByteMap}, + start::{self, Start, StartByteMap}, }, }; @@ -1518,8 +1518,8 @@ impl DFA { Lazy::new(self, cache).cache_next_state(current, unit) } - /// Return the ID of the start state for this lazy DFA when executing a - /// forward search. + /// Return the ID of the start state for this lazy DFA for the given + /// starting configuration. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: @@ -1527,85 +1527,122 @@ impl DFA { /// * The [`Anchored`] mode of the search. Unanchored, anchored and /// anchored searches for a specific [`PatternID`] all use different start /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for forward searches. + /// * Whether a "look-behind" byte exists. For example, the `^` anchor + /// matches if and only if there is no look-behind byte. + /// * The specific value of that look-behind byte. For example, a `(?m:^)` + /// assertion only matches when there is either no look-behind byte, or + /// when the look-behind byte is a line terminator. + /// + /// The [starting configuration](start::Config) provides the above + /// information. + /// + /// This routine can be used for either forward or reverse searches. + /// Although, as a convenience, if you have an [`Input`], then it + /// may be more succinct to use [`DFA::start_state_forward`] or + /// [`DFA::start_state_reverse`]. Note, for example, that the convenience + /// routines return a [`MatchError`] on failure where as this routine + /// returns a [`StartError`]. /// /// # Errors /// - /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search - /// needs to give up when determining the start state (for example, if - /// it sees a "quit" byte or if the cache has been cleared too many - /// times). This can also return an error if the given `Input` contains an - /// unsupported [`Anchored`] configuration. + /// This may return a [`StartError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte + /// or if the cache has become inefficient). This can also return an + /// error if the given configuration contains an unsupported [`Anchored`] + /// configuration. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn start_state( + &self, + cache: &mut Cache, + config: &start::Config, + ) -> Result<LazyStateID, StartError> { + let lazy = LazyRef::new(self, cache); + let anchored = config.get_anchored(); + let start = match config.get_look_behind() { + None => Start::Text, + Some(byte) => { + if !self.quitset.is_empty() && self.quitset.contains(byte) { + return Err(StartError::quit(byte)); + } + self.start_map.get(byte) + } + }; + let start_id = lazy.get_cached_start_id(anchored, start)?; + if !start_id.is_unknown() { + return Ok(start_id); + } + Lazy::new(self, cache).cache_start_group(anchored, start) + } + + /// Return the ID of the start state for this lazy DFA when executing a + /// forward search. + /// + /// This is a convenience routine for calling [`DFA::start_state`] that + /// converts the given [`Input`] to a [start configuration](start::Config). + /// Additionally, if an error occurs, it is converted from a [`StartError`] + /// to a [`MatchError`] using the offset information in the given + /// [`Input`]. + /// + /// # Errors + /// + /// This may return a [`MatchError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte or + /// if the cache has become inefficient). This can also return an error if + /// the given `Input` contains an unsupported [`Anchored`] configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_forward( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result<LazyStateID, MatchError> { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + let config = start::Config::from_input_forward(input); + self.start_state(cache, &config).map_err(|err| match err { + StartError::Cache { .. } => MatchError::gave_up(input.start()), + StartError::Quit { byte } => { + let offset = input + .start() + .checked_sub(1) + .expect("no quit in start without look-behind"); + MatchError::quit(byte, offset) } - } - let start_type = self.start_map.fwd(input); - let start = LazyRef::new(self, cache) - .get_cached_start_id(input, start_type)?; - if !start.is_unknown() { - return Ok(start); - } - Lazy::new(self, cache).cache_start_group(input, start_type) + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) } /// Return the ID of the start state for this lazy DFA when executing a /// reverse search. /// - /// Unlike typical DFA implementations, the start state for DFAs in this - /// crate is dependent on a few different factors: - /// - /// * The [`Anchored`] mode of the search. Unanchored, anchored and - /// anchored searches for a specific [`PatternID`] all use different start - /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for reverse searches. + /// This is a convenience routine for calling [`DFA::start_state`] that + /// converts the given [`Input`] to a [start configuration](start::Config). + /// Additionally, if an error occurs, it is converted from a [`StartError`] + /// to a [`MatchError`] using the offset information in the given + /// [`Input`]. /// /// # Errors /// - /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search - /// needs to give up when determining the start state (for example, if - /// it sees a "quit" byte or if the cache has been cleared too many - /// times). This can also return an error if the given `Input` contains an - /// unsupported [`Anchored`] configuration. + /// This may return a [`MatchError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte or + /// if the cache has become inefficient). This can also return an error if + /// the given `Input` contains an unsupported [`Anchored`] configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_reverse( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result<LazyStateID, MatchError> { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + let config = start::Config::from_input_reverse(input); + self.start_state(cache, &config).map_err(|err| match err { + StartError::Cache { .. } => MatchError::gave_up(input.end()), + StartError::Quit { byte } => { + let offset = input.end(); + MatchError::quit(byte, offset) } - } - let start_type = self.start_map.rev(input); - let start = LazyRef::new(self, cache) - .get_cached_start_id(input, start_type)?; - if !start.is_unknown() { - return Ok(start); - } - Lazy::new(self, cache).cache_start_group(input, start_type) + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) } /// Returns the total number of patterns that match in this state. @@ -2066,8 +2103,10 @@ impl<'i, 'c> Lazy<'i, 'c> { /// Here's an example that justifies 'inline(never)' /// /// ```ignore - /// regex-cli find hybrid dfa \ - /// @all-codepoints-utf8-100x '\pL{100}' --cache-capacity 10000000 + /// regex-cli find match hybrid \ + /// --cache-capacity 100000000 \ + /// -p '\pL{100}' + /// all-codepoints-utf8-100x /// ``` /// /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every @@ -2122,16 +2161,15 @@ impl<'i, 'c> Lazy<'i, 'c> { #[inline(never)] fn cache_start_group( &mut self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result<LazyStateID, MatchError> { - let mode = input.get_anchored(); - let nfa_start_id = match mode { + ) -> Result<LazyStateID, StartError> { + let nfa_start_id = match anchored { Anchored::No => self.dfa.get_nfa().start_unanchored(), Anchored::Yes => self.dfa.get_nfa().start_anchored(), Anchored::Pattern(pid) => { if !self.dfa.get_config().get_starts_for_each_pattern() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } match self.dfa.get_nfa().start_pattern(pid) { None => return Ok(self.as_ref().dead_id()), @@ -2142,8 +2180,8 @@ impl<'i, 'c> Lazy<'i, 'c> { let id = self .cache_start_one(nfa_start_id, start) - .map_err(|_| MatchError::gave_up(input.start()))?; - self.set_start_state(input, start, id); + .map_err(StartError::cache)?; + self.set_start_state(anchored, start, id); Ok(id) } @@ -2574,13 +2612,13 @@ impl<'i, 'c> Lazy<'i, 'c> { /// 'starts_for_each_pattern' is not enabled. fn set_start_state( &mut self, - input: &Input<'_>, + anchored: Anchored, start: Start, id: LazyStateID, ) { assert!(self.as_ref().is_valid(id)); let start_index = start.as_usize(); - let index = match input.get_anchored() { + let index = match anchored { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, Anchored::Pattern(pid) => { @@ -2642,17 +2680,16 @@ impl<'i, 'c> LazyRef<'i, 'c> { #[cfg_attr(feature = "perf-inline", inline(always))] fn get_cached_start_id( &self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result<LazyStateID, MatchError> { + ) -> Result<LazyStateID, StartError> { let start_index = start.as_usize(); - let mode = input.get_anchored(); - let index = match mode { + let index = match anchored { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, Anchored::Pattern(pid) => { if !self.dfa.get_config().get_starts_for_each_pattern() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } if pid.as_usize() >= self.dfa.pattern_len() { return Ok(self.dead_id()); @@ -3178,12 +3215,12 @@ impl Config { /// be quit bytes _only_ when a Unicode word boundary is present in the /// pattern. /// - /// When enabling this option, callers _must_ be prepared to handle - /// a [`MatchError`](crate::MatchError) error during search. - /// When using a [`Regex`](crate::hybrid::regex::Regex), this - /// corresponds to using the `try_` suite of methods. Alternatively, - /// if callers can guarantee that their input is ASCII only, then a - /// [`MatchError::quit`] error will never be returned while searching. + /// When enabling this option, callers _must_ be prepared to + /// handle a [`MatchError`] error during search. When using a + /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the + /// `try_` suite of methods. Alternatively, if callers can guarantee that + /// their input is ASCII only, then a [`MatchError::quit`] error will never + /// be returned while searching. /// /// This is disabled by default. /// @@ -3269,8 +3306,8 @@ impl Config { /// (The advantage being that non-ASCII quit bytes will only be added if a /// Unicode word boundary is in the pattern.) /// - /// When enabling this option, callers _must_ be prepared to handle a - /// [`MatchError`](crate::MatchError) error during search. When using a + /// When enabling this option, callers _must_ be prepared to + /// handle a [`MatchError`] error during search. When using a /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the /// `try_` suite of methods. /// @@ -3795,8 +3832,8 @@ impl Config { // // Test case: // - // regex-cli find hybrid regex -w @conn.json.1000x.log \ - // '^#' '\b10\.55\.182\.100\b' + // regex-cli find match hybrid --unicode-word-boundary \ + // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log if !quit.is_empty() { set.add_set(&quit); } diff --git a/vendor/regex-automata/src/hybrid/error.rs b/vendor/regex-automata/src/hybrid/error.rs index 604daf3c3..d134e7ec9 100644 --- a/vendor/regex-automata/src/hybrid/error.rs +++ b/vendor/regex-automata/src/hybrid/error.rs @@ -1,4 +1,4 @@ -use crate::{hybrid::id::LazyStateIDError, nfa}; +use crate::{hybrid::id::LazyStateIDError, nfa, util::search::Anchored}; /// An error that occurs when initial construction of a lazy DFA fails. /// @@ -95,6 +95,113 @@ impl core::fmt::Display for BuildError { } } +/// An error that can occur when computing the start state for a search. +/// +/// Computing a start state can fail for a few reasons, either +/// based on incorrect configuration or even based on whether +/// the look-behind byte triggers a quit state. Typically +/// one does not need to handle this error if you're using +/// [`DFA::start_state_forward`](crate::hybrid::dfa::DFA::start_state_forward) +/// (or its reverse counterpart), as that routine automatically converts +/// `StartError` to a [`MatchError`](crate::MatchError) for you. +/// +/// This error may be returned by the +/// [`DFA::start_state`](crate::hybrid::dfa::DFA::start_state) routine. +/// +/// This error implements the `std::error::Error` trait when the `std` feature +/// is enabled. +/// +/// This error is marked as non-exhaustive. New variants may be added in a +/// semver compatible release. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum StartError { + /// An error that occurs when cache inefficiency has dropped below the + /// configured heuristic thresholds. + Cache { + /// The underlying cache error that occurred. + err: CacheError, + }, + /// An error that occurs when a starting configuration's look-behind byte + /// is in this DFA's quit set. + Quit { + /// The quit byte that was found. + byte: u8, + }, + /// An error that occurs when the caller requests an anchored mode that + /// isn't supported by the DFA. + UnsupportedAnchored { + /// The anchored mode given that is unsupported. + mode: Anchored, + }, +} + +impl StartError { + pub(crate) fn cache(err: CacheError) -> StartError { + StartError::Cache { err } + } + + pub(crate) fn quit(byte: u8) -> StartError { + StartError::Quit { byte } + } + + pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError { + StartError::UnsupportedAnchored { mode } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for StartError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match *self { + StartError::Cache { ref err } => Some(err), + _ => None, + } + } +} + +impl core::fmt::Display for StartError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + StartError::Cache { .. } => write!( + f, + "error computing start state because of cache inefficiency" + ), + StartError::Quit { byte } => write!( + f, + "error computing start state because the look-behind byte \ + {:?} triggered a quit state", + crate::util::escape::DebugByte(byte), + ), + StartError::UnsupportedAnchored { mode: Anchored::Yes } => { + write!( + f, + "error computing start state because \ + anchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { mode: Anchored::No } => { + write!( + f, + "error computing start state because \ + unanchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { + mode: Anchored::Pattern(pid), + } => { + write!( + f, + "error computing start state because \ + anchored searches for a specific pattern ({}) \ + are not supported or enabled", + pid.as_usize(), + ) + } + } + } +} + /// An error that occurs when cache usage has become inefficient. /// /// One of the weaknesses of a lazy DFA is that it may need to clear its @@ -126,11 +233,7 @@ impl CacheError { } #[cfg(feature = "std")] -impl std::error::Error for CacheError { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - None - } -} +impl std::error::Error for CacheError {} impl core::fmt::Display for CacheError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { diff --git a/vendor/regex-automata/src/hybrid/mod.rs b/vendor/regex-automata/src/hybrid/mod.rs index 44e67e129..2feb839d1 100644 --- a/vendor/regex-automata/src/hybrid/mod.rs +++ b/vendor/regex-automata/src/hybrid/mod.rs @@ -133,7 +133,7 @@ compiled DFAs. */ pub use self::{ - error::{BuildError, CacheError}, + error::{BuildError, CacheError, StartError}, id::LazyStateID, }; diff --git a/vendor/regex-automata/src/hybrid/regex.rs b/vendor/regex-automata/src/hybrid/regex.rs index 75667daf9..b3b1fe317 100644 --- a/vendor/regex-automata/src/hybrid/regex.rs +++ b/vendor/regex-automata/src/hybrid/regex.rs @@ -878,7 +878,7 @@ impl Builder { } /// Set the lazy DFA compilation configuration for this builder using - /// [`dfa::Config`](dfa::Config). + /// [`dfa::Config`]. /// /// This permits setting things like whether Unicode word boundaries should /// be heuristically supported or settings how the behavior of the cache. diff --git a/vendor/regex-automata/src/hybrid/search.rs b/vendor/regex-automata/src/hybrid/search.rs index f23283685..1f4a505db 100644 --- a/vendor/regex-automata/src/hybrid/search.rs +++ b/vendor/regex-automata/src/hybrid/search.rs @@ -105,14 +105,14 @@ fn find_fwd_imp( // PERF: For justification of omitting bounds checks, it gives us a // ~10% bump in search time. This was used for a benchmark: // - // regex-cli find hybrid dfa @bigfile '(?m)^.+$' -UBb + // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile // // PERF: For justification for the loop unrolling, we use a few // different tests: // - // regex-cli find hybrid dfa @$bigfile '\w{50}' -UBb - // regex-cli find hybrid dfa @$bigfile '(?m)^.+$' -UBb - // regex-cli find hybrid dfa @$bigfile 'ZQZQZQZQ' -UBb + // regex-cli find half hybrid -p '\w{50}' -UBb bigfile + // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile + // regex-cli find half hybrid -p 'ZQZQZQZQ' -UBb bigfile // // And there are three different configurations: // @@ -353,7 +353,7 @@ fn find_rev_imp( // anchored and on shorter haystacks. However, this still makes a // difference. Take this command for example: // - // regex-cli find hybrid regex @$bigfile '(?m)^.+$' -UBb + // regex-cli find match hybrid -p '(?m)^.+$' -UBb bigfile // // (Notice that we use 'find hybrid regex', not 'find hybrid dfa' // like in the justification for the forward direction. The 'regex' |