summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata/src/hybrid
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/regex-automata/src/hybrid')
-rw-r--r--vendor/regex-automata/src/hybrid/dfa.rs205
-rw-r--r--vendor/regex-automata/src/hybrid/error.rs115
-rw-r--r--vendor/regex-automata/src/hybrid/mod.rs2
-rw-r--r--vendor/regex-automata/src/hybrid/regex.rs2
-rw-r--r--vendor/regex-automata/src/hybrid/search.rs10
5 files changed, 237 insertions, 97 deletions
diff --git a/vendor/regex-automata/src/hybrid/dfa.rs b/vendor/regex-automata/src/hybrid/dfa.rs
index 67261c1a3..bd9179b19 100644
--- a/vendor/regex-automata/src/hybrid/dfa.rs
+++ b/vendor/regex-automata/src/hybrid/dfa.rs
@@ -13,7 +13,7 @@ use alloc::vec::Vec;
use crate::{
hybrid::{
- error::{BuildError, CacheError},
+ error::{BuildError, CacheError, StartError},
id::{LazyStateID, LazyStateIDError},
search,
},
@@ -28,7 +28,7 @@ use crate::{
Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet,
},
sparse_set::SparseSets,
- start::{Start, StartByteMap},
+ start::{self, Start, StartByteMap},
},
};
@@ -1518,8 +1518,8 @@ impl DFA {
Lazy::new(self, cache).cache_next_state(current, unit)
}
- /// Return the ID of the start state for this lazy DFA when executing a
- /// forward search.
+ /// Return the ID of the start state for this lazy DFA for the given
+ /// starting configuration.
///
/// Unlike typical DFA implementations, the start state for DFAs in this
/// crate is dependent on a few different factors:
@@ -1527,85 +1527,122 @@ impl DFA {
/// * The [`Anchored`] mode of the search. Unanchored, anchored and
/// anchored searches for a specific [`PatternID`] all use different start
/// states.
- /// * The position at which the search begins, via [`Input::start`]. This
- /// and the byte immediately preceding the start of the search (if one
- /// exists) influence which look-behind assertions are true at the start
- /// of the search. This in turn influences which start state is selected.
- /// * Whether the search is a forward or reverse search. This routine can
- /// only be used for forward searches.
+ /// * Whether a "look-behind" byte exists. For example, the `^` anchor
+ /// matches if and only if there is no look-behind byte.
+ /// * The specific value of that look-behind byte. For example, a `(?m:^)`
+ /// assertion only matches when there is either no look-behind byte, or
+ /// when the look-behind byte is a line terminator.
+ ///
+ /// The [starting configuration](start::Config) provides the above
+ /// information.
+ ///
+ /// This routine can be used for either forward or reverse searches.
+ /// Although, as a convenience, if you have an [`Input`], then it
+ /// may be more succinct to use [`DFA::start_state_forward`] or
+ /// [`DFA::start_state_reverse`]. Note, for example, that the convenience
+ /// routines return a [`MatchError`] on failure where as this routine
+ /// returns a [`StartError`].
///
/// # Errors
///
- /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search
- /// needs to give up when determining the start state (for example, if
- /// it sees a "quit" byte or if the cache has been cleared too many
- /// times). This can also return an error if the given `Input` contains an
- /// unsupported [`Anchored`] configuration.
+ /// This may return a [`StartError`] if the search needs to give up when
+ /// determining the start state (for example, if it sees a "quit" byte
+ /// or if the cache has become inefficient). This can also return an
+ /// error if the given configuration contains an unsupported [`Anchored`]
+ /// configuration.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn start_state(
+ &self,
+ cache: &mut Cache,
+ config: &start::Config,
+ ) -> Result<LazyStateID, StartError> {
+ let lazy = LazyRef::new(self, cache);
+ let anchored = config.get_anchored();
+ let start = match config.get_look_behind() {
+ None => Start::Text,
+ Some(byte) => {
+ if !self.quitset.is_empty() && self.quitset.contains(byte) {
+ return Err(StartError::quit(byte));
+ }
+ self.start_map.get(byte)
+ }
+ };
+ let start_id = lazy.get_cached_start_id(anchored, start)?;
+ if !start_id.is_unknown() {
+ return Ok(start_id);
+ }
+ Lazy::new(self, cache).cache_start_group(anchored, start)
+ }
+
+ /// Return the ID of the start state for this lazy DFA when executing a
+ /// forward search.
+ ///
+ /// This is a convenience routine for calling [`DFA::start_state`] that
+ /// converts the given [`Input`] to a [start configuration](start::Config).
+ /// Additionally, if an error occurs, it is converted from a [`StartError`]
+ /// to a [`MatchError`] using the offset information in the given
+ /// [`Input`].
+ ///
+ /// # Errors
+ ///
+ /// This may return a [`MatchError`] if the search needs to give up when
+ /// determining the start state (for example, if it sees a "quit" byte or
+ /// if the cache has become inefficient). This can also return an error if
+ /// the given `Input` contains an unsupported [`Anchored`] configuration.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn start_state_forward(
&self,
cache: &mut Cache,
input: &Input<'_>,
) -> Result<LazyStateID, MatchError> {
- if !self.quitset.is_empty() && input.start() > 0 {
- let offset = input.start() - 1;
- let byte = input.haystack()[offset];
- if self.quitset.contains(byte) {
- return Err(MatchError::quit(byte, offset));
+ let config = start::Config::from_input_forward(input);
+ self.start_state(cache, &config).map_err(|err| match err {
+ StartError::Cache { .. } => MatchError::gave_up(input.start()),
+ StartError::Quit { byte } => {
+ let offset = input
+ .start()
+ .checked_sub(1)
+ .expect("no quit in start without look-behind");
+ MatchError::quit(byte, offset)
}
- }
- let start_type = self.start_map.fwd(input);
- let start = LazyRef::new(self, cache)
- .get_cached_start_id(input, start_type)?;
- if !start.is_unknown() {
- return Ok(start);
- }
- Lazy::new(self, cache).cache_start_group(input, start_type)
+ StartError::UnsupportedAnchored { mode } => {
+ MatchError::unsupported_anchored(mode)
+ }
+ })
}
/// Return the ID of the start state for this lazy DFA when executing a
/// reverse search.
///
- /// Unlike typical DFA implementations, the start state for DFAs in this
- /// crate is dependent on a few different factors:
- ///
- /// * The [`Anchored`] mode of the search. Unanchored, anchored and
- /// anchored searches for a specific [`PatternID`] all use different start
- /// states.
- /// * The position at which the search begins, via [`Input::start`]. This
- /// and the byte immediately preceding the start of the search (if one
- /// exists) influence which look-behind assertions are true at the start
- /// of the search. This in turn influences which start state is selected.
- /// * Whether the search is a forward or reverse search. This routine can
- /// only be used for reverse searches.
+ /// This is a convenience routine for calling [`DFA::start_state`] that
+ /// converts the given [`Input`] to a [start configuration](start::Config).
+ /// Additionally, if an error occurs, it is converted from a [`StartError`]
+ /// to a [`MatchError`] using the offset information in the given
+ /// [`Input`].
///
/// # Errors
///
- /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search
- /// needs to give up when determining the start state (for example, if
- /// it sees a "quit" byte or if the cache has been cleared too many
- /// times). This can also return an error if the given `Input` contains an
- /// unsupported [`Anchored`] configuration.
+ /// This may return a [`MatchError`] if the search needs to give up when
+ /// determining the start state (for example, if it sees a "quit" byte or
+ /// if the cache has become inefficient). This can also return an error if
+ /// the given `Input` contains an unsupported [`Anchored`] configuration.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn start_state_reverse(
&self,
cache: &mut Cache,
input: &Input<'_>,
) -> Result<LazyStateID, MatchError> {
- if !self.quitset.is_empty() && input.end() < input.haystack().len() {
- let offset = input.end();
- let byte = input.haystack()[offset];
- if self.quitset.contains(byte) {
- return Err(MatchError::quit(byte, offset));
+ let config = start::Config::from_input_reverse(input);
+ self.start_state(cache, &config).map_err(|err| match err {
+ StartError::Cache { .. } => MatchError::gave_up(input.end()),
+ StartError::Quit { byte } => {
+ let offset = input.end();
+ MatchError::quit(byte, offset)
}
- }
- let start_type = self.start_map.rev(input);
- let start = LazyRef::new(self, cache)
- .get_cached_start_id(input, start_type)?;
- if !start.is_unknown() {
- return Ok(start);
- }
- Lazy::new(self, cache).cache_start_group(input, start_type)
+ StartError::UnsupportedAnchored { mode } => {
+ MatchError::unsupported_anchored(mode)
+ }
+ })
}
/// Returns the total number of patterns that match in this state.
@@ -2066,8 +2103,10 @@ impl<'i, 'c> Lazy<'i, 'c> {
/// Here's an example that justifies 'inline(never)'
///
/// ```ignore
- /// regex-cli find hybrid dfa \
- /// @all-codepoints-utf8-100x '\pL{100}' --cache-capacity 10000000
+ /// regex-cli find match hybrid \
+ /// --cache-capacity 100000000 \
+ /// -p '\pL{100}'
+ /// all-codepoints-utf8-100x
/// ```
///
/// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every
@@ -2122,16 +2161,15 @@ impl<'i, 'c> Lazy<'i, 'c> {
#[inline(never)]
fn cache_start_group(
&mut self,
- input: &Input<'_>,
+ anchored: Anchored,
start: Start,
- ) -> Result<LazyStateID, MatchError> {
- let mode = input.get_anchored();
- let nfa_start_id = match mode {
+ ) -> Result<LazyStateID, StartError> {
+ let nfa_start_id = match anchored {
Anchored::No => self.dfa.get_nfa().start_unanchored(),
Anchored::Yes => self.dfa.get_nfa().start_anchored(),
Anchored::Pattern(pid) => {
if !self.dfa.get_config().get_starts_for_each_pattern() {
- return Err(MatchError::unsupported_anchored(mode));
+ return Err(StartError::unsupported_anchored(anchored));
}
match self.dfa.get_nfa().start_pattern(pid) {
None => return Ok(self.as_ref().dead_id()),
@@ -2142,8 +2180,8 @@ impl<'i, 'c> Lazy<'i, 'c> {
let id = self
.cache_start_one(nfa_start_id, start)
- .map_err(|_| MatchError::gave_up(input.start()))?;
- self.set_start_state(input, start, id);
+ .map_err(StartError::cache)?;
+ self.set_start_state(anchored, start, id);
Ok(id)
}
@@ -2574,13 +2612,13 @@ impl<'i, 'c> Lazy<'i, 'c> {
/// 'starts_for_each_pattern' is not enabled.
fn set_start_state(
&mut self,
- input: &Input<'_>,
+ anchored: Anchored,
start: Start,
id: LazyStateID,
) {
assert!(self.as_ref().is_valid(id));
let start_index = start.as_usize();
- let index = match input.get_anchored() {
+ let index = match anchored {
Anchored::No => start_index,
Anchored::Yes => Start::len() + start_index,
Anchored::Pattern(pid) => {
@@ -2642,17 +2680,16 @@ impl<'i, 'c> LazyRef<'i, 'c> {
#[cfg_attr(feature = "perf-inline", inline(always))]
fn get_cached_start_id(
&self,
- input: &Input<'_>,
+ anchored: Anchored,
start: Start,
- ) -> Result<LazyStateID, MatchError> {
+ ) -> Result<LazyStateID, StartError> {
let start_index = start.as_usize();
- let mode = input.get_anchored();
- let index = match mode {
+ let index = match anchored {
Anchored::No => start_index,
Anchored::Yes => Start::len() + start_index,
Anchored::Pattern(pid) => {
if !self.dfa.get_config().get_starts_for_each_pattern() {
- return Err(MatchError::unsupported_anchored(mode));
+ return Err(StartError::unsupported_anchored(anchored));
}
if pid.as_usize() >= self.dfa.pattern_len() {
return Ok(self.dead_id());
@@ -3178,12 +3215,12 @@ impl Config {
/// be quit bytes _only_ when a Unicode word boundary is present in the
/// pattern.
///
- /// When enabling this option, callers _must_ be prepared to handle
- /// a [`MatchError`](crate::MatchError) error during search.
- /// When using a [`Regex`](crate::hybrid::regex::Regex), this
- /// corresponds to using the `try_` suite of methods. Alternatively,
- /// if callers can guarantee that their input is ASCII only, then a
- /// [`MatchError::quit`] error will never be returned while searching.
+ /// When enabling this option, callers _must_ be prepared to
+ /// handle a [`MatchError`] error during search. When using a
+ /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
+ /// `try_` suite of methods. Alternatively, if callers can guarantee that
+ /// their input is ASCII only, then a [`MatchError::quit`] error will never
+ /// be returned while searching.
///
/// This is disabled by default.
///
@@ -3269,8 +3306,8 @@ impl Config {
/// (The advantage being that non-ASCII quit bytes will only be added if a
/// Unicode word boundary is in the pattern.)
///
- /// When enabling this option, callers _must_ be prepared to handle a
- /// [`MatchError`](crate::MatchError) error during search. When using a
+ /// When enabling this option, callers _must_ be prepared to
+ /// handle a [`MatchError`] error during search. When using a
/// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
/// `try_` suite of methods.
///
@@ -3795,8 +3832,8 @@ impl Config {
//
// Test case:
//
- // regex-cli find hybrid regex -w @conn.json.1000x.log \
- // '^#' '\b10\.55\.182\.100\b'
+ // regex-cli find match hybrid --unicode-word-boundary \
+ // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log
if !quit.is_empty() {
set.add_set(&quit);
}
diff --git a/vendor/regex-automata/src/hybrid/error.rs b/vendor/regex-automata/src/hybrid/error.rs
index 604daf3c3..d134e7ec9 100644
--- a/vendor/regex-automata/src/hybrid/error.rs
+++ b/vendor/regex-automata/src/hybrid/error.rs
@@ -1,4 +1,4 @@
-use crate::{hybrid::id::LazyStateIDError, nfa};
+use crate::{hybrid::id::LazyStateIDError, nfa, util::search::Anchored};
/// An error that occurs when initial construction of a lazy DFA fails.
///
@@ -95,6 +95,113 @@ impl core::fmt::Display for BuildError {
}
}
+/// An error that can occur when computing the start state for a search.
+///
+/// Computing a start state can fail for a few reasons, either
+/// based on incorrect configuration or even based on whether
+/// the look-behind byte triggers a quit state. Typically
+/// one does not need to handle this error if you're using
+/// [`DFA::start_state_forward`](crate::hybrid::dfa::DFA::start_state_forward)
+/// (or its reverse counterpart), as that routine automatically converts
+/// `StartError` to a [`MatchError`](crate::MatchError) for you.
+///
+/// This error may be returned by the
+/// [`DFA::start_state`](crate::hybrid::dfa::DFA::start_state) routine.
+///
+/// This error implements the `std::error::Error` trait when the `std` feature
+/// is enabled.
+///
+/// This error is marked as non-exhaustive. New variants may be added in a
+/// semver compatible release.
+#[non_exhaustive]
+#[derive(Clone, Debug)]
+pub enum StartError {
+ /// An error that occurs when cache inefficiency has dropped below the
+ /// configured heuristic thresholds.
+ Cache {
+ /// The underlying cache error that occurred.
+ err: CacheError,
+ },
+ /// An error that occurs when a starting configuration's look-behind byte
+ /// is in this DFA's quit set.
+ Quit {
+ /// The quit byte that was found.
+ byte: u8,
+ },
+ /// An error that occurs when the caller requests an anchored mode that
+ /// isn't supported by the DFA.
+ UnsupportedAnchored {
+ /// The anchored mode given that is unsupported.
+ mode: Anchored,
+ },
+}
+
+impl StartError {
+ pub(crate) fn cache(err: CacheError) -> StartError {
+ StartError::Cache { err }
+ }
+
+ pub(crate) fn quit(byte: u8) -> StartError {
+ StartError::Quit { byte }
+ }
+
+ pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError {
+ StartError::UnsupportedAnchored { mode }
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for StartError {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ match *self {
+ StartError::Cache { ref err } => Some(err),
+ _ => None,
+ }
+ }
+}
+
+impl core::fmt::Display for StartError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ match *self {
+ StartError::Cache { .. } => write!(
+ f,
+ "error computing start state because of cache inefficiency"
+ ),
+ StartError::Quit { byte } => write!(
+ f,
+ "error computing start state because the look-behind byte \
+ {:?} triggered a quit state",
+ crate::util::escape::DebugByte(byte),
+ ),
+ StartError::UnsupportedAnchored { mode: Anchored::Yes } => {
+ write!(
+ f,
+ "error computing start state because \
+ anchored searches are not supported or enabled"
+ )
+ }
+ StartError::UnsupportedAnchored { mode: Anchored::No } => {
+ write!(
+ f,
+ "error computing start state because \
+ unanchored searches are not supported or enabled"
+ )
+ }
+ StartError::UnsupportedAnchored {
+ mode: Anchored::Pattern(pid),
+ } => {
+ write!(
+ f,
+ "error computing start state because \
+ anchored searches for a specific pattern ({}) \
+ are not supported or enabled",
+ pid.as_usize(),
+ )
+ }
+ }
+ }
+}
+
/// An error that occurs when cache usage has become inefficient.
///
/// One of the weaknesses of a lazy DFA is that it may need to clear its
@@ -126,11 +233,7 @@ impl CacheError {
}
#[cfg(feature = "std")]
-impl std::error::Error for CacheError {
- fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
- None
- }
-}
+impl std::error::Error for CacheError {}
impl core::fmt::Display for CacheError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
diff --git a/vendor/regex-automata/src/hybrid/mod.rs b/vendor/regex-automata/src/hybrid/mod.rs
index 44e67e129..2feb839d1 100644
--- a/vendor/regex-automata/src/hybrid/mod.rs
+++ b/vendor/regex-automata/src/hybrid/mod.rs
@@ -133,7 +133,7 @@ compiled DFAs.
*/
pub use self::{
- error::{BuildError, CacheError},
+ error::{BuildError, CacheError, StartError},
id::LazyStateID,
};
diff --git a/vendor/regex-automata/src/hybrid/regex.rs b/vendor/regex-automata/src/hybrid/regex.rs
index 75667daf9..b3b1fe317 100644
--- a/vendor/regex-automata/src/hybrid/regex.rs
+++ b/vendor/regex-automata/src/hybrid/regex.rs
@@ -878,7 +878,7 @@ impl Builder {
}
/// Set the lazy DFA compilation configuration for this builder using
- /// [`dfa::Config`](dfa::Config).
+ /// [`dfa::Config`].
///
/// This permits setting things like whether Unicode word boundaries should
/// be heuristically supported or settings how the behavior of the cache.
diff --git a/vendor/regex-automata/src/hybrid/search.rs b/vendor/regex-automata/src/hybrid/search.rs
index f23283685..1f4a505db 100644
--- a/vendor/regex-automata/src/hybrid/search.rs
+++ b/vendor/regex-automata/src/hybrid/search.rs
@@ -105,14 +105,14 @@ fn find_fwd_imp(
// PERF: For justification of omitting bounds checks, it gives us a
// ~10% bump in search time. This was used for a benchmark:
//
- // regex-cli find hybrid dfa @bigfile '(?m)^.+$' -UBb
+ // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile
//
// PERF: For justification for the loop unrolling, we use a few
// different tests:
//
- // regex-cli find hybrid dfa @$bigfile '\w{50}' -UBb
- // regex-cli find hybrid dfa @$bigfile '(?m)^.+$' -UBb
- // regex-cli find hybrid dfa @$bigfile 'ZQZQZQZQ' -UBb
+ // regex-cli find half hybrid -p '\w{50}' -UBb bigfile
+ // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile
+ // regex-cli find half hybrid -p 'ZQZQZQZQ' -UBb bigfile
//
// And there are three different configurations:
//
@@ -353,7 +353,7 @@ fn find_rev_imp(
// anchored and on shorter haystacks. However, this still makes a
// difference. Take this command for example:
//
- // regex-cli find hybrid regex @$bigfile '(?m)^.+$' -UBb
+ // regex-cli find match hybrid -p '(?m)^.+$' -UBb bigfile
//
// (Notice that we use 'find hybrid regex', not 'find hybrid dfa'
// like in the justification for the forward direction. The 'regex'