summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata/src
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-06-19 09:26:03 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-06-19 09:26:03 +0000
commit9918693037dce8aa4bb6f08741b6812923486c18 (patch)
tree21d2b40bec7e6a7ea664acee056eb3d08e15a1cf /vendor/regex-automata/src
parentReleasing progress-linux version 1.75.0+dfsg1-5~progress7.99u1. (diff)
downloadrustc-9918693037dce8aa4bb6f08741b6812923486c18.tar.xz
rustc-9918693037dce8aa4bb6f08741b6812923486c18.zip
Merging upstream version 1.76.0+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata/src')
-rw-r--r--vendor/regex-automata/src/dfa/accel.rs13
-rw-r--r--vendor/regex-automata/src/dfa/automaton.rs190
-rw-r--r--vendor/regex-automata/src/dfa/dense.rs138
-rw-r--r--vendor/regex-automata/src/dfa/mod.rs2
-rw-r--r--vendor/regex-automata/src/dfa/onepass.rs18
-rw-r--r--vendor/regex-automata/src/dfa/regex.rs2
-rw-r--r--vendor/regex-automata/src/dfa/search.rs10
-rw-r--r--vendor/regex-automata/src/dfa/sparse.rs293
-rw-r--r--vendor/regex-automata/src/hybrid/dfa.rs205
-rw-r--r--vendor/regex-automata/src/hybrid/error.rs115
-rw-r--r--vendor/regex-automata/src/hybrid/mod.rs2
-rw-r--r--vendor/regex-automata/src/hybrid/regex.rs2
-rw-r--r--vendor/regex-automata/src/hybrid/search.rs10
-rw-r--r--vendor/regex-automata/src/meta/limited.rs12
-rw-r--r--vendor/regex-automata/src/meta/regex.rs6
-rw-r--r--vendor/regex-automata/src/meta/stopat.rs12
-rw-r--r--vendor/regex-automata/src/meta/strategy.rs8
-rw-r--r--vendor/regex-automata/src/meta/wrappers.rs5
-rw-r--r--vendor/regex-automata/src/nfa/thompson/backtrack.rs28
-rw-r--r--vendor/regex-automata/src/nfa/thompson/builder.rs6
-rw-r--r--vendor/regex-automata/src/nfa/thompson/compiler.rs10
-rw-r--r--vendor/regex-automata/src/nfa/thompson/map.rs2
-rw-r--r--vendor/regex-automata/src/nfa/thompson/nfa.rs6
-rw-r--r--vendor/regex-automata/src/nfa/thompson/range_trie.rs2
-rw-r--r--vendor/regex-automata/src/util/captures.rs3
-rw-r--r--vendor/regex-automata/src/util/determinize/mod.rs124
-rw-r--r--vendor/regex-automata/src/util/determinize/state.rs39
-rw-r--r--vendor/regex-automata/src/util/lazy.rs6
-rw-r--r--vendor/regex-automata/src/util/look.rs861
-rw-r--r--vendor/regex-automata/src/util/mod.rs2
-rw-r--r--vendor/regex-automata/src/util/pool.rs67
-rw-r--r--vendor/regex-automata/src/util/start.rs243
32 files changed, 1911 insertions, 531 deletions
diff --git a/vendor/regex-automata/src/dfa/accel.rs b/vendor/regex-automata/src/dfa/accel.rs
index 5ea2423dd..c0ba18ea8 100644
--- a/vendor/regex-automata/src/dfa/accel.rs
+++ b/vendor/regex-automata/src/dfa/accel.rs
@@ -6,15 +6,16 @@
// non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its
// DFA with regex-cli:
//
-// $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC
-// dense::DFA(
+// $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table
// D 000000:
// Q 000001:
// *000002:
-// A 000003: \x00-` => 3, a => 5, b-\xFF => 3
-// >000004: \x00-` => 3, a => 4, b-\xFF => 3
-// 000005: \x00-\xFF => 2, EOI => 2
-// )
+// A 000003: \x00-` => 3, a => 8, b-\xFF => 3
+// A 000004: \x00-` => 4, a => 7, b-\xFF => 4
+// 000005: \x00-` => 4, b-\xFF => 4
+// 000006: \x00-` => 3, a => 6, b-\xFF => 3
+// 000007: \x00-\xFF => 2, EOI => 2
+// 000008: \x00-\xFF => 2, EOI => 2
//
// In particular, state 3 is accelerated (shown via the 'A' indicator) since
// the only way to leave that state once entered is to see an 'a' byte. If
diff --git a/vendor/regex-automata/src/dfa/automaton.rs b/vendor/regex-automata/src/dfa/automaton.rs
index 7e2be9a15..fcfcf2997 100644
--- a/vendor/regex-automata/src/dfa/automaton.rs
+++ b/vendor/regex-automata/src/dfa/automaton.rs
@@ -7,6 +7,7 @@ use crate::{
prefilter::Prefilter,
primitives::{PatternID, StateID},
search::{Anchored, HalfMatch, Input, MatchError},
+ start,
},
};
@@ -226,8 +227,8 @@ pub unsafe trait Automaton {
/// ```
fn next_eoi_state(&self, current: StateID) -> StateID;
- /// Return the ID of the start state for this lazy DFA when executing a
- /// forward search.
+ /// Return the ID of the start state for this DFA for the given starting
+ /// configuration.
///
/// Unlike typical DFA implementations, the start state for DFAs in this
/// crate is dependent on a few different factors:
@@ -235,12 +236,41 @@ pub unsafe trait Automaton {
/// * The [`Anchored`] mode of the search. Unanchored, anchored and
/// anchored searches for a specific [`PatternID`] all use different start
/// states.
- /// * The position at which the search begins, via [`Input::start`]. This
- /// and the byte immediately preceding the start of the search (if one
- /// exists) influence which look-behind assertions are true at the start
- /// of the search. This in turn influences which start state is selected.
- /// * Whether the search is a forward or reverse search. This routine can
- /// only be used for forward searches.
+ /// * Whether a "look-behind" byte exists. For example, the `^` anchor
+ /// matches if and only if there is no look-behind byte.
+ /// * The specific value of that look-behind byte. For example, a `(?m:^)`
+ /// assertion only matches when there is either no look-behind byte, or
+ /// when the look-behind byte is a line terminator.
+ ///
+ /// The [starting configuration](start::Config) provides the above
+ /// information.
+ ///
+ /// This routine can be used for either forward or reverse searches.
+ /// Although, as a convenience, if you have an [`Input`], then it may
+ /// be more succinct to use [`Automaton::start_state_forward`] or
+ /// [`Automaton::start_state_reverse`]. Note, for example, that the
+ /// convenience routines return a [`MatchError`] on failure where as this
+ /// routine returns a [`StartError`].
+ ///
+ /// # Errors
+ ///
+ /// This may return a [`StartError`] if the search needs to give up when
+ /// determining the start state (for example, if it sees a "quit" byte).
+ /// This can also return an error if the given configuration contains an
+ /// unsupported [`Anchored`] configuration.
+ fn start_state(
+ &self,
+ config: &start::Config,
+ ) -> Result<StateID, StartError>;
+
+ /// Return the ID of the start state for this DFA when executing a forward
+ /// search.
+ ///
+ /// This is a convenience routine for calling [`Automaton::start_state`]
+ /// that converts the given [`Input`] to a [start
+ /// configuration](start::Config). Additionally, if an error occurs, it is
+ /// converted from a [`StartError`] to a [`MatchError`] using the offset
+ /// information in the given [`Input`].
///
/// # Errors
///
@@ -251,23 +281,30 @@ pub unsafe trait Automaton {
fn start_state_forward(
&self,
input: &Input<'_>,
- ) -> Result<StateID, MatchError>;
+ ) -> Result<StateID, MatchError> {
+ let config = start::Config::from_input_forward(input);
+ self.start_state(&config).map_err(|err| match err {
+ StartError::Quit { byte } => {
+ let offset = input
+ .start()
+ .checked_sub(1)
+ .expect("no quit in start without look-behind");
+ MatchError::quit(byte, offset)
+ }
+ StartError::UnsupportedAnchored { mode } => {
+ MatchError::unsupported_anchored(mode)
+ }
+ })
+ }
- /// Return the ID of the start state for this lazy DFA when executing a
- /// reverse search.
+ /// Return the ID of the start state for this DFA when executing a reverse
+ /// search.
///
- /// Unlike typical DFA implementations, the start state for DFAs in this
- /// crate is dependent on a few different factors:
- ///
- /// * The [`Anchored`] mode of the search. Unanchored, anchored and
- /// anchored searches for a specific [`PatternID`] all use different start
- /// states.
- /// * The position at which the search begins, via [`Input::start`]. This
- /// and the byte immediately preceding the start of the search (if one
- /// exists) influence which look-behind assertions are true at the start
- /// of the search. This in turn influences which start state is selected.
- /// * Whether the search is a forward or reverse search. This routine can
- /// only be used for reverse searches.
+ /// This is a convenience routine for calling [`Automaton::start_state`]
+ /// that converts the given [`Input`] to a [start
+ /// configuration](start::Config). Additionally, if an error occurs, it is
+ /// converted from a [`StartError`] to a [`MatchError`] using the offset
+ /// information in the given [`Input`].
///
/// # Errors
///
@@ -278,7 +315,18 @@ pub unsafe trait Automaton {
fn start_state_reverse(
&self,
input: &Input<'_>,
- ) -> Result<StateID, MatchError>;
+ ) -> Result<StateID, MatchError> {
+ let config = start::Config::from_input_reverse(input);
+ self.start_state(&config).map_err(|err| match err {
+ StartError::Quit { byte } => {
+ let offset = input.end();
+ MatchError::quit(byte, offset)
+ }
+ StartError::UnsupportedAnchored { mode } => {
+ MatchError::unsupported_anchored(mode)
+ }
+ })
+ }
/// If this DFA has a universal starting state for the given anchor mode
/// and the DFA supports universal starting states, then this returns that
@@ -1084,7 +1132,7 @@ pub unsafe trait Automaton {
/// // implementation defined.
/// //
/// // N.B. We get '3' by inspecting the state machine using 'regex-cli'.
- /// // e.g., try `regex-cli debug dfa dense '[^abc]+a' -BbUC`.
+ /// // e.g., try `regex-cli debug dense dfa -p '[^abc]+a' -BbUC`.
/// let id = StateID::new(3 * dfa.stride()).unwrap();
/// let accelerator = dfa.accelerator(id);
/// // The `[^abc]+` sub-expression permits [a, b, c] to be accelerated.
@@ -1799,6 +1847,14 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A {
}
#[inline]
+ fn start_state(
+ &self,
+ config: &start::Config,
+ ) -> Result<StateID, StartError> {
+ (**self).start_state(config)
+ }
+
+ #[inline]
fn start_state_forward(
&self,
input: &Input<'_>,
@@ -2015,6 +2071,90 @@ impl OverlappingState {
}
}
+/// An error that can occur when computing the start state for a search.
+///
+/// Computing a start state can fail for a few reasons, either based on
+/// incorrect configuration or even based on whether the look-behind byte
+/// triggers a quit state. Typically one does not need to handle this error
+/// if you're using [`Automaton::start_state_forward`] (or its reverse
+/// counterpart), as that routine automatically converts `StartError` to a
+/// [`MatchError`] for you.
+///
+/// This error may be returned by the [`Automaton::start_state`] routine.
+///
+/// This error implements the `std::error::Error` trait when the `std` feature
+/// is enabled.
+///
+/// This error is marked as non-exhaustive. New variants may be added in a
+/// semver compatible release.
+#[non_exhaustive]
+#[derive(Clone, Debug)]
+pub enum StartError {
+ /// An error that occurs when a starting configuration's look-behind byte
+ /// is in this DFA's quit set.
+ Quit {
+ /// The quit byte that was found.
+ byte: u8,
+ },
+ /// An error that occurs when the caller requests an anchored mode that
+ /// isn't supported by the DFA.
+ UnsupportedAnchored {
+ /// The anchored mode given that is unsupported.
+ mode: Anchored,
+ },
+}
+
+impl StartError {
+ pub(crate) fn quit(byte: u8) -> StartError {
+ StartError::Quit { byte }
+ }
+
+ pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError {
+ StartError::UnsupportedAnchored { mode }
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for StartError {}
+
+impl core::fmt::Display for StartError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ match *self {
+ StartError::Quit { byte } => write!(
+ f,
+ "error computing start state because the look-behind byte \
+ {:?} triggered a quit state",
+ crate::util::escape::DebugByte(byte),
+ ),
+ StartError::UnsupportedAnchored { mode: Anchored::Yes } => {
+ write!(
+ f,
+ "error computing start state because \
+ anchored searches are not supported or enabled"
+ )
+ }
+ StartError::UnsupportedAnchored { mode: Anchored::No } => {
+ write!(
+ f,
+ "error computing start state because \
+ unanchored searches are not supported or enabled"
+ )
+ }
+ StartError::UnsupportedAnchored {
+ mode: Anchored::Pattern(pid),
+ } => {
+ write!(
+ f,
+ "error computing start state because \
+ anchored searches for a specific pattern ({}) \
+ are not supported or enabled",
+ pid.as_usize(),
+ )
+ }
+ }
+ }
+}
+
/// Runs the given overlapping `search` function (forwards or backwards) until
/// a match is found whose offset does not split a codepoint.
///
diff --git a/vendor/regex-automata/src/dfa/dense.rs b/vendor/regex-automata/src/dfa/dense.rs
index 6da865f97..fd96bc878 100644
--- a/vendor/regex-automata/src/dfa/dense.rs
+++ b/vendor/regex-automata/src/dfa/dense.rs
@@ -30,7 +30,7 @@ use crate::{
use crate::{
dfa::{
accel::Accels,
- automaton::{fmt_state_indicator, Automaton},
+ automaton::{fmt_state_indicator, Automaton, StartError},
special::Special,
start::StartKind,
DEAD,
@@ -40,8 +40,8 @@ use crate::{
int::{Pointer, Usize},
prefilter::Prefilter,
primitives::{PatternID, StateID},
- search::{Anchored, Input, MatchError},
- start::{Start, StartByteMap},
+ search::Anchored,
+ start::{self, Start, StartByteMap},
wire::{self, DeserializeError, Endian, SerializeError},
},
};
@@ -66,8 +66,9 @@ const VERSION: u32 = 2;
///
/// The default configuration guarantees that a search will never return
/// a "quit" error, although it is possible for a search to fail if
-/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by
-/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`].
+/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is
+/// not by default) and an [`Anchored::Pattern`] mode is requested via
+/// [`Input`](crate::Input).
#[cfg(feature = "dfa-build")]
#[derive(Clone, Debug, Default)]
pub struct Config {
@@ -113,8 +114,7 @@ impl Config {
/// make searching slower than it otherwise would be if the transitions
/// that leave accelerated states are traversed frequently.
///
- /// See [`Automaton::accelerator`](crate::dfa::Automaton::accelerator) for
- /// an example.
+ /// See [`Automaton::accelerator`] for an example.
///
/// This is enabled by default.
pub fn accelerate(mut self, yes: bool) -> Config {
@@ -882,20 +882,20 @@ impl Config {
/// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039
/// use regex_automata::{dfa::{dense, Automaton}, Input};
///
- /// // 600KB isn't enough!
+ /// // 700KB isn't enough!
/// dense::Builder::new()
/// .configure(dense::Config::new()
- /// .determinize_size_limit(Some(600_000))
+ /// .determinize_size_limit(Some(700_000))
/// )
/// .build(r"\w{20}")
/// .unwrap_err();
///
- /// // ... but 700KB probably is!
+ /// // ... but 800KB probably is!
/// // (Note that auxiliary storage sizes aren't necessarily stable between
/// // releases.)
/// let dfa = dense::Builder::new()
/// .configure(dense::Config::new()
- /// .determinize_size_limit(Some(700_000))
+ /// .determinize_size_limit(Some(800_000))
/// )
/// .build(r"\w{20}")?;
/// let haystack = "A".repeat(20).into_bytes();
@@ -1228,13 +1228,14 @@ impl Builder {
} else {
let mut set = nfa.byte_class_set().clone();
// It is important to distinguish any "quit" bytes from all other
- // bytes. Otherwise, a non-quit byte may end up in the same class
- // as a quit byte, and thus cause the DFA stop when it shouldn't.
+ // bytes. Otherwise, a non-quit byte may end up in the same
+ // class as a quit byte, and thus cause the DFA to stop when it
+ // shouldn't.
//
// Test case:
//
- // regex-cli find hybrid regex -w @conn.json.1000x.log \
- // '^#' '\b10\.55\.182\.100\b'
+ // regex-cli find match dense --unicode-word-boundary \
+ // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log
if !quitset.is_empty() {
set.add_set(&quitset);
}
@@ -2345,6 +2346,24 @@ impl<'a> DFA<&'a [u32]> {
dfa.accels.validate()?;
// N.B. dfa.special doesn't have a way to do unchecked deserialization,
// so it has already been validated.
+ for state in dfa.states() {
+ // If the state is an accel state, then it must have a non-empty
+ // accelerator.
+ if dfa.is_accel_state(state.id()) {
+ let index = dfa.accelerator_index(state.id());
+ if index >= dfa.accels.len() {
+ return Err(DeserializeError::generic(
+ "found DFA state with invalid accelerator index",
+ ));
+ }
+ let needles = dfa.accels.needles(index);
+ if !(1 <= needles.len() && needles.len() <= 3) {
+ return Err(DeserializeError::generic(
+ "accelerator needles has invalid length",
+ ));
+ }
+ }
+ }
Ok((dfa, nread))
}
@@ -2885,31 +2904,33 @@ impl OwnedDFA {
fn set_universal_starts(&mut self) {
assert_eq!(6, Start::len(), "expected 6 start configurations");
- let start_id = |dfa: &mut OwnedDFA, inp: &Input<'_>, start: Start| {
+ let start_id = |dfa: &mut OwnedDFA,
+ anchored: Anchored,
+ start: Start| {
// This OK because we only call 'start' under conditions
// in which we know it will succeed.
- dfa.st.start(inp, start).expect("valid Input configuration")
+ dfa.st.start(anchored, start).expect("valid Input configuration")
};
if self.start_kind().has_unanchored() {
- let inp = Input::new("").anchored(Anchored::No);
- let sid = start_id(self, &inp, Start::NonWordByte);
- if sid == start_id(self, &inp, Start::WordByte)
- && sid == start_id(self, &inp, Start::Text)
- && sid == start_id(self, &inp, Start::LineLF)
- && sid == start_id(self, &inp, Start::LineCR)
- && sid == start_id(self, &inp, Start::CustomLineTerminator)
+ let anchor = Anchored::No;
+ let sid = start_id(self, anchor, Start::NonWordByte);
+ if sid == start_id(self, anchor, Start::WordByte)
+ && sid == start_id(self, anchor, Start::Text)
+ && sid == start_id(self, anchor, Start::LineLF)
+ && sid == start_id(self, anchor, Start::LineCR)
+ && sid == start_id(self, anchor, Start::CustomLineTerminator)
{
self.st.universal_start_unanchored = Some(sid);
}
}
if self.start_kind().has_anchored() {
- let inp = Input::new("").anchored(Anchored::Yes);
- let sid = start_id(self, &inp, Start::NonWordByte);
- if sid == start_id(self, &inp, Start::WordByte)
- && sid == start_id(self, &inp, Start::Text)
- && sid == start_id(self, &inp, Start::LineLF)
- && sid == start_id(self, &inp, Start::LineCR)
- && sid == start_id(self, &inp, Start::CustomLineTerminator)
+ let anchor = Anchored::Yes;
+ let sid = start_id(self, anchor, Start::NonWordByte);
+ if sid == start_id(self, anchor, Start::WordByte)
+ && sid == start_id(self, anchor, Start::Text)
+ && sid == start_id(self, anchor, Start::LineLF)
+ && sid == start_id(self, anchor, Start::LineCR)
+ && sid == start_id(self, anchor, Start::CustomLineTerminator)
{
self.st.universal_start_anchored = Some(sid);
}
@@ -3216,35 +3237,21 @@ unsafe impl<T: AsRef<[u32]>> Automaton for DFA<T> {
}
#[cfg_attr(feature = "perf-inline", inline(always))]
- fn start_state_forward(
- &self,
- input: &Input<'_>,
- ) -> Result<StateID, MatchError> {
- if !self.quitset.is_empty() && input.start() > 0 {
- let offset = input.start() - 1;
- let byte = input.haystack()[offset];
- if self.quitset.contains(byte) {
- return Err(MatchError::quit(byte, offset));
- }
- }
- let start = self.st.start_map.fwd(&input);
- self.st.start(input, start)
- }
-
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn start_state_reverse(
+ fn start_state(
&self,
- input: &Input<'_>,
- ) -> Result<StateID, MatchError> {
- if !self.quitset.is_empty() && input.end() < input.haystack().len() {
- let offset = input.end();
- let byte = input.haystack()[offset];
- if self.quitset.contains(byte) {
- return Err(MatchError::quit(byte, offset));
+ config: &start::Config,
+ ) -> Result<StateID, StartError> {
+ let anchored = config.get_anchored();
+ let start = match config.get_look_behind() {
+ None => Start::Text,
+ Some(byte) => {
+ if !self.quitset.is_empty() && self.quitset.contains(byte) {
+ return Err(StartError::quit(byte));
+ }
+ self.st.start_map.get(byte)
}
- }
- let start = self.st.start_map.rev(&input);
- self.st.start(input, start)
+ };
+ self.st.start(anchored, start)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
@@ -4180,28 +4187,27 @@ impl<T: AsRef<[u32]>> StartTable<T> {
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start(
&self,
- input: &Input<'_>,
+ anchored: Anchored,
start: Start,
- ) -> Result<StateID, MatchError> {
+ ) -> Result<StateID, StartError> {
let start_index = start.as_usize();
- let mode = input.get_anchored();
- let index = match mode {
+ let index = match anchored {
Anchored::No => {
if !self.kind.has_unanchored() {
- return Err(MatchError::unsupported_anchored(mode));
+ return Err(StartError::unsupported_anchored(anchored));
}
start_index
}
Anchored::Yes => {
if !self.kind.has_anchored() {
- return Err(MatchError::unsupported_anchored(mode));
+ return Err(StartError::unsupported_anchored(anchored));
}
self.stride + start_index
}
Anchored::Pattern(pid) => {
let len = match self.pattern_len {
None => {
- return Err(MatchError::unsupported_anchored(mode))
+ return Err(StartError::unsupported_anchored(anchored))
}
Some(len) => len,
};
@@ -5086,6 +5092,8 @@ impl core::fmt::Display for BuildError {
#[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
mod tests {
+ use crate::{Input, MatchError};
+
use super::*;
#[test]
diff --git a/vendor/regex-automata/src/dfa/mod.rs b/vendor/regex-automata/src/dfa/mod.rs
index 4bb870435..fd58cac23 100644
--- a/vendor/regex-automata/src/dfa/mod.rs
+++ b/vendor/regex-automata/src/dfa/mod.rs
@@ -320,7 +320,7 @@ dramatically.
#[cfg(feature = "dfa-search")]
pub use crate::dfa::{
- automaton::{Automaton, OverlappingState},
+ automaton::{Automaton, OverlappingState, StartError},
start::StartKind,
};
diff --git a/vendor/regex-automata/src/dfa/onepass.rs b/vendor/regex-automata/src/dfa/onepass.rs
index 44691d0c8..e62bbd383 100644
--- a/vendor/regex-automata/src/dfa/onepass.rs
+++ b/vendor/regex-automata/src/dfa/onepass.rs
@@ -2581,10 +2581,11 @@ impl Cache {
/// Represents a single transition in a one-pass DFA.
///
-/// The high 24 bits corresponds to the state ID. The low 48 bits corresponds
-/// to the transition epsilons, which contains the slots that should be saved
-/// when this transition is followed and the conditional epsilon transitions
-/// that must be satisfied in order to follow this transition.
+/// The high 21 bits corresponds to the state ID. The bit following corresponds
+/// to the special "match wins" flag. The remaining low 42 bits corresponds to
+/// the transition epsilons, which contains the slots that should be saved when
+/// this transition is followed and the conditional epsilon transitions that
+/// must be satisfied in order to follow this transition.
#[derive(Clone, Copy, Eq, PartialEq)]
struct Transition(u64);
@@ -2741,7 +2742,7 @@ impl PatternEpsilons {
fn set_epsilons(self, epsilons: Epsilons) -> PatternEpsilons {
PatternEpsilons(
(self.0 & PatternEpsilons::PATTERN_ID_MASK)
- | u64::from(epsilons.0),
+ | (u64::from(epsilons.0) & PatternEpsilons::EPSILONS_MASK),
)
}
}
@@ -2814,12 +2815,15 @@ impl Epsilons {
/// Return the set of look-around assertions in these epsilon transitions.
fn looks(self) -> LookSet {
- LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u16() }
+ LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u32() }
}
/// Set the look-around assertions on these epsilon transitions.
fn set_looks(self, look_set: LookSet) -> Epsilons {
- Epsilons((self.0 & Epsilons::SLOT_MASK) | u64::from(look_set.bits))
+ Epsilons(
+ (self.0 & Epsilons::SLOT_MASK)
+ | (u64::from(look_set.bits) & Epsilons::LOOK_MASK),
+ )
}
}
diff --git a/vendor/regex-automata/src/dfa/regex.rs b/vendor/regex-automata/src/dfa/regex.rs
index f39c1c055..5e7e6e38a 100644
--- a/vendor/regex-automata/src/dfa/regex.rs
+++ b/vendor/regex-automata/src/dfa/regex.rs
@@ -853,7 +853,7 @@ impl Builder {
}
/// Set the dense DFA compilation configuration for this builder using
- /// [`dense::Config`](dense::Config).
+ /// [`dense::Config`].
///
/// This permits setting things like whether the underlying DFAs should
/// be minimized.
diff --git a/vendor/regex-automata/src/dfa/search.rs b/vendor/regex-automata/src/dfa/search.rs
index 8c012a594..5a82261f9 100644
--- a/vendor/regex-automata/src/dfa/search.rs
+++ b/vendor/regex-automata/src/dfa/search.rs
@@ -176,7 +176,6 @@ fn find_fwd_imp<A: Automaton + ?Sized>(
// It's important that this is a debug_assert, since this can
// actually be tripped even if DFA::from_bytes succeeds and
// returns a supposedly valid DFA.
- debug_assert!(dfa.is_quit_state(sid));
return Err(MatchError::quit(input.haystack()[at], at));
}
}
@@ -297,7 +296,6 @@ fn find_rev_imp<A: Automaton + ?Sized>(
} else if dfa.is_dead_state(sid) {
return Ok(mat);
} else {
- debug_assert!(dfa.is_quit_state(sid));
return Err(MatchError::quit(input.haystack()[at], at));
}
}
@@ -422,7 +420,6 @@ fn find_overlapping_fwd_imp<A: Automaton + ?Sized>(
} else if dfa.is_dead_state(sid) {
return Ok(());
} else {
- debug_assert!(dfa.is_quit_state(sid));
return Err(MatchError::quit(
input.haystack()[state.at],
state.at,
@@ -526,7 +523,6 @@ pub(crate) fn find_overlapping_rev<A: Automaton + ?Sized>(
} else if dfa.is_dead_state(sid) {
return Ok(());
} else {
- debug_assert!(dfa.is_quit_state(sid));
return Err(MatchError::quit(
input.haystack()[state.at],
state.at,
@@ -600,9 +596,6 @@ fn eoi_fwd<A: Automaton + ?Sized>(
let pattern = dfa.match_pattern(*sid, 0);
*mat = Some(HalfMatch::new(pattern, input.haystack().len()));
}
- // N.B. We don't have to check 'is_quit' here because the EOI
- // transition can never lead to a quit state.
- debug_assert!(!dfa.is_quit_state(*sid));
}
}
Ok(())
@@ -631,9 +624,6 @@ fn eoi_rev<A: Automaton + ?Sized>(
let pattern = dfa.match_pattern(*sid, 0);
*mat = Some(HalfMatch::new(pattern, 0));
}
- // N.B. We don't have to check 'is_quit' here because the EOI
- // transition can never lead to a quit state.
- debug_assert!(!dfa.is_quit_state(*sid));
}
Ok(())
}
diff --git a/vendor/regex-automata/src/dfa/sparse.rs b/vendor/regex-automata/src/dfa/sparse.rs
index 5d8ec2340..d461e0a0f 100644
--- a/vendor/regex-automata/src/dfa/sparse.rs
+++ b/vendor/regex-automata/src/dfa/sparse.rs
@@ -3,13 +3,12 @@ Types and routines specific to sparse DFAs.
This module is the home of [`sparse::DFA`](DFA).
-Unlike the [`dense`](super::dense) module, this module does not contain a
-builder or configuration specific for sparse DFAs. Instead, the intended
-way to build a sparse DFA is either by using a default configuration with
-its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the
-construction of a dense DFA with [`dense::Builder`](super::dense::Builder)
-and then calling [`dense::DFA::to_sparse`](super::dense::DFA::to_sparse). For
-example, this configures a sparse DFA to do an overlapping search:
+Unlike the [`dense`] module, this module does not contain a builder or
+configuration specific for sparse DFAs. Instead, the intended way to build a
+sparse DFA is either by using a default configuration with its constructor
+[`sparse::DFA::new`](DFA::new), or by first configuring the construction of a
+dense DFA with [`dense::Builder`] and then calling [`dense::DFA::to_sparse`].
+For example, this configures a sparse DFA to do an overlapping search:
```
use regex_automata::{
@@ -52,7 +51,7 @@ use alloc::{vec, vec::Vec};
use crate::dfa::dense::{self, BuildError};
use crate::{
dfa::{
- automaton::{fmt_state_indicator, Automaton},
+ automaton::{fmt_state_indicator, Automaton, StartError},
dense::Flags,
special::Special,
StartKind, DEAD,
@@ -63,8 +62,8 @@ use crate::{
int::{Pointer, Usize, U16, U32},
prefilter::Prefilter,
primitives::{PatternID, StateID},
- search::{Anchored, Input, MatchError},
- start::{Start, StartByteMap},
+ search::Anchored,
+ start::{self, Start, StartByteMap},
wire::{self, DeserializeError, Endian, SerializeError},
},
};
@@ -74,18 +73,17 @@ const VERSION: u32 = 2;
/// A sparse deterministic finite automaton (DFA) with variable sized states.
///
-/// In contrast to a [dense::DFA](crate::dfa::dense::DFA), a sparse DFA uses
-/// a more space efficient representation for its transitions. Consequently,
-/// sparse DFAs may use much less memory than dense DFAs, but this comes at a
-/// price. In particular, reading the more space efficient transitions takes
-/// more work, and consequently, searching using a sparse DFA is typically
-/// slower than a dense DFA.
+/// In contrast to a [dense::DFA], a sparse DFA uses a more space efficient
+/// representation for its transitions. Consequently, sparse DFAs may use much
+/// less memory than dense DFAs, but this comes at a price. In particular,
+/// reading the more space efficient transitions takes more work, and
+/// consequently, searching using a sparse DFA is typically slower than a dense
+/// DFA.
///
/// A sparse DFA can be built using the default configuration via the
-/// [`DFA::new`] constructor. Otherwise, one can configure various aspects
-/// of a dense DFA via [`dense::Builder`](crate::dfa::dense::Builder),
-/// and then convert a dense DFA to a sparse DFA using
-/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse).
+/// [`DFA::new`] constructor. Otherwise, one can configure various aspects of a
+/// dense DFA via [`dense::Builder`], and then convert a dense DFA to a sparse
+/// DFA using [`dense::DFA::to_sparse`].
///
/// In general, a sparse DFA supports all the same search operations as a dense
/// DFA.
@@ -140,11 +138,9 @@ impl DFA<Vec<u8>> {
/// Parse the given regular expression using a default configuration and
/// return the corresponding sparse DFA.
///
- /// If you want a non-default configuration, then use
- /// the [`dense::Builder`](crate::dfa::dense::Builder)
- /// to set your own configuration, and then call
- /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create
- /// a sparse DFA.
+ /// If you want a non-default configuration, then use the
+ /// [`dense::Builder`] to set your own configuration, and then call
+ /// [`dense::DFA::to_sparse`] to create a sparse DFA.
///
/// # Example
///
@@ -167,11 +163,9 @@ impl DFA<Vec<u8>> {
/// Parse the given regular expressions using a default configuration and
/// return the corresponding multi-DFA.
///
- /// If you want a non-default configuration, then use
- /// the [`dense::Builder`](crate::dfa::dense::Builder)
- /// to set your own configuration, and then call
- /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create
- /// a sparse DFA.
+ /// If you want a non-default configuration, then use the
+ /// [`dense::Builder`] to set your own configuration, and then call
+ /// [`dense::DFA::to_sparse`] to create a sparse DFA.
///
/// # Example
///
@@ -511,10 +505,9 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// * [`DFA::from_bytes`]
/// * [`DFA::from_bytes_unchecked`]
///
- /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
- /// serialization methods, this does not add any initial padding to the
- /// returned bytes. Padding isn't required for sparse DFAs since they have
- /// no alignment requirements.
+ /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
+ /// not add any initial padding to the returned bytes. Padding isn't
+ /// required for sparse DFAs since they have no alignment requirements.
///
/// # Example
///
@@ -553,10 +546,9 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// * [`DFA::from_bytes`]
/// * [`DFA::from_bytes_unchecked`]
///
- /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
- /// serialization methods, this does not add any initial padding to the
- /// returned bytes. Padding isn't required for sparse DFAs since they have
- /// no alignment requirements.
+ /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
+ /// not add any initial padding to the returned bytes. Padding isn't
+ /// required for sparse DFAs since they have no alignment requirements.
///
/// # Example
///
@@ -595,10 +587,9 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// * [`DFA::from_bytes`]
/// * [`DFA::from_bytes_unchecked`]
///
- /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
- /// serialization methods, this does not add any initial padding to the
- /// returned bytes. Padding isn't required for sparse DFAs since they have
- /// no alignment requirements.
+ /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
+ /// not add any initial padding to the returned bytes. Padding isn't
+ /// required for sparse DFAs since they have no alignment requirements.
///
/// Generally speaking, native endian format should only be used when
/// you know that the target you're compiling the DFA for matches the
@@ -903,9 +894,9 @@ impl<'a> DFA<&'a [u8]> {
///
/// If any of the above are not true, then an error will be returned.
///
- /// Note that unlike deserializing a
- /// [`dense::DFA`](crate::dfa::dense::DFA), deserializing a sparse DFA has
- /// no alignment requirements. That is, an alignment of `1` is valid.
+ /// Note that unlike deserializing a [`dense::DFA`], deserializing a sparse
+ /// DFA has no alignment requirements. That is, an alignment of `1` is
+ /// valid.
///
/// # Panics
///
@@ -1001,8 +992,8 @@ impl<'a> DFA<&'a [u8]> {
// (by trying to decode every state) and start state ID list below. If
// either validation fails, then we return an error.
let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
- dfa.tt.validate(&dfa.special)?;
- dfa.st.validate(&dfa.special, &dfa.tt)?;
+ let seen = dfa.tt.validate(&dfa.special)?;
+ dfa.st.validate(&dfa.special, &seen)?;
// N.B. dfa.special doesn't have a way to do unchecked deserialization,
// so it has already been validated.
Ok((dfa, nread))
@@ -1207,35 +1198,21 @@ unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
}
#[inline]
- fn start_state_forward(
+ fn start_state(
&self,
- input: &Input<'_>,
- ) -> Result<StateID, MatchError> {
- if !self.quitset.is_empty() && input.start() > 0 {
- let offset = input.start() - 1;
- let byte = input.haystack()[offset];
- if self.quitset.contains(byte) {
- return Err(MatchError::quit(byte, offset));
- }
- }
- let start = self.st.start_map.fwd(&input);
- self.st.start(input, start)
- }
-
- #[inline]
- fn start_state_reverse(
- &self,
- input: &Input<'_>,
- ) -> Result<StateID, MatchError> {
- if !self.quitset.is_empty() && input.end() < input.haystack().len() {
- let offset = input.end();
- let byte = input.haystack()[offset];
- if self.quitset.contains(byte) {
- return Err(MatchError::quit(byte, offset));
+ config: &start::Config,
+ ) -> Result<StateID, StartError> {
+ let anchored = config.get_anchored();
+ let start = match config.get_look_behind() {
+ None => Start::Text,
+ Some(byte) => {
+ if !self.quitset.is_empty() && self.quitset.contains(byte) {
+ return Err(StartError::quit(byte));
+ }
+ self.st.start_map.get(byte)
}
- }
- let start = self.st.start_map.rev(&input);
- self.st.start(input, start)
+ };
+ self.st.start(anchored, start)
}
#[inline]
@@ -1411,63 +1388,8 @@ impl<T: AsRef<[u8]>> Transitions<T> {
///
/// That is, every state ID can be used to correctly index a state in this
/// table.
- fn validate(&self, sp: &Special) -> Result<(), DeserializeError> {
- // In order to validate everything, we not only need to make sure we
- // can decode every state, but that every transition in every state
- // points to a valid state. There are many duplicative transitions, so
- // we record state IDs that we've verified so that we don't redo the
- // decoding work.
- //
- // Except, when in no_std mode, we don't have dynamic memory allocation
- // available to us, so we skip this optimization. It's not clear
- // whether doing something more clever is worth it just yet. If you're
- // profiling this code and need it to run faster, please file an issue.
- //
- // OK, so we also use this to record the set of valid state IDs. Since
- // it is possible for a transition to point to an invalid state ID that
- // still (somehow) deserializes to a valid state. So we need to make
- // sure our transitions are limited to actually correct state IDs.
- // The problem is, I'm not sure how to do this verification step in
- // no-std no-alloc mode. I think we'd *have* to store the set of valid
- // state IDs in the DFA itself. For now, we don't do this verification
- // in no-std no-alloc mode. The worst thing that can happen is an
- // incorrect result. But no panics or memory safety problems should
- // result. Because we still do validate that the state itself is
- // "valid" in the sense that everything it points to actually exists.
- //
- // ---AG
- struct Seen {
- #[cfg(feature = "alloc")]
- set: alloc::collections::BTreeSet<StateID>,
- #[cfg(not(feature = "alloc"))]
- set: core::marker::PhantomData<StateID>,
- }
-
- #[cfg(feature = "alloc")]
- impl Seen {
- fn new() -> Seen {
- Seen { set: alloc::collections::BTreeSet::new() }
- }
- fn insert(&mut self, id: StateID) {
- self.set.insert(id);
- }
- fn contains(&self, id: &StateID) -> bool {
- self.set.contains(id)
- }
- }
-
- #[cfg(not(feature = "alloc"))]
- impl Seen {
- fn new() -> Seen {
- Seen { set: core::marker::PhantomData }
- }
- fn insert(&mut self, _id: StateID) {}
- fn contains(&self, _id: &StateID) -> bool {
- false
- }
- }
-
- let mut verified: Seen = Seen::new();
+ fn validate(&self, sp: &Special) -> Result<Seen, DeserializeError> {
+ let mut verified = Seen::new();
// We need to make sure that we decode the correct number of states.
// Otherwise, an empty set of transitions would validate even if the
// recorded state length is non-empty.
@@ -1544,7 +1466,7 @@ impl<T: AsRef<[u8]>> Transitions<T> {
"mismatching sparse state length",
));
}
- Ok(())
+ Ok(verified)
}
/// Converts these transitions to a borrowed value.
@@ -1682,7 +1604,7 @@ impl<T: AsRef<[u8]>> Transitions<T> {
let state = &state[nr..];
if npats == 0 {
return Err(DeserializeError::generic(
- "state marked as a match, but has no pattern IDs",
+ "state marked as a match, but pattern length is zero",
));
}
@@ -1704,6 +1626,21 @@ impl<T: AsRef<[u8]>> Transitions<T> {
} else {
(&[][..], state)
};
+ if is_match && pattern_ids.is_empty() {
+ return Err(DeserializeError::generic(
+ "state marked as a match, but has no pattern IDs",
+ ));
+ }
+ if sp.is_match_state(id) && pattern_ids.is_empty() {
+ return Err(DeserializeError::generic(
+ "state marked special as a match, but has no pattern IDs",
+ ));
+ }
+ if sp.is_match_state(id) != is_match {
+ return Err(DeserializeError::generic(
+ "whether state is a match or not is inconsistent",
+ ));
+ }
// Now read this state's accelerator info. The first byte is the length
// of the accelerator, which is typically 0 (for no acceleration) but
@@ -2084,28 +2021,19 @@ impl<T: AsRef<[u8]>> StartTable<T> {
fn validate(
&self,
sp: &Special,
- trans: &Transitions<T>,
+ seen: &Seen,
) -> Result<(), DeserializeError> {
for (id, _, _) in self.iter() {
+ if !seen.contains(&id) {
+ return Err(DeserializeError::generic(
+ "found invalid start state ID",
+ ));
+ }
if sp.is_match_state(id) {
return Err(DeserializeError::generic(
"start states cannot be match states",
));
}
- // Confirm that the start state points to a valid state.
- let state = trans.try_state(sp, id)?;
- // And like for the transition table, confirm that the transitions
- // on all start states themselves point to a valid state.
- //
- // It'd probably be better to integrate this validation with the
- // transition table, or otherwise store a sorted sequence of all
- // valid state IDs in the sparse DFA itself. That way, we could
- // check that every pointer to a state corresponds precisely to a
- // correct and valid state.
- for i in 0..state.ntrans {
- let to = state.next_at(i);
- let _ = trans.try_state(sp, to)?;
- }
}
Ok(())
}
@@ -2145,28 +2073,27 @@ impl<T: AsRef<[u8]>> StartTable<T> {
/// panics.
fn start(
&self,
- input: &Input<'_>,
+ anchored: Anchored,
start: Start,
- ) -> Result<StateID, MatchError> {
+ ) -> Result<StateID, StartError> {
let start_index = start.as_usize();
- let mode = input.get_anchored();
- let index = match mode {
+ let index = match anchored {
Anchored::No => {
if !self.kind.has_unanchored() {
- return Err(MatchError::unsupported_anchored(mode));
+ return Err(StartError::unsupported_anchored(anchored));
}
start_index
}
Anchored::Yes => {
if !self.kind.has_anchored() {
- return Err(MatchError::unsupported_anchored(mode));
+ return Err(StartError::unsupported_anchored(anchored));
}
self.stride + start_index
}
Anchored::Pattern(pid) => {
let len = match self.pattern_len {
None => {
- return Err(MatchError::unsupported_anchored(mode))
+ return Err(StartError::unsupported_anchored(anchored))
}
Some(len) => len,
};
@@ -2561,6 +2488,62 @@ impl<'a> fmt::Debug for StateMut<'a> {
}
}
+// In order to validate everything, we not only need to make sure we
+// can decode every state, but that every transition in every state
+// points to a valid state. There are many duplicative transitions, so
+// we record state IDs that we've verified so that we don't redo the
+// decoding work.
+//
+// Except, when in no_std mode, we don't have dynamic memory allocation
+// available to us, so we skip this optimization. It's not clear
+// whether doing something more clever is worth it just yet. If you're
+// profiling this code and need it to run faster, please file an issue.
+//
+// OK, so we also use this to record the set of valid state IDs. Since
+// it is possible for a transition to point to an invalid state ID that
+// still (somehow) deserializes to a valid state. So we need to make
+// sure our transitions are limited to actually correct state IDs.
+// The problem is, I'm not sure how to do this verification step in
+// no-std no-alloc mode. I think we'd *have* to store the set of valid
+// state IDs in the DFA itself. For now, we don't do this verification
+// in no-std no-alloc mode. The worst thing that can happen is an
+// incorrect result. But no panics or memory safety problems should
+// result. Because we still do validate that the state itself is
+// "valid" in the sense that everything it points to actually exists.
+//
+// ---AG
+#[derive(Debug)]
+struct Seen {
+ #[cfg(feature = "alloc")]
+ set: alloc::collections::BTreeSet<StateID>,
+ #[cfg(not(feature = "alloc"))]
+ set: core::marker::PhantomData<StateID>,
+}
+
+#[cfg(feature = "alloc")]
+impl Seen {
+ fn new() -> Seen {
+ Seen { set: alloc::collections::BTreeSet::new() }
+ }
+ fn insert(&mut self, id: StateID) {
+ self.set.insert(id);
+ }
+ fn contains(&self, id: &StateID) -> bool {
+ self.set.contains(id)
+ }
+}
+
+#[cfg(not(feature = "alloc"))]
+impl Seen {
+ fn new() -> Seen {
+ Seen { set: core::marker::PhantomData }
+ }
+ fn insert(&mut self, _id: StateID) {}
+ fn contains(&self, _id: &StateID) -> bool {
+ true
+ }
+}
+
/*
/// A binary search routine specialized specifically to a sparse DFA state's
/// transitions. Specifically, the transitions are defined as a set of pairs
diff --git a/vendor/regex-automata/src/hybrid/dfa.rs b/vendor/regex-automata/src/hybrid/dfa.rs
index 67261c1a3..bd9179b19 100644
--- a/vendor/regex-automata/src/hybrid/dfa.rs
+++ b/vendor/regex-automata/src/hybrid/dfa.rs
@@ -13,7 +13,7 @@ use alloc::vec::Vec;
use crate::{
hybrid::{
- error::{BuildError, CacheError},
+ error::{BuildError, CacheError, StartError},
id::{LazyStateID, LazyStateIDError},
search,
},
@@ -28,7 +28,7 @@ use crate::{
Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet,
},
sparse_set::SparseSets,
- start::{Start, StartByteMap},
+ start::{self, Start, StartByteMap},
},
};
@@ -1518,8 +1518,8 @@ impl DFA {
Lazy::new(self, cache).cache_next_state(current, unit)
}
- /// Return the ID of the start state for this lazy DFA when executing a
- /// forward search.
+ /// Return the ID of the start state for this lazy DFA for the given
+ /// starting configuration.
///
/// Unlike typical DFA implementations, the start state for DFAs in this
/// crate is dependent on a few different factors:
@@ -1527,85 +1527,122 @@ impl DFA {
/// * The [`Anchored`] mode of the search. Unanchored, anchored and
/// anchored searches for a specific [`PatternID`] all use different start
/// states.
- /// * The position at which the search begins, via [`Input::start`]. This
- /// and the byte immediately preceding the start of the search (if one
- /// exists) influence which look-behind assertions are true at the start
- /// of the search. This in turn influences which start state is selected.
- /// * Whether the search is a forward or reverse search. This routine can
- /// only be used for forward searches.
+ /// * Whether a "look-behind" byte exists. For example, the `^` anchor
+ /// matches if and only if there is no look-behind byte.
+ /// * The specific value of that look-behind byte. For example, a `(?m:^)`
+ /// assertion only matches when there is either no look-behind byte, or
+ /// when the look-behind byte is a line terminator.
+ ///
+ /// The [starting configuration](start::Config) provides the above
+ /// information.
+ ///
+ /// This routine can be used for either forward or reverse searches.
+ /// Although, as a convenience, if you have an [`Input`], then it
+ /// may be more succinct to use [`DFA::start_state_forward`] or
+ /// [`DFA::start_state_reverse`]. Note, for example, that the convenience
+ /// routines return a [`MatchError`] on failure where as this routine
+ /// returns a [`StartError`].
///
/// # Errors
///
- /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search
- /// needs to give up when determining the start state (for example, if
- /// it sees a "quit" byte or if the cache has been cleared too many
- /// times). This can also return an error if the given `Input` contains an
- /// unsupported [`Anchored`] configuration.
+ /// This may return a [`StartError`] if the search needs to give up when
+ /// determining the start state (for example, if it sees a "quit" byte
+ /// or if the cache has become inefficient). This can also return an
+ /// error if the given configuration contains an unsupported [`Anchored`]
+ /// configuration.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn start_state(
+ &self,
+ cache: &mut Cache,
+ config: &start::Config,
+ ) -> Result<LazyStateID, StartError> {
+ let lazy = LazyRef::new(self, cache);
+ let anchored = config.get_anchored();
+ let start = match config.get_look_behind() {
+ None => Start::Text,
+ Some(byte) => {
+ if !self.quitset.is_empty() && self.quitset.contains(byte) {
+ return Err(StartError::quit(byte));
+ }
+ self.start_map.get(byte)
+ }
+ };
+ let start_id = lazy.get_cached_start_id(anchored, start)?;
+ if !start_id.is_unknown() {
+ return Ok(start_id);
+ }
+ Lazy::new(self, cache).cache_start_group(anchored, start)
+ }
+
+ /// Return the ID of the start state for this lazy DFA when executing a
+ /// forward search.
+ ///
+ /// This is a convenience routine for calling [`DFA::start_state`] that
+ /// converts the given [`Input`] to a [start configuration](start::Config).
+ /// Additionally, if an error occurs, it is converted from a [`StartError`]
+ /// to a [`MatchError`] using the offset information in the given
+ /// [`Input`].
+ ///
+ /// # Errors
+ ///
+ /// This may return a [`MatchError`] if the search needs to give up when
+ /// determining the start state (for example, if it sees a "quit" byte or
+ /// if the cache has become inefficient). This can also return an error if
+ /// the given `Input` contains an unsupported [`Anchored`] configuration.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn start_state_forward(
&self,
cache: &mut Cache,
input: &Input<'_>,
) -> Result<LazyStateID, MatchError> {
- if !self.quitset.is_empty() && input.start() > 0 {
- let offset = input.start() - 1;
- let byte = input.haystack()[offset];
- if self.quitset.contains(byte) {
- return Err(MatchError::quit(byte, offset));
+ let config = start::Config::from_input_forward(input);
+ self.start_state(cache, &config).map_err(|err| match err {
+ StartError::Cache { .. } => MatchError::gave_up(input.start()),
+ StartError::Quit { byte } => {
+ let offset = input
+ .start()
+ .checked_sub(1)
+ .expect("no quit in start without look-behind");
+ MatchError::quit(byte, offset)
}
- }
- let start_type = self.start_map.fwd(input);
- let start = LazyRef::new(self, cache)
- .get_cached_start_id(input, start_type)?;
- if !start.is_unknown() {
- return Ok(start);
- }
- Lazy::new(self, cache).cache_start_group(input, start_type)
+ StartError::UnsupportedAnchored { mode } => {
+ MatchError::unsupported_anchored(mode)
+ }
+ })
}
/// Return the ID of the start state for this lazy DFA when executing a
/// reverse search.
///
- /// Unlike typical DFA implementations, the start state for DFAs in this
- /// crate is dependent on a few different factors:
- ///
- /// * The [`Anchored`] mode of the search. Unanchored, anchored and
- /// anchored searches for a specific [`PatternID`] all use different start
- /// states.
- /// * The position at which the search begins, via [`Input::start`]. This
- /// and the byte immediately preceding the start of the search (if one
- /// exists) influence which look-behind assertions are true at the start
- /// of the search. This in turn influences which start state is selected.
- /// * Whether the search is a forward or reverse search. This routine can
- /// only be used for reverse searches.
+ /// This is a convenience routine for calling [`DFA::start_state`] that
+ /// converts the given [`Input`] to a [start configuration](start::Config).
+ /// Additionally, if an error occurs, it is converted from a [`StartError`]
+ /// to a [`MatchError`] using the offset information in the given
+ /// [`Input`].
///
/// # Errors
///
- /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search
- /// needs to give up when determining the start state (for example, if
- /// it sees a "quit" byte or if the cache has been cleared too many
- /// times). This can also return an error if the given `Input` contains an
- /// unsupported [`Anchored`] configuration.
+ /// This may return a [`MatchError`] if the search needs to give up when
+ /// determining the start state (for example, if it sees a "quit" byte or
+ /// if the cache has become inefficient). This can also return an error if
+ /// the given `Input` contains an unsupported [`Anchored`] configuration.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn start_state_reverse(
&self,
cache: &mut Cache,
input: &Input<'_>,
) -> Result<LazyStateID, MatchError> {
- if !self.quitset.is_empty() && input.end() < input.haystack().len() {
- let offset = input.end();
- let byte = input.haystack()[offset];
- if self.quitset.contains(byte) {
- return Err(MatchError::quit(byte, offset));
+ let config = start::Config::from_input_reverse(input);
+ self.start_state(cache, &config).map_err(|err| match err {
+ StartError::Cache { .. } => MatchError::gave_up(input.end()),
+ StartError::Quit { byte } => {
+ let offset = input.end();
+ MatchError::quit(byte, offset)
}
- }
- let start_type = self.start_map.rev(input);
- let start = LazyRef::new(self, cache)
- .get_cached_start_id(input, start_type)?;
- if !start.is_unknown() {
- return Ok(start);
- }
- Lazy::new(self, cache).cache_start_group(input, start_type)
+ StartError::UnsupportedAnchored { mode } => {
+ MatchError::unsupported_anchored(mode)
+ }
+ })
}
/// Returns the total number of patterns that match in this state.
@@ -2066,8 +2103,10 @@ impl<'i, 'c> Lazy<'i, 'c> {
/// Here's an example that justifies 'inline(never)'
///
/// ```ignore
- /// regex-cli find hybrid dfa \
- /// @all-codepoints-utf8-100x '\pL{100}' --cache-capacity 10000000
+ /// regex-cli find match hybrid \
+ /// --cache-capacity 100000000 \
+ /// -p '\pL{100}'
+ /// all-codepoints-utf8-100x
/// ```
///
/// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every
@@ -2122,16 +2161,15 @@ impl<'i, 'c> Lazy<'i, 'c> {
#[inline(never)]
fn cache_start_group(
&mut self,
- input: &Input<'_>,
+ anchored: Anchored,
start: Start,
- ) -> Result<LazyStateID, MatchError> {
- let mode = input.get_anchored();
- let nfa_start_id = match mode {
+ ) -> Result<LazyStateID, StartError> {
+ let nfa_start_id = match anchored {
Anchored::No => self.dfa.get_nfa().start_unanchored(),
Anchored::Yes => self.dfa.get_nfa().start_anchored(),
Anchored::Pattern(pid) => {
if !self.dfa.get_config().get_starts_for_each_pattern() {
- return Err(MatchError::unsupported_anchored(mode));
+ return Err(StartError::unsupported_anchored(anchored));
}
match self.dfa.get_nfa().start_pattern(pid) {
None => return Ok(self.as_ref().dead_id()),
@@ -2142,8 +2180,8 @@ impl<'i, 'c> Lazy<'i, 'c> {
let id = self
.cache_start_one(nfa_start_id, start)
- .map_err(|_| MatchError::gave_up(input.start()))?;
- self.set_start_state(input, start, id);
+ .map_err(StartError::cache)?;
+ self.set_start_state(anchored, start, id);
Ok(id)
}
@@ -2574,13 +2612,13 @@ impl<'i, 'c> Lazy<'i, 'c> {
/// 'starts_for_each_pattern' is not enabled.
fn set_start_state(
&mut self,
- input: &Input<'_>,
+ anchored: Anchored,
start: Start,
id: LazyStateID,
) {
assert!(self.as_ref().is_valid(id));
let start_index = start.as_usize();
- let index = match input.get_anchored() {
+ let index = match anchored {
Anchored::No => start_index,
Anchored::Yes => Start::len() + start_index,
Anchored::Pattern(pid) => {
@@ -2642,17 +2680,16 @@ impl<'i, 'c> LazyRef<'i, 'c> {
#[cfg_attr(feature = "perf-inline", inline(always))]
fn get_cached_start_id(
&self,
- input: &Input<'_>,
+ anchored: Anchored,
start: Start,
- ) -> Result<LazyStateID, MatchError> {
+ ) -> Result<LazyStateID, StartError> {
let start_index = start.as_usize();
- let mode = input.get_anchored();
- let index = match mode {
+ let index = match anchored {
Anchored::No => start_index,
Anchored::Yes => Start::len() + start_index,
Anchored::Pattern(pid) => {
if !self.dfa.get_config().get_starts_for_each_pattern() {
- return Err(MatchError::unsupported_anchored(mode));
+ return Err(StartError::unsupported_anchored(anchored));
}
if pid.as_usize() >= self.dfa.pattern_len() {
return Ok(self.dead_id());
@@ -3178,12 +3215,12 @@ impl Config {
/// be quit bytes _only_ when a Unicode word boundary is present in the
/// pattern.
///
- /// When enabling this option, callers _must_ be prepared to handle
- /// a [`MatchError`](crate::MatchError) error during search.
- /// When using a [`Regex`](crate::hybrid::regex::Regex), this
- /// corresponds to using the `try_` suite of methods. Alternatively,
- /// if callers can guarantee that their input is ASCII only, then a
- /// [`MatchError::quit`] error will never be returned while searching.
+ /// When enabling this option, callers _must_ be prepared to
+ /// handle a [`MatchError`] error during search. When using a
+ /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
+ /// `try_` suite of methods. Alternatively, if callers can guarantee that
+ /// their input is ASCII only, then a [`MatchError::quit`] error will never
+ /// be returned while searching.
///
/// This is disabled by default.
///
@@ -3269,8 +3306,8 @@ impl Config {
/// (The advantage being that non-ASCII quit bytes will only be added if a
/// Unicode word boundary is in the pattern.)
///
- /// When enabling this option, callers _must_ be prepared to handle a
- /// [`MatchError`](crate::MatchError) error during search. When using a
+ /// When enabling this option, callers _must_ be prepared to
+ /// handle a [`MatchError`] error during search. When using a
/// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
/// `try_` suite of methods.
///
@@ -3795,8 +3832,8 @@ impl Config {
//
// Test case:
//
- // regex-cli find hybrid regex -w @conn.json.1000x.log \
- // '^#' '\b10\.55\.182\.100\b'
+ // regex-cli find match hybrid --unicode-word-boundary \
+ // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log
if !quit.is_empty() {
set.add_set(&quit);
}
diff --git a/vendor/regex-automata/src/hybrid/error.rs b/vendor/regex-automata/src/hybrid/error.rs
index 604daf3c3..d134e7ec9 100644
--- a/vendor/regex-automata/src/hybrid/error.rs
+++ b/vendor/regex-automata/src/hybrid/error.rs
@@ -1,4 +1,4 @@
-use crate::{hybrid::id::LazyStateIDError, nfa};
+use crate::{hybrid::id::LazyStateIDError, nfa, util::search::Anchored};
/// An error that occurs when initial construction of a lazy DFA fails.
///
@@ -95,6 +95,113 @@ impl core::fmt::Display for BuildError {
}
}
+/// An error that can occur when computing the start state for a search.
+///
+/// Computing a start state can fail for a few reasons, either
+/// based on incorrect configuration or even based on whether
+/// the look-behind byte triggers a quit state. Typically
+/// one does not need to handle this error if you're using
+/// [`DFA::start_state_forward`](crate::hybrid::dfa::DFA::start_state_forward)
+/// (or its reverse counterpart), as that routine automatically converts
+/// `StartError` to a [`MatchError`](crate::MatchError) for you.
+///
+/// This error may be returned by the
+/// [`DFA::start_state`](crate::hybrid::dfa::DFA::start_state) routine.
+///
+/// This error implements the `std::error::Error` trait when the `std` feature
+/// is enabled.
+///
+/// This error is marked as non-exhaustive. New variants may be added in a
+/// semver compatible release.
+#[non_exhaustive]
+#[derive(Clone, Debug)]
+pub enum StartError {
+ /// An error that occurs when cache inefficiency has dropped below the
+ /// configured heuristic thresholds.
+ Cache {
+ /// The underlying cache error that occurred.
+ err: CacheError,
+ },
+ /// An error that occurs when a starting configuration's look-behind byte
+ /// is in this DFA's quit set.
+ Quit {
+ /// The quit byte that was found.
+ byte: u8,
+ },
+ /// An error that occurs when the caller requests an anchored mode that
+ /// isn't supported by the DFA.
+ UnsupportedAnchored {
+ /// The anchored mode given that is unsupported.
+ mode: Anchored,
+ },
+}
+
+impl StartError {
+ pub(crate) fn cache(err: CacheError) -> StartError {
+ StartError::Cache { err }
+ }
+
+ pub(crate) fn quit(byte: u8) -> StartError {
+ StartError::Quit { byte }
+ }
+
+ pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError {
+ StartError::UnsupportedAnchored { mode }
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for StartError {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ match *self {
+ StartError::Cache { ref err } => Some(err),
+ _ => None,
+ }
+ }
+}
+
+impl core::fmt::Display for StartError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ match *self {
+ StartError::Cache { .. } => write!(
+ f,
+ "error computing start state because of cache inefficiency"
+ ),
+ StartError::Quit { byte } => write!(
+ f,
+ "error computing start state because the look-behind byte \
+ {:?} triggered a quit state",
+ crate::util::escape::DebugByte(byte),
+ ),
+ StartError::UnsupportedAnchored { mode: Anchored::Yes } => {
+ write!(
+ f,
+ "error computing start state because \
+ anchored searches are not supported or enabled"
+ )
+ }
+ StartError::UnsupportedAnchored { mode: Anchored::No } => {
+ write!(
+ f,
+ "error computing start state because \
+ unanchored searches are not supported or enabled"
+ )
+ }
+ StartError::UnsupportedAnchored {
+ mode: Anchored::Pattern(pid),
+ } => {
+ write!(
+ f,
+ "error computing start state because \
+ anchored searches for a specific pattern ({}) \
+ are not supported or enabled",
+ pid.as_usize(),
+ )
+ }
+ }
+ }
+}
+
/// An error that occurs when cache usage has become inefficient.
///
/// One of the weaknesses of a lazy DFA is that it may need to clear its
@@ -126,11 +233,7 @@ impl CacheError {
}
#[cfg(feature = "std")]
-impl std::error::Error for CacheError {
- fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
- None
- }
-}
+impl std::error::Error for CacheError {}
impl core::fmt::Display for CacheError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
diff --git a/vendor/regex-automata/src/hybrid/mod.rs b/vendor/regex-automata/src/hybrid/mod.rs
index 44e67e129..2feb839d1 100644
--- a/vendor/regex-automata/src/hybrid/mod.rs
+++ b/vendor/regex-automata/src/hybrid/mod.rs
@@ -133,7 +133,7 @@ compiled DFAs.
*/
pub use self::{
- error::{BuildError, CacheError},
+ error::{BuildError, CacheError, StartError},
id::LazyStateID,
};
diff --git a/vendor/regex-automata/src/hybrid/regex.rs b/vendor/regex-automata/src/hybrid/regex.rs
index 75667daf9..b3b1fe317 100644
--- a/vendor/regex-automata/src/hybrid/regex.rs
+++ b/vendor/regex-automata/src/hybrid/regex.rs
@@ -878,7 +878,7 @@ impl Builder {
}
/// Set the lazy DFA compilation configuration for this builder using
- /// [`dfa::Config`](dfa::Config).
+ /// [`dfa::Config`].
///
/// This permits setting things like whether Unicode word boundaries should
/// be heuristically supported or settings how the behavior of the cache.
diff --git a/vendor/regex-automata/src/hybrid/search.rs b/vendor/regex-automata/src/hybrid/search.rs
index f23283685..1f4a505db 100644
--- a/vendor/regex-automata/src/hybrid/search.rs
+++ b/vendor/regex-automata/src/hybrid/search.rs
@@ -105,14 +105,14 @@ fn find_fwd_imp(
// PERF: For justification of omitting bounds checks, it gives us a
// ~10% bump in search time. This was used for a benchmark:
//
- // regex-cli find hybrid dfa @bigfile '(?m)^.+$' -UBb
+ // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile
//
// PERF: For justification for the loop unrolling, we use a few
// different tests:
//
- // regex-cli find hybrid dfa @$bigfile '\w{50}' -UBb
- // regex-cli find hybrid dfa @$bigfile '(?m)^.+$' -UBb
- // regex-cli find hybrid dfa @$bigfile 'ZQZQZQZQ' -UBb
+ // regex-cli find half hybrid -p '\w{50}' -UBb bigfile
+ // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile
+ // regex-cli find half hybrid -p 'ZQZQZQZQ' -UBb bigfile
//
// And there are three different configurations:
//
@@ -353,7 +353,7 @@ fn find_rev_imp(
// anchored and on shorter haystacks. However, this still makes a
// difference. Take this command for example:
//
- // regex-cli find hybrid regex @$bigfile '(?m)^.+$' -UBb
+ // regex-cli find match hybrid -p '(?m)^.+$' -UBb bigfile
//
// (Notice that we use 'find hybrid regex', not 'find hybrid dfa'
// like in the justification for the forward direction. The 'regex'
diff --git a/vendor/regex-automata/src/meta/limited.rs b/vendor/regex-automata/src/meta/limited.rs
index 192a2625e..5653adc9a 100644
--- a/vendor/regex-automata/src/meta/limited.rs
+++ b/vendor/regex-automata/src/meta/limited.rs
@@ -69,9 +69,6 @@ pub(crate) fn dfa_try_search_half_rev(
} else if dfa.is_dead_state(sid) {
return Ok(mat);
} else if dfa.is_quit_state(sid) {
- if mat.is_some() {
- return Ok(mat);
- }
return Err(MatchError::quit(input.haystack()[at], at).into());
}
}
@@ -155,9 +152,6 @@ pub(crate) fn hybrid_try_search_half_rev(
} else if sid.is_dead() {
return Ok(mat);
} else if sid.is_quit() {
- if mat.is_some() {
- return Ok(mat);
- }
return Err(MatchError::quit(input.haystack()[at], at).into());
}
}
@@ -209,9 +203,6 @@ fn dfa_eoi_rev(
let pattern = dfa.match_pattern(*sid, 0);
*mat = Some(HalfMatch::new(pattern, sp.start));
} else if dfa.is_quit_state(*sid) {
- if mat.is_some() {
- return Ok(());
- }
return Err(MatchError::quit(byte, sp.start - 1));
}
} else {
@@ -246,9 +237,6 @@ fn hybrid_eoi_rev(
let pattern = dfa.match_pattern(cache, *sid, 0);
*mat = Some(HalfMatch::new(pattern, sp.start));
} else if sid.is_quit() {
- if mat.is_some() {
- return Ok(());
- }
return Err(MatchError::quit(byte, sp.start - 1));
}
} else {
diff --git a/vendor/regex-automata/src/meta/regex.rs b/vendor/regex-automata/src/meta/regex.rs
index 3a04b14d8..a06d2bb48 100644
--- a/vendor/regex-automata/src/meta/regex.rs
+++ b/vendor/regex-automata/src/meta/regex.rs
@@ -2706,7 +2706,7 @@ impl Config {
/// you're compiling untrusted patterns.
///
/// Note that this limit is applied to _each_ NFA built, and if any of
- /// them excceed the limit, then construction will fail. This limit does
+ /// them exceed the limit, then construction will fail. This limit does
/// _not_ correspond to the total memory used by all NFAs in the meta regex
/// engine.
///
@@ -3640,8 +3640,8 @@ mod tests {
// I found this in the course of building out the benchmark suite for
// rebar.
#[test]
- fn regression() {
- env_logger::init();
+ fn regression_suffix_literal_count() {
+ let _ = env_logger::try_init();
let re = Regex::new(r"[a-zA-Z]+ing").unwrap();
assert_eq!(1, re.find_iter("tingling").count());
diff --git a/vendor/regex-automata/src/meta/stopat.rs b/vendor/regex-automata/src/meta/stopat.rs
index e8d716689..c4dcd797a 100644
--- a/vendor/regex-automata/src/meta/stopat.rs
+++ b/vendor/regex-automata/src/meta/stopat.rs
@@ -81,9 +81,6 @@ pub(crate) fn dfa_try_search_half_fwd(
} else if dfa.is_dead_state(sid) {
return Ok(mat.ok_or(at));
} else if dfa.is_quit_state(sid) {
- if mat.is_some() {
- return Ok(mat.ok_or(at));
- }
return Err(MatchError::quit(input.haystack()[at], at).into());
} else {
// Ideally we wouldn't use a DFA that specialized start states
@@ -122,9 +119,6 @@ pub(crate) fn hybrid_try_search_half_fwd(
} else if sid.is_dead() {
return Ok(mat.ok_or(at));
} else if sid.is_quit() {
- if mat.is_some() {
- return Ok(mat.ok_or(at));
- }
return Err(MatchError::quit(input.haystack()[at], at).into());
} else {
// We should NEVER get an unknown state ID back from
@@ -162,9 +156,6 @@ fn dfa_eoi_fwd(
let pattern = dfa.match_pattern(*sid, 0);
*mat = Some(HalfMatch::new(pattern, sp.end));
} else if dfa.is_quit_state(*sid) {
- if mat.is_some() {
- return Ok(());
- }
return Err(MatchError::quit(b, sp.end));
}
}
@@ -201,9 +192,6 @@ fn hybrid_eoi_fwd(
let pattern = dfa.match_pattern(cache, *sid, 0);
*mat = Some(HalfMatch::new(pattern, sp.end));
} else if sid.is_quit() {
- if mat.is_some() {
- return Ok(());
- }
return Err(MatchError::quit(b, sp.end));
}
}
diff --git a/vendor/regex-automata/src/meta/strategy.rs b/vendor/regex-automata/src/meta/strategy.rs
index ea6c6ab57..04f2ba3c3 100644
--- a/vendor/regex-automata/src/meta/strategy.rs
+++ b/vendor/regex-automata/src/meta/strategy.rs
@@ -353,6 +353,7 @@ impl Pre<()> {
// strategy when len(patterns)==1 if the number of literals is large. In that
// case, literal extraction gives up and will return an infinite set.)
impl<P: PrefilterI> Strategy for Pre<P> {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn group_info(&self) -> &GroupInfo {
&self.group_info
}
@@ -378,6 +379,7 @@ impl<P: PrefilterI> Strategy for Pre<P> {
self.pre.memory_usage()
}
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
if input.is_done() {
return None;
@@ -393,6 +395,7 @@ impl<P: PrefilterI> Strategy for Pre<P> {
.map(|sp| Match::new(PatternID::ZERO, sp))
}
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn search_half(
&self,
cache: &mut Cache,
@@ -401,10 +404,12 @@ impl<P: PrefilterI> Strategy for Pre<P> {
self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end()))
}
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
self.search(cache, input).is_some()
}
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn search_slots(
&self,
cache: &mut Cache,
@@ -421,6 +426,7 @@ impl<P: PrefilterI> Strategy for Pre<P> {
Some(m.pattern())
}
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn which_overlapping_matches(
&self,
cache: &mut Cache,
@@ -1275,7 +1281,7 @@ impl ReverseSuffix {
e.try_search_half_rev_limited(&input, min_start)
} else if let Some(e) = self.core.hybrid.get(&input) {
trace!(
- "using lazy DFA for reverse inner search at {:?}, \
+ "using lazy DFA for reverse suffix search at {:?}, \
but will be stopped at {} to avoid quadratic behavior",
input.get_span(),
min_start,
diff --git a/vendor/regex-automata/src/meta/wrappers.rs b/vendor/regex-automata/src/meta/wrappers.rs
index 08110d9bb..6cb19ba0d 100644
--- a/vendor/regex-automata/src/meta/wrappers.rs
+++ b/vendor/regex-automata/src/meta/wrappers.rs
@@ -212,7 +212,10 @@ impl BoundedBacktrackerEngine {
.configure(backtrack_config)
.build_from_nfa(nfa.clone())
.map_err(BuildError::nfa)?;
- debug!("BoundedBacktracker built");
+ debug!(
+ "BoundedBacktracker built (max haystack length: {:?})",
+ engine.max_haystack_len()
+ );
Ok(Some(BoundedBacktrackerEngine(engine)))
}
#[cfg(not(feature = "nfa-backtrack"))]
diff --git a/vendor/regex-automata/src/nfa/thompson/backtrack.rs b/vendor/regex-automata/src/nfa/thompson/backtrack.rs
index eba037c1d..df99e456d 100644
--- a/vendor/regex-automata/src/nfa/thompson/backtrack.rs
+++ b/vendor/regex-automata/src/nfa/thompson/backtrack.rs
@@ -820,8 +820,11 @@ impl BoundedBacktracker {
// bytes to the capacity in bits.
let capacity = 8 * self.get_config().get_visited_capacity();
let blocks = div_ceil(capacity, Visited::BLOCK_SIZE);
- let real_capacity = blocks * Visited::BLOCK_SIZE;
- (real_capacity / self.nfa.states().len()) - 1
+ let real_capacity = blocks.saturating_mul(Visited::BLOCK_SIZE);
+ // It's possible for `real_capacity` to be smaller than the number of
+ // NFA states for particularly large regexes, so we saturate towards
+ // zero.
+ (real_capacity / self.nfa.states().len()).saturating_sub(1)
}
}
@@ -1882,3 +1885,24 @@ fn div_ceil(lhs: usize, rhs: usize) -> usize {
(lhs / rhs) + 1
}
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ // This is a regression test for the maximum haystack length computation.
+ // Previously, it assumed that the total capacity of the backtracker's
+ // bitset would always be greater than the number of NFA states. But there
+ // is of course no guarantee that this is true. This regression test
+ // ensures that not only does `max_haystack_len` not panic, but that it
+ // should return `0`.
+ #[cfg(feature = "syntax")]
+ #[test]
+ fn max_haystack_len_overflow() {
+ let re = BoundedBacktracker::builder()
+ .configure(BoundedBacktracker::config().visited_capacity(10))
+ .build(r"[0-9A-Za-z]{100}")
+ .unwrap();
+ assert_eq!(0, re.max_haystack_len());
+ }
+}
diff --git a/vendor/regex-automata/src/nfa/thompson/builder.rs b/vendor/regex-automata/src/nfa/thompson/builder.rs
index b57e5bc0f..6b69e8784 100644
--- a/vendor/regex-automata/src/nfa/thompson/builder.rs
+++ b/vendor/regex-automata/src/nfa/thompson/builder.rs
@@ -61,7 +61,7 @@ enum State {
Look { look: Look, next: StateID },
/// An empty state that records the start of a capture location. This is an
/// unconditional epsilon transition like `Empty`, except it can be used to
- /// record position information for a captue group when using the NFA for
+ /// record position information for a capture group when using the NFA for
/// search.
CaptureStart {
/// The ID of the pattern that this capture was defined.
@@ -77,7 +77,7 @@ enum State {
},
/// An empty state that records the end of a capture location. This is an
/// unconditional epsilon transition like `Empty`, except it can be used to
- /// record position information for a captue group when using the NFA for
+ /// record position information for a capture group when using the NFA for
/// search.
CaptureEnd {
/// The ID of the pattern that this capture was defined.
@@ -128,7 +128,7 @@ enum State {
}
impl State {
- /// If this state is an unconditional espilon transition, then this returns
+ /// If this state is an unconditional epsilon transition, then this returns
/// the target of the transition.
fn goto(&self) -> Option<StateID> {
match *self {
diff --git a/vendor/regex-automata/src/nfa/thompson/compiler.rs b/vendor/regex-automata/src/nfa/thompson/compiler.rs
index 065e9ef27..2d2172957 100644
--- a/vendor/regex-automata/src/nfa/thompson/compiler.rs
+++ b/vendor/regex-automata/src/nfa/thompson/compiler.rs
@@ -1466,7 +1466,7 @@ impl Compiler {
// compare and contrast performance of the Pike VM when the code below
// is active vs the code above. Here's an example to try:
//
- // regex-cli find match pikevm -b -p '(?m)^\w{20}' -y '@$smallishru'
+ // regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file
//
// With Unicode classes generated below, this search takes about 45s on
// my machine. But with the compressed version above, the search takes
@@ -1557,6 +1557,14 @@ impl Compiler {
hir::Look::WordAsciiNegate => Look::WordAsciiNegate,
hir::Look::WordUnicode => Look::WordUnicode,
hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate,
+ hir::Look::WordStartAscii => Look::WordStartAscii,
+ hir::Look::WordEndAscii => Look::WordEndAscii,
+ hir::Look::WordStartUnicode => Look::WordStartUnicode,
+ hir::Look::WordEndUnicode => Look::WordEndUnicode,
+ hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii,
+ hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii,
+ hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode,
+ hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode,
};
let id = self.add_look(look)?;
Ok(ThompsonRef { start: id, end: id })
diff --git a/vendor/regex-automata/src/nfa/thompson/map.rs b/vendor/regex-automata/src/nfa/thompson/map.rs
index c36ce5386..c92d4c0b8 100644
--- a/vendor/regex-automata/src/nfa/thompson/map.rs
+++ b/vendor/regex-automata/src/nfa/thompson/map.rs
@@ -65,7 +65,7 @@ const INIT: u64 = 14695981039346656037;
/// Specifically, one could observe the difference with std's hashmap via
/// something like the following benchmark:
///
-/// hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'"
+/// hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'"
///
/// But to observe that difference, you'd have to modify the code to use
/// std's hashmap.
diff --git a/vendor/regex-automata/src/nfa/thompson/nfa.rs b/vendor/regex-automata/src/nfa/thompson/nfa.rs
index 2108fa338..1f57f8ebd 100644
--- a/vendor/regex-automata/src/nfa/thompson/nfa.rs
+++ b/vendor/regex-automata/src/nfa/thompson/nfa.rs
@@ -1841,14 +1841,12 @@ impl SparseTransitions {
// This is an alternative implementation that uses binary search. In
// some ad hoc experiments, like
//
- // smallishru=OpenSubtitles2018.raw.sample.smallish.ru
- // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b'
+ // regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file
//
// I could not observe any improvement, and in fact, things seemed to
// be a bit slower. I can see an improvement in at least one benchmark:
//
- // allcpssmall=all-codepoints-utf8-10x
- // regex-cli find nfa thompson pikevm @$allcpssmall '\pL{100}'
+ // regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8
//
// Where total search time goes from 3.2s to 2.4s when using binary
// search.
diff --git a/vendor/regex-automata/src/nfa/thompson/range_trie.rs b/vendor/regex-automata/src/nfa/thompson/range_trie.rs
index 2d43a5b6f..75c9b796b 100644
--- a/vendor/regex-automata/src/nfa/thompson/range_trie.rs
+++ b/vendor/regex-automata/src/nfa/thompson/range_trie.rs
@@ -594,7 +594,7 @@ impl State {
// Benchmarks suggest that binary search is just a bit faster than
// straight linear search. Specifically when using the debug tool:
//
- // hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'"
+ // hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'"
binary_search(&self.transitions, |t| range.start <= t.range.end)
}
diff --git a/vendor/regex-automata/src/util/captures.rs b/vendor/regex-automata/src/util/captures.rs
index cd3a5f8f7..05db6a993 100644
--- a/vendor/regex-automata/src/util/captures.rs
+++ b/vendor/regex-automata/src/util/captures.rs
@@ -433,7 +433,6 @@ impl Captures {
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
- /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match};
///
/// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?;
@@ -445,6 +444,8 @@ impl Captures {
/// assert_eq!(Some(Span::from(6..17)), caps.get_group(2));
/// // Looking for a non-existent capturing group will return None:
/// assert_eq!(None, caps.get_group(3));
+ /// # // literals are too big for 32-bit usize: #1039
+ /// # #[cfg(target_pointer_width = "64")]
/// assert_eq!(None, caps.get_group(9944060567225171988));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
diff --git a/vendor/regex-automata/src/util/determinize/mod.rs b/vendor/regex-automata/src/util/determinize/mod.rs
index 30a82afb8..ba32991d0 100644
--- a/vendor/regex-automata/src/util/determinize/mod.rs
+++ b/vendor/regex-automata/src/util/determinize/mod.rs
@@ -145,9 +145,10 @@ pub(crate) fn next(
}
Some(_) => {}
None => {
- look_have = look_have.insert(Look::End);
- look_have = look_have.insert(Look::EndLF);
- look_have = look_have.insert(Look::EndCRLF);
+ look_have = look_have
+ .insert(Look::End)
+ .insert(Look::EndLF)
+ .insert(Look::EndCRLF);
}
}
if unit.is_byte(lookm.get_line_terminator()) {
@@ -160,11 +161,26 @@ pub(crate) fn next(
look_have = look_have.insert(Look::StartCRLF);
}
if state.is_from_word() == unit.is_word_byte() {
- look_have = look_have.insert(Look::WordUnicodeNegate);
- look_have = look_have.insert(Look::WordAsciiNegate);
+ look_have = look_have
+ .insert(Look::WordAsciiNegate)
+ .insert(Look::WordUnicodeNegate);
} else {
- look_have = look_have.insert(Look::WordUnicode);
- look_have = look_have.insert(Look::WordAscii);
+ look_have =
+ look_have.insert(Look::WordAscii).insert(Look::WordUnicode);
+ }
+ if !unit.is_word_byte() {
+ look_have = look_have
+ .insert(Look::WordEndHalfAscii)
+ .insert(Look::WordEndHalfUnicode);
+ }
+ if state.is_from_word() && !unit.is_word_byte() {
+ look_have = look_have
+ .insert(Look::WordEndAscii)
+ .insert(Look::WordEndUnicode);
+ } else if !state.is_from_word() && unit.is_word_byte() {
+ look_have = look_have
+ .insert(Look::WordStartAscii)
+ .insert(Look::WordStartUnicode);
}
// If we have new assertions satisfied that are among the set of
// assertions that exist in this state (that is, just because we added
@@ -220,6 +236,14 @@ pub(crate) fn next(
{
builder.set_look_have(|have| have.insert(Look::StartCRLF));
}
+ // And also for the start-half word boundary assertions. As long as the
+ // look-behind byte is not a word char, then the assertions are satisfied.
+ if nfa.look_set_any().contains_word() && !unit.is_word_byte() {
+ builder.set_look_have(|have| {
+ have.insert(Look::WordStartHalfAscii)
+ .insert(Look::WordStartHalfUnicode)
+ });
+ }
for nfa_id in sparses.set1.iter() {
match *nfa.state(nfa_id) {
thompson::State::Union { .. }
@@ -563,47 +587,95 @@ pub(crate) fn set_lookbehind_from_start(
) {
let rev = nfa.is_reverse();
let lineterm = nfa.look_matcher().get_line_terminator();
+ let lookset = nfa.look_set_any();
match *start {
- Start::NonWordByte => {}
+ Start::NonWordByte => {
+ if lookset.contains_word() {
+ builder.set_look_have(|have| {
+ have.insert(Look::WordStartHalfAscii)
+ .insert(Look::WordStartHalfUnicode)
+ });
+ }
+ }
Start::WordByte => {
- builder.set_is_from_word();
+ if lookset.contains_word() {
+ builder.set_is_from_word();
+ }
}
Start::Text => {
- builder.set_look_have(|have| {
- have.insert(Look::Start)
- .insert(Look::StartLF)
- .insert(Look::StartCRLF)
- });
+ if lookset.contains_anchor_haystack() {
+ builder.set_look_have(|have| have.insert(Look::Start));
+ }
+ if lookset.contains_anchor_line() {
+ builder.set_look_have(|have| {
+ have.insert(Look::StartLF).insert(Look::StartCRLF)
+ });
+ }
+ if lookset.contains_word() {
+ builder.set_look_have(|have| {
+ have.insert(Look::WordStartHalfAscii)
+ .insert(Look::WordStartHalfUnicode)
+ });
+ }
}
Start::LineLF => {
if rev {
- builder.set_is_half_crlf();
- builder.set_look_have(|have| have.insert(Look::StartLF));
+ if lookset.contains_anchor_crlf() {
+ builder.set_is_half_crlf();
+ }
+ if lookset.contains_anchor_line() {
+ builder.set_look_have(|have| have.insert(Look::StartLF));
+ }
} else {
- builder.set_look_have(|have| have.insert(Look::StartCRLF));
+ if lookset.contains_anchor_line() {
+ builder.set_look_have(|have| have.insert(Look::StartCRLF));
+ }
}
- if lineterm == b'\n' {
+ if lookset.contains_anchor_line() && lineterm == b'\n' {
builder.set_look_have(|have| have.insert(Look::StartLF));
}
+ if lookset.contains_word() {
+ builder.set_look_have(|have| {
+ have.insert(Look::WordStartHalfAscii)
+ .insert(Look::WordStartHalfUnicode)
+ });
+ }
}
Start::LineCR => {
- if rev {
- builder.set_look_have(|have| have.insert(Look::StartCRLF));
- } else {
- builder.set_is_half_crlf();
+ if lookset.contains_anchor_crlf() {
+ if rev {
+ builder.set_look_have(|have| have.insert(Look::StartCRLF));
+ } else {
+ builder.set_is_half_crlf();
+ }
}
- if lineterm == b'\r' {
+ if lookset.contains_anchor_line() && lineterm == b'\r' {
builder.set_look_have(|have| have.insert(Look::StartLF));
}
+ if lookset.contains_word() {
+ builder.set_look_have(|have| {
+ have.insert(Look::WordStartHalfAscii)
+ .insert(Look::WordStartHalfUnicode)
+ });
+ }
}
Start::CustomLineTerminator => {
- builder.set_look_have(|have| have.insert(Look::StartLF));
+ if lookset.contains_anchor_line() {
+ builder.set_look_have(|have| have.insert(Look::StartLF));
+ }
// This is a bit of a tricky case, but if the line terminator was
// set to a word byte, then we also need to behave as if the start
// configuration is Start::WordByte. That is, we need to mark our
// state as having come from a word byte.
- if utf8::is_word_byte(lineterm) {
- builder.set_is_from_word();
+ if lookset.contains_word() {
+ if utf8::is_word_byte(lineterm) {
+ builder.set_is_from_word();
+ } else {
+ builder.set_look_have(|have| {
+ have.insert(Look::WordStartHalfAscii)
+ .insert(Look::WordStartHalfUnicode)
+ });
+ }
}
}
}
diff --git a/vendor/regex-automata/src/util/determinize/state.rs b/vendor/regex-automata/src/util/determinize/state.rs
index e64123587..effa6f44d 100644
--- a/vendor/regex-automata/src/util/determinize/state.rs
+++ b/vendor/regex-automata/src/util/determinize/state.rs
@@ -197,7 +197,7 @@ impl StateBuilderEmpty {
}
pub(crate) fn into_matches(mut self) -> StateBuilderMatches {
- self.0.extend_from_slice(&[0, 0, 0, 0, 0]);
+ self.0.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0]);
StateBuilderMatches(self.0)
}
@@ -348,16 +348,17 @@ impl StateBuilderNFA {
/// generated by a transition over a "word" byte. (Callers may not always set
/// this. For example, if the NFA has no word boundary assertion, then needing
/// to track whether a state came from a word byte or not is superfluous and
-/// wasteful.)
+/// wasteful.) Bit 3 is set to 1 if the state was generated by a transition
+/// from a `\r` (forward search) or a `\n` (reverse search) when CRLF mode is
+/// enabled.
///
-/// Byte 1 corresponds to the look-behind assertions that were satisfied by
-/// the transition that created this state. This generally only includes the
-/// StartLF and Start assertions. (Look-ahead assertions are not tracked as
-/// part of states. Instead, these are applied by re-computing the epsilon
-/// closure of a state when computing the transition function. See `next` in
-/// the parent module.)
+/// Bytes 1..5 correspond to the look-behind assertions that were satisfied
+/// by the transition that created this state. (Look-ahead assertions are not
+/// tracked as part of states. Instead, these are applied by re-computing the
+/// epsilon closure of a state when computing the transition function. See
+/// `next` in the parent module.)
///
-/// Byte 2 corresponds to the set of look-around assertions (including both
+/// Bytes 5..9 correspond to the set of look-around assertions (including both
/// look-behind and look-ahead) that appear somewhere in this state's set of
/// NFA state IDs. This is used to determine whether this state's epsilon
/// closure should be re-computed when computing the transition function.
@@ -366,7 +367,7 @@ impl StateBuilderNFA {
/// function, we should only re-compute the epsilon closure if those new
/// assertions are relevant to this particular state.
///
-/// Bytes 3..7 correspond to a 32-bit native-endian encoded integer
+/// Bytes 9..13 correspond to a 32-bit native-endian encoded integer
/// corresponding to the number of patterns encoded in this state. If the state
/// is not a match state (byte 0 bit 0 is 0) or if it's only pattern ID is
/// PatternID::ZERO, then no integer is encoded at this position. Instead, byte
@@ -452,7 +453,7 @@ impl<'a> Repr<'a> {
/// state has no conditional epsilon transitions, then there is no need
/// to re-compute the epsilon closure.
fn look_need(&self) -> LookSet {
- LookSet::read_repr(&self.0[3..])
+ LookSet::read_repr(&self.0[5..])
}
/// Returns the total number of match pattern IDs in this state.
@@ -476,7 +477,7 @@ impl<'a> Repr<'a> {
if !self.has_pattern_ids() {
PatternID::ZERO
} else {
- let offset = 9 + index * PatternID::SIZE;
+ let offset = 13 + index * PatternID::SIZE;
// This is OK since we only ever serialize valid PatternIDs to
// states.
wire::read_pattern_id_unchecked(&self.0[offset..]).0
@@ -507,7 +508,7 @@ impl<'a> Repr<'a> {
f(PatternID::ZERO);
return;
}
- let mut pids = &self.0[9..self.pattern_offset_end()];
+ let mut pids = &self.0[13..self.pattern_offset_end()];
while !pids.is_empty() {
let pid = wire::read_u32(pids);
pids = &pids[PatternID::SIZE..];
@@ -539,11 +540,11 @@ impl<'a> Repr<'a> {
fn pattern_offset_end(&self) -> usize {
let encoded = self.encoded_pattern_len();
if encoded == 0 {
- return 5;
+ return 9;
}
// This arithmetic is OK since we were able to address this many bytes
// when writing to the state, thus, it must fit into a usize.
- encoded.checked_mul(4).unwrap().checked_add(9).unwrap()
+ encoded.checked_mul(4).unwrap().checked_add(13).unwrap()
}
/// Returns the total number of *encoded* pattern IDs in this state.
@@ -557,7 +558,7 @@ impl<'a> Repr<'a> {
}
// This unwrap is OK since the total number of patterns is always
// guaranteed to fit into a usize.
- usize::try_from(wire::read_u32(&self.0[5..9])).unwrap()
+ usize::try_from(wire::read_u32(&self.0[9..13])).unwrap()
}
}
@@ -643,7 +644,7 @@ impl<'a> ReprVec<'a> {
/// Mutate the set of look-around (both behind and ahead) assertions that
/// appear at least once in this state's set of NFA states.
fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) {
- set(self.look_need()).write_repr(&mut self.0[3..]);
+ set(self.look_need()).write_repr(&mut self.0[5..]);
}
/// Add a pattern ID to this state. All match states must have at least
@@ -703,14 +704,14 @@ impl<'a> ReprVec<'a> {
return;
}
let patsize = PatternID::SIZE;
- let pattern_bytes = self.0.len() - 9;
+ let pattern_bytes = self.0.len() - 13;
// Every pattern ID uses 4 bytes, so number of bytes should be
// divisible by 4.
assert_eq!(pattern_bytes % patsize, 0);
// This unwrap is OK since we are guaranteed that the maximum number
// of possible patterns fits into a u32.
let count32 = u32::try_from(pattern_bytes / patsize).unwrap();
- wire::NE::write_u32(count32, &mut self.0[5..9]);
+ wire::NE::write_u32(count32, &mut self.0[9..13]);
}
/// Add an NFA state ID to this state. The order in which NFA states are
diff --git a/vendor/regex-automata/src/util/lazy.rs b/vendor/regex-automata/src/util/lazy.rs
index de27a2a6e..0d0b4fb2a 100644
--- a/vendor/regex-automata/src/util/lazy.rs
+++ b/vendor/regex-automata/src/util/lazy.rs
@@ -384,11 +384,7 @@ mod lazy {
// SAFETY: state is DONE if and only if data has been fully
// initialized. At which point, it is safe to drop.
unsafe {
- // MSRV(1.60): Use assume_init_drop. The below is how
- // assume_init_drop is implemented.
- core::ptr::drop_in_place(
- (*self.data.as_ptr()).as_mut_ptr(),
- )
+ self.data.get_mut().assume_init_drop();
}
}
}
diff --git a/vendor/regex-automata/src/util/look.rs b/vendor/regex-automata/src/util/look.rs
index aee31b34e..73e51c0f6 100644
--- a/vendor/regex-automata/src/util/look.rs
+++ b/vendor/regex-automata/src/util/look.rs
@@ -96,6 +96,42 @@ pub enum Look {
WordUnicode = 1 << 8,
/// Match a Unicode-aware negation of a word boundary.
WordUnicodeNegate = 1 << 9,
+ /// Match the start of an ASCII-only word boundary. That is, this matches a
+ /// position at either the beginning of the haystack or where the previous
+ /// character is not a word character and the following character is a word
+ /// character.
+ WordStartAscii = 1 << 10,
+ /// Match the end of an ASCII-only word boundary. That is, this matches
+ /// a position at either the end of the haystack or where the previous
+ /// character is a word character and the following character is not a word
+ /// character.
+ WordEndAscii = 1 << 11,
+ /// Match the start of a Unicode word boundary. That is, this matches a
+ /// position at either the beginning of the haystack or where the previous
+ /// character is not a word character and the following character is a word
+ /// character.
+ WordStartUnicode = 1 << 12,
+ /// Match the end of a Unicode word boundary. That is, this matches a
+ /// position at either the end of the haystack or where the previous
+ /// character is a word character and the following character is not a word
+ /// character.
+ WordEndUnicode = 1 << 13,
+ /// Match the start half of an ASCII-only word boundary. That is, this
+ /// matches a position at either the beginning of the haystack or where the
+ /// previous character is not a word character.
+ WordStartHalfAscii = 1 << 14,
+ /// Match the end half of an ASCII-only word boundary. That is, this
+ /// matches a position at either the end of the haystack or where the
+ /// following character is not a word character.
+ WordEndHalfAscii = 1 << 15,
+ /// Match the start half of a Unicode word boundary. That is, this matches
+ /// a position at either the beginning of the haystack or where the
+ /// previous character is not a word character.
+ WordStartHalfUnicode = 1 << 16,
+ /// Match the end half of a Unicode word boundary. That is, this matches
+ /// a position at either the end of the haystack or where the following
+ /// character is not a word character.
+ WordEndHalfUnicode = 1 << 17,
}
impl Look {
@@ -117,6 +153,14 @@ impl Look {
Look::WordAsciiNegate => Look::WordAsciiNegate,
Look::WordUnicode => Look::WordUnicode,
Look::WordUnicodeNegate => Look::WordUnicodeNegate,
+ Look::WordStartAscii => Look::WordEndAscii,
+ Look::WordEndAscii => Look::WordStartAscii,
+ Look::WordStartUnicode => Look::WordEndUnicode,
+ Look::WordEndUnicode => Look::WordStartUnicode,
+ Look::WordStartHalfAscii => Look::WordEndHalfAscii,
+ Look::WordEndHalfAscii => Look::WordStartHalfAscii,
+ Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
+ Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
}
}
@@ -125,28 +169,36 @@ impl Look {
/// constructor is guaranteed to return the same look-around variant that
/// one started with within a semver compatible release of this crate.
#[inline]
- pub const fn as_repr(self) -> u16 {
+ pub const fn as_repr(self) -> u32 {
// AFAIK, 'as' is the only way to zero-cost convert an int enum to an
// actual int.
- self as u16
+ self as u32
}
/// Given the underlying representation of a `Look` value, return the
/// corresponding `Look` value if the representation is valid. Otherwise
/// `None` is returned.
#[inline]
- pub const fn from_repr(repr: u16) -> Option<Look> {
+ pub const fn from_repr(repr: u32) -> Option<Look> {
match repr {
- 0b00_0000_0001 => Some(Look::Start),
- 0b00_0000_0010 => Some(Look::End),
- 0b00_0000_0100 => Some(Look::StartLF),
- 0b00_0000_1000 => Some(Look::EndLF),
- 0b00_0001_0000 => Some(Look::StartCRLF),
- 0b00_0010_0000 => Some(Look::EndCRLF),
- 0b00_0100_0000 => Some(Look::WordAscii),
- 0b00_1000_0000 => Some(Look::WordAsciiNegate),
- 0b01_0000_0000 => Some(Look::WordUnicode),
- 0b10_0000_0000 => Some(Look::WordUnicodeNegate),
+ 0b00_0000_0000_0000_0001 => Some(Look::Start),
+ 0b00_0000_0000_0000_0010 => Some(Look::End),
+ 0b00_0000_0000_0000_0100 => Some(Look::StartLF),
+ 0b00_0000_0000_0000_1000 => Some(Look::EndLF),
+ 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
+ 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
+ 0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
+ 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
+ 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
+ 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
+ 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
+ 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
+ 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
+ 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
+ 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
+ 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
+ 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
+ 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
_ => None,
}
}
@@ -171,6 +223,14 @@ impl Look {
Look::WordAsciiNegate => 'B',
Look::WordUnicode => '𝛃',
Look::WordUnicodeNegate => '𝚩',
+ Look::WordStartAscii => '<',
+ Look::WordEndAscii => '>',
+ Look::WordStartUnicode => '〈',
+ Look::WordEndUnicode => '〉',
+ Look::WordStartHalfAscii => '◁',
+ Look::WordEndHalfAscii => '▷',
+ Look::WordStartHalfUnicode => '◀',
+ Look::WordEndHalfUnicode => '▶',
}
}
}
@@ -184,14 +244,14 @@ impl Look {
pub struct LookSet {
/// The underlying representation this set is exposed to make it possible
/// to store it somewhere efficiently. The representation is that
- /// of a bitset, where each assertion occupies bit `i` where `i =
- /// Look::as_repr()`.
+ /// of a bitset, where each assertion occupies bit `i` where
+ /// `i = Look::as_repr()`.
///
/// Note that users of this internal representation must permit the full
/// range of `u16` values to be represented. For example, even if the
/// current implementation only makes use of the 10 least significant bits,
/// it may use more bits in a future semver compatible release.
- pub bits: u16,
+ pub bits: u32,
}
impl LookSet {
@@ -294,13 +354,22 @@ impl LookSet {
pub fn contains_word_unicode(self) -> bool {
self.contains(Look::WordUnicode)
|| self.contains(Look::WordUnicodeNegate)
+ || self.contains(Look::WordStartUnicode)
+ || self.contains(Look::WordEndUnicode)
+ || self.contains(Look::WordStartHalfUnicode)
+ || self.contains(Look::WordEndHalfUnicode)
}
/// Returns true if and only if this set contains any ASCII word boundary
/// or negated ASCII word boundary assertions.
#[inline]
pub fn contains_word_ascii(self) -> bool {
- self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate)
+ self.contains(Look::WordAscii)
+ || self.contains(Look::WordAsciiNegate)
+ || self.contains(Look::WordStartAscii)
+ || self.contains(Look::WordEndAscii)
+ || self.contains(Look::WordStartHalfAscii)
+ || self.contains(Look::WordEndHalfAscii)
}
/// Returns an iterator over all of the look-around assertions in this set.
@@ -379,29 +448,31 @@ impl LookSet {
*self = self.intersect(other);
}
- /// Return a `LookSet` from the slice given as a native endian 16-bit
+ /// Return a `LookSet` from the slice given as a native endian 32-bit
/// integer.
///
/// # Panics
///
- /// This panics if `slice.len() < 2`.
+ /// This panics if `slice.len() < 4`.
#[inline]
pub fn read_repr(slice: &[u8]) -> LookSet {
- let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap());
+ let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
LookSet { bits }
}
- /// Write a `LookSet` as a native endian 16-bit integer to the beginning
+ /// Write a `LookSet` as a native endian 32-bit integer to the beginning
/// of the slice given.
///
/// # Panics
///
- /// This panics if `slice.len() < 2`.
+ /// This panics if `slice.len() < 4`.
#[inline]
pub fn write_repr(self, slice: &mut [u8]) {
let raw = self.bits.to_ne_bytes();
slice[0] = raw[0];
slice[1] = raw[1];
+ slice[2] = raw[2];
+ slice[3] = raw[3];
}
/// Checks that all assertions in this set can be matched.
@@ -456,9 +527,9 @@ impl Iterator for LookSetIter {
return None;
}
// We'll never have more than u8::MAX distinct look-around assertions,
- // so 'repr' will always fit into a u16.
- let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
- let look = Look::from_repr(1 << repr)?;
+ // so 'bit' will always fit into a u16.
+ let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
+ let look = Look::from_repr(1 << bit)?;
self.set = self.set.remove(look);
Some(look)
}
@@ -566,6 +637,23 @@ impl LookMatcher {
}
/// Like `matches`, but forcefully inlined.
+ ///
+ /// # Panics
+ ///
+ /// This panics when testing any Unicode word boundary assertion in this
+ /// set and when the Unicode word data is not available. Specifically, this
+ /// only occurs when the `unicode-word-boundary` feature is not enabled.
+ ///
+ /// Since it's generally expected that this routine is called inside of
+ /// a matching engine, callers should check the error condition when
+ /// building the matching engine. If there is a Unicode word boundary
+ /// in the matcher and the data isn't available, then the matcher should
+ /// fail to build.
+ ///
+ /// Callers can check the error condition with [`LookSet::available`].
+ ///
+ /// This also may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn matches_inline(
&self,
@@ -586,6 +674,26 @@ impl LookMatcher {
Look::WordUnicodeNegate => {
self.is_word_unicode_negate(haystack, at).unwrap()
}
+ Look::WordStartAscii => self.is_word_start_ascii(haystack, at),
+ Look::WordEndAscii => self.is_word_end_ascii(haystack, at),
+ Look::WordStartUnicode => {
+ self.is_word_start_unicode(haystack, at).unwrap()
+ }
+ Look::WordEndUnicode => {
+ self.is_word_end_unicode(haystack, at).unwrap()
+ }
+ Look::WordStartHalfAscii => {
+ self.is_word_start_half_ascii(haystack, at)
+ }
+ Look::WordEndHalfAscii => {
+ self.is_word_end_half_ascii(haystack, at)
+ }
+ Look::WordStartHalfUnicode => {
+ self.is_word_start_half_unicode(haystack, at).unwrap()
+ }
+ Look::WordEndHalfUnicode => {
+ self.is_word_end_half_unicode(haystack, at).unwrap()
+ }
}
}
@@ -680,6 +788,46 @@ impl LookMatcher {
return false;
}
}
+ if set.contains(Look::WordStartAscii) {
+ if !self.is_word_start_ascii(haystack, at) {
+ return false;
+ }
+ }
+ if set.contains(Look::WordEndAscii) {
+ if !self.is_word_end_ascii(haystack, at) {
+ return false;
+ }
+ }
+ if set.contains(Look::WordStartUnicode) {
+ if !self.is_word_start_unicode(haystack, at).unwrap() {
+ return false;
+ }
+ }
+ if set.contains(Look::WordEndUnicode) {
+ if !self.is_word_end_unicode(haystack, at).unwrap() {
+ return false;
+ }
+ }
+ if set.contains(Look::WordStartHalfAscii) {
+ if !self.is_word_start_half_ascii(haystack, at) {
+ return false;
+ }
+ }
+ if set.contains(Look::WordEndHalfAscii) {
+ if !self.is_word_end_half_ascii(haystack, at) {
+ return false;
+ }
+ }
+ if set.contains(Look::WordStartHalfUnicode) {
+ if !self.is_word_start_half_unicode(haystack, at).unwrap() {
+ return false;
+ }
+ }
+ if set.contains(Look::WordEndHalfUnicode) {
+ if !self.is_word_end_half_unicode(haystack, at).unwrap() {
+ return false;
+ }
+ }
true
}
@@ -703,7 +851,15 @@ impl LookMatcher {
Look::WordAscii
| Look::WordAsciiNegate
| Look::WordUnicode
- | Look::WordUnicodeNegate => {
+ | Look::WordUnicodeNegate
+ | Look::WordStartAscii
+ | Look::WordEndAscii
+ | Look::WordStartUnicode
+ | Look::WordEndUnicode
+ | Look::WordStartHalfAscii
+ | Look::WordEndHalfAscii
+ | Look::WordStartHalfUnicode
+ | Look::WordEndHalfUnicode => {
// We need to mark all ranges of bytes whose pairs result in
// evaluating \b differently. This isn't technically correct
// for Unicode word boundaries, but DFAs can't handle those
@@ -931,6 +1087,177 @@ impl LookMatcher {
};
Ok(word_before == word_after)
}
+
+ /// Returns true when [`Look::WordStartAscii`] is satisfied `at` the given
+ /// position in `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ #[inline]
+ pub fn is_word_start_ascii(&self, haystack: &[u8], at: usize) -> bool {
+ let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
+ let word_after =
+ at < haystack.len() && utf8::is_word_byte(haystack[at]);
+ !word_before && word_after
+ }
+
+ /// Returns true when [`Look::WordEndAscii`] is satisfied `at` the given
+ /// position in `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ #[inline]
+ pub fn is_word_end_ascii(&self, haystack: &[u8], at: usize) -> bool {
+ let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
+ let word_after =
+ at < haystack.len() && utf8::is_word_byte(haystack[at]);
+ word_before && !word_after
+ }
+
+ /// Returns true when [`Look::WordStartUnicode`] is satisfied `at` the
+ /// given position in `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when Unicode word boundary tables
+ /// are not available. Specifically, this only occurs when the
+ /// `unicode-word-boundary` feature is not enabled.
+ #[inline]
+ pub fn is_word_start_unicode(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ ) -> Result<bool, UnicodeWordBoundaryError> {
+ let word_before = is_word_char::rev(haystack, at)?;
+ let word_after = is_word_char::fwd(haystack, at)?;
+ Ok(!word_before && word_after)
+ }
+
+ /// Returns true when [`Look::WordEndUnicode`] is satisfied `at` the
+ /// given position in `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when Unicode word boundary tables
+ /// are not available. Specifically, this only occurs when the
+ /// `unicode-word-boundary` feature is not enabled.
+ #[inline]
+ pub fn is_word_end_unicode(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ ) -> Result<bool, UnicodeWordBoundaryError> {
+ let word_before = is_word_char::rev(haystack, at)?;
+ let word_after = is_word_char::fwd(haystack, at)?;
+ Ok(word_before && !word_after)
+ }
+
+ /// Returns true when [`Look::WordStartHalfAscii`] is satisfied `at` the
+ /// given position in `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ #[inline]
+ pub fn is_word_start_half_ascii(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ ) -> bool {
+ let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
+ !word_before
+ }
+
+ /// Returns true when [`Look::WordEndHalfAscii`] is satisfied `at` the
+ /// given position in `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ #[inline]
+ pub fn is_word_end_half_ascii(&self, haystack: &[u8], at: usize) -> bool {
+ let word_after =
+ at < haystack.len() && utf8::is_word_byte(haystack[at]);
+ !word_after
+ }
+
+ /// Returns true when [`Look::WordStartHalfUnicode`] is satisfied `at` the
+ /// given position in `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when Unicode word boundary tables
+ /// are not available. Specifically, this only occurs when the
+ /// `unicode-word-boundary` feature is not enabled.
+ #[inline]
+ pub fn is_word_start_half_unicode(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ ) -> Result<bool, UnicodeWordBoundaryError> {
+ // See `is_word_unicode_negate` for why we need to do this. We don't
+ // need to do it for `is_word_start_unicode` because that guarantees
+ // that the position matched falls on a valid UTF-8 boundary given
+ // that the right side must be in \w.
+ let word_before = at > 0
+ && match utf8::decode_last(&haystack[..at]) {
+ None | Some(Err(_)) => return Ok(false),
+ Some(Ok(_)) => is_word_char::rev(haystack, at)?,
+ };
+ Ok(!word_before)
+ }
+
+ /// Returns true when [`Look::WordEndHalfUnicode`] is satisfied `at` the
+ /// given position in `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when Unicode word boundary tables
+ /// are not available. Specifically, this only occurs when the
+ /// `unicode-word-boundary` feature is not enabled.
+ #[inline]
+ pub fn is_word_end_half_unicode(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ ) -> Result<bool, UnicodeWordBoundaryError> {
+ // See `is_word_unicode_negate` for why we need to do this. We don't
+ // need to do it for `is_word_end_unicode` because that guarantees
+ // that the position matched falls on a valid UTF-8 boundary given
+ // that the left side must be in \w.
+ let word_after = at < haystack.len()
+ && match utf8::decode(&haystack[at..]) {
+ None | Some(Err(_)) => return Ok(false),
+ Some(Ok(_)) => is_word_char::fwd(haystack, at)?,
+ };
+ Ok(!word_after)
+ }
}
impl Default for LookMatcher {
@@ -1024,7 +1351,9 @@ impl core::fmt::Display for UnicodeWordBoundaryError {
// There are perhaps other choices as well. Why did I stop at these 4? Because
// I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA
// approach eventually, as the benefits of the DFA approach are somewhat
-// compelling. The 'boundary-words-holmes' benchmark tests this:
+// compelling. The 'boundary-words-holmes' benchmark tests this. (Note that
+// the commands below no longer work. If necessary, we should re-capitulate
+// the benchmark from whole cloth in rebar.)
//
// $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv
//
@@ -1322,8 +1651,7 @@ mod is_word_char {
fn is_word_character(c: char) -> bool {
use crate::util::{unicode_data::perl_word::PERL_WORD, utf8};
- // MSRV(1.59): Use 'u8::try_from(c)' instead.
- if u8::try_from(u32::from(c)).map_or(false, utf8::is_word_byte) {
+ if u8::try_from(c).map_or(false, utf8::is_word_byte) {
return true;
}
PERL_WORD
@@ -1656,6 +1984,434 @@ mod tests {
}
#[test]
+ fn look_matches_word_start_ascii() {
+ let look = Look::WordStartAscii;
+
+ // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+ // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+ // Simple ASCII word boundaries.
+ assert!(testlook!(look, "a", 0));
+ assert!(!testlook!(look, "a", 1));
+ assert!(!testlook!(look, "a ", 1));
+ assert!(testlook!(look, " a ", 1));
+ assert!(!testlook!(look, " a ", 2));
+
+ // Unicode word boundaries with a non-ASCII codepoint. Since this is
+ // an ASCII word boundary, none of these match.
+ assert!(!testlook!(look, "𝛃", 0));
+ assert!(!testlook!(look, "𝛃", 4));
+ assert!(!testlook!(look, "𝛃 ", 4));
+ assert!(!testlook!(look, " 𝛃 ", 1));
+ assert!(!testlook!(look, " 𝛃 ", 5));
+
+ // Unicode word boundaries between non-ASCII codepoints. Again, since
+ // this is an ASCII word boundary, none of these match.
+ assert!(!testlook!(look, "𝛃𐆀", 0));
+ assert!(!testlook!(look, "𝛃𐆀", 4));
+
+ // Non word boundaries for ASCII.
+ assert!(!testlook!(look, "", 0));
+ assert!(!testlook!(look, "ab", 1));
+ assert!(!testlook!(look, "a ", 2));
+ assert!(!testlook!(look, " a ", 0));
+ assert!(!testlook!(look, " a ", 3));
+
+ // Non word boundaries with a non-ASCII codepoint.
+ assert!(testlook!(look, "𝛃b", 4));
+ assert!(!testlook!(look, "b𝛃", 1));
+ assert!(!testlook!(look, "𝛃 ", 5));
+ assert!(!testlook!(look, " 𝛃 ", 0));
+ assert!(!testlook!(look, " 𝛃 ", 6));
+ assert!(!testlook!(look, "𝛃", 1));
+ assert!(!testlook!(look, "𝛃", 2));
+ assert!(!testlook!(look, "𝛃", 3));
+
+ // Non word boundaries with non-ASCII codepoints.
+ assert!(!testlook!(look, "𝛃𐆀", 1));
+ assert!(!testlook!(look, "𝛃𐆀", 2));
+ assert!(!testlook!(look, "𝛃𐆀", 3));
+ assert!(!testlook!(look, "𝛃𐆀", 5));
+ assert!(!testlook!(look, "𝛃𐆀", 6));
+ assert!(!testlook!(look, "𝛃𐆀", 7));
+ assert!(!testlook!(look, "𝛃𐆀", 8));
+ }
+
+ #[test]
+ fn look_matches_word_end_ascii() {
+ let look = Look::WordEndAscii;
+
+ // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+ // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+ // Simple ASCII word boundaries.
+ assert!(!testlook!(look, "a", 0));
+ assert!(testlook!(look, "a", 1));
+ assert!(testlook!(look, "a ", 1));
+ assert!(!testlook!(look, " a ", 1));
+ assert!(testlook!(look, " a ", 2));
+
+ // Unicode word boundaries with a non-ASCII codepoint. Since this is
+ // an ASCII word boundary, none of these match.
+ assert!(!testlook!(look, "𝛃", 0));
+ assert!(!testlook!(look, "𝛃", 4));
+ assert!(!testlook!(look, "𝛃 ", 4));
+ assert!(!testlook!(look, " 𝛃 ", 1));
+ assert!(!testlook!(look, " 𝛃 ", 5));
+
+ // Unicode word boundaries between non-ASCII codepoints. Again, since
+ // this is an ASCII word boundary, none of these match.
+ assert!(!testlook!(look, "𝛃𐆀", 0));
+ assert!(!testlook!(look, "𝛃𐆀", 4));
+
+ // Non word boundaries for ASCII.
+ assert!(!testlook!(look, "", 0));
+ assert!(!testlook!(look, "ab", 1));
+ assert!(!testlook!(look, "a ", 2));
+ assert!(!testlook!(look, " a ", 0));
+ assert!(!testlook!(look, " a ", 3));
+
+ // Non word boundaries with a non-ASCII codepoint.
+ assert!(!testlook!(look, "𝛃b", 4));
+ assert!(testlook!(look, "b𝛃", 1));
+ assert!(!testlook!(look, "𝛃 ", 5));
+ assert!(!testlook!(look, " 𝛃 ", 0));
+ assert!(!testlook!(look, " 𝛃 ", 6));
+ assert!(!testlook!(look, "𝛃", 1));
+ assert!(!testlook!(look, "𝛃", 2));
+ assert!(!testlook!(look, "𝛃", 3));
+
+ // Non word boundaries with non-ASCII codepoints.
+ assert!(!testlook!(look, "𝛃𐆀", 1));
+ assert!(!testlook!(look, "𝛃𐆀", 2));
+ assert!(!testlook!(look, "𝛃𐆀", 3));
+ assert!(!testlook!(look, "𝛃𐆀", 5));
+ assert!(!testlook!(look, "𝛃𐆀", 6));
+ assert!(!testlook!(look, "𝛃𐆀", 7));
+ assert!(!testlook!(look, "𝛃𐆀", 8));
+ }
+
+ #[test]
+ #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
+ fn look_matches_word_start_unicode() {
+ let look = Look::WordStartUnicode;
+
+ // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+ // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+ // Simple ASCII word boundaries.
+ assert!(testlook!(look, "a", 0));
+ assert!(!testlook!(look, "a", 1));
+ assert!(!testlook!(look, "a ", 1));
+ assert!(testlook!(look, " a ", 1));
+ assert!(!testlook!(look, " a ", 2));
+
+ // Unicode word boundaries with a non-ASCII codepoint.
+ assert!(testlook!(look, "𝛃", 0));
+ assert!(!testlook!(look, "𝛃", 4));
+ assert!(!testlook!(look, "𝛃 ", 4));
+ assert!(testlook!(look, " 𝛃 ", 1));
+ assert!(!testlook!(look, " 𝛃 ", 5));
+
+ // Unicode word boundaries between non-ASCII codepoints.
+ assert!(testlook!(look, "𝛃𐆀", 0));
+ assert!(!testlook!(look, "𝛃𐆀", 4));
+
+ // Non word boundaries for ASCII.
+ assert!(!testlook!(look, "", 0));
+ assert!(!testlook!(look, "ab", 1));
+ assert!(!testlook!(look, "a ", 2));
+ assert!(!testlook!(look, " a ", 0));
+ assert!(!testlook!(look, " a ", 3));
+
+ // Non word boundaries with a non-ASCII codepoint.
+ assert!(!testlook!(look, "𝛃b", 4));
+ assert!(!testlook!(look, "b𝛃", 1));
+ assert!(!testlook!(look, "𝛃 ", 5));
+ assert!(!testlook!(look, " 𝛃 ", 0));
+ assert!(!testlook!(look, " 𝛃 ", 6));
+ assert!(!testlook!(look, "𝛃", 1));
+ assert!(!testlook!(look, "𝛃", 2));
+ assert!(!testlook!(look, "𝛃", 3));
+
+ // Non word boundaries with non-ASCII codepoints.
+ assert!(!testlook!(look, "𝛃𐆀", 1));
+ assert!(!testlook!(look, "𝛃𐆀", 2));
+ assert!(!testlook!(look, "𝛃𐆀", 3));
+ assert!(!testlook!(look, "𝛃𐆀", 5));
+ assert!(!testlook!(look, "𝛃𐆀", 6));
+ assert!(!testlook!(look, "𝛃𐆀", 7));
+ assert!(!testlook!(look, "𝛃𐆀", 8));
+ }
+
+ #[test]
+ #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
+ fn look_matches_word_end_unicode() {
+ let look = Look::WordEndUnicode;
+
+ // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+ // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+ // Simple ASCII word boundaries.
+ assert!(!testlook!(look, "a", 0));
+ assert!(testlook!(look, "a", 1));
+ assert!(testlook!(look, "a ", 1));
+ assert!(!testlook!(look, " a ", 1));
+ assert!(testlook!(look, " a ", 2));
+
+ // Unicode word boundaries with a non-ASCII codepoint.
+ assert!(!testlook!(look, "𝛃", 0));
+ assert!(testlook!(look, "𝛃", 4));
+ assert!(testlook!(look, "𝛃 ", 4));
+ assert!(!testlook!(look, " 𝛃 ", 1));
+ assert!(testlook!(look, " 𝛃 ", 5));
+
+ // Unicode word boundaries between non-ASCII codepoints.
+ assert!(!testlook!(look, "𝛃𐆀", 0));
+ assert!(testlook!(look, "𝛃𐆀", 4));
+
+ // Non word boundaries for ASCII.
+ assert!(!testlook!(look, "", 0));
+ assert!(!testlook!(look, "ab", 1));
+ assert!(!testlook!(look, "a ", 2));
+ assert!(!testlook!(look, " a ", 0));
+ assert!(!testlook!(look, " a ", 3));
+
+ // Non word boundaries with a non-ASCII codepoint.
+ assert!(!testlook!(look, "𝛃b", 4));
+ assert!(!testlook!(look, "b𝛃", 1));
+ assert!(!testlook!(look, "𝛃 ", 5));
+ assert!(!testlook!(look, " 𝛃 ", 0));
+ assert!(!testlook!(look, " 𝛃 ", 6));
+ assert!(!testlook!(look, "𝛃", 1));
+ assert!(!testlook!(look, "𝛃", 2));
+ assert!(!testlook!(look, "𝛃", 3));
+
+ // Non word boundaries with non-ASCII codepoints.
+ assert!(!testlook!(look, "𝛃𐆀", 1));
+ assert!(!testlook!(look, "𝛃𐆀", 2));
+ assert!(!testlook!(look, "𝛃𐆀", 3));
+ assert!(!testlook!(look, "𝛃𐆀", 5));
+ assert!(!testlook!(look, "𝛃𐆀", 6));
+ assert!(!testlook!(look, "𝛃𐆀", 7));
+ assert!(!testlook!(look, "𝛃𐆀", 8));
+ }
+
+ #[test]
+ fn look_matches_word_start_half_ascii() {
+ let look = Look::WordStartHalfAscii;
+
+ // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+ // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+ // Simple ASCII word boundaries.
+ assert!(testlook!(look, "a", 0));
+ assert!(!testlook!(look, "a", 1));
+ assert!(!testlook!(look, "a ", 1));
+ assert!(testlook!(look, " a ", 1));
+ assert!(!testlook!(look, " a ", 2));
+
+ // Unicode word boundaries with a non-ASCII codepoint. Since this is
+ // an ASCII word boundary, none of these match.
+ assert!(testlook!(look, "𝛃", 0));
+ assert!(testlook!(look, "𝛃", 4));
+ assert!(testlook!(look, "𝛃 ", 4));
+ assert!(testlook!(look, " 𝛃 ", 1));
+ assert!(testlook!(look, " 𝛃 ", 5));
+
+ // Unicode word boundaries between non-ASCII codepoints. Again, since
+ // this is an ASCII word boundary, none of these match.
+ assert!(testlook!(look, "𝛃𐆀", 0));
+ assert!(testlook!(look, "𝛃𐆀", 4));
+
+ // Non word boundaries for ASCII.
+ assert!(testlook!(look, "", 0));
+ assert!(!testlook!(look, "ab", 1));
+ assert!(testlook!(look, "a ", 2));
+ assert!(testlook!(look, " a ", 0));
+ assert!(testlook!(look, " a ", 3));
+
+ // Non word boundaries with a non-ASCII codepoint.
+ assert!(testlook!(look, "𝛃b", 4));
+ assert!(!testlook!(look, "b𝛃", 1));
+ assert!(testlook!(look, "𝛃 ", 5));
+ assert!(testlook!(look, " 𝛃 ", 0));
+ assert!(testlook!(look, " 𝛃 ", 6));
+ assert!(testlook!(look, "𝛃", 1));
+ assert!(testlook!(look, "𝛃", 2));
+ assert!(testlook!(look, "𝛃", 3));
+
+ // Non word boundaries with non-ASCII codepoints.
+ assert!(testlook!(look, "𝛃𐆀", 1));
+ assert!(testlook!(look, "𝛃𐆀", 2));
+ assert!(testlook!(look, "𝛃𐆀", 3));
+ assert!(testlook!(look, "𝛃𐆀", 5));
+ assert!(testlook!(look, "𝛃𐆀", 6));
+ assert!(testlook!(look, "𝛃𐆀", 7));
+ assert!(testlook!(look, "𝛃𐆀", 8));
+ }
+
+ #[test]
+ fn look_matches_word_end_half_ascii() {
+ let look = Look::WordEndHalfAscii;
+
+ // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+ // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+ // Simple ASCII word boundaries.
+ assert!(!testlook!(look, "a", 0));
+ assert!(testlook!(look, "a", 1));
+ assert!(testlook!(look, "a ", 1));
+ assert!(!testlook!(look, " a ", 1));
+ assert!(testlook!(look, " a ", 2));
+
+ // Unicode word boundaries with a non-ASCII codepoint. Since this is
+ // an ASCII word boundary, none of these match.
+ assert!(testlook!(look, "𝛃", 0));
+ assert!(testlook!(look, "𝛃", 4));
+ assert!(testlook!(look, "𝛃 ", 4));
+ assert!(testlook!(look, " 𝛃 ", 1));
+ assert!(testlook!(look, " 𝛃 ", 5));
+
+ // Unicode word boundaries between non-ASCII codepoints. Again, since
+ // this is an ASCII word boundary, none of these match.
+ assert!(testlook!(look, "𝛃𐆀", 0));
+ assert!(testlook!(look, "𝛃𐆀", 4));
+
+ // Non word boundaries for ASCII.
+ assert!(testlook!(look, "", 0));
+ assert!(!testlook!(look, "ab", 1));
+ assert!(testlook!(look, "a ", 2));
+ assert!(testlook!(look, " a ", 0));
+ assert!(testlook!(look, " a ", 3));
+
+ // Non word boundaries with a non-ASCII codepoint.
+ assert!(!testlook!(look, "𝛃b", 4));
+ assert!(testlook!(look, "b𝛃", 1));
+ assert!(testlook!(look, "𝛃 ", 5));
+ assert!(testlook!(look, " 𝛃 ", 0));
+ assert!(testlook!(look, " 𝛃 ", 6));
+ assert!(testlook!(look, "𝛃", 1));
+ assert!(testlook!(look, "𝛃", 2));
+ assert!(testlook!(look, "𝛃", 3));
+
+ // Non word boundaries with non-ASCII codepoints.
+ assert!(testlook!(look, "𝛃𐆀", 1));
+ assert!(testlook!(look, "𝛃𐆀", 2));
+ assert!(testlook!(look, "𝛃𐆀", 3));
+ assert!(testlook!(look, "𝛃𐆀", 5));
+ assert!(testlook!(look, "𝛃𐆀", 6));
+ assert!(testlook!(look, "𝛃𐆀", 7));
+ assert!(testlook!(look, "𝛃𐆀", 8));
+ }
+
+ #[test]
+ #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
+ fn look_matches_word_start_half_unicode() {
+ let look = Look::WordStartHalfUnicode;
+
+ // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+ // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+ // Simple ASCII word boundaries.
+ assert!(testlook!(look, "a", 0));
+ assert!(!testlook!(look, "a", 1));
+ assert!(!testlook!(look, "a ", 1));
+ assert!(testlook!(look, " a ", 1));
+ assert!(!testlook!(look, " a ", 2));
+
+ // Unicode word boundaries with a non-ASCII codepoint.
+ assert!(testlook!(look, "𝛃", 0));
+ assert!(!testlook!(look, "𝛃", 4));
+ assert!(!testlook!(look, "𝛃 ", 4));
+ assert!(testlook!(look, " 𝛃 ", 1));
+ assert!(!testlook!(look, " 𝛃 ", 5));
+
+ // Unicode word boundaries between non-ASCII codepoints.
+ assert!(testlook!(look, "𝛃𐆀", 0));
+ assert!(!testlook!(look, "𝛃𐆀", 4));
+
+ // Non word boundaries for ASCII.
+ assert!(testlook!(look, "", 0));
+ assert!(!testlook!(look, "ab", 1));
+ assert!(testlook!(look, "a ", 2));
+ assert!(testlook!(look, " a ", 0));
+ assert!(testlook!(look, " a ", 3));
+
+ // Non word boundaries with a non-ASCII codepoint.
+ assert!(!testlook!(look, "𝛃b", 4));
+ assert!(!testlook!(look, "b𝛃", 1));
+ assert!(testlook!(look, "𝛃 ", 5));
+ assert!(testlook!(look, " 𝛃 ", 0));
+ assert!(testlook!(look, " 𝛃 ", 6));
+ assert!(!testlook!(look, "𝛃", 1));
+ assert!(!testlook!(look, "𝛃", 2));
+ assert!(!testlook!(look, "𝛃", 3));
+
+ // Non word boundaries with non-ASCII codepoints.
+ assert!(!testlook!(look, "𝛃𐆀", 1));
+ assert!(!testlook!(look, "𝛃𐆀", 2));
+ assert!(!testlook!(look, "𝛃𐆀", 3));
+ assert!(!testlook!(look, "𝛃𐆀", 5));
+ assert!(!testlook!(look, "𝛃𐆀", 6));
+ assert!(!testlook!(look, "𝛃𐆀", 7));
+ assert!(testlook!(look, "𝛃𐆀", 8));
+ }
+
+ #[test]
+ #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
+ fn look_matches_word_end_half_unicode() {
+ let look = Look::WordEndHalfUnicode;
+
+ // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+ // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+ // Simple ASCII word boundaries.
+ assert!(!testlook!(look, "a", 0));
+ assert!(testlook!(look, "a", 1));
+ assert!(testlook!(look, "a ", 1));
+ assert!(!testlook!(look, " a ", 1));
+ assert!(testlook!(look, " a ", 2));
+
+ // Unicode word boundaries with a non-ASCII codepoint.
+ assert!(!testlook!(look, "𝛃", 0));
+ assert!(testlook!(look, "𝛃", 4));
+ assert!(testlook!(look, "𝛃 ", 4));
+ assert!(!testlook!(look, " 𝛃 ", 1));
+ assert!(testlook!(look, " 𝛃 ", 5));
+
+ // Unicode word boundaries between non-ASCII codepoints.
+ assert!(!testlook!(look, "𝛃𐆀", 0));
+ assert!(testlook!(look, "𝛃𐆀", 4));
+
+ // Non word boundaries for ASCII.
+ assert!(testlook!(look, "", 0));
+ assert!(!testlook!(look, "ab", 1));
+ assert!(testlook!(look, "a ", 2));
+ assert!(testlook!(look, " a ", 0));
+ assert!(testlook!(look, " a ", 3));
+
+ // Non word boundaries with a non-ASCII codepoint.
+ assert!(!testlook!(look, "𝛃b", 4));
+ assert!(!testlook!(look, "b𝛃", 1));
+ assert!(testlook!(look, "𝛃 ", 5));
+ assert!(testlook!(look, " 𝛃 ", 0));
+ assert!(testlook!(look, " 𝛃 ", 6));
+ assert!(!testlook!(look, "𝛃", 1));
+ assert!(!testlook!(look, "𝛃", 2));
+ assert!(!testlook!(look, "𝛃", 3));
+
+ // Non word boundaries with non-ASCII codepoints.
+ assert!(!testlook!(look, "𝛃𐆀", 1));
+ assert!(!testlook!(look, "𝛃𐆀", 2));
+ assert!(!testlook!(look, "𝛃𐆀", 3));
+ assert!(!testlook!(look, "𝛃𐆀", 5));
+ assert!(!testlook!(look, "𝛃𐆀", 6));
+ assert!(!testlook!(look, "𝛃𐆀", 7));
+ assert!(testlook!(look, "𝛃𐆀", 8));
+ }
+
+ #[test]
fn look_set() {
let mut f = LookSet::default();
assert!(!f.contains(Look::Start));
@@ -1716,6 +2472,46 @@ mod tests {
assert!(f.contains(Look::WordAsciiNegate));
f = f.remove(Look::WordAsciiNegate);
assert!(!f.contains(Look::WordAsciiNegate));
+
+ f = f.insert(Look::WordStartAscii);
+ assert!(f.contains(Look::WordStartAscii));
+ f = f.remove(Look::WordStartAscii);
+ assert!(!f.contains(Look::WordStartAscii));
+
+ f = f.insert(Look::WordEndAscii);
+ assert!(f.contains(Look::WordEndAscii));
+ f = f.remove(Look::WordEndAscii);
+ assert!(!f.contains(Look::WordEndAscii));
+
+ f = f.insert(Look::WordStartUnicode);
+ assert!(f.contains(Look::WordStartUnicode));
+ f = f.remove(Look::WordStartUnicode);
+ assert!(!f.contains(Look::WordStartUnicode));
+
+ f = f.insert(Look::WordEndUnicode);
+ assert!(f.contains(Look::WordEndUnicode));
+ f = f.remove(Look::WordEndUnicode);
+ assert!(!f.contains(Look::WordEndUnicode));
+
+ f = f.insert(Look::WordStartHalfAscii);
+ assert!(f.contains(Look::WordStartHalfAscii));
+ f = f.remove(Look::WordStartHalfAscii);
+ assert!(!f.contains(Look::WordStartHalfAscii));
+
+ f = f.insert(Look::WordEndHalfAscii);
+ assert!(f.contains(Look::WordEndHalfAscii));
+ f = f.remove(Look::WordEndHalfAscii);
+ assert!(!f.contains(Look::WordEndHalfAscii));
+
+ f = f.insert(Look::WordStartHalfUnicode);
+ assert!(f.contains(Look::WordStartHalfUnicode));
+ f = f.remove(Look::WordStartHalfUnicode);
+ assert!(!f.contains(Look::WordStartHalfUnicode));
+
+ f = f.insert(Look::WordEndHalfUnicode);
+ assert!(f.contains(Look::WordEndHalfUnicode));
+ f = f.remove(Look::WordEndHalfUnicode);
+ assert!(!f.contains(Look::WordEndHalfUnicode));
}
#[test]
@@ -1724,7 +2520,7 @@ mod tests {
assert_eq!(0, set.iter().count());
let set = LookSet::full();
- assert_eq!(10, set.iter().count());
+ assert_eq!(18, set.iter().count());
let set =
LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
@@ -1735,6 +2531,9 @@ mod tests {
let set = LookSet::empty().insert(Look::WordAsciiNegate);
assert_eq!(1, set.iter().count());
+
+ let set = LookSet::empty().insert(Look::WordEndHalfUnicode);
+ assert_eq!(1, set.iter().count());
}
#[test]
@@ -1743,6 +2542,6 @@ mod tests {
let res = alloc::format!("{:?}", LookSet::empty());
assert_eq!("∅", res);
let res = alloc::format!("{:?}", LookSet::full());
- assert_eq!("Az^$rRbB𝛃𝚩", res);
+ assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
}
}
diff --git a/vendor/regex-automata/src/util/mod.rs b/vendor/regex-automata/src/util/mod.rs
index bb739df1d..b3eef64e6 100644
--- a/vendor/regex-automata/src/util/mod.rs
+++ b/vendor/regex-automata/src/util/mod.rs
@@ -40,6 +40,7 @@ pub mod look;
pub mod pool;
pub mod prefilter;
pub mod primitives;
+pub mod start;
#[cfg(feature = "syntax")]
pub mod syntax;
pub mod wire;
@@ -52,6 +53,5 @@ pub(crate) mod memchr;
pub(crate) mod search;
#[cfg(feature = "alloc")]
pub(crate) mod sparse_set;
-pub(crate) mod start;
pub(crate) mod unicode_data;
pub(crate) mod utf8;
diff --git a/vendor/regex-automata/src/util/pool.rs b/vendor/regex-automata/src/util/pool.rs
index c03d7b013..d90d4ecff 100644
--- a/vendor/regex-automata/src/util/pool.rs
+++ b/vendor/regex-automata/src/util/pool.rs
@@ -177,6 +177,7 @@ impl<T: Send, F: Fn() -> T> Pool<T, F> {
/// the value to go back into the pool) and then calling get again is
/// *not* guaranteed to return the same value received in the first `get`
/// call.
+ #[inline]
pub fn get(&self) -> PoolGuard<'_, T, F> {
PoolGuard(self.0.get())
}
@@ -200,6 +201,7 @@ impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> {
/// This circumvents the guard's `Drop` implementation. This can be useful
/// in circumstances where the automatic `Drop` results in poorer codegen,
/// such as calling non-inlined functions.
+ #[inline]
pub fn put(this: PoolGuard<'_, T, F>) {
inner::PoolGuard::put(this.0);
}
@@ -208,12 +210,14 @@ impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> {
impl<'a, T: Send, F: Fn() -> T> core::ops::Deref for PoolGuard<'a, T, F> {
type Target = T;
+ #[inline]
fn deref(&self) -> &T {
self.0.value()
}
}
impl<'a, T: Send, F: Fn() -> T> core::ops::DerefMut for PoolGuard<'a, T, F> {
+ #[inline]
fn deref_mut(&mut self) -> &mut T {
self.0.value_mut()
}
@@ -451,11 +455,44 @@ mod inner {
/// Create a new pool. The given closure is used to create values in
/// the pool when necessary.
pub(super) fn new(create: F) -> Pool<T, F> {
- // MSRV(1.63): Mark this function as 'const'. I've arranged the
- // code such that it should "just work." Then mark the public
- // 'Pool::new' method as 'const' too. (The alloc-only Pool::new
- // is already 'const', so that should "just work" too.) The only
- // thing we're waiting for is Mutex::new to be const.
+ // FIXME: Now that we require 1.65+, Mutex::new is available as
+ // const... So we can almost mark this function as const. But of
+ // course, we're creating a Vec of stacks below (we didn't when I
+ // originally wrote this code). It seems like the best way to work
+ // around this would be to use a `[Stack; MAX_POOL_STACKS]` instead
+ // of a `Vec<Stack>`. I refrained from making this change at time
+ // of writing (2023/10/08) because I was making a lot of other
+ // changes at the same time and wanted to do this more carefully.
+ // Namely, because of the cache line optimization, that `[Stack;
+ // MAX_POOL_STACKS]` would be quite big. It's unclear how bad (if
+ // at all) that would be.
+ //
+ // Another choice would be to lazily allocate the stacks, but...
+ // I'm not so sure about that. Seems like a fair bit of complexity?
+ //
+ // Maybe there's a simple solution I'm missing.
+ //
+ // ... OK, I tried to fix this. First, I did it by putting `stacks`
+ // in an `UnsafeCell` and using a `Once` to lazily initialize it.
+ // I benchmarked it and everything looked okay. I then made this
+ // function `const` and thought I was just about done. But the
+ // public pool type wraps its inner pool in a `Box` to keep its
+ // size down. Blech.
+ //
+ // So then I thought that I could push the box down into this
+ // type (and leave the non-std version unboxed) and use the same
+ // `UnsafeCell` technique to lazily initialize it. This has the
+ // downside of the `Once` now needing to get hit in the owner fast
+ // path, but maybe that's OK? However, I then realized that we can
+ // only lazily initialize `stacks`, `owner` and `owner_val`. The
+ // `create` function needs to be put somewhere outside of the box.
+ // So now the pool is a `Box`, `Once` and a function. Now we're
+ // starting to defeat the point of boxing in the first place. So I
+ // backed out that change too.
+ //
+ // Back to square one. I maybe we just don't make a pool's
+ // constructor const and live with it. It's probably not a huge
+ // deal.
let mut stacks = Vec::with_capacity(MAX_POOL_STACKS);
for _ in 0..stacks.capacity() {
stacks.push(CacheLine(Mutex::new(vec![])));
@@ -469,6 +506,7 @@ mod inner {
impl<T: Send, F: Fn() -> T> Pool<T, F> {
/// Get a value from the pool. This may block if another thread is also
/// attempting to retrieve a value from the pool.
+ #[inline]
pub(super) fn get(&self) -> PoolGuard<'_, T, F> {
// Our fast path checks if the caller is the thread that "owns"
// this pool. Or stated differently, whether it is the first thread
@@ -562,6 +600,7 @@ mod inner {
/// Puts a value back into the pool. Callers don't need to call this.
/// Once the guard that's returned by 'get' is dropped, it is put back
/// into the pool automatically.
+ #[inline]
fn put_value(&self, value: Box<T>) {
let caller = THREAD_ID.with(|id| *id);
let stack_id = caller % self.stacks.len();
@@ -587,11 +626,13 @@ mod inner {
}
/// Create a guard that represents the special owned T.
+ #[inline]
fn guard_owned(&self, caller: usize) -> PoolGuard<'_, T, F> {
PoolGuard { pool: self, value: Err(caller), discard: false }
}
/// Create a guard that contains a value from the pool's stack.
+ #[inline]
fn guard_stack(&self, value: Box<T>) -> PoolGuard<'_, T, F> {
PoolGuard { pool: self, value: Ok(value), discard: false }
}
@@ -599,6 +640,7 @@ mod inner {
/// Create a guard that contains a value from the pool's stack with an
/// instruction to throw away the value instead of putting it back
/// into the pool.
+ #[inline]
fn guard_stack_transient(&self, value: Box<T>) -> PoolGuard<'_, T, F> {
PoolGuard { pool: self, value: Ok(value), discard: true }
}
@@ -633,6 +675,7 @@ mod inner {
impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> {
/// Return the underlying value.
+ #[inline]
pub(super) fn value(&self) -> &T {
match self.value {
Ok(ref v) => &**v,
@@ -657,6 +700,7 @@ mod inner {
}
/// Return the underlying value as a mutable borrow.
+ #[inline]
pub(super) fn value_mut(&mut self) -> &mut T {
match self.value {
Ok(ref mut v) => &mut **v,
@@ -681,6 +725,7 @@ mod inner {
}
/// Consumes this guard and puts it back into the pool.
+ #[inline]
pub(super) fn put(this: PoolGuard<'_, T, F>) {
// Since this is effectively consuming the guard and putting the
// value back into the pool, there's no reason to run its Drop
@@ -729,6 +774,7 @@ mod inner {
}
impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> {
+ #[inline]
fn drop(&mut self) {
self.put_imp();
}
@@ -806,6 +852,7 @@ mod inner {
impl<T: Send, F: Fn() -> T> Pool<T, F> {
/// Get a value from the pool. This may block if another thread is also
/// attempting to retrieve a value from the pool.
+ #[inline]
pub(super) fn get(&self) -> PoolGuard<'_, T, F> {
let mut stack = self.stack.lock();
let value = match stack.pop() {
@@ -815,6 +862,7 @@ mod inner {
PoolGuard { pool: self, value: Some(value) }
}
+ #[inline]
fn put(&self, guard: PoolGuard<'_, T, F>) {
let mut guard = core::mem::ManuallyDrop::new(guard);
if let Some(value) = guard.value.take() {
@@ -825,6 +873,7 @@ mod inner {
/// Puts a value back into the pool. Callers don't need to call this.
/// Once the guard that's returned by 'get' is dropped, it is put back
/// into the pool automatically.
+ #[inline]
fn put_value(&self, value: Box<T>) {
let mut stack = self.stack.lock();
stack.push(value);
@@ -847,16 +896,19 @@ mod inner {
impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> {
/// Return the underlying value.
+ #[inline]
pub(super) fn value(&self) -> &T {
self.value.as_deref().unwrap()
}
/// Return the underlying value as a mutable borrow.
+ #[inline]
pub(super) fn value_mut(&mut self) -> &mut T {
self.value.as_deref_mut().unwrap()
}
/// Consumes this guard and puts it back into the pool.
+ #[inline]
pub(super) fn put(this: PoolGuard<'_, T, F>) {
// Since this is effectively consuming the guard and putting the
// value back into the pool, there's no reason to run its Drop
@@ -878,6 +930,7 @@ mod inner {
}
impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> {
+ #[inline]
fn drop(&mut self) {
self.put_imp();
}
@@ -931,6 +984,7 @@ mod inner {
/// Lock this mutex and return a guard providing exclusive access to
/// `T`. This blocks if some other thread has already locked this
/// mutex.
+ #[inline]
fn lock(&self) -> MutexGuard<'_, T> {
while self
.locked
@@ -963,18 +1017,21 @@ mod inner {
impl<'a, T> core::ops::Deref for MutexGuard<'a, T> {
type Target = T;
+ #[inline]
fn deref(&self) -> &T {
self.data
}
}
impl<'a, T> core::ops::DerefMut for MutexGuard<'a, T> {
+ #[inline]
fn deref_mut(&mut self) -> &mut T {
self.data
}
}
impl<'a, T> Drop for MutexGuard<'a, T> {
+ #[inline]
fn drop(&mut self) {
// Drop means 'data' is no longer accessible, so we can unlock
// the mutex.
diff --git a/vendor/regex-automata/src/util/start.rs b/vendor/regex-automata/src/util/start.rs
index 4e360d083..27153780e 100644
--- a/vendor/regex-automata/src/util/start.rs
+++ b/vendor/regex-automata/src/util/start.rs
@@ -1,17 +1,195 @@
/*!
-Provides some helpers for dealing with start state configurations in DFAs.
-
-[`Start`] represents the possible starting configurations, while
-[`StartByteMap`] represents a way to retrieve the `Start` configuration for a
-given position in a haystack.
+Provides helpers for dealing with start state configurations in DFAs.
*/
use crate::util::{
look::LookMatcher,
- search::Input,
+ search::{Anchored, Input},
wire::{self, DeserializeError, SerializeError},
};
+/// The configuration used to determine a DFA's start state for a search.
+///
+/// A DFA has a single starting state in the typical textbook description. That
+/// is, it corresponds to the set of all starting states for the NFA that built
+/// it, along with their espsilon closures. In this crate, however, DFAs have
+/// many possible start states due to a few factors:
+///
+/// * DFAs support the ability to run either anchored or unanchored searches.
+/// Each type of search needs its own start state. For example, an unanchored
+/// search requires starting at a state corresponding to a regex with a
+/// `(?s-u:.)*?` prefix, which will match through anything.
+/// * DFAs also optionally support starting an anchored search for any one
+/// specific pattern. Each such pattern requires its own start state.
+/// * If a look-behind assertion like `^` or `\b` is used in the regex, then
+/// the DFA will need to inspect a single byte immediately before the start of
+/// the search to choose the correct start state.
+///
+/// Indeed, this configuration precisely encapsulates all of the above factors.
+/// The [`Config::anchored`] method sets which kind of anchored search to
+/// perform while the [`Config::look_behind`] method provides a way to set
+/// the byte that occurs immediately before the start of the search.
+///
+/// Generally speaking, this type is only useful when you want to run searches
+/// without using an [`Input`]. In particular, an `Input` wants a haystack
+/// slice, but callers may not have a contiguous sequence of bytes as a
+/// haystack in all cases. This type provides a lower level of control such
+/// that callers can provide their own anchored configuration and look-behind
+/// byte explicitly.
+///
+/// # Example
+///
+/// This shows basic usage that permits running a search with a DFA without
+/// using the `Input` abstraction.
+///
+/// ```
+/// use regex_automata::{
+/// dfa::{Automaton, dense},
+/// util::start,
+/// Anchored,
+/// };
+///
+/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
+/// let haystack = "quartz";
+///
+/// let config = start::Config::new().anchored(Anchored::Yes);
+/// let mut state = dfa.start_state(&config)?;
+/// for &b in haystack.as_bytes().iter() {
+/// state = dfa.next_state(state, b);
+/// }
+/// state = dfa.next_eoi_state(state);
+/// assert!(dfa.is_match_state(state));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// This example shows how to correctly run a search that doesn't begin at
+/// the start of a haystack. Notice how we set the look-behind byte, and as
+/// a result, the `\b` assertion does not match.
+///
+/// ```
+/// use regex_automata::{
+/// dfa::{Automaton, dense},
+/// util::start,
+/// Anchored,
+/// };
+///
+/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
+/// let haystack = "quartz";
+///
+/// let config = start::Config::new()
+/// .anchored(Anchored::Yes)
+/// .look_behind(Some(b'q'));
+/// let mut state = dfa.start_state(&config)?;
+/// for &b in haystack.as_bytes().iter().skip(1) {
+/// state = dfa.next_state(state, b);
+/// }
+/// state = dfa.next_eoi_state(state);
+/// // No match!
+/// assert!(!dfa.is_match_state(state));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// If we had instead not set a look-behind byte, then the DFA would assume
+/// that it was starting at the beginning of the haystack, and thus `\b` should
+/// match. This in turn would result in erroneously reporting a match:
+///
+/// ```
+/// use regex_automata::{
+/// dfa::{Automaton, dense},
+/// util::start,
+/// Anchored,
+/// };
+///
+/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
+/// let haystack = "quartz";
+///
+/// // Whoops, forgot the look-behind byte...
+/// let config = start::Config::new().anchored(Anchored::Yes);
+/// let mut state = dfa.start_state(&config)?;
+/// for &b in haystack.as_bytes().iter().skip(1) {
+/// state = dfa.next_state(state, b);
+/// }
+/// state = dfa.next_eoi_state(state);
+/// // And now we get a match unexpectedly.
+/// assert!(dfa.is_match_state(state));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Config {
+ look_behind: Option<u8>,
+ anchored: Anchored,
+}
+
+impl Config {
+ /// Create a new default start configuration.
+ ///
+ /// The default is an unanchored search that starts at the beginning of the
+ /// haystack.
+ pub fn new() -> Config {
+ Config { anchored: Anchored::No, look_behind: None }
+ }
+
+ /// A convenience routine for building a start configuration from an
+ /// [`Input`] for a forward search.
+ ///
+ /// This automatically sets the look-behind byte to the byte immediately
+ /// preceding the start of the search. If the start of the search is at
+ /// offset `0`, then no look-behind byte is set.
+ pub fn from_input_forward(input: &Input<'_>) -> Config {
+ let look_behind = input
+ .start()
+ .checked_sub(1)
+ .and_then(|i| input.haystack().get(i).copied());
+ Config { look_behind, anchored: input.get_anchored() }
+ }
+
+ /// A convenience routine for building a start configuration from an
+ /// [`Input`] for a reverse search.
+ ///
+ /// This automatically sets the look-behind byte to the byte immediately
+ /// following the end of the search. If the end of the search is at
+ /// offset `haystack.len()`, then no look-behind byte is set.
+ pub fn from_input_reverse(input: &Input<'_>) -> Config {
+ let look_behind = input.haystack().get(input.end()).copied();
+ Config { look_behind, anchored: input.get_anchored() }
+ }
+
+ /// Set the look-behind byte at the start of a search.
+ ///
+ /// Unless the search is intended to logically start at the beginning of a
+ /// haystack, this should _always_ be set to the byte immediately preceding
+ /// the start of the search. If no look-behind byte is set, then the start
+ /// configuration will assume it is at the beginning of the haystack. For
+ /// example, the anchor `^` will match.
+ ///
+ /// The default is that no look-behind byte is set.
+ pub fn look_behind(mut self, byte: Option<u8>) -> Config {
+ self.look_behind = byte;
+ self
+ }
+
+ /// Set the anchored mode of a search.
+ ///
+ /// The default is an unanchored search.
+ pub fn anchored(mut self, mode: Anchored) -> Config {
+ self.anchored = mode;
+ self
+ }
+
+ /// Return the look-behind byte in this configuration, if one exists.
+ pub fn get_look_behind(&self) -> Option<u8> {
+ self.look_behind
+ }
+
+ /// Return the anchored mode in this configuration.
+ pub fn get_anchored(&self) -> Anchored {
+ self.anchored
+ }
+}
+
/// A map from every possible byte value to its corresponding starting
/// configuration.
///
@@ -71,30 +249,11 @@ impl StartByteMap {
StartByteMap { map }
}
- /// Return the forward starting configuration for the given `input`.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- pub(crate) fn fwd(&self, input: &Input) -> Start {
- match input
- .start()
- .checked_sub(1)
- .and_then(|i| input.haystack().get(i))
- {
- None => Start::Text,
- Some(&byte) => self.get(byte),
- }
- }
-
- /// Return the reverse starting configuration for the given `input`.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- pub(crate) fn rev(&self, input: &Input) -> Start {
- match input.haystack().get(input.end()) {
- None => Start::Text,
- Some(&byte) => self.get(byte),
- }
- }
-
+ /// Return the starting configuration for the given look-behind byte.
+ ///
+ /// If no look-behind exists, callers should use `Start::Text`.
#[cfg_attr(feature = "perf-inline", inline(always))]
- fn get(&self, byte: u8) -> Start {
+ pub(crate) fn get(&self, byte: u8) -> Start {
self.map[usize::from(byte)]
}
@@ -253,21 +412,32 @@ mod tests {
#[test]
fn start_fwd_done_range() {
let smap = StartByteMap::new(&LookMatcher::default());
- assert_eq!(Start::Text, smap.fwd(&Input::new("").range(1..0)));
+ let input = Input::new("").range(1..0);
+ let config = Config::from_input_forward(&input);
+ let start =
+ config.get_look_behind().map_or(Start::Text, |b| smap.get(b));
+ assert_eq!(Start::Text, start);
}
#[test]
fn start_rev_done_range() {
let smap = StartByteMap::new(&LookMatcher::default());
- assert_eq!(Start::Text, smap.rev(&Input::new("").range(1..0)));
+ let input = Input::new("").range(1..0);
+ let config = Config::from_input_reverse(&input);
+ let start =
+ config.get_look_behind().map_or(Start::Text, |b| smap.get(b));
+ assert_eq!(Start::Text, start);
}
#[test]
fn start_fwd() {
let f = |haystack, start, end| {
let smap = StartByteMap::new(&LookMatcher::default());
- let input = &Input::new(haystack).range(start..end);
- smap.fwd(input)
+ let input = Input::new(haystack).range(start..end);
+ let config = Config::from_input_forward(&input);
+ let start =
+ config.get_look_behind().map_or(Start::Text, |b| smap.get(b));
+ start
};
assert_eq!(Start::Text, f("", 0, 0));
@@ -287,8 +457,11 @@ mod tests {
fn start_rev() {
let f = |haystack, start, end| {
let smap = StartByteMap::new(&LookMatcher::default());
- let input = &Input::new(haystack).range(start..end);
- smap.rev(input)
+ let input = Input::new(haystack).range(start..end);
+ let config = Config::from_input_reverse(&input);
+ let start =
+ config.get_look_behind().map_or(Start::Text, |b| smap.get(b));
+ start
};
assert_eq!(Start::Text, f("", 0, 0));