Diffstat (limited to 'third_party/rust/regex-automata/src/util/search.rs')
-rw-r--r--  third_party/rust/regex-automata/src/util/search.rs  1969
1 files changed, 1969 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/util/search.rs b/third_party/rust/regex-automata/src/util/search.rs
new file mode 100644
index 0000000000..39aec522be
--- /dev/null
+++ b/third_party/rust/regex-automata/src/util/search.rs
@@ -0,0 +1,1969 @@
+/*!
+Types and routines that support the search APIs of most regex engines.
+
+This sub-module isn't exposed directly, but rather, its contents are exported
+at the crate root due to the universality of most of the types and routines in
+this module.
+*/
+
+use core::ops::{Range, RangeBounds};
+
+use crate::util::{escape::DebugByte, primitives::PatternID, utf8};
+
+/// The parameters for a regex search including the haystack to search.
+///
+/// It turns out that regex searches have a few parameters, and in most cases,
+/// those parameters have defaults that work in the vast majority of cases.
+/// This `Input` type exists to make that common case seamless while also
+/// providing an avenue for changing the parameters of a search. In particular,
+/// this type enables doing so without a combinatorial explosion of different
+/// methods and/or superfluous parameters in the common cases.
+///
+/// An `Input` permits configuring the following things:
+///
+/// * Search only a substring of a haystack, while taking the broader context
+/// into account for resolving look-around assertions.
+/// * Indicating whether to search for all patterns in a regex, or to
+/// only search for one pattern in particular.
+/// * Whether to perform an anchored or unanchored search.
+/// * Whether to report a match as early as possible.
+///
+/// All of these parameters, except for the haystack, have sensible default
+/// values. This means that the minimal search configuration is simply a call
+/// to [`Input::new`] with your haystack. Setting any other parameter is
+/// optional.
+///
+/// Moreover, for any `H` that implements `AsRef<[u8]>`, there exists a
+/// `From<H> for Input` implementation. This is useful because many of the
+/// search APIs in this crate accept an `Into<Input>`. This means you can
+/// provide string or byte strings to these routines directly, and they'll
+/// automatically get converted into an `Input` for you.
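+///
+/// For example, a minimal sketch of that conversion (the `"foobar"` haystack
+/// is purely illustrative):
+///
+/// ```
+/// use regex_automata::Input;
+///
+/// // Building an `Input` explicitly and converting one from a `&str`
+/// // describe the same search over the same bytes.
+/// let explicit = Input::new("foobar");
+/// let converted = Input::from("foobar");
+/// assert_eq!(explicit.haystack(), converted.haystack());
+/// ```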
+///
+/// The lifetime parameter `'h` refers to the lifetime of the haystack.
+///
+/// # Organization
+///
+/// The API of `Input` is split into a few different parts:
+///
+/// * A builder-like API that transforms an `Input` by value. Examples:
+/// [`Input::span`] and [`Input::anchored`].
+/// * A setter API that permits mutating parameters in place. Examples:
+/// [`Input::set_span`] and [`Input::set_anchored`].
+/// * A getter API that permits retrieving any of the search parameters.
+/// Examples: [`Input::get_span`] and [`Input::get_anchored`].
+/// * A few convenience getter routines that don't conform to the above naming
+/// pattern due to how common they are. Examples: [`Input::haystack`],
+/// [`Input::start`] and [`Input::end`].
+/// * Miscellaneous predicates and other helper routines that are useful
+/// in some contexts. Examples: [`Input::is_char_boundary`].
+///
+/// An `Input` exposes so much because it is meant to be used by both callers
+/// of regex engines _and_ implementors of regex engines. A constraining factor
+/// is that regex engines should accept a `&Input` as their lowest level API,
+/// which means that implementors should only use the "getter" APIs of an
+/// `Input`.
+///
+/// # Valid bounds and search termination
+///
+/// An `Input` permits setting the bounds of a search via either
+/// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or
+/// else a panic will occur. Bounds are valid if and only if:
+///
+/// * The bounds represent a valid range into the input's haystack.
+/// * **or** the end bound is a valid ending bound for the haystack *and*
+/// the start bound is exactly one greater than the end bound.
+///
+/// In the latter case, [`Input::is_done`] will return true, indicating that
+/// any search receiving such an input should immediately return with no
+/// match.
+///
+/// Note that while `Input` is used for reverse searches in this crate, the
+/// `Input::is_done` predicate assumes a forward search. Because unsigned
+/// offsets are used internally, there is no way to tell from only the offsets
+/// whether a reverse search is done or not.
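+///
+/// For example, a minimal sketch of the termination case described above
+/// (the `"foobar"` haystack is purely illustrative):
+///
+/// ```
+/// use regex_automata::Input;
+///
+/// let mut input = Input::new("foobar");
+/// // An end bound of 6 is valid for "foobar" and a start bound of 7 is
+/// // exactly one greater, so these bounds are accepted. The search is
+/// // nevertheless already done.
+/// input.set_span(7..6);
+/// assert!(input.is_done());
+/// ```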
+///
+/// # Regex engine support
+///
+/// Any regex engine accepting an `Input` must support at least the following
+/// things:
+///
+/// * Searching a `&[u8]` for matches.
+/// * Searching a substring of `&[u8]` for a match, such that any match
+/// reported must appear entirely within that substring.
+/// * For a forwards search, a match should never be reported when
+/// [`Input::is_done`] returns true. (For reverse searches, termination should
+/// be handled outside of `Input`.)
+///
+/// Supporting other aspects of an `Input` is optional, but regex engines
+/// should handle aspects they don't support gracefully. How this is done is
+/// generally up to the regex engine. This crate generally treats unsupported
+/// anchored modes as an error to report, for example, but for simplicity, in
+/// the meta regex engine, trying to search with an invalid pattern ID just
+/// results in no match being reported.
+#[derive(Clone)]
+pub struct Input<'h> {
+ haystack: &'h [u8],
+ span: Span,
+ anchored: Anchored,
+ earliest: bool,
+}
+
+impl<'h> Input<'h> {
+ /// Create a new search configuration for the given haystack.
+ #[inline]
+ pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> {
+ Input {
+ haystack: haystack.as_ref(),
+ span: Span { start: 0, end: haystack.as_ref().len() },
+ anchored: Anchored::No,
+ earliest: false,
+ }
+ }
+
+ /// Set the span for this search.
+ ///
+ /// This routine is generic over how a span is provided. While
+ /// a [`Span`] may be given directly, one may also provide a
+ /// `std::ops::Range<usize>`. To provide anything supported by range
+ /// syntax, use the [`Input::range`] method.
+ ///
+ /// The default span is the entire haystack.
+ ///
+ /// Note that [`Input::range`] overrides this method and vice versa.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the given span does not correspond to valid bounds in
+ /// the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// This example shows how the span of the search can impact whether a
+ /// match is reported or not. This is particularly relevant for look-around
+ /// operators, which might take things outside of the span into account
+ /// when determining whether they match.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// Match, Input,
+ /// };
+ ///
+ /// // Look for 'at', but as a distinct word.
+ /// let re = PikeVM::new(r"\bat\b")?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ ///
+ /// // Our haystack contains 'at', but not as a distinct word.
+ /// let haystack = "batter";
+ ///
+ /// // A standard search finds nothing, as expected.
+ /// let input = Input::new(haystack);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// // But if we wanted to search starting at position '1', we might
+ /// // slice the haystack. If we do this, it's impossible for the \b
+ /// // anchors to take the surrounding context into account! And thus,
+ /// // a match is produced.
+ /// let input = Input::new(&haystack[1..3]);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match());
+ ///
+ /// // But if we specify the span of the search instead of slicing the
+ /// // haystack, then the regex engine can "see" outside of the span
+ /// // and resolve the anchors correctly.
+ /// let input = Input::new(haystack).span(1..3);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// This may seem a little ham-fisted, but this scenario tends to come up
+ /// if some other regex engine found the match span and now you need to
+ /// re-process that span to look for capturing groups. (e.g., Run a faster
+ /// DFA first, find a match, then run the PikeVM on just the match span to
+ /// resolve capturing groups.) In order to implement that sort of logic
+ /// correctly, you need to set the span on the search instead of slicing
+ /// the haystack directly.
+ ///
+ /// The other advantage of using this routine to specify the bounds of the
+ /// search is that the match offsets are still reported in terms of the
+ /// original haystack. For example, the second search in the example above
+ /// reported a match at position `0`, even though `at` starts at offset
+ /// `1` because we sliced the haystack.
+ #[inline]
+ pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> {
+ self.set_span(span);
+ self
+ }
+
+ /// Like `Input::span`, but accepts any range instead.
+ ///
+ /// The default range is the entire haystack.
+ ///
+ /// Note that [`Input::span`] overrides this method and vice versa.
+ ///
+ /// # Panics
+ ///
+ /// This routine will panic if the given range could not be converted
+ /// to a valid [`Range`]. For example, this would panic when given
+ /// `0..=usize::MAX` since it cannot be represented using a half-open
+ /// interval in terms of `usize`.
+ ///
+ /// This also panics if the given range does not correspond to valid bounds
+ /// in the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ ///
+ /// let input = Input::new("foobar").range(2..=4);
+ /// assert_eq!(2..5, input.get_range());
+ /// ```
+ #[inline]
+ pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> {
+ self.set_range(range);
+ self
+ }
+
+ /// Sets the anchor mode of a search.
+ ///
+ /// When a search is anchored (so that's [`Anchored::Yes`] or
+ /// [`Anchored::Pattern`]), a match must begin at the start of a search.
+ /// When a search is not anchored (that's [`Anchored::No`]), regex engines
+ /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix
+ /// permits a match to appear anywhere.
+ ///
+ /// By default, the anchored mode is [`Anchored::No`].
+ ///
+ /// **WARNING:** this is subtly different than using a `^` at the start of
+ /// your regex. A `^` forces a regex to match exclusively at the start of
+ /// a haystack, regardless of where you begin your search. In contrast,
+ /// anchoring a search will allow your regex to match anywhere in your
+ /// haystack, but the match must start at the beginning of a search.
+ ///
+ /// For example, consider the haystack `aba` and the following searches:
+ ///
+ /// 1. The regex `^a` is compiled with `Anchored::No` and searches `aba`
+ /// starting at position `2`. Since `^` requires the match to start at
+ /// the beginning of the haystack and `2 > 0`, no match is found.
+ /// 2. The regex `a` is compiled with `Anchored::Yes` and searches `aba`
+ /// starting at position `2`. This reports a match at `[2, 3]` since
+ /// the match starts where the search started. Since there is no `^`,
+ /// there is no requirement for the match to start at the beginning of
+ /// the haystack.
+ /// 3. The regex `a` is compiled with `Anchored::Yes` and searches `aba`
+ /// starting at position `1`. Since `b` corresponds to position `1` and
+ /// since the search is anchored, it finds no match. While the regex
+ /// matches at other positions, configuring the search to be anchored
+ /// requires that it only report a match that begins at the same offset
+ /// as the beginning of the search.
+ /// 4. The regex `a` is compiled with `Anchored::No` and searches `aba`
+ /// starting at position `1`. Since the search is not anchored and
+ /// the regex does not start with `^`, the search executes as if there
+ /// is a `(?s:.)*?` prefix that permits it to match anywhere. Thus, it
+ /// reports a match at `[2, 3]`.
+ ///
+ /// Note that the [`Anchored::Pattern`] mode is like `Anchored::Yes`,
+ /// except it only reports matches for a particular pattern.
+ ///
+ /// # Example
+ ///
+ /// This demonstrates the differences between an anchored search and
+ /// a pattern that begins with `^` (as described in the above warning
+ /// message).
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// Anchored, Match, Input,
+ /// };
+ ///
+ /// let haystack = "aba";
+ ///
+ /// let re = PikeVM::new(r"^a")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let input = Input::new(haystack).span(2..3).anchored(Anchored::No);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// // No match is found because 2 is not the beginning of the haystack,
+ /// // which is what ^ requires.
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// let re = PikeVM::new(r"a")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let input = Input::new(haystack).span(2..3).anchored(Anchored::Yes);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// // An anchored search can still match anywhere in the haystack, it just
+ /// // must begin at the start of the search which is '2' in this case.
+ /// assert_eq!(Some(Match::must(0, 2..3)), caps.get_match());
+ ///
+ /// let re = PikeVM::new(r"a")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let input = Input::new(haystack).span(1..3).anchored(Anchored::Yes);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// // No match is found since we start searching at offset 1 which
+ /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
+ /// // is found.
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// let re = PikeVM::new(r"a")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let input = Input::new(haystack).span(1..3).anchored(Anchored::No);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// // Since anchored=no, an implicit '(?s:.)*?' prefix was added to the
+ /// // pattern. Even though the search starts at 'b', the 'match anything'
+ /// // prefix allows the search to match 'a'.
+ /// let expected = Some(Match::must(0, 2..3));
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn anchored(mut self, mode: Anchored) -> Input<'h> {
+ self.set_anchored(mode);
+ self
+ }
+
+ /// Whether to execute an "earliest" search or not.
+ ///
+ /// When running a non-overlapping search, an "earliest" search will return
+ /// the match location as early as possible. For example, given a pattern
+ /// of `foo[0-9]+` and a haystack of `foo12345`, a normal leftmost search
+ /// will return `foo12345` as a match. But an "earliest" search for regex
+ /// engines that support "earliest" semantics will return `foo1` as a
+ /// match, since as soon as the first digit following `foo` is seen, it is
+ /// known to have found a match.
+ ///
+ /// Note that "earliest" semantics generally depend on the regex engine.
+ /// Different regex engines may determine there is a match at different
+ /// points. So there is no guarantee that "earliest" matches will always
+ /// return the same offsets for all regex engines. The "earliest" notion
+ /// is really about when the particular regex engine determines there is
+ /// a match rather than a consistent semantic unto itself. This is often
+ /// useful for implementing "did a match occur or not" predicates, but
+ /// sometimes the offset is useful as well.
+ ///
+ /// This is disabled by default.
+ ///
+ /// # Example
+ ///
+ /// This example shows the difference between "earliest" searching and
+ /// normal searching.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input};
+ ///
+ /// let re = PikeVM::new(r"foo[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ ///
+ /// // A normal search implements greediness like you expect.
+ /// let input = Input::new("foo12345");
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match());
+ ///
+ /// // When 'earliest' is enabled and the regex engine supports
+ /// // it, the search will bail once it knows a match has been
+ /// // found.
+ /// let input = Input::new("foo12345").earliest(true);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..4)), caps.get_match());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn earliest(mut self, yes: bool) -> Input<'h> {
+ self.set_earliest(yes);
+ self
+ }
+
+ /// Set the span for this search configuration.
+ ///
+ /// This is like the [`Input::span`] method, except this mutates the
+ /// span in place.
+ ///
+ /// This routine is generic over how a span is provided. While
+ /// a [`Span`] may be given directly, one may also provide a
+ /// `std::ops::Range<usize>`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the given span does not correspond to valid bounds in
+ /// the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// input.set_span(2..4);
+ /// assert_eq!(2..4, input.get_range());
+ /// ```
+ #[inline]
+ pub fn set_span<S: Into<Span>>(&mut self, span: S) {
+ let span = span.into();
+ assert!(
+ span.end <= self.haystack.len()
+ && span.start <= span.end.wrapping_add(1),
+ "invalid span {:?} for haystack of length {}",
+ span,
+ self.haystack.len(),
+ );
+ self.span = span;
+ }
+
+ /// Set the span for this search configuration given any range.
+ ///
+ /// This is like the [`Input::range`] method, except this mutates the
+ /// span in place.
+ ///
+ /// # Panics
+ ///
+ /// This routine will panic if the given range could not be converted
+ /// to a valid [`Range`]. For example, this would panic when given
+ /// `0..=usize::MAX` since it cannot be represented using a half-open
+ /// interval in terms of `usize`.
+ ///
+ /// This also panics if the given span does not correspond to valid bounds
+ /// in the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// input.set_range(2..=4);
+ /// assert_eq!(2..5, input.get_range());
+ /// ```
+ #[inline]
+ pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) {
+ use core::ops::Bound;
+
+ // It's a little weird to convert ranges into spans, and then spans
+ // back into ranges when we actually slice the haystack. Because
+ // of that process, we always represent everything as a half-open
+ // interval. Therefore, handling things like m..=n is a little awkward.
+ let start = match range.start_bound() {
+ Bound::Included(&i) => i,
+ // Can this case ever happen? Range syntax doesn't support it...
+ Bound::Excluded(&i) => i.checked_add(1).unwrap(),
+ Bound::Unbounded => 0,
+ };
+ let end = match range.end_bound() {
+ Bound::Included(&i) => i.checked_add(1).unwrap(),
+ Bound::Excluded(&i) => i,
+ Bound::Unbounded => self.haystack().len(),
+ };
+ self.set_span(Span { start, end });
+ }
+
+ /// Set the starting offset for the span for this search configuration.
+ ///
+ /// This is a convenience routine for only mutating the start of a span
+ /// without having to set the entire span.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the span resulting from the new start position does not
+ /// correspond to valid bounds in the haystack or the termination of a
+ /// search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// input.set_start(5);
+ /// assert_eq!(5..6, input.get_range());
+ /// ```
+ #[inline]
+ pub fn set_start(&mut self, start: usize) {
+ self.set_span(Span { start, ..self.get_span() });
+ }
+
+ /// Set the ending offset for the span for this search configuration.
+ ///
+ /// This is a convenience routine for only mutating the end of a span
+ /// without having to set the entire span.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the span resulting from the new end position does not
+ /// correspond to valid bounds in the haystack or the termination of a
+ /// search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// input.set_end(5);
+ /// assert_eq!(0..5, input.get_range());
+ /// ```
+ #[inline]
+ pub fn set_end(&mut self, end: usize) {
+ self.set_span(Span { end, ..self.get_span() });
+ }
+
+ /// Set the anchor mode of a search.
+ ///
+ /// This is like [`Input::anchored`], except it mutates the search
+ /// configuration in place.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Anchored, Input, PatternID};
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(Anchored::No, input.get_anchored());
+ ///
+ /// let pid = PatternID::must(5);
+ /// input.set_anchored(Anchored::Pattern(pid));
+ /// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
+ /// ```
+ #[inline]
+ pub fn set_anchored(&mut self, mode: Anchored) {
+ self.anchored = mode;
+ }
+
+ /// Set whether the search should execute in "earliest" mode or not.
+ ///
+ /// This is like [`Input::earliest`], except it mutates the search
+ /// configuration in place.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert!(!input.get_earliest());
+ /// input.set_earliest(true);
+ /// assert!(input.get_earliest());
+ /// ```
+ #[inline]
+ pub fn set_earliest(&mut self, yes: bool) {
+ self.earliest = yes;
+ }
+
+ /// Return a borrow of the underlying haystack as a slice of bytes.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(b"foobar", input.haystack());
+ /// ```
+ #[inline]
+ pub fn haystack(&self) -> &[u8] {
+ self.haystack
+ }
+
+ /// Return the start position of this search.
+ ///
+ /// This is a convenience routine for `search.get_span().start`.
+ ///
+ /// When [`Input::is_done`] is `false`, this is guaranteed to return
+ /// an offset that is less than or equal to [`Input::end`]. Otherwise,
+ /// the offset is one greater than [`Input::end`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(0, input.start());
+ ///
+ /// let input = Input::new("foobar").span(2..4);
+ /// assert_eq!(2, input.start());
+ /// ```
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.get_span().start
+ }
+
+ /// Return the end position of this search.
+ ///
+ /// This is a convenience routine for `search.get_span().end`.
+ ///
+ /// This is guaranteed to return an offset that is a valid exclusive end
+ /// bound for this input's haystack.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(6, input.end());
+ ///
+ /// let input = Input::new("foobar").span(2..4);
+ /// assert_eq!(4, input.end());
+ /// ```
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.get_span().end
+ }
+
+ /// Return the span for this search configuration.
+ ///
+ /// If one was not explicitly set, then the span corresponds to the entire
+ /// range of the haystack.
+ ///
+ /// When [`Input::is_done`] is `false`, the span returned is guaranteed
+ /// to correspond to valid bounds for this input's haystack.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Input, Span};
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(Span { start: 0, end: 6 }, input.get_span());
+ /// ```
+ #[inline]
+ pub fn get_span(&self) -> Span {
+ self.span
+ }
+
+ /// Return the span as a range for this search configuration.
+ ///
+ /// If one was not explicitly set, then the span corresponds to the entire
+ /// range of the haystack.
+ ///
+ /// When [`Input::is_done`] is `false`, the range returned is guaranteed
+ /// to correspond to valid bounds for this input's haystack.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// ```
+ #[inline]
+ pub fn get_range(&self) -> Range<usize> {
+ self.get_span().range()
+ }
+
+ /// Return the anchored mode for this search configuration.
+ ///
+ /// If no anchored mode was set, then it defaults to [`Anchored::No`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Anchored, Input, PatternID};
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(Anchored::No, input.get_anchored());
+ ///
+ /// let pid = PatternID::must(5);
+ /// input.set_anchored(Anchored::Pattern(pid));
+ /// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
+ /// ```
+ #[inline]
+ pub fn get_anchored(&self) -> Anchored {
+ self.anchored
+ }
+
+ /// Return whether this search should execute in "earliest" mode.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert!(!input.get_earliest());
+ /// ```
+ #[inline]
+ pub fn get_earliest(&self) -> bool {
+ self.earliest
+ }
+
+ /// Return true if and only if this search can never return any other
+ /// matches.
+ ///
+ /// This occurs when the start position of this search is greater than the
+ /// end position of the search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert!(!input.is_done());
+ /// input.set_start(6);
+ /// assert!(!input.is_done());
+ /// input.set_start(7);
+ /// assert!(input.is_done());
+ /// ```
+ #[inline]
+ pub fn is_done(&self) -> bool {
+ self.get_span().start > self.get_span().end
+ }
+
+ /// Returns true if and only if the given offset in this search's haystack
+ /// falls on a valid UTF-8 encoded codepoint boundary.
+ ///
+ /// If the haystack is not valid UTF-8, then the behavior of this routine
+ /// is unspecified.
+ ///
+ /// # Example
+ ///
+ /// This shows where codepoint boundaries do and don't exist in valid
+ /// UTF-8.
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let input = Input::new("☃");
+ /// assert!(input.is_char_boundary(0));
+ /// assert!(!input.is_char_boundary(1));
+ /// assert!(!input.is_char_boundary(2));
+ /// assert!(input.is_char_boundary(3));
+ /// assert!(!input.is_char_boundary(4));
+ /// ```
+ #[inline]
+ pub fn is_char_boundary(&self, offset: usize) -> bool {
+ utf8::is_boundary(self.haystack(), offset)
+ }
+}
+
+impl<'h> core::fmt::Debug for Input<'h> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ use crate::util::escape::DebugHaystack;
+
+ f.debug_struct("Input")
+ .field("haystack", &DebugHaystack(self.haystack()))
+ .field("span", &self.span)
+ .field("anchored", &self.anchored)
+ .field("earliest", &self.earliest)
+ .finish()
+ }
+}
+
+impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> {
+ fn from(haystack: &'h H) -> Input<'h> {
+ Input::new(haystack)
+ }
+}
+
+/// A representation of a span reported by a regex engine.
+///
+/// A span corresponds to the starting and ending _byte offsets_ of a
+/// contiguous region of bytes. The starting offset is inclusive while the
+/// ending offset is exclusive. That is, a span is a half-open interval.
+///
+/// A span is used to report the offsets of a match, but it is also used to
+/// convey which region of a haystack should be searched via routines like
+/// [`Input::span`].
+///
+/// This is basically equivalent to a `std::ops::Range<usize>`, except this
+/// type implements `Copy` which makes it more ergonomic to use in the context
+/// of this crate. Like a range, this implements `Index` for `[u8]` and `str`,
+/// and `IndexMut` for `[u8]`. For convenience, this also impls `From<Range>`,
+/// which means things like `Span::from(5..10)` work.
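+///
+/// # Example
+///
+/// A minimal sketch of the conversions and indexing described above:
+///
+/// ```
+/// use regex_automata::Span;
+///
+/// let span = Span::from(5..10);
+/// assert_eq!(5, span.start);
+/// assert_eq!(10, span.end);
+/// // A `Span` can be used to index directly into a `str` or a `[u8]`.
+/// assert_eq!("fghij", &"abcdefghijklmnop"[span]);
+/// ```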
+#[derive(Clone, Copy, Eq, Hash, PartialEq)]
+pub struct Span {
+ /// The start offset of the span, inclusive.
+ pub start: usize,
+ /// The end offset of the span, exclusive.
+ pub end: usize,
+}
+
+impl Span {
+ /// Returns this span as a range.
+ #[inline]
+ pub fn range(&self) -> Range<usize> {
+ Range::from(*self)
+ }
+
+ /// Returns true when this span is empty. That is, when `start >= end`.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.start >= self.end
+ }
+
+ /// Returns the length of this span.
+ ///
+ /// This returns `0` in precisely the cases that `is_empty` returns `true`.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.end.saturating_sub(self.start)
+ }
+
+ /// Returns true when the given offset is contained within this span.
+ ///
+ /// Note that an empty span contains no offsets and will always return
+ /// false.
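+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of the empty-span caveat noted above:
+ ///
+ /// ```
+ /// use regex_automata::Span;
+ ///
+ /// let span = Span { start: 2, end: 5 };
+ /// assert!(span.contains(2));
+ /// assert!(span.contains(4));
+ /// // An empty span contains no offsets, not even its own start.
+ /// let empty = Span { start: 3, end: 3 };
+ /// assert!(!empty.contains(3));
+ /// ```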
+ #[inline]
+ pub fn contains(&self, offset: usize) -> bool {
+ !self.is_empty() && self.start <= offset && offset <= self.end
+ }
+
+ /// Returns a new span with `offset` added to this span's `start` and `end`
+ /// values.
+ #[inline]
+ pub fn offset(&self, offset: usize) -> Span {
+ Span { start: self.start + offset, end: self.end + offset }
+ }
+}
+
+impl core::fmt::Debug for Span {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "{}..{}", self.start, self.end)
+ }
+}
+
+impl core::ops::Index<Span> for [u8] {
+ type Output = [u8];
+
+ #[inline]
+ fn index(&self, index: Span) -> &[u8] {
+ &self[index.range()]
+ }
+}
+
+impl core::ops::IndexMut<Span> for [u8] {
+ #[inline]
+ fn index_mut(&mut self, index: Span) -> &mut [u8] {
+ &mut self[index.range()]
+ }
+}
+
+impl core::ops::Index<Span> for str {
+ type Output = str;
+
+ #[inline]
+ fn index(&self, index: Span) -> &str {
+ &self[index.range()]
+ }
+}
+
+impl From<Range<usize>> for Span {
+ #[inline]
+ fn from(range: Range<usize>) -> Span {
+ Span { start: range.start, end: range.end }
+ }
+}
+
+impl From<Span> for Range<usize> {
+ #[inline]
+ fn from(span: Span) -> Range<usize> {
+ Range { start: span.start, end: span.end }
+ }
+}
+
+impl PartialEq<Range<usize>> for Span {
+ #[inline]
+ fn eq(&self, range: &Range<usize>) -> bool {
+ self.start == range.start && self.end == range.end
+ }
+}
+
+impl PartialEq<Span> for Range<usize> {
+ #[inline]
+ fn eq(&self, span: &Span) -> bool {
+ self.start == span.start && self.end == span.end
+ }
+}
+
+/// A representation of "half" of a match reported by a DFA.
+///
+/// This is called a "half" match because it only includes the end location (or
+/// start location for a reverse search) of a match. This corresponds to the
+/// information that a single DFA scan can report. Getting the other half of
+/// the match requires a second scan with a reversed DFA.
+///
+/// A half match also includes the pattern that matched. The pattern is
+/// identified by an ID, which corresponds to its position (starting from `0`)
+/// relative to other patterns used to construct the corresponding DFA. If only
+/// a single pattern is provided to the DFA, then all matches are guaranteed to
+/// have a pattern ID of `0`.
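+///
+/// # Example
+///
+/// A minimal sketch of constructing and inspecting a half match:
+///
+/// ```
+/// use regex_automata::HalfMatch;
+///
+/// // A match for pattern 0 ending (in a forward search) at offset 10.
+/// let hm = HalfMatch::must(0, 10);
+/// assert_eq!(0, hm.pattern().as_usize());
+/// assert_eq!(10, hm.offset());
+/// ```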
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub struct HalfMatch {
+ /// The pattern ID.
+ pattern: PatternID,
+ /// The offset of the match.
+ ///
+ /// For forward searches, the offset is exclusive. For reverse searches,
+ /// the offset is inclusive.
+ offset: usize,
+}
+
+impl HalfMatch {
+ /// Create a new half match from a pattern ID and a byte offset.
+ #[inline]
+ pub fn new(pattern: PatternID, offset: usize) -> HalfMatch {
+ HalfMatch { pattern, offset }
+ }
+
+ /// Create a new half match from a pattern ID and a byte offset.
+ ///
+ /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a
+ /// [`PatternID`]. This panics if the given `usize` is not representable
+ /// as a `PatternID`.
+ #[inline]
+ pub fn must(pattern: usize, offset: usize) -> HalfMatch {
+ HalfMatch::new(PatternID::new(pattern).unwrap(), offset)
+ }
+
+ /// Returns the ID of the pattern that matched.
+ ///
+ /// The ID of a pattern is derived from the position in which it was
+ /// originally inserted into the corresponding DFA. The first pattern has
+ /// identifier `0`, and each subsequent pattern is `1`, `2` and so on.
+ #[inline]
+ pub fn pattern(&self) -> PatternID {
+ self.pattern
+ }
+
+ /// The position of the match.
+ ///
+ /// If this match was produced by a forward search, then the offset is
+ /// exclusive. If this match was produced by a reverse search, then the
+ /// offset is inclusive.
+ #[inline]
+ pub fn offset(&self) -> usize {
+ self.offset
+ }
+}
+
+/// A representation of a match reported by a regex engine.
+///
+/// A match has two essential pieces of information: the [`PatternID`] that
+/// matches, and the [`Span`] of the match in a haystack.
+///
+/// The pattern is identified by an ID, which corresponds to its position
+/// (starting from `0`) relative to other patterns used to construct the
+/// corresponding regex engine. If only a single pattern is provided, then all
+/// matches are guaranteed to have a pattern ID of `0`.
+///
+/// Every match reported by a regex engine guarantees that its span has a
+/// start offset less than or equal to its end offset.
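+///
+/// # Example
+///
+/// A minimal sketch of the accessors on a match (the span `5..10` is purely
+/// illustrative):
+///
+/// ```
+/// use regex_automata::Match;
+///
+/// let m = Match::must(0, 5..10);
+/// assert_eq!(5..10, m.range());
+/// assert_eq!(5, m.len());
+/// assert!(!m.is_empty());
+/// ```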
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub struct Match {
+ /// The pattern ID.
+ pattern: PatternID,
+ /// The underlying match span.
+ span: Span,
+}
+
+impl Match {
+ /// Create a new match from a pattern ID and a span.
+ ///
+ /// This constructor is generic over how a span is provided. While
+ /// a [`Span`] may be given directly, one may also provide a
+ /// `std::ops::Range<usize>`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `end < start`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to create a match for the first pattern in a regex
+ /// object using convenient range syntax.
+ ///
+ /// ```
+ /// use regex_automata::{Match, PatternID};
+ ///
+ /// let m = Match::new(PatternID::ZERO, 5..10);
+ /// assert_eq!(0, m.pattern().as_usize());
+ /// assert_eq!(5, m.start());
+ /// assert_eq!(10, m.end());
+ /// ```
+ #[inline]
+ pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match {
+ let span: Span = span.into();
+ assert!(span.start <= span.end, "invalid match span");
+ Match { pattern, span }
+ }
+
+ /// Create a new match from a pattern ID and a byte offset span.
+ ///
+ /// This constructor is generic over how a span is provided. While
+ /// a [`Span`] may be given directly, one may also provide a
+ /// `std::ops::Range<usize>`.
+ ///
+ /// This is like [`Match::new`], but accepts a `usize` instead of a
+ /// [`PatternID`]. This panics if the given `usize` is not representable
+ /// as a `PatternID`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `end < start` or if `pattern > PatternID::MAX`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to create a match for the third pattern in a regex
+ /// object using convenient range syntax.
+ ///
+ /// ```
+ /// use regex_automata::Match;
+ ///
+ /// let m = Match::must(3, 5..10);
+ /// assert_eq!(3, m.pattern().as_usize());
+ /// assert_eq!(5, m.start());
+ /// assert_eq!(10, m.end());
+ /// ```
+ #[inline]
+ pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match {
+ Match::new(PatternID::must(pattern), span)
+ }
+
+ /// Returns the ID of the pattern that matched.
+ ///
+ /// The ID of a pattern is derived from the position in which it was
+ /// originally inserted into the corresponding regex engine. The first
+ /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and
+ /// so on.
+ #[inline]
+ pub fn pattern(&self) -> PatternID {
+ self.pattern
+ }
+
+ /// The starting position of the match.
+ ///
+ /// This is a convenience routine for `Match::span().start`.
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.span().start
+ }
+
+ /// The ending position of the match.
+ ///
+ /// This is a convenience routine for `Match::span().end`.
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.span().end
+ }
+
+ /// Returns the match span as a range.
+ ///
+ /// This is a convenience routine for `Match::span().range()`.
+ #[inline]
+ pub fn range(&self) -> core::ops::Range<usize> {
+ self.span().range()
+ }
+
+ /// Returns the span for this match.
+ #[inline]
+ pub fn span(&self) -> Span {
+ self.span
+ }
+
+ /// Returns true when the span in this match is empty.
+ ///
+ /// An empty match can only be returned when the regex itself can match
+ /// the empty string.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.span().is_empty()
+ }
+
+ /// Returns the length of this match.
+ ///
+ /// This returns `0` in precisely the cases that `is_empty` returns `true`.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.span().len()
+ }
+}
+
+/// A set of `PatternID`s.
+///
+/// A set of pattern identifiers is useful for recording which patterns have
+/// matched a particular haystack. A pattern set _only_ includes pattern
+/// identifiers. It does not include offset information.
+///
+/// # Example
+///
+/// This shows basic usage of a set.
+///
+/// ```
+/// use regex_automata::{PatternID, PatternSet};
+///
+/// let pid1 = PatternID::must(5);
+/// let pid2 = PatternID::must(8);
+/// // Create a new empty set.
+/// let mut set = PatternSet::new(10);
+/// // Insert pattern IDs.
+/// set.insert(pid1);
+/// set.insert(pid2);
+/// // Test membership.
+/// assert!(set.contains(pid1));
+/// assert!(set.contains(pid2));
+/// // Get all members.
+/// assert_eq!(
+/// vec![5, 8],
+/// set.iter().map(|p| p.as_usize()).collect::<Vec<usize>>(),
+/// );
+/// // Clear the set.
+/// set.clear();
+/// // Test that it is indeed empty.
+/// assert!(set.is_empty());
+/// ```
+#[cfg(feature = "alloc")]
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct PatternSet {
+ /// The number of patterns set to 'true' in this set.
+ len: usize,
+ /// A map from PatternID to boolean of whether a pattern matches or not.
+ ///
+ /// This should probably be a bitset, but it's unlikely to matter much in
+ /// practice.
+ ///
+ /// The main downside of this representation (and similarly for a bitset)
+ /// is that iteration scales with the capacity of the set instead of
+ /// the length of the set. This doesn't seem likely to be a problem in
+ /// practice.
+ ///
+ /// Another alternative is to just use a 'SparseSet' for this. It does use
+ /// more memory (quite a bit more), but that seems fine I think compared
+ /// to the memory being used by the regex engine. The real hiccup with
+ /// it is that it yields pattern IDs in the order they were inserted.
+ /// Which is actually kind of nice, but at the time of writing, pattern
+ /// IDs are yielded in ascending order in the regex crate RegexSet API.
+ /// If we did change to 'SparseSet', we could provide an additional
+ /// 'iter_match_order' iterator, but keep the ascending order one for
+ /// compatibility.
+ which: alloc::boxed::Box<[bool]>,
+}
+
+#[cfg(feature = "alloc")]
+impl PatternSet {
+ /// Create a new set of pattern identifiers with the given capacity.
+ ///
+ /// The given capacity typically corresponds to (at least) the number of
+ /// patterns in a compiled regex object.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the given capacity exceeds [`PatternID::LIMIT`]. This is
+ /// impossible if you use the `pattern_len()` method as defined on any of
+ /// the regex engines in this crate. Namely, a regex will fail to build by
+ /// returning an error if the number of patterns given to it exceeds the
+ /// limit. Therefore, the number of patterns in a valid regex is always
+ /// a correct capacity to provide here.
+ pub fn new(capacity: usize) -> PatternSet {
+ assert!(
+ capacity <= PatternID::LIMIT,
+ "pattern set capacity exceeds limit of {}",
+ PatternID::LIMIT,
+ );
+ PatternSet {
+ len: 0,
+ which: alloc::vec![false; capacity].into_boxed_slice(),
+ }
+ }
+
+ /// Clear this set such that it contains no pattern IDs.
+ pub fn clear(&mut self) {
+ self.len = 0;
+ for matched in self.which.iter_mut() {
+ *matched = false;
+ }
+ }
+
+ /// Return true if and only if the given pattern identifier is in this set.
+ pub fn contains(&self, pid: PatternID) -> bool {
+ pid.as_usize() < self.capacity() && self.which[pid]
+ }
+
+ /// Insert the given pattern identifier into this set and return `true` if
+ /// the given pattern ID was not previously in this set.
+ ///
+ /// If the pattern identifier is already in this set, then this is a no-op.
+ ///
+ /// Use [`PatternSet::try_insert`] for a fallible version of this routine.
+ ///
+ /// # Panics
+ ///
+ /// This panics if this pattern set has insufficient capacity to
+ /// store the given pattern ID.
+ pub fn insert(&mut self, pid: PatternID) -> bool {
+ self.try_insert(pid)
+ .expect("PatternSet should have sufficient capacity")
+ }
+
+ /// Insert the given pattern identifier into this set and return `true` if
+ /// the given pattern ID was not previously in this set.
+ ///
+ /// If the pattern identifier is already in this set, then this is a no-op.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if this pattern set has insufficient capacity to
+ /// store the given pattern ID.
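+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of both the success and failure cases:
+ ///
+ /// ```
+ /// use regex_automata::{PatternID, PatternSet};
+ ///
+ /// let mut set = PatternSet::new(2);
+ /// // A first insert reports `true`, while inserting a duplicate is a
+ /// // no-op that reports `false`.
+ /// assert!(set.try_insert(PatternID::must(1)).unwrap());
+ /// assert!(!set.try_insert(PatternID::must(1)).unwrap());
+ /// // An ID at or beyond the capacity is rejected with an error.
+ /// assert!(set.try_insert(PatternID::must(2)).is_err());
+ /// ```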
+ pub fn try_insert(
+ &mut self,
+ pid: PatternID,
+ ) -> Result<bool, PatternSetInsertError> {
+ if pid.as_usize() >= self.capacity() {
+ return Err(PatternSetInsertError {
+ attempted: pid,
+ capacity: self.capacity(),
+ });
+ }
+ if self.which[pid] {
+ return Ok(false);
+ }
+ self.len += 1;
+ self.which[pid] = true;
+ Ok(true)
+ }
+
+ /*
+ // This is currently commented out because it is unused and it is unclear
+ // whether it's useful or not. What's the harm in having it? Well, if
+ // we ever wanted to change our representation to a 'SparseSet', then
+ // supporting this method would be a bit tricky. So in order to keep some
+ // API evolution flexibility, we leave it out for now.
+
+ /// Remove the given pattern identifier from this set.
+ ///
+ /// If the pattern identifier was not previously in this set, then this
+ /// does not change the set and returns `false`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `pid` exceeds the capacity of this set.
+ pub fn remove(&mut self, pid: PatternID) -> bool {
+ if !self.which[pid] {
+ return false;
+ }
+ self.len -= 1;
+ self.which[pid] = false;
+ true
+ }
+ */
+
+ /// Return true if and only if this set has no pattern identifiers in it.
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Return true if and only if this set has the maximum number of pattern
+ /// identifiers in the set. This occurs precisely when `PatternSet::len()
+ /// == PatternSet::capacity()`.
+ ///
+ /// This particular property is useful to test because it may allow you to
+ /// stop a search earlier than you otherwise would. Namely, if a search is
+ /// only reporting which patterns match a haystack and if you know all of
+ /// the patterns match at a given point, then there's no new information
+ /// that can be learned by continuing the search. (Because a pattern set
+ /// does not keep track of offset information.)
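+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of the "full set" condition:
+ ///
+ /// ```
+ /// use regex_automata::{PatternID, PatternSet};
+ ///
+ /// let mut set = PatternSet::new(2);
+ /// assert!(!set.is_full());
+ /// set.insert(PatternID::must(0));
+ /// set.insert(PatternID::must(1));
+ /// // Every pattern ID this set can hold is present, so a search that only
+ /// // tracks which patterns match can stop early.
+ /// assert!(set.is_full());
+ /// ```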
+ pub fn is_full(&self) -> bool {
+ self.len() == self.capacity()
+ }
+
+ /// Returns the total number of pattern identifiers in this set.
+ pub fn len(&self) -> usize {
+ self.len
+ }
+
+ /// Returns the total number of pattern identifiers that may be stored
+ /// in this set.
+ ///
+ /// This is guaranteed to be less than or equal to [`PatternID::LIMIT`].
+ ///
+ /// Typically, the capacity of a pattern set matches the number of patterns
+ /// in a regex object with which you are searching.
+ pub fn capacity(&self) -> usize {
+ self.which.len()
+ }
+
+ /// Returns an iterator over all pattern identifiers in this set.
+ ///
+ /// The iterator yields pattern identifiers in ascending order, starting
+ /// at zero.
+ pub fn iter(&self) -> PatternSetIter<'_> {
+ PatternSetIter { it: self.which.iter().enumerate() }
+ }
+}
+
+/// An error that occurs when a `PatternID` failed to insert into a
+/// `PatternSet`.
+///
+/// An insert fails when the given `PatternID` exceeds the configured capacity
+/// of the `PatternSet`.
+///
+/// This error is created by the [`PatternSet::try_insert`] routine.
+#[cfg(feature = "alloc")]
+#[derive(Clone, Debug)]
+pub struct PatternSetInsertError {
+ attempted: PatternID,
+ capacity: usize,
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for PatternSetInsertError {}
+
+#[cfg(feature = "alloc")]
+impl core::fmt::Display for PatternSetInsertError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "failed to insert pattern ID {} into pattern set \
+ with insufficient capacity of {}",
+ self.attempted.as_usize(),
+ self.capacity,
+ )
+ }
+}
+
+/// An iterator over all pattern identifiers in a [`PatternSet`].
+///
+/// The lifetime parameter `'a` refers to the lifetime of the pattern set being
+/// iterated over.
+///
+/// This iterator is created by the [`PatternSet::iter`] method.
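+///
+/// # Example
+///
+/// A minimal sketch of iterating in both directions:
+///
+/// ```
+/// use regex_automata::{PatternID, PatternSet};
+///
+/// let mut set = PatternSet::new(4);
+/// set.insert(PatternID::must(1));
+/// set.insert(PatternID::must(3));
+/// // IDs are yielded in ascending order, and the iterator is double-ended.
+/// let fwd: Vec<usize> = set.iter().map(|p| p.as_usize()).collect();
+/// let rev: Vec<usize> = set.iter().rev().map(|p| p.as_usize()).collect();
+/// assert_eq!(vec![1, 3], fwd);
+/// assert_eq!(vec![3, 1], rev);
+/// ```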
+#[cfg(feature = "alloc")]
+#[derive(Clone, Debug)]
+pub struct PatternSetIter<'a> {
+ it: core::iter::Enumerate<core::slice::Iter<'a, bool>>,
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> Iterator for PatternSetIter<'a> {
+ type Item = PatternID;
+
+ fn next(&mut self) -> Option<PatternID> {
+ while let Some((index, &yes)) = self.it.next() {
+ if yes {
+ // Only valid 'PatternID' values can be inserted into the set
+ // and construction of the set panics if the capacity would
+ // permit storing invalid pattern IDs. Thus, 'yes' is only true
+ // precisely when 'index' corresponds to a valid 'PatternID'.
+ return Some(PatternID::new_unchecked(index));
+ }
+ }
+ None
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> DoubleEndedIterator for PatternSetIter<'a> {
+ fn next_back(&mut self) -> Option<PatternID> {
+ while let Some((index, &yes)) = self.it.next_back() {
+ if yes {
+ // Only valid 'PatternID' values can be inserted into the set
+ // and construction of the set panics if the capacity would
+ // permit storing invalid pattern IDs. Thus, 'yes' is only true
+ // precisely when 'index' corresponds to a valid 'PatternID'.
+ return Some(PatternID::new_unchecked(index));
+ }
+ }
+ None
+ }
+}
+
+/// The type of anchored search to perform.
+///
+/// This is *almost* a boolean option. That is, you can either do an unanchored
+/// search for any pattern in a regex, or you can do an anchored search for any
+/// pattern in a regex.
+///
+/// A third option exists that, assuming the regex engine supports it, permits
+/// you to do an anchored search for a specific pattern.
+///
+/// Note that there is no way to run an unanchored search for a specific
+/// pattern. If you need that, you'll need to build separate regexes for each
+/// pattern.
+///
+/// # Errors
+///
+/// If a regex engine does not support the anchored mode selected, then the
+/// regex engine will return an error. While any non-trivial regex engine
+/// should support at least one of the available anchored modes, there is no
+/// singular mode that is guaranteed to be universally supported. Some regex
+/// engines might only support unanchored searches (DFAs compiled without
+/// anchored starting states) and some regex engines might only support
+/// anchored searches (like the one-pass DFA).
+///
+/// The specific error returned is a [`MatchError`] with a
+/// [`MatchErrorKind::UnsupportedAnchored`] kind. The kind includes the
+/// `Anchored` value given that is unsupported.
+///
+/// Note that regex engines should report "no match" if, for example, an
+/// `Anchored::Pattern` is provided with an invalid pattern ID _but_ anchored
+/// searches for a specific pattern are otherwise supported. This smooths out
+/// behavior such that it's possible to guarantee that an error never occurs
+/// based on how the regex engine is configured. All regex engines in this
+/// crate report "no match" when given an invalid pattern ID, so long as
+/// searching for a valid pattern ID is otherwise supported.
+///
+/// # Example
+///
+/// This example shows how to use the various `Anchored` modes to run a
+/// search. We use the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM)
+/// because it supports all modes unconditionally. Some regex engines, like
+/// the [`onepass::DFA`](crate::dfa::onepass::DFA), cannot support unanchored
+/// searches.
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{
+/// nfa::thompson::pikevm::PikeVM,
+/// Anchored, Input, Match, PatternID,
+/// };
+///
+/// let re = PikeVM::new_many(&[
+/// r"Mrs. \w+",
+/// r"Miss \w+",
+/// r"Mr. \w+",
+/// r"Ms. \w+",
+/// ])?;
+/// let mut cache = re.create_cache();
+/// let hay = "Hello Mr. Springsteen!";
+///
+/// // The default is to do an unanchored search.
+/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay));
+/// // Explicitly ask for an unanchored search. Same as above.
+/// let input = Input::new(hay).anchored(Anchored::No);
+/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
+///
+/// // Now try an anchored search. Since the match doesn't start at the
+/// // beginning of the haystack, no match is found!
+/// let input = Input::new(hay).anchored(Anchored::Yes);
+/// assert_eq!(None, re.find(&mut cache, input));
+///
+/// // We can try an anchored search again, but move the location of where
+/// // we start the search. Note that the offsets reported are still in
+/// // terms of the overall haystack and not relative to where we started
+/// // the search.
+/// let input = Input::new(hay).anchored(Anchored::Yes).range(6..);
+/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
+///
+/// // Now try an anchored search for a specific pattern. We specifically
+/// // choose a pattern that we know doesn't match to prove that the search
+/// // only looks for the pattern we provide.
+/// let input = Input::new(hay)
+/// .anchored(Anchored::Pattern(PatternID::must(1)))
+/// .range(6..);
+/// assert_eq!(None, re.find(&mut cache, input));
+///
+/// // But if we switch it to the pattern that we know matches, then we find
+/// // the match.
+/// let input = Input::new(hay)
+/// .anchored(Anchored::Pattern(PatternID::must(2)))
+/// .range(6..);
+/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum Anchored {
+ /// Run an unanchored search. This means a match may occur anywhere at or
+ /// after the start position of the search.
+ ///
+ /// This search can return a match for any pattern in the regex.
+ No,
+ /// Run an anchored search. This means that a match must begin at the
+ /// start position of the search.
+ ///
+ /// This search can return a match for any pattern in the regex.
+ Yes,
+ /// Run an anchored search for a specific pattern. This means that a match
+ /// must be for the given pattern and must begin at the start position of
+ /// the search.
+ Pattern(PatternID),
+}
+
+impl Anchored {
+ /// Returns true if and only if this anchor mode corresponds to any kind of
+ /// anchored search.
+ ///
+ /// # Example
+ ///
+ /// This example shows that both `Anchored::Yes` and `Anchored::Pattern`
+ /// are considered anchored searches.
+ ///
+ /// ```
+ /// use regex_automata::{Anchored, PatternID};
+ ///
+ /// assert!(!Anchored::No.is_anchored());
+ /// assert!(Anchored::Yes.is_anchored());
+ /// assert!(Anchored::Pattern(PatternID::ZERO).is_anchored());
+ /// ```
+ #[inline]
+ pub fn is_anchored(&self) -> bool {
+ matches!(*self, Anchored::Yes | Anchored::Pattern(_))
+ }
+
+ /// Returns the pattern ID associated with this configuration if it is an
+ /// anchored search for a specific pattern. Otherwise `None` is returned.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Anchored, PatternID};
+ ///
+ /// assert_eq!(None, Anchored::No.pattern());
+ /// assert_eq!(None, Anchored::Yes.pattern());
+ ///
+ /// let pid = PatternID::must(5);
+ /// assert_eq!(Some(pid), Anchored::Pattern(pid).pattern());
+ /// ```
+ #[inline]
+ pub fn pattern(&self) -> Option<PatternID> {
+ match *self {
+ Anchored::Pattern(pid) => Some(pid),
+ _ => None,
+ }
+ }
+}
+
+/// The kind of match semantics to use for a regex pattern.
+///
+/// The default match kind is `LeftmostFirst`, and this corresponds to the
+/// match semantics used by most backtracking engines, such as Perl.
+///
+/// # Leftmost first or "preference order" match semantics
+///
+/// Leftmost-first semantics determine which match to report when there are
+/// multiple paths through a regex that match at the same position. The tie is
+/// essentially broken by how a backtracker would behave. For example, consider
+/// running the regex `foofoofoo|foofoo|foo` on the haystack `foofoo`. In this
+/// case, both the `foofoo` and `foo` branches match at position `0`. So should
+/// the end of the match be `3` or `6`?
+///
+/// A backtracker will conceptually work by trying `foofoofoo` and failing.
+/// Then it will try `foofoo`, find the match and stop there. Thus, the
+/// leftmost-first match position is `6`. This is called "leftmost-first" or
+/// "preference order" because the order of the branches as written in the
+/// regex pattern is what determines how to break the tie.
+///
+/// (Note that leftmost-longest match semantics, which break ties by always
+/// taking the longest matching string, are not currently supported by this
+/// crate. These match semantics tend to be found in POSIX regex engines.)
+///
+/// This example shows how leftmost-first semantics work, and how it even
+/// applies to multi-pattern regexes:
+///
+/// ```
+/// use regex_automata::{
+/// nfa::thompson::pikevm::PikeVM,
+/// Match,
+/// };
+///
+/// let re = PikeVM::new_many(&[
+/// r"foofoofoo",
+/// r"foofoo",
+/// r"foo",
+/// ])?;
+/// let mut cache = re.create_cache();
+/// let got: Vec<Match> = re.find_iter(&mut cache, "foofoo").collect();
+/// let expected = vec![Match::must(1, 0..6)];
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # All matches
+///
+/// The `All` match semantics report any and all matches, and generally will
+/// attempt to match as much as possible. It doesn't respect any sort of match
+/// priority at all, so things like non-greedy matching don't work in this
+/// mode.
+///
+/// The fact that non-greedy matching doesn't work generally makes most forms
+/// of unanchored non-overlapping searches have unintuitive behavior. Namely,
+/// unanchored searches behave as if there is a `(?s-u:.)*?` prefix at the
+/// beginning of the pattern, which is specifically non-greedy. Since it will
+/// be treated as greedy in `All` match semantics, this generally means that
+/// it will first attempt to consume all of the haystack and is likely to wind
+/// up skipping matches.
+///
+/// Generally speaking, `All` should only be used in two circumstances:
+///
+/// * When running an anchored search and there is a desire to match as much as
+/// possible. For example, when building a reverse regex matcher to find the
+/// start of a match after finding the end. In this case, the reverse search
+/// is anchored to the end of the match found by the forward search.
+/// * When running overlapping searches. Since `All` encodes all possible
+/// matches, this is generally what you want for an overlapping search. If you
+/// try to use leftmost-first in an overlapping search, it is likely to produce
+/// counter-intuitive results since leftmost-first specifically excludes some
+/// matches from its underlying finite state machine.
+///
+/// This example demonstrates the counter-intuitive behavior of `All` semantics
+/// when using a standard leftmost unanchored search:
+///
+/// ```
+/// use regex_automata::{
+/// nfa::thompson::pikevm::PikeVM,
+/// Match, MatchKind,
+/// };
+///
+/// let re = PikeVM::builder()
+/// .configure(PikeVM::config().match_kind(MatchKind::All))
+/// .build("foo")?;
+/// let hay = "first foo second foo wat";
+/// let mut cache = re.create_cache();
+/// let got: Vec<Match> = re.find_iter(&mut cache, hay).collect();
+/// // Notice that it completely skips the first 'foo'!
+/// let expected = vec![Match::must(0, 17..20)];
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// This second example shows how `All` semantics are useful for an overlapping
+/// search. Note that we use lower level lazy DFA APIs here since the NFA
+/// engines only currently support a very limited form of overlapping search.
+///
+/// ```
+/// use regex_automata::{
+/// hybrid::dfa::{DFA, OverlappingState},
+/// HalfMatch, Input, MatchKind,
+/// };
+///
+/// let re = DFA::builder()
+/// // If we didn't set 'All' semantics here, then the regex would only
+/// // match 'foo' at offset 3 and nothing else. Why? Because the state
+/// // machine implements preference order and knows that the 'foofoo' and
+/// // 'foofoofoo' branches can never match since 'foo' will always match
+/// // when they match and take priority.
+/// .configure(DFA::config().match_kind(MatchKind::All))
+/// .build(r"foo|foofoo|foofoofoo")?;
+/// let mut cache = re.create_cache();
+/// let mut state = OverlappingState::start();
+/// let input = Input::new("foofoofoo");
+/// let mut got = vec![];
+/// loop {
+/// re.try_search_overlapping_fwd(&mut cache, &input, &mut state)?;
+/// let m = match state.get_match() {
+/// None => break,
+/// Some(m) => m,
+/// };
+/// got.push(m);
+/// }
+/// let expected = vec![
+/// HalfMatch::must(0, 3),
+/// HalfMatch::must(0, 6),
+/// HalfMatch::must(0, 9),
+/// ];
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[non_exhaustive]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum MatchKind {
+ /// Report all possible matches.
+ All,
+ /// Report only the leftmost matches. When multiple leftmost matches exist,
+ /// report the match corresponding to the part of the regex that appears
+ /// first in the syntax.
+ LeftmostFirst,
+ // There is prior art in RE2 that shows that we should be able to add
+ // LeftmostLongest too. The tricky part of it is supporting ungreedy
+ // repetitions. Instead of treating all NFA states as having equivalent
+ // priority (as in 'All') or treating all NFA states as having distinct
+ // priority based on order (as in 'LeftmostFirst'), we instead group NFA
+ // states into sets, and treat members of each set as having equivalent
+ // priority, but having greater priority than all following members
+ // of different sets.
+ //
+ // However, it's not clear whether it's really worth adding this. After
+ // all, leftmost-longest can be emulated when using literals by using
+ // leftmost-first and sorting the literals by length in descending order.
+ // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will
+ // always match `a` in `ab` when using leftmost-first, but leftmost-longest
+ // would match `ab`.
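+    //
+    // As a rough sketch of the '\w|\w\w' behavior described above (written
+    // here as a comment rather than a doc test), using the PikeVM with its
+    // default leftmost-first semantics:
+    //
+    //     use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+    //
+    //     let re = PikeVM::new(r"\w|\w\w")?;
+    //     let mut cache = re.create_cache();
+    //     // The '\w' branch is preferred, so only 'a' is matched, not 'ab'.
+    //     assert_eq!(Some(Match::must(0, 0..1)), re.find(&mut cache, "ab"));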
+}
+
+impl MatchKind {
+ #[cfg(feature = "alloc")]
+ pub(crate) fn continue_past_first_match(&self) -> bool {
+ *self == MatchKind::All
+ }
+}
+
+impl Default for MatchKind {
+ fn default() -> MatchKind {
+ MatchKind::LeftmostFirst
+ }
+}
+
+/// An error indicating that a search stopped before reporting whether a
+/// match exists or not.
+///
+/// To be very clear, this error type implies that one cannot assume that no
+/// matches occur, since the search stopped before completing. That is, if
+/// you're looking for information about where a search determined that no
+/// match can occur, then this error type does *not* give you that. (Indeed, at
+/// the time of writing, if you need such a thing, you have to write your own
+/// search routine.)
+///
+/// Normally, when one searches for something, the response is either an
+/// affirmative "it was found at this location" or a negative "not found at
+/// all." However, in some cases, a regex engine can be configured to stop its
+/// search before concluding whether a match exists or not. When this happens,
+/// it may be important for the caller to know why the regex engine gave up and
+/// where in the input it gave up. This error type exposes the 'why' and the
+/// 'where.'
+///
+/// For example, the DFAs provided by this library generally cannot correctly
+/// implement Unicode word boundaries. Instead, they provide an option to
+/// eagerly support them on ASCII text (since Unicode word boundaries are
+/// equivalent to ASCII word boundaries when searching ASCII text), but will
+/// "give up" if a non-ASCII byte is seen. In such cases, one is usually
+/// required to either report the failure to the caller (unergonomic) or
+/// otherwise fall back to some other regex engine (ergonomic, but potentially
+/// costly).
+///
+/// More generally, some regex engines offer the ability for callers to specify
+/// certain bytes that will trigger the regex engine to automatically quit if
+/// they are seen.
+///
+/// Still yet, there may be other reasons for a failed match. For example,
+/// the hybrid DFA provided by this crate can be configured to give up if it
+/// believes that it is not efficient. This in turn permits callers to choose a
+/// different regex engine.
+///
+/// (Note that DFAs are configured by default to never quit or give up in this
+/// fashion. For example, by default, a DFA will fail to build if the regex
+/// pattern contains a Unicode word boundary. One needs to opt into the "quit"
+/// behavior via options, like
+/// [`hybrid::dfa::Config::unicode_word_boundary`](crate::hybrid::dfa::Config::unicode_word_boundary).)
+///
+/// There are a couple of other ways a search can fail. For example, when
+/// using the
+/// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker)
+/// with a haystack that is too long, or trying to run an unanchored search
+/// with a [one-pass DFA](crate::dfa::onepass).
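+///
+/// # Example
+///
+/// A minimal sketch of one way this error can arise: the lazy DFA below is
+/// configured with a quit byte, so the search stops with a "quit" error as
+/// soon as that byte is seen. (The pattern and offsets are illustrative.)
+///
+/// ```
+/// use regex_automata::{hybrid::dfa::DFA, Input, MatchError};
+///
+/// let dfa = DFA::builder()
+///     // Treat '\n' as a quit byte: the search gives up upon seeing it.
+///     .configure(DFA::config().quit(b'\n', true))
+///     .build(r"foo\p{any}+bar")?;
+/// let mut cache = dfa.create_cache();
+///
+/// let haystack = "foo\nbar";
+/// // The '\n' at offset 3 trips the quit condition.
+/// let expected = MatchError::quit(b'\n', 3);
+/// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack));
+/// assert_eq!(Err(expected), got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```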
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct MatchError(
+ #[cfg(feature = "alloc")] alloc::boxed::Box<MatchErrorKind>,
+ #[cfg(not(feature = "alloc"))] MatchErrorKind,
+);
+
+impl MatchError {
+ /// Create a new error value with the given kind.
+ ///
+ /// This is a more verbose version of the kind-specific constructors,
+ /// e.g., `MatchError::quit`.
+ pub fn new(kind: MatchErrorKind) -> MatchError {
+ #[cfg(feature = "alloc")]
+ {
+ MatchError(alloc::boxed::Box::new(kind))
+ }
+ #[cfg(not(feature = "alloc"))]
+ {
+ MatchError(kind)
+ }
+ }
+
+ /// Returns a reference to the underlying error kind.
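+    ///
+    /// A small sketch of inspecting the kind (the specific error constructed
+    /// here is just for illustration):
+    ///
+    /// ```
+    /// use regex_automata::{MatchError, MatchErrorKind};
+    ///
+    /// let err = MatchError::gave_up(7);
+    /// assert_eq!(err.kind(), &MatchErrorKind::GaveUp { offset: 7 });
+    /// ```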
+ pub fn kind(&self) -> &MatchErrorKind {
+ &self.0
+ }
+
+ /// Create a new "quit" error. The given `byte` corresponds to the value
+ /// that tripped a search's quit condition, and `offset` corresponds to the
+ /// location in the haystack at which the search quit.
+ ///
+ /// This is the same as calling `MatchError::new` with a
+ /// [`MatchErrorKind::Quit`] kind.
+ pub fn quit(byte: u8, offset: usize) -> MatchError {
+ MatchError::new(MatchErrorKind::Quit { byte, offset })
+ }
+
+ /// Create a new "gave up" error. The given `offset` corresponds to the
+ /// location in the haystack at which the search gave up.
+ ///
+ /// This is the same as calling `MatchError::new` with a
+ /// [`MatchErrorKind::GaveUp`] kind.
+ pub fn gave_up(offset: usize) -> MatchError {
+ MatchError::new(MatchErrorKind::GaveUp { offset })
+ }
+
+ /// Create a new "haystack too long" error. The given `len` corresponds to
+ /// the length of the haystack that was problematic.
+ ///
+ /// This is the same as calling `MatchError::new` with a
+ /// [`MatchErrorKind::HaystackTooLong`] kind.
+ pub fn haystack_too_long(len: usize) -> MatchError {
+ MatchError::new(MatchErrorKind::HaystackTooLong { len })
+ }
+
+ /// Create a new "unsupported anchored" error. This occurs when the caller
+ /// requests a search with an anchor mode that is not supported by the
+ /// regex engine.
+ ///
+ /// This is the same as calling `MatchError::new` with a
+ /// [`MatchErrorKind::UnsupportedAnchored`] kind.
+ pub fn unsupported_anchored(mode: Anchored) -> MatchError {
+ MatchError::new(MatchErrorKind::UnsupportedAnchored { mode })
+ }
+}
+
+/// The underlying kind of a [`MatchError`].
+///
+/// This is a **non-exhaustive** enum. That means new variants may be added in
+/// a semver-compatible release.
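+///
+/// Because of that, matching on the kind requires a wildcard arm. A minimal
+/// sketch (the `describe` helper is hypothetical, purely for illustration):
+///
+/// ```
+/// use regex_automata::{MatchError, MatchErrorKind};
+///
+/// // Hypothetical helper that turns an error into a short description.
+/// fn describe(err: &MatchError) -> String {
+///     match *err.kind() {
+///         MatchErrorKind::Quit { byte, offset } => {
+///             format!("quit byte 0x{:02X} at offset {}", byte, offset)
+///         }
+///         MatchErrorKind::GaveUp { offset } => {
+///             format!("gave up at offset {}", offset)
+///         }
+///         // Required: 'MatchErrorKind' is non-exhaustive, so new variants
+///         // may appear in the future.
+///         _ => "search failed".to_string(),
+///     }
+/// }
+///
+/// assert_eq!("gave up at offset 3", describe(&MatchError::gave_up(3)));
+/// ```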
+#[non_exhaustive]
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum MatchErrorKind {
+ /// The search saw a "quit" byte at which it was instructed to stop
+ /// searching.
+ Quit {
+ /// The "quit" byte that was observed that caused the search to stop.
+ byte: u8,
+ /// The offset at which the quit byte was observed.
+ offset: usize,
+ },
+ /// The search, based on heuristics, determined that it would be better
+ /// to stop, typically to provide the caller an opportunity to use an
+ /// alternative regex engine.
+ ///
+ /// Currently, the only way for this to occur is via the lazy DFA and
+ /// only when it is configured to do so (it will not return this error by
+ /// default).
+ GaveUp {
+ /// The offset at which the search stopped. This corresponds to the
+ /// position immediately following the last byte scanned.
+ offset: usize,
+ },
+ /// This error occurs if the haystack given to the regex engine was too
+ /// long to be searched. This occurs, for example, with regex engines
+ /// like the bounded backtracker that have a configurable fixed amount of
+ /// capacity that is tied to the length of the haystack. Anything beyond
+ /// that configured limit will result in an error at search time.
+ HaystackTooLong {
+ /// The length of the haystack that exceeded the limit.
+ len: usize,
+ },
+ /// An error indicating that a particular type of anchored search was
+ /// requested, but that the regex engine does not support it.
+ ///
+ /// Note that this error should not be returned by a regex engine simply
+ /// because the pattern ID is invalid (i.e., equal to or exceeds the number
+ /// of patterns in the regex). In that case, the regex engine should report
+ /// a non-match.
+ UnsupportedAnchored {
+ /// The anchored mode given that is unsupported.
+ mode: Anchored,
+ },
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for MatchError {}
+
+impl core::fmt::Display for MatchError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ match *self.kind() {
+ MatchErrorKind::Quit { byte, offset } => write!(
+ f,
+ "quit search after observing byte {:?} at offset {}",
+ DebugByte(byte),
+ offset,
+ ),
+ MatchErrorKind::GaveUp { offset } => {
+ write!(f, "gave up searching at offset {}", offset)
+ }
+ MatchErrorKind::HaystackTooLong { len } => {
+ write!(f, "haystack of length {} is too long", len)
+ }
+ MatchErrorKind::UnsupportedAnchored { mode: Anchored::Yes } => {
+ write!(f, "anchored searches are not supported or enabled")
+ }
+ MatchErrorKind::UnsupportedAnchored { mode: Anchored::No } => {
+ write!(f, "unanchored searches are not supported or enabled")
+ }
+ MatchErrorKind::UnsupportedAnchored {
+ mode: Anchored::Pattern(pid),
+ } => {
+ write!(
+ f,
+ "anchored searches for a specific pattern ({}) are \
+ not supported or enabled",
+ pid.as_usize(),
+ )
+ }
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ // We test that our 'MatchError' type is the size we expect. This isn't an
+ // API guarantee, but if the size increases, we really want to make sure we
+ // decide to do that intentionally. So this should be a speed bump. And in
+ // general, we should not increase the size without a very good reason.
+ //
+    // Why? Because low-level search APIs return Result<.., MatchError>. When
+    // MatchError gets bigger, so too does the Result type.
+ //
+ // Now, when 'alloc' is enabled, we do box the error, which de-emphasizes
+ // the importance of keeping a small error type. But without 'alloc', we
+ // still want things to be small.
+ #[test]
+ fn match_error_size() {
+ let expected_size = if cfg!(feature = "alloc") {
+ core::mem::size_of::<usize>()
+ } else {
+ 2 * core::mem::size_of::<usize>()
+ };
+ assert_eq!(expected_size, core::mem::size_of::<MatchError>());
+ }
+
+ // Same as above, but for the underlying match error kind.
+ #[cfg(target_pointer_width = "64")]
+ #[test]
+ fn match_error_kind_size() {
+ let expected_size = 2 * core::mem::size_of::<usize>();
+ assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>());
+ }
+
+ #[cfg(target_pointer_width = "32")]
+ #[test]
+ fn match_error_kind_size() {
+ let expected_size = 3 * core::mem::size_of::<usize>();
+ assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>());
+ }
+}