summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex-automata/src/dfa/regex.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/regex-automata/src/dfa/regex.rs')
-rw-r--r--third_party/rust/regex-automata/src/dfa/regex.rs871
1 files changed, 871 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/dfa/regex.rs b/third_party/rust/regex-automata/src/dfa/regex.rs
new file mode 100644
index 0000000000..f39c1c055c
--- /dev/null
+++ b/third_party/rust/regex-automata/src/dfa/regex.rs
@@ -0,0 +1,871 @@
+/*!
+A DFA-backed `Regex`.
+
+This module provides [`Regex`], which is defined generically over the
+[`Automaton`] trait. A `Regex` implements convenience routines you might have
+come to expect, such as finding the start/end of a match and iterating over
+all non-overlapping matches. This `Regex` type is limited in its capabilities
+to what a DFA can provide. Therefore, APIs involving capturing groups, for
+example, are not provided.
+
+Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
+finds the end offset of a match, where as the other is a "reverse" DFA that
+find the start offset of a match.
+
+See the [parent module](crate::dfa) for examples.
+*/
+
+#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
+#[cfg(feature = "dfa-build")]
+use crate::dfa::dense::BuildError;
+use crate::{
+ dfa::{automaton::Automaton, dense},
+ util::{iter, search::Input},
+ Anchored, Match, MatchError,
+};
+#[cfg(feature = "alloc")]
+use crate::{
+ dfa::{sparse, StartKind},
+ util::search::MatchKind,
+};
+
+// When the alloc feature is enabled, the regex type sets its A type parameter
+// to default to an owned dense DFA. But without alloc, we set no default. This
+// makes things a lot more convenient in the common case, since writing out the
+// DFA types is pretty annoying.
+//
+// Since we have two different definitions but only want to write one doc
+// string, we use a macro to capture the doc and other attributes once and then
+// repeat them for each definition.
+macro_rules! define_regex_type {
+ ($(#[$doc:meta])*) => {
+ #[cfg(feature = "alloc")]
+ $(#[$doc])*
+ pub struct Regex<A = dense::OwnedDFA> {
+ forward: A,
+ reverse: A,
+ }
+
+ #[cfg(not(feature = "alloc"))]
+ $(#[$doc])*
+ pub struct Regex<A> {
+ forward: A,
+ reverse: A,
+ }
+ };
+}
+
+define_regex_type!(
+ /// A regular expression that uses deterministic finite automata for fast
+ /// searching.
+ ///
+ /// A regular expression is comprised of two DFAs, a "forward" DFA and a
+ /// "reverse" DFA. The forward DFA is responsible for detecting the end of
+ /// a match while the reverse DFA is responsible for detecting the start
+ /// of a match. Thus, in order to find the bounds of any given match, a
+ /// forward search must first be run followed by a reverse search. A match
+ /// found by the forward DFA guarantees that the reverse DFA will also find
+ /// a match.
+ ///
+ /// The type of the DFA used by a `Regex` corresponds to the `A` type
+ /// parameter, which must satisfy the [`Automaton`] trait. Typically,
+ /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a
+ /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more
+ /// memory but search faster, while sparse DFAs use less memory but search
+ /// more slowly.
+ ///
+ /// # Crate features
+ ///
+ /// Note that despite what the documentation auto-generates, the _only_
+ /// crate feature needed to use this type is `dfa-search`. You do _not_
+ /// need to enable the `alloc` feature.
+ ///
+ /// By default, a regex's automaton type parameter is set to
+ /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
+ /// in-memory work loads, this is the most convenient type that gives the
+ /// best search performance. When the `alloc` feature is disabled, no
+ /// default type is used.
+ ///
+ /// # When should I use this?
+ ///
+ /// Generally speaking, if you can afford the overhead of building a full
+ /// DFA for your regex, and you don't need things like capturing groups,
+ /// then this is a good choice if you're looking to optimize for matching
+ /// speed. Note however that its speed may be worse than a general purpose
+ /// regex engine if you don't provide a [`dense::Config::prefilter`] to the
+ /// underlying DFA.
+ ///
+ /// # Sparse DFAs
+ ///
+ /// Since a `Regex` is generic over the [`Automaton`] trait, it can be
+ /// used with any kind of DFA. While this crate constructs dense DFAs by
+ /// default, it is easy enough to build corresponding sparse DFAs, and then
+ /// build a regex from them:
+ ///
+ /// ```
+ /// use regex_automata::dfa::regex::Regex;
+ ///
+ /// // First, build a regex that uses dense DFAs.
+ /// let dense_re = Regex::new("foo[0-9]+")?;
+ ///
+ /// // Second, build sparse DFAs from the forward and reverse dense DFAs.
+ /// let fwd = dense_re.forward().to_sparse()?;
+ /// let rev = dense_re.reverse().to_sparse()?;
+ ///
+ /// // Third, build a new regex from the constituent sparse DFAs.
+ /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev);
+ ///
+ /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
+ /// assert_eq!(true, sparse_re.is_match(b"foo123"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Alternatively, one can use a [`Builder`] to construct a sparse DFA
+ /// more succinctly. (Note though that dense DFAs are still constructed
+ /// first internally, and then converted to sparse DFAs, as in the example
+ /// above.)
+ ///
+ /// ```
+ /// use regex_automata::dfa::regex::Regex;
+ ///
+ /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?;
+ /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
+ /// assert!(sparse_re.is_match(b"foo123"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Fallibility
+ ///
+ /// Most of the search routines defined on this type will _panic_ when the
+ /// underlying search fails. This might be because the DFA gave up because
+ /// it saw a quit byte, whether configured explicitly or via heuristic
+ /// Unicode word boundary support, although neither are enabled by default.
+ /// Or it might fail because an invalid `Input` configuration is given,
+ /// for example, with an unsupported [`Anchored`] mode.
+ ///
+ /// If you need to handle these error cases instead of allowing them to
+ /// trigger a panic, then the lower level [`Regex::try_search`] provides
+ /// a fallible API that never panics.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to cause a search to terminate if it sees a
+ /// `\n` byte, and handle the error returned. This could be useful if, for
+ /// example, you wanted to prevent a user supplied pattern from matching
+ /// across a line boundary.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError};
+ ///
+ /// let re = Regex::builder()
+ /// .dense(dfa::dense::Config::new().quit(b'\n', true))
+ /// .build(r"foo\p{any}+bar")?;
+ ///
+ /// let input = Input::new("foo\nbar");
+ /// // Normally this would produce a match, since \p{any} contains '\n'.
+ /// // But since we instructed the automaton to enter a quit state if a
+ /// // '\n' is observed, this produces a match error instead.
+ /// let expected = MatchError::quit(b'\n', 3);
+ /// let got = re.try_search(&input).unwrap_err();
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[derive(Clone, Debug)]
+);
+
+#[cfg(all(feature = "syntax", feature = "dfa-build"))]
+impl Regex {
+ /// Parse the given regular expression using the default configuration and
+ /// return the corresponding regex.
+ ///
+ /// If you want a non-default configuration, then use the [`Builder`] to
+ /// set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Match, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new("foo[0-9]+bar")?;
+ /// assert_eq!(
+ /// Some(Match::must(0, 3..14)),
+ /// re.find(b"zzzfoo12345barzzz"),
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new(pattern: &str) -> Result<Regex, BuildError> {
+ Builder::new().build(pattern)
+ }
+
+ /// Like `new`, but parses multiple patterns into a single "regex set."
+ /// This similarly uses the default regex configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Match, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
+ ///
+ /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
+ /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
+ /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
+ /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
+ /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
+ /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
+ /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
+ /// assert_eq!(None, it.next());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_many<P: AsRef<str>>(
+ patterns: &[P],
+ ) -> Result<Regex, BuildError> {
+ Builder::new().build_many(patterns)
+ }
+}
+
+#[cfg(all(feature = "syntax", feature = "dfa-build"))]
+impl Regex<sparse::DFA<Vec<u8>>> {
+ /// Parse the given regular expression using the default configuration,
+ /// except using sparse DFAs, and return the corresponding regex.
+ ///
+ /// If you want a non-default configuration, then use the [`Builder`] to
+ /// set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Match, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new_sparse("foo[0-9]+bar")?;
+ /// assert_eq!(
+ /// Some(Match::must(0, 3..14)),
+ /// re.find(b"zzzfoo12345barzzz"),
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_sparse(
+ pattern: &str,
+ ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
+ Builder::new().build_sparse(pattern)
+ }
+
+ /// Like `new`, but parses multiple patterns into a single "regex set"
+ /// using sparse DFAs. This otherwise similarly uses the default regex
+ /// configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Match, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;
+ ///
+ /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
+ /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
+ /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
+ /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
+ /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
+ /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
+ /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
+ /// assert_eq!(None, it.next());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_many_sparse<P: AsRef<str>>(
+ patterns: &[P],
+ ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
+ Builder::new().build_many_sparse(patterns)
+ }
+}
+
+/// Convenience routines for regex construction.
+impl Regex<dense::DFA<&'static [u32]>> {
+ /// Return a builder for configuring the construction of a `Regex`.
+ ///
+ /// This is a convenience routine to avoid needing to import the
+ /// [`Builder`] type in common cases.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use the builder to disable UTF-8 mode
+ /// everywhere.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// dfa::regex::Regex, nfa::thompson, util::syntax, Match,
+ /// };
+ ///
+ /// let re = Regex::builder()
+ /// .syntax(syntax::Config::new().utf8(false))
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build(r"foo(?-u:[^b])ar.*")?;
+ /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+ /// let expected = Some(Match::must(0, 1..9));
+ /// let got = re.find(haystack);
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+}
+
+/// Standard search routines for finding and iterating over matches.
+impl<A: Automaton> Regex<A> {
+ /// Returns true if and only if this regex matches the given haystack.
+ ///
+ /// This routine may short circuit if it knows that scanning future input
+ /// will never lead to a different result. In particular, if the underlying
+ /// DFA enters a match state or a dead state, then this routine will return
+ /// `true` or `false`, respectively, without inspecting any future input.
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
+ ///
+ /// Use [`Regex::try_search`] if you want to handle these error conditions.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::dfa::regex::Regex;
+ ///
+ /// let re = Regex::new("foo[0-9]+bar")?;
+ /// assert_eq!(true, re.is_match("foo12345bar"));
+ /// assert_eq!(false, re.is_match("foobar"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
+ // Not only can we do an "earliest" search, but we can avoid doing a
+ // reverse scan too.
+ let input = input.into().earliest(true);
+ self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap()
+ }
+
+ /// Returns the start and end offset of the leftmost match. If no match
+ /// exists, then `None` is returned.
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
+ ///
+ /// Use [`Regex::try_search`] if you want to handle these error conditions.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Match, dfa::regex::Regex};
+ ///
+ /// // Greediness is applied appropriately.
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz"));
+ ///
+ /// // Even though a match is found after reading the first byte (`a`),
+ /// // the default leftmost-first match semantics demand that we find the
+ /// // earliest match that prefers earlier parts of the pattern over latter
+ /// // parts.
+ /// let re = Regex::new("abc|a")?;
+ /// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
+ self.try_search(&input.into()).unwrap()
+ }
+
+ /// Returns an iterator over all non-overlapping leftmost matches in the
+ /// given bytes. If no match exists, then the iterator yields no elements.
+ ///
+ /// This corresponds to the "standard" regex search iterator.
+ ///
+ /// # Panics
+ ///
+ /// If the search returns an error during iteration, then iteration
+ /// panics. See [`Regex::find`] for the panic conditions.
+ ///
+ /// Use [`Regex::try_search`] with
+ /// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to
+ /// handle these error conditions.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Match, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// let text = "foo1 foo12 foo123";
+ /// let matches: Vec<Match> = re.find_iter(text).collect();
+ /// assert_eq!(matches, vec![
+ /// Match::must(0, 0..4),
+ /// Match::must(0, 5..10),
+ /// Match::must(0, 11..17),
+ /// ]);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_iter<'r, 'h, I: Into<Input<'h>>>(
+ &'r self,
+ input: I,
+ ) -> FindMatches<'r, 'h, A> {
+ let it = iter::Searcher::new(input.into());
+ FindMatches { re: self, it }
+ }
+}
+
+/// Lower level fallible search routines that permit controlling where the
+/// search starts and ends in a particular sequence.
+impl<A: Automaton> Regex<A> {
+ /// Returns the start and end offset of the leftmost match. If no match
+ /// exists, then `None` is returned.
+ ///
+ /// This is like [`Regex::find`] but with two differences:
+ ///
+ /// 1. It is not generic over `Into<Input>` and instead accepts a
+ /// `&Input`. This permits reusing the same `Input` for multiple searches
+ /// without needing to create a new one. This _may_ help with latency.
+ /// 2. It returns an error if the search could not complete where as
+ /// [`Regex::find`] will panic.
+ ///
+ /// # Errors
+ ///
+ /// This routine errors if the search could not complete. This can occur
+ /// in the following circumstances:
+ ///
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search returns an error, callers cannot know whether a match
+ /// exists or not.
+ #[inline]
+ pub fn try_search(
+ &self,
+ input: &Input<'_>,
+ ) -> Result<Option<Match>, MatchError> {
+ let (fwd, rev) = (self.forward(), self.reverse());
+ let end = match fwd.try_search_fwd(input)? {
+ None => return Ok(None),
+ Some(end) => end,
+ };
+ // This special cases an empty match at the beginning of the search. If
+ // our end matches our start, then since a reverse DFA can't match past
+ // the start, it must follow that our starting position is also our end
+ // position. So short circuit and skip the reverse search.
+ if input.start() == end.offset() {
+ return Ok(Some(Match::new(
+ end.pattern(),
+ end.offset()..end.offset(),
+ )));
+ }
+ // We can also skip the reverse search if we know our search was
+ // anchored. This occurs either when the input config is anchored or
+ // when we know the regex itself is anchored. In this case, we know the
+ // start of the match, if one is found, must be the start of the
+ // search.
+ if self.is_anchored(input) {
+ return Ok(Some(Match::new(
+ end.pattern(),
+ input.start()..end.offset(),
+ )));
+ }
+ // N.B. I have tentatively convinced myself that it isn't necessary
+ // to specify the specific pattern for the reverse search since the
+ // reverse search will always find the same pattern to match as the
+ // forward search. But I lack a rigorous proof. Why not just provide
+ // the pattern anyway? Well, if it is needed, then leaving it out
+ // gives us a chance to find a witness. (Also, if we don't need to
+ // specify the pattern, then we don't need to build the reverse DFA
+ // with 'starts_for_each_pattern' enabled.)
+ //
+ // We also need to be careful to disable 'earliest' for the reverse
+ // search, since it could be enabled for the forward search. In the
+ // reverse case, to satisfy "leftmost" criteria, we need to match
+ // as much as we can. We also need to be careful to make the search
+ // anchored. We don't want the reverse search to report any matches
+ // other than the one beginning at the end of our forward search.
+ let revsearch = input
+ .clone()
+ .span(input.start()..end.offset())
+ .anchored(Anchored::Yes)
+ .earliest(false);
+ let start = rev
+ .try_search_rev(&revsearch)?
+ .expect("reverse search must match if forward search does");
+ assert_eq!(
+ start.pattern(),
+ end.pattern(),
+ "forward and reverse search must match same pattern",
+ );
+ assert!(start.offset() <= end.offset());
+ Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
+ }
+
+ /// Returns true if either the given input specifies an anchored search
+ /// or if the underlying DFA is always anchored.
+ fn is_anchored(&self, input: &Input<'_>) -> bool {
+ match input.get_anchored() {
+ Anchored::No => self.forward().is_always_start_anchored(),
+ Anchored::Yes | Anchored::Pattern(_) => true,
+ }
+ }
+}
+
+/// Non-search APIs for querying information about the regex and setting a
+/// prefilter.
+impl<A: Automaton> Regex<A> {
+ /// Return the underlying DFA responsible for forward matching.
+ ///
+ /// This is useful for accessing the underlying DFA and converting it to
+ /// some other format or size. See the [`Builder::build_from_dfas`] docs
+ /// for an example of where this might be useful.
+ pub fn forward(&self) -> &A {
+ &self.forward
+ }
+
+ /// Return the underlying DFA responsible for reverse matching.
+ ///
+ /// This is useful for accessing the underlying DFA and converting it to
+ /// some other format or size. See the [`Builder::build_from_dfas`] docs
+ /// for an example of where this might be useful.
+ pub fn reverse(&self) -> &A {
+ &self.reverse
+ }
+
+ /// Returns the total number of patterns matched by this regex.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::dfa::regex::Regex;
+ ///
+ /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
+ /// assert_eq!(3, re.pattern_len());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn pattern_len(&self) -> usize {
+ assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
+ self.forward().pattern_len()
+ }
+}
+
+/// An iterator over all non-overlapping matches for an infallible search.
+///
+/// The iterator yields a [`Match`] value until no more matches could be found.
+/// If the underlying regex engine returns an error, then a panic occurs.
+///
+/// The type parameters are as follows:
+///
+/// * `A` represents the type of the underlying DFA that implements the
+/// [`Automaton`] trait.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'h` represents the lifetime of the haystack being searched.
+/// * `'r` represents the lifetime of the regex object itself.
+///
+/// This iterator can be created with the [`Regex::find_iter`] method.
+#[derive(Debug)]
+pub struct FindMatches<'r, 'h, A> {
+ re: &'r Regex<A>,
+ it: iter::Searcher<'h>,
+}
+
+impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> {
+ type Item = Match;
+
+ #[inline]
+ fn next(&mut self) -> Option<Match> {
+ let FindMatches { re, ref mut it } = *self;
+ it.advance(|input| re.try_search(input))
+ }
+}
+
+/// A builder for a regex based on deterministic finite automatons.
+///
+/// This builder permits configuring options for the syntax of a pattern, the
+/// NFA construction, the DFA construction and finally the regex searching
+/// itself. This builder is different from a general purpose regex builder in
+/// that it permits fine grain configuration of the construction process. The
+/// trade off for this is complexity, and the possibility of setting a
+/// configuration that might not make sense. For example, there are two
+/// different UTF-8 modes:
+///
+/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
+/// whether the pattern itself can contain sub-expressions that match invalid
+/// UTF-8.
+/// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls
+/// how the regex iterators themselves advance the starting position of the
+/// next search when a match with zero length is found.
+///
+/// Generally speaking, callers will want to either enable all of these or
+/// disable all of these.
+///
+/// Internally, building a regex requires building two DFAs, where one is
+/// responsible for finding the end of a match and the other is responsible
+/// for finding the start of a match. If you only need to detect whether
+/// something matched, or only the end of a match, then you should use a
+/// [`dense::Builder`] to construct a single DFA, which is cheaper than
+/// building two DFAs.
+///
+/// # Build methods
+///
+/// This builder has a few "build" methods. In general, it's the result of
+/// combining the following parameters:
+///
+/// * Building one or many regexes.
+/// * Building a regex with dense or sparse DFAs.
+///
+/// The simplest "build" method is [`Builder::build`]. It accepts a single
+/// pattern and builds a dense DFA using `usize` for the state identifier
+/// representation.
+///
+/// The most general "build" method is [`Builder::build_many`], which permits
+/// building a regex that searches for multiple patterns simultaneously while
+/// using a specific state identifier representation.
+///
+/// The most flexible "build" method, but hardest to use, is
+/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is
+/// just a pair of DFAs, and this method allows you to specify those DFAs
+/// exactly.
+///
+/// # Example
+///
+/// This example shows how to disable UTF-8 mode in the syntax and the regex
+/// itself. This is generally what you want for matching on arbitrary bytes.
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{
+/// dfa::regex::Regex, nfa::thompson, util::syntax, Match,
+/// };
+///
+/// let re = Regex::builder()
+/// .syntax(syntax::Config::new().utf8(false))
+/// .thompson(thompson::Config::new().utf8(false))
+/// .build(r"foo(?-u:[^b])ar.*")?;
+/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+/// let expected = Some(Match::must(0, 1..9));
+/// let got = re.find(haystack);
+/// assert_eq!(expected, got);
+/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
+/// // but the subsequent `.*` does not! Disabling UTF-8
+/// // on the syntax permits this.
+/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Builder {
+ #[cfg(feature = "dfa-build")]
+ dfa: dense::Builder,
+}
+
+impl Builder {
+ /// Create a new regex builder with the default configuration.
+ pub fn new() -> Builder {
+ Builder {
+ #[cfg(feature = "dfa-build")]
+ dfa: dense::Builder::new(),
+ }
+ }
+
+ /// Build a regex from the given pattern.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
+ pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
+ self.build_many(&[pattern])
+ }
+
+ /// Build a regex from the given pattern using sparse DFAs.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
+ pub fn build_sparse(
+ &self,
+ pattern: &str,
+ ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
+ self.build_many_sparse(&[pattern])
+ }
+
+ /// Build a regex from the given patterns.
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
+ pub fn build_many<P: AsRef<str>>(
+ &self,
+ patterns: &[P],
+ ) -> Result<Regex, BuildError> {
+ let forward = self.dfa.build_many(patterns)?;
+ let reverse = self
+ .dfa
+ .clone()
+ .configure(
+ dense::Config::new()
+ .prefilter(None)
+ .specialize_start_states(false)
+ .start_kind(StartKind::Anchored)
+ .match_kind(MatchKind::All),
+ )
+ .thompson(crate::nfa::thompson::Config::new().reverse(true))
+ .build_many(patterns)?;
+ Ok(self.build_from_dfas(forward, reverse))
+ }
+
+ /// Build a sparse regex from the given patterns.
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
+ pub fn build_many_sparse<P: AsRef<str>>(
+ &self,
+ patterns: &[P],
+ ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
+ let re = self.build_many(patterns)?;
+ let forward = re.forward().to_sparse()?;
+ let reverse = re.reverse().to_sparse()?;
+ Ok(self.build_from_dfas(forward, reverse))
+ }
+
+ /// Build a regex from its component forward and reverse DFAs.
+ ///
+ /// This is useful when deserializing a regex from some arbitrary
+ /// memory region. This is also useful for building regexes from other
+ /// types of DFAs.
+ ///
+ /// If you're building the DFAs from scratch instead of building new DFAs
+ /// from other DFAs, then you'll need to make sure that the reverse DFA is
+ /// configured correctly to match the intended semantics. Namely:
+ ///
+ /// * It should be anchored.
+ /// * It should use [`MatchKind::All`] semantics.
+ /// * It should match in reverse.
+ /// * Otherwise, its configuration should match the forward DFA.
+ ///
+ /// If these conditions aren't satisfied, then the behavior of searches is
+ /// unspecified.
+ ///
+ /// Note that when using this constructor, no configuration is applied.
+ /// Since this routine provides the DFAs to the builder, there is no
+ /// opportunity to apply other configuration options.
+ ///
+ /// # Example
+ ///
+ /// This example is a bit a contrived. The usual use of these methods
+ /// would involve serializing `initial_re` somewhere and then deserializing
+ /// it later to build a regex. But in this case, we do everything in
+ /// memory.
+ ///
+ /// ```
+ /// use regex_automata::dfa::regex::Regex;
+ ///
+ /// let initial_re = Regex::new("foo[0-9]+")?;
+ /// assert_eq!(true, initial_re.is_match(b"foo123"));
+ ///
+ /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
+ /// let re = Regex::builder().build_from_dfas(fwd, rev);
+ /// assert_eq!(true, re.is_match(b"foo123"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// This example shows how to build a `Regex` that uses sparse DFAs instead
+ /// of dense DFAs without using one of the convenience `build_sparse`
+ /// routines:
+ ///
+ /// ```
+ /// use regex_automata::dfa::regex::Regex;
+ ///
+ /// let initial_re = Regex::new("foo[0-9]+")?;
+ /// assert_eq!(true, initial_re.is_match(b"foo123"));
+ ///
+ /// let fwd = initial_re.forward().to_sparse()?;
+ /// let rev = initial_re.reverse().to_sparse()?;
+ /// let re = Regex::builder().build_from_dfas(fwd, rev);
+ /// assert_eq!(true, re.is_match(b"foo123"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_from_dfas<A: Automaton>(
+ &self,
+ forward: A,
+ reverse: A,
+ ) -> Regex<A> {
+ Regex { forward, reverse }
+ }
+
+ /// Set the syntax configuration for this builder using
+ /// [`syntax::Config`](crate::util::syntax::Config).
+ ///
+ /// This permits setting things like case insensitivity, Unicode and multi
+ /// line mode.
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
+ pub fn syntax(
+ &mut self,
+ config: crate::util::syntax::Config,
+ ) -> &mut Builder {
+ self.dfa.syntax(config);
+ self
+ }
+
+ /// Set the Thompson NFA configuration for this builder using
+ /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
+ ///
+ /// This permits setting things like whether additional time should be
+ /// spent shrinking the size of the NFA.
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
+ pub fn thompson(
+ &mut self,
+ config: crate::nfa::thompson::Config,
+ ) -> &mut Builder {
+ self.dfa.thompson(config);
+ self
+ }
+
+ /// Set the dense DFA compilation configuration for this builder using
+ /// [`dense::Config`](dense::Config).
+ ///
+ /// This permits setting things like whether the underlying DFAs should
+ /// be minimized.
+ #[cfg(feature = "dfa-build")]
+ pub fn dense(&mut self, config: dense::Config) -> &mut Builder {
+ self.dfa.configure(config);
+ self
+ }
+}
+
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder::new()
+ }
+}