diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
commit | 26a029d407be480d791972afb5975cf62c9360a6 (patch) | |
tree | f435a8308119effd964b339f76abb83a57c29483 /third_party/rust/regex-automata/src/hybrid/regex.rs | |
parent | Initial commit. (diff) | |
download | firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz firefox-26a029d407be480d791972afb5975cf62c9360a6.zip |
Adding upstream version 124.0.1.upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/regex-automata/src/hybrid/regex.rs')
-rw-r--r-- | third_party/rust/regex-automata/src/hybrid/regex.rs | 895 |
1 files changed, 895 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/hybrid/regex.rs b/third_party/rust/regex-automata/src/hybrid/regex.rs new file mode 100644 index 0000000000..75667daf91 --- /dev/null +++ b/third_party/rust/regex-automata/src/hybrid/regex.rs @@ -0,0 +1,895 @@ +/*! +A lazy DFA backed `Regex`. + +This module provides a [`Regex`] backed by a lazy DFA. A `Regex` implements +convenience routines you might have come to expect, such as finding a match +and iterating over all non-overlapping matches. This `Regex` type is limited +in its capabilities to what a lazy DFA can provide. Therefore, APIs involving +capturing groups, for example, are not provided. + +Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that +finds the end offset of a match, where as the other is a "reverse" DFA that +find the start offset of a match. + +See the [parent module](crate::hybrid) for examples. +*/ + +use crate::{ + hybrid::{ + dfa::{self, DFA}, + error::BuildError, + }, + nfa::thompson, + util::{ + iter, + search::{Anchored, Input, Match, MatchError, MatchKind}, + }, +}; + +/// A regular expression that uses hybrid NFA/DFAs (also called "lazy DFAs") +/// for searching. +/// +/// A regular expression is comprised of two lazy DFAs, a "forward" DFA and a +/// "reverse" DFA. The forward DFA is responsible for detecting the end of +/// a match while the reverse DFA is responsible for detecting the start +/// of a match. Thus, in order to find the bounds of any given match, a +/// forward search must first be run followed by a reverse search. A match +/// found by the forward DFA guarantees that the reverse DFA will also find +/// a match. +/// +/// # Fallibility +/// +/// Most of the search routines defined on this type will _panic_ when the +/// underlying search fails. This might be because the DFA gave up because it +/// saw a quit byte, whether configured explicitly or via heuristic Unicode +/// word boundary support, although neither are enabled by default. It might +/// also fail if the underlying DFA determines it isn't making effective use of +/// the cache (which also never happens by default). Or it might fail because +/// an invalid `Input` configuration is given, for example, with an unsupported +/// [`Anchored`] mode. +/// +/// If you need to handle these error cases instead of allowing them to trigger +/// a panic, then the lower level [`Regex::try_search`] provides a fallible API +/// that never panics. +/// +/// # Example +/// +/// This example shows how to cause a search to terminate if it sees a +/// `\n` byte, and handle the error returned. This could be useful if, for +/// example, you wanted to prevent a user supplied pattern from matching +/// across a line boundary. +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{hybrid::{dfa, regex::Regex}, Input, MatchError}; +/// +/// let re = Regex::builder() +/// .dfa(dfa::Config::new().quit(b'\n', true)) +/// .build(r"foo\p{any}+bar")?; +/// let mut cache = re.create_cache(); +/// +/// let input = Input::new("foo\nbar"); +/// // Normally this would produce a match, since \p{any} contains '\n'. +/// // But since we instructed the automaton to enter a quit state if a +/// // '\n' is observed, this produces a match error instead. +/// let expected = MatchError::quit(b'\n', 3); +/// let got = re.try_search(&mut cache, &input).unwrap_err(); +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Debug)] +pub struct Regex { + /// The forward lazy DFA. This can only find the end of a match. + forward: DFA, + /// The reverse lazy DFA. This can only find the start of a match. + /// + /// This is built with 'all' match semantics (instead of leftmost-first) + /// so that it always finds the longest possible match (which corresponds + /// to the leftmost starting position). It is also compiled as an anchored + /// matcher and has 'starts_for_each_pattern' enabled. Including starting + /// states for each pattern is necessary to ensure that we only look for + /// matches of a pattern that matched in the forward direction. Otherwise, + /// we might wind up finding the "leftmost" starting position of a totally + /// different pattern! + reverse: DFA, +} + +/// Convenience routines for regex and cache construction. +impl Regex { + /// Parse the given regular expression using the default configuration and + /// return the corresponding regex. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, Match}; + /// + /// let re = Regex::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 3..14)), + /// re.find(&mut cache, "zzzfoo12345barzzz"), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new(pattern: &str) -> Result<Regex, BuildError> { + Regex::builder().build(pattern) + } + + /// Like `new`, but parses multiple patterns into a single "multi regex." + /// This similarly uses the default regex configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, Match}; + /// + /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; + /// let mut cache = re.create_cache(); + /// + /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); + /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); + /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); + /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); + /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); + /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); + /// assert_eq!(None, it.next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new_many<P: AsRef<str>>( + patterns: &[P], + ) -> Result<Regex, BuildError> { + Regex::builder().build_many(patterns) + } + + /// Return a builder for configuring the construction of a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example + /// + /// This example shows how to use the builder to disable UTF-8 mode + /// everywhere. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// hybrid::regex::Regex, nfa::thompson, util::syntax, Match, + /// }; + /// + /// let re = Regex::builder() + /// .syntax(syntax::Config::new().utf8(false)) + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"foo(?-u:[^b])ar.*")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; + /// let expected = Some(Match::must(0, 1..9)); + /// let got = re.find(&mut cache, haystack); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn builder() -> Builder { + Builder::new() + } + + /// Create a new cache for this `Regex`. + /// + /// The cache returned should only be used for searches for this + /// `Regex`. If you want to reuse the cache for another `Regex`, then + /// you must call [`Cache::reset`] with that `Regex` (or, equivalently, + /// [`Regex::reset_cache`]). + pub fn create_cache(&self) -> Cache { + Cache::new(self) + } + + /// Reset the given cache such that it can be used for searching with the + /// this `Regex` (and only this `Regex`). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `Regex`. + /// + /// Resetting a cache sets its "clear count" to 0. This is relevant if the + /// `Regex` has been configured to "give up" after it has cleared the cache + /// a certain number of times. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different `Regex`. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::regex::Regex, Match}; + /// + /// let re1 = Regex::new(r"\w")?; + /// let re2 = Regex::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 0..2)), + /// re1.find(&mut cache, "Δ"), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the Regex we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// re2.reset_cache(&mut cache); + /// assert_eq!( + /// Some(Match::must(0, 0..3)), + /// re2.find(&mut cache, "☃"), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset_cache(&self, cache: &mut Cache) { + self.forward().reset_cache(&mut cache.forward); + self.reverse().reset_cache(&mut cache.reverse); + } +} + +/// Standard infallible search routines for finding and iterating over matches. +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Panics + /// + /// This routine panics if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search panics, callers cannot know whether a match exists or + /// not. + /// + /// Use [`Regex::try_search`] if you want to handle these error conditions. + /// + /// # Example + /// + /// ``` + /// use regex_automata::hybrid::regex::Regex; + /// + /// let re = Regex::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, "foo12345bar")); + /// assert!(!re.is_match(&mut cache, "foobar")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_match<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> bool { + // Not only can we do an "earliest" search, but we can avoid doing a + // reverse scan too. + self.forward() + .try_search_fwd(&mut cache.forward, &input.into().earliest(true)) + .unwrap() + .is_some() + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Panics + /// + /// This routine panics if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search panics, callers cannot know whether a match exists or + /// not. + /// + /// Use [`Regex::try_search`] if you want to handle these error conditions. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Match, hybrid::regex::Regex}; + /// + /// let re = Regex::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 3..11)), + /// re.find(&mut cache, "zzzfoo12345zzz"), + /// ); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the default leftmost-first match semantics demand that we find the + /// // earliest match that prefers earlier parts of the pattern over latter + /// // parts. + /// let re = Regex::new("abc|a")?; + /// let mut cache = re.create_cache(); + /// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> Option<Match> { + self.try_search(cache, &input.into()).unwrap() + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// # Panics + /// + /// This routine panics if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search panics, callers cannot know whether a match exists or + /// not. + /// + /// The above conditions also apply to the iterator returned as well. For + /// example, if the lazy DFA gives up or quits during a search using this + /// method, then a panic will occur during iteration. + /// + /// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher) + /// if you want to handle these error conditions. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, Match}; + /// + /// let re = Regex::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// + /// let text = "foo1 foo12 foo123"; + /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect(); + /// assert_eq!(matches, vec![ + /// Match::must(0, 0..4), + /// Match::must(0, 5..10), + /// Match::must(0, 11..17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>( + &'r self, + cache: &'c mut Cache, + input: I, + ) -> FindMatches<'r, 'c, 'h> { + let it = iter::Searcher::new(input.into()); + FindMatches { re: self, cache, it } + } +} + +/// Lower level "search" primitives that accept a `&Input` for cheap reuse +/// and return an error if one occurs instead of panicking. +impl Regex { + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// This is like [`Regex::find`] but with two differences: + /// + /// 1. It is not generic over `Into<Input>` and instead accepts a + /// `&Input`. This permits reusing the same `Input` for multiple searches + /// without needing to create a new one. This _may_ help with latency. + /// 2. It returns an error if the search could not complete where as + /// [`Regex::find`] will panic. + /// + /// # Errors + /// + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + #[inline] + pub fn try_search( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Result<Option<Match>, MatchError> { + let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse); + let end = match self.forward().try_search_fwd(fcache, input)? { + None => return Ok(None), + Some(end) => end, + }; + // This special cases an empty match at the beginning of the search. If + // our end matches our start, then since a reverse DFA can't match past + // the start, it must follow that our starting position is also our end + // position. So short circuit and skip the reverse search. + if input.start() == end.offset() { + return Ok(Some(Match::new( + end.pattern(), + end.offset()..end.offset(), + ))); + } + // We can also skip the reverse search if we know our search was + // anchored. This occurs either when the input config is anchored or + // when we know the regex itself is anchored. In this case, we know the + // start of the match, if one is found, must be the start of the + // search. + if self.is_anchored(input) { + return Ok(Some(Match::new( + end.pattern(), + input.start()..end.offset(), + ))); + } + // N.B. I have tentatively convinced myself that it isn't necessary + // to specify the specific pattern for the reverse search since the + // reverse search will always find the same pattern to match as the + // forward search. But I lack a rigorous proof. Why not just provide + // the pattern anyway? Well, if it is needed, then leaving it out + // gives us a chance to find a witness. (Also, if we don't need to + // specify the pattern, then we don't need to build the reverse DFA + // with 'starts_for_each_pattern' enabled. It doesn't matter too much + // for the lazy DFA, but does make the overall DFA bigger.) + // + // We also need to be careful to disable 'earliest' for the reverse + // search, since it could be enabled for the forward search. In the + // reverse case, to satisfy "leftmost" criteria, we need to match as + // much as we can. We also need to be careful to make the search + // anchored. We don't want the reverse search to report any matches + // other than the one beginning at the end of our forward search. + let revsearch = input + .clone() + .span(input.start()..end.offset()) + .anchored(Anchored::Yes) + .earliest(false); + let start = self + .reverse() + .try_search_rev(rcache, &revsearch)? + .expect("reverse search must match if forward search does"); + debug_assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern", + ); + debug_assert!(start.offset() <= end.offset()); + Ok(Some(Match::new(end.pattern(), start.offset()..end.offset()))) + } + + /// Returns true if either the given input specifies an anchored search + /// or if the underlying NFA is always anchored. + fn is_anchored(&self, input: &Input<'_>) -> bool { + match input.get_anchored() { + Anchored::No => { + self.forward().get_nfa().is_always_start_anchored() + } + Anchored::Yes | Anchored::Pattern(_) => true, + } + } +} + +/// Non-search APIs for querying information about the regex and setting a +/// prefilter. +impl Regex { + /// Return the underlying lazy DFA responsible for forward matching. + /// + /// This is useful for accessing the underlying lazy DFA and using it + /// directly if the situation calls for it. + pub fn forward(&self) -> &DFA { + &self.forward + } + + /// Return the underlying lazy DFA responsible for reverse matching. + /// + /// This is useful for accessing the underlying lazy DFA and using it + /// directly if the situation calls for it. + pub fn reverse(&self) -> &DFA { + &self.reverse + } + + /// Returns the total number of patterns matched by this regex. + /// + /// # Example + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::hybrid::regex::Regex; + /// + /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; + /// assert_eq!(3, re.pattern_len()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn pattern_len(&self) -> usize { + assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len()); + self.forward().pattern_len() + } +} + +/// An iterator over all non-overlapping matches for an infallible search. +/// +/// The iterator yields a [`Match`] value until no more matches could be found. +/// If the underlying regex engine returns an error, then a panic occurs. +/// +/// The lifetime parameters are as follows: +/// +/// * `'r` represents the lifetime of the regex object. +/// * `'h` represents the lifetime of the haystack being searched. +/// * `'c` represents the lifetime of the regex cache. +/// +/// This iterator can be created with the [`Regex::find_iter`] method. +#[derive(Debug)] +pub struct FindMatches<'r, 'c, 'h> { + re: &'r Regex, + cache: &'c mut Cache, + it: iter::Searcher<'h>, +} + +impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { + type Item = Match; + + #[inline] + fn next(&mut self) -> Option<Match> { + let FindMatches { re, ref mut cache, ref mut it } = *self; + it.advance(|input| re.try_search(cache, input)) + } +} + +/// A cache represents a partially computed forward and reverse DFA. +/// +/// A cache is the key component that differentiates a classical DFA and a +/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a +/// complete transition table that can handle all possible inputs, a hybrid +/// NFA/DFA starts with an empty transition table and builds only the parts +/// required during search. The parts that are built are stored in a cache. For +/// this reason, a cache is a required parameter for nearly every operation on +/// a [`Regex`]. +/// +/// Caches can be created from their corresponding `Regex` via +/// [`Regex::create_cache`]. A cache can only be used with either the `Regex` +/// that created it, or the `Regex` that was most recently used to reset it +/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in +/// panics or incorrect results. +#[derive(Debug, Clone)] +pub struct Cache { + forward: dfa::Cache, + reverse: dfa::Cache, +} + +impl Cache { + /// Create a new cache for the given `Regex`. + /// + /// The cache returned should only be used for searches for the given + /// `Regex`. If you want to reuse the cache for another `Regex`, then you + /// must call [`Cache::reset`] with that `Regex`. + pub fn new(re: &Regex) -> Cache { + let forward = dfa::Cache::new(re.forward()); + let reverse = dfa::Cache::new(re.reverse()); + Cache { forward, reverse } + } + + /// Reset this cache such that it can be used for searching with the given + /// `Regex` (and only that `Regex`). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `Regex`. + /// + /// Resetting a cache sets its "clear count" to 0. This is relevant if the + /// `Regex` has been configured to "give up" after it has cleared the cache + /// a certain number of times. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different `Regex`. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::regex::Regex, Match}; + /// + /// let re1 = Regex::new(r"\w")?; + /// let re2 = Regex::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 0..2)), + /// re1.find(&mut cache, "Δ"), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the Regex we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// cache.reset(&re2); + /// assert_eq!( + /// Some(Match::must(0, 0..3)), + /// re2.find(&mut cache, "☃"), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset(&mut self, re: &Regex) { + self.forward.reset(re.forward()); + self.reverse.reset(re.reverse()); + } + + /// Return a reference to the forward cache. + pub fn forward(&mut self) -> &dfa::Cache { + &self.forward + } + + /// Return a reference to the reverse cache. + pub fn reverse(&mut self) -> &dfa::Cache { + &self.reverse + } + + /// Return a mutable reference to the forward cache. + /// + /// If you need mutable references to both the forward and reverse caches, + /// then use [`Cache::as_parts_mut`]. + pub fn forward_mut(&mut self) -> &mut dfa::Cache { + &mut self.forward + } + + /// Return a mutable reference to the reverse cache. + /// + /// If you need mutable references to both the forward and reverse caches, + /// then use [`Cache::as_parts_mut`]. + pub fn reverse_mut(&mut self) -> &mut dfa::Cache { + &mut self.reverse + } + + /// Return references to the forward and reverse caches, respectively. + pub fn as_parts(&self) -> (&dfa::Cache, &dfa::Cache) { + (&self.forward, &self.reverse) + } + + /// Return mutable references to the forward and reverse caches, + /// respectively. + pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) { + (&mut self.forward, &mut self.reverse) + } + + /// Returns the heap memory usage, in bytes, as a sum of the forward and + /// reverse lazy DFA caches. + /// + /// This does **not** include the stack size used up by this cache. To + /// compute that, use `std::mem::size_of::<Cache>()`. + pub fn memory_usage(&self) -> usize { + self.forward.memory_usage() + self.reverse.memory_usage() + } +} + +/// A builder for a regex based on a hybrid NFA/DFA. +/// +/// This builder permits configuring options for the syntax of a pattern, the +/// NFA construction, the lazy DFA construction and finally the regex searching +/// itself. This builder is different from a general purpose regex builder +/// in that it permits fine grain configuration of the construction process. +/// The trade off for this is complexity, and the possibility of setting a +/// configuration that might not make sense. For example, there are two +/// different UTF-8 modes: +/// +/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls +/// whether the pattern itself can contain sub-expressions that match invalid +/// UTF-8. +/// * [`thompson::Config::utf8`] controls how the regex iterators themselves +/// advance the starting position of the next search when a match with zero +/// length is found. +/// +/// Generally speaking, callers will want to either enable all of these or +/// disable all of these. +/// +/// Internally, building a regex requires building two hybrid NFA/DFAs, +/// where one is responsible for finding the end of a match and the other is +/// responsible for finding the start of a match. If you only need to detect +/// whether something matched, or only the end of a match, then you should use +/// a [`dfa::Builder`] to construct a single hybrid NFA/DFA, which is cheaper +/// than building two of them. +/// +/// # Example +/// +/// This example shows how to disable UTF-8 mode in the syntax and the regex +/// itself. This is generally what you want for matching on arbitrary bytes. +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{ +/// hybrid::regex::Regex, nfa::thompson, util::syntax, Match, +/// }; +/// +/// let re = Regex::builder() +/// .syntax(syntax::Config::new().utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo(?-u:[^b])ar.*")?; +/// let mut cache = re.create_cache(); +/// +/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; +/// let expected = Some(Match::must(0, 1..9)); +/// let got = re.find(&mut cache, haystack); +/// assert_eq!(expected, got); +/// // Notice that `(?-u:[^b])` matches invalid UTF-8, +/// // but the subsequent `.*` does not! Disabling UTF-8 +/// // on the syntax permits this. +/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + dfa: dfa::Builder, +} + +impl Builder { + /// Create a new regex builder with the default configuration. + pub fn new() -> Builder { + Builder { dfa: DFA::builder() } + } + + /// Build a regex from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + #[cfg(feature = "syntax")] + pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> { + self.build_many(&[pattern]) + } + + /// Build a regex from the given patterns. + #[cfg(feature = "syntax")] + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<Regex, BuildError> { + let forward = self.dfa.build_many(patterns)?; + let reverse = self + .dfa + .clone() + .configure( + DFA::config() + .prefilter(None) + .specialize_start_states(false) + .match_kind(MatchKind::All), + ) + .thompson(thompson::Config::new().reverse(true)) + .build_many(patterns)?; + Ok(self.build_from_dfas(forward, reverse)) + } + + /// Build a regex from its component forward and reverse hybrid NFA/DFAs. + /// + /// This is useful when you've built a forward and reverse lazy DFA + /// separately, and want to combine them into a single regex. Once build, + /// the individual DFAs given can still be accessed via [`Regex::forward`] + /// and [`Regex::reverse`]. + /// + /// It is important that the reverse lazy DFA be compiled under the + /// following conditions: + /// + /// * It should use [`MatchKind::All`] semantics. + /// * It should match in reverse. + /// * Otherwise, its configuration should match the forward DFA. + /// + /// If these conditions aren't satisfied, then the behavior of searches is + /// unspecified. + /// + /// Note that when using this constructor, no configuration is applied. + /// Since this routine provides the DFAs to the builder, there is no + /// opportunity to apply other configuration options. + /// + /// # Example + /// + /// This shows how to build individual lazy forward and reverse DFAs, and + /// then combine them into a single `Regex`. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::{dfa::DFA, regex::Regex}, + /// nfa::thompson, + /// MatchKind, + /// }; + /// + /// let fwd = DFA::new(r"foo[0-9]+")?; + /// let rev = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .thompson(thompson::Config::new().reverse(true)) + /// .build(r"foo[0-9]+")?; + /// + /// let re = Regex::builder().build_from_dfas(fwd, rev); + /// let mut cache = re.create_cache(); + /// assert_eq!(true, re.is_match(&mut cache, "foo123")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex { + Regex { forward, reverse } + } + + /// Set the syntax configuration for this builder using + /// [`syntax::Config`](crate::util::syntax::Config). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + #[cfg(feature = "syntax")] + pub fn syntax( + &mut self, + config: crate::util::syntax::Config, + ) -> &mut Builder { + self.dfa.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](thompson::Config). + /// + /// This permits setting things like whether additional time should be + /// spent shrinking the size of the NFA. + #[cfg(feature = "syntax")] + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.dfa.thompson(config); + self + } + + /// Set the lazy DFA compilation configuration for this builder using + /// [`dfa::Config`](dfa::Config). + /// + /// This permits setting things like whether Unicode word boundaries should + /// be heuristically supported or settings how the behavior of the cache. + pub fn dfa(&mut self, config: dfa::Config) -> &mut Builder { + self.dfa.configure(config); + self + } +} + +impl Default for Builder { + fn default() -> Builder { + Builder::new() + } +} |