/*!
A lazy DFA backed `Regex`.

This module provides a [`Regex`] backed by a lazy DFA. A `Regex` implements
convenience routines you might have come to expect, such as finding a match
and iterating over all non-overlapping matches. This `Regex` type is limited
in its capabilities to what a lazy DFA can provide. Therefore, APIs involving
capturing groups, for example, are not provided.

Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
finds the end offset of a match, whereas the other is a "reverse" DFA that
finds the start offset of a match.

See the [parent module](crate::hybrid) for examples.
*/

use crate::{
    hybrid::{
        dfa::{self, DFA},
        error::BuildError,
    },
    nfa::thompson,
    util::{
        iter,
        search::{Anchored, Input, Match, MatchError, MatchKind},
    },
};

/// A regular expression that uses hybrid NFA/DFAs (also called "lazy DFAs")
/// for searching.
///
/// A regular expression is comprised of two lazy DFAs, a "forward" DFA and a
/// "reverse" DFA. The forward DFA is responsible for detecting the end of
/// a match while the reverse DFA is responsible for detecting the start
/// of a match. Thus, in order to find the bounds of any given match, a
/// forward search must first be run followed by a reverse search. A match
/// found by the forward DFA guarantees that the reverse DFA will also find
/// a match.
///
/// # Fallibility
///
/// Most of the search routines defined on this type will _panic_ when the
/// underlying search fails. This might be because the DFA gave up because it
/// saw a quit byte, whether configured explicitly or via heuristic Unicode
/// word boundary support, although neither are enabled by default. It might
/// also fail if the underlying DFA determines it isn't making effective use of
/// the cache (which also never happens by default). Or it might fail because
/// an invalid `Input` configuration is given, for example, with an unsupported
/// [`Anchored`] mode.
///
/// If you need to handle these error cases instead of allowing them to trigger
/// a panic, then the lower level [`Regex::try_search`] provides a fallible API
/// that never panics.
///
/// # Example
///
/// This example shows how to cause a search to terminate if it sees a
/// `\n` byte, and handle the error returned. This could be useful if, for
/// example, you wanted to prevent a user supplied pattern from matching
/// across a line boundary.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{hybrid::{dfa, regex::Regex}, Input, MatchError};
///
/// let re = Regex::builder()
///     .dfa(dfa::Config::new().quit(b'\n', true))
///     .build(r"foo\p{any}+bar")?;
/// let mut cache = re.create_cache();
///
/// let input = Input::new("foo\nbar");
/// // Normally this would produce a match, since \p{any} contains '\n'.
/// // But since we instructed the automaton to enter a quit state if a
/// // '\n' is observed, this produces a match error instead.
/// let expected = MatchError::quit(b'\n', 3);
/// let got = re.try_search(&mut cache, &input).unwrap_err();
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Debug)]
pub struct Regex {
    /// The forward lazy DFA. This can only find the end of a match.
    forward: DFA,
    /// The reverse lazy DFA. This can only find the start of a match.
    ///
    /// This is built with 'all' match semantics (instead of leftmost-first)
    /// so that it always finds the longest possible match (which corresponds
    /// to the leftmost starting position). It is also compiled as an anchored
    /// matcher and has 'starts_for_each_pattern' enabled. Including starting
    /// states for each pattern is necessary to ensure that we only look for
    /// matches of a pattern that matched in the forward direction. Otherwise,
    /// we might wind up finding the "leftmost" starting position of a totally
    /// different pattern!
    reverse: DFA,
}

/// Convenience routines for regex and cache construction.
impl Regex { /// Parse the given regular expression using the default configuration and /// return the corresponding regex. /// /// If you want a non-default configuration, then use the [`Builder`] to /// set your own configuration. /// /// # Example /// /// ``` /// use regex_automata::{hybrid::regex::Regex, Match}; /// /// let re = Regex::new("foo[0-9]+bar")?; /// let mut cache = re.create_cache(); /// assert_eq!( /// Some(Match::must(0, 3..14)), /// re.find(&mut cache, "zzzfoo12345barzzz"), /// ); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "syntax")] pub fn new(pattern: &str) -> Result { Regex::builder().build(pattern) } /// Like `new`, but parses multiple patterns into a single "multi regex." /// This similarly uses the default regex configuration. /// /// # Example /// /// ``` /// use regex_automata::{hybrid::regex::Regex, Match}; /// /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; /// let mut cache = re.create_cache(); /// /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux"); /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); /// assert_eq!(None, it.next()); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "syntax")] pub fn new_many>( patterns: &[P], ) -> Result { Regex::builder().build_many(patterns) } /// Return a builder for configuring the construction of a `Regex`. /// /// This is a convenience routine to avoid needing to import the /// [`Builder`] type in common cases. /// /// # Example /// /// This example shows how to use the builder to disable UTF-8 mode /// everywhere. 
/// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// hybrid::regex::Regex, nfa::thompson, util::syntax, Match, /// }; /// /// let re = Regex::builder() /// .syntax(syntax::Config::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let mut cache = re.create_cache(); /// /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; /// let expected = Some(Match::must(0, 1..9)); /// let got = re.find(&mut cache, haystack); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` pub fn builder() -> Builder { Builder::new() } /// Create a new cache for this `Regex`. /// /// The cache returned should only be used for searches for this /// `Regex`. If you want to reuse the cache for another `Regex`, then /// you must call [`Cache::reset`] with that `Regex` (or, equivalently, /// [`Regex::reset_cache`]). pub fn create_cache(&self) -> Cache { Cache::new(self) } /// Reset the given cache such that it can be used for searching with the /// this `Regex` (and only this `Regex`). /// /// A cache reset permits reusing memory already allocated in this cache /// with a different `Regex`. /// /// Resetting a cache sets its "clear count" to 0. This is relevant if the /// `Regex` has been configured to "give up" after it has cleared the cache /// a certain number of times. /// /// # Example /// /// This shows how to re-purpose a cache for use with a different `Regex`. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{hybrid::regex::Regex, Match}; /// /// let re1 = Regex::new(r"\w")?; /// let re2 = Regex::new(r"\W")?; /// /// let mut cache = re1.create_cache(); /// assert_eq!( /// Some(Match::must(0, 0..2)), /// re1.find(&mut cache, "Δ"), /// ); /// /// // Using 'cache' with re2 is not allowed. It may result in panics or /// // incorrect results. 
In order to re-purpose the cache, we must reset /// // it with the Regex we'd like to use it with. /// // /// // Similarly, after this reset, using the cache with 're1' is also not /// // allowed. /// re2.reset_cache(&mut cache); /// assert_eq!( /// Some(Match::must(0, 0..3)), /// re2.find(&mut cache, "☃"), /// ); /// /// # Ok::<(), Box>(()) /// ``` pub fn reset_cache(&self, cache: &mut Cache) { self.forward().reset_cache(&mut cache.forward); self.reverse().reset_cache(&mut cache.reverse); } } /// Standard infallible search routines for finding and iterating over matches. impl Regex { /// Returns true if and only if this regex matches the given haystack. /// /// This routine may short circuit if it knows that scanning future input /// will never lead to a different result. In particular, if the underlying /// DFA enters a match state or a dead state, then this routine will return /// `true` or `false`, respectively, without inspecting any future input. /// /// # Panics /// /// This routine panics if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the lazy DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the lazy DFA quitting. /// * The configuration of the lazy DFA may also permit it to "give up" /// on a search if it makes ineffective use of its transition table /// cache. The default configuration does not enable this by default, /// although it is typically a good idea to. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search panics, callers cannot know whether a match exists or /// not. /// /// Use [`Regex::try_search`] if you want to handle these error conditions. 
/// /// # Example /// /// ``` /// use regex_automata::hybrid::regex::Regex; /// /// let re = Regex::new("foo[0-9]+bar")?; /// let mut cache = re.create_cache(); /// /// assert!(re.is_match(&mut cache, "foo12345bar")); /// assert!(!re.is_match(&mut cache, "foobar")); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn is_match<'h, I: Into>>( &self, cache: &mut Cache, input: I, ) -> bool { // Not only can we do an "earliest" search, but we can avoid doing a // reverse scan too. self.forward() .try_search_fwd(&mut cache.forward, &input.into().earliest(true)) .unwrap() .is_some() } /// Returns the start and end offset of the leftmost match. If no match /// exists, then `None` is returned. /// /// # Panics /// /// This routine panics if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the lazy DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the lazy DFA quitting. /// * The configuration of the lazy DFA may also permit it to "give up" /// on a search if it makes ineffective use of its transition table /// cache. The default configuration does not enable this by default, /// although it is typically a good idea to. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search panics, callers cannot know whether a match exists or /// not. /// /// Use [`Regex::try_search`] if you want to handle these error conditions. 
/// /// # Example /// /// ``` /// use regex_automata::{Match, hybrid::regex::Regex}; /// /// let re = Regex::new("foo[0-9]+")?; /// let mut cache = re.create_cache(); /// assert_eq!( /// Some(Match::must(0, 3..11)), /// re.find(&mut cache, "zzzfoo12345zzz"), /// ); /// /// // Even though a match is found after reading the first byte (`a`), /// // the default leftmost-first match semantics demand that we find the /// // earliest match that prefers earlier parts of the pattern over latter /// // parts. /// let re = Regex::new("abc|a")?; /// let mut cache = re.create_cache(); /// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc")); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn find<'h, I: Into>>( &self, cache: &mut Cache, input: I, ) -> Option { self.try_search(cache, &input.into()).unwrap() } /// Returns an iterator over all non-overlapping leftmost matches in the /// given bytes. If no match exists, then the iterator yields no elements. /// /// # Panics /// /// This routine panics if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the lazy DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the lazy DFA quitting. /// * The configuration of the lazy DFA may also permit it to "give up" /// on a search if it makes ineffective use of its transition table /// cache. The default configuration does not enable this by default, /// although it is typically a good idea to. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search panics, callers cannot know whether a match exists or /// not. /// /// The above conditions also apply to the iterator returned as well. 
For /// example, if the lazy DFA gives up or quits during a search using this /// method, then a panic will occur during iteration. /// /// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher) /// if you want to handle these error conditions. /// /// # Example /// /// ``` /// use regex_automata::{hybrid::regex::Regex, Match}; /// /// let re = Regex::new("foo[0-9]+")?; /// let mut cache = re.create_cache(); /// /// let text = "foo1 foo12 foo123"; /// let matches: Vec = re.find_iter(&mut cache, text).collect(); /// assert_eq!(matches, vec![ /// Match::must(0, 0..4), /// Match::must(0, 5..10), /// Match::must(0, 11..17), /// ]); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn find_iter<'r, 'c, 'h, I: Into>>( &'r self, cache: &'c mut Cache, input: I, ) -> FindMatches<'r, 'c, 'h> { let it = iter::Searcher::new(input.into()); FindMatches { re: self, cache, it } } } /// Lower level "search" primitives that accept a `&Input` for cheap reuse /// and return an error if one occurs instead of panicking. impl Regex { /// Returns the start and end offset of the leftmost match. If no match /// exists, then `None` is returned. /// /// This is like [`Regex::find`] but with two differences: /// /// 1. It is not generic over `Into` and instead accepts a /// `&Input`. This permits reusing the same `Input` for multiple searches /// without needing to create a new one. This _may_ help with latency. /// 2. It returns an error if the search could not complete where as /// [`Regex::find`] will panic. /// /// # Errors /// /// This routine errors if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the lazy DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the lazy DFA quitting. 
/// * The configuration of the lazy DFA may also permit it to "give up" /// on a search if it makes ineffective use of its transition table /// cache. The default configuration does not enable this by default, /// although it is typically a good idea to. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search returns an error, callers cannot know whether a match /// exists or not. #[inline] pub fn try_search( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result, MatchError> { let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse); let end = match self.forward().try_search_fwd(fcache, input)? { None => return Ok(None), Some(end) => end, }; // This special cases an empty match at the beginning of the search. If // our end matches our start, then since a reverse DFA can't match past // the start, it must follow that our starting position is also our end // position. So short circuit and skip the reverse search. if input.start() == end.offset() { return Ok(Some(Match::new( end.pattern(), end.offset()..end.offset(), ))); } // We can also skip the reverse search if we know our search was // anchored. This occurs either when the input config is anchored or // when we know the regex itself is anchored. In this case, we know the // start of the match, if one is found, must be the start of the // search. if self.is_anchored(input) { return Ok(Some(Match::new( end.pattern(), input.start()..end.offset(), ))); } // N.B. I have tentatively convinced myself that it isn't necessary // to specify the specific pattern for the reverse search since the // reverse search will always find the same pattern to match as the // forward search. But I lack a rigorous proof. Why not just provide // the pattern anyway? Well, if it is needed, then leaving it out // gives us a chance to find a witness. 
(Also, if we don't need to // specify the pattern, then we don't need to build the reverse DFA // with 'starts_for_each_pattern' enabled. It doesn't matter too much // for the lazy DFA, but does make the overall DFA bigger.) // // We also need to be careful to disable 'earliest' for the reverse // search, since it could be enabled for the forward search. In the // reverse case, to satisfy "leftmost" criteria, we need to match as // much as we can. We also need to be careful to make the search // anchored. We don't want the reverse search to report any matches // other than the one beginning at the end of our forward search. let revsearch = input .clone() .span(input.start()..end.offset()) .anchored(Anchored::Yes) .earliest(false); let start = self .reverse() .try_search_rev(rcache, &revsearch)? .expect("reverse search must match if forward search does"); debug_assert_eq!( start.pattern(), end.pattern(), "forward and reverse search must match same pattern", ); debug_assert!(start.offset() <= end.offset()); Ok(Some(Match::new(end.pattern(), start.offset()..end.offset()))) } /// Returns true if either the given input specifies an anchored search /// or if the underlying NFA is always anchored. fn is_anchored(&self, input: &Input<'_>) -> bool { match input.get_anchored() { Anchored::No => { self.forward().get_nfa().is_always_start_anchored() } Anchored::Yes | Anchored::Pattern(_) => true, } } } /// Non-search APIs for querying information about the regex and setting a /// prefilter. impl Regex { /// Return the underlying lazy DFA responsible for forward matching. /// /// This is useful for accessing the underlying lazy DFA and using it /// directly if the situation calls for it. pub fn forward(&self) -> &DFA { &self.forward } /// Return the underlying lazy DFA responsible for reverse matching. /// /// This is useful for accessing the underlying lazy DFA and using it /// directly if the situation calls for it. 
pub fn reverse(&self) -> &DFA { &self.reverse } /// Returns the total number of patterns matched by this regex. /// /// # Example /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::hybrid::regex::Regex; /// /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; /// assert_eq!(3, re.pattern_len()); /// # Ok::<(), Box>(()) /// ``` pub fn pattern_len(&self) -> usize { assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len()); self.forward().pattern_len() } } /// An iterator over all non-overlapping matches for an infallible search. /// /// The iterator yields a [`Match`] value until no more matches could be found. /// If the underlying regex engine returns an error, then a panic occurs. /// /// The lifetime parameters are as follows: /// /// * `'r` represents the lifetime of the regex object. /// * `'h` represents the lifetime of the haystack being searched. /// * `'c` represents the lifetime of the regex cache. /// /// This iterator can be created with the [`Regex::find_iter`] method. #[derive(Debug)] pub struct FindMatches<'r, 'c, 'h> { re: &'r Regex, cache: &'c mut Cache, it: iter::Searcher<'h>, } impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { type Item = Match; #[inline] fn next(&mut self) -> Option { let FindMatches { re, ref mut cache, ref mut it } = *self; it.advance(|input| re.try_search(cache, input)) } } /// A cache represents a partially computed forward and reverse DFA. /// /// A cache is the key component that differentiates a classical DFA and a /// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a /// complete transition table that can handle all possible inputs, a hybrid /// NFA/DFA starts with an empty transition table and builds only the parts /// required during search. The parts that are built are stored in a cache. For /// this reason, a cache is a required parameter for nearly every operation on /// a [`Regex`]. 
///
/// Caches can be created from their corresponding `Regex` via
/// [`Regex::create_cache`]. A cache can only be used with either the `Regex`
/// that created it, or the `Regex` that was most recently used to reset it
/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in
/// panics or incorrect results.
#[derive(Debug, Clone)]
pub struct Cache {
    forward: dfa::Cache,
    reverse: dfa::Cache,
}

impl Cache {
    /// Create a new cache for the given `Regex`.
    ///
    /// The cache returned should only be used for searches for the given
    /// `Regex`. If you want to reuse the cache for another `Regex`, then you
    /// must call [`Cache::reset`] with that `Regex`.
    pub fn new(re: &Regex) -> Cache {
        let forward = dfa::Cache::new(re.forward());
        let reverse = dfa::Cache::new(re.reverse());
        Cache { forward, reverse }
    }

    /// Reset this cache such that it can be used for searching with the given
    /// `Regex` (and only that `Regex`).
    ///
    /// A cache reset permits reusing memory already allocated in this cache
    /// with a different `Regex`.
    ///
    /// Resetting a cache sets its "clear count" to 0. This is relevant if the
    /// `Regex` has been configured to "give up" after it has cleared the cache
    /// a certain number of times.
    ///
    /// # Example
    ///
    /// This shows how to re-purpose a cache for use with a different `Regex`.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{hybrid::regex::Regex, Match};
    ///
    /// let re1 = Regex::new(r"\w")?;
    /// let re2 = Regex::new(r"\W")?;
    ///
    /// let mut cache = re1.create_cache();
    /// assert_eq!(
    ///     Some(Match::must(0, 0..2)),
    ///     re1.find(&mut cache, "Δ"),
    /// );
    ///
    /// // Using 'cache' with re2 is not allowed. It may result in panics or
    /// // incorrect results. In order to re-purpose the cache, we must reset
    /// // it with the Regex we'd like to use it with.
    /// //
    /// // Similarly, after this reset, using the cache with 're1' is also not
    /// // allowed.
    /// cache.reset(&re2);
    /// assert_eq!(
    ///     Some(Match::must(0, 0..3)),
    ///     re2.find(&mut cache, "☃"),
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn reset(&mut self, re: &Regex) {
        self.forward.reset(re.forward());
        self.reverse.reset(re.reverse());
    }

    /// Return a reference to the forward cache.
    ///
    /// Only a shared borrow of `self` is needed since only a shared reference
    /// is handed out. Callers holding a `&mut Cache` can still call this.
    pub fn forward(&self) -> &dfa::Cache {
        &self.forward
    }

    /// Return a reference to the reverse cache.
    ///
    /// Only a shared borrow of `self` is needed since only a shared reference
    /// is handed out. Callers holding a `&mut Cache` can still call this.
    pub fn reverse(&self) -> &dfa::Cache {
        &self.reverse
    }

    /// Return a mutable reference to the forward cache.
    ///
    /// If you need mutable references to both the forward and reverse caches,
    /// then use [`Cache::as_parts_mut`].
    pub fn forward_mut(&mut self) -> &mut dfa::Cache {
        &mut self.forward
    }

    /// Return a mutable reference to the reverse cache.
    ///
    /// If you need mutable references to both the forward and reverse caches,
    /// then use [`Cache::as_parts_mut`].
    pub fn reverse_mut(&mut self) -> &mut dfa::Cache {
        &mut self.reverse
    }

    /// Return references to the forward and reverse caches, respectively.
    pub fn as_parts(&self) -> (&dfa::Cache, &dfa::Cache) {
        (&self.forward, &self.reverse)
    }

    /// Return mutable references to the forward and reverse caches,
    /// respectively.
    pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) {
        (&mut self.forward, &mut self.reverse)
    }

    /// Returns the heap memory usage, in bytes, as a sum of the forward and
    /// reverse lazy DFA caches.
    ///
    /// This does **not** include the stack size used up by this cache. To
    /// compute that, use `std::mem::size_of::<Cache>()`.
    pub fn memory_usage(&self) -> usize {
        self.forward.memory_usage() + self.reverse.memory_usage()
    }
}

/// A builder for a regex based on a hybrid NFA/DFA.
///
/// This builder permits configuring options for the syntax of a pattern, the
/// NFA construction, the lazy DFA construction and finally the regex searching
/// itself. This builder is different from a general purpose regex builder
/// in that it permits fine grain configuration of the construction process.
/// The trade off for this is complexity, and the possibility of setting a /// configuration that might not make sense. For example, there are two /// different UTF-8 modes: /// /// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls /// whether the pattern itself can contain sub-expressions that match invalid /// UTF-8. /// * [`thompson::Config::utf8`] controls how the regex iterators themselves /// advance the starting position of the next search when a match with zero /// length is found. /// /// Generally speaking, callers will want to either enable all of these or /// disable all of these. /// /// Internally, building a regex requires building two hybrid NFA/DFAs, /// where one is responsible for finding the end of a match and the other is /// responsible for finding the start of a match. If you only need to detect /// whether something matched, or only the end of a match, then you should use /// a [`dfa::Builder`] to construct a single hybrid NFA/DFA, which is cheaper /// than building two of them. /// /// # Example /// /// This example shows how to disable UTF-8 mode in the syntax and the regex /// itself. This is generally what you want for matching on arbitrary bytes. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// hybrid::regex::Regex, nfa::thompson, util::syntax, Match, /// }; /// /// let re = Regex::builder() /// .syntax(syntax::Config::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let mut cache = re.create_cache(); /// /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; /// let expected = Some(Match::must(0, 1..9)); /// let got = re.find(&mut cache, haystack); /// assert_eq!(expected, got); /// // Notice that `(?-u:[^b])` matches invalid UTF-8, /// // but the subsequent `.*` does not! Disabling UTF-8 /// // on the syntax permits this. 
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug)] pub struct Builder { dfa: dfa::Builder, } impl Builder { /// Create a new regex builder with the default configuration. pub fn new() -> Builder { Builder { dfa: DFA::builder() } } /// Build a regex from the given pattern. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. #[cfg(feature = "syntax")] pub fn build(&self, pattern: &str) -> Result { self.build_many(&[pattern]) } /// Build a regex from the given patterns. #[cfg(feature = "syntax")] pub fn build_many>( &self, patterns: &[P], ) -> Result { let forward = self.dfa.build_many(patterns)?; let reverse = self .dfa .clone() .configure( DFA::config() .prefilter(None) .specialize_start_states(false) .match_kind(MatchKind::All), ) .thompson(thompson::Config::new().reverse(true)) .build_many(patterns)?; Ok(self.build_from_dfas(forward, reverse)) } /// Build a regex from its component forward and reverse hybrid NFA/DFAs. /// /// This is useful when you've built a forward and reverse lazy DFA /// separately, and want to combine them into a single regex. Once build, /// the individual DFAs given can still be accessed via [`Regex::forward`] /// and [`Regex::reverse`]. /// /// It is important that the reverse lazy DFA be compiled under the /// following conditions: /// /// * It should use [`MatchKind::All`] semantics. /// * It should match in reverse. /// * Otherwise, its configuration should match the forward DFA. /// /// If these conditions aren't satisfied, then the behavior of searches is /// unspecified. /// /// Note that when using this constructor, no configuration is applied. /// Since this routine provides the DFAs to the builder, there is no /// opportunity to apply other configuration options. /// /// # Example /// /// This shows how to build individual lazy forward and reverse DFAs, and /// then combine them into a single `Regex`. 
/// /// ``` /// use regex_automata::{ /// hybrid::{dfa::DFA, regex::Regex}, /// nfa::thompson, /// MatchKind, /// }; /// /// let fwd = DFA::new(r"foo[0-9]+")?; /// let rev = DFA::builder() /// .configure(DFA::config().match_kind(MatchKind::All)) /// .thompson(thompson::Config::new().reverse(true)) /// .build(r"foo[0-9]+")?; /// /// let re = Regex::builder().build_from_dfas(fwd, rev); /// let mut cache = re.create_cache(); /// assert_eq!(true, re.is_match(&mut cache, "foo123")); /// # Ok::<(), Box>(()) /// ``` pub fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex { Regex { forward, reverse } } /// Set the syntax configuration for this builder using /// [`syntax::Config`](crate::util::syntax::Config). /// /// This permits setting things like case insensitivity, Unicode and multi /// line mode. #[cfg(feature = "syntax")] pub fn syntax( &mut self, config: crate::util::syntax::Config, ) -> &mut Builder { self.dfa.syntax(config); self } /// Set the Thompson NFA configuration for this builder using /// [`nfa::thompson::Config`](thompson::Config). /// /// This permits setting things like whether additional time should be /// spent shrinking the size of the NFA. #[cfg(feature = "syntax")] pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { self.dfa.thompson(config); self } /// Set the lazy DFA compilation configuration for this builder using /// [`dfa::Config`](dfa::Config). /// /// This permits setting things like whether Unicode word boundaries should /// be heuristically supported or settings how the behavior of the cache. pub fn dfa(&mut self, config: dfa::Config) -> &mut Builder { self.dfa.configure(config); self } } impl Default for Builder { fn default() -> Builder { Builder::new() } }