use alloc::string::String; use regex_automata::{meta, Input, PatternID, PatternSet, PatternSetIter}; use crate::{bytes::RegexSetBuilder, Error}; /// Match multiple, possibly overlapping, regexes in a single search. /// /// A regex set corresponds to the union of zero or more regular expressions. /// That is, a regex set will match a haystack when at least one of its /// constituent regexes matches. A regex set as its formulated here provides a /// touch more power: it will also report *which* regular expressions in the /// set match. Indeed, this is the key difference between regex sets and a /// single `Regex` with many alternates, since only one alternate can match at /// a time. /// /// For example, consider regular expressions to match email addresses and /// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a /// regex set is constructed from those regexes, then searching the haystack /// `foo@example.com` will report both regexes as matching. Of course, one /// could accomplish this by compiling each regex on its own and doing two /// searches over the haystack. The key advantage of using a regex set is /// that it will report the matching regexes using a *single pass through the /// haystack*. If one has hundreds or thousands of regexes to match repeatedly /// (like a URL router for a complex web application or a user agent matcher), /// then a regex set *can* realize huge performance gains. /// /// Unlike the top-level [`RegexSet`](crate::RegexSet), this `RegexSet` /// searches haystacks with type `&[u8]` instead of `&str`. Consequently, this /// `RegexSet` is permitted to match invalid UTF-8. /// /// # Limitations /// /// Regex sets are limited to answering the following two questions: /// /// 1. Does any regex in the set match? /// 2. If so, which regexes in the set match? /// /// As with the main [`Regex`][crate::bytes::Regex] type, it is cheaper to ask /// (1) instead of (2) since the matching engines can stop after the first /// match is found. /// /// You cannot directly extract [`Match`][crate::bytes::Match] or /// [`Captures`][crate::bytes::Captures] objects from a regex set. If you need /// these operations, the recommended approach is to compile each pattern in /// the set independently and scan the exact same haystack a second time with /// those independently compiled patterns: /// /// ``` /// use regex::bytes::{Regex, RegexSet}; /// /// let patterns = ["foo", "bar"]; /// // Both patterns will match different ranges of this string. /// let hay = b"barfoo"; /// /// // Compile a set matching any of our patterns. /// let set = RegexSet::new(patterns).unwrap(); /// // Compile each pattern independently. /// let regexes: Vec<_> = set /// .patterns() /// .iter() /// .map(|pat| Regex::new(pat).unwrap()) /// .collect(); /// /// // Match against the whole set first and identify the individual /// // matching patterns. /// let matches: Vec<&[u8]> = set /// .matches(hay) /// .into_iter() /// // Dereference the match index to get the corresponding /// // compiled pattern. /// .map(|index| ®exes[index]) /// // To get match locations or any other info, we then have to search the /// // exact same haystack again, using our separately-compiled pattern. /// .map(|re| re.find(hay).unwrap().as_bytes()) /// .collect(); /// /// // Matches arrive in the order the constituent patterns were declared, /// // not the order they appear in the haystack. /// assert_eq!(vec![&b"foo"[..], &b"bar"[..]], matches); /// ``` /// /// # Performance /// /// A `RegexSet` has the same performance characteristics as `Regex`. Namely, /// search takes `O(m * n)` time, where `m` is proportional to the size of the /// regex set and `n` is proportional to the length of the haystack. /// /// # Trait implementations /// /// The `Default` trait is implemented for `RegexSet`. The default value /// is an empty set. An empty set can also be explicitly constructed via /// [`RegexSet::empty`]. /// /// # Example /// /// This shows how the above two regexes (for matching email addresses and /// domains) might work: /// /// ``` /// use regex::bytes::RegexSet; /// /// let set = RegexSet::new(&[ /// r"[a-z]+@[a-z]+\.(com|org|net)", /// r"[a-z]+\.(com|org|net)", /// ]).unwrap(); /// /// // Ask whether any regexes in the set match. /// assert!(set.is_match(b"foo@example.com")); /// /// // Identify which regexes in the set match. /// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect(); /// assert_eq!(vec![0, 1], matches); /// /// // Try again, but with a haystack that only matches one of the regexes. /// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect(); /// assert_eq!(vec![1], matches); /// /// // Try again, but with a haystack that doesn't match any regex in the set. /// let matches: Vec<_> = set.matches(b"example").into_iter().collect(); /// assert!(matches.is_empty()); /// ``` /// /// Note that it would be possible to adapt the above example to using `Regex` /// with an expression like: /// /// ```text /// (?P[a-z]+@(?P[a-z]+[.](com|org|net)))|(?P[a-z]+[.](com|org|net)) /// ``` /// /// After a match, one could then inspect the capture groups to figure out /// which alternates matched. The problem is that it is hard to make this /// approach scale when there are many regexes since the overlap between each /// alternate isn't always obvious to reason about. #[derive(Clone)] pub struct RegexSet { pub(crate) meta: meta::Regex, pub(crate) patterns: alloc::sync::Arc<[String]>, } impl RegexSet { /// Create a new regex set with the given regular expressions. /// /// This takes an iterator of `S`, where `S` is something that can produce /// a `&str`. If any of the strings in the iterator are not valid regular /// expressions, then an error is returned. /// /// # Example /// /// Create a new regex set from an iterator of strings: /// /// ``` /// use regex::bytes::RegexSet; /// /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap(); /// assert!(set.is_match(b"foo")); /// ``` pub fn new(exprs: I) -> Result where S: AsRef, I: IntoIterator, { RegexSetBuilder::new(exprs).build() } /// Create a new empty regex set. /// /// An empty regex never matches anything. /// /// This is a convenience function for `RegexSet::new([])`, but doesn't /// require one to specify the type of the input. /// /// # Example /// /// ``` /// use regex::bytes::RegexSet; /// /// let set = RegexSet::empty(); /// assert!(set.is_empty()); /// // an empty set matches nothing /// assert!(!set.is_match(b"")); /// ``` pub fn empty() -> RegexSet { let empty: [&str; 0] = []; RegexSetBuilder::new(empty).build().unwrap() } /// Returns true if and only if one of the regexes in this set matches /// the haystack given. /// /// This method should be preferred if you only need to test whether any /// of the regexes in the set should match, but don't care about *which* /// regexes matched. This is because the underlying matching engine will /// quit immediately after seeing the first match instead of continuing to /// find all matches. /// /// Note that as with searches using [`Regex`](crate::bytes::Regex), the /// expression is unanchored by default. That is, if the regex does not /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted /// to match anywhere in the haystack. /// /// # Example /// /// Tests whether a set matches somewhere in a haystack: /// /// ``` /// use regex::bytes::RegexSet; /// /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap(); /// assert!(set.is_match(b"foo")); /// assert!(!set.is_match("☃".as_bytes())); /// ``` #[inline] pub fn is_match(&self, haystack: &[u8]) -> bool { self.is_match_at(haystack, 0) } /// Returns true if and only if one of the regexes in this set matches the /// haystack given, with the search starting at the offset given. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. /// /// # Panics /// /// This panics when `start >= haystack.len() + 1`. /// /// # Example /// /// This example shows the significance of `start`. Namely, consider a /// haystack `foobar` and a desire to execute a search starting at offset /// `3`. You could search a substring explicitly, but then the look-around /// assertions won't work correctly. Instead, you can use this method to /// specify the start position of a search. /// /// ``` /// use regex::bytes::RegexSet; /// /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap(); /// let hay = b"foobar"; /// // We get a match here, but it's probably not intended. /// assert!(set.is_match(&hay[3..])); /// // No match because the assertions take the context into account. /// assert!(!set.is_match_at(hay, 3)); /// ``` #[inline] pub fn is_match_at(&self, haystack: &[u8], start: usize) -> bool { self.meta.is_match(Input::new(haystack).span(start..haystack.len())) } /// Returns the set of regexes that match in the given haystack. /// /// The set returned contains the index of each regex that matches in /// the given haystack. The index is in correspondence with the order of /// regular expressions given to `RegexSet`'s constructor. /// /// The set can also be used to iterate over the matched indices. The order /// of iteration is always ascending with respect to the matching indices. /// /// Note that as with searches using [`Regex`](crate::bytes::Regex), the /// expression is unanchored by default. That is, if the regex does not /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted /// to match anywhere in the haystack. /// /// # Example /// /// Tests which regular expressions match the given haystack: /// /// ``` /// use regex::bytes::RegexSet; /// /// let set = RegexSet::new([ /// r"\w+", /// r"\d+", /// r"\pL+", /// r"foo", /// r"bar", /// r"barfoo", /// r"foobar", /// ]).unwrap(); /// let matches: Vec<_> = set.matches(b"foobar").into_iter().collect(); /// assert_eq!(matches, vec![0, 2, 3, 4, 6]); /// /// // You can also test whether a particular regex matched: /// let matches = set.matches(b"foobar"); /// assert!(!matches.matched(5)); /// assert!(matches.matched(6)); /// ``` #[inline] pub fn matches(&self, haystack: &[u8]) -> SetMatches { self.matches_at(haystack, 0) } /// Returns the set of regexes that match in the given haystack. /// /// The set returned contains the index of each regex that matches in /// the given haystack. The index is in correspondence with the order of /// regular expressions given to `RegexSet`'s constructor. /// /// The set can also be used to iterate over the matched indices. The order /// of iteration is always ascending with respect to the matching indices. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. /// /// # Panics /// /// This panics when `start >= haystack.len() + 1`. /// /// # Example /// /// Tests which regular expressions match the given haystack: /// /// ``` /// use regex::bytes::RegexSet; /// /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap(); /// let hay = b"foobar"; /// // We get matches here, but it's probably not intended. /// let matches: Vec<_> = set.matches(&hay[3..]).into_iter().collect(); /// assert_eq!(matches, vec![0, 1]); /// // No matches because the assertions take the context into account. /// let matches: Vec<_> = set.matches_at(hay, 3).into_iter().collect(); /// assert_eq!(matches, vec![]); /// ``` #[inline] pub fn matches_at(&self, haystack: &[u8], start: usize) -> SetMatches { let input = Input::new(haystack).span(start..haystack.len()); let mut patset = PatternSet::new(self.meta.pattern_len()); self.meta.which_overlapping_matches(&input, &mut patset); SetMatches(patset) } /// Returns the same as matches, but starts the search at the given /// offset and stores the matches into the slice given. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. /// /// `matches` must have a length that is at least the number of regexes /// in this set. /// /// This method returns true if and only if at least one member of /// `matches` is true after executing the set against `haystack`. #[doc(hidden)] #[inline] pub fn matches_read_at( &self, matches: &mut [bool], haystack: &[u8], start: usize, ) -> bool { // This is pretty dumb. We should try to fix this, but the // regex-automata API doesn't provide a way to store matches in an // arbitrary &mut [bool]. Thankfully, this API is is doc(hidden) and // thus not public... But regex-capi currently uses it. We should // fix regex-capi to use a PatternSet, maybe? Not sure... PatternSet // is in regex-automata, not regex. So maybe we should just accept a // 'SetMatches', which is basically just a newtype around PatternSet. let mut patset = PatternSet::new(self.meta.pattern_len()); let mut input = Input::new(haystack); input.set_start(start); self.meta.which_overlapping_matches(&input, &mut patset); for pid in patset.iter() { matches[pid] = true; } !patset.is_empty() } /// An alias for `matches_read_at` to preserve backward compatibility. /// /// The `regex-capi` crate used this method, so to avoid breaking that /// crate, we continue to export it as an undocumented API. #[doc(hidden)] #[inline] pub fn read_matches_at( &self, matches: &mut [bool], haystack: &[u8], start: usize, ) -> bool { self.matches_read_at(matches, haystack, start) } /// Returns the total number of regexes in this set. /// /// # Example /// /// ``` /// use regex::bytes::RegexSet; /// /// assert_eq!(0, RegexSet::empty().len()); /// assert_eq!(1, RegexSet::new([r"[0-9]"]).unwrap().len()); /// assert_eq!(2, RegexSet::new([r"[0-9]", r"[a-z]"]).unwrap().len()); /// ``` #[inline] pub fn len(&self) -> usize { self.meta.pattern_len() } /// Returns `true` if this set contains no regexes. /// /// # Example /// /// ``` /// use regex::bytes::RegexSet; /// /// assert!(RegexSet::empty().is_empty()); /// assert!(!RegexSet::new([r"[0-9]"]).unwrap().is_empty()); /// ``` #[inline] pub fn is_empty(&self) -> bool { self.meta.pattern_len() == 0 } /// Returns the regex patterns that this regex set was constructed from. /// /// This function can be used to determine the pattern for a match. The /// slice returned has exactly as many patterns givens to this regex set, /// and the order of the slice is the same as the order of the patterns /// provided to the set. /// /// # Example /// /// ``` /// use regex::bytes::RegexSet; /// /// let set = RegexSet::new(&[ /// r"\w+", /// r"\d+", /// r"\pL+", /// r"foo", /// r"bar", /// r"barfoo", /// r"foobar", /// ]).unwrap(); /// let matches: Vec<_> = set /// .matches(b"foobar") /// .into_iter() /// .map(|index| &set.patterns()[index]) /// .collect(); /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]); /// ``` #[inline] pub fn patterns(&self) -> &[String] { &self.patterns } } impl Default for RegexSet { fn default() -> Self { RegexSet::empty() } } /// A set of matches returned by a regex set. /// /// Values of this type are constructed by [`RegexSet::matches`]. #[derive(Clone, Debug)] pub struct SetMatches(PatternSet); impl SetMatches { /// Whether this set contains any matches. /// /// # Example /// /// ``` /// use regex::bytes::RegexSet; /// /// let set = RegexSet::new(&[ /// r"[a-z]+@[a-z]+\.(com|org|net)", /// r"[a-z]+\.(com|org|net)", /// ]).unwrap(); /// let matches = set.matches(b"foo@example.com"); /// assert!(matches.matched_any()); /// ``` #[inline] pub fn matched_any(&self) -> bool { !self.0.is_empty() } /// Whether the regex at the given index matched. /// /// The index for a regex is determined by its insertion order upon the /// initial construction of a `RegexSet`, starting at `0`. /// /// # Panics /// /// If `index` is greater than or equal to the number of regexes in the /// original set that produced these matches. Equivalently, when `index` /// is greater than or equal to [`SetMatches::len`]. /// /// # Example /// /// ``` /// use regex::bytes::RegexSet; /// /// let set = RegexSet::new([ /// r"[a-z]+@[a-z]+\.(com|org|net)", /// r"[a-z]+\.(com|org|net)", /// ]).unwrap(); /// let matches = set.matches(b"example.com"); /// assert!(!matches.matched(0)); /// assert!(matches.matched(1)); /// ``` #[inline] pub fn matched(&self, index: usize) -> bool { self.0.contains(PatternID::new_unchecked(index)) } /// The total number of regexes in the set that created these matches. /// /// **WARNING:** This always returns the same value as [`RegexSet::len`]. /// In particular, it does *not* return the number of elements yielded by /// [`SetMatches::iter`]. The only way to determine the total number of /// matched regexes is to iterate over them. /// /// # Example /// /// Notice that this method returns the total number of regexes in the /// original set, and *not* the total number of regexes that matched. /// /// ``` /// use regex::bytes::RegexSet; /// /// let set = RegexSet::new([ /// r"[a-z]+@[a-z]+\.(com|org|net)", /// r"[a-z]+\.(com|org|net)", /// ]).unwrap(); /// let matches = set.matches(b"example.com"); /// // Total number of patterns that matched. /// assert_eq!(1, matches.iter().count()); /// // Total number of patterns in the set. /// assert_eq!(2, matches.len()); /// ``` #[inline] pub fn len(&self) -> usize { self.0.capacity() } /// Returns an iterator over the indices of the regexes that matched. /// /// This will always produces matches in ascending order, where the index /// yielded corresponds to the index of the regex that matched with respect /// to its position when initially building the set. /// /// # Example /// /// ``` /// use regex::bytes::RegexSet; /// /// let set = RegexSet::new([ /// r"[0-9]", /// r"[a-z]", /// r"[A-Z]", /// r"\p{Greek}", /// ]).unwrap(); /// let hay = "βa1".as_bytes(); /// let matches: Vec<_> = set.matches(hay).iter().collect(); /// assert_eq!(matches, vec![0, 1, 3]); /// ``` /// /// Note that `SetMatches` also implemnets the `IntoIterator` trait, so /// this method is not always needed. For example: /// /// ``` /// use regex::bytes::RegexSet; /// /// let set = RegexSet::new([ /// r"[0-9]", /// r"[a-z]", /// r"[A-Z]", /// r"\p{Greek}", /// ]).unwrap(); /// let hay = "βa1".as_bytes(); /// let mut matches = vec![]; /// for index in set.matches(hay) { /// matches.push(index); /// } /// assert_eq!(matches, vec![0, 1, 3]); /// ``` #[inline] pub fn iter(&self) -> SetMatchesIter<'_> { SetMatchesIter(self.0.iter()) } } impl IntoIterator for SetMatches { type IntoIter = SetMatchesIntoIter; type Item = usize; fn into_iter(self) -> Self::IntoIter { let it = 0..self.0.capacity(); SetMatchesIntoIter { patset: self.0, it } } } impl<'a> IntoIterator for &'a SetMatches { type IntoIter = SetMatchesIter<'a>; type Item = usize; fn into_iter(self) -> Self::IntoIter { self.iter() } } /// An owned iterator over the set of matches from a regex set. /// /// This will always produces matches in ascending order of index, where the /// index corresponds to the index of the regex that matched with respect to /// its position when initially building the set. /// /// This iterator is created by calling `SetMatches::into_iter` via the /// `IntoIterator` trait. This is automatically done in `for` loops. /// /// # Example /// /// ``` /// use regex::bytes::RegexSet; /// /// let set = RegexSet::new([ /// r"[0-9]", /// r"[a-z]", /// r"[A-Z]", /// r"\p{Greek}", /// ]).unwrap(); /// let hay = "βa1".as_bytes(); /// let mut matches = vec![]; /// for index in set.matches(hay) { /// matches.push(index); /// } /// assert_eq!(matches, vec![0, 1, 3]); /// ``` #[derive(Debug)] pub struct SetMatchesIntoIter { patset: PatternSet, it: core::ops::Range, } impl Iterator for SetMatchesIntoIter { type Item = usize; fn next(&mut self) -> Option { loop { let id = self.it.next()?; if self.patset.contains(PatternID::new_unchecked(id)) { return Some(id); } } } fn size_hint(&self) -> (usize, Option) { self.it.size_hint() } } impl DoubleEndedIterator for SetMatchesIntoIter { fn next_back(&mut self) -> Option { loop { let id = self.it.next_back()?; if self.patset.contains(PatternID::new_unchecked(id)) { return Some(id); } } } } impl core::iter::FusedIterator for SetMatchesIntoIter {} /// A borrowed iterator over the set of matches from a regex set. /// /// The lifetime `'a` refers to the lifetime of the [`SetMatches`] value that /// created this iterator. /// /// This will always produces matches in ascending order, where the index /// corresponds to the index of the regex that matched with respect to its /// position when initially building the set. /// /// This iterator is created by the [`SetMatches::iter`] method. #[derive(Clone, Debug)] pub struct SetMatchesIter<'a>(PatternSetIter<'a>); impl<'a> Iterator for SetMatchesIter<'a> { type Item = usize; fn next(&mut self) -> Option { self.0.next().map(|pid| pid.as_usize()) } fn size_hint(&self) -> (usize, Option) { self.0.size_hint() } } impl<'a> DoubleEndedIterator for SetMatchesIter<'a> { fn next_back(&mut self) -> Option { self.0.next_back().map(|pid| pid.as_usize()) } } impl<'a> core::iter::FusedIterator for SetMatchesIter<'a> {} impl core::fmt::Debug for RegexSet { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "RegexSet({:?})", self.patterns()) } }