diff options
Diffstat (limited to 'third_party/rust/regex/src/re_set.rs')
-rw-r--r-- | third_party/rust/regex/src/re_set.rs | 452 |
1 files changed, 452 insertions, 0 deletions
diff --git a/third_party/rust/regex/src/re_set.rs b/third_party/rust/regex/src/re_set.rs new file mode 100644 index 0000000000..fc2b61a8ba --- /dev/null +++ b/third_party/rust/regex/src/re_set.rs @@ -0,0 +1,452 @@ +macro_rules! define_set { + ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr, + $(#[$doc_regexset_example:meta])* ) => { + pub mod $name { + use std::fmt; + use std::iter; + use std::slice; + use std::vec; + + use error::Error; + use exec::Exec; + use re_builder::$builder_mod::RegexSetBuilder; + use re_trait::RegularExpression; + +/// Match multiple (possibly overlapping) regular expressions in a single scan. +/// +/// A regex set corresponds to the union of two or more regular expressions. +/// That is, a regex set will match text where at least one of its +/// constituent regular expressions matches. A regex set as its formulated here +/// provides a touch more power: it will also report *which* regular +/// expressions in the set match. Indeed, this is the key difference between +/// regex sets and a single `Regex` with many alternates, since only one +/// alternate can match at a time. +/// +/// For example, consider regular expressions to match email addresses and +/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a +/// regex set is constructed from those regexes, then searching the text +/// `foo@example.com` will report both regexes as matching. Of course, one +/// could accomplish this by compiling each regex on its own and doing two +/// searches over the text. The key advantage of using a regex set is that it +/// will report the matching regexes using a *single pass through the text*. +/// If one has hundreds or thousands of regexes to match repeatedly (like a URL +/// router for a complex web application or a user agent matcher), then a regex +/// set can realize huge performance gains. +/// +/// # Example +/// +/// This shows how the above two regexes (for matching email addresses and +/// domains) might work: +/// +$(#[$doc_regexset_example])* +/// +/// Note that it would be possible to adapt the above example to using `Regex` +/// with an expression like: +/// +/// ```ignore +/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net)) +/// ``` +/// +/// After a match, one could then inspect the capture groups to figure out +/// which alternates matched. The problem is that it is hard to make this +/// approach scale when there are many regexes since the overlap between each +/// alternate isn't always obvious to reason about. +/// +/// # Limitations +/// +/// Regex sets are limited to answering the following two questions: +/// +/// 1. Does any regex in the set match? +/// 2. If so, which regexes in the set match? +/// +/// As with the main `Regex` type, it is cheaper to ask (1) instead of (2) +/// since the matching engines can stop after the first match is found. +/// +/// Other features like finding the location of successive matches or their +/// sub-captures aren't supported. If you need this functionality, the +/// recommended approach is to compile each regex in the set independently and +/// selectively match them based on which regexes in the set matched. +/// +/// # Performance +/// +/// A `RegexSet` has the same performance characteristics as `Regex`. Namely, +/// search takes `O(mn)` time, where `m` is proportional to the size of the +/// regex set and `n` is proportional to the length of the search text. +#[derive(Clone)] +pub struct RegexSet(Exec); + +impl RegexSet { + /// Create a new regex set with the given regular expressions. + /// + /// This takes an iterator of `S`, where `S` is something that can produce + /// a `&str`. If any of the strings in the iterator are not valid regular + /// expressions, then an error is returned. + /// + /// # Example + /// + /// Create a new regex set from an iterator of strings: + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); + /// assert!(set.is_match("foo")); + /// ``` + pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error> + where S: AsRef<str>, I: IntoIterator<Item=S> { + RegexSetBuilder::new(exprs).build() + } + + /// Returns true if and only if one of the regexes in this set matches + /// the text given. + /// + /// This method should be preferred if you only need to test whether any + /// of the regexes in the set should match, but don't care about *which* + /// regexes matched. This is because the underlying matching engine will + /// quit immediately after seeing the first match instead of continuing to + /// find all matches. + /// + /// Note that as with searches using `Regex`, the expression is unanchored + /// by default. That is, if the regex does not start with `^` or `\A`, or + /// end with `$` or `\z`, then it is permitted to match anywhere in the + /// text. + /// + /// # Example + /// + /// Tests whether a set matches some text: + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); + /// assert!(set.is_match("foo")); + /// assert!(!set.is_match("☃")); + /// ``` + pub fn is_match(&self, text: $text_ty) -> bool { + self.is_match_at(text, 0) + } + + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool { + self.0.searcher().is_match_at($as_bytes(text), start) + } + + /// Returns the set of regular expressions that match in the given text. + /// + /// The set returned contains the index of each regular expression that + /// matches in the given text. The index is in correspondence with the + /// order of regular expressions given to `RegexSet`'s constructor. + /// + /// The set can also be used to iterate over the matched indices. + /// + /// Note that as with searches using `Regex`, the expression is unanchored + /// by default. That is, if the regex does not start with `^` or `\A`, or + /// end with `$` or `\z`, then it is permitted to match anywhere in the + /// text. + /// + /// # Example + /// + /// Tests which regular expressions match the given text: + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::new(&[ + /// r"\w+", + /// r"\d+", + /// r"\pL+", + /// r"foo", + /// r"bar", + /// r"barfoo", + /// r"foobar", + /// ]).unwrap(); + /// let matches: Vec<_> = set.matches("foobar").into_iter().collect(); + /// assert_eq!(matches, vec![0, 2, 3, 4, 6]); + /// + /// // You can also test whether a particular regex matched: + /// let matches = set.matches("foobar"); + /// assert!(!matches.matched(5)); + /// assert!(matches.matched(6)); + /// ``` + pub fn matches(&self, text: $text_ty) -> SetMatches { + let mut matches = vec![false; self.0.regex_strings().len()]; + let any = self.read_matches_at(&mut matches, text, 0); + SetMatches { + matched_any: any, + matches: matches, + } + } + + /// Returns the same as matches, but starts the search at the given + /// offset and stores the matches into the slice given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + /// + /// `matches` must have a length that is at least the number of regexes + /// in this set. + /// + /// This method returns true if and only if at least one member of + /// `matches` is true after executing the set against `text`. + #[doc(hidden)] + pub fn read_matches_at( + &self, + matches: &mut [bool], + text: $text_ty, + start: usize, + ) -> bool { + self.0.searcher().many_matches_at(matches, $as_bytes(text), start) + } + + /// Returns the total number of regular expressions in this set. + pub fn len(&self) -> usize { + self.0.regex_strings().len() + } + + /// Returns the patterns that this set will match on. + /// + /// This function can be used to determine the pattern for a match. The + /// slice returned has exactly as many patterns givens to this regex set, + /// and the order of the slice is the same as the order of the patterns + /// provided to the set. + /// + /// # Example + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::new(&[ + /// r"\w+", + /// r"\d+", + /// r"\pL+", + /// r"foo", + /// r"bar", + /// r"barfoo", + /// r"foobar", + /// ]).unwrap(); + /// let matches: Vec<_> = set + /// .matches("foobar") + /// .into_iter() + /// .map(|match_idx| &set.patterns()[match_idx]) + /// .collect(); + /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]); + /// ``` + pub fn patterns(&self) -> &[String] { + self.0.regex_strings() + } +} + +/// A set of matches returned by a regex set. +#[derive(Clone, Debug)] +pub struct SetMatches { + matched_any: bool, + matches: Vec<bool>, +} + +impl SetMatches { + /// Whether this set contains any matches. + pub fn matched_any(&self) -> bool { + self.matched_any + } + + /// Whether the regex at the given index matched. + /// + /// The index for a regex is determined by its insertion order upon the + /// initial construction of a `RegexSet`, starting at `0`. + /// + /// # Panics + /// + /// If `regex_index` is greater than or equal to `self.len()`. + pub fn matched(&self, regex_index: usize) -> bool { + self.matches[regex_index] + } + + /// The total number of regexes in the set that created these matches. + pub fn len(&self) -> usize { + self.matches.len() + } + + /// Returns an iterator over indexes in the regex that matched. + /// + /// This will always produces matches in ascending order of index, where + /// the index corresponds to the index of the regex that matched with + /// respect to its position when initially building the set. + pub fn iter(&self) -> SetMatchesIter { + SetMatchesIter((&*self.matches).into_iter().enumerate()) + } +} + +impl IntoIterator for SetMatches { + type IntoIter = SetMatchesIntoIter; + type Item = usize; + + fn into_iter(self) -> Self::IntoIter { + SetMatchesIntoIter(self.matches.into_iter().enumerate()) + } +} + +impl<'a> IntoIterator for &'a SetMatches { + type IntoIter = SetMatchesIter<'a>; + type Item = usize; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +/// An owned iterator over the set of matches from a regex set. +/// +/// This will always produces matches in ascending order of index, where the +/// index corresponds to the index of the regex that matched with respect to +/// its position when initially building the set. +pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>); + +impl Iterator for SetMatchesIntoIter { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + loop { + match self.0.next() { + None => return None, + Some((_, false)) => {} + Some((i, true)) => return Some(i), + } + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } +} + +impl DoubleEndedIterator for SetMatchesIntoIter { + fn next_back(&mut self) -> Option<usize> { + loop { + match self.0.next_back() { + None => return None, + Some((_, false)) => {} + Some((i, true)) => return Some(i), + } + } + } +} + +/// A borrowed iterator over the set of matches from a regex set. +/// +/// The lifetime `'a` refers to the lifetime of a `SetMatches` value. +/// +/// This will always produces matches in ascending order of index, where the +/// index corresponds to the index of the regex that matched with respect to +/// its position when initially building the set. +#[derive(Clone)] +pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>); + +impl<'a> Iterator for SetMatchesIter<'a> { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + loop { + match self.0.next() { + None => return None, + Some((_, &false)) => {} + Some((i, &true)) => return Some(i), + } + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } +} + +impl<'a> DoubleEndedIterator for SetMatchesIter<'a> { + fn next_back(&mut self) -> Option<usize> { + loop { + match self.0.next_back() { + None => return None, + Some((_, &false)) => {} + Some((i, &true)) => return Some(i), + } + } + } +} + +#[doc(hidden)] +impl From<Exec> for RegexSet { + fn from(exec: Exec) -> Self { + RegexSet(exec) + } +} + +impl fmt::Debug for RegexSet { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "RegexSet({:?})", self.0.regex_strings()) + } +} + +#[allow(dead_code)] fn as_bytes_str(text: &str) -> &[u8] { text.as_bytes() } +#[allow(dead_code)] fn as_bytes_bytes(text: &[u8]) -> &[u8] { text } + } + } +} + +define_set! { + unicode, + set_unicode, + &str, + as_bytes_str, +/// ```rust +/// # use regex::RegexSet; +/// let set = RegexSet::new(&[ +/// r"[a-z]+@[a-z]+\.(com|org|net)", +/// r"[a-z]+\.(com|org|net)", +/// ]).unwrap(); +/// +/// // Ask whether any regexes in the set match. +/// assert!(set.is_match("foo@example.com")); +/// +/// // Identify which regexes in the set match. +/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect(); +/// assert_eq!(vec![0, 1], matches); +/// +/// // Try again, but with text that only matches one of the regexes. +/// let matches: Vec<_> = set.matches("example.com").into_iter().collect(); +/// assert_eq!(vec![1], matches); +/// +/// // Try again, but with text that doesn't match any regex in the set. +/// let matches: Vec<_> = set.matches("example").into_iter().collect(); +/// assert!(matches.is_empty()); +/// ``` +} + +define_set! { + bytes, + set_bytes, + &[u8], + as_bytes_bytes, +/// ```rust +/// # use regex::bytes::RegexSet; +/// let set = RegexSet::new(&[ +/// r"[a-z]+@[a-z]+\.(com|org|net)", +/// r"[a-z]+\.(com|org|net)", +/// ]).unwrap(); +/// +/// // Ask whether any regexes in the set match. +/// assert!(set.is_match(b"foo@example.com")); +/// +/// // Identify which regexes in the set match. +/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect(); +/// assert_eq!(vec![0, 1], matches); +/// +/// // Try again, but with text that only matches one of the regexes. +/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect(); +/// assert_eq!(vec![1], matches); +/// +/// // Try again, but with text that doesn't match any regex in the set. +/// let matches: Vec<_> = set.matches(b"example").into_iter().collect(); +/// assert!(matches.is_empty()); +/// ``` +} |