diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-18 02:49:50 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-18 02:49:50 +0000 |
commit | 9835e2ae736235810b4ea1c162ca5e65c547e770 (patch) | |
tree | 3fcebf40ed70e581d776a8a4c65923e8ec20e026 /vendor/regex-syntax/src/hir | |
parent | Releasing progress-linux version 1.70.0+dfsg2-1~progress7.99u1. (diff) | |
download | rustc-9835e2ae736235810b4ea1c162ca5e65c547e770.tar.xz rustc-9835e2ae736235810b4ea1c162ca5e65c547e770.zip |
Merging upstream version 1.71.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-syntax/src/hir')
-rw-r--r-- | vendor/regex-syntax/src/hir/interval.rs | 83 | ||||
-rw-r--r-- | vendor/regex-syntax/src/hir/literal.rs | 3165 | ||||
-rw-r--r-- | vendor/regex-syntax/src/hir/literal/mod.rs | 1686 | ||||
-rw-r--r-- | vendor/regex-syntax/src/hir/mod.rs | 3021 | ||||
-rw-r--r-- | vendor/regex-syntax/src/hir/print.rs | 394 | ||||
-rw-r--r-- | vendor/regex-syntax/src/hir/translate.rs | 1920 | ||||
-rw-r--r-- | vendor/regex-syntax/src/hir/visitor.rs | 19 |
7 files changed, 6925 insertions, 3363 deletions
diff --git a/vendor/regex-syntax/src/hir/interval.rs b/vendor/regex-syntax/src/hir/interval.rs index 56698c53a..e063390a8 100644 --- a/vendor/regex-syntax/src/hir/interval.rs +++ b/vendor/regex-syntax/src/hir/interval.rs @@ -1,8 +1,6 @@ -use std::char; -use std::cmp; -use std::fmt::Debug; -use std::slice; -use std::u8; +use core::{char, cmp, fmt::Debug, slice}; + +use alloc::vec::Vec; use crate::unicode; @@ -32,9 +30,38 @@ use crate::unicode; // // Tests on this are relegated to the public API of HIR in src/hir.rs. -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Debug)] pub struct IntervalSet<I> { + /// A sorted set of non-overlapping ranges. ranges: Vec<I>, + /// While not required at all for correctness, we keep track of whether an + /// interval set has been case folded or not. This helps us avoid doing + /// redundant work if, for example, a set has already been cased folded. + /// And note that whether a set is folded or not is preserved through + /// all of the pairwise set operations. That is, if both interval sets + /// have been case folded, then any of difference, union, intersection or + /// symmetric difference all produce a case folded set. + /// + /// Note that when this is true, it *must* be the case that the set is case + /// folded. But when it's false, the set *may* be case folded. In other + /// words, we only set this to true when we know it to be case, but we're + /// okay with it being false if it would otherwise be costly to determine + /// whether it should be true. This means code cannot assume that a false + /// value necessarily indicates that the set is not case folded. + /// + /// Bottom line: this is a performance optimization. + folded: bool, +} + +impl<I: Interval> Eq for IntervalSet<I> {} + +// We implement PartialEq manually so that we don't consider the set's internal +// 'folded' property to be part of its identity. The 'folded' property is +// strictly an optimization. 
+impl<I: Interval> PartialEq for IntervalSet<I> { + fn eq(&self, other: &IntervalSet<I>) -> bool { + self.ranges.eq(&other.ranges) + } } impl<I: Interval> IntervalSet<I> { @@ -44,7 +71,10 @@ impl<I: Interval> IntervalSet<I> { /// The given ranges do not need to be in any specific order, and ranges /// may overlap. pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> { - let mut set = IntervalSet { ranges: intervals.into_iter().collect() }; + let ranges: Vec<I> = intervals.into_iter().collect(); + // An empty set is case folded. + let folded = ranges.is_empty(); + let mut set = IntervalSet { ranges, folded }; set.canonicalize(); set } @@ -55,6 +85,10 @@ impl<I: Interval> IntervalSet<I> { // it preserves canonicalization. self.ranges.push(interval); self.canonicalize(); + // We don't know whether the new interval added here is considered + // case folded, so we conservatively assume that the entire set is + // no longer case folded if it was previously. + self.folded = false; } /// Return an iterator over all intervals in this set. @@ -79,6 +113,9 @@ impl<I: Interval> IntervalSet<I> { /// This returns an error if the necessary case mapping data is not /// available. pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> { + if self.folded { + return Ok(()); + } let len = self.ranges.len(); for i in 0..len { let range = self.ranges[i]; @@ -88,14 +125,19 @@ impl<I: Interval> IntervalSet<I> { } } self.canonicalize(); + self.folded = true; Ok(()) } /// Union this set with the given set, in place. pub fn union(&mut self, other: &IntervalSet<I>) { + if other.ranges.is_empty() || self.ranges == other.ranges { + return; + } // This could almost certainly be done more efficiently. self.ranges.extend(&other.ranges); self.canonicalize(); + self.folded = self.folded && other.folded; } /// Intersect this set with the given set, in place. 
@@ -105,6 +147,8 @@ impl<I: Interval> IntervalSet<I> { } if other.ranges.is_empty() { self.ranges.clear(); + // An empty set is case folded. + self.folded = true; return; } @@ -134,6 +178,7 @@ impl<I: Interval> IntervalSet<I> { } } self.ranges.drain(..drain_end); + self.folded = self.folded && other.folded; } /// Subtract the given set from this set, in place. @@ -226,6 +271,7 @@ impl<I: Interval> IntervalSet<I> { a += 1; } self.ranges.drain(..drain_end); + self.folded = self.folded && other.folded; } /// Compute the symmetric difference of the two sets, in place. @@ -251,6 +297,8 @@ impl<I: Interval> IntervalSet<I> { if self.ranges.is_empty() { let (min, max) = (I::Bound::min_value(), I::Bound::max_value()); self.ranges.push(I::create(min, max)); + // The set containing everything must case folded. + self.folded = true; return; } @@ -276,6 +324,19 @@ impl<I: Interval> IntervalSet<I> { self.ranges.push(I::create(lower, I::Bound::max_value())); } self.ranges.drain(..drain_end); + // We don't need to update whether this set is folded or not, because + // it is conservatively preserved through negation. Namely, if a set + // is not folded, then it is possible that its negation is folded, for + // example, [^☃]. But we're fine with assuming that the set is not + // folded in that case. (`folded` permits false negatives but not false + // positives.) + // + // But what about when a set is folded, is its negation also + // necessarily folded? Yes. Because if a set is folded, then for every + // character in the set, it necessarily included its equivalence class + // of case folded characters. Negating it in turn means that all + // equivalence classes in the set are negated, and any equivalence + // class that was previously not in the set is now entirely in the set. } /// Converts this set into a canonical ordering. 
@@ -481,7 +542,7 @@ impl Bound for u8 { u8::MAX } fn as_u32(self) -> u32 { - self as u32 + u32::from(self) } fn increment(self) -> Self { self.checked_add(1).unwrap() @@ -499,20 +560,20 @@ impl Bound for char { '\u{10FFFF}' } fn as_u32(self) -> u32 { - self as u32 + u32::from(self) } fn increment(self) -> Self { match self { '\u{D7FF}' => '\u{E000}', - c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(), + c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(), } } fn decrement(self) -> Self { match self { '\u{E000}' => '\u{D7FF}', - c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(), + c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(), } } } diff --git a/vendor/regex-syntax/src/hir/literal.rs b/vendor/regex-syntax/src/hir/literal.rs new file mode 100644 index 000000000..bd3a2d143 --- /dev/null +++ b/vendor/regex-syntax/src/hir/literal.rs @@ -0,0 +1,3165 @@ +/*! +Provides literal extraction from `Hir` expressions. + +An [`Extractor`] pulls literals out of [`Hir`] expressions and returns a +[`Seq`] of [`Literal`]s. + +The purpose of literal extraction is generally to provide avenues for +optimizing regex searches. The main idea is that substring searches can be an +order of magnitude faster than a regex search. Therefore, if one can execute +a substring search to find candidate match locations and only run the regex +search at those locations, then it is possible for huge improvements in +performance to be realized. + +With that said, literal optimizations are generally a black art because even +though substring search is generally faster, if the number of candidates +produced is high, then it can create a lot of overhead by ping-ponging between +the substring search and the regex search. + +Here are some heuristics that might be used to help increase the chances of +effective literal optimizations: + +* Stick to small [`Seq`]s. 
If you search for too many literals, it's likely +to lead to substring search that is only a little faster than a regex search, +and thus the overhead of using literal optimizations in the first place might +make things slower overall. +* The literals in your [`Seq`] shouldn't be too short. In general, longer is +better. A sequence corresponding to single bytes that occur frequently in the +haystack, for example, is probably a bad literal optimization because it's +likely to produce many false positive candidates. Longer literals are less +likely to match, and thus probably produce fewer false positives. +* If it's possible to estimate the approximate frequency of each byte according +to some pre-computed background distribution, it is possible to compute a score +of how "good" a `Seq` is. If a `Seq` isn't good enough, you might consider +skipping the literal optimization and just use the regex engine. + +(It should be noted that there are always pathological cases that can make +any kind of literal optimization be a net slower result. This is why it +might be a good idea to be conservative, or to even provide a means for +literal optimizations to be dynamically disabled if they are determined to be +ineffective according to some measure.) + +You're encouraged to explore the methods on [`Seq`], which permit shrinking +the size of sequences in a preference-order preserving fashion. + +Finally, note that it isn't strictly necessary to use an [`Extractor`]. Namely, +an `Extractor` only uses public APIs of the [`Seq`] and [`Literal`] types, +so it is possible to implement your own extractor. For example, for n-grams +or "inner" literals (i.e., not prefix or suffix literals). The `Extractor` +is mostly responsible for the case analysis over `Hir` expressions. Much of +the "trickier" parts are how to combine literal sequences, and that is all +implemented on [`Seq`].
+*/ + +use core::{cmp, mem}; + +use alloc::{vec, vec::Vec}; + +use crate::hir::{self, Hir}; + +/// Extracts prefix or suffix literal sequences from [`Hir`] expressions. +/// +/// Literal extraction is based on the following observations: +/// +/// * Many regexes start with one or a small number of literals. +/// * Substring search for literals is often much faster (sometimes by an order +/// of magnitude) than a regex search. +/// +/// Thus, in many cases, one can search for literals to find candidate starting +/// locations of a match, and then only run the full regex engine at each such +/// location instead of over the full haystack. +/// +/// The main downside of literal extraction is that it can wind up causing a +/// search to be slower overall. For example, if there are many matches or if +/// there are many candidates that don't ultimately lead to a match, then a +/// lot of overhead will be spent in shuffling back-and-forth between substring +/// search and the regex engine. This is the fundamental reason why literal +/// optimizations for regex patterns is sometimes considered a "black art." +/// +/// # Look-around assertions +/// +/// Literal extraction treats all look-around assertions as-if they match every +/// empty string. So for example, the regex `\bquux\b` will yield a sequence +/// containing a single exact literal `quux`. However, not all occurrences +/// of `quux` correspond to a match of the regex. For example, `\bquux\b` +/// does not match `ZquuxZ` anywhere because `quux` does not fall on a word +/// boundary. +/// +/// In effect, if your regex contains look-around assertions, then a match of +/// an exact literal does not necessarily mean the regex overall matches. So +/// you may still need to run the regex engine in such cases to confirm the +/// match.
+/// +/// The precise guarantee you get from a literal sequence is: if every literal +/// in the sequence is exact and the original regex contains zero look-around +/// assertions, then a preference-order multi-substring search of those +/// literals will precisely match a preference-order search of the original +/// regex. +/// +/// # Example +/// +/// This shows how to extract prefixes: +/// +/// ``` +/// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; +/// +/// let hir = parse(r"(a|b|c)(x|y|z)[A-Z]+foo")?; +/// let got = Extractor::new().extract(&hir); +/// // All literals returned are "inexact" because none of them reach the +/// // match state. +/// let expected = Seq::from_iter([ +/// Literal::inexact("ax"), +/// Literal::inexact("ay"), +/// Literal::inexact("az"), +/// Literal::inexact("bx"), +/// Literal::inexact("by"), +/// Literal::inexact("bz"), +/// Literal::inexact("cx"), +/// Literal::inexact("cy"), +/// Literal::inexact("cz"), +/// ]); +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// This shows how to extract suffixes: +/// +/// ``` +/// use regex_syntax::{ +/// hir::literal::{Extractor, ExtractKind, Literal, Seq}, +/// parse, +/// }; +/// +/// let hir = parse(r"foo|[A-Z]+bar")?; +/// let got = Extractor::new().kind(ExtractKind::Suffix).extract(&hir); +/// // Since 'foo' gets to a match state, it is considered exact. But 'bar' +/// // does not because of the '[A-Z]+', and thus is marked inexact. +/// let expected = Seq::from_iter([ +/// Literal::exact("foo"), +/// Literal::inexact("bar"), +/// ]); +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Extractor { + kind: ExtractKind, + limit_class: usize, + limit_repeat: usize, + limit_literal_len: usize, + limit_total: usize, +} + +impl Extractor { + /// Create a new extractor with a default configuration. 
+ /// + /// The extractor can be optionally configured before calling + /// [`Extractor::extract`] to get a literal sequence. + pub fn new() -> Extractor { + Extractor { + kind: ExtractKind::Prefix, + limit_class: 10, + limit_repeat: 10, + limit_literal_len: 100, + limit_total: 250, + } + } + + /// Execute the extractor and return a sequence of literals. + pub fn extract(&self, hir: &Hir) -> Seq { + use crate::hir::HirKind::*; + + match *hir.kind() { + Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), + Literal(hir::Literal(ref bytes)) => { + let mut seq = + Seq::singleton(self::Literal::exact(bytes.to_vec())); + self.enforce_literal_len(&mut seq); + seq + } + Class(hir::Class::Unicode(ref cls)) => { + self.extract_class_unicode(cls) + } + Class(hir::Class::Bytes(ref cls)) => self.extract_class_bytes(cls), + Repetition(ref rep) => self.extract_repetition(rep), + Capture(hir::Capture { ref sub, .. }) => self.extract(sub), + Concat(ref hirs) => match self.kind { + ExtractKind::Prefix => self.extract_concat(hirs.iter()), + ExtractKind::Suffix => self.extract_concat(hirs.iter().rev()), + }, + Alternation(ref hirs) => { + // Unlike concat, we always union starting from the beginning, + // since the beginning corresponds to the highest preference, + // which doesn't change based on forwards vs reverse. + self.extract_alternation(hirs.iter()) + } + } + } + + /// Set the kind of literal sequence to extract from an [`Hir`] expression. + /// + /// The default is to extract prefixes, but suffixes can be selected + /// instead. The contract for prefixes is that every match of the + /// corresponding `Hir` must start with one of the literals in the sequence + /// returned. Moreover, the _order_ of the sequence returned corresponds to + /// the preference order. + /// + /// Suffixes satisfy a similar contract in that every match of the + /// corresponding `Hir` must end with one of the literals in the sequence + /// returned. 
However, there is no guarantee that the literals are in + /// preference order. + /// + /// Remember that a sequence can be infinite. For example, unless the + /// limits are configured to be impractically large, attempting to extract + /// prefixes (or suffixes) for the pattern `[A-Z]` will return an infinite + /// sequence. Generally speaking, if the sequence returned is infinite, + /// then it is presumed to be unwise to do prefix (or suffix) optimizations + /// for the pattern. + pub fn kind(&mut self, kind: ExtractKind) -> &mut Extractor { + self.kind = kind; + self + } + + /// Configure a limit on the length of the sequence that is permitted for + /// a character class. If a character class exceeds this limit, then the + /// sequence returned for it is infinite. + /// + /// This prevents classes like `[A-Z]` or `\pL` from getting turned into + /// huge and likely unproductive sequences of literals. + /// + /// # Example + /// + /// This example shows how this limit can be lowered to decrease the tolerance + /// for character classes being turned into literal sequences. + /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Seq}, parse}; + /// + /// let hir = parse(r"[0-9]")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::new([ + /// "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", + /// ]); + /// assert_eq!(expected, got); + /// + /// // Now let's shrink the limit and see how that changes things. + /// let got = Extractor::new().limit_class(4).extract(&hir); + /// let expected = Seq::infinite(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn limit_class(&mut self, limit: usize) -> &mut Extractor { + self.limit_class = limit; + self + } + + /// Configure a limit on the total number of repetitions that is permitted + /// before literal extraction is stopped. 
+ /// + /// This is useful for limiting things like `(abcde){50}`, or more + /// insidiously, `(?:){1000000000}`. This limit prevents any one single + /// repetition from adding too much to a literal sequence. + /// + /// With this limit set, repetitions that exceed it will be stopped and any + /// literals extracted up to that point will be made inexact. + /// + /// # Example + /// + /// This shows how to decrease the limit and compares it with the default. + /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; + /// + /// let hir = parse(r"(abc){8}")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]); + /// assert_eq!(expected, got); + /// + /// // Now let's shrink the limit and see how that changes things. + /// let got = Extractor::new().limit_repeat(4).extract(&hir); + /// let expected = Seq::from_iter([ + /// Literal::inexact("abcabcabcabc"), + /// ]); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn limit_repeat(&mut self, limit: usize) -> &mut Extractor { + self.limit_repeat = limit; + self + } + + /// Configure a limit on the maximum length of any literal in a sequence. + /// + /// This is useful for limiting things like `(abcde){5}{5}{5}{5}`. While + /// each repetition or literal in that regex is small, when all the + /// repetitions are applied, one ends up with a literal of length `5^4 = + /// 625`. + /// + /// With this limit set, literals that exceed it will be made inexact and + /// thus prevented from growing. + /// + /// # Example + /// + /// This shows how to decrease the limit and compares it with the default. 
+ /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; + /// + /// let hir = parse(r"(abc){2}{2}{2}")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]); + /// assert_eq!(expected, got); + /// + /// // Now let's shrink the limit and see how that changes things. + /// let got = Extractor::new().limit_literal_len(14).extract(&hir); + /// let expected = Seq::from_iter([ + /// Literal::inexact("abcabcabcabcab"), + /// ]); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn limit_literal_len(&mut self, limit: usize) -> &mut Extractor { + self.limit_literal_len = limit; + self + } + + /// Configure a limit on the total number of literals that will be + /// returned. + /// + /// This is useful as a practical measure for avoiding the creation of + /// large sequences of literals. While the extractor will automatically + /// handle local creations of large sequences (for example, `[A-Z]` yields + /// an infinite sequence by default), large sequences can be created + /// through non-local means as well. + /// + /// For example, `[ab]{3}{3}` would yield a sequence of length `512 = 2^9` + /// despite each of the repetitions being small on their own. This limit + /// thus represents a "catch all" for avoiding locally small sequences from + /// combining into large sequences. + /// + /// # Example + /// + /// This example shows how reducing the limit will change the literal + /// sequence returned. 
+ /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; + /// + /// let hir = parse(r"[ab]{2}{2}")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::new([ + /// "aaaa", "aaab", "aaba", "aabb", + /// "abaa", "abab", "abba", "abbb", + /// "baaa", "baab", "baba", "babb", + /// "bbaa", "bbab", "bbba", "bbbb", + /// ]); + /// assert_eq!(expected, got); + /// + /// // The default limit is not too big, but big enough to extract all + /// // literals from '[ab]{2}{2}'. If we shrink the limit to less than 16, + /// // then we'll get a truncated set. Notice that it returns a sequence of + /// // length 4 even though our limit was 10. This is because the sequence + /// // is difficult to increase without blowing the limit. Notice also + /// // that every literal in the sequence is now inexact because they were + /// // stripped of some suffix. + /// let got = Extractor::new().limit_total(10).extract(&hir); + /// let expected = Seq::from_iter([ + /// Literal::inexact("aa"), + /// Literal::inexact("ab"), + /// Literal::inexact("ba"), + /// Literal::inexact("bb"), + /// ]); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn limit_total(&mut self, limit: usize) -> &mut Extractor { + self.limit_total = limit; + self + } + + /// Extract a sequence from the given concatenation. Sequences from each of + /// the child HIR expressions are combined via cross product. + /// + /// This short circuits once the cross product turns into a sequence + /// containing only inexact literals. + fn extract_concat<'a, I: Iterator<Item = &'a Hir>>(&self, it: I) -> Seq { + let mut seq = Seq::singleton(self::Literal::exact(vec![])); + for hir in it { + // If every element in the sequence is inexact, then a cross + // product will always be a no-op. Thus, there is nothing else we + // can add to it and can quit early. Note that this also includes + // infinite sequences. 
+ if seq.is_inexact() { + break; + } + // Note that 'cross' also dispatches based on whether we're + // extracting prefixes or suffixes. + seq = self.cross(seq, &mut self.extract(hir)); + } + seq + } + + /// Extract a sequence from the given alternation. + /// + /// This short circuits once the union turns into an infinite sequence. + fn extract_alternation<'a, I: Iterator<Item = &'a Hir>>( + &self, + it: I, + ) -> Seq { + let mut seq = Seq::empty(); + for hir in it { + // Once our 'seq' is infinite, every subsequent union + // operation on it will itself always result in an + // infinite sequence. Thus, it can never change and we can + // short-circuit. + if !seq.is_finite() { + break; + } + seq = self.union(seq, &mut self.extract(hir)); + } + seq + } + + /// Extract a sequence of literals from the given repetition. We do our + /// best, Some examples: + /// + /// 'a*' => [inexact(a), exact("")] + /// 'a*?' => [exact(""), inexact(a)] + /// 'a+' => [inexact(a)] + /// 'a{3}' => [exact(aaa)] + /// 'a{3,5} => [inexact(aaa)] + /// + /// The key here really is making sure we get the 'inexact' vs 'exact' + /// attributes correct on each of the literals we add. For example, the + /// fact that 'a*' gives us an inexact 'a' and an exact empty string means + /// that a regex like 'ab*c' will result in [inexact(ab), exact(ac)] + /// literals being extracted, which might actually be a better prefilter + /// than just 'a'. + fn extract_repetition(&self, rep: &hir::Repetition) -> Seq { + let mut subseq = self.extract(&rep.sub); + match *rep { + hir::Repetition { min: 0, max, greedy, .. } => { + // When 'max=1', we can retain exactness, since 'a?' is + // equivalent to 'a|'. Similarly below, 'a??' is equivalent to + // '|a'. + if max != Some(1) { + subseq.make_inexact(); + } + let mut empty = Seq::singleton(Literal::exact(vec![])); + if !greedy { + mem::swap(&mut subseq, &mut empty); + } + self.union(subseq, &mut empty) + } + hir::Repetition { min, max: Some(max), .. 
} if min == max => { + assert!(min > 0); // handled above + let limit = + u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); + let mut seq = Seq::singleton(Literal::exact(vec![])); + for _ in 0..cmp::min(min, limit) { + if seq.is_inexact() { + break; + } + seq = self.cross(seq, &mut subseq.clone()); + } + if usize::try_from(min).is_err() || min > limit { + seq.make_inexact(); + } + seq + } + hir::Repetition { min, max: Some(max), .. } if min < max => { + assert!(min > 0); // handled above + let limit = + u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); + let mut seq = Seq::singleton(Literal::exact(vec![])); + for _ in 0..cmp::min(min, limit) { + if seq.is_inexact() { + break; + } + seq = self.cross(seq, &mut subseq.clone()); + } + seq.make_inexact(); + seq + } + hir::Repetition { .. } => { + subseq.make_inexact(); + subseq + } + } + } + + /// Convert the given Unicode class into a sequence of literals if the + /// class is small enough. If the class is too big, return an infinite + /// sequence. + fn extract_class_unicode(&self, cls: &hir::ClassUnicode) -> Seq { + if self.class_over_limit_unicode(cls) { + return Seq::infinite(); + } + let mut seq = Seq::empty(); + for r in cls.iter() { + for ch in r.start()..=r.end() { + seq.push(Literal::from(ch)); + } + } + self.enforce_literal_len(&mut seq); + seq + } + + /// Convert the given byte class into a sequence of literals if the class + /// is small enough. If the class is too big, return an infinite sequence. + fn extract_class_bytes(&self, cls: &hir::ClassBytes) -> Seq { + if self.class_over_limit_bytes(cls) { + return Seq::infinite(); + } + let mut seq = Seq::empty(); + for r in cls.iter() { + for b in r.start()..=r.end() { + seq.push(Literal::from(b)); + } + } + self.enforce_literal_len(&mut seq); + seq + } + + /// Returns true if the given Unicode class exceeds the configured limits + /// on this extractor. 
+ fn class_over_limit_unicode(&self, cls: &hir::ClassUnicode) -> bool { + let mut count = 0; + for r in cls.iter() { + if count > self.limit_class { + return true; + } + count += r.len(); + } + count > self.limit_class + } + + /// Returns true if the given byte class exceeds the configured limits on + /// this extractor. + fn class_over_limit_bytes(&self, cls: &hir::ClassBytes) -> bool { + let mut count = 0; + for r in cls.iter() { + if count > self.limit_class { + return true; + } + count += r.len(); + } + count > self.limit_class + } + + /// Compute the cross product of the two sequences if the result would be + /// within configured limits. Otherwise, make `seq2` infinite and cross the + /// infinite sequence with `seq1`. + fn cross(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq { + if seq1.max_cross_len(seq2).map_or(false, |len| len > self.limit_total) + { + seq2.make_infinite(); + } + if let ExtractKind::Suffix = self.kind { + seq1.cross_reverse(seq2); + } else { + seq1.cross_forward(seq2); + } + assert!(seq1.len().map_or(true, |x| x <= self.limit_total)); + self.enforce_literal_len(&mut seq1); + seq1 + } + + /// Union the two sequences if the result would be within configured + /// limits. Otherwise, make `seq2` infinite and union the infinite sequence + /// with `seq1`. + fn union(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq { + if seq1.max_union_len(seq2).map_or(false, |len| len > self.limit_total) + { + // We try to trim our literal sequences to see if we can make + // room for more literals. The idea is that we'd rather trim down + // literals already in our sequence if it means we can add a few + // more and retain a finite sequence. Otherwise, we'll union with + // an infinite sequence and that infects everything and effectively + // stops literal extraction in its tracks. + // + // We do we keep 4 bytes here? Well, it's a bit of an abstraction + // leakage. 
Downstream, the literals may wind up getting fed to + // the Teddy algorithm, which supports searching literals up to + // length 4. So that's why we pick that number here. Arguably this + // should be a tuneable parameter, but it seems a little tricky to + // describe. And I'm still unsure if this is the right way to go + // about culling literal sequences. + match self.kind { + ExtractKind::Prefix => { + seq1.keep_first_bytes(4); + seq2.keep_first_bytes(4); + } + ExtractKind::Suffix => { + seq1.keep_last_bytes(4); + seq2.keep_last_bytes(4); + } + } + seq1.dedup(); + seq2.dedup(); + if seq1 + .max_union_len(seq2) + .map_or(false, |len| len > self.limit_total) + { + seq2.make_infinite(); + } + } + seq1.union(seq2); + assert!(seq1.len().map_or(true, |x| x <= self.limit_total)); + seq1 + } + + /// Applies the literal length limit to the given sequence. If none of the + /// literals in the sequence exceed the limit, then this is a no-op. + fn enforce_literal_len(&self, seq: &mut Seq) { + let len = self.limit_literal_len; + match self.kind { + ExtractKind::Prefix => seq.keep_first_bytes(len), + ExtractKind::Suffix => seq.keep_last_bytes(len), + } + } +} + +impl Default for Extractor { + fn default() -> Extractor { + Extractor::new() + } +} + +/// The kind of literals to extract from an [`Hir`] expression. +/// +/// The default extraction kind is `Prefix`. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum ExtractKind { + /// Extracts only prefix literals from a regex. + Prefix, + /// Extracts only suffix literals from a regex. + /// + /// Note that the sequence returned by suffix literals currently may + /// not correctly represent leftmost-first or "preference" order match + /// semantics. + Suffix, +} + +impl ExtractKind { + /// Returns true if this kind is the `Prefix` variant. + pub fn is_prefix(&self) -> bool { + matches!(*self, ExtractKind::Prefix) + } + + /// Returns true if this kind is the `Suffix` variant. 
+ pub fn is_suffix(&self) -> bool { + matches!(*self, ExtractKind::Suffix) + } +} + +impl Default for ExtractKind { + fn default() -> ExtractKind { + ExtractKind::Prefix + } +} + +/// A sequence of literals. +/// +/// A `Seq` is very much like a set in that it represents a union of its +/// members. That is, it corresponds to a set of literals where at least one +/// must match in order for a particular [`Hir`] expression to match. (Whether +/// this corresponds to the entire `Hir` expression, a prefix of it or a suffix +/// of it depends on how the `Seq` was extracted from the `Hir`.) +/// +/// It is also unlike a set in that multiple identical literals may appear, +/// and that the order of the literals in the `Seq` matters. For example, if +/// the sequence is `[sam, samwise]` and leftmost-first matching is used, then +/// `samwise` can never match and the sequence is equivalent to `[sam]`. +/// +/// # States of a sequence +/// +/// A `Seq` has a few different logical states to consider: +/// +/// * The sequence can represent "any" literal. When this happens, the set does +/// not have a finite size. The purpose of this state is to inhibit callers +/// from making assumptions about what literals are required in order to match +/// a particular [`Hir`] expression. Generally speaking, when a set is in this +/// state, literal optimizations are inhibited. A good example of a regex that +/// will cause this sort of set to appear is `[A-Za-z]`. The character class +/// is just too big (and also too narrow) to be usefully expanded into 52 +/// different literals. (Note that the decision for when a seq should become +/// infinite is determined by the caller. A seq itself has no hard-coded +/// limits.) +/// * The sequence can be empty, in which case, it is an affirmative statement +/// that there are no literals that can match the corresponding `Hir`. +/// Consequently, the `Hir` never matches any input. For example, `[a&&b]`.
+/// * The sequence can be non-empty, in which case, at least one of the +/// literals must match in order for the corresponding `Hir` to match. +/// +/// # Example +/// +/// This example shows how literal sequences can be simplified by stripping +/// suffixes and minimizing while maintaining preference order. +/// +/// ``` +/// use regex_syntax::hir::literal::{Literal, Seq}; +/// +/// let mut seq = Seq::new(&[ +/// "farm", +/// "appliance", +/// "faraway", +/// "apple", +/// "fare", +/// "gap", +/// "applicant", +/// "applaud", +/// ]); +/// seq.keep_first_bytes(3); +/// seq.minimize_by_preference(); +/// // Notice that 'far' comes before 'app', which matches the order in the +/// // original sequence. This guarantees that leftmost-first semantics are +/// // not altered by simplifying the set. +/// let expected = Seq::from_iter([ +/// Literal::inexact("far"), +/// Literal::inexact("app"), +/// Literal::exact("gap"), +/// ]); +/// assert_eq!(expected, seq); +/// ``` +#[derive(Clone, Eq, PartialEq)] +pub struct Seq { + /// The members of this seq. + /// + /// When `None`, the seq represents all possible literals. That is, it + /// prevents one from making assumptions about specific literals in the + /// seq, and forces one to treat it as if any literal might be in the seq. + /// + /// Note that `Some(vec![])` is valid and corresponds to the empty seq of + /// literals, i.e., a regex that can never match. For example, `[a&&b]`. + /// It is distinct from `Some(vec![""])`, which corresponds to the seq + /// containing an empty string, which matches at every position. + literals: Option<Vec<Literal>>, +} + +impl Seq { + /// Returns an empty sequence. + /// + /// An empty sequence matches zero literals, and thus corresponds to a + /// regex that itself can never match. + #[inline] + pub fn empty() -> Seq { + Seq { literals: Some(vec![]) } + } + + /// Returns a sequence of literals without a finite size and may contain + /// any literal. 
+ /// + /// A sequence without finite size does not reveal anything about the + /// characteristics of the literals in its set. There are no fixed prefixes + /// or suffixes, nor are lower or upper bounds on the length of the literals + /// in the set known. + /// + /// This is useful to represent constructs in a regex that are "too big" + /// to usefully represent as a sequence of literals. For example, `[A-Za-z]`. + /// When sequences get too big, they lose their discriminating nature and + /// are more likely to produce false positives, which in turn makes them + /// less likely to speed up searches. + /// + /// More pragmatically, for many regexes, enumerating all possible literals + /// is itself not possible or might otherwise use too many resources. So + /// constraining the size of sets during extraction is a practical trade + /// off to make. + #[inline] + pub fn infinite() -> Seq { + Seq { literals: None } + } + + /// Returns a sequence containing a single literal. + #[inline] + pub fn singleton(lit: Literal) -> Seq { + Seq { literals: Some(vec![lit]) } + } + + /// Returns a sequence of exact literals from the given byte strings. + #[inline] + pub fn new<I, B>(it: I) -> Seq + where + I: IntoIterator<Item = B>, + B: AsRef<[u8]>, + { + it.into_iter().map(|b| Literal::exact(b.as_ref())).collect() + } + + /// If this is a finite sequence, return its members as a slice of + /// literals. + /// + /// The slice returned may be empty, in which case, there are no literals + /// that can match this sequence. + #[inline] + pub fn literals(&self) -> Option<&[Literal]> { + self.literals.as_deref() + } + + /// Push a literal to the end of this sequence. + /// + /// If this sequence is not finite, then this is a no-op. + /// + /// Similarly, if the most recently added item of this sequence is + /// equivalent to the literal given, then it is not added. This reflects + /// a `Seq`'s "set like" behavior, and represents a practical trade off.
+ /// Namely, there is never any need to have two adjacent and equivalent + /// literals in the same sequence, _and_ it is easy to detect in some + /// cases. + #[inline] + pub fn push(&mut self, lit: Literal) { + let lits = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + if lits.last().map_or(false, |m| m == &lit) { + return; + } + lits.push(lit); + } + + /// Make all of the literals in this sequence inexact. + /// + /// This is a no-op if this sequence is not finite. + #[inline] + pub fn make_inexact(&mut self) { + let lits = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + for lit in lits.iter_mut() { + lit.make_inexact(); + } + } + + /// Converts this sequence to an infinite sequence. + /// + /// This is a no-op if the sequence is already infinite. + #[inline] + pub fn make_infinite(&mut self) { + self.literals = None; + } + + /// Modify this sequence to contain the cross product between it and the + /// sequence given. + /// + /// The cross product only considers literals in this sequence that are + /// exact. That is, inexact literals are not extended. + /// + /// The literals are always drained from `other`, even if none are used. + /// This permits callers to reuse the sequence allocation elsewhere. + /// + /// If this sequence is infinite, then this is a no-op, regardless of what + /// `other` contains (and in this case, the literals are still drained from + /// `other`). If `other` is infinite and this sequence is finite, then this + /// is a no-op, unless this sequence contains a zero-length literal. In + /// which case, the infiniteness of `other` infects this sequence, and this + /// sequence is itself made infinite. + /// + /// Like [`Seq::union`], this may attempt to deduplicate literals. See + /// [`Seq::dedup`] for how deduplication deals with exact and inexact + /// literals. + /// + /// # Example + /// + /// This example shows basic usage and how exact and inexact literals + /// interact. 
+ /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::from_iter([ + /// Literal::inexact("quux"), + /// Literal::exact("baz"), + /// ]); + /// seq1.cross_forward(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// + /// let expected = Seq::from_iter([ + /// Literal::inexact("fooquux"), + /// Literal::exact("foobaz"), + /// Literal::inexact("bar"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example shows the behavior of when `other` is an infinite + /// sequence. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_forward(&mut seq2); + /// + /// // When seq2 is infinite, cross product doesn't add anything, but + /// // ensures all members of seq1 are inexact. + /// let expected = Seq::from_iter([ + /// Literal::inexact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example is like the one above, but shows what happens when this + /// sequence contains an empty string. In this case, an infinite `other` + /// sequence infects this sequence (because the empty string means that + /// there are no finite prefixes): + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::exact(""), // inexact provokes same behavior + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_forward(&mut seq2); + /// + /// // seq1 is now infinite! + /// assert!(!seq1.is_finite()); + /// ``` + /// + /// This example shows the behavior when this sequence is infinite.
+ /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::infinite(); + /// let mut seq2 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// seq1.cross_forward(&mut seq2); + /// + /// // seq1 remains unchanged. + /// assert!(!seq1.is_finite()); + /// // Even though the literals in seq2 weren't used, it was still drained. + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn cross_forward(&mut self, other: &mut Seq) { + let (lits1, lits2) = match self.cross_preamble(other) { + None => return, + Some((lits1, lits2)) => (lits1, lits2), + }; + let newcap = lits1.len().saturating_mul(lits2.len()); + for selflit in mem::replace(lits1, Vec::with_capacity(newcap)) { + if !selflit.is_exact() { + lits1.push(selflit); + continue; + } + for otherlit in lits2.iter() { + let mut newlit = Literal::exact(Vec::with_capacity( + selflit.len() + otherlit.len(), + )); + newlit.extend(&selflit); + newlit.extend(&otherlit); + if !otherlit.is_exact() { + newlit.make_inexact(); + } + lits1.push(newlit); + } + } + lits2.drain(..); + self.dedup(); + } + + /// Modify this sequence to contain the cross product between it and + /// the sequence given, where the sequences are treated as suffixes + /// instead of prefixes. Namely, the sequence `other` is *prepended* + /// to `self` (as opposed to `other` being *appended* to `self` in + /// [`Seq::cross_forward`]). + /// + /// The cross product only considers literals in this sequence that are + /// exact. That is, inexact literals are not extended. + /// + /// The literals are always drained from `other`, even if none are used. + /// This permits callers to reuse the sequence allocation elsewhere. + /// + /// If this sequence is infinite, then this is a no-op, regardless of what + /// `other` contains (and in this case, the literals are still drained from + /// `other`). 
If `other` is infinite and this sequence is finite, then this + /// is a no-op, unless this sequence contains a zero-length literal. In + /// which case, the infiniteness of `other` infects this sequence, and this + /// sequence is itself made infinite. + /// + /// Like [`Seq::union`], this may attempt to deduplicate literals. See + /// [`Seq::dedup`] for how deduplication deals with exact and inexact + /// literals. + /// + /// # Example + /// + /// This example shows basic usage and how exact and inexact literals + /// interact. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::from_iter([ + /// Literal::inexact("quux"), + /// Literal::exact("baz"), + /// ]); + /// seq1.cross_reverse(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// + /// let expected = Seq::from_iter([ + /// Literal::inexact("quuxfoo"), + /// Literal::inexact("bar"), + /// Literal::exact("bazfoo"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example shows the behavior of when `other` is an infinite + /// sequence. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_reverse(&mut seq2); + /// + /// // When seq2 is infinite, cross product doesn't add anything, but + /// // ensures all members of seq1 are inexact. + /// let expected = Seq::from_iter([ + /// Literal::inexact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example is like the one above, but shows what happens when this + /// sequence contains an empty string. 
In this case, an infinite `other` + /// sequence infects this sequence (because the empty string means that + /// there are no finite suffixes): + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::exact(""), // inexact provokes same behavior + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_reverse(&mut seq2); + /// + /// // seq1 is now infinite! + /// assert!(!seq1.is_finite()); + /// ``` + /// + /// This example shows the behavior when this sequence is infinite. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::infinite(); + /// let mut seq2 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// seq1.cross_reverse(&mut seq2); + /// + /// // seq1 remains unchanged. + /// assert!(!seq1.is_finite()); + /// // Even though the literals in seq2 weren't used, it was still drained. + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn cross_reverse(&mut self, other: &mut Seq) { + let (lits1, lits2) = match self.cross_preamble(other) { + None => return, + Some((lits1, lits2)) => (lits1, lits2), + }; + // We basically proceed as we do in 'cross_forward' at this point, + // except that the outer loop is now 'other' and the inner loop is now + // 'self'. That's because 'self' corresponds to suffixes and 'other' + // corresponds to the sequence we want to *prepend* to the suffixes. + let newcap = lits1.len().saturating_mul(lits2.len()); + let selflits = mem::replace(lits1, Vec::with_capacity(newcap)); + for (i, otherlit) in lits2.drain(..).enumerate() { + for selflit in selflits.iter() { + if !selflit.is_exact() { + // If the suffix isn't exact, then we can't prepend + // anything to it. However, we still want to keep it. But + // we only want to keep one of them, to avoid duplication. 
+ // (The duplication is okay from a correctness perspective, + // but wasteful.) + if i == 0 { + lits1.push(selflit.clone()); + } + continue; + } + let mut newlit = Literal::exact(Vec::with_capacity( + otherlit.len() + selflit.len(), + )); + newlit.extend(&otherlit); + newlit.extend(&selflit); + if !otherlit.is_exact() { + newlit.make_inexact(); + } + lits1.push(newlit); + } + } + self.dedup(); + } + + /// A helper function that corresponds to the subtle preamble for both + /// `cross_forward` and `cross_reverse`. In effect, it handles the cases + /// of infinite sequences for both `self` and `other`, as well as ensuring + /// that literals from `other` are drained even if they aren't used. + fn cross_preamble<'a>( + &'a mut self, + other: &'a mut Seq, + ) -> Option<(&'a mut Vec<Literal>, &'a mut Vec<Literal>)> { + let lits2 = match other.literals { + None => { + // If our current seq contains the empty string and the seq + // we're adding matches any literal, then it follows that the + // current seq must now also match any literal. + // + // Otherwise, we just have to make sure everything in this + // sequence is inexact. + if self.min_literal_len() == Some(0) { + *self = Seq::infinite(); + } else { + self.make_inexact(); + } + return None; + } + Some(ref mut lits) => lits, + }; + let lits1 = match self.literals { + None => { + // If we aren't going to make it to the end of this routine + // where lits2 is drained, then we need to do it now. + lits2.drain(..); + return None; + } + Some(ref mut lits) => lits, + }; + Some((lits1, lits2)) + } + + /// Unions the `other` sequence into this one. + /// + /// The literals are always drained out of the given `other` sequence, + /// even if they are being unioned into an infinite sequence. This permits + /// the caller to reuse the `other` sequence in another context. + /// + /// Some literal deduping may be performed.
If any deduping happens, + /// any leftmost-first or "preference" order match semantics will be + /// preserved. + /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::new(&["foo", "bar"]); + /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); + /// seq1.union(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// + /// // Adjacent literals are deduped, but non-adjacent literals may not be. + /// assert_eq!(Seq::new(&["foo", "bar", "quux", "foo"]), seq1); + /// ``` + /// + /// This example shows that literals are drained from `other` even when + /// they aren't necessarily used. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::infinite(); + /// // Infinite sequences have no finite length. + /// assert_eq!(None, seq1.len()); + /// + /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); + /// seq1.union(&mut seq2); + /// + /// // seq1 is still infinite and seq2 has been drained. + /// assert_eq!(None, seq1.len()); + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn union(&mut self, other: &mut Seq) { + let lits2 = match other.literals { + None => { + // Unioning with an infinite sequence always results in an + // infinite sequence. + self.make_infinite(); + return; + } + Some(ref mut lits) => lits.drain(..), + }; + let lits1 = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + lits1.extend(lits2); + self.dedup(); + } + + /// Unions the `other` sequence into this one by splicing the `other` + /// sequence at the position of the first zero-length literal. + /// + /// This is useful for preserving preference order semantics when combining + /// two literal sequences. For example, in the regex `(a||f)+foo`, the + /// correct preference order prefix sequence is `[a, foo, f]`.
+ /// + /// The literals are always drained out of the given `other` sequence, + /// even if they are being unioned into an infinite sequence. This permits + /// the caller to reuse the `other` sequence in another context. Note that + /// the literals are drained even if no union is performed as well, i.e., + /// when this sequence does not contain a zero-length literal. + /// + /// Some literal deduping may be performed. If any deduping happens, + /// any leftmost-first or "preference" order match semantics will be + /// preserved. + /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::new(&["a", "", "f", ""]); + /// let mut seq2 = Seq::new(&["foo"]); + /// seq1.union_into_empty(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// // 'foo' gets spliced into seq1 where the first empty string occurs. + /// assert_eq!(Seq::new(&["a", "foo", "f"]), seq1); + /// ``` + /// + /// This example shows that literals are drained from `other` even when + /// they aren't necessarily used. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::new(&["foo", "bar"]); + /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); + /// seq1.union_into_empty(&mut seq2); + /// + /// // seq1 has no zero length literals, so no splicing happens. + /// assert_eq!(Seq::new(&["foo", "bar"]), seq1); + /// // Even though no splicing happens, seq2 is still drained. 
+ /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn union_into_empty(&mut self, other: &mut Seq) { + let lits2 = other.literals.as_mut().map(|lits| lits.drain(..)); + let lits1 = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + let first_empty = match lits1.iter().position(|m| m.is_empty()) { + None => return, + Some(i) => i, + }; + let lits2 = match lits2 { + None => { + // Note that we are only here if we've found an empty literal, + // which implies that an infinite sequence infects this seq and + // also turns it into an infinite sequence. + self.literals = None; + return; + } + Some(lits) => lits, + }; + // Clearing out the empties needs to come before the splice because + // the splice might add more empties that we don't want to get rid + // of. Since we're splicing into the position of the first empty, the + // 'first_empty' position computed above is still correct. + lits1.retain(|m| !m.is_empty()); + lits1.splice(first_empty..first_empty, lits2); + self.dedup(); + } + + /// Deduplicate adjacent equivalent literals in this sequence. + /// + /// If adjacent literals are equivalent strings but one is exact and the + /// other inexact, the inexact literal is kept and the exact one is + /// removed. + /// + /// Deduping an infinite sequence is a no-op. + /// + /// # Example + /// + /// This example shows how literals that are duplicate byte strings but + /// are not equivalent with respect to exactness are resolved. 
+ /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("foo"), + /// ]); + /// seq.dedup(); + /// + /// assert_eq!(Seq::from_iter([Literal::inexact("foo")]), seq); + /// ``` + #[inline] + pub fn dedup(&mut self) { + if let Some(ref mut lits) = self.literals { + lits.dedup_by(|lit1, lit2| { + if lit1.as_bytes() != lit2.as_bytes() { + return false; + } + if lit1.is_exact() != lit2.is_exact() { + lit1.make_inexact(); + lit2.make_inexact(); + } + true + }); + } + } + + /// Sorts this sequence of literals lexicographically. + /// + /// Note that if, before sorting, a literal that is a prefix of another + /// literal appears after it, then after sorting, the sequence will not + /// represent the same preference order match semantics. For example, + /// sorting the sequence `[samwise, sam]` yields the sequence `[sam, + /// samwise]`. Under preference order semantics, the latter sequence will + /// never match `samwise` whereas the first sequence can. + /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq = Seq::new(&["foo", "quux", "bar"]); + /// seq.sort(); + /// + /// assert_eq!(Seq::new(&["bar", "foo", "quux"]), seq); + /// ``` + #[inline] + pub fn sort(&mut self) { + if let Some(ref mut lits) = self.literals { + lits.sort(); + } + } + + /// Reverses all of the literals in this sequence. + /// + /// The order of the sequence itself is preserved. + /// + /// # Example + /// + /// This example shows basic usage.
+ /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq = Seq::new(&["oof", "rab"]); + /// seq.reverse_literals(); + /// assert_eq!(Seq::new(&["foo", "bar"]), seq); + /// ``` + #[inline] + pub fn reverse_literals(&mut self) { + if let Some(ref mut lits) = self.literals { + for lit in lits.iter_mut() { + lit.reverse(); + } + } + } + + /// Shrinks this seq to its minimal size while respecting the preference + /// order of its literals. + /// + /// While this routine will remove duplicate literals from this seq, it + /// will also remove literals that can never match in a leftmost-first or + /// "preference order" search. Similar to [`Seq::dedup`], if a literal is + /// deduped, then the one that remains is made inexact. + /// + /// This is a no-op on seqs that are empty or not finite. + /// + /// # Example + /// + /// This example shows the difference between `{sam, samwise}` and + /// `{samwise, sam}`. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// // If 'sam' comes before 'samwise' and a preference order search is + /// // executed, then 'samwise' can never match. + /// let mut seq = Seq::new(&["sam", "samwise"]); + /// seq.minimize_by_preference(); + /// assert_eq!(Seq::from_iter([Literal::inexact("sam")]), seq); + /// + /// // But if they are reversed, then it's possible for 'samwise' to match + /// // since it is given higher preference. + /// let mut seq = Seq::new(&["samwise", "sam"]); + /// seq.minimize_by_preference(); + /// assert_eq!(Seq::new(&["samwise", "sam"]), seq); + /// ``` + /// + /// This example shows that if an empty string is in this seq, then + /// anything that comes after it can never match. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// // An empty string is a prefix of all strings, so it automatically + /// // inhibits any subsequent strings from matching. 
+ /// let mut seq = Seq::new(&["foo", "bar", "", "quux", "fox"]); + /// seq.minimize_by_preference(); + /// let expected = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::exact("bar"), + /// Literal::inexact(""), + /// ]); + /// assert_eq!(expected, seq); + /// + /// // And of course, if it's at the beginning, then it makes it impossible + /// // for anything else to match. + /// let mut seq = Seq::new(&["", "foo", "quux", "fox"]); + /// seq.minimize_by_preference(); + /// assert_eq!(Seq::from_iter([Literal::inexact("")]), seq); + /// ``` + #[inline] + pub fn minimize_by_preference(&mut self) { + if let Some(ref mut lits) = self.literals { + PreferenceTrie::minimize(lits, false); + } + } + + /// Trims all literals in this seq such that only the first `len` bytes + /// remain. If a literal has less than or equal to `len` bytes, then it + /// remains unchanged. Otherwise, it is trimmed and made inexact. + /// + /// # Example + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq = Seq::new(&["a", "foo", "quux"]); + /// seq.keep_first_bytes(2); + /// + /// let expected = Seq::from_iter([ + /// Literal::exact("a"), + /// Literal::inexact("fo"), + /// Literal::inexact("qu"), + /// ]); + /// assert_eq!(expected, seq); + /// ``` + #[inline] + pub fn keep_first_bytes(&mut self, len: usize) { + if let Some(ref mut lits) = self.literals { + for m in lits.iter_mut() { + m.keep_first_bytes(len); + } + } + } + + /// Trims all literals in this seq such that only the last `len` bytes + /// remain. If a literal has less than or equal to `len` bytes, then it + /// remains unchanged. Otherwise, it is trimmed and made inexact. 
+ /// + /// # Example + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq = Seq::new(&["a", "foo", "quux"]); + /// seq.keep_last_bytes(2); + /// + /// let expected = Seq::from_iter([ + /// Literal::exact("a"), + /// Literal::inexact("oo"), + /// Literal::inexact("ux"), + /// ]); + /// assert_eq!(expected, seq); + /// ``` + #[inline] + pub fn keep_last_bytes(&mut self, len: usize) { + if let Some(ref mut lits) = self.literals { + for m in lits.iter_mut() { + m.keep_last_bytes(len); + } + } + } + + /// Returns true if this sequence is finite. + /// + /// When false, this sequence is infinite and must be treated as if it + /// contains every possible literal. + #[inline] + pub fn is_finite(&self) -> bool { + self.literals.is_some() + } + + /// Returns true if and only if this sequence is finite and empty. + /// + /// An empty sequence never matches anything. It can only be produced by + /// literal extraction when the corresponding regex itself cannot match. + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == Some(0) + } + + /// Returns the number of literals in this sequence if the sequence is + /// finite. If the sequence is infinite, then `None` is returned. + #[inline] + pub fn len(&self) -> Option<usize> { + self.literals.as_ref().map(|lits| lits.len()) + } + + /// Returns true if and only if all literals in this sequence are exact. + /// + /// This returns false if the sequence is infinite. + #[inline] + pub fn is_exact(&self) -> bool { + self.literals().map_or(false, |lits| lits.iter().all(|x| x.is_exact())) + } + + /// Returns true if and only if all literals in this sequence are inexact. + /// + /// This returns true if the sequence is infinite. + #[inline] + pub fn is_inexact(&self) -> bool { + self.literals().map_or(true, |lits| lits.iter().all(|x| !x.is_exact())) + } + + /// Return the maximum length of the sequence that would result from + /// unioning `self` with `other`. 
If either set is infinite, then this + /// returns `None`. + #[inline] + fn max_union_len(&self, other: &Seq) -> Option<usize> { + let len1 = self.len()?; + let len2 = other.len()?; + Some(len1.saturating_add(len2)) + } + + /// Return the maximum length of the sequence that would result from the + /// cross product of `self` with `other`. If either set is infinite, then + /// this returns `None`. + #[inline] + fn max_cross_len(&self, other: &Seq) -> Option<usize> { + let len1 = self.len()?; + let len2 = other.len()?; + Some(len1.saturating_mul(len2)) + } + + /// Returns the length of the shortest literal in this sequence. + /// + /// If the sequence is infinite or empty, then this returns `None`. + #[inline] + pub fn min_literal_len(&self) -> Option<usize> { + self.literals.as_ref()?.iter().map(|x| x.len()).min() + } + + /// Returns the length of the longest literal in this sequence. + /// + /// If the sequence is infinite or empty, then this returns `None`. + #[inline] + pub fn max_literal_len(&self) -> Option<usize> { + self.literals.as_ref()?.iter().map(|x| x.len()).max() + } + + /// Returns the longest common prefix from this seq. + /// + /// If the seq matches any literal or otherwise contains no literals, then + /// there is no meaningful prefix and this returns `None`. + /// + /// # Example + /// + /// This shows some example seqs and their longest common prefix.
+ /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let seq = Seq::new(&["foo", "foobar", "fo"]); + /// assert_eq!(Some(&b"fo"[..]), seq.longest_common_prefix()); + /// let seq = Seq::new(&["foo", "foo"]); + /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_prefix()); + /// let seq = Seq::new(&["foo", "bar"]); + /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix()); + /// let seq = Seq::new(&[""]); + /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix()); + /// + /// let seq = Seq::infinite(); + /// assert_eq!(None, seq.longest_common_prefix()); + /// let seq = Seq::empty(); + /// assert_eq!(None, seq.longest_common_prefix()); + /// ``` + #[inline] + pub fn longest_common_prefix(&self) -> Option<&[u8]> { + // If we match everything or match nothing, then there's no meaningful + // longest common prefix. + let lits = match self.literals { + None => return None, + Some(ref lits) => lits, + }; + if lits.len() == 0 { + return None; + } + let base = lits[0].as_bytes(); + let mut len = base.len(); + for m in lits.iter().skip(1) { + len = m + .as_bytes() + .iter() + .zip(base[..len].iter()) + .take_while(|&(a, b)| a == b) + .count(); + if len == 0 { + return Some(&[]); + } + } + Some(&base[..len]) + } + + /// Returns the longest common suffix from this seq. + /// + /// If the seq matches any literal or otherwise contains no literals, then + /// there is no meaningful suffix and this returns `None`. + /// + /// # Example + /// + /// This shows some example seqs and their longest common suffix.
+ /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let seq = Seq::new(&["oof", "raboof", "of"]); + /// assert_eq!(Some(&b"of"[..]), seq.longest_common_suffix()); + /// let seq = Seq::new(&["foo", "foo"]); + /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_suffix()); + /// let seq = Seq::new(&["foo", "bar"]); + /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix()); + /// let seq = Seq::new(&[""]); + /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix()); + /// + /// let seq = Seq::infinite(); + /// assert_eq!(None, seq.longest_common_suffix()); + /// let seq = Seq::empty(); + /// assert_eq!(None, seq.longest_common_suffix()); + /// ``` + #[inline] + pub fn longest_common_suffix(&self) -> Option<&[u8]> { + // If we match everything or match nothing, then there's no meaningful + // longest common suffix. + let lits = match self.literals { + None => return None, + Some(ref lits) => lits, + }; + if lits.len() == 0 { + return None; + } + let base = lits[0].as_bytes(); + let mut len = base.len(); + for m in lits.iter().skip(1) { + len = m + .as_bytes() + .iter() + .rev() + .zip(base[base.len() - len..].iter().rev()) + .take_while(|&(a, b)| a == b) + .count(); + if len == 0 { + return Some(&[]); + } + } + Some(&base[base.len() - len..]) + } + + /// Optimizes this seq while treating its literals as prefixes and + /// respecting the preference order of its literals. + /// + /// The specific way "optimization" works is meant to be an implementation + /// detail, as it essentially represents a set of heuristics. The goal + /// that optimization tries to accomplish is to make the literals in this + /// set reflect inputs that will result in a more effective prefilter. + /// Principally by reducing the false positive rate of candidates found by + /// the literals in this sequence. That is, when a match of a literal is + /// found, we would like it to be a strong predictor of the overall match + /// of the regex. 
If it isn't, then much time will be spent starting and + /// stopping the prefilter search and attempting to confirm the match only + /// to have it fail. + /// + /// Some of those heuristics might be: + /// + /// * Identifying a common prefix from a larger sequence of literals, and + /// shrinking the sequence down to that single common prefix. + /// * Rejecting the sequence entirely if it is believed to result in very + /// high false positive rate. When this happens, the sequence is made + /// infinite. + /// * Shrinking the sequence to a smaller number of literals representing + /// prefixes, but not shrinking it so much as to make literals too short. + /// (A sequence with very short literals, of 1 or 2 bytes, will typically + /// result in a higher false positive rate.) + /// + /// Optimization should only be run once extraction is complete. Namely, + /// optimization may make assumptions that do not compose with other + /// operations in the middle of extraction. For example, optimization will + /// reduce `[E(sam), E(samwise)]` to `[E(sam)]`, but such a transformation + /// is only valid if no other extraction will occur. If other extraction + /// may occur, then the correct transformation would be to `[I(sam)]`. + /// + /// The [`Seq::optimize_for_suffix_by_preference`] does the same thing, but + /// for suffixes. + /// + /// # Example + /// + /// This shows how optimization might transform a sequence. Note that + /// the specific behavior is not a documented guarantee. The heuristics + /// used are an implementation detail and may change over time in semver + /// compatible releases. 
+ /// + /// ``` + /// use regex_syntax::hir::literal::{Seq, Literal}; + /// + /// let mut seq = Seq::new(&[ + /// "samantha", + /// "sam", + /// "samwise", + /// "frodo", + /// ]); + /// seq.optimize_for_prefix_by_preference(); + /// assert_eq!(Seq::from_iter([ + /// Literal::exact("samantha"), + /// // Kept exact even though 'samwise' got pruned + /// // because optimization assumes literal extraction + /// // has finished. + /// Literal::exact("sam"), + /// Literal::exact("frodo"), + /// ]), seq); + /// ``` + /// + /// # Example: optimization may make the sequence infinite + /// + /// If the heuristics deem that the sequence could cause a very high false + /// positive rate, then it may make the sequence infinite, effectively + /// disabling its use as a prefilter. + /// + /// ``` + /// use regex_syntax::hir::literal::{Seq, Literal}; + /// + /// let mut seq = Seq::new(&[ + /// "samantha", + /// // An empty string matches at every position, + /// // thus rendering the prefilter completely + /// // ineffective. + /// "", + /// "sam", + /// "samwise", + /// "frodo", + /// ]); + /// seq.optimize_for_prefix_by_preference(); + /// assert!(!seq.is_finite()); + /// ``` + /// + /// Do note that just because there is a `" "` in the sequence, that + /// doesn't mean the sequence will always be made infinite after it is + /// optimized. Namely, if the sequence is considered exact (any match + /// corresponds to an overall match of the original regex), then any match + /// is an overall match, and so the false positive rate is always `0`. + /// + /// To demonstrate this, we remove `samwise` from our sequence. This + /// results in no optimization happening and all literals remain exact. 
+ /// Thus the entire sequence is exact, and it is kept as-is, even though + /// one is an ASCII space: + /// + /// ``` + /// use regex_syntax::hir::literal::{Seq, Literal}; + /// + /// let mut seq = Seq::new(&[ + /// "samantha", + /// " ", + /// "sam", + /// "frodo", + /// ]); + /// seq.optimize_for_prefix_by_preference(); + /// assert!(seq.is_finite()); + /// ``` + #[inline] + pub fn optimize_for_prefix_by_preference(&mut self) { + self.optimize_by_preference(true); + } + + /// Optimizes this seq while treating its literals as suffixes and + /// respecting the preference order of its literals. + /// + /// Optimization should only be run once extraction is complete. + /// + /// The [`Seq::optimize_for_prefix_by_preference`] does the same thing, but + /// for prefixes. See its documentation for more explanation. + #[inline] + pub fn optimize_for_suffix_by_preference(&mut self) { + self.optimize_by_preference(false); + } + + fn optimize_by_preference(&mut self, prefix: bool) { + let origlen = match self.len() { + None => return, + Some(len) => len, + }; + // Make sure we start with the smallest sequence possible. We use a + // special version of preference minimization that retains exactness. + // This is legal because optimization is only expected to occur once + // extraction is complete. + if prefix { + if let Some(ref mut lits) = self.literals { + PreferenceTrie::minimize(lits, true); + } + } + + // Look for a common prefix (or suffix). If we found one of those and + // it's long enough, then it's a good bet that it will be our fastest + // possible prefilter since single-substring search is so fast. + let fix = if prefix { + self.longest_common_prefix() + } else { + self.longest_common_suffix() + }; + if let Some(fix) = fix { + // As a special case, if we have a common prefix and the leading + // byte of that prefix is one that we think probably occurs rarely, + // then strip everything down to just that single byte. This should + // promote the use of memchr. 
+ // + // ... we only do this though if our sequence has more than one + // literal. Otherwise, we'd rather just stick with a single literal + // scan. That is, using memchr is probably better than looking + // for 2 or more literals, but probably not as good as a straight + // memmem search. + // + // ... and also only do this when the prefix is short and probably + // not too discriminatory anyway. If it's longer, then it's + // probably quite discriminatory and thus is likely to have a low + // false positive rate. + if prefix + && origlen > 1 + && fix.len() >= 1 + && fix.len() <= 3 + && rank(fix[0]) < 200 + { + self.keep_first_bytes(1); + self.dedup(); + return; + } + // We only strip down to the common prefix/suffix if we think + // the existing set of literals isn't great, or if the common + // prefix/suffix is expected to be particularly discriminatory. + let isfast = + self.is_exact() && self.len().map_or(false, |len| len <= 16); + let usefix = fix.len() > 4 || (fix.len() > 1 && !isfast); + if usefix { + // If we keep exactly the number of bytes equal to the length + // of the prefix (or suffix), then by the definition of a + // prefix, every literal in the sequence will be equivalent. + // Thus, 'dedup' will leave us with one literal. + // + // We do it this way to avoid an alloc, but also to make sure + // the exactness of literals is kept (or not). + if prefix { + self.keep_first_bytes(fix.len()); + } else { + self.keep_last_bytes(fix.len()); + } + self.dedup(); + assert_eq!(Some(1), self.len()); + // We still fall through here. In particular, we want our + // longest common prefix to be subject to the poison check. + } + } + // Everything below this check is more-or-less about trying to + // heuristically reduce the false positive rate of a prefilter. But + // if our sequence is completely exact, then it's possible the regex + // engine can be skipped entirely. 
In this case, the false positive + // rate is zero because every literal match corresponds to a regex + // match. + // + // This is OK even if the sequence contains a poison literal. Remember, + // a literal is only poisonous because of what we assume about its + // impact on the false positive rate. However, we do still check for + // an empty string. Empty strings are weird and it's best to let the + // regex engine handle those. + // + // We do currently do this check after the longest common prefix (or + // suffix) check, under the theory that single-substring search is so + // fast that we want that even if we'd end up turning an exact sequence + // into an inexact one. But this might be wrong... + if self.is_exact() + && self.min_literal_len().map_or(false, |len| len > 0) + { + return; + } + // Now we attempt to shorten the sequence. The idea here is that we + // don't want to look for too many literals, but we want to shorten + // our sequence enough to improve our odds of using better algorithms + // downstream (such as Teddy). + const ATTEMPTS: [(usize, usize); 5] = + [(5, 64), (4, 64), (3, 64), (2, 64), (1, 10)]; + for (keep, limit) in ATTEMPTS { + let len = match self.len() { + None => break, + Some(len) => len, + }; + if len <= limit { + break; + } + if prefix { + self.keep_first_bytes(keep); + } else { + self.keep_last_bytes(keep); + } + self.minimize_by_preference(); + } + // Check for a poison literal. A poison literal is one that is short + // and is believed to have a very high match count. These poisons + // generally lead to a prefilter with a very high false positive rate, + // and thus overall worse performance. + // + // We do this last because we could have gone from a non-poisonous + // sequence to a poisonous one. Perhaps we should add some code to + // prevent such transitions in the first place, but then again, we + // likely only made the transition in the first place if the sequence + // was itself huge.
And huge sequences are themselves poisonous. So... + if let Some(lits) = self.literals() { + if lits.iter().any(|lit| lit.is_poisonous()) { + self.make_infinite(); + } + } + } +} + +impl core::fmt::Debug for Seq { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "Seq")?; + if let Some(lits) = self.literals() { + f.debug_list().entries(lits.iter()).finish() + } else { + write!(f, "[∅]") + } + } +} + +impl FromIterator<Literal> for Seq { + fn from_iter<T: IntoIterator<Item = Literal>>(it: T) -> Seq { + let mut seq = Seq::empty(); + for literal in it { + seq.push(literal); + } + seq + } +} + +/// A single literal extracted from an [`Hir`] expression. +/// +/// A literal is composed of two things: +/// +/// * A sequence of bytes. No guarantees with respect to UTF-8 are provided. +/// In particular, even if the regex a literal is extracted from is UTF-8, the +/// literal extracted may not be valid UTF-8. (For example, if an [`Extractor`] +/// limit resulted in trimming a literal in a way that splits a codepoint.) +/// * Whether the literal is "exact" or not. An "exact" literal means that it +/// has not been trimmed, and may continue to be extended. If a literal is +/// "exact" after visiting the entire `Hir` expression, then this implies that +/// the literal leads to a match state. (Although it doesn't necessarily imply +/// all occurrences of the literal correspond to a match of the regex, since +/// literal extraction ignores look-around assertions.) +#[derive(Clone, Eq, PartialEq, PartialOrd, Ord)] +pub struct Literal { + bytes: Vec<u8>, + exact: bool, +} + +impl Literal { + /// Returns a new exact literal containing the bytes given. + #[inline] + pub fn exact<B: Into<Vec<u8>>>(bytes: B) -> Literal { + Literal { bytes: bytes.into(), exact: true } + } + + /// Returns a new inexact literal containing the bytes given. 
+ #[inline] + pub fn inexact<B: Into<Vec<u8>>>(bytes: B) -> Literal { + Literal { bytes: bytes.into(), exact: false } + } + + /// Returns the bytes in this literal. + #[inline] + pub fn as_bytes(&self) -> &[u8] { + &self.bytes + } + + /// Yields ownership of the bytes inside this literal. + /// + /// Note that this throws away whether the literal is "exact" or not. + #[inline] + pub fn into_bytes(self) -> Vec<u8> { + self.bytes + } + + /// Returns the length of this literal in bytes. + #[inline] + pub fn len(&self) -> usize { + self.as_bytes().len() + } + + /// Returns true if and only if this literal has zero bytes. + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns true if and only if this literal is exact. + #[inline] + pub fn is_exact(&self) -> bool { + self.exact + } + + /// Marks this literal as inexact. + /// + /// Inexact literals can never be extended. For example, + /// [`Seq::cross_forward`] will not extend inexact literals. + #[inline] + pub fn make_inexact(&mut self) { + self.exact = false; + } + + /// Reverse the bytes in this literal. + #[inline] + pub fn reverse(&mut self) { + self.bytes.reverse(); + } + + /// Extend this literal with the literal given. + /// + /// If this literal is inexact, then this is a no-op. + #[inline] + pub fn extend(&mut self, lit: &Literal) { + if !self.is_exact() { + return; + } + self.bytes.extend_from_slice(&lit.bytes); + } + + /// Trims this literal such that only the first `len` bytes remain. If + /// this literal has fewer than `len` bytes, then it remains unchanged. + /// Otherwise, the literal is marked as inexact. + #[inline] + pub fn keep_first_bytes(&mut self, len: usize) { + if len >= self.len() { + return; + } + self.make_inexact(); + self.bytes.truncate(len); + } + + /// Trims this literal such that only the last `len` bytes remain. If this + /// literal has fewer than `len` bytes, then it remains unchanged. + /// Otherwise, the literal is marked as inexact. 
+ #[inline] + pub fn keep_last_bytes(&mut self, len: usize) { + if len >= self.len() { + return; + } + self.make_inexact(); + self.bytes.drain(..self.len() - len); + } + + /// Returns true if it is believed that this literal is likely to match very + /// frequently, and is thus not a good candidate for a prefilter. + fn is_poisonous(&self) -> bool { + self.is_empty() || (self.len() == 1 && rank(self.as_bytes()[0]) >= 250) + } +} + +impl From<u8> for Literal { + fn from(byte: u8) -> Literal { + Literal::exact(vec![byte]) + } +} + +impl From<char> for Literal { + fn from(ch: char) -> Literal { + use alloc::string::ToString; + Literal::exact(ch.encode_utf8(&mut [0; 4]).to_string()) + } +} + +impl AsRef<[u8]> for Literal { + fn as_ref(&self) -> &[u8] { + self.as_bytes() + } +} + +impl core::fmt::Debug for Literal { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let tag = if self.exact { "E" } else { "I" }; + f.debug_tuple(tag) + .field(&crate::debug::Bytes(self.as_bytes())) + .finish() + } +} + +/// A "preference" trie that rejects literals that will never match when +/// executing a leftmost first or "preference" search. +/// +/// For example, if 'sam' is inserted, then trying to insert 'samwise' will be +/// rejected because 'samwise' can never match since 'sam' will always take +/// priority. However, if 'samwise' is inserted first, then inserting 'sam' +/// after it is accepted. In this case, either 'samwise' or 'sam' can match in +/// a "preference" search. +/// +/// Note that we only use this trie as a "set." That is, given a sequence of +/// literals, we insert each one in order. An `insert` will reject a literal +/// if a prefix of that literal already exists in the trie. Thus, to rebuild +/// the "minimal" sequence, we simply only keep literals that were successfully +/// inserted.
(Since we don't need traversal, one wonders whether we can make +/// some simplifications here, but I haven't given it a ton of thought and I've +/// never seen this show up on a profile. Because of the heuristic limits +/// imposed on literal extractions, the size of the inputs here is usually +/// very small.) +#[derive(Debug, Default)] +struct PreferenceTrie { + /// The states in this trie. The index of a state in this vector is its ID. + states: Vec<State>, + /// The index to allocate to the next literal added to this trie. Starts at + /// 0 and increments by 1 for every literal successfully added to the trie. + next_literal_index: usize, +} + +/// A single state in a trie. Uses a sparse representation for its transitions. +#[derive(Debug, Default)] +struct State { + /// Sparse representation of the transitions out of this state. Transitions + /// are sorted by byte. There is at most one such transition for any + /// particular byte. + trans: Vec<(u8, usize)>, + /// Whether this is a matching state or not. If it is, then it contains the + /// index to the matching literal. + literal_index: Option<usize>, +} + +impl PreferenceTrie { + /// Minimizes the given sequence of literals while preserving preference + /// order semantics. + /// + /// When `keep_exact` is true, the exactness of every literal retained is + /// kept. This is useful when dealing with a fully extracted `Seq` that + /// only contains exact literals. In that case, we can keep all retained + /// literals as exact because we know we'll never need to match anything + /// after them and because any removed literals are guaranteed to never + /// match. + fn minimize(literals: &mut Vec<Literal>, keep_exact: bool) { + use core::cell::RefCell; + + // MSRV(1.61): Use retain_mut here to avoid interior mutability. 
+ let trie = RefCell::new(PreferenceTrie::default()); + let mut make_inexact = vec![]; + literals.retain(|lit| { + match trie.borrow_mut().insert(lit.as_bytes()) { + Ok(_) => true, + Err(i) => { + if !keep_exact { + make_inexact.push(i); + } + false + } + } + }); + for i in make_inexact { + literals[i].make_inexact(); + } + } + + /// Returns `Ok` if the given byte string is accepted into this trie and + /// `Err` otherwise. The index for the success case corresponds to the + /// index of the literal added. The index for the error case corresponds to + /// the index of the literal already in the trie that prevented the given + /// byte string from being added. (Which implies it is a prefix of the one + /// given.) + /// + /// In short, the byte string given is accepted into the trie if and only + /// if it is possible for it to match when executing a preference order + /// search. + fn insert(&mut self, bytes: &[u8]) -> Result<usize, usize> { + let mut prev = self.root(); + if let Some(idx) = self.states[prev].literal_index { + return Err(idx); + } + for &b in bytes.iter() { + match self.states[prev].trans.binary_search_by_key(&b, |t| t.0) { + Ok(i) => { + prev = self.states[prev].trans[i].1; + if let Some(idx) = self.states[prev].literal_index { + return Err(idx); + } + } + Err(i) => { + let next = self.create_state(); + self.states[prev].trans.insert(i, (b, next)); + prev = next; + } + } + } + let idx = self.next_literal_index; + self.next_literal_index += 1; + self.states[prev].literal_index = Some(idx); + Ok(idx) + } + + /// Returns the root state ID, and if it doesn't exist, creates it. + fn root(&mut self) -> usize { + if !self.states.is_empty() { + 0 + } else { + self.create_state() + } + } + + /// Creates a new empty state and returns its ID. + fn create_state(&mut self) -> usize { + let id = self.states.len(); + self.states.push(State::default()); + id + } +} + +/// Returns the "rank" of the given byte. 
+/// +/// The minimum rank value is `0` and the maximum rank value is `255`. +/// +/// The rank of a byte is derived from a heuristic background distribution of +/// relative frequencies of bytes. The heuristic says that the lower the rank of a +/// byte, the less likely that byte is to appear in any arbitrary haystack. +pub fn rank(byte: u8) -> u8 { + crate::rank::BYTE_FREQUENCIES[usize::from(byte)] +} + +#[cfg(test)] +mod tests { + use super::*; + + fn parse(pattern: &str) -> Hir { + crate::ParserBuilder::new().utf8(false).build().parse(pattern).unwrap() + } + + fn prefixes(pattern: &str) -> Seq { + Extractor::new().kind(ExtractKind::Prefix).extract(&parse(pattern)) + } + + fn suffixes(pattern: &str) -> Seq { + Extractor::new().kind(ExtractKind::Suffix).extract(&parse(pattern)) + } + + fn e(pattern: &str) -> (Seq, Seq) { + (prefixes(pattern), suffixes(pattern)) + } + + #[allow(non_snake_case)] + fn E(x: &str) -> Literal { + Literal::exact(x.as_bytes()) + } + + #[allow(non_snake_case)] + fn I(x: &str) -> Literal { + Literal::inexact(x.as_bytes()) + } + + fn seq<I: IntoIterator<Item = Literal>>(it: I) -> Seq { + Seq::from_iter(it) + } + + fn infinite() -> (Seq, Seq) { + (Seq::infinite(), Seq::infinite()) + } + + fn inexact<I1, I2>(it1: I1, it2: I2) -> (Seq, Seq) + where + I1: IntoIterator<Item = Literal>, + I2: IntoIterator<Item = Literal>, + { + (Seq::from_iter(it1), Seq::from_iter(it2)) + } + + fn exact<B: AsRef<[u8]>, I: IntoIterator<Item = B>>(it: I) -> (Seq, Seq) { + let s1 = Seq::new(it); + let s2 = s1.clone(); + (s1, s2) + } + + fn opt<B: AsRef<[u8]>, I: IntoIterator<Item = B>>(it: I) -> (Seq, Seq) { + let (mut p, mut s) = exact(it); + p.optimize_for_prefix_by_preference(); + s.optimize_for_suffix_by_preference(); + (p, s) + } + + #[test] + fn literal() { + assert_eq!(exact(["a"]), e("a")); + assert_eq!(exact(["aaaaa"]), e("aaaaa")); + assert_eq!(exact(["A", "a"]), e("(?i-u)a")); + assert_eq!(exact(["AB", "Ab", "aB", "ab"]), e("(?i-u)ab")); +
assert_eq!(exact(["abC", "abc"]), e("ab(?i-u)c")); + + assert_eq!(exact([b"\xFF"]), e(r"(?-u:\xFF)")); + + #[cfg(feature = "unicode-case")] + { + assert_eq!(exact(["☃"]), e("☃")); + assert_eq!(exact(["☃"]), e("(?i)☃")); + assert_eq!(exact(["☃☃☃☃☃"]), e("☃☃☃☃☃")); + + assert_eq!(exact(["Δ"]), e("Δ")); + assert_eq!(exact(["δ"]), e("δ")); + assert_eq!(exact(["Δ", "δ"]), e("(?i)Δ")); + assert_eq!(exact(["Δ", "δ"]), e("(?i)δ")); + + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)S")); + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)s")); + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)ſ")); + } + + let letters = "ͱͳͷΐάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋ"; + assert_eq!(exact([letters]), e(letters)); + } + + #[test] + fn class() { + assert_eq!(exact(["a", "b", "c"]), e("[abc]")); + assert_eq!(exact(["a1b", "a2b", "a3b"]), e("a[123]b")); + assert_eq!(exact(["δ", "ε"]), e("[εδ]")); + #[cfg(feature = "unicode-case")] + { + assert_eq!(exact(["Δ", "Ε", "δ", "ε", "ϵ"]), e(r"(?i)[εδ]")); + } + } + + #[test] + fn look() { + assert_eq!(exact(["ab"]), e(r"a\Ab")); + assert_eq!(exact(["ab"]), e(r"a\zb")); + assert_eq!(exact(["ab"]), e(r"a(?m:^)b")); + assert_eq!(exact(["ab"]), e(r"a(?m:$)b")); + assert_eq!(exact(["ab"]), e(r"a\bb")); + assert_eq!(exact(["ab"]), e(r"a\Bb")); + assert_eq!(exact(["ab"]), e(r"a(?-u:\b)b")); + assert_eq!(exact(["ab"]), e(r"a(?-u:\B)b")); + + assert_eq!(exact(["ab"]), e(r"^ab")); + assert_eq!(exact(["ab"]), e(r"$ab")); + assert_eq!(exact(["ab"]), e(r"(?m:^)ab")); + assert_eq!(exact(["ab"]), e(r"(?m:$)ab")); + assert_eq!(exact(["ab"]), e(r"\bab")); + assert_eq!(exact(["ab"]), e(r"\Bab")); + assert_eq!(exact(["ab"]), e(r"(?-u:\b)ab")); + assert_eq!(exact(["ab"]), e(r"(?-u:\B)ab")); + + assert_eq!(exact(["ab"]), e(r"ab^")); + assert_eq!(exact(["ab"]), e(r"ab$")); + assert_eq!(exact(["ab"]), e(r"ab(?m:^)")); + assert_eq!(exact(["ab"]), e(r"ab(?m:$)")); + assert_eq!(exact(["ab"]), e(r"ab\b")); + assert_eq!(exact(["ab"]), e(r"ab\B")); + assert_eq!(exact(["ab"]), 
e(r"ab(?-u:\b)")); + assert_eq!(exact(["ab"]), e(r"ab(?-u:\B)")); + + let expected = (seq([I("aZ"), E("ab")]), seq([I("Zb"), E("ab")])); + assert_eq!(expected, e(r"^aZ*b")); + } + + #[test] + fn repetition() { + assert_eq!(exact(["a", ""]), e(r"a?")); + assert_eq!(exact(["", "a"]), e(r"a??")); + assert_eq!(inexact([I("a"), E("")], [I("a"), E("")]), e(r"a*")); + assert_eq!(inexact([E(""), I("a")], [E(""), I("a")]), e(r"a*?")); + assert_eq!(inexact([I("a")], [I("a")]), e(r"a+")); + assert_eq!(inexact([I("a")], [I("a")]), e(r"(a+)+")); + + assert_eq!(exact(["ab"]), e(r"aZ{0}b")); + assert_eq!(exact(["aZb", "ab"]), e(r"aZ?b")); + assert_eq!(exact(["ab", "aZb"]), e(r"aZ??b")); + assert_eq!( + inexact([I("aZ"), E("ab")], [I("Zb"), E("ab")]), + e(r"aZ*b") + ); + assert_eq!( + inexact([E("ab"), I("aZ")], [E("ab"), I("Zb")]), + e(r"aZ*?b") + ); + assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+b")); + assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+?b")); + + assert_eq!(exact(["aZZb"]), e(r"aZ{2}b")); + assert_eq!(inexact([I("aZZ")], [I("ZZb")]), e(r"aZ{2,3}b")); + + assert_eq!(exact(["abc", ""]), e(r"(abc)?")); + assert_eq!(exact(["", "abc"]), e(r"(abc)??")); + + assert_eq!(inexact([I("a"), E("b")], [I("ab"), E("b")]), e(r"a*b")); + assert_eq!(inexact([E("b"), I("a")], [E("b"), I("ab")]), e(r"a*?b")); + assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+")); + assert_eq!(inexact([I("a"), I("b")], [I("b")]), e(r"a*b+")); + + // FIXME: The suffixes for this don't look quite right to me. I think + // the right suffixes would be: [I(ac), I(bc), E(c)]. The main issue I + // think is that suffixes are computed by iterating over concatenations + // in reverse, and then [bc, ac, c] ordering is indeed correct from + // that perspective. We also test a few more equivalent regexes, and + // we get the same result, so it is consistent at least I suppose. 
+ // + // The reason why this isn't an issue is that it only messes up + // preference order, and currently, suffixes are never used in a + // context where preference order matters. For prefixes it matters + // because we sometimes want to use prefilters without confirmation + // when all of the literals are exact (and there's no look-around). But + // we never do that for suffixes. Any time we use suffixes, we always + // include a confirmation step. If that ever changes, then it's likely + // this bug will need to be fixed, but last time I looked, it appears + // hard to do so. + assert_eq!( + inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), + e(r"a*b*c") + ); + assert_eq!( + inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), + e(r"(a+)?(b+)?c") + ); + assert_eq!( + inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), + e(r"(a+|)(b+|)c") + ); + // A few more similarish but not identical regexes. These may have a + // similar problem as above. + assert_eq!( + inexact( + [I("a"), I("b"), I("c"), E("")], + [I("c"), I("b"), I("a"), E("")] + ), + e(r"a*b*c*") + ); + assert_eq!(inexact([I("a"), I("b"), I("c")], [I("c")]), e(r"a*b*c+")); + assert_eq!(inexact([I("a"), I("b")], [I("bc")]), e(r"a*b+c")); + assert_eq!(inexact([I("a"), I("b")], [I("c"), I("b")]), e(r"a*b+c*")); + assert_eq!(inexact([I("ab"), E("a")], [I("b"), E("a")]), e(r"ab*")); + assert_eq!( + inexact([I("ab"), E("ac")], [I("bc"), E("ac")]), + e(r"ab*c") + ); + assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+")); + assert_eq!(inexact([I("ab")], [I("bc")]), e(r"ab+c")); + + assert_eq!( + inexact([I("z"), E("azb")], [I("zazb"), E("azb")]), + e(r"z*azb") + ); + + let expected = + exact(["aaa", "aab", "aba", "abb", "baa", "bab", "bba", "bbb"]); + assert_eq!(expected, e(r"[ab]{3}")); + let expected = inexact( + [ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb"), + ], + [ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + 
I("baa"), + I("bab"), + I("bba"), + I("bbb"), + ], + ); + assert_eq!(expected, e(r"[ab]{3,4}")); + } + + #[test] + fn concat() { + let empty: [&str; 0] = []; + + assert_eq!(exact(["abcxyz"]), e(r"abc()xyz")); + assert_eq!(exact(["abcxyz"]), e(r"(abc)(xyz)")); + assert_eq!(exact(["abcmnoxyz"]), e(r"abc()mno()xyz")); + assert_eq!(exact(empty), e(r"abc[a&&b]xyz")); + assert_eq!(exact(["abcxyz"]), e(r"abc[a&&b]*xyz")); + } + + #[test] + fn alternation() { + assert_eq!(exact(["abc", "mno", "xyz"]), e(r"abc|mno|xyz")); + assert_eq!( + inexact( + [E("abc"), I("mZ"), E("mo"), E("xyz")], + [E("abc"), I("Zo"), E("mo"), E("xyz")] + ), + e(r"abc|mZ*o|xyz") + ); + assert_eq!(exact(["abc", "xyz"]), e(r"abc|M[a&&b]N|xyz")); + assert_eq!(exact(["abc", "MN", "xyz"]), e(r"abc|M[a&&b]*N|xyz")); + + assert_eq!(exact(["aaa", "aaaaa"]), e(r"(?:|aa)aaa")); + assert_eq!( + inexact( + [I("aaa"), E(""), I("aaaaa"), E("aa")], + [I("aaa"), E(""), E("aa")] + ), + e(r"(?:|aa)(?:aaa)*") + ); + assert_eq!( + inexact( + [E(""), I("aaa"), E("aa"), I("aaaaa")], + [E(""), I("aaa"), E("aa")] + ), + e(r"(?:|aa)(?:aaa)*?") + ); + + assert_eq!( + inexact([E("a"), I("b"), E("")], [E("a"), I("b"), E("")]), + e(r"a|b*") + ); + assert_eq!(inexact([E("a"), I("b")], [E("a"), I("b")]), e(r"a|b+")); + + assert_eq!( + inexact([I("a"), E("b"), E("c")], [I("ab"), E("b"), E("c")]), + e(r"a*b|c") + ); + + assert_eq!( + inexact( + [E("a"), E("b"), I("c"), E("")], + [E("a"), E("b"), I("c"), E("")] + ), + e(r"a|(?:b|c*)") + ); + + assert_eq!( + inexact( + [I("a"), I("b"), E("c"), I("a"), I("ab"), E("c")], + [I("ac"), I("bc"), E("c"), I("ac"), I("abc"), E("c")], + ), + e(r"(a|b)*c|(a|ab)*c") + ); + + assert_eq!( + exact(["abef", "abgh", "cdef", "cdgh"]), + e(r"(ab|cd)(ef|gh)") + ); + assert_eq!( + exact([ + "abefij", "abefkl", "abghij", "abghkl", "cdefij", "cdefkl", + "cdghij", "cdghkl", + ]), + e(r"(ab|cd)(ef|gh)(ij|kl)") + ); + } + + #[test] + fn impossible() { + let empty: [&str; 0] = []; + + assert_eq!(exact(empty), 
e(r"[a&&b]")); + assert_eq!(exact(empty), e(r"a[a&&b]")); + assert_eq!(exact(empty), e(r"[a&&b]b")); + assert_eq!(exact(empty), e(r"a[a&&b]b")); + assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]|b")); + assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]|b")); + assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]d|b")); + assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]d|b")); + assert_eq!(exact([""]), e(r"[a&&b]*")); + assert_eq!(exact(["MN"]), e(r"M[a&&b]*N")); + } + + // This tests patterns that contain something that defeats literal + // detection, usually because it would blow some limit on the total number + // of literals that can be returned. + // + // The main idea is that when literal extraction sees something that + // it knows will blow a limit, it replaces it with a marker that says + // "any literal will match here." While not necessarily true, the + // over-estimation is just fine for the purposes of literal extraction, + // because the imprecision doesn't matter: too big is too big. + // + // This is one of the trickier parts of literal extraction, since we need + // to make sure all of our literal extraction operations correctly compose + // with the markers. 
+ #[test] + fn anything() { + assert_eq!(infinite(), e(r".")); + assert_eq!(infinite(), e(r"(?s).")); + assert_eq!(infinite(), e(r"[A-Za-z]")); + assert_eq!(infinite(), e(r"[A-Z]")); + assert_eq!(exact([""]), e(r"[A-Z]{0}")); + assert_eq!(infinite(), e(r"[A-Z]?")); + assert_eq!(infinite(), e(r"[A-Z]*")); + assert_eq!(infinite(), e(r"[A-Z]+")); + assert_eq!((seq([I("1")]), Seq::infinite()), e(r"1[A-Z]")); + assert_eq!((seq([I("1")]), seq([I("2")])), e(r"1[A-Z]2")); + assert_eq!((Seq::infinite(), seq([I("123")])), e(r"[A-Z]+123")); + assert_eq!(infinite(), e(r"[A-Z]+123[A-Z]+")); + assert_eq!(infinite(), e(r"1|[A-Z]|3")); + assert_eq!( + (seq([E("1"), I("2"), E("3")]), Seq::infinite()), + e(r"1|2[A-Z]|3"), + ); + assert_eq!( + (Seq::infinite(), seq([E("1"), I("2"), E("3")])), + e(r"1|[A-Z]2|3"), + ); + assert_eq!( + (seq([E("1"), I("2"), E("4")]), seq([E("1"), I("3"), E("4")])), + e(r"1|2[A-Z]3|4"), + ); + assert_eq!((Seq::infinite(), seq([I("2")])), e(r"(?:|1)[A-Z]2")); + assert_eq!(inexact([I("a")], [I("z")]), e(r"a.z")); + } + + // Like the 'anything' test, but it uses smaller limits in order to test + // the logic for effectively aborting literal extraction when the seqs get + // too big. 
+ #[test] + fn anything_small_limits() { + fn prefixes(pattern: &str) -> Seq { + Extractor::new() + .kind(ExtractKind::Prefix) + .limit_total(10) + .extract(&parse(pattern)) + } + + fn suffixes(pattern: &str) -> Seq { + Extractor::new() + .kind(ExtractKind::Suffix) + .limit_total(10) + .extract(&parse(pattern)) + } + + fn e(pattern: &str) -> (Seq, Seq) { + (prefixes(pattern), suffixes(pattern)) + } + + assert_eq!( + ( + seq([ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb") + ]), + seq([ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb") + ]) + ), + e(r"[ab]{3}{3}") + ); + + assert_eq!(infinite(), e(r"ab|cd|ef|gh|ij|kl|mn|op|qr|st|uv|wx|yz")); + } + + #[test] + fn empty() { + assert_eq!(exact([""]), e(r"")); + assert_eq!(exact([""]), e(r"^")); + assert_eq!(exact([""]), e(r"$")); + assert_eq!(exact([""]), e(r"(?m:^)")); + assert_eq!(exact([""]), e(r"(?m:$)")); + assert_eq!(exact([""]), e(r"\b")); + assert_eq!(exact([""]), e(r"\B")); + assert_eq!(exact([""]), e(r"(?-u:\b)")); + assert_eq!(exact([""]), e(r"(?-u:\B)")); + } + + #[test] + fn odds_and_ends() { + assert_eq!((Seq::infinite(), seq([I("a")])), e(r".a")); + assert_eq!((seq([I("a")]), Seq::infinite()), e(r"a.")); + assert_eq!(infinite(), e(r"a|.")); + assert_eq!(infinite(), e(r".|a")); + + let pat = r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]"; + let expected = inexact( + ["Mo'am", "Moam", "Mu'am", "Muam"].map(I), + [ + "ddafi", "ddafy", "dhafi", "dhafy", "dzafi", "dzafy", "dafi", + "dafy", "tdafi", "tdafy", "thafi", "thafy", "tzafi", "tzafy", + "tafi", "tafy", "zdafi", "zdafy", "zhafi", "zhafy", "zzafi", + "zzafy", "zafi", "zafy", + ] + .map(I), + ); + assert_eq!(expected, e(pat)); + + assert_eq!( + (seq(["fn is_", "fn as_"].map(I)), Seq::infinite()), + e(r"fn is_([A-Z]+)|fn as_([A-Z]+)"), + ); + assert_eq!( + inexact([I("foo")], [I("quux")]), + e(r"foo[A-Z]+bar[A-Z]+quux") + ); + 
assert_eq!(infinite(), e(r"[A-Z]+bar[A-Z]+")); + assert_eq!( + exact(["Sherlock Holmes"]), + e(r"(?m)^Sherlock Holmes|Sherlock Holmes$") + ); + + assert_eq!(exact(["sa", "sb"]), e(r"\bs(?:[ab])")); + } + + // This tests a specific regex along with some heuristic steps to reduce + // the sequences extracted. This is meant to roughly correspond to the + // types of heuristics used to shrink literal sets in practice. (Shrinking + // is done because you want to balance "spend too much work looking for + // too many literals" and "spend too much work processing false positive + // matches from short literals.") + #[test] + #[cfg(feature = "unicode-case")] + fn holmes() { + let expected = inexact( + ["HOL", "HOl", "HoL", "Hol", "hOL", "hOl", "hoL", "hol"].map(I), + [ + "MES", "MEs", "Eſ", "MeS", "Mes", "eſ", "mES", "mEs", "meS", + "mes", + ] + .map(I), + ); + let (mut prefixes, mut suffixes) = e(r"(?i)Holmes"); + prefixes.keep_first_bytes(3); + suffixes.keep_last_bytes(3); + prefixes.minimize_by_preference(); + suffixes.minimize_by_preference(); + assert_eq!(expected, (prefixes, suffixes)); + } + + // This tests that we get some kind of literals extracted for a beefier + // alternation with case insensitive mode enabled. At one point during + // development, this returned nothing, and motivated some special case + // code in Extractor::union to try and trim down the literal sequences + // if the union would blow the limits set. + #[test] + #[cfg(feature = "unicode-case")] + fn holmes_alt() { + let mut pre = + prefixes(r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker"); + assert!(pre.len().unwrap() > 0); + pre.optimize_for_prefix_by_preference(); + assert!(pre.len().unwrap() > 0); + } + + // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 + // See: CVE-2022-24713 + // + // We test this here to ensure literal extraction completes in reasonable + // time and isn't materially impacted by these sorts of pathological + // repeats. 
+ #[test] + fn crazy_repeats() { + assert_eq!(inexact([I("")], [I("")]), e(r"(?:){4294967295}")); + assert_eq!( + inexact([I("")], [I("")]), + e(r"(?:){64}{64}{64}{64}{64}{64}") + ); + assert_eq!(inexact([I("")], [I("")]), e(r"x{0}{4294967295}")); + assert_eq!(inexact([I("")], [I("")]), e(r"(?:|){4294967295}")); + + assert_eq!( + inexact([E("")], [E("")]), + e(r"(?:){8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") + ); + let repa = "a".repeat(100); + assert_eq!( + inexact([I(&repa)], [I(&repa)]), + e(r"a{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") + ); + } + + #[test] + fn huge() { + let pat = r#"(?-u) + 2(?: + [45]\d{3}| + 7(?: + 1[0-267]| + 2[0-289]| + 3[0-29]| + 4[01]| + 5[1-3]| + 6[013]| + 7[0178]| + 91 + )| + 8(?: + 0[125]| + [139][1-6]| + 2[0157-9]| + 41| + 6[1-35]| + 7[1-5]| + 8[1-8]| + 90 + )| + 9(?: + 0[0-2]| + 1[0-4]| + 2[568]| + 3[3-6]| + 5[5-7]| + 6[0167]| + 7[15]| + 8[0146-9] + ) + )\d{4}| + 3(?: + 12?[5-7]\d{2}| + 0(?: + 2(?: + [025-79]\d| + [348]\d{1,2} + )| + 3(?: + [2-4]\d| + [56]\d? + ) + )| + 2(?: + 1\d{2}| + 2(?: + [12]\d| + [35]\d{1,2}| + 4\d? 
+ ) + )| + 3(?: + 1\d{2}| + 2(?: + [2356]\d| + 4\d{1,2} + ) + )| + 4(?: + 1\d{2}| + 2(?: + 2\d{1,2}| + [47]| + 5\d{2} + ) + )| + 5(?: + 1\d{2}| + 29 + )| + [67]1\d{2}| + 8(?: + 1\d{2}| + 2(?: + 2\d{2}| + 3| + 4\d + ) + ) + )\d{3}| + 4(?: + 0(?: + 2(?: + [09]\d| + 7 + )| + 33\d{2} + )| + 1\d{3}| + 2(?: + 1\d{2}| + 2(?: + [25]\d?| + [348]\d| + [67]\d{1,2} + ) + )| + 3(?: + 1\d{2}(?: + \d{2} + )?| + 2(?: + [045]\d| + [236-9]\d{1,2} + )| + 32\d{2} + )| + 4(?: + [18]\d{2}| + 2(?: + [2-46]\d{2}| + 3 + )| + 5[25]\d{2} + )| + 5(?: + 1\d{2}| + 2(?: + 3\d| + 5 + ) + )| + 6(?: + [18]\d{2}| + 2(?: + 3(?: + \d{2} + )?| + [46]\d{1,2}| + 5\d{2}| + 7\d + )| + 5(?: + 3\d?| + 4\d| + [57]\d{1,2}| + 6\d{2}| + 8 + ) + )| + 71\d{2}| + 8(?: + [18]\d{2}| + 23\d{2}| + 54\d{2} + )| + 9(?: + [18]\d{2}| + 2[2-5]\d{2}| + 53\d{1,2} + ) + )\d{3}| + 5(?: + 02[03489]\d{2}| + 1\d{2}| + 2(?: + 1\d{2}| + 2(?: + 2(?: + \d{2} + )?| + [457]\d{2} + ) + )| + 3(?: + 1\d{2}| + 2(?: + [37](?: + \d{2} + )?| + [569]\d{2} + ) + )| + 4(?: + 1\d{2}| + 2[46]\d{2} + )| + 5(?: + 1\d{2}| + 26\d{1,2} + )| + 6(?: + [18]\d{2}| + 2| + 53\d{2} + )| + 7(?: + 1| + 24 + )\d{2}| + 8(?: + 1| + 26 + )\d{2}| + 91\d{2} + )\d{3}| + 6(?: + 0(?: + 1\d{2}| + 2(?: + 3\d{2}| + 4\d{1,2} + ) + )| + 2(?: + 2[2-5]\d{2}| + 5(?: + [3-5]\d{2}| + 7 + )| + 8\d{2} + )| + 3(?: + 1| + 2[3478] + )\d{2}| + 4(?: + 1| + 2[34] + )\d{2}| + 5(?: + 1| + 2[47] + )\d{2}| + 6(?: + [18]\d{2}| + 6(?: + 2(?: + 2\d| + [34]\d{2} + )| + 5(?: + [24]\d{2}| + 3\d| + 5\d{1,2} + ) + ) + )| + 72[2-5]\d{2}| + 8(?: + 1\d{2}| + 2[2-5]\d{2} + )| + 9(?: + 1\d{2}| + 2[2-6]\d{2} + ) + )\d{3}| + 7(?: + (?: + 02| + [3-589]1| + 6[12]| + 72[24] + )\d{2}| + 21\d{3}| + 32 + )\d{3}| + 8(?: + (?: + 4[12]| + [5-7]2| + 1\d? 
+ )| + (?: + 0| + 3[12]| + [5-7]1| + 217 + )\d + )\d{4}| + 9(?: + [35]1| + (?: + [024]2| + 81 + )\d| + (?: + 1| + [24]1 + )\d{2} + )\d{3} + "#; + // TODO: This is a good candidate of a seq of literals that could be + // shrunk quite a bit and still be very productive with respect to + // literal optimizations. + let (prefixes, suffixes) = e(pat); + assert!(!suffixes.is_finite()); + assert_eq!(Some(243), prefixes.len()); + } + + #[test] + fn optimize() { + // This gets a common prefix that isn't too short. + let (p, s) = + opt(["foobarfoobar", "foobar", "foobarzfoobar", "foobarfoobar"]); + assert_eq!(seq([I("foobar")]), p); + assert_eq!(seq([I("foobar")]), s); + + // This also finds a common prefix, but since it's only one byte, it + // prefers the multiple literals. + let (p, s) = opt(["abba", "akka", "abccba"]); + assert_eq!(exact(["abba", "akka", "abccba"]), (p, s)); + + let (p, s) = opt(["sam", "samwise"]); + assert_eq!((seq([E("sam")]), seq([E("sam"), E("samwise")])), (p, s)); + + // The empty string is poisonous, so our seq becomes infinite, even + // though all literals are exact. + let (p, s) = opt(["foobarfoo", "foo", "", "foozfoo", "foofoo"]); + assert!(!p.is_finite()); + assert!(!s.is_finite()); + + // A space is also poisonous, so our seq becomes infinite. But this + // only gets triggered when we don't have a completely exact sequence. + // When the sequence is exact, spaces are okay, since we presume that + // any prefilter will match a space more quickly than the regex engine. + // (When the sequence is exact, there's a chance of the prefilter being + // used without needing the regex engine at all.) 
+ let mut p = seq([E("foobarfoo"), I("foo"), E(" "), E("foofoo")]); + p.optimize_for_prefix_by_preference(); + assert!(!p.is_finite()); + } +} diff --git a/vendor/regex-syntax/src/hir/literal/mod.rs b/vendor/regex-syntax/src/hir/literal/mod.rs deleted file mode 100644 index fbc5d3c97..000000000 --- a/vendor/regex-syntax/src/hir/literal/mod.rs +++ /dev/null @@ -1,1686 +0,0 @@ -/*! -Provides routines for extracting literal prefixes and suffixes from an `Hir`. -*/ - -use std::cmp; -use std::fmt; -use std::iter; -use std::mem; -use std::ops; - -use crate::hir::{self, Hir, HirKind}; - -/// A set of literal byte strings extracted from a regular expression. -/// -/// Every member of the set is a `Literal`, which is represented by a -/// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is -/// said to be either *complete* or *cut*. A complete literal means that -/// it extends until the beginning (or end) of the regular expression. In -/// some circumstances, this can be used to indicate a match in the regular -/// expression. -/// -/// A key aspect of literal extraction is knowing when to stop. It is not -/// feasible to blindly extract all literals from a regular expression, even if -/// there are finitely many. For example, the regular expression `[0-9]{10}` -/// has `10^10` distinct literals. For this reason, literal extraction is -/// bounded to some low number by default using heuristics, but the limits can -/// be tweaked. -/// -/// **WARNING**: Literal extraction uses stack space proportional to the size -/// of the `Hir` expression. At some point, this drawback will be eliminated. -/// To protect yourself, set a reasonable -/// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit). -/// This is done for you by default. -#[derive(Clone, Eq, PartialEq)] -pub struct Literals { - lits: Vec<Literal>, - limit_size: usize, - limit_class: usize, -} - -/// A single member of a set of literals extracted from a regular expression. 
-/// -/// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice -/// and `Vec` operations are available. -#[derive(Clone, Eq, Ord)] -pub struct Literal { - v: Vec<u8>, - cut: bool, -} - -impl Literals { - /// Returns a new empty set of literals using default limits. - pub fn empty() -> Literals { - Literals { lits: vec![], limit_size: 250, limit_class: 10 } - } - - /// Returns a set of literal prefixes extracted from the given `Hir`. - pub fn prefixes(expr: &Hir) -> Literals { - let mut lits = Literals::empty(); - lits.union_prefixes(expr); - lits - } - - /// Returns a set of literal suffixes extracted from the given `Hir`. - pub fn suffixes(expr: &Hir) -> Literals { - let mut lits = Literals::empty(); - lits.union_suffixes(expr); - lits - } - - /// Get the approximate size limit (in bytes) of this set. - pub fn limit_size(&self) -> usize { - self.limit_size - } - - /// Set the approximate size limit (in bytes) of this set. - /// - /// If extracting a literal would put the set over this limit, then - /// extraction stops. - /// - /// The new limits will only apply to additions to this set. Existing - /// members remain unchanged, even if the set exceeds the new limit. - pub fn set_limit_size(&mut self, size: usize) -> &mut Literals { - self.limit_size = size; - self - } - - /// Get the character class size limit for this set. - pub fn limit_class(&self) -> usize { - self.limit_class - } - - /// Limits the size of character(or byte) classes considered. - /// - /// A value of `0` prevents all character classes from being considered. - /// - /// This limit also applies to case insensitive literals, since each - /// character in the case insensitive literal is converted to a class, and - /// then case folded. - /// - /// The new limits will only apply to additions to this set. Existing - /// members remain unchanged, even if the set exceeds the new limit. 
- pub fn set_limit_class(&mut self, size: usize) -> &mut Literals { - self.limit_class = size; - self - } - - /// Returns the set of literals as a slice. Its order is unspecified. - pub fn literals(&self) -> &[Literal] { - &self.lits - } - - /// Returns the length of the smallest literal. - /// - /// Returns None is there are no literals in the set. - pub fn min_len(&self) -> Option<usize> { - let mut min = None; - for lit in &self.lits { - match min { - None => min = Some(lit.len()), - Some(m) if lit.len() < m => min = Some(lit.len()), - _ => {} - } - } - min - } - - /// Returns true if all members in this set are complete. - pub fn all_complete(&self) -> bool { - !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut()) - } - - /// Returns true if any member in this set is complete. - pub fn any_complete(&self) -> bool { - self.lits.iter().any(|lit| !lit.is_cut()) - } - - /// Returns true if this set contains an empty literal. - pub fn contains_empty(&self) -> bool { - self.lits.iter().any(|lit| lit.is_empty()) - } - - /// Returns true if this set is empty or if all of its members is empty. - pub fn is_empty(&self) -> bool { - self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty()) - } - - /// Returns a new empty set of literals using this set's limits. - pub fn to_empty(&self) -> Literals { - let mut lits = Literals::empty(); - lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class); - lits - } - - /// Returns the longest common prefix of all members in this set. - pub fn longest_common_prefix(&self) -> &[u8] { - if self.is_empty() { - return &[]; - } - let lit0 = &*self.lits[0]; - let mut len = lit0.len(); - for lit in &self.lits[1..] { - len = cmp::min( - len, - lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(), - ); - } - &self.lits[0][..len] - } - - /// Returns the longest common suffix of all members in this set. 
- pub fn longest_common_suffix(&self) -> &[u8] { - if self.is_empty() { - return &[]; - } - let lit0 = &*self.lits[0]; - let mut len = lit0.len(); - for lit in &self.lits[1..] { - len = cmp::min( - len, - lit.iter() - .rev() - .zip(lit0.iter().rev()) - .take_while(|&(a, b)| a == b) - .count(), - ); - } - &self.lits[0][self.lits[0].len() - len..] - } - - /// Returns a new set of literals with the given number of bytes trimmed - /// from the suffix of each literal. - /// - /// If any literal would be cut out completely by trimming, then None is - /// returned. - /// - /// Any duplicates that are created as a result of this transformation are - /// removed. - pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> { - if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) { - return None; - } - let mut new = self.to_empty(); - for mut lit in self.lits.iter().cloned() { - let new_len = lit.len() - num_bytes; - lit.truncate(new_len); - lit.cut(); - new.lits.push(lit); - } - new.lits.sort(); - new.lits.dedup(); - Some(new) - } - - /// Returns a new set of prefixes of this set of literals that are - /// guaranteed to be unambiguous. - /// - /// Any substring match with a member of the set is returned is guaranteed - /// to never overlap with a substring match of another member of the set - /// at the same starting position. - /// - /// Given any two members of the returned set, neither is a substring of - /// the other. - pub fn unambiguous_prefixes(&self) -> Literals { - if self.lits.is_empty() { - return self.to_empty(); - } - let mut old = self.lits.to_vec(); - let mut new = self.to_empty(); - 'OUTER: while let Some(mut candidate) = old.pop() { - if candidate.is_empty() { - continue; - } - if new.lits.is_empty() { - new.lits.push(candidate); - continue; - } - for lit2 in &mut new.lits { - if lit2.is_empty() { - continue; - } - if &candidate == lit2 { - // If the literal is already in the set, then we can - // just drop it. 
But make sure that cut literals are - // infectious! - candidate.cut = candidate.cut || lit2.cut; - lit2.cut = candidate.cut; - continue 'OUTER; - } - if candidate.len() < lit2.len() { - if let Some(i) = position(&candidate, &lit2) { - candidate.cut(); - let mut lit3 = lit2.clone(); - lit3.truncate(i); - lit3.cut(); - old.push(lit3); - lit2.clear(); - } - } else if let Some(i) = position(&lit2, &candidate) { - lit2.cut(); - let mut new_candidate = candidate.clone(); - new_candidate.truncate(i); - new_candidate.cut(); - old.push(new_candidate); - candidate.clear(); - } - // Oops, the candidate is already represented in the set. - if candidate.is_empty() { - continue 'OUTER; - } - } - new.lits.push(candidate); - } - new.lits.retain(|lit| !lit.is_empty()); - new.lits.sort(); - new.lits.dedup(); - new - } - - /// Returns a new set of suffixes of this set of literals that are - /// guaranteed to be unambiguous. - /// - /// Any substring match with a member of the set is returned is guaranteed - /// to never overlap with a substring match of another member of the set - /// at the same ending position. - /// - /// Given any two members of the returned set, neither is a substring of - /// the other. - pub fn unambiguous_suffixes(&self) -> Literals { - // This is a touch wasteful... - let mut lits = self.clone(); - lits.reverse(); - let mut unamb = lits.unambiguous_prefixes(); - unamb.reverse(); - unamb - } - - /// Unions the prefixes from the given expression to this set. - /// - /// If prefixes could not be added (for example, this set would exceed its - /// size limits or the set of prefixes from `expr` includes the empty - /// string), then false is returned. - /// - /// Note that prefix literals extracted from `expr` are said to be complete - /// if and only if the literal extends from the beginning of `expr` to the - /// end of `expr`. 
- pub fn union_prefixes(&mut self, expr: &Hir) -> bool { - let mut lits = self.to_empty(); - prefixes(expr, &mut lits); - !lits.is_empty() && !lits.contains_empty() && self.union(lits) - } - - /// Unions the suffixes from the given expression to this set. - /// - /// If suffixes could not be added (for example, this set would exceed its - /// size limits or the set of suffixes from `expr` includes the empty - /// string), then false is returned. - /// - /// Note that prefix literals extracted from `expr` are said to be complete - /// if and only if the literal extends from the end of `expr` to the - /// beginning of `expr`. - pub fn union_suffixes(&mut self, expr: &Hir) -> bool { - let mut lits = self.to_empty(); - suffixes(expr, &mut lits); - lits.reverse(); - !lits.is_empty() && !lits.contains_empty() && self.union(lits) - } - - /// Unions this set with another set. - /// - /// If the union would cause the set to exceed its limits, then the union - /// is skipped and it returns false. Otherwise, if the union succeeds, it - /// returns true. - pub fn union(&mut self, lits: Literals) -> bool { - if self.num_bytes() + lits.num_bytes() > self.limit_size { - return false; - } - if lits.is_empty() { - self.lits.push(Literal::empty()); - } else { - self.lits.extend(lits.lits); - } - true - } - - /// Extends this set with another set. - /// - /// The set of literals is extended via a cross product. - /// - /// If a cross product would cause this set to exceed its limits, then the - /// cross product is skipped and it returns false. Otherwise, if the cross - /// product succeeds, it returns true. - pub fn cross_product(&mut self, lits: &Literals) -> bool { - if lits.is_empty() { - return true; - } - // Check that we make sure we stay in our limits. 
- let mut size_after; - if self.is_empty() || !self.any_complete() { - size_after = self.num_bytes(); - for lits_lit in lits.literals() { - size_after += lits_lit.len(); - } - } else { - size_after = self.lits.iter().fold(0, |accum, lit| { - accum + if lit.is_cut() { lit.len() } else { 0 } - }); - for lits_lit in lits.literals() { - for self_lit in self.literals() { - if !self_lit.is_cut() { - size_after += self_lit.len() + lits_lit.len(); - } - } - } - } - if size_after > self.limit_size { - return false; - } - - let mut base = self.remove_complete(); - if base.is_empty() { - base = vec![Literal::empty()]; - } - for lits_lit in lits.literals() { - for mut self_lit in base.clone() { - self_lit.extend(&**lits_lit); - self_lit.cut = lits_lit.cut; - self.lits.push(self_lit); - } - } - true - } - - /// Extends each literal in this set with the bytes given. - /// - /// If the set is empty, then the given literal is added to the set. - /// - /// If adding any number of bytes to all members of this set causes a limit - /// to be exceeded, then no bytes are added and false is returned. If a - /// prefix of `bytes` can be fit into this set, then it is used and all - /// resulting literals are cut. - pub fn cross_add(&mut self, bytes: &[u8]) -> bool { - // N.B. This could be implemented by simply calling cross_product with - // a literal set containing just `bytes`, but we can be smarter about - // taking shorter prefixes of `bytes` if they'll fit. 
- if bytes.is_empty() { - return true; - } - if self.lits.is_empty() { - let i = cmp::min(self.limit_size, bytes.len()); - self.lits.push(Literal::new(bytes[..i].to_owned())); - self.lits[0].cut = i < bytes.len(); - return !self.lits[0].is_cut(); - } - let size = self.num_bytes(); - if size + self.lits.len() >= self.limit_size { - return false; - } - let mut i = 1; - while size + (i * self.lits.len()) <= self.limit_size - && i < bytes.len() - { - i += 1; - } - for lit in &mut self.lits { - if !lit.is_cut() { - lit.extend(&bytes[..i]); - if i < bytes.len() { - lit.cut(); - } - } - } - true - } - - /// Adds the given literal to this set. - /// - /// Returns false if adding this literal would cause the class to be too - /// big. - pub fn add(&mut self, lit: Literal) -> bool { - if self.num_bytes() + lit.len() > self.limit_size { - return false; - } - self.lits.push(lit); - true - } - - /// Extends each literal in this set with the character class given. - /// - /// Returns false if the character class was too big to add. - pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool { - self._add_char_class(cls, false) - } - - /// Extends each literal in this set with the character class given, - /// writing the bytes of each character in reverse. - /// - /// Returns false if the character class was too big to add. 
- fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool { - self._add_char_class(cls, true) - } - - fn _add_char_class( - &mut self, - cls: &hir::ClassUnicode, - reverse: bool, - ) -> bool { - use std::char; - - if self.class_exceeds_limits(cls_char_count(cls)) { - return false; - } - let mut base = self.remove_complete(); - if base.is_empty() { - base = vec![Literal::empty()]; - } - for r in cls.iter() { - let (s, e) = (r.start as u32, r.end as u32 + 1); - for c in (s..e).filter_map(char::from_u32) { - for mut lit in base.clone() { - let mut bytes = c.to_string().into_bytes(); - if reverse { - bytes.reverse(); - } - lit.extend(&bytes); - self.lits.push(lit); - } - } - } - true - } - - /// Extends each literal in this set with the byte class given. - /// - /// Returns false if the byte class was too big to add. - pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool { - if self.class_exceeds_limits(cls_byte_count(cls)) { - return false; - } - let mut base = self.remove_complete(); - if base.is_empty() { - base = vec![Literal::empty()]; - } - for r in cls.iter() { - let (s, e) = (r.start as u32, r.end as u32 + 1); - for b in (s..e).map(|b| b as u8) { - for mut lit in base.clone() { - lit.push(b); - self.lits.push(lit); - } - } - } - true - } - - /// Cuts every member of this set. When a member is cut, it can never - /// be extended. - pub fn cut(&mut self) { - for lit in &mut self.lits { - lit.cut(); - } - } - - /// Reverses all members in place. - pub fn reverse(&mut self) { - for lit in &mut self.lits { - lit.reverse(); - } - } - - /// Clears this set of all members. - pub fn clear(&mut self) { - self.lits.clear(); - } - - /// Pops all complete literals out of this set. - fn remove_complete(&mut self) -> Vec<Literal> { - let mut base = vec![]; - for lit in mem::replace(&mut self.lits, vec![]) { - if lit.is_cut() { - self.lits.push(lit); - } else { - base.push(lit); - } - } - base - } - - /// Returns the total number of bytes in this set. 
- fn num_bytes(&self) -> usize { - self.lits.iter().fold(0, |accum, lit| accum + lit.len()) - } - - /// Returns true if a character class with the given size would cause this - /// set to exceed its limits. - /// - /// The size given should correspond to the number of items in the class. - fn class_exceeds_limits(&self, size: usize) -> bool { - if size > self.limit_class { - return true; - } - // This is an approximation since codepoints in a char class can encode - // to 1-4 bytes. - let new_byte_count = if self.lits.is_empty() { - size - } else { - self.lits.iter().fold(0, |accum, lit| { - accum - + if lit.is_cut() { - // If the literal is cut, then we'll never add - // anything to it, so don't count it. - 0 - } else { - (lit.len() + 1) * size - } - }) - }; - new_byte_count > self.limit_size - } -} - -fn prefixes(expr: &Hir, lits: &mut Literals) { - match *expr.kind() { - HirKind::Literal(hir::Literal::Unicode(c)) => { - let mut buf = [0; 4]; - lits.cross_add(c.encode_utf8(&mut buf).as_bytes()); - } - HirKind::Literal(hir::Literal::Byte(b)) => { - lits.cross_add(&[b]); - } - HirKind::Class(hir::Class::Unicode(ref cls)) => { - if !lits.add_char_class(cls) { - lits.cut(); - } - } - HirKind::Class(hir::Class::Bytes(ref cls)) => { - if !lits.add_byte_class(cls) { - lits.cut(); - } - } - HirKind::Group(hir::Group { ref hir, .. 
}) => { - prefixes(&**hir, lits); - } - HirKind::Repetition(ref x) => match x.kind { - hir::RepetitionKind::ZeroOrOne => { - repeat_zero_or_one_literals(&x.hir, lits, prefixes); - } - hir::RepetitionKind::ZeroOrMore => { - repeat_zero_or_more_literals(&x.hir, lits, prefixes); - } - hir::RepetitionKind::OneOrMore => { - repeat_one_or_more_literals(&x.hir, lits, prefixes); - } - hir::RepetitionKind::Range(ref rng) => { - let (min, max) = match *rng { - hir::RepetitionRange::Exactly(m) => (m, Some(m)), - hir::RepetitionRange::AtLeast(m) => (m, None), - hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), - }; - repeat_range_literals( - &x.hir, min, max, x.greedy, lits, prefixes, - ) - } - }, - HirKind::Concat(ref es) if es.is_empty() => {} - HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits), - HirKind::Concat(ref es) => { - for e in es { - if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() { - if !lits.is_empty() { - lits.cut(); - break; - } - lits.add(Literal::empty()); - continue; - } - let mut lits2 = lits.to_empty(); - prefixes(e, &mut lits2); - if !lits.cross_product(&lits2) || !lits2.any_complete() { - // If this expression couldn't yield any literal that - // could be extended, then we need to quit. Since we're - // short-circuiting, we also need to freeze every member. 
- lits.cut(); - break; - } - } - } - HirKind::Alternation(ref es) => { - alternate_literals(es, lits, prefixes); - } - _ => lits.cut(), - } -} - -fn suffixes(expr: &Hir, lits: &mut Literals) { - match *expr.kind() { - HirKind::Literal(hir::Literal::Unicode(c)) => { - let mut buf = [0u8; 4]; - let i = c.encode_utf8(&mut buf).len(); - let buf = &mut buf[..i]; - buf.reverse(); - lits.cross_add(buf); - } - HirKind::Literal(hir::Literal::Byte(b)) => { - lits.cross_add(&[b]); - } - HirKind::Class(hir::Class::Unicode(ref cls)) => { - if !lits.add_char_class_reverse(cls) { - lits.cut(); - } - } - HirKind::Class(hir::Class::Bytes(ref cls)) => { - if !lits.add_byte_class(cls) { - lits.cut(); - } - } - HirKind::Group(hir::Group { ref hir, .. }) => { - suffixes(&**hir, lits); - } - HirKind::Repetition(ref x) => match x.kind { - hir::RepetitionKind::ZeroOrOne => { - repeat_zero_or_one_literals(&x.hir, lits, suffixes); - } - hir::RepetitionKind::ZeroOrMore => { - repeat_zero_or_more_literals(&x.hir, lits, suffixes); - } - hir::RepetitionKind::OneOrMore => { - repeat_one_or_more_literals(&x.hir, lits, suffixes); - } - hir::RepetitionKind::Range(ref rng) => { - let (min, max) = match *rng { - hir::RepetitionRange::Exactly(m) => (m, Some(m)), - hir::RepetitionRange::AtLeast(m) => (m, None), - hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), - }; - repeat_range_literals( - &x.hir, min, max, x.greedy, lits, suffixes, - ) - } - }, - HirKind::Concat(ref es) if es.is_empty() => {} - HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits), - HirKind::Concat(ref es) => { - for e in es.iter().rev() { - if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() { - if !lits.is_empty() { - lits.cut(); - break; - } - lits.add(Literal::empty()); - continue; - } - let mut lits2 = lits.to_empty(); - suffixes(e, &mut lits2); - if !lits.cross_product(&lits2) || !lits2.any_complete() { - // If this expression couldn't yield any literal that - // could be extended, then we need to quit. 
Since we're - // short-circuiting, we also need to freeze every member. - lits.cut(); - break; - } - } - } - HirKind::Alternation(ref es) => { - alternate_literals(es, lits, suffixes); - } - _ => lits.cut(), - } -} - -fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>( - e: &Hir, - lits: &mut Literals, - mut f: F, -) { - f( - &Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, - // FIXME: Our literal extraction doesn't care about greediness. - // Which is partially why we're treating 'e?' as 'e*'. Namely, - // 'ab??' yields [Complete(ab), Complete(a)], but it should yield - // [Complete(a), Complete(ab)] because of the non-greediness. - greedy: true, - hir: Box::new(e.clone()), - }), - lits, - ); -} - -fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>( - e: &Hir, - lits: &mut Literals, - mut f: F, -) { - let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); - lits3.set_limit_size(lits.limit_size() / 2); - f(e, &mut lits3); - - if lits3.is_empty() || !lits2.cross_product(&lits3) { - lits.cut(); - return; - } - lits2.cut(); - lits2.add(Literal::empty()); - if !lits.union(lits2) { - lits.cut(); - } -} - -fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>( - e: &Hir, - lits: &mut Literals, - mut f: F, -) { - f(e, lits); - lits.cut(); -} - -fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>( - e: &Hir, - min: u32, - max: Option<u32>, - greedy: bool, - lits: &mut Literals, - mut f: F, -) { - if min == 0 { - // This is a bit conservative. If `max` is set, then we could - // treat this as a finite set of alternations. For now, we - // just treat it as `e*`. 
- f( - &Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, - greedy, - hir: Box::new(e.clone()), - }), - lits, - ); - } else { - if min > 0 { - let n = cmp::min(lits.limit_size, min as usize); - let es = iter::repeat(e.clone()).take(n).collect(); - f(&Hir::concat(es), lits); - if n < min as usize || lits.contains_empty() { - lits.cut(); - } - } - if max.map_or(true, |max| min < max) { - lits.cut(); - } - } -} - -fn alternate_literals<F: FnMut(&Hir, &mut Literals)>( - es: &[Hir], - lits: &mut Literals, - mut f: F, -) { - let mut lits2 = lits.to_empty(); - for e in es { - let mut lits3 = lits.to_empty(); - lits3.set_limit_size(lits.limit_size() / 5); - f(e, &mut lits3); - if lits3.is_empty() || !lits2.union(lits3) { - // If we couldn't find suffixes for *any* of the - // alternates, then the entire alternation has to be thrown - // away and any existing members must be frozen. Similarly, - // if the union couldn't complete, stop and freeze. - lits.cut(); - return; - } - } - if !lits.cross_product(&lits2) { - lits.cut(); - } -} - -impl fmt::Debug for Literals { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("Literals") - .field("lits", &self.lits) - .field("limit_size", &self.limit_size) - .field("limit_class", &self.limit_class) - .finish() - } -} - -impl Literal { - /// Returns a new complete literal with the bytes given. - pub fn new(bytes: Vec<u8>) -> Literal { - Literal { v: bytes, cut: false } - } - - /// Returns a new complete empty literal. - pub fn empty() -> Literal { - Literal { v: vec![], cut: false } - } - - /// Returns true if this literal was "cut." - pub fn is_cut(&self) -> bool { - self.cut - } - - /// Cuts this literal. 
- pub fn cut(&mut self) { - self.cut = true; - } -} - -impl PartialEq for Literal { - fn eq(&self, other: &Literal) -> bool { - self.v == other.v - } -} - -impl PartialOrd for Literal { - fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> { - self.v.partial_cmp(&other.v) - } -} - -impl fmt::Debug for Literal { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - if self.is_cut() { - write!(f, "Cut({})", escape_unicode(&self.v)) - } else { - write!(f, "Complete({})", escape_unicode(&self.v)) - } - } -} - -impl AsRef<[u8]> for Literal { - fn as_ref(&self) -> &[u8] { - &self.v - } -} - -impl ops::Deref for Literal { - type Target = Vec<u8>; - fn deref(&self) -> &Vec<u8> { - &self.v - } -} - -impl ops::DerefMut for Literal { - fn deref_mut(&mut self) -> &mut Vec<u8> { - &mut self.v - } -} - -fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> { - let mut i = 0; - while haystack.len() >= needle.len() { - if needle == &haystack[..needle.len()] { - return Some(i); - } - i += 1; - haystack = &haystack[1..]; - } - None -} - -fn escape_unicode(bytes: &[u8]) -> String { - let show = match ::std::str::from_utf8(bytes) { - Ok(v) => v.to_string(), - Err(_) => escape_bytes(bytes), - }; - let mut space_escaped = String::new(); - for c in show.chars() { - if c.is_whitespace() { - let escaped = if c as u32 <= 0x7F { - escape_byte(c as u8) - } else if c as u32 <= 0xFFFF { - format!(r"\u{{{:04x}}}", c as u32) - } else { - format!(r"\U{{{:08x}}}", c as u32) - }; - space_escaped.push_str(&escaped); - } else { - space_escaped.push(c); - } - } - space_escaped -} - -fn escape_bytes(bytes: &[u8]) -> String { - let mut s = String::new(); - for &b in bytes { - s.push_str(&escape_byte(b)); - } - s -} - -fn escape_byte(byte: u8) -> String { - use std::ascii::escape_default; - - let escaped: Vec<u8> = escape_default(byte).collect(); - String::from_utf8_lossy(&escaped).into_owned() -} - -fn cls_char_count(cls: &hir::ClassUnicode) -> usize { - 
cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>() - as usize -} - -fn cls_byte_count(cls: &hir::ClassBytes) -> usize { - cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>() - as usize -} - -#[cfg(test)] -mod tests { - use std::fmt; - - use super::{escape_bytes, Literal, Literals}; - use crate::hir::Hir; - use crate::ParserBuilder; - - // To make test failures easier to read. - #[derive(Debug, Eq, PartialEq)] - struct Bytes(Vec<ULiteral>); - #[derive(Debug, Eq, PartialEq)] - struct Unicode(Vec<ULiteral>); - - fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> { - let mut ulits = vec![]; - for blit in blits { - ulits - .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() }); - } - ulits - } - - fn create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals { - Literals { - lits: it.into_iter().collect(), - limit_size: 0, - limit_class: 0, - } - } - - // Needs to be pub for 1.3? - #[derive(Clone, Eq, PartialEq)] - pub struct ULiteral { - v: String, - cut: bool, - } - - impl ULiteral { - fn is_cut(&self) -> bool { - self.cut - } - } - - impl fmt::Debug for ULiteral { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - if self.is_cut() { - write!(f, "Cut({})", self.v) - } else { - write!(f, "Complete({})", self.v) - } - } - } - - impl PartialEq<Literal> for ULiteral { - fn eq(&self, other: &Literal) -> bool { - self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut() - } - } - - impl PartialEq<ULiteral> for Literal { - fn eq(&self, other: &ULiteral) -> bool { - &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut() - } - } - - #[allow(non_snake_case)] - fn C(s: &'static str) -> ULiteral { - ULiteral { v: s.to_owned(), cut: true } - } - #[allow(non_snake_case)] - fn M(s: &'static str) -> ULiteral { - ULiteral { v: s.to_owned(), cut: false } - } - - fn prefixes(lits: &mut Literals, expr: &Hir) { - lits.union_prefixes(expr); - } - - fn suffixes(lits: &mut Literals, expr: &Hir) { - 
lits.union_suffixes(expr); - } - - macro_rules! assert_lit_eq { - ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{ - let expected: Vec<ULiteral> = vec![$($expected_lit),*]; - let lits = $got_lits; - assert_eq!( - $which(expected.clone()), - $which(escape_lits(lits.literals()))); - assert_eq!( - !expected.is_empty() && expected.iter().all(|l| !l.is_cut()), - lits.all_complete()); - assert_eq!( - expected.iter().any(|l| !l.is_cut()), - lits.any_complete()); - }}; - } - - macro_rules! test_lit { - ($name:ident, $which:ident, $re:expr) => { - test_lit!($name, $which, $re,); - }; - ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { - #[test] - fn $name() { - let expr = ParserBuilder::new() - .build() - .parse($re) - .unwrap(); - let lits = Literals::$which(&expr); - assert_lit_eq!(Unicode, lits, $($lit),*); - - let expr = ParserBuilder::new() - .allow_invalid_utf8(true) - .unicode(false) - .build() - .parse($re) - .unwrap(); - let lits = Literals::$which(&expr); - assert_lit_eq!(Bytes, lits, $($lit),*); - } - }; - } - - // ************************************************************************ - // Tests for prefix literal extraction. - // ************************************************************************ - - // Elementary tests. 
- test_lit!(pfx_one_lit1, prefixes, "a", M("a")); - test_lit!(pfx_one_lit2, prefixes, "abc", M("abc")); - test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83")); - #[cfg(feature = "unicode-case")] - test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83")); - test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); - test_lit!( - pfx_class2, - prefixes, - "(?u)[☃Ⅰ]", - M("\\xe2\\x85\\xa0"), - M("\\xe2\\x98\\x83") - ); - #[cfg(feature = "unicode-case")] - test_lit!( - pfx_class3, - prefixes, - "(?ui)[☃Ⅰ]", - M("\\xe2\\x85\\xa0"), - M("\\xe2\\x85\\xb0"), - M("\\xe2\\x98\\x83") - ); - test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a")); - test_lit!( - pfx_one_lit_casei2, - prefixes, - "(?i-u)abc", - M("ABC"), - M("aBC"), - M("AbC"), - M("abC"), - M("ABc"), - M("aBc"), - M("Abc"), - M("abc") - ); - test_lit!(pfx_group1, prefixes, "(a)", M("a")); - test_lit!(pfx_rep_zero_or_one1, prefixes, "a?"); - test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?"); - test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a")); - // FIXME: This should return [M("a"), M("ab")] because of the non-greedy - // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get - // a cut literal. 
- test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a")); - test_lit!(pfx_rep_zero_or_more1, prefixes, "a*"); - test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*"); - test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a")); - test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc")); - test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a")); - test_lit!(pfx_rep_range1, prefixes, "a{0}"); - test_lit!(pfx_rep_range2, prefixes, "a{0,}"); - test_lit!(pfx_rep_range3, prefixes, "a{0,1}"); - test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a")); - test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa")); - test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a")); - test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa")); - - // Test regexes with concatenations. - test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab")); - test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz")); - test_lit!( - pfx_cat3, - prefixes, - "(?i-u)[ab]z", - M("AZ"), - M("BZ"), - M("aZ"), - M("bZ"), - M("Az"), - M("Bz"), - M("az"), - M("bz") - ); - test_lit!( - pfx_cat4, - prefixes, - "[ab][yz]", - M("ay"), - M("by"), - M("az"), - M("bz") - ); - test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b")); - test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c")); - test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c")); - test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b")); - test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b")); - test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a")); - test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac")); - test_lit!(pfx_cat12, prefixes, "ab+", C("ab")); - test_lit!(pfx_cat13, prefixes, "ab+c", C("ab")); - test_lit!(pfx_cat14, prefixes, "a^", C("a")); - test_lit!(pfx_cat15, prefixes, "$a"); - test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac")); - test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab")); - test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb")); - test_lit!(pfx_cat19, prefixes, "a.z", C("a")); - - // 
Test regexes with alternations. - test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b")); - test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); - test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz")); - test_lit!(pfx_alt4, prefixes, "a|b*"); - test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b")); - test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)"); - test_lit!( - pfx_alt7, - prefixes, - "(a|b)*c|(a|ab)*c", - C("a"), - C("b"), - M("c"), - C("a"), - C("ab"), - M("c") - ); - test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c")); - - // Test regexes with empty assertions. - test_lit!(pfx_empty1, prefixes, "^a", M("a")); - test_lit!(pfx_empty2, prefixes, "a${2}", C("a")); - test_lit!(pfx_empty3, prefixes, "^abc", M("abc")); - test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z")); - - // Make sure some curious regexes have no prefixes. - test_lit!(pfx_nothing1, prefixes, "."); - test_lit!(pfx_nothing2, prefixes, "(?s)."); - test_lit!(pfx_nothing3, prefixes, "^"); - test_lit!(pfx_nothing4, prefixes, "$"); - test_lit!(pfx_nothing6, prefixes, "(?m)$"); - test_lit!(pfx_nothing7, prefixes, r"\b"); - test_lit!(pfx_nothing8, prefixes, r"\B"); - - // Test a few regexes that defeat any prefix literal detection. 
- test_lit!(pfx_defeated1, prefixes, ".a"); - test_lit!(pfx_defeated2, prefixes, "(?s).a"); - test_lit!(pfx_defeated3, prefixes, "a*b*c*"); - test_lit!(pfx_defeated4, prefixes, "a|."); - test_lit!(pfx_defeated5, prefixes, ".|a"); - test_lit!(pfx_defeated6, prefixes, "a|^"); - test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))"); - test_lit!(pfx_defeated8, prefixes, "$a"); - test_lit!(pfx_defeated9, prefixes, "(?m)$a"); - test_lit!(pfx_defeated10, prefixes, r"\ba"); - test_lit!(pfx_defeated11, prefixes, r"\Ba"); - test_lit!(pfx_defeated12, prefixes, "^*a"); - test_lit!(pfx_defeated13, prefixes, "^+a"); - - test_lit!( - pfx_crazy1, - prefixes, - r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", - C("Mo\\'"), - C("Mu\\'"), - C("Moam"), - C("Muam") - ); - - // ************************************************************************ - // Tests for quiting prefix literal search. - // ************************************************************************ - - macro_rules! test_exhausted { - ($name:ident, $which:ident, $re:expr) => { - test_exhausted!($name, $which, $re,); - }; - ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { - #[test] - fn $name() { - let expr = ParserBuilder::new() - .build() - .parse($re) - .unwrap(); - let mut lits = Literals::empty(); - lits.set_limit_size(20).set_limit_class(10); - $which(&mut lits, &expr); - assert_lit_eq!(Unicode, lits, $($lit),*); - - let expr = ParserBuilder::new() - .allow_invalid_utf8(true) - .unicode(false) - .build() - .parse($re) - .unwrap(); - let mut lits = Literals::empty(); - lits.set_limit_size(20).set_limit_class(10); - $which(&mut lits, &expr); - assert_lit_eq!(Bytes, lits, $($lit),*); - } - }; - } - - // These test use a much lower limit than the default so that we can - // write test cases of reasonable size. 
- test_exhausted!(pfx_exhausted1, prefixes, "[a-z]"); - test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A"); - test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A")); - test_exhausted!( - pfx_exhausted4, - prefixes, - "(?i-u)foobar", - C("FO"), - C("fO"), - C("Fo"), - C("fo") - ); - test_exhausted!( - pfx_exhausted5, - prefixes, - "(?:ab){100}", - C("abababababababababab") - ); - test_exhausted!( - pfx_exhausted6, - prefixes, - "(?:(?:ab){100})*cd", - C("ababababab"), - M("cd") - ); - test_exhausted!( - pfx_exhausted7, - prefixes, - "z(?:(?:ab){100})*cd", - C("zababababab"), - M("zcd") - ); - test_exhausted!( - pfx_exhausted8, - prefixes, - "aaaaaaaaaaaaaaaaaaaaz", - C("aaaaaaaaaaaaaaaaaaaa") - ); - - // ************************************************************************ - // Tests for suffix literal extraction. - // ************************************************************************ - - // Elementary tests. - test_lit!(sfx_one_lit1, suffixes, "a", M("a")); - test_lit!(sfx_one_lit2, suffixes, "abc", M("abc")); - test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83")); - #[cfg(feature = "unicode-case")] - test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83")); - test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); - test_lit!( - sfx_class2, - suffixes, - "(?u)[☃Ⅰ]", - M("\\xe2\\x85\\xa0"), - M("\\xe2\\x98\\x83") - ); - #[cfg(feature = "unicode-case")] - test_lit!( - sfx_class3, - suffixes, - "(?ui)[☃Ⅰ]", - M("\\xe2\\x85\\xa0"), - M("\\xe2\\x85\\xb0"), - M("\\xe2\\x98\\x83") - ); - test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a")); - test_lit!( - sfx_one_lit_casei2, - suffixes, - "(?i-u)abc", - M("ABC"), - M("ABc"), - M("AbC"), - M("Abc"), - M("aBC"), - M("aBc"), - M("abC"), - M("abc") - ); - test_lit!(sfx_group1, suffixes, "(a)", M("a")); - test_lit!(sfx_rep_zero_or_one1, suffixes, "a?"); - test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?"); - test_lit!(sfx_rep_zero_or_more1, suffixes, 
"a*"); - test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*"); - test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a")); - test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc")); - test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a")); - test_lit!(sfx_rep_range1, suffixes, "a{0}"); - test_lit!(sfx_rep_range2, suffixes, "a{0,}"); - test_lit!(sfx_rep_range3, suffixes, "a{0,1}"); - test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a")); - test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa")); - test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a")); - test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa")); - - // Test regexes with concatenations. - test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab")); - test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz")); - test_lit!( - sfx_cat3, - suffixes, - "(?i-u)[ab]z", - M("AZ"), - M("Az"), - M("BZ"), - M("Bz"), - M("aZ"), - M("az"), - M("bZ"), - M("bz") - ); - test_lit!( - sfx_cat4, - suffixes, - "[ab][yz]", - M("ay"), - M("az"), - M("by"), - M("bz") - ); - test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b")); - test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c")); - test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c")); - test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc")); - test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b")); - test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a")); - test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac")); - test_lit!(sfx_cat12, suffixes, "ab+", C("b")); - test_lit!(sfx_cat13, suffixes, "ab+c", C("bc")); - test_lit!(sfx_cat14, suffixes, "a^"); - test_lit!(sfx_cat15, suffixes, "$a", C("a")); - test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac")); - test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc")); - test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb")); - test_lit!(sfx_cat19, suffixes, "a.z", C("z")); - - // Test regexes with alternations. 
- test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b")); - test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); - test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz")); - test_lit!(sfx_alt4, suffixes, "a|b*"); - test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b")); - test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)"); - test_lit!( - sfx_alt7, - suffixes, - "(a|b)*c|(a|ab)*c", - C("ac"), - C("bc"), - M("c"), - C("ac"), - C("abc"), - M("c") - ); - test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c")); - - // Test regexes with empty assertions. - test_lit!(sfx_empty1, suffixes, "a$", M("a")); - test_lit!(sfx_empty2, suffixes, "${2}a", C("a")); - - // Make sure some curious regexes have no suffixes. - test_lit!(sfx_nothing1, suffixes, "."); - test_lit!(sfx_nothing2, suffixes, "(?s)."); - test_lit!(sfx_nothing3, suffixes, "^"); - test_lit!(sfx_nothing4, suffixes, "$"); - test_lit!(sfx_nothing6, suffixes, "(?m)$"); - test_lit!(sfx_nothing7, suffixes, r"\b"); - test_lit!(sfx_nothing8, suffixes, r"\B"); - - // Test a few regexes that defeat any suffix literal detection. - test_lit!(sfx_defeated1, suffixes, "a."); - test_lit!(sfx_defeated2, suffixes, "(?s)a."); - test_lit!(sfx_defeated3, suffixes, "a*b*c*"); - test_lit!(sfx_defeated4, suffixes, "a|."); - test_lit!(sfx_defeated5, suffixes, ".|a"); - test_lit!(sfx_defeated6, suffixes, "a|^"); - test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c))."); - test_lit!(sfx_defeated8, suffixes, "a^"); - test_lit!(sfx_defeated9, suffixes, "(?m)a$"); - test_lit!(sfx_defeated10, suffixes, r"a\b"); - test_lit!(sfx_defeated11, suffixes, r"a\B"); - test_lit!(sfx_defeated12, suffixes, "a^*"); - test_lit!(sfx_defeated13, suffixes, "a^+"); - - // These test use a much lower limit than the default so that we can - // write test cases of reasonable size. 
- test_exhausted!(sfx_exhausted1, suffixes, "[a-z]"); - test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*"); - test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z")); - test_exhausted!( - sfx_exhausted4, - suffixes, - "(?i-u)foobar", - C("AR"), - C("Ar"), - C("aR"), - C("ar") - ); - test_exhausted!( - sfx_exhausted5, - suffixes, - "(?:ab){100}", - C("abababababababababab") - ); - test_exhausted!( - sfx_exhausted6, - suffixes, - "cd(?:(?:ab){100})*", - C("ababababab"), - M("cd") - ); - test_exhausted!( - sfx_exhausted7, - suffixes, - "cd(?:(?:ab){100})*z", - C("abababababz"), - M("cdz") - ); - test_exhausted!( - sfx_exhausted8, - suffixes, - "zaaaaaaaaaaaaaaaaaaaa", - C("aaaaaaaaaaaaaaaaaaaa") - ); - - // ************************************************************************ - // Tests for generating unambiguous literal sets. - // ************************************************************************ - - macro_rules! test_unamb { - ($name:ident, $given:expr, $expected:expr) => { - #[test] - fn $name() { - let given: Vec<Literal> = $given - .into_iter() - .map(|ul| { - let cut = ul.is_cut(); - Literal { v: ul.v.into_bytes(), cut: cut } - }) - .collect(); - let lits = create_lits(given); - let got = lits.unambiguous_prefixes(); - assert_eq!($expected, escape_lits(got.literals())); - } - }; - } - - test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]); - test_unamb!( - unambiguous2, - vec![M("zaaaaaa"), M("aa")], - vec![C("aa"), C("z")] - ); - test_unamb!( - unambiguous3, - vec![M("Sherlock"), M("Watson")], - vec![M("Sherlock"), M("Watson")] - ); - test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]); - test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]); - test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]); - test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]); - test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]); - test_unamb!( - unambiguous9, - vec![M("ac"), 
M("bc"), M("c"), M("ac"), M("abc"), M("c")], - vec![C("a"), C("b"), C("c")] - ); - test_unamb!( - unambiguous10, - vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")], - vec![C("Mo"), C("Mu")] - ); - test_unamb!( - unambiguous11, - vec![M("zazb"), M("azb")], - vec![C("a"), C("z")] - ); - test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]); - test_unamb!( - unambiguous13, - vec![M("ABCX"), M("CDAX"), M("BCX")], - vec![C("A"), C("BCX"), C("CD")] - ); - test_unamb!( - unambiguous14, - vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")], - vec![M("DSX"), C("I"), C("MGX"), C("MV")] - ); - test_unamb!( - unambiguous15, - vec![M("IMG_"), M("MG_"), M("CIMG")], - vec![C("C"), C("I"), C("MG_")] - ); - - // ************************************************************************ - // Tests for suffix trimming. - // ************************************************************************ - macro_rules! test_trim { - ($name:ident, $trim:expr, $given:expr, $expected:expr) => { - #[test] - fn $name() { - let given: Vec<Literal> = $given - .into_iter() - .map(|ul| { - let cut = ul.is_cut(); - Literal { v: ul.v.into_bytes(), cut: cut } - }) - .collect(); - let lits = create_lits(given); - let got = lits.trim_suffix($trim).unwrap(); - assert_eq!($expected, escape_lits(got.literals())); - } - }; - } - - test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]); - test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]); - test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]); - test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]); - - // ************************************************************************ - // Tests for longest common prefix. - // ************************************************************************ - - macro_rules! 
test_lcp { - ($name:ident, $given:expr, $expected:expr) => { - #[test] - fn $name() { - let given: Vec<Literal> = $given - .into_iter() - .map(|s: &str| Literal { - v: s.to_owned().into_bytes(), - cut: false, - }) - .collect(); - let lits = create_lits(given); - let got = lits.longest_common_prefix(); - assert_eq!($expected, escape_bytes(got)); - } - }; - } - - test_lcp!(lcp1, vec!["a"], "a"); - test_lcp!(lcp2, vec![], ""); - test_lcp!(lcp3, vec!["a", "b"], ""); - test_lcp!(lcp4, vec!["ab", "ab"], "ab"); - test_lcp!(lcp5, vec!["ab", "a"], "a"); - test_lcp!(lcp6, vec!["a", "ab"], "a"); - test_lcp!(lcp7, vec!["ab", "b"], ""); - test_lcp!(lcp8, vec!["b", "ab"], ""); - test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba"); - test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], ""); - test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], ""); - test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f"); - - // ************************************************************************ - // Tests for longest common suffix. - // ************************************************************************ - - macro_rules! 
test_lcs { - ($name:ident, $given:expr, $expected:expr) => { - #[test] - fn $name() { - let given: Vec<Literal> = $given - .into_iter() - .map(|s: &str| Literal { - v: s.to_owned().into_bytes(), - cut: false, - }) - .collect(); - let lits = create_lits(given); - let got = lits.longest_common_suffix(); - assert_eq!($expected, escape_bytes(got)); - } - }; - } - - test_lcs!(lcs1, vec!["a"], "a"); - test_lcs!(lcs2, vec![], ""); - test_lcs!(lcs3, vec!["a", "b"], ""); - test_lcs!(lcs4, vec!["ab", "ab"], "ab"); - test_lcs!(lcs5, vec!["ab", "a"], ""); - test_lcs!(lcs6, vec!["a", "ab"], ""); - test_lcs!(lcs7, vec!["ab", "b"], "b"); - test_lcs!(lcs8, vec!["b", "ab"], "b"); - test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo"); - test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], ""); - test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], ""); - test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b"); -} diff --git a/vendor/regex-syntax/src/hir/mod.rs b/vendor/regex-syntax/src/hir/mod.rs index 1096e9f05..e5ea3701b 100644 --- a/vendor/regex-syntax/src/hir/mod.rs +++ b/vendor/regex-syntax/src/hir/mod.rs @@ -1,19 +1,42 @@ /*! -Defines a high-level intermediate representation for regular expressions. +Defines a high-level intermediate (HIR) representation for regular expressions. + +The HIR is represented by the [`Hir`] type, and it principally constructed via +[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users +may use the smart constructors defined on `Hir` to build their own by hand. The +smart constructors simultaneously simplify and "optimize" the HIR, and are also +the same routines used by translation. + +Most regex engines only have an HIR like this, and usually construct it +directly from the concrete syntax. This crate however first parses the +concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`, +as mentioned above. 
It's done this way to facilitate better error reporting, +and to have a structured representation of a regex that faithfully represents +its concrete syntax. Namely, while an `Hir` value can be converted back to an +equivalent regex pattern string, it is unlikely to look like the original due +to its simplified structure. */ -use std::char; -use std::cmp; -use std::error; -use std::fmt; -use std::result; -use std::u8; -use crate::ast::Span; -use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter}; -use crate::unicode; +use core::{char, cmp}; -pub use crate::hir::visitor::{visit, Visitor}; -pub use crate::unicode::CaseFoldError; +use alloc::{ + boxed::Box, + format, + string::{String, ToString}, + vec, + vec::Vec, +}; + +use crate::{ + ast::Span, + hir::interval::{Interval, IntervalSet, IntervalSetIter}, + unicode, +}; + +pub use crate::{ + hir::visitor::{visit, Visitor}, + unicode::CaseFoldError, +}; mod interval; pub mod literal; @@ -53,13 +76,17 @@ impl Error { } /// The type of an error that occurred while building an `Hir`. +/// +/// This error type is marked as `non_exhaustive`. This means that adding a +/// new variant is not considered a breaking change. +#[non_exhaustive] #[derive(Clone, Debug, Eq, PartialEq)] pub enum ErrorKind { /// This error occurs when a Unicode feature is used when Unicode /// support is disabled. For example `(?-u:\pL)` would trigger this error. UnicodeNotAllowed, /// This error occurs when translating a pattern that could match a byte - /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled. + /// sequence that isn't UTF-8 and `utf8` was enabled. InvalidUtf8, /// This occurs when an unrecognized Unicode property name could not /// be found. @@ -75,27 +102,22 @@ pub enum ErrorKind { /// available, and the regular expression required Unicode aware case /// insensitivity. UnicodeCaseUnavailable, - /// This occurs when the translator attempts to construct a character class - /// that is empty. 
- /// - /// Note that this restriction in the translator may be removed in the - /// future. - EmptyClassNotAllowed, - /// Hints that destructuring should not be exhaustive. - /// - /// This enum may grow additional variants, so this makes sure clients - /// don't count on exhaustive matching. (Otherwise, adding a new variant - /// could break existing code.) - #[doc(hidden)] - __Nonexhaustive, } -impl ErrorKind { - // TODO: Remove this method entirely on the next breaking semver release. - #[allow(deprecated)] - fn description(&self) -> &str { +#[cfg(feature = "std")] +impl std::error::Error for Error {} + +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + crate::error::Formatter::from(self).fmt(f) + } +} + +impl core::fmt::Display for ErrorKind { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { use self::ErrorKind::*; - match *self { + + let msg = match *self { UnicodeNotAllowed => "Unicode not allowed here", InvalidUtf8 => "pattern can match invalid UTF-8", UnicodePropertyNotFound => "Unicode property not found", @@ -108,112 +130,82 @@ impl ErrorKind { "Unicode-aware case insensitivity matching is not available \ (make sure the unicode-case feature is enabled)" } - EmptyClassNotAllowed => "empty character classes are not allowed", - __Nonexhaustive => unreachable!(), - } - } -} - -impl error::Error for Error { - // TODO: Remove this method entirely on the next breaking semver release. - #[allow(deprecated)] - fn description(&self) -> &str { - self.kind.description() - } -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - crate::error::Formatter::from(self).fmt(f) - } -} - -impl fmt::Display for ErrorKind { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - // TODO: Remove this on the next breaking semver release. 
- #[allow(deprecated)] - f.write_str(self.description()) + }; + f.write_str(msg) } } /// A high-level intermediate representation (HIR) for a regular expression. /// -/// The HIR of a regular expression represents an intermediate step between its -/// abstract syntax (a structured description of the concrete syntax) and -/// compiled byte codes. The purpose of HIR is to make regular expressions +/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`]. +/// An `HirKind` indicates what kind of regular expression it is (a literal, +/// a repetition, a look-around assertion, etc.), where as a `Properties` +/// describes various facts about the regular expression. For example, whether +/// it matches UTF-8 or if it matches the empty string. +/// +/// The HIR of a regular expression represents an intermediate step between +/// its abstract syntax (a structured description of the concrete syntax) and +/// an actual regex matcher. The purpose of HIR is to make regular expressions /// easier to analyze. In particular, the AST is much more complex than the /// HIR. For example, while an AST supports arbitrarily nested character /// classes, the HIR will flatten all nested classes into a single set. The HIR /// will also "compile away" every flag present in the concrete syntax. For /// example, users of HIR expressions never need to worry about case folding; -/// it is handled automatically by the translator (e.g., by translating `(?i)A` -/// to `[aA]`). -/// -/// If the HIR was produced by a translator that disallows invalid UTF-8, then -/// the HIR is guaranteed to match UTF-8 exclusively. -/// -/// This type defines its own destructor that uses constant stack space and -/// heap space proportional to the size of the HIR. +/// it is handled automatically by the translator (e.g., by translating +/// `(?i:A)` to `[aA]`). /// /// The specific type of an HIR expression can be accessed via its `kind` /// or `into_kind` methods. 
This extra level of indirection exists for two /// reasons: /// -/// 1. Construction of an HIR expression *must* use the constructor methods -/// on this `Hir` type instead of building the `HirKind` values directly. -/// This permits construction to enforce invariants like "concatenations -/// always consist of two or more sub-expressions." +/// 1. Construction of an HIR expression *must* use the constructor methods on +/// this `Hir` type instead of building the `HirKind` values directly. This +/// permits construction to enforce invariants like "concatenations always +/// consist of two or more sub-expressions." /// 2. Every HIR expression contains attributes that are defined inductively, -/// and can be computed cheaply during the construction process. For -/// example, one such attribute is whether the expression must match at the -/// beginning of the text. +/// and can be computed cheaply during the construction process. For example, +/// one such attribute is whether the expression must match at the beginning of +/// the haystack. +/// +/// In particular, if you have an `HirKind` value, then there is intentionally +/// no way to build an `Hir` value from it. You instead need to do case +/// analysis on the `HirKind` value and build the `Hir` value using its smart +/// constructors. +/// +/// # UTF-8 +/// +/// If the HIR was produced by a translator with +/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled, +/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty +/// matches. +/// +/// For empty matches, those can occur at any position. It is the +/// repsonsibility of the regex engine to determine whether empty matches are +/// permitted between the code units of a single codepoint. +/// +/// # Stack space +/// +/// This type defines its own destructor that uses constant stack space and +/// heap space proportional to the size of the HIR. 
/// /// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular /// expression pattern string, and uses constant stack space and heap space -/// proportional to the size of the `Hir`. -#[derive(Clone, Debug, Eq, PartialEq)] +/// proportional to the size of the `Hir`. The regex it prints is guaranteed to +/// be _semantically_ equivalent to the original concrete syntax, but it may +/// look very different. (And potentially not practically readable by a human.) +/// +/// An `Hir`'s `fmt::Debug` implementation currently does not use constant +/// stack space. The implementation will also suppress some details (such as +/// the `Properties` inlined into every `Hir` value to make it less noisy). +#[derive(Clone, Eq, PartialEq)] pub struct Hir { /// The underlying HIR kind. kind: HirKind, /// Analysis info about this HIR, computed during construction. - info: HirInfo, -} - -/// The kind of an arbitrary `Hir` expression. -#[derive(Clone, Debug, Eq, PartialEq)] -pub enum HirKind { - /// The empty regular expression, which matches everything, including the - /// empty string. - Empty, - /// A single literal character that matches exactly this character. - Literal(Literal), - /// A single character class that matches any of the characters in the - /// class. A class can either consist of Unicode scalar values as - /// characters, or it can use bytes. - Class(Class), - /// An anchor assertion. An anchor assertion match always has zero length. - Anchor(Anchor), - /// A word boundary assertion, which may or may not be Unicode aware. A - /// word boundary assertion match always has zero length. - WordBoundary(WordBoundary), - /// A repetition operation applied to a child expression. - Repetition(Repetition), - /// A possibly capturing group, which contains a child expression. - Group(Group), - /// A concatenation of expressions. A concatenation always has at least two - /// child expressions. 
- /// - /// A concatenation matches only if each of its child expression matches - /// one after the other. - Concat(Vec<Hir>), - /// An alternation of expressions. An alternation always has at least two - /// child expressions. - /// - /// An alternation matches only if at least one of its child expression - /// matches. If multiple expressions match, then the leftmost is preferred. - Alternation(Vec<Hir>), + props: Properties, } +/// Methods for accessing the underlying `HirKind` and `Properties`. impl Hir { /// Returns a reference to the underlying HIR kind. pub fn kind(&self) -> &HirKind { @@ -223,543 +215,560 @@ impl Hir { /// Consumes ownership of this HIR expression and returns its underlying /// `HirKind`. pub fn into_kind(mut self) -> HirKind { - use std::mem; - mem::replace(&mut self.kind, HirKind::Empty) + core::mem::replace(&mut self.kind, HirKind::Empty) + } + + /// Returns the properties computed for this `Hir`. + pub fn properties(&self) -> &Properties { + &self.props + } + + /// Splits this HIR into its constituent parts. + /// + /// This is useful because `let Hir { kind, props } = hir;` does not work + /// because of `Hir`'s custom `Drop` implementation. + fn into_parts(mut self) -> (HirKind, Properties) { + ( + core::mem::replace(&mut self.kind, HirKind::Empty), + core::mem::replace(&mut self.props, Properties::empty()), + ) } +} +/// Smart constructors for HIR values. +/// +/// These constructors are called "smart" because they do inductive work or +/// simplifications. For example, calling `Hir::repetition` with a repetition +/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind +/// since it is equivalent to an empty regex. Another example is calling +/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll +/// just get back the original `expr` since it's precisely equivalent. 
+/// +/// Smart constructors enable maintaining invariants about the HIR data type +/// while also simultaneously keeping the representation as simple as possible. +impl Hir { /// Returns an empty HIR expression. /// /// An empty HIR expression always matches, including the empty string. + #[inline] pub fn empty() -> Hir { - let mut info = HirInfo::new(); - info.set_always_utf8(true); - info.set_all_assertions(true); - info.set_anchored_start(false); - info.set_anchored_end(false); - info.set_line_anchored_start(false); - info.set_line_anchored_end(false); - info.set_any_anchored_start(false); - info.set_any_anchored_end(false); - info.set_match_empty(true); - info.set_literal(false); - info.set_alternation_literal(false); - Hir { kind: HirKind::Empty, info } + let props = Properties::empty(); + Hir { kind: HirKind::Empty, props } + } + + /// Returns an HIR expression that can never match anything. That is, + /// the size of the set of strings in the language described by the HIR + /// returned is `0`. + /// + /// This is distinct from [`Hir::empty`] in that the empty string matches + /// the HIR returned by `Hir::empty`. That is, the set of strings in the + /// language described by `Hir::empty` is non-empty. + /// + /// Note that currently, the HIR returned uses an empty character class to + /// indicate that nothing can match. An equivalent expression that cannot + /// match is an empty alternation, but all such "fail" expressions are + /// normalized (via smart constructors) to empty character classes. This is + /// because empty character classes can be spelled in the concrete syntax + /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but + /// empty alternations cannot.
+ #[inline] + pub fn fail() -> Hir { + let class = Class::Bytes(ClassBytes::empty()); + let props = Properties::class(&class); + // We can't just call Hir::class here because it defers to Hir::fail + // in order to canonicalize the Hir value used to represent "cannot + // match." + Hir { kind: HirKind::Class(class), props } } /// Creates a literal HIR expression. /// - /// If the given literal has a `Byte` variant with an ASCII byte, then this - /// method panics. This enforces the invariant that `Byte` variants are - /// only used to express matching of invalid UTF-8. - pub fn literal(lit: Literal) -> Hir { - if let Literal::Byte(b) = lit { - assert!(b > 0x7F); + /// This accepts anything that can be converted into a `Box<[u8]>`. + /// + /// Note that there is no mechanism for storing a `char` or a `Box<str>` + /// in an HIR. Everything is "just bytes." Whether a `Literal` (or + /// any HIR node) matches valid UTF-8 exclusively can be queried via + /// [`Properties::is_utf8`]. + /// + /// # Example + /// + /// This example shows that concatenations of `Literal` HIR values will + /// automatically get flattened and combined together. So for example, even + /// if you concat multiple `Literal` values that are themselves not valid + /// UTF-8, they might add up to valid UTF-8. This also demonstrates just + /// how "smart" Hir's smart constructors are. + /// + /// ``` + /// use regex_syntax::hir::{Hir, HirKind, Literal}; + /// + /// let literals = vec![ + /// Hir::literal([0xE2]), + /// Hir::literal([0x98]), + /// Hir::literal([0x83]), + /// ]; + /// // Each literal, on its own, is invalid UTF-8. + /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8())); + /// + /// let concat = Hir::concat(literals); + /// // But the concatenation is valid UTF-8! + /// assert!(concat.properties().is_utf8()); + /// + /// // And also notice that the literals have been concatenated into a + /// // single `Literal`, to the point where there is no explicit `Concat`! 
+ /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); + /// assert_eq!(&expected, concat.kind()); + /// ``` + #[inline] + pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir { + let bytes = lit.into(); + if bytes.is_empty() { + return Hir::empty(); } - let mut info = HirInfo::new(); - info.set_always_utf8(lit.is_unicode()); - info.set_all_assertions(false); - info.set_anchored_start(false); - info.set_anchored_end(false); - info.set_line_anchored_start(false); - info.set_line_anchored_end(false); - info.set_any_anchored_start(false); - info.set_any_anchored_end(false); - info.set_match_empty(false); - info.set_literal(true); - info.set_alternation_literal(true); - Hir { kind: HirKind::Literal(lit), info } - } - - /// Creates a class HIR expression. + let lit = Literal(bytes); + let props = Properties::literal(&lit); + Hir { kind: HirKind::Literal(lit), props } + } + + /// Creates a class HIR expression. The class may either be defined over + /// ranges of Unicode codepoints or ranges of raw byte values. + /// + /// Note that an empty class is permitted. An empty class is equivalent to + /// `Hir::fail()`. + #[inline] pub fn class(class: Class) -> Hir { - let mut info = HirInfo::new(); - info.set_always_utf8(class.is_always_utf8()); - info.set_all_assertions(false); - info.set_anchored_start(false); - info.set_anchored_end(false); - info.set_line_anchored_start(false); - info.set_line_anchored_end(false); - info.set_any_anchored_start(false); - info.set_any_anchored_end(false); - info.set_match_empty(false); - info.set_literal(false); - info.set_alternation_literal(false); - Hir { kind: HirKind::Class(class), info } - } - - /// Creates an anchor assertion HIR expression. 
- pub fn anchor(anchor: Anchor) -> Hir { - let mut info = HirInfo::new(); - info.set_always_utf8(true); - info.set_all_assertions(true); - info.set_anchored_start(false); - info.set_anchored_end(false); - info.set_line_anchored_start(false); - info.set_line_anchored_end(false); - info.set_any_anchored_start(false); - info.set_any_anchored_end(false); - info.set_match_empty(true); - info.set_literal(false); - info.set_alternation_literal(false); - if let Anchor::StartText = anchor { - info.set_anchored_start(true); - info.set_line_anchored_start(true); - info.set_any_anchored_start(true); - } - if let Anchor::EndText = anchor { - info.set_anchored_end(true); - info.set_line_anchored_end(true); - info.set_any_anchored_end(true); + if class.is_empty() { + return Hir::fail(); + } else if let Some(bytes) = class.literal() { + return Hir::literal(bytes); } - if let Anchor::StartLine = anchor { - info.set_line_anchored_start(true); - } - if let Anchor::EndLine = anchor { - info.set_line_anchored_end(true); - } - Hir { kind: HirKind::Anchor(anchor), info } - } - - /// Creates a word boundary assertion HIR expression. - pub fn word_boundary(word_boundary: WordBoundary) -> Hir { - let mut info = HirInfo::new(); - info.set_always_utf8(true); - info.set_all_assertions(true); - info.set_anchored_start(false); - info.set_anchored_end(false); - info.set_line_anchored_start(false); - info.set_line_anchored_end(false); - info.set_any_anchored_start(false); - info.set_any_anchored_end(false); - info.set_literal(false); - info.set_alternation_literal(false); - // A negated word boundary matches '', so that's fine. But \b does not - // match \b, so why do we say it can match the empty string? Well, - // because, if you search for \b against 'a', it will report [0, 0) and - // [1, 1) as matches, and both of those matches correspond to the empty - // string. Thus, only *certain* empty strings match \b, which similarly - // applies to \B. 
- info.set_match_empty(true); - // Negated ASCII word boundaries can match invalid UTF-8. - if let WordBoundary::AsciiNegate = word_boundary { - info.set_always_utf8(false); - } - Hir { kind: HirKind::WordBoundary(word_boundary), info } + let props = Properties::class(&class); + Hir { kind: HirKind::Class(class), props } + } + + /// Creates a look-around assertion HIR expression. + #[inline] + pub fn look(look: Look) -> Hir { + let props = Properties::look(look); + Hir { kind: HirKind::Look(look), props } } /// Creates a repetition HIR expression. + #[inline] pub fn repetition(rep: Repetition) -> Hir { - let mut info = HirInfo::new(); - info.set_always_utf8(rep.hir.is_always_utf8()); - info.set_all_assertions(rep.hir.is_all_assertions()); - // If this operator can match the empty string, then it can never - // be anchored. - info.set_anchored_start( - !rep.is_match_empty() && rep.hir.is_anchored_start(), - ); - info.set_anchored_end( - !rep.is_match_empty() && rep.hir.is_anchored_end(), - ); - info.set_line_anchored_start( - !rep.is_match_empty() && rep.hir.is_anchored_start(), - ); - info.set_line_anchored_end( - !rep.is_match_empty() && rep.hir.is_anchored_end(), - ); - info.set_any_anchored_start(rep.hir.is_any_anchored_start()); - info.set_any_anchored_end(rep.hir.is_any_anchored_end()); - info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty()); - info.set_literal(false); - info.set_alternation_literal(false); - Hir { kind: HirKind::Repetition(rep), info } - } - - /// Creates a group HIR expression. 
- pub fn group(group: Group) -> Hir { - let mut info = HirInfo::new(); - info.set_always_utf8(group.hir.is_always_utf8()); - info.set_all_assertions(group.hir.is_all_assertions()); - info.set_anchored_start(group.hir.is_anchored_start()); - info.set_anchored_end(group.hir.is_anchored_end()); - info.set_line_anchored_start(group.hir.is_line_anchored_start()); - info.set_line_anchored_end(group.hir.is_line_anchored_end()); - info.set_any_anchored_start(group.hir.is_any_anchored_start()); - info.set_any_anchored_end(group.hir.is_any_anchored_end()); - info.set_match_empty(group.hir.is_match_empty()); - info.set_literal(false); - info.set_alternation_literal(false); - Hir { kind: HirKind::Group(group), info } + // The regex 'a{0}' is always equivalent to the empty regex. This is + // true even when 'a' is an expression that never matches anything + // (like '\P{any}'). + // + // Additionally, the regex 'a{1}' is always equivalent to 'a'. + if rep.min == 0 && rep.max == Some(0) { + return Hir::empty(); + } else if rep.min == 1 && rep.max == Some(1) { + return *rep.sub; + } + let props = Properties::repetition(&rep); + Hir { kind: HirKind::Repetition(rep), props } + } + + /// Creates a capture HIR expression. + /// + /// Note that there is no explicit HIR value for a non-capturing group. + /// Since a non-capturing group only exists to override precedence in the + /// concrete syntax and since an HIR already does its own grouping based on + /// what is parsed, there is no need to explicitly represent non-capturing + /// groups in the HIR. + #[inline] + pub fn capture(capture: Capture) -> Hir { + let props = Properties::capture(&capture); + Hir { kind: HirKind::Capture(capture), props } } /// Returns the concatenation of the given expressions. /// - /// This flattens the concatenation as appropriate. 
- pub fn concat(mut exprs: Vec<Hir>) -> Hir { - match exprs.len() { - 0 => Hir::empty(), - 1 => exprs.pop().unwrap(), - _ => { - let mut info = HirInfo::new(); - info.set_always_utf8(true); - info.set_all_assertions(true); - info.set_any_anchored_start(false); - info.set_any_anchored_end(false); - info.set_match_empty(true); - info.set_literal(true); - info.set_alternation_literal(true); - - // Some attributes require analyzing all sub-expressions. - for e in &exprs { - let x = info.is_always_utf8() && e.is_always_utf8(); - info.set_always_utf8(x); - - let x = info.is_all_assertions() && e.is_all_assertions(); - info.set_all_assertions(x); - - let x = info.is_any_anchored_start() - || e.is_any_anchored_start(); - info.set_any_anchored_start(x); - - let x = - info.is_any_anchored_end() || e.is_any_anchored_end(); - info.set_any_anchored_end(x); - - let x = info.is_match_empty() && e.is_match_empty(); - info.set_match_empty(x); - - let x = info.is_literal() && e.is_literal(); - info.set_literal(x); - - let x = info.is_alternation_literal() - && e.is_alternation_literal(); - info.set_alternation_literal(x); + /// This attempts to flatten and simplify the concatenation as appropriate. + /// + /// # Example + /// + /// This shows a simple example of basic flattening of both concatenations + /// and literals. + /// + /// ``` + /// use regex_syntax::hir::Hir; + /// + /// let hir = Hir::concat(vec![ + /// Hir::concat(vec![ + /// Hir::literal([b'a']), + /// Hir::literal([b'b']), + /// Hir::literal([b'c']), + /// ]), + /// Hir::concat(vec![ + /// Hir::literal([b'x']), + /// Hir::literal([b'y']), + /// Hir::literal([b'z']), + /// ]), + /// ]); + /// let expected = Hir::literal("abcxyz".as_bytes()); + /// assert_eq!(expected, hir); + /// ``` + pub fn concat(subs: Vec<Hir>) -> Hir { + // We rebuild the concatenation by simplifying it. Would be nice to do + // it in place, but that seems a little tricky? 
+ let mut new = vec![]; + // This gobbles up any adjacent literals in a concatenation and smushes + // them together. Basically, when we see a literal, we add its bytes + // to 'prior_lit', and whenever we see anything else, we first take + // any bytes in 'prior_lit' and add it to the 'new' concatenation. + let mut prior_lit: Option<Vec<u8>> = None; + for sub in subs { + let (kind, props) = sub.into_parts(); + match kind { + HirKind::Literal(Literal(bytes)) => { + if let Some(ref mut prior_bytes) = prior_lit { + prior_bytes.extend_from_slice(&bytes); + } else { + prior_lit = Some(bytes.to_vec()); + } + } + // We also flatten concats that are direct children of another + // concat. We only need to do this one level deep since + // Hir::concat is the only way to build concatenations, and so + // flattening happens inductively. + HirKind::Concat(subs2) => { + for sub2 in subs2 { + let (kind2, props2) = sub2.into_parts(); + match kind2 { + HirKind::Literal(Literal(bytes)) => { + if let Some(ref mut prior_bytes) = prior_lit { + prior_bytes.extend_from_slice(&bytes); + } else { + prior_lit = Some(bytes.to_vec()); + } + } + kind2 => { + if let Some(prior_bytes) = prior_lit.take() { + new.push(Hir::literal(prior_bytes)); + } + new.push(Hir { kind: kind2, props: props2 }); + } + } + } + } + // We can just skip empty HIRs. + HirKind::Empty => {} + kind => { + if let Some(prior_bytes) = prior_lit.take() { + new.push(Hir::literal(prior_bytes)); + } + new.push(Hir { kind, props }); } - // Anchored attributes require something slightly more - // sophisticated. Normally, WLOG, to determine whether an - // expression is anchored to the start, we'd only need to check - // the first expression of a concatenation. However, - // expressions like `$\b^` are still anchored to the start, - // but the first expression in the concatenation *isn't* - // anchored to the start. 
So the "first" expression to look at - // is actually one that is either not an assertion or is - // specifically the StartText assertion. - info.set_anchored_start( - exprs - .iter() - .take_while(|e| { - e.is_anchored_start() || e.is_all_assertions() - }) - .any(|e| e.is_anchored_start()), - ); - // Similarly for the end anchor, but in reverse. - info.set_anchored_end( - exprs - .iter() - .rev() - .take_while(|e| { - e.is_anchored_end() || e.is_all_assertions() - }) - .any(|e| e.is_anchored_end()), - ); - // Repeat the process for line anchors. - info.set_line_anchored_start( - exprs - .iter() - .take_while(|e| { - e.is_line_anchored_start() || e.is_all_assertions() - }) - .any(|e| e.is_line_anchored_start()), - ); - info.set_line_anchored_end( - exprs - .iter() - .rev() - .take_while(|e| { - e.is_line_anchored_end() || e.is_all_assertions() - }) - .any(|e| e.is_line_anchored_end()), - ); - Hir { kind: HirKind::Concat(exprs), info } } } + if let Some(prior_bytes) = prior_lit.take() { + new.push(Hir::literal(prior_bytes)); + } + if new.is_empty() { + return Hir::empty(); + } else if new.len() == 1 { + return new.pop().unwrap(); + } + let props = Properties::concat(&new); + Hir { kind: HirKind::Concat(new), props } } /// Returns the alternation of the given expressions. /// - /// This flattens the alternation as appropriate. - pub fn alternation(mut exprs: Vec<Hir>) -> Hir { - match exprs.len() { - 0 => Hir::empty(), - 1 => exprs.pop().unwrap(), - _ => { - let mut info = HirInfo::new(); - info.set_always_utf8(true); - info.set_all_assertions(true); - info.set_anchored_start(true); - info.set_anchored_end(true); - info.set_line_anchored_start(true); - info.set_line_anchored_end(true); - info.set_any_anchored_start(false); - info.set_any_anchored_end(false); - info.set_match_empty(false); - info.set_literal(false); - info.set_alternation_literal(true); - - // Some attributes require analyzing all sub-expressions. 
- for e in &exprs { - let x = info.is_always_utf8() && e.is_always_utf8(); - info.set_always_utf8(x); - - let x = info.is_all_assertions() && e.is_all_assertions(); - info.set_all_assertions(x); - - let x = info.is_anchored_start() && e.is_anchored_start(); - info.set_anchored_start(x); - - let x = info.is_anchored_end() && e.is_anchored_end(); - info.set_anchored_end(x); - - let x = info.is_line_anchored_start() - && e.is_line_anchored_start(); - info.set_line_anchored_start(x); - - let x = info.is_line_anchored_end() - && e.is_line_anchored_end(); - info.set_line_anchored_end(x); - - let x = info.is_any_anchored_start() - || e.is_any_anchored_start(); - info.set_any_anchored_start(x); - - let x = - info.is_any_anchored_end() || e.is_any_anchored_end(); - info.set_any_anchored_end(x); - - let x = info.is_match_empty() || e.is_match_empty(); - info.set_match_empty(x); - - let x = info.is_alternation_literal() && e.is_literal(); - info.set_alternation_literal(x); + /// This flattens and simplifies the alternation as appropriate. This may + /// include factoring out common prefixes or even rewriting the alternation + /// as a character class. + /// + /// Note that an empty alternation is equivalent to `Hir::fail()`. (It + /// is not possible for one to write an empty alternation, or even an + /// alternation with a single sub-expression, in the concrete syntax of a + /// regex.) + /// + /// # Example + /// + /// This is a simple example showing how an alternation might get + /// simplified. 
+ /// + /// ``` + /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange}; + /// + /// let hir = Hir::alternation(vec![ + /// Hir::literal([b'a']), + /// Hir::literal([b'b']), + /// Hir::literal([b'c']), + /// Hir::literal([b'd']), + /// Hir::literal([b'e']), + /// Hir::literal([b'f']), + /// ]); + /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('a', 'f'), + /// ]))); + /// assert_eq!(expected, hir); + /// ``` + /// + /// And another example showing how common prefixes might get factored + /// out. + /// + /// ``` + /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange}; + /// + /// let hir = Hir::alternation(vec![ + /// Hir::concat(vec![ + /// Hir::literal("abc".as_bytes()), + /// Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('A', 'Z'), + /// ]))), + /// ]), + /// Hir::concat(vec![ + /// Hir::literal("abc".as_bytes()), + /// Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('a', 'z'), + /// ]))), + /// ]), + /// ]); + /// let expected = Hir::concat(vec![ + /// Hir::literal("abc".as_bytes()), + /// Hir::alternation(vec![ + /// Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('A', 'Z'), + /// ]))), + /// Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('a', 'z'), + /// ]))), + /// ]), + /// ]); + /// assert_eq!(expected, hir); + /// ``` + /// + /// Note that these sorts of simplifications are not guaranteed. + pub fn alternation(subs: Vec<Hir>) -> Hir { + // We rebuild the alternation by simplifying it. We proceed similarly + // as the concatenation case. But in this case, there's no literal + // simplification happening. We're just flattening alternations. 
+ let mut new = vec![]; + for sub in subs { + let (kind, props) = sub.into_parts(); + match kind { + HirKind::Alternation(subs2) => { + new.extend(subs2); + } + kind => { + new.push(Hir { kind, props }); } - Hir { kind: HirKind::Alternation(exprs), info } } } - } - - /// Build an HIR expression for `.`. - /// - /// A `.` expression matches any character except for `\n`. To build an - /// expression that matches any character, including `\n`, use the `any` - /// method. - /// - /// If `bytes` is `true`, then this assumes characters are limited to a - /// single byte. - pub fn dot(bytes: bool) -> Hir { - if bytes { - let mut cls = ClassBytes::empty(); - cls.push(ClassBytesRange::new(b'\0', b'\x09')); - cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); - Hir::class(Class::Bytes(cls)) - } else { - let mut cls = ClassUnicode::empty(); - cls.push(ClassUnicodeRange::new('\0', '\x09')); - cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); - Hir::class(Class::Unicode(cls)) + if new.is_empty() { + return Hir::fail(); + } else if new.len() == 1 { + return new.pop().unwrap(); + } + // Now that it's completely flattened, look for the special case of + // 'char1|char2|...|charN' and collapse that into a class. Note that + // we look for 'char' first and then bytes. The issue here is that if + // we find both non-ASCII codepoints and non-ASCII singleton bytes, + // then it isn't actually possible to smush them into a single class. + // (Because classes are either "all codepoints" or "all bytes." You + // can't have a class that both matches non-ASCII but valid UTF-8 and + // invalid UTF-8.) So we look for all chars and then all bytes, and + // don't handle anything else.
+ if let Some(singletons) = singleton_chars(&new) { + let it = singletons + .into_iter() + .map(|ch| ClassUnicodeRange { start: ch, end: ch }); + return Hir::class(Class::Unicode(ClassUnicode::new(it))); } + if let Some(singletons) = singleton_bytes(&new) { + let it = singletons + .into_iter() + .map(|b| ClassBytesRange { start: b, end: b }); + return Hir::class(Class::Bytes(ClassBytes::new(it))); + } + // Similar to singleton chars, we can also look for alternations of + // classes. Those can be smushed into a single class. + if let Some(cls) = class_chars(&new) { + return Hir::class(cls); + } + if let Some(cls) = class_bytes(&new) { + return Hir::class(cls); + } + // Factor out a common prefix if we can, which might potentially + // simplify the expression and unlock other optimizations downstream. + // It also might generally make NFA matching and DFA construction + // faster by reducing the scope of branching in the regex. + new = match lift_common_prefix(new) { + Ok(hir) => return hir, + Err(unchanged) => unchanged, + }; + let props = Properties::alternation(&new); + Hir { kind: HirKind::Alternation(new), props } } - /// Build an HIR expression for `(?s).`. + /// Returns an HIR expression for `.`. /// - /// A `(?s).` expression matches any character, including `\n`. To build an - /// expression that matches any character except for `\n`, then use the - /// `dot` method. + /// * [`Dot::AnyChar`] maps to `(?su-R:.)`. + /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`. + /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`. + /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`. + /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`. + /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`. /// - /// If `bytes` is `true`, then this assumes characters are limited to a - /// single byte. 
- pub fn any(bytes: bool) -> Hir { - if bytes { - let mut cls = ClassBytes::empty(); - cls.push(ClassBytesRange::new(b'\0', b'\xFF')); - Hir::class(Class::Bytes(cls)) - } else { - let mut cls = ClassUnicode::empty(); - cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); - Hir::class(Class::Unicode(cls)) - } - } - - /// Return true if and only if this HIR will always match valid UTF-8. + /// # Example /// - /// When this returns false, then it is possible for this HIR expression - /// to match invalid UTF-8. - pub fn is_always_utf8(&self) -> bool { - self.info.is_always_utf8() - } - - /// Returns true if and only if this entire HIR expression is made up of - /// zero-width assertions. + /// Note that this is a convenience routine for constructing the correct + /// character class based on the value of `Dot`. There is no explicit "dot" + /// HIR value. It is just an abbreviation for a common character class. /// - /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but - /// not `^a`. - pub fn is_all_assertions(&self) -> bool { - self.info.is_all_assertions() - } - - /// Return true if and only if this HIR is required to match from the - /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`, - /// `^foo|^bar` but not `^foo|bar`. - pub fn is_anchored_start(&self) -> bool { - self.info.is_anchored_start() - } - - /// Return true if and only if this HIR is required to match at the end - /// of text. This includes expressions like `foo$`, `(foo|bar)$`, - /// `foo$|bar$` but not `foo$|bar`. - pub fn is_anchored_end(&self) -> bool { - self.info.is_anchored_end() - } - - /// Return true if and only if this HIR is required to match from the - /// beginning of text or the beginning of a line. This includes expressions - /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `^(foo|bar)`, `(?m)^foo|^bar` - /// but not `^foo|bar` or `(?m)^foo|bar`. 
+ /// ``` + /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange}; /// - /// Note that if `is_anchored_start` is `true`, then - /// `is_line_anchored_start` will also be `true`. The reverse implication - /// is not true. For example, `(?m)^foo` is line anchored, but not - /// `is_anchored_start`. - pub fn is_line_anchored_start(&self) -> bool { - self.info.is_line_anchored_start() + /// let hir = Hir::dot(Dot::AnyByte); + /// let expected = Hir::class(Class::Bytes(ClassBytes::new([ + /// ClassBytesRange::new(0x00, 0xFF), + /// ]))); + /// assert_eq!(expected, hir); + /// ``` + #[inline] + pub fn dot(dot: Dot) -> Hir { + match dot { + Dot::AnyChar => { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyByte => { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } + Dot::AnyCharExceptLF => { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\x09')); + cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyCharExceptCRLF => { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\x09')); + cls.push(ClassUnicodeRange::new('\x0B', '\x0C')); + cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyByteExceptLF => { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\x09')); + cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } + Dot::AnyByteExceptCRLF => { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\x09')); + cls.push(ClassBytesRange::new(b'\x0B', b'\x0C')); + cls.push(ClassBytesRange::new(b'\x0E', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } + } } +} - /// Return true if and only if this HIR is required to match at the - /// end of text or the 
end of a line. This includes expressions like - /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`, - /// `(?m)(foo|bar)$`, but not `foo$|bar` or `(?m)foo$|bar`. +/// The underlying kind of an arbitrary [`Hir`] expression. +/// +/// An `HirKind` is principally useful for doing case analysis on the type +/// of a regular expression. If you're looking to build new `Hir` values, +/// then you _must_ use the smart constructors defined on `Hir`, like +/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does +/// not expose any way of building an `Hir` directly from an `HirKind`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum HirKind { + /// The empty regular expression, which matches everything, including the + /// empty string. + Empty, + /// A literal string that matches exactly these bytes. + Literal(Literal), + /// A single character class that matches any of the characters in the + /// class. A class can either consist of Unicode scalar values as + /// characters, or it can use bytes. /// - /// Note that if `is_anchored_end` is `true`, then - /// `is_line_anchored_end` will also be `true`. The reverse implication - /// is not true. For example, `(?m)foo$` is line anchored, but not - /// `is_anchored_end`. - pub fn is_line_anchored_end(&self) -> bool { - self.info.is_line_anchored_end() - } - - /// Return true if and only if this HIR contains any sub-expression that - /// is required to match at the beginning of text. Specifically, this - /// returns true if the `^` symbol (when multiline mode is disabled) or the - /// `\A` escape appear anywhere in the regex. - pub fn is_any_anchored_start(&self) -> bool { - self.info.is_any_anchored_start() - } - - /// Return true if and only if this HIR contains any sub-expression that is - /// required to match at the end of text. Specifically, this returns true - /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape - /// appear anywhere in the regex.
- pub fn is_any_anchored_end(&self) -> bool { - self.info.is_any_anchored_end() - } - - /// Return true if and only if the empty string is part of the language - /// matched by this regular expression. + /// A class may be empty. In which case, it matches nothing. + Class(Class), + /// A look-around assertion. A look-around match always has zero length. + Look(Look), + /// A repetition operation applied to a sub-expression. + Repetition(Repetition), + /// A capturing group, which contains a sub-expression. + Capture(Capture), + /// A concatenation of expressions. /// - /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b` - /// and `\B`, but not `a` or `a+`. - pub fn is_match_empty(&self) -> bool { - self.info.is_match_empty() - } - - /// Return true if and only if this HIR is a simple literal. This is only - /// true when this HIR expression is either itself a `Literal` or a - /// concatenation of only `Literal`s. + /// A concatenation matches only if each of its sub-expressions match one + /// after the other. /// - /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`, - /// `` are not (even though that contain sub-expressions that are literals). - pub fn is_literal(&self) -> bool { - self.info.is_literal() - } - - /// Return true if and only if this HIR is either a simple literal or an - /// alternation of simple literals. This is only - /// true when this HIR expression is either itself a `Literal` or a - /// concatenation of only `Literal`s or an alternation of only `Literal`s. + /// Concatenations are guaranteed by `Hir`'s smart constructors to always + /// have at least two sub-expressions. + Concat(Vec<Hir>), + /// An alternation of expressions. /// - /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation - /// literals, but `f+`, `(foo)`, `foo()`, `` - /// are not (even though that contain sub-expressions that are literals). 
- pub fn is_alternation_literal(&self) -> bool { - self.info.is_alternation_literal() - } + /// An alternation matches only if at least one of its sub-expressions + /// match. If multiple sub-expressions match, then the leftmost is + /// preferred. + /// + /// Alternations are guaranteed by `Hir`'s smart constructors to always + /// have at least two sub-expressions. + Alternation(Vec<Hir>), } impl HirKind { - /// Return true if and only if this HIR is the empty regular expression. - /// - /// Note that this is not defined inductively. That is, it only tests if - /// this kind is the `Empty` variant. To get the inductive definition, - /// use the `is_match_empty` method on [`Hir`](struct.Hir.html). - pub fn is_empty(&self) -> bool { - match *self { - HirKind::Empty => true, - _ => false, - } - } + /// Returns a slice of this kind's sub-expressions, if any. + pub fn subs(&self) -> &[Hir] { + use core::slice::from_ref; - /// Returns true if and only if this kind has any (including possibly - /// empty) subexpressions. - pub fn has_subexprs(&self) -> bool { match *self { HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) - | HirKind::Anchor(_) - | HirKind::WordBoundary(_) => false, - HirKind::Group(_) - | HirKind::Repetition(_) - | HirKind::Concat(_) - | HirKind::Alternation(_) => true, + | HirKind::Look(_) => &[], + HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub), + HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub), + HirKind::Concat(ref subs) => subs, + HirKind::Alternation(ref subs) => subs, } } } +impl core::fmt::Debug for Hir { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + self.kind.fmt(f) + } +} + /// Print a display representation of this Hir. /// /// The result of this is a valid regular expression pattern string. /// /// This implementation uses constant stack space and heap space proportional /// to the size of the `Hir`. 
-impl fmt::Display for Hir { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - use crate::hir::print::Printer; - Printer::new().print(self, f) +impl core::fmt::Display for Hir { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + crate::hir::print::Printer::new().print(self, f) } } /// The high-level intermediate representation of a literal. /// -/// A literal corresponds to a single character, where a character is either -/// defined by a Unicode scalar value or an arbitrary byte. Unicode characters -/// are preferred whenever possible. In particular, a `Byte` variant is only -/// ever produced when it could match invalid UTF-8. -#[derive(Clone, Debug, Eq, PartialEq)] -pub enum Literal { - /// A single character represented by a Unicode scalar value. - Unicode(char), - /// A single character represented by an arbitrary byte. - Byte(u8), -} +/// A literal corresponds to `0` or more bytes that should be matched +/// literally. The smart constructors defined on `Hir` will automatically +/// concatenate adjacent literals into one literal, and will even automatically +/// replace empty literals with `Hir::empty()`. +/// +/// Note that despite a literal being represented by a sequence of bytes, its +/// `Debug` implementation will attempt to print it as a normal string. (That +/// is, not a sequence of decimal numbers.) +#[derive(Clone, Eq, PartialEq)] +pub struct Literal(pub Box<[u8]>); -impl Literal { - /// Returns true if and only if this literal corresponds to a Unicode - /// scalar value. 
- pub fn is_unicode(&self) -> bool { - match *self { - Literal::Unicode(_) => true, - Literal::Byte(b) if b <= 0x7F => true, - Literal::Byte(_) => false, - } +impl core::fmt::Debug for Literal { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + crate::debug::Bytes(&self.0).fmt(f) } } @@ -773,13 +782,12 @@ impl Literal { /// A character class, regardless of its character type, is represented by a /// sequence of non-overlapping non-adjacent ranges of characters. /// -/// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may -/// be produced even when it exclusively matches valid UTF-8. This is because -/// a `Bytes` variant represents an intention by the author of the regular -/// expression to disable Unicode mode, which in turn impacts the semantics of -/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not -/// match the same set of strings. -#[derive(Clone, Debug, Eq, PartialEq)] +/// Note that `Bytes` variant may be produced even when it exclusively matches +/// valid UTF-8. This is because a `Bytes` variant represents an intention by +/// the author of the regular expression to disable Unicode mode, which in turn +/// impacts the semantics of case insensitive matching. For example, `(?i)k` +/// and `(?i-u)k` will not match the same set of strings. +#[derive(Clone, Eq, PartialEq)] pub enum Class { /// A set of characters represented by Unicode scalar values. Unicode(ClassUnicode), @@ -795,6 +803,15 @@ impl Class { /// /// If this is a byte oriented character class, then this will be limited /// to the ASCII ranges `A-Z` and `a-z`. + /// + /// # Panics + /// + /// This routine panics when the case mapping data necessary for this + /// routine to complete is unavailable. This occurs when the `unicode-case` + /// feature is not enabled and the underlying class is Unicode oriented. + /// + /// Callers should prefer using `try_case_fold_simple` instead, which will + /// return an error instead of panicking. 
pub fn case_fold_simple(&mut self) { match *self { Class::Unicode(ref mut x) => x.case_fold_simple(), @@ -802,6 +819,29 @@ impl Class { } } + /// Apply Unicode simple case folding to this character class, in place. + /// The character class will be expanded to include all simple case folded + /// character variants. + /// + /// If this is a byte oriented character class, then this will be limited + /// to the ASCII ranges `A-Z` and `a-z`. + /// + /// # Error + /// + /// This routine returns an error when the case mapping data necessary + /// for this routine to complete is unavailable. This occurs when the + /// `unicode-case` feature is not enabled and the underlying class is + /// Unicode oriented. + pub fn try_case_fold_simple( + &mut self, + ) -> core::result::Result<(), CaseFoldError> { + match *self { + Class::Unicode(ref mut x) => x.try_case_fold_simple()?, + Class::Bytes(ref mut x) => x.case_fold_simple(), + } + Ok(()) + } + /// Negate this character class in place. /// /// After completion, this character class will contain precisely the @@ -824,14 +864,149 @@ impl Class { /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete /// syntax or in the parser builder. By default, Unicode mode is /// enabled. - pub fn is_always_utf8(&self) -> bool { + pub fn is_utf8(&self) -> bool { match *self { Class::Unicode(_) => true, - Class::Bytes(ref x) => x.is_all_ascii(), + Class::Bytes(ref x) => x.is_ascii(), + } + } + + /// Returns the length, in bytes, of the smallest string matched by this + /// character class. + /// + /// For non-empty byte oriented classes, this always returns `1`. For + /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or + /// `4`. For empty classes, `None` is returned. It is impossible for `0` to + /// be returned. + /// + /// # Example + /// + /// This example shows some examples of regexes and their corresponding + /// minimum length, if any. 
+ /// + /// ``` + /// use regex_syntax::{hir::Properties, parse}; + /// + /// // The empty string has a min length of 0. + /// let hir = parse(r"")?; + /// assert_eq!(Some(0), hir.properties().minimum_len()); + /// // As do other types of regexes that only match the empty string. + /// let hir = parse(r"^$\b\B")?; + /// assert_eq!(Some(0), hir.properties().minimum_len()); + /// // A regex that can match the empty string but match more is still 0. + /// let hir = parse(r"a*")?; + /// assert_eq!(Some(0), hir.properties().minimum_len()); + /// // A regex that matches nothing has no minimum defined. + /// let hir = parse(r"[a&&b]")?; + /// assert_eq!(None, hir.properties().minimum_len()); + /// // Character classes usually have a minimum length of 1. + /// let hir = parse(r"\w")?; + /// assert_eq!(Some(1), hir.properties().minimum_len()); + /// // But sometimes Unicode classes might be bigger! + /// let hir = parse(r"\p{Cyrillic}")?; + /// assert_eq!(Some(2), hir.properties().minimum_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn minimum_len(&self) -> Option<usize> { + match *self { + Class::Unicode(ref x) => x.minimum_len(), + Class::Bytes(ref x) => x.minimum_len(), + } + } + + /// Returns the length, in bytes, of the longest string matched by this + /// character class. + /// + /// For non-empty byte oriented classes, this always returns `1`. For + /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or + /// `4`. For empty classes, `None` is returned. It is impossible for `0` to + /// be returned. + /// + /// # Example + /// + /// This example shows some examples of regexes and their corresponding + /// maximum length, if any. + /// + /// ``` + /// use regex_syntax::{hir::Properties, parse}; + /// + /// // The empty string has a max length of 0. + /// let hir = parse(r"")?; + /// assert_eq!(Some(0), hir.properties().maximum_len()); + /// // As do other types of regexes that only match the empty string. 
+ /// let hir = parse(r"^$\b\B")?; + /// assert_eq!(Some(0), hir.properties().maximum_len()); + /// // A regex that matches nothing has no maximum defined. + /// let hir = parse(r"[a&&b]")?; + /// assert_eq!(None, hir.properties().maximum_len()); + /// // Bounded repeats work as you expect. + /// let hir = parse(r"x{2,10}")?; + /// assert_eq!(Some(10), hir.properties().maximum_len()); + /// // An unbounded repeat means there is no maximum. + /// let hir = parse(r"x{2,}")?; + /// assert_eq!(None, hir.properties().maximum_len()); + /// // With Unicode enabled, \w can match up to 4 bytes! + /// let hir = parse(r"\w")?; + /// assert_eq!(Some(4), hir.properties().maximum_len()); + /// // Without Unicode enabled, \w matches at most 1 byte. + /// let hir = parse(r"(?-u)\w")?; + /// assert_eq!(Some(1), hir.properties().maximum_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn maximum_len(&self) -> Option<usize> { + match *self { + Class::Unicode(ref x) => x.maximum_len(), + Class::Bytes(ref x) => x.maximum_len(), + } + } + + /// Returns true if and only if this character class is empty. That is, + /// it has no elements. + /// + /// An empty character can never match anything, including an empty string. + pub fn is_empty(&self) -> bool { + match *self { + Class::Unicode(ref x) => x.ranges().is_empty(), + Class::Bytes(ref x) => x.ranges().is_empty(), + } + } + + /// If this class consists of exactly one element (whether a codepoint or a + /// byte), then return it as a literal byte string. + /// + /// If this class is empty or contains more than one element, then `None` + /// is returned. 
+ pub fn literal(&self) -> Option<Vec<u8>> { + match *self { + Class::Unicode(ref x) => x.literal(), + Class::Bytes(ref x) => x.literal(), } } } +impl core::fmt::Debug for Class { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use crate::debug::Byte; + + let mut fmter = f.debug_set(); + match *self { + Class::Unicode(ref cls) => { + for r in cls.ranges().iter() { + fmter.entry(&(r.start..=r.end)); + } + } + Class::Bytes(ref cls) => { + for r in cls.ranges().iter() { + fmter.entry(&(Byte(r.start)..=Byte(r.end))); + } + } + } + fmter.finish() + } +} + /// A set of characters represented by Unicode scalar values. #[derive(Clone, Debug, Eq, PartialEq)] pub struct ClassUnicode { @@ -842,7 +1017,8 @@ impl ClassUnicode { /// Create a new class from a sequence of ranges. /// /// The given ranges do not need to be in any specific order, and ranges - /// may overlap. + /// may overlap. Ranges will automatically be sorted into a canonical + /// non-overlapping order. pub fn new<I>(ranges: I) -> ClassUnicode where I: IntoIterator<Item = ClassUnicodeRange>, @@ -851,6 +1027,9 @@ impl ClassUnicode { } /// Create a new class with no ranges. + /// + /// An empty class matches nothing. That is, it is equivalent to + /// [`Hir::fail`]. pub fn empty() -> ClassUnicode { ClassUnicode::new(vec![]) } @@ -903,7 +1082,7 @@ impl ClassUnicode { /// `unicode-case` feature is not enabled. pub fn try_case_fold_simple( &mut self, - ) -> result::Result<(), CaseFoldError> { + ) -> core::result::Result<(), CaseFoldError> { self.set.case_fold_simple() } @@ -946,9 +1125,59 @@ impl ClassUnicode { /// Returns true if and only if this character class will either match /// nothing or only ASCII bytes. Stated differently, this returns false /// if and only if this class contains a non-ASCII codepoint. 
- pub fn is_all_ascii(&self) -> bool { + pub fn is_ascii(&self) -> bool { self.set.intervals().last().map_or(true, |r| r.end <= '\x7F') } + + /// Returns the length, in bytes, of the smallest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn minimum_len(&self) -> Option<usize> { + let first = self.ranges().get(0)?; + // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8(). + Some(first.start.len_utf8()) + } + + /// Returns the length, in bytes, of the longest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn maximum_len(&self) -> Option<usize> { + let last = self.ranges().last()?; + // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8(). + Some(last.end.len_utf8()) + } + + /// If this class consists of exactly one codepoint, then return it as + /// a literal byte string. + /// + /// If this class is empty or contains more than one codepoint, then `None` + /// is returned. + pub fn literal(&self) -> Option<Vec<u8>> { + let rs = self.ranges(); + if rs.len() == 1 && rs[0].start == rs[0].end { + Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes()) + } else { + None + } + } + + /// If this class consists of only ASCII ranges, then return its + /// corresponding and equivalent byte class. + pub fn to_byte_class(&self) -> Option<ClassBytes> { + if !self.is_ascii() { + return None; + } + Some(ClassBytes::new(self.ranges().iter().map(|r| { + // Since we are guaranteed that our codepoint range is ASCII, the + // 'u8::try_from' calls below are guaranteed to be correct. + ClassBytesRange { + start: u8::try_from(r.start).unwrap(), + end: u8::try_from(r.end).unwrap(), + } + }))) + } } /// An iterator over all ranges in a Unicode character class. 
@@ -975,18 +1204,18 @@ pub struct ClassUnicodeRange { end: char, } -impl fmt::Debug for ClassUnicodeRange { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Debug for ClassUnicodeRange { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let start = if !self.start.is_whitespace() && !self.start.is_control() { self.start.to_string() } else { - format!("0x{:X}", self.start as u32) + format!("0x{:X}", u32::from(self.start)) }; let end = if !self.end.is_whitespace() && !self.end.is_control() { self.end.to_string() } else { - format!("0x{:X}", self.end as u32) + format!("0x{:X}", u32::from(self.end)) }; f.debug_struct("ClassUnicodeRange") .field("start", &start) @@ -1023,24 +1252,13 @@ impl Interval for ClassUnicodeRange { &self, ranges: &mut Vec<ClassUnicodeRange>, ) -> Result<(), unicode::CaseFoldError> { - if !unicode::contains_simple_case_mapping(self.start, self.end)? { + let mut folder = unicode::SimpleCaseFolder::new()?; + if !folder.overlaps(self.start, self.end) { return Ok(()); } - let start = self.start as u32; - let end = (self.end as u32).saturating_add(1); - let mut next_simple_cp = None; - for cp in (start..end).filter_map(char::from_u32) { - if next_simple_cp.map_or(false, |next| cp < next) { - continue; - } - let it = match unicode::simple_fold(cp)? { - Ok(it) => it, - Err(next) => { - next_simple_cp = next; - continue; - } - }; - for cp_folded in it { + let (start, end) = (u32::from(self.start), u32::from(self.end)); + for cp in (start..=end).filter_map(char::from_u32) { + for &cp_folded in folder.mapping(cp) { ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded)); } } @@ -1072,6 +1290,18 @@ impl ClassUnicodeRange { pub fn end(&self) -> char { self.end } + + /// Returns the number of codepoints in this range. + pub fn len(&self) -> usize { + let diff = 1 + u32::from(self.end) - u32::from(self.start); + // This is likely to panic in 16-bit targets since a usize can only fit + // 2^16. 
It's not clear what to do here, other than to return an error + // when building a Unicode class that contains a range whose length + // overflows usize. (Which, to be honest, is probably quite common on + // 16-bit targets. For example, this would imply that '.' and '\p{any}' + // would be impossible to build.) + usize::try_from(diff).expect("char class len fits in usize") + } } /// A set of characters represented by arbitrary bytes (where one byte @@ -1085,7 +1315,8 @@ impl ClassBytes { /// Create a new class from a sequence of ranges. /// /// The given ranges do not need to be in any specific order, and ranges - /// may overlap. + /// may overlap. Ranges will automatically be sorted into a canonical + /// non-overlapping order. pub fn new<I>(ranges: I) -> ClassBytes where I: IntoIterator<Item = ClassBytesRange>, @@ -1094,6 +1325,9 @@ impl ClassBytes { } /// Create a new class with no ranges. + /// + /// An empty class matches nothing. That is, it is equivalent to + /// [`Hir::fail`]. pub fn empty() -> ClassBytes { ClassBytes::new(vec![]) } @@ -1163,9 +1397,64 @@ impl ClassBytes { /// Returns true if and only if this character class will either match /// nothing or only ASCII bytes. Stated differently, this returns false /// if and only if this class contains a non-ASCII byte. - pub fn is_all_ascii(&self) -> bool { + pub fn is_ascii(&self) -> bool { self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) } + + /// Returns the length, in bytes, of the smallest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn minimum_len(&self) -> Option<usize> { + if self.ranges().is_empty() { + None + } else { + Some(1) + } + } + + /// Returns the length, in bytes, of the longest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. 
+ pub fn maximum_len(&self) -> Option<usize> { + if self.ranges().is_empty() { + None + } else { + Some(1) + } + } + + /// If this class consists of exactly one byte, then return it as + /// a literal byte string. + /// + /// If this class is empty or contains more than one byte, then `None` + /// is returned. + pub fn literal(&self) -> Option<Vec<u8>> { + let rs = self.ranges(); + if rs.len() == 1 && rs[0].start == rs[0].end { + Some(vec![rs[0].start]) + } else { + None + } + } + + /// If this class consists of only ASCII ranges, then return its + /// corresponding and equivalent Unicode class. + pub fn to_unicode_class(&self) -> Option<ClassUnicode> { + if !self.is_ascii() { + return None; + } + Some(ClassUnicode::new(self.ranges().iter().map(|r| { + // Since we are guaranteed that our byte range is ASCII, the + // 'char::from' calls below are correct and will not erroneously + // convert a raw byte value into its corresponding codepoint. + ClassUnicodeRange { + start: char::from(r.start), + end: char::from(r.end), + } + }))) + } } /// An iterator over all ranges in a byte character class. @@ -1259,108 +1548,161 @@ impl ClassBytesRange { pub fn end(&self) -> u8 { self.end } + + /// Returns the number of bytes in this range. 
+ pub fn len(&self) -> usize { + usize::from(self.end.checked_sub(self.start).unwrap()) + .checked_add(1) + .unwrap() + } } -impl fmt::Debug for ClassBytesRange { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut debug = f.debug_struct("ClassBytesRange"); - if self.start <= 0x7F { - debug.field("start", &(self.start as char)); - } else { - debug.field("start", &self.start); - } - if self.end <= 0x7F { - debug.field("end", &(self.end as char)); - } else { - debug.field("end", &self.end); - } - debug.finish() +impl core::fmt::Debug for ClassBytesRange { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("ClassBytesRange") + .field("start", &crate::debug::Byte(self.start)) + .field("end", &crate::debug::Byte(self.end)) + .finish() } } -/// The high-level intermediate representation for an anchor assertion. +/// The high-level intermediate representation for a look-around assertion. /// -/// A matching anchor assertion is always zero-length. -#[derive(Clone, Debug, Eq, PartialEq)] -pub enum Anchor { - /// Match the beginning of a line or the beginning of text. Specifically, - /// this matches at the starting position of the input, or at the position - /// immediately following a `\n` character. - StartLine, - /// Match the end of a line or the end of text. Specifically, - /// this matches at the end position of the input, or at the position - /// immediately preceding a `\n` character. - EndLine, +/// An assertion match is always zero-length. Also called an "empty match." +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Look { /// Match the beginning of text. Specifically, this matches at the starting /// position of the input. - StartText, + Start = 1 << 0, /// Match the end of text. Specifically, this matches at the ending /// position of the input. - EndText, -} - -/// The high-level intermediate representation for a word-boundary assertion. 
-/// -/// A matching word boundary assertion is always zero-length. -#[derive(Clone, Debug, Eq, PartialEq)] -pub enum WordBoundary { - /// Match a Unicode-aware word boundary. That is, this matches a position - /// where the left adjacent character and right adjacent character - /// correspond to a word and non-word or a non-word and word character. - Unicode, - /// Match a Unicode-aware negation of a word boundary. - UnicodeNegate, + End = 1 << 1, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following a `\n` character. + StartLF = 1 << 2, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\n` character. + EndLF = 1 << 3, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following either a `\r` or `\n` character, but never after + /// a `\r` when a `\n` follows. + StartCRLF = 1 << 4, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` + /// precedes it. + EndCRLF = 1 << 5, /// Match an ASCII-only word boundary. That is, this matches a position /// where the left adjacent character and right adjacent character /// correspond to a word and non-word or a non-word and word character. - Ascii, + WordAscii = 1 << 6, /// Match an ASCII-only negation of a word boundary. - AsciiNegate, + WordAsciiNegate = 1 << 7, + /// Match a Unicode-aware word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. 
+ WordUnicode = 1 << 8, + /// Match a Unicode-aware negation of a word boundary. + WordUnicodeNegate = 1 << 9, } -impl WordBoundary { - /// Returns true if and only if this word boundary assertion is negated. - pub fn is_negated(&self) -> bool { - match *self { - WordBoundary::Unicode | WordBoundary::Ascii => false, - WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true, +impl Look { + /// Flip the look-around assertion to its equivalent for reverse searches. + /// For example, `StartLF` gets translated to `EndLF`. + /// + /// Some assertions, such as `WordUnicode`, remain the same since they + /// match the same positions regardless of the direction of the search. + #[inline] + pub const fn reversed(self) -> Look { + match self { + Look::Start => Look::End, + Look::End => Look::Start, + Look::StartLF => Look::EndLF, + Look::EndLF => Look::StartLF, + Look::StartCRLF => Look::EndCRLF, + Look::EndCRLF => Look::StartCRLF, + Look::WordAscii => Look::WordAscii, + Look::WordAsciiNegate => Look::WordAsciiNegate, + Look::WordUnicode => Look::WordUnicode, + Look::WordUnicodeNegate => Look::WordUnicodeNegate, + } + } + + /// Return the underlying representation of this look-around enumeration + /// as an integer. Giving the return value to the [`Look::from_repr`] + /// constructor is guaranteed to return the same look-around variant that + /// one started with within a semver compatible release of this crate. + #[inline] + pub const fn as_repr(self) -> u16 { + // AFAIK, 'as' is the only way to zero-cost convert an int enum to an + // actual int. + self as u16 + } + + /// Given the underlying representation of a `Look` value, return the + /// corresponding `Look` value if the representation is valid. Otherwise + /// `None` is returned. 
+ #[inline] + pub const fn from_repr(repr: u16) -> Option<Look> { + match repr { + 0b00_0000_0001 => Some(Look::Start), + 0b00_0000_0010 => Some(Look::End), + 0b00_0000_0100 => Some(Look::StartLF), + 0b00_0000_1000 => Some(Look::EndLF), + 0b00_0001_0000 => Some(Look::StartCRLF), + 0b00_0010_0000 => Some(Look::EndCRLF), + 0b00_0100_0000 => Some(Look::WordAscii), + 0b00_1000_0000 => Some(Look::WordAsciiNegate), + 0b01_0000_0000 => Some(Look::WordUnicode), + 0b10_0000_0000 => Some(Look::WordUnicodeNegate), + _ => None, + } + } + + /// Returns a convenient single codepoint representation of this + /// look-around assertion. Each assertion is guaranteed to be represented + /// by a distinct character. + /// + /// This is useful for succinctly representing a look-around assertion in + /// human friendly but succinct output intended for a programmer working on + /// regex internals. + #[inline] + pub const fn as_char(self) -> char { + match self { + Look::Start => 'A', + Look::End => 'z', + Look::StartLF => '^', + Look::EndLF => '$', + Look::StartCRLF => 'r', + Look::EndCRLF => 'R', + Look::WordAscii => 'b', + Look::WordAsciiNegate => 'B', + Look::WordUnicode => '𝛃', + Look::WordUnicodeNegate => '𝚩', } } } -/// The high-level intermediate representation for a group. +/// The high-level intermediate representation for a capturing group. /// -/// This represents one of three possible group types: +/// A capturing group always has an index and a child expression. It may +/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not +/// necessary. /// -/// 1. A non-capturing group (e.g., `(?:expr)`). -/// 2. A capturing group (e.g., `(expr)`). -/// 3. A named capturing group (e.g., `(?P<name>expr)`). +/// Note that there is no explicit representation of a non-capturing group +/// in a `Hir`. Instead, non-capturing grouping is handled automatically by +/// the recursive structure of the `Hir` itself. 
#[derive(Clone, Debug, Eq, PartialEq)] -pub struct Group { - /// The kind of this group. If it is a capturing group, then the kind - /// contains the capture group index (and the name, if it is a named - /// group). - pub kind: GroupKind, +pub struct Capture { + /// The capture index of the capture. + pub index: u32, + /// The name of the capture, if it exists. + pub name: Option<Box<str>>, /// The expression inside the capturing group, which may be empty. - pub hir: Box<Hir>, -} - -/// The kind of group. -#[derive(Clone, Debug, Eq, PartialEq)] -pub enum GroupKind { - /// A normal unnamed capturing group. - /// - /// The value is the capture index of the group. - CaptureIndex(u32), - /// A named capturing group. - CaptureName { - /// The name of the group. - name: String, - /// The capture index of the group. - index: u32, - }, - /// A non-capturing group. - NonCapturing, + pub sub: Box<Hir>, } /// The high-level intermediate representation of a repetition operator. @@ -1369,8 +1711,21 @@ pub enum GroupKind { /// sub-expression. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Repetition { - /// The kind of this repetition operator. - pub kind: RepetitionKind, + /// The minimum range of the repetition. + /// + /// Note that special cases like `?`, `+` and `*` all get translated into + /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively. + /// + /// When `min` is zero, this expression can match the empty string + /// regardless of what its sub-expression is. + pub min: u32, + /// The maximum range of the repetition. + /// + /// Note that when `max` is `None`, `min` acts as a lower bound but where + /// there is no upper bound. For something like `x{5}` where the min and + /// max are equivalent, `min` will be set to `5` and `max` will be set to + /// `Some(5)`. + pub max: Option<u32>, /// Whether this repetition operator is greedy or not. A greedy operator /// will match as much as it can. A non-greedy operator will match as /// little as it can. 
@@ -1380,69 +1735,71 @@ pub struct Repetition { /// not. However, this can be inverted via the `U` "ungreedy" flag. pub greedy: bool, /// The expression being repeated. - pub hir: Box<Hir>, + pub sub: Box<Hir>, } impl Repetition { - /// Returns true if and only if this repetition operator makes it possible - /// to match the empty string. - /// - /// Note that this is not defined inductively. For example, while `a*` - /// will report `true`, `()+` will not, even though `()` matches the empty - /// string and one or more occurrences of something that matches the empty - /// string will always match the empty string. In order to get the - /// inductive definition, see the corresponding method on - /// [`Hir`](struct.Hir.html). - pub fn is_match_empty(&self) -> bool { - match self.kind { - RepetitionKind::ZeroOrOne => true, - RepetitionKind::ZeroOrMore => true, - RepetitionKind::OneOrMore => false, - RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0, - RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0, - RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0, + /// Returns a new repetition with the same `min`, `max` and `greedy` + /// values, but with its sub-expression replaced with the one given. + pub fn with(&self, sub: Hir) -> Repetition { + Repetition { + min: self.min, + max: self.max, + greedy: self.greedy, + sub: Box::new(sub), } } } -/// The kind of a repetition operator. -#[derive(Clone, Debug, Eq, PartialEq)] -pub enum RepetitionKind { - /// Matches a sub-expression zero or one times. - ZeroOrOne, - /// Matches a sub-expression zero or more times. - ZeroOrMore, - /// Matches a sub-expression one or more times. - OneOrMore, - /// Matches a sub-expression within a bounded range of times. - Range(RepetitionRange), -} - -/// The kind of a counted repetition operator. -#[derive(Clone, Debug, Eq, PartialEq)] -pub enum RepetitionRange { - /// Matches a sub-expression exactly this many times. 
- Exactly(u32), - /// Matches a sub-expression at least this many times. - AtLeast(u32), - /// Matches a sub-expression at least `m` times and at most `n` times. - Bounded(u32, u32), +/// A type describing the different flavors of `.`. +/// +/// This type is meant to be used with [`Hir::dot`], which is a convenience +/// routine for building HIR values derived from the `.` regex. +#[non_exhaustive] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Dot { + /// Matches the UTF-8 encoding of any Unicode scalar value. + /// + /// This is equivalent to `(?su:.)` and also `\p{any}`. + AnyChar, + /// Matches any byte value. + /// + /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`. + AnyByte, + /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`. + /// + /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`. + AnyCharExceptLF, + /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r` + /// and `\n`. + /// + /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`. + AnyCharExceptCRLF, + /// Matches any byte value except for `\n`. + /// + /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`. + AnyByteExceptLF, + /// Matches any byte value except for `\r` and `\n`. + /// + /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`. + AnyByteExceptCRLF, } /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack /// space but heap space proportional to the depth of the total `Hir`. 
impl Drop for Hir { fn drop(&mut self) { - use std::mem; + use core::mem; match *self.kind() { HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) - | HirKind::Anchor(_) - | HirKind::WordBoundary(_) => return, - HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return, - HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return, + | HirKind::Look(_) => return, + HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, + HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { + return + } HirKind::Concat(ref x) if x.is_empty() => return, HirKind::Alternation(ref x) if x.is_empty() => return, _ => {} @@ -1454,13 +1811,12 @@ impl Drop for Hir { HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) - | HirKind::Anchor(_) - | HirKind::WordBoundary(_) => {} - HirKind::Group(ref mut x) => { - stack.push(mem::replace(&mut x.hir, Hir::empty())); + | HirKind::Look(_) => {} + HirKind::Capture(ref mut x) => { + stack.push(mem::replace(&mut x.sub, Hir::empty())); } HirKind::Repetition(ref mut x) => { - stack.push(mem::replace(&mut x.hir, Hir::empty())); + stack.push(mem::replace(&mut x.sub, Hir::empty())); } HirKind::Concat(ref mut x) => { stack.extend(x.drain(..)); @@ -1473,52 +1829,1105 @@ impl Drop for Hir { } } -/// A type that documents various attributes of an HIR expression. +/// A type that collects various properties of an HIR value. +/// +/// Properties are always scalar values and represent meta data that is +/// computed inductively on an HIR value. Properties are defined for all +/// HIR values. +/// +/// All methods on a `Properties` value take constant time and are meant to +/// be cheap to call. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Properties(Box<PropertiesI>); + +/// The property definition. It is split out so that we can box it, and +/// there by make `Properties` use less stack size. This is kind-of important +/// because every HIR value has a `Properties` attached to it. 
/// -/// These attributes are typically defined inductively on the HIR. +/// This does have the unfortunate consequence that creating any HIR value +/// always leads to at least one alloc for properties, but this is generally +/// true anyway (for pretty much all HirKinds except for look-arounds). #[derive(Clone, Debug, Eq, PartialEq)] -struct HirInfo { - /// Represent yes/no questions by a bitfield to conserve space, since - /// this is included in every HIR expression. - /// - /// If more attributes need to be added, it is OK to increase the size of - /// this as appropriate. - bools: u16, +struct PropertiesI { + minimum_len: Option<usize>, + maximum_len: Option<usize>, + look_set: LookSet, + look_set_prefix: LookSet, + look_set_suffix: LookSet, + look_set_prefix_any: LookSet, + look_set_suffix_any: LookSet, + utf8: bool, + explicit_captures_len: usize, + static_explicit_captures_len: Option<usize>, + literal: bool, + alternation_literal: bool, } -// A simple macro for defining bitfield accessors/mutators. -macro_rules! define_bool { - ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => { - fn $is_fn_name(&self) -> bool { - self.bools & (0b1 << $bit) > 0 +impl Properties { + /// Returns the length (in bytes) of the smallest string matched by this + /// HIR. + /// + /// A return value of `0` is possible and occurs when the HIR can match an + /// empty string. + /// + /// `None` is returned when there is no minimum length. This occurs in + /// precisely the cases where the HIR matches nothing. i.e., The language + /// the regex matches is empty. An example of such a regex is `\P{any}`. + #[inline] + pub fn minimum_len(&self) -> Option<usize> { + self.0.minimum_len + } + + /// Returns the length (in bytes) of the longest string matched by this + /// HIR. + /// + /// A return value of `0` is possible and occurs when nothing longer than + /// the empty string is in the language described by this HIR. 
+ /// + /// `None` is returned when there is no longest matching string. This + /// occurs when the HIR matches nothing or when there is no upper bound on + /// the length of matching strings. Example of such regexes are `\P{any}` + /// (matches nothing) and `a+` (has no upper bound). + #[inline] + pub fn maximum_len(&self) -> Option<usize> { + self.0.maximum_len + } + + /// Returns a set of all look-around assertions that appear at least once + /// in this HIR value. + #[inline] + pub fn look_set(&self) -> LookSet { + self.0.look_set + } + + /// Returns a set of all look-around assertions that appear as a prefix for + /// this HIR value. That is, the set returned corresponds to the set of + /// assertions that must be passed before matching any bytes in a haystack. + /// + /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true + /// if and only if the HIR is fully anchored at the start. + #[inline] + pub fn look_set_prefix(&self) -> LookSet { + self.0.look_set_prefix + } + + /// Returns a set of all look-around assertions that appear as a _possible_ + /// prefix for this HIR value. That is, the set returned corresponds to the + /// set of assertions that _may_ be passed before matching any bytes in a + /// haystack. + /// + /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns + /// true if and only if it's possible for the regex to match through a + /// anchored assertion before consuming any input. + #[inline] + pub fn look_set_prefix_any(&self) -> LookSet { + self.0.look_set_prefix_any + } + + /// Returns a set of all look-around assertions that appear as a suffix for + /// this HIR value. That is, the set returned corresponds to the set of + /// assertions that must be passed in order to be considered a match after + /// all other consuming HIR expressions. + /// + /// For example, `hir.look_set_suffix().contains(Look::End)` returns true + /// if and only if the HIR is fully anchored at the end. 
+ #[inline] + pub fn look_set_suffix(&self) -> LookSet { + self.0.look_set_suffix + } + + /// Returns a set of all look-around assertions that appear as a _possible_ + /// suffix for this HIR value. That is, the set returned corresponds to the + /// set of assertions that _may_ be passed before matching any bytes in a + /// haystack. + /// + /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns + /// true if and only if it's possible for the regex to match through a + /// anchored assertion at the end of a match without consuming any input. + #[inline] + pub fn look_set_suffix_any(&self) -> LookSet { + self.0.look_set_suffix_any + } + + /// Return true if and only if the corresponding HIR will always match + /// valid UTF-8. + /// + /// When this returns false, then it is possible for this HIR expression to + /// match invalid UTF-8, including by matching between the code units of + /// a single UTF-8 encoded codepoint. + /// + /// Note that this returns true even when the corresponding HIR can match + /// the empty string. Since an empty string can technically appear between + /// UTF-8 code units, it is possible for a match to be reported that splits + /// a codepoint which could in turn be considered matching invalid UTF-8. + /// However, it is generally assumed that such empty matches are handled + /// specially by the search routine if it is absolutely required that + /// matches not split a codepoint. + /// + /// # Example + /// + /// This code example shows the UTF-8 property of a variety of patterns. + /// + /// ``` + /// use regex_syntax::{ParserBuilder, parse}; + /// + /// // Examples of 'is_utf8() == true'. 
+ /// assert!(parse(r"a")?.properties().is_utf8()); + /// assert!(parse(r"[^a]")?.properties().is_utf8()); + /// assert!(parse(r".")?.properties().is_utf8()); + /// assert!(parse(r"\W")?.properties().is_utf8()); + /// assert!(parse(r"\b")?.properties().is_utf8()); + /// assert!(parse(r"\B")?.properties().is_utf8()); + /// assert!(parse(r"(?-u)\b")?.properties().is_utf8()); + /// assert!(parse(r"(?-u)\B")?.properties().is_utf8()); + /// // Unicode mode is enabled by default, and in + /// // that mode, all \x hex escapes are treated as + /// // codepoints. So this actually matches the UTF-8 + /// // encoding of U+00FF. + /// assert!(parse(r"\xFF")?.properties().is_utf8()); + /// + /// // Now we show examples of 'is_utf8() == false'. + /// // The only way to do this is to force the parser + /// // to permit invalid UTF-8, otherwise all of these + /// // would fail to parse! + /// let parse = |pattern| { + /// ParserBuilder::new().utf8(false).build().parse(pattern) + /// }; + /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8()); + /// assert!(!parse(r"(?-u).")?.properties().is_utf8()); + /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8()); + /// // Conversely to the equivalent example above, + /// // when Unicode mode is disabled, \x hex escapes + /// // are treated as their raw byte values. + /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8()); + /// // Note that just because we disabled UTF-8 in the + /// // parser doesn't mean we still can't use Unicode. + /// // It is enabled by default, so \xFF is still + /// // equivalent to matching the UTF-8 encoding of + /// // U+00FF by default. + /// assert!(parse(r"\xFF")?.properties().is_utf8()); + /// // Even though we use raw bytes that individually + /// // are not valid UTF-8, when combined together, the + /// // overall expression *does* match valid UTF-8! 
+ /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_utf8(&self) -> bool { + self.0.utf8 + } + + /// Returns the total number of explicit capturing groups in the + /// corresponding HIR. + /// + /// Note that this does not include the implicit capturing group + /// corresponding to the entire match that is typically included by regex + /// engines. + /// + /// # Example + /// + /// This method will return `0` for `a` and `1` for `(a)`: + /// + /// ``` + /// use regex_syntax::parse; + /// + /// assert_eq!(0, parse("a")?.properties().explicit_captures_len()); + /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn explicit_captures_len(&self) -> usize { + self.0.explicit_captures_len + } + + /// Returns the total number of explicit capturing groups that appear in + /// every possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that this does not include the implicit capturing group + /// corresponding to the entire match. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. 
+ /// + /// ``` + /// use regex_syntax::parse; + /// + /// let len = |pattern| { + /// parse(pattern).map(|h| { + /// h.properties().static_explicit_captures_len() + /// }) + /// }; + /// + /// assert_eq!(Some(0), len("a")?); + /// assert_eq!(Some(1), len("(a)")?); + /// assert_eq!(Some(1), len("(a)|(b)")?); + /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(1), len("(b)+")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn static_explicit_captures_len(&self) -> Option<usize> { + self.0.static_explicit_captures_len + } + + /// Return true if and only if this HIR is a simple literal. This is + /// only true when this HIR expression is either itself a `Literal` or a + /// concatenation of only `Literal`s. + /// + /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and + /// the empty string are not (even though they contain sub-expressions that + /// are literals). + #[inline] + pub fn is_literal(&self) -> bool { + self.0.literal + } + + /// Return true if and only if this HIR is either a simple literal or an + /// alternation of simple literals. This is only + /// true when this HIR expression is either itself a `Literal` or a + /// concatenation of only `Literal`s or an alternation of only `Literal`s. + /// + /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation + /// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not + /// (even though that contain sub-expressions that are literals). + #[inline] + pub fn is_alternation_literal(&self) -> bool { + self.0.alternation_literal + } + + /// Returns the total amount of heap memory usage, in bytes, used by this + /// `Properties` value. 
+ #[inline] + pub fn memory_usage(&self) -> usize { + core::mem::size_of::<PropertiesI>() + } + + /// Returns a new set of properties that corresponds to the union of the + /// iterator of properties given. + /// + /// This is useful when one has multiple `Hir` expressions and wants + /// to combine them into a single alternation without constructing the + /// corresponding `Hir`. This routine provides a way of combining the + /// properties of each `Hir` expression into one set of properties + /// representing the union of those expressions. + /// + /// # Example: union with HIRs that never match + /// + /// This example shows that unioning properties together with one that + /// represents a regex that never matches will "poison" certain attributes, + /// like the minimum and maximum lengths. + /// + /// ``` + /// use regex_syntax::{hir::Properties, parse}; + /// + /// let hir1 = parse("ab?c?")?; + /// assert_eq!(Some(1), hir1.properties().minimum_len()); + /// assert_eq!(Some(3), hir1.properties().maximum_len()); + /// + /// let hir2 = parse(r"[a&&b]")?; + /// assert_eq!(None, hir2.properties().minimum_len()); + /// assert_eq!(None, hir2.properties().maximum_len()); + /// + /// let hir3 = parse(r"wxy?z?")?; + /// assert_eq!(Some(2), hir3.properties().minimum_len()); + /// assert_eq!(Some(4), hir3.properties().maximum_len()); + /// + /// let unioned = Properties::union([ + /// hir1.properties(), + /// hir2.properties(), + /// hir3.properties(), + /// ]); + /// assert_eq!(None, unioned.minimum_len()); + /// assert_eq!(None, unioned.maximum_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// The maximum length can also be "poisoned" by a pattern that has no + /// upper bound on the length of a match. 
The minimum length remains + /// unaffected: + /// + /// ``` + /// use regex_syntax::{hir::Properties, parse}; + /// + /// let hir1 = parse("ab?c?")?; + /// assert_eq!(Some(1), hir1.properties().minimum_len()); + /// assert_eq!(Some(3), hir1.properties().maximum_len()); + /// + /// let hir2 = parse(r"a+")?; + /// assert_eq!(Some(1), hir2.properties().minimum_len()); + /// assert_eq!(None, hir2.properties().maximum_len()); + /// + /// let hir3 = parse(r"wxy?z?")?; + /// assert_eq!(Some(2), hir3.properties().minimum_len()); + /// assert_eq!(Some(4), hir3.properties().maximum_len()); + /// + /// let unioned = Properties::union([ + /// hir1.properties(), + /// hir2.properties(), + /// hir3.properties(), + /// ]); + /// assert_eq!(Some(1), unioned.minimum_len()); + /// assert_eq!(None, unioned.maximum_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn union<I, P>(props: I) -> Properties + where + I: IntoIterator<Item = P>, + P: core::borrow::Borrow<Properties>, + { + let mut it = props.into_iter().peekable(); + // While empty alternations aren't possible, we still behave as if they + // are. When we have an empty alternate, then clearly the look-around + // prefix and suffix is empty. Otherwise, it is the intersection of all + // prefixes and suffixes (respectively) of the branches. + let fix = if it.peek().is_none() { + LookSet::empty() + } else { + LookSet::full() + }; + // And also, an empty alternate means we have 0 static capture groups, + // but we otherwise start with the number corresponding to the first + // alternate. If any subsequent alternate has a different number of + // static capture groups, then we overall have a variation and not a + // static number of groups. + let static_explicit_captures_len = + it.peek().and_then(|p| p.borrow().static_explicit_captures_len()); + // The base case is an empty alternation, which matches nothing. 
+ // Note though that empty alternations aren't possible, because the + // Hir::alternation smart constructor rewrites those as empty character + // classes. + let mut props = PropertiesI { + minimum_len: None, + maximum_len: None, + look_set: LookSet::empty(), + look_set_prefix: fix, + look_set_suffix: fix, + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), + utf8: true, + explicit_captures_len: 0, + static_explicit_captures_len, + literal: false, + alternation_literal: true, + }; + let (mut min_poisoned, mut max_poisoned) = (false, false); + // Handle properties that need to visit every child hir. + for prop in it { + let p = prop.borrow(); + props.look_set.set_union(p.look_set()); + props.look_set_prefix.set_intersect(p.look_set_prefix()); + props.look_set_suffix.set_intersect(p.look_set_suffix()); + props.look_set_prefix_any.set_union(p.look_set_prefix_any()); + props.look_set_suffix_any.set_union(p.look_set_suffix_any()); + props.utf8 = props.utf8 && p.is_utf8(); + props.explicit_captures_len = props + .explicit_captures_len + .saturating_add(p.explicit_captures_len()); + if props.static_explicit_captures_len + != p.static_explicit_captures_len() + { + props.static_explicit_captures_len = None; + } + props.alternation_literal = + props.alternation_literal && p.is_literal(); + if !min_poisoned { + if let Some(xmin) = p.minimum_len() { + if props.minimum_len.map_or(true, |pmin| xmin < pmin) { + props.minimum_len = Some(xmin); + } + } else { + props.minimum_len = None; + min_poisoned = true; + } + } + if !max_poisoned { + if let Some(xmax) = p.maximum_len() { + if props.maximum_len.map_or(true, |pmax| xmax > pmax) { + props.maximum_len = Some(xmax); + } + } else { + props.maximum_len = None; + max_poisoned = true; + } + } } + Properties(Box::new(props)) + } +} - fn $set_fn_name(&mut self, yes: bool) { - if yes { - self.bools |= 1 << $bit; +impl Properties { + /// Create a new set of HIR properties for an empty regex. 
+ fn empty() -> Properties { + let inner = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), + // It is debatable whether an empty regex always matches at valid + // UTF-8 boundaries. Strictly speaking, at a byte oriented view, + // it is clearly false. There are, for example, many empty strings + // between the bytes encoding a '☃'. + // + // However, when Unicode mode is enabled, the fundamental atom + // of matching is really a codepoint. And in that scenario, an + // empty regex is defined to only match at valid UTF-8 boundaries + // and to never split a codepoint. It just so happens that this + // enforcement is somewhat tricky to do for regexes that match + // the empty string inside regex engines themselves. It usually + // requires some layer above the regex engine to filter out such + // matches. + // + // In any case, 'true' is really the only coherent option. If it + // were false, for example, then 'a*' would also need to be false + // since it too can match the empty string. + utf8: true, + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: false, + alternation_literal: false, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a literal regex. 
+ fn literal(lit: &Literal) -> Properties { + let inner = PropertiesI { + minimum_len: Some(lit.0.len()), + maximum_len: Some(lit.0.len()), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), + utf8: core::str::from_utf8(&lit.0).is_ok(), + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: true, + alternation_literal: true, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a character class. + fn class(class: &Class) -> Properties { + let inner = PropertiesI { + minimum_len: class.minimum_len(), + maximum_len: class.maximum_len(), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), + utf8: class.is_utf8(), + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: false, + alternation_literal: false, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a look-around assertion. + fn look(look: Look) -> Properties { + let inner = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + look_set: LookSet::singleton(look), + look_set_prefix: LookSet::singleton(look), + look_set_suffix: LookSet::singleton(look), + look_set_prefix_any: LookSet::singleton(look), + look_set_suffix_any: LookSet::singleton(look), + // This requires a little explanation. Basically, we don't consider + // matching an empty string to be equivalent to matching invalid + // UTF-8, even though technically matching every empty string will + // split the UTF-8 encoding of a single codepoint when treating a + // UTF-8 encoded string as a sequence of bytes. 
Our defense here is + // that in such a case, a codepoint should logically be treated as + // the fundamental atom for matching, and thus the only valid match + // points are between codepoints and not bytes. + // + // More practically, this is true here because it's also true + // for 'Hir::empty()', otherwise something like 'a*' would be + // considered to match invalid UTF-8. That in turn makes this + // property borderline useless. + utf8: true, + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: false, + alternation_literal: false, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a repetition. + fn repetition(rep: &Repetition) -> Properties { + let p = rep.sub.properties(); + let minimum_len = p.minimum_len().map(|child_min| { + let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX); + child_min.saturating_mul(rep_min) + }); + let maximum_len = rep.max.and_then(|rep_max| { + let rep_max = usize::try_from(rep_max).ok()?; + let child_max = p.maximum_len()?; + child_max.checked_mul(rep_max) + }); + + let mut inner = PropertiesI { + minimum_len, + maximum_len, + look_set: p.look_set(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + look_set_prefix_any: p.look_set_prefix_any(), + look_set_suffix_any: p.look_set_suffix_any(), + utf8: p.is_utf8(), + explicit_captures_len: p.explicit_captures_len(), + static_explicit_captures_len: p.static_explicit_captures_len(), + literal: false, + alternation_literal: false, + }; + // If the repetition operator can match the empty string, then its + // lookset prefix and suffixes themselves remain empty since they are + // no longer required to match. + if rep.min > 0 { + inner.look_set_prefix = p.look_set_prefix(); + inner.look_set_suffix = p.look_set_suffix(); + } + // If the static captures len of the sub-expression is not known or is + // zero, then it automatically propagates to the repetition, regardless + // of the repetition. 
Otherwise, it might change, but only when the + // repetition can match 0 times. + if rep.min == 0 + && inner.static_explicit_captures_len.map_or(false, |len| len > 0) + { + // If we require a match 0 times, then our captures len is + // guaranteed to be zero. Otherwise, if we *can* match the empty + // string, then it's impossible to know how many captures will be + // in the resulting match. + if rep.max == Some(0) { + inner.static_explicit_captures_len = Some(0); } else { - self.bools &= !(1 << $bit); + inner.static_explicit_captures_len = None; } } - }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a capture. + fn capture(capture: &Capture) -> Properties { + let p = capture.sub.properties(); + Properties(Box::new(PropertiesI { + explicit_captures_len: p.explicit_captures_len().saturating_add(1), + static_explicit_captures_len: p + .static_explicit_captures_len() + .map(|len| len.saturating_add(1)), + literal: false, + alternation_literal: false, + ..*p.0.clone() + })) + } + + /// Create a new set of HIR properties for a concatenation. + fn concat(concat: &[Hir]) -> Properties { + // The base case is an empty concatenation, which matches the empty + // string. Note though that empty concatenations aren't possible, + // because the Hir::concat smart constructor rewrites those as + // Hir::empty. + let mut props = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), + utf8: true, + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: true, + alternation_literal: true, + }; + // Handle properties that need to visit every child hir. 
+ for x in concat.iter() { + let p = x.properties(); + props.look_set.set_union(p.look_set()); + props.utf8 = props.utf8 && p.is_utf8(); + props.explicit_captures_len = props + .explicit_captures_len + .saturating_add(p.explicit_captures_len()); + props.static_explicit_captures_len = p + .static_explicit_captures_len() + .and_then(|len1| { + Some((len1, props.static_explicit_captures_len?)) + }) + .and_then(|(len1, len2)| Some(len1.saturating_add(len2))); + props.literal = props.literal && p.is_literal(); + props.alternation_literal = + props.alternation_literal && p.is_alternation_literal(); + if let Some(ref mut minimum_len) = props.minimum_len { + match p.minimum_len() { + None => props.minimum_len = None, + Some(len) => *minimum_len += len, + } + } + if let Some(ref mut maximum_len) = props.maximum_len { + match p.maximum_len() { + None => props.maximum_len = None, + Some(len) => *maximum_len += len, + } + } + } + // Handle the prefix properties, which only requires visiting + // child exprs until one matches more than the empty string. + let mut it = concat.iter(); + while let Some(x) = it.next() { + props.look_set_prefix.set_union(x.properties().look_set_prefix()); + props + .look_set_prefix_any + .set_union(x.properties().look_set_prefix_any()); + if x.properties().maximum_len().map_or(true, |x| x > 0) { + break; + } + } + // Same thing for the suffix properties, but in reverse. + let mut it = concat.iter().rev(); + while let Some(x) = it.next() { + props.look_set_suffix.set_union(x.properties().look_set_suffix()); + props + .look_set_suffix_any + .set_union(x.properties().look_set_suffix_any()); + if x.properties().maximum_len().map_or(true, |x| x > 0) { + break; + } + } + Properties(Box::new(props)) + } + + /// Create a new set of HIR properties for a concatenation. 
+ fn alternation(alts: &[Hir]) -> Properties { + Properties::union(alts.iter().map(|hir| hir.properties())) + } } -impl HirInfo { - fn new() -> HirInfo { - HirInfo { bools: 0 } - } - - define_bool!(0, is_always_utf8, set_always_utf8); - define_bool!(1, is_all_assertions, set_all_assertions); - define_bool!(2, is_anchored_start, set_anchored_start); - define_bool!(3, is_anchored_end, set_anchored_end); - define_bool!(4, is_line_anchored_start, set_line_anchored_start); - define_bool!(5, is_line_anchored_end, set_line_anchored_end); - define_bool!(6, is_any_anchored_start, set_any_anchored_start); - define_bool!(7, is_any_anchored_end, set_any_anchored_end); - define_bool!(8, is_match_empty, set_match_empty); - define_bool!(9, is_literal, set_literal); - define_bool!(10, is_alternation_literal, set_alternation_literal); +/// A set of look-around assertions. +/// +/// This is useful for efficiently tracking look-around assertions. For +/// example, an [`Hir`] provides properties that return `LookSet`s. +#[derive(Clone, Copy, Default, Eq, PartialEq)] +pub struct LookSet { + /// The underlying representation this set is exposed to make it possible + /// to store it somewhere efficiently. The representation is that + /// of a bitset, where each assertion occupies bit `i` where `i = + /// Look::as_repr()`. + /// + /// Note that users of this internal representation must permit the full + /// range of `u16` values to be represented. For example, even if the + /// current implementation only makes use of the 10 least significant bits, + /// it may use more bits in a future semver compatible release. + pub bits: u16, +} + +impl LookSet { + /// Create an empty set of look-around assertions. + #[inline] + pub fn empty() -> LookSet { + LookSet { bits: 0 } + } + + /// Create a full set of look-around assertions. + /// + /// This set contains all possible look-around assertions. 
+ #[inline] + pub fn full() -> LookSet { + LookSet { bits: !0 } + } + + /// Create a look-around set containing the look-around assertion given. + /// + /// This is a convenience routine for creating an empty set and inserting + /// one look-around assertions. + #[inline] + pub fn singleton(look: Look) -> LookSet { + LookSet::empty().insert(look) + } + + /// Returns the total number of look-around assertions in this set. + #[inline] + pub fn len(self) -> usize { + // OK because max value always fits in a u8, which in turn always + // fits in a usize, regardless of target. + usize::try_from(self.bits.count_ones()).unwrap() + } + + /// Returns true if and only if this set is empty. + #[inline] + pub fn is_empty(self) -> bool { + self.len() == 0 + } + + /// Returns true if and only if the given look-around assertion is in this + /// set. + #[inline] + pub fn contains(self, look: Look) -> bool { + self.bits & look.as_repr() != 0 + } + + /// Returns true if and only if this set contains any anchor assertions. + /// This includes both "start/end of haystack" and "start/end of line." + #[inline] + pub fn contains_anchor(&self) -> bool { + self.contains_anchor_haystack() || self.contains_anchor_line() + } + + /// Returns true if and only if this set contains any "start/end of + /// haystack" anchors. This doesn't include "start/end of line" anchors. + #[inline] + pub fn contains_anchor_haystack(&self) -> bool { + self.contains(Look::Start) || self.contains(Look::End) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors. This doesn't include "start/end of haystack" anchors. This + /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors. 
+ #[inline] + pub fn contains_anchor_line(&self) -> bool { + self.contains(Look::StartLF) + || self.contains(Look::EndLF) + || self.contains(Look::StartCRLF) + || self.contains(Look::EndCRLF) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors that only treat `\n` as line terminators. This does not include + /// haystack anchors or CRLF aware line anchors. + #[inline] + pub fn contains_anchor_lf(&self) -> bool { + self.contains(Look::StartLF) || self.contains(Look::EndLF) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors that are CRLF-aware. This doesn't include "start/end of + /// haystack" or "start/end of line-feed" anchors. + #[inline] + pub fn contains_anchor_crlf(&self) -> bool { + self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF) + } + + /// Returns true if and only if this set contains any word boundary or + /// negated word boundary assertions. This include both Unicode and ASCII + /// word boundaries. + #[inline] + pub fn contains_word(self) -> bool { + self.contains_word_unicode() || self.contains_word_ascii() + } + + /// Returns true if and only if this set contains any Unicode word boundary + /// or negated Unicode word boundary assertions. + #[inline] + pub fn contains_word_unicode(self) -> bool { + self.contains(Look::WordUnicode) + || self.contains(Look::WordUnicodeNegate) + } + + /// Returns true if and only if this set contains any ASCII word boundary + /// or negated ASCII word boundary assertions. + #[inline] + pub fn contains_word_ascii(self) -> bool { + self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) + } + + /// Returns an iterator over all of the look-around assertions in this set. + #[inline] + pub fn iter(self) -> LookSetIter { + LookSetIter { set: self } + } + + /// Return a new set that is equivalent to the original, but with the given + /// assertion added to it. 
If the assertion is already in the set, then the + /// returned set is equivalent to the original. + #[inline] + pub fn insert(self, look: Look) -> LookSet { + LookSet { bits: self.bits | look.as_repr() } + } + + /// Updates this set in place with the result of inserting the given + /// assertion into this set. + #[inline] + pub fn set_insert(&mut self, look: Look) { + *self = self.insert(look); + } + + /// Return a new set that is equivalent to the original, but with the given + /// assertion removed from it. If the assertion is not in the set, then the + /// returned set is equivalent to the original. + #[inline] + pub fn remove(self, look: Look) -> LookSet { + LookSet { bits: self.bits & !look.as_repr() } + } + + /// Updates this set in place with the result of removing the given + /// assertion from this set. + #[inline] + pub fn set_remove(&mut self, look: Look) { + *self = self.remove(look); + } + + /// Returns a new set that is the result of subtracting the given set from + /// this set. + #[inline] + pub fn subtract(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits & !other.bits } + } + + /// Updates this set in place with the result of subtracting the given set + /// from this set. + #[inline] + pub fn set_subtract(&mut self, other: LookSet) { + *self = self.subtract(other); + } + + /// Returns a new set that is the union of this and the one given. + #[inline] + pub fn union(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits | other.bits } + } + + /// Updates this set in place with the result of unioning it with the one + /// given. + #[inline] + pub fn set_union(&mut self, other: LookSet) { + *self = self.union(other); + } + + /// Returns a new set that is the intersection of this and the one given. + #[inline] + pub fn intersect(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits & other.bits } + } + + /// Updates this set in place with the result of intersecting it with the + /// one given. 
+ #[inline] + pub fn set_intersect(&mut self, other: LookSet) { + *self = self.intersect(other); + } + + /// Return a `LookSet` from the slice given as a native endian 16-bit + /// integer. + /// + /// # Panics + /// + /// This panics if `slice.len() < 2`. + #[inline] + pub fn read_repr(slice: &[u8]) -> LookSet { + let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap()); + LookSet { bits } + } + + /// Write a `LookSet` as a native endian 16-bit integer to the beginning + /// of the slice given. + /// + /// # Panics + /// + /// This panics if `slice.len() < 2`. + #[inline] + pub fn write_repr(self, slice: &mut [u8]) { + let raw = self.bits.to_ne_bytes(); + slice[0] = raw[0]; + slice[1] = raw[1]; + } +} + +impl core::fmt::Debug for LookSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + if self.is_empty() { + return write!(f, "∅"); + } + for look in self.iter() { + write!(f, "{}", look.as_char())?; + } + Ok(()) + } +} + +/// An iterator over all look-around assertions in a [`LookSet`]. +/// +/// This iterator is created by [`LookSet::iter`]. +#[derive(Clone, Debug)] +pub struct LookSetIter { + set: LookSet, +} + +impl Iterator for LookSetIter { + type Item = Look; + + #[inline] + fn next(&mut self) -> Option<Look> { + if self.set.is_empty() { + return None; + } + // We'll never have more than u8::MAX distinct look-around assertions, + // so 'repr' will always fit into a u16. + let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(1 << repr)?; + self.set = self.set.remove(look); + Some(look) + } +} + +/// Given a sequence of HIR values where each value corresponds to a Unicode +/// class (or an all-ASCII byte class), return a single Unicode class +/// corresponding to the union of the classes found. 
+fn class_chars(hirs: &[Hir]) -> Option<Class> { + let mut cls = ClassUnicode::new(vec![]); + for hir in hirs.iter() { + match *hir.kind() { + HirKind::Class(Class::Unicode(ref cls2)) => { + cls.union(cls2); + } + HirKind::Class(Class::Bytes(ref cls2)) => { + cls.union(&cls2.to_unicode_class()?); + } + _ => return None, + }; + } + Some(Class::Unicode(cls)) +} + +/// Given a sequence of HIR values where each value corresponds to a byte class +/// (or an all-ASCII Unicode class), return a single byte class corresponding +/// to the union of the classes found. +fn class_bytes(hirs: &[Hir]) -> Option<Class> { + let mut cls = ClassBytes::new(vec![]); + for hir in hirs.iter() { + match *hir.kind() { + HirKind::Class(Class::Unicode(ref cls2)) => { + cls.union(&cls2.to_byte_class()?); + } + HirKind::Class(Class::Bytes(ref cls2)) => { + cls.union(cls2); + } + _ => return None, + }; + } + Some(Class::Bytes(cls)) +} + +/// Given a sequence of HIR values where each value corresponds to a literal +/// that is a single `char`, return that sequence of `char`s. Otherwise return +/// None. No deduplication is done. +fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> { + let mut singletons = vec![]; + for hir in hirs.iter() { + let literal = match *hir.kind() { + HirKind::Literal(Literal(ref bytes)) => bytes, + _ => return None, + }; + let ch = match crate::debug::utf8_decode(literal) { + None => return None, + Some(Err(_)) => return None, + Some(Ok(ch)) => ch, + }; + if literal.len() != ch.len_utf8() { + return None; + } + singletons.push(ch); + } + Some(singletons) +} + +/// Given a sequence of HIR values where each value corresponds to a literal +/// that is a single byte, return that sequence of bytes. Otherwise return +/// None. No deduplication is done. 
+fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> { + let mut singletons = vec![]; + for hir in hirs.iter() { + let literal = match *hir.kind() { + HirKind::Literal(Literal(ref bytes)) => bytes, + _ => return None, + }; + if literal.len() != 1 { + return None; + } + singletons.push(literal[0]); + } + Some(singletons) +} + +/// Looks for a common prefix in the list of alternation branches given. If one +/// is found, then an equivalent but (hopefully) simplified Hir is returned. +/// Otherwise, the original given list of branches is returned unmodified. +/// +/// This is not quite as good as it could be. Right now, it requires that +/// all branches are 'Concat' expressions. It also doesn't do well with +/// literals. For example, given 'foofoo|foobar', it will not refactor it to +/// 'foo(?:foo|bar)' because literals are flattened into their own special +/// concatenation. (One wonders if perhaps 'Literal' should be a single atom +/// instead of a string of bytes because of this. Otherwise, handling the +/// current representation in this routine will be pretty gnarly. Sigh.) +fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> { + if hirs.len() <= 1 { + return Err(hirs); + } + let mut prefix = match hirs[0].kind() { + HirKind::Concat(ref xs) => &**xs, + _ => return Err(hirs), + }; + if prefix.is_empty() { + return Err(hirs); + } + for h in hirs.iter().skip(1) { + let concat = match h.kind() { + HirKind::Concat(ref xs) => xs, + _ => return Err(hirs), + }; + let common_len = prefix + .iter() + .zip(concat.iter()) + .take_while(|(x, y)| x == y) + .count(); + prefix = &prefix[..common_len]; + if prefix.is_empty() { + return Err(hirs); + } + } + let len = prefix.len(); + assert_ne!(0, len); + let mut prefix_concat = vec![]; + let mut suffix_alts = vec![]; + for h in hirs { + let mut concat = match h.into_kind() { + HirKind::Concat(xs) => xs, + // We required all sub-expressions to be + // concats above, so we're only here if we + // have a concat. 
+ _ => unreachable!(), + }; + suffix_alts.push(Hir::concat(concat.split_off(len))); + if prefix_concat.is_empty() { + prefix_concat = concat; + } + } + let mut concat = prefix_concat; + concat.push(Hir::alternation(suffix_alts)); + Ok(Hir::concat(concat)) } #[cfg(test)] @@ -2244,12 +3653,6 @@ mod tests { assert_eq!(expected, bsymdifference(&cls1, &cls2)); } - #[test] - #[should_panic] - fn hir_byte_literal_non_ascii() { - Hir::literal(Literal::Byte(b'a')); - } - // We use a thread with an explicit stack size to test that our destructor // for Hir can handle arbitrarily sized expressions in constant stack // space. In case we run on a platform without threads (WASM?), we limit @@ -2262,35 +3665,67 @@ mod tests { let run = || { let mut expr = Hir::empty(); for _ in 0..100 { - expr = Hir::group(Group { - kind: GroupKind::NonCapturing, - hir: Box::new(expr), + expr = Hir::capture(Capture { + index: 1, + name: None, + sub: Box::new(expr), }); expr = Hir::repetition(Repetition { - kind: RepetitionKind::ZeroOrOne, + min: 0, + max: Some(1), greedy: true, - hir: Box::new(expr), + sub: Box::new(expr), }); expr = Hir { kind: HirKind::Concat(vec![expr]), - info: HirInfo::new(), + props: Properties::empty(), }; expr = Hir { kind: HirKind::Alternation(vec![expr]), - info: HirInfo::new(), + props: Properties::empty(), }; } - assert!(!expr.kind.is_empty()); + assert!(!matches!(*expr.kind(), HirKind::Empty)); }; // We run our test on a thread with a small stack size so we can // force the issue more easily. + // + // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests' + // for context on the specific stack size chosen here. 
thread::Builder::new() - .stack_size(1 << 10) + .stack_size(16 << 10) .spawn(run) .unwrap() .join() .unwrap(); } + + #[test] + fn look_set_iter() { + let set = LookSet::empty(); + assert_eq!(0, set.iter().count()); + + let set = LookSet::full(); + assert_eq!(10, set.iter().count()); + + let set = + LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); + assert_eq!(2, set.iter().count()); + + let set = LookSet::empty().insert(Look::StartLF); + assert_eq!(1, set.iter().count()); + + let set = LookSet::empty().insert(Look::WordAsciiNegate); + assert_eq!(1, set.iter().count()); + } + + #[test] + fn look_set_debug() { + let res = format!("{:?}", LookSet::empty()); + assert_eq!("∅", res); + let res = format!("{:?}", LookSet::full()); + assert_eq!("Az^$rRbB𝛃𝚩", res); + } } diff --git a/vendor/regex-syntax/src/hir/print.rs b/vendor/regex-syntax/src/hir/print.rs index b71f3897c..fcb7cd252 100644 --- a/vendor/regex-syntax/src/hir/print.rs +++ b/vendor/regex-syntax/src/hir/print.rs @@ -2,11 +2,16 @@ This module provides a regular expression printer for `Hir`. */ -use std::fmt; +use core::fmt; -use crate::hir::visitor::{self, Visitor}; -use crate::hir::{self, Hir, HirKind}; -use crate::is_meta_character; +use crate::{ + hir::{ + self, + visitor::{self, Visitor}, + Hir, HirKind, + }, + is_meta_character, +}; /// A builder for constructing a printer. /// @@ -84,21 +89,54 @@ impl<W: fmt::Write> Visitor for Writer<W> { fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { match *hir.kind() { - HirKind::Empty - | HirKind::Repetition(_) - | HirKind::Concat(_) - | HirKind::Alternation(_) => {} - HirKind::Literal(hir::Literal::Unicode(c)) => { - self.write_literal_char(c)?; - } - HirKind::Literal(hir::Literal::Byte(b)) => { - self.write_literal_byte(b)?; + // Empty is represented by nothing in the concrete syntax, and + // repetition operators are strictly suffix oriented. 
+ HirKind::Empty | HirKind::Repetition(_) => {} + HirKind::Literal(hir::Literal(ref bytes)) => { + // See the comment on the 'Concat' and 'Alternation' case below + // for why we put parens here. Literals are, conceptually, + // a special case of concatenation where each element is a + // character. The HIR flattens this into a Box<[u8]>, but we + // still need to treat it like a concatenation for correct + // printing. As a special case, we don't write parens if there + // is only one character. One character means there is no + // concat so we don't need parens. Adding parens would still be + // correct, but we drop them here because it tends to create + // rather noisy regexes even in simple cases. + let result = core::str::from_utf8(bytes); + let len = result.map_or(bytes.len(), |s| s.chars().count()); + if len > 1 { + self.wtr.write_str(r"(?:")?; + } + match result { + Ok(string) => { + for c in string.chars() { + self.write_literal_char(c)?; + } + } + Err(_) => { + for &b in bytes.iter() { + self.write_literal_byte(b)?; + } + } + } + if len > 1 { + self.wtr.write_str(r")")?; + } } HirKind::Class(hir::Class::Unicode(ref cls)) => { + if cls.ranges().is_empty() { + return self.wtr.write_str("[a&&b]"); + } self.wtr.write_str("[")?; for range in cls.iter() { if range.start() == range.end() { self.write_literal_char(range.start())?; + } else if u32::from(range.start()) + 1 + == u32::from(range.end()) + { + self.write_literal_char(range.start())?; + self.write_literal_char(range.end())?; } else { self.write_literal_char(range.start())?; self.wtr.write_str("-")?; @@ -108,10 +146,16 @@ impl<W: fmt::Write> Visitor for Writer<W> { self.wtr.write_str("]")?; } HirKind::Class(hir::Class::Bytes(ref cls)) => { + if cls.ranges().is_empty() { + return self.wtr.write_str("[a&&b]"); + } self.wtr.write_str("(?-u:[")?; for range in cls.iter() { if range.start() == range.end() { self.write_literal_class_byte(range.start())?; + } else if range.start() + 1 == range.end() { + 
self.write_literal_class_byte(range.start())?; + self.write_literal_class_byte(range.end())?; } else { self.write_literal_class_byte(range.start())?; self.wtr.write_str("-")?; @@ -120,41 +164,60 @@ impl<W: fmt::Write> Visitor for Writer<W> { } self.wtr.write_str("])")?; } - HirKind::Anchor(hir::Anchor::StartLine) => { - self.wtr.write_str("(?m:^)")?; - } - HirKind::Anchor(hir::Anchor::EndLine) => { - self.wtr.write_str("(?m:$)")?; - } - HirKind::Anchor(hir::Anchor::StartText) => { - self.wtr.write_str(r"\A")?; - } - HirKind::Anchor(hir::Anchor::EndText) => { - self.wtr.write_str(r"\z")?; - } - HirKind::WordBoundary(hir::WordBoundary::Unicode) => { - self.wtr.write_str(r"\b")?; - } - HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => { - self.wtr.write_str(r"\B")?; - } - HirKind::WordBoundary(hir::WordBoundary::Ascii) => { - self.wtr.write_str(r"(?-u:\b)")?; - } - HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => { - self.wtr.write_str(r"(?-u:\B)")?; - } - HirKind::Group(ref x) => match x.kind { - hir::GroupKind::CaptureIndex(_) => { - self.wtr.write_str("(")?; + HirKind::Look(ref look) => match *look { + hir::Look::Start => { + self.wtr.write_str(r"\A")?; + } + hir::Look::End => { + self.wtr.write_str(r"\z")?; + } + hir::Look::StartLF => { + self.wtr.write_str("(?m:^)")?; + } + hir::Look::EndLF => { + self.wtr.write_str("(?m:$)")?; + } + hir::Look::StartCRLF => { + self.wtr.write_str("(?mR:^)")?; } - hir::GroupKind::CaptureName { ref name, .. } => { - write!(self.wtr, "(?P<{}>", name)?; + hir::Look::EndCRLF => { + self.wtr.write_str("(?mR:$)")?; } - hir::GroupKind::NonCapturing => { - self.wtr.write_str("(?:")?; + hir::Look::WordAscii => { + self.wtr.write_str(r"(?-u:\b)")?; + } + hir::Look::WordAsciiNegate => { + self.wtr.write_str(r"(?-u:\B)")?; + } + hir::Look::WordUnicode => { + self.wtr.write_str(r"\b")?; + } + hir::Look::WordUnicodeNegate => { + self.wtr.write_str(r"\B")?; } }, + HirKind::Capture(hir::Capture { ref name, .. 
}) => { + self.wtr.write_str("(")?; + if let Some(ref name) = *name { + write!(self.wtr, "?P<{}>", name)?; + } + } + // Why do this? Wrapping concats and alts in non-capturing groups + // is not *always* necessary, but is sometimes necessary. For + // example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)' + // and not 'ab|c'. The former is clearly the intended meaning, but + // the latter is actually 'alt(concat(a, b), c)'. + // + // It would be possible to only group these things in cases where + // it's strictly necessary, but it requires knowing the parent + // expression. And since this technique is simpler and always + // correct, we take this route. More to the point, it is a non-goal + // of an HIR printer to show a nice easy-to-read regex. Indeed, + // its construction forbids it from doing so. Therefore, inserting + // extra groups where they aren't necessary is perfectly okay. + HirKind::Concat(_) | HirKind::Alternation(_) => { + self.wtr.write_str(r"(?:")?; + } } Ok(()) } @@ -165,39 +228,42 @@ impl<W: fmt::Write> Visitor for Writer<W> { HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) - | HirKind::Anchor(_) - | HirKind::WordBoundary(_) - | HirKind::Concat(_) - | HirKind::Alternation(_) => {} + | HirKind::Look(_) => {} HirKind::Repetition(ref x) => { - match x.kind { - hir::RepetitionKind::ZeroOrOne => { + match (x.min, x.max) { + (0, Some(1)) => { self.wtr.write_str("?")?; } - hir::RepetitionKind::ZeroOrMore => { + (0, None) => { self.wtr.write_str("*")?; } - hir::RepetitionKind::OneOrMore => { + (1, None) => { self.wtr.write_str("+")?; } - hir::RepetitionKind::Range(ref x) => match *x { - hir::RepetitionRange::Exactly(m) => { - write!(self.wtr, "{{{}}}", m)?; - } - hir::RepetitionRange::AtLeast(m) => { - write!(self.wtr, "{{{},}}", m)?; - } - hir::RepetitionRange::Bounded(m, n) => { - write!(self.wtr, "{{{},{}}}", m, n)?; - } - }, + (1, Some(1)) => { + // 'a{1}' and 'a{1}?' are exactly equivalent to 'a'. 
+ return Ok(()); + } + (m, None) => { + write!(self.wtr, "{{{},}}", m)?; + } + (m, Some(n)) if m == n => { + write!(self.wtr, "{{{}}}", m)?; + // a{m} and a{m}? are always exactly equivalent. + return Ok(()); + } + (m, Some(n)) => { + write!(self.wtr, "{{{},{}}}", m, n)?; + } } if !x.greedy { self.wtr.write_str("?")?; } } - HirKind::Group(_) => { - self.wtr.write_str(")")?; + HirKind::Capture(_) + | HirKind::Concat(_) + | HirKind::Alternation(_) => { + self.wtr.write_str(r")")?; } } Ok(()) @@ -217,18 +283,16 @@ impl<W: fmt::Write> Writer<W> { } fn write_literal_byte(&mut self, b: u8) -> fmt::Result { - let c = b as char; - if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { - self.write_literal_char(c) + if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { + self.write_literal_char(char::try_from(b).unwrap()) } else { write!(self.wtr, "(?-u:\\x{:02X})", b) } } fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result { - let c = b as char; - if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { - self.write_literal_char(c) + if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { + self.write_literal_char(char::try_from(b).unwrap()) } else { write!(self.wtr, "\\x{:02X}", b) } @@ -237,15 +301,21 @@ impl<W: fmt::Write> Writer<W> { #[cfg(test)] mod tests { - use super::Printer; + use alloc::{ + boxed::Box, + string::{String, ToString}, + }; + use crate::ParserBuilder; + use super::*; + fn roundtrip(given: &str, expected: &str) { roundtrip_with(|b| b, given, expected); } fn roundtrip_bytes(given: &str, expected: &str) { - roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected); + roundtrip_with(|b| b.utf8(false), given, expected); } fn roundtrip_with<F>(mut f: F, given: &str, expected: &str) @@ -277,28 +347,35 @@ mod tests { #[test] fn print_class() { - roundtrip(r"[a]", r"[a]"); + roundtrip(r"[a]", r"a"); + roundtrip(r"[ab]", r"[ab]"); roundtrip(r"[a-z]", r"[a-z]"); roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]"); 
- roundtrip(r"[^\x01-\u{10FFFF}]", "[\u{0}]"); - roundtrip(r"[-]", r"[\-]"); + roundtrip(r"[^\x01-\u{10FFFF}]", "\u{0}"); + roundtrip(r"[-]", r"\-"); roundtrip(r"[☃-⛄]", r"[☃-⛄]"); - roundtrip(r"(?-u)[a]", r"(?-u:[a])"); + roundtrip(r"(?-u)[a]", r"a"); + roundtrip(r"(?-u)[ab]", r"(?-u:[ab])"); roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])"); roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])"); // The following test that the printer escapes meta characters // in character classes. - roundtrip(r"[\[]", r"[\[]"); + roundtrip(r"[\[]", r"\["); roundtrip(r"[Z-_]", r"[Z-_]"); roundtrip(r"[Z-_--Z]", r"[\[-_]"); // The following test that the printer escapes meta characters // in byte oriented character classes. - roundtrip_bytes(r"(?-u)[\[]", r"(?-u:[\[])"); + roundtrip_bytes(r"(?-u)[\[]", r"\["); roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])"); roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])"); + + // This tests that an empty character class is correctly roundtripped. + #[cfg(feature = "unicode-gencat")] + roundtrip(r"\P{any}", r"[a&&b]"); + roundtrip_bytes(r"(?-u)[^\x00-\xFF]", r"[a&&b]"); } #[test] @@ -331,37 +408,170 @@ mod tests { roundtrip("a+?", "a+?"); roundtrip("(?U)a+", "a+?"); - roundtrip("a{1}", "a{1}"); - roundtrip("a{1,}", "a{1,}"); + roundtrip("a{1}", "a"); + roundtrip("a{2}", "a{2}"); + roundtrip("a{1,}", "a+"); roundtrip("a{1,5}", "a{1,5}"); - roundtrip("a{1}?", "a{1}?"); - roundtrip("a{1,}?", "a{1,}?"); + roundtrip("a{1}?", "a"); + roundtrip("a{2}?", "a{2}"); + roundtrip("a{1,}?", "a+?"); roundtrip("a{1,5}?", "a{1,5}?"); - roundtrip("(?U)a{1}", "a{1}?"); - roundtrip("(?U)a{1,}", "a{1,}?"); + roundtrip("(?U)a{1}", "a"); + roundtrip("(?U)a{2}", "a{2}"); + roundtrip("(?U)a{1,}", "a+?"); roundtrip("(?U)a{1,5}", "a{1,5}?"); + + // Test that various zero-length repetitions always translate to an + // empty regex. This is more a property of HIR's smart constructors + // than the printer though. 
+ roundtrip("a{0}", ""); + roundtrip("(?:ab){0}", ""); + #[cfg(feature = "unicode-gencat")] + { + roundtrip(r"\p{any}{0}", ""); + roundtrip(r"\P{any}{0}", ""); + } } #[test] fn print_group() { roundtrip("()", "()"); roundtrip("(?P<foo>)", "(?P<foo>)"); - roundtrip("(?:)", "(?:)"); + roundtrip("(?:)", ""); roundtrip("(a)", "(a)"); roundtrip("(?P<foo>a)", "(?P<foo>a)"); - roundtrip("(?:a)", "(?:a)"); + roundtrip("(?:a)", "a"); roundtrip("((((a))))", "((((a))))"); } #[test] fn print_alternation() { - roundtrip("|", "|"); - roundtrip("||", "||"); + roundtrip("|", "(?:|)"); + roundtrip("||", "(?:||)"); + + roundtrip("a|b", "[ab]"); + roundtrip("ab|cd", "(?:(?:ab)|(?:cd))"); + roundtrip("a|b|c", "[a-c]"); + roundtrip("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))"); + roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))"); + } - roundtrip("a|b", "a|b"); - roundtrip("a|b|c", "a|b|c"); - roundtrip("foo|bar|quux", "foo|bar|quux"); + // This is a regression test that stresses a peculiarity of how the HIR + // is both constructed and printed. Namely, it is legal for a repetition + // to directly contain a concatenation. This particular construct isn't + // really possible to build from the concrete syntax directly, since you'd + // be forced to put the concatenation into (at least) a non-capturing + // group. Concurrently, the printer doesn't consider this case and just + // kind of naively prints the child expression and tacks on the repetition + // operator. + // + // As a result, if you attached '+' to a 'concat(a, b)', the printer gives + // you 'ab+', but clearly it really should be '(?:ab)+'. + // + // This bug isn't easy to surface because most ways of building an HIR + // come directly from the concrete syntax, and as mentioned above, it just + // isn't possible to build this kind of HIR from the concrete syntax. + // Nevertheless, this is definitely a bug. 
+ // + // See: https://github.com/rust-lang/regex/issues/731 + #[test] + fn regression_repetition_concat() { + let expr = Hir::concat(alloc::vec![ + Hir::literal("x".as_bytes()), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::literal("ab".as_bytes())), + }), + Hir::literal("y".as_bytes()), + ]); + assert_eq!(r"(?:x(?:ab)+y)", expr.to_string()); + + let expr = Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::look(hir::Look::End), + ])), + }), + Hir::look(hir::Look::End), + ]); + assert_eq!(r"(?:\A(?:\A\z)+\z)", expr.to_string()); + } + + // Just like regression_repetition_concat, but with the repetition using + // an alternation as a child expression instead. + // + // See: https://github.com/rust-lang/regex/issues/731 + #[test] + fn regression_repetition_alternation() { + let expr = Hir::concat(alloc::vec![ + Hir::literal("ab".as_bytes()), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::alternation(alloc::vec![ + Hir::literal("cd".as_bytes()), + Hir::literal("ef".as_bytes()), + ])), + }), + Hir::literal("gh".as_bytes()), + ]); + assert_eq!(r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))", expr.to_string()); + + let expr = Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::alternation(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::look(hir::Look::End), + ])), + }), + Hir::look(hir::Look::End), + ]); + assert_eq!(r"(?:\A(?:\A|\z)+\z)", expr.to_string()); + } + + // This regression test is very similar in flavor to + // regression_repetition_concat in that the root of the issue lies in a + // peculiarity of how the HIR is represented and how the printer writes it + // out. 
Like the other regression, this one is also rooted in the fact that + // you can't produce the peculiar HIR from the concrete syntax. Namely, you + // just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally + // be in (at least) a non-capturing group. Why? Because the '|' has very + // low precedence (lower that concatenation), and so something like 'ab|c' + // is actually 'alt(ab, c)'. + // + // See: https://github.com/rust-lang/regex/issues/516 + #[test] + fn regression_alternation_concat() { + let expr = Hir::concat(alloc::vec![ + Hir::literal("ab".as_bytes()), + Hir::alternation(alloc::vec![ + Hir::literal("mn".as_bytes()), + Hir::literal("xy".as_bytes()), + ]), + ]); + assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string()); + + let expr = Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::alternation(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::look(hir::Look::End), + ]), + ]); + assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string()); } } diff --git a/vendor/regex-syntax/src/hir/translate.rs b/vendor/regex-syntax/src/hir/translate.rs index 890e1608b..ff9c5ee91 100644 --- a/vendor/regex-syntax/src/hir/translate.rs +++ b/vendor/regex-syntax/src/hir/translate.rs @@ -2,19 +2,23 @@ Defines a translator that converts an `Ast` to an `Hir`. */ -use std::cell::{Cell, RefCell}; -use std::result; +use core::cell::{Cell, RefCell}; -use crate::ast::{self, Ast, Span, Visitor}; -use crate::hir::{self, Error, ErrorKind, Hir}; -use crate::unicode::{self, ClassQuery}; +use alloc::{boxed::Box, string::ToString, vec, vec::Vec}; -type Result<T> = result::Result<T, Error>; +use crate::{ + ast::{self, Ast, Span, Visitor}, + either::Either, + hir::{self, Error, ErrorKind, Hir, HirKind}, + unicode::{self, ClassQuery}, +}; + +type Result<T> = core::result::Result<T, Error>; /// A builder for constructing an AST->HIR translator. 
#[derive(Clone, Debug)] pub struct TranslatorBuilder { - allow_invalid_utf8: bool, + utf8: bool, flags: Flags, } @@ -27,10 +31,7 @@ impl Default for TranslatorBuilder { impl TranslatorBuilder { /// Create a new translator builder with a default c onfiguration. pub fn new() -> TranslatorBuilder { - TranslatorBuilder { - allow_invalid_utf8: false, - flags: Flags::default(), - } + TranslatorBuilder { utf8: true, flags: Flags::default() } } /// Build a translator using the current configuration. @@ -38,23 +39,27 @@ impl TranslatorBuilder { Translator { stack: RefCell::new(vec![]), flags: Cell::new(self.flags), - allow_invalid_utf8: self.allow_invalid_utf8, + utf8: self.utf8, } } - /// When enabled, translation will permit the construction of a regular + /// When disabled, translation will permit the construction of a regular /// expression that may match invalid UTF-8. /// - /// When disabled (the default), the translator is guaranteed to produce - /// an expression that will only ever match valid UTF-8 (otherwise, the - /// translator will return an error). + /// When enabled (the default), the translator is guaranteed to produce an + /// expression that, for non-empty matches, will only ever produce spans + /// that are entirely valid UTF-8 (otherwise, the translator will return an + /// error). /// - /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII - /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause - /// the parser to return an error. Namely, a negated ASCII word boundary - /// can result in matching positions that aren't valid UTF-8 boundaries. - pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut TranslatorBuilder { - self.allow_invalid_utf8 = yes; + /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even + /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete + /// syntax) will be allowed even though they can produce matches that split + /// a UTF-8 encoded codepoint. 
This only applies to zero-width or "empty" + /// matches, and it is expected that the regex engine itself must handle + /// these cases if necessary (perhaps by suppressing any zero-width matches + /// that split a codepoint). + pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.utf8 = yes; self } @@ -80,6 +85,12 @@ impl TranslatorBuilder { self } + /// Enable or disable the CRLF mode flag (`R`) by default. + pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.crlf = if yes { Some(true) } else { None }; + self + } + /// Enable or disable the "swap greed" flag (`U`) by default. pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder { self.flags.swap_greed = if yes { Some(true) } else { None }; @@ -100,7 +111,7 @@ impl TranslatorBuilder { /// many abstract syntax trees. /// /// A `Translator` can be configured in more detail via a -/// [`TranslatorBuilder`](struct.TranslatorBuilder.html). +/// [`TranslatorBuilder`]. #[derive(Clone, Debug)] pub struct Translator { /// Our call stack, but on the heap. @@ -108,7 +119,7 @@ pub struct Translator { /// The current flag settings. flags: Cell<Flags>, /// Whether we're allowed to produce HIR that can match arbitrary bytes. - allow_invalid_utf8: bool, + utf8: bool, } impl Translator { @@ -143,6 +154,12 @@ enum HirFrame { /// case in the Ast. They get popped after an inductive (i.e., recursive) /// step is complete. Expr(Hir), + /// A literal that is being constructed, character by character, from the + /// AST. We need this because the AST gives each individual character its + /// own node. So as we see characters, we peek at the top-most HirFrame. + /// If it's a literal, then we add to it. Otherwise, we push a new literal. + /// When it comes time to pop it, we convert it to an Hir via Hir::literal. + Literal(Vec<u8>), /// A Unicode character class. 
This frame is mutated as we descend into /// the Ast of a character class (which is itself its own mini recursive /// structure). @@ -152,10 +169,17 @@ enum HirFrame { /// recursive structure). /// /// Byte character classes are created when Unicode mode (`u`) is disabled. - /// If `allow_invalid_utf8` is disabled (the default), then a byte - /// character is only permitted to match ASCII text. + /// If `utf8` is enabled (the default), then a byte character is only + /// permitted to match ASCII text. ClassBytes(hir::ClassBytes), - /// This is pushed on to the stack upon first seeing any kind of group, + /// This is pushed whenever a repetition is observed. After visiting every + /// sub-expression in the repetition, the translator's stack is expected to + /// have this sentinel at the top. + /// + /// This sentinel only exists to stop other things (like flattening + /// literals) from reaching across repetition operators. + Repetition, + /// This is pushed on to the stack upon first seeing any kind of capture, /// indicated by parentheses (including non-capturing groups). It is popped /// upon leaving a group. Group { @@ -181,6 +205,14 @@ enum HirFrame { /// every sub-expression in the alternation, the translator's stack is /// popped until it sees an Alternation frame. Alternation, + /// This is pushed immediately before each sub-expression in an + /// alternation. This separates the branches of an alternation on the + /// stack and prevents literal flattening from reaching across alternation + /// branches. + /// + /// It is popped after each expression in a branch until an 'Alternation' + /// frame is observed when doing a post visit on an alternation. 
+ AlternationBranch, } impl HirFrame { @@ -188,6 +220,7 @@ impl HirFrame { fn unwrap_expr(self) -> Hir { match self { HirFrame::Expr(expr) => expr, + HirFrame::Literal(lit) => Hir::literal(lit), _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self), } } @@ -218,6 +251,20 @@ impl HirFrame { } } + /// Assert that the current stack frame is a repetition sentinel. If it + /// isn't, then panic. + fn unwrap_repetition(self) { + match self { + HirFrame::Repetition => {} + _ => { + panic!( + "tried to unwrap repetition from HirFrame, got: {:?}", + self + ) + } + } + } + /// Assert that the current stack frame is a group indicator and return /// its corresponding flags (the flags that were active at the time the /// group was entered). @@ -229,6 +276,20 @@ impl HirFrame { } } } + + /// Assert that the current stack frame is an alternation pipe sentinel. If + /// it isn't, then panic. + fn unwrap_alternation_pipe(self) { + match self { + HirFrame::AlternationBranch => {} + _ => { + panic!( + "tried to unwrap alt pipe from HirFrame, got: {:?}", + self + ) + } + } + } } impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { @@ -252,6 +313,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::ClassBytes(cls)); } } + Ast::Repetition(_) => self.push(HirFrame::Repetition), Ast::Group(ref x) => { let old_flags = x .flags() @@ -266,6 +328,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { Ast::Alternation(ref x) if x.asts.is_empty() => {} Ast::Alternation(_) => { self.push(HirFrame::Alternation); + self.push(HirFrame::AlternationBranch); } _ => {} } @@ -291,7 +354,20 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(Hir::empty())); } Ast::Literal(ref x) => { - self.push(HirFrame::Expr(self.hir_literal(x)?)); + match self.ast_literal_to_scalar(x)? 
{ + Either::Right(byte) => self.push_byte(byte), + Either::Left(ch) => { + if !self.flags().unicode() && ch.len_utf8() > 1 { + return Err(self + .error(x.span, ErrorKind::UnicodeNotAllowed)); + } + match self.case_fold_char(x.span, ch)? { + None => self.push_char(ch), + Some(expr) => self.push(HirFrame::Expr(expr)), + } + } + } + // self.push(HirFrame::Expr(self.hir_literal(x)?)); } Ast::Dot(span) => { self.push(HirFrame::Expr(self.hir_dot(span)?)); @@ -305,7 +381,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { let hcls = hir::Class::Unicode(cls); self.push(HirFrame::Expr(Hir::class(hcls))); } else { - let cls = self.hir_perl_byte_class(x); + let cls = self.hir_perl_byte_class(x)?; let hcls = hir::Class::Bytes(cls); self.push(HirFrame::Expr(Hir::class(hcls))); } @@ -322,12 +398,6 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { ast.negated, &mut cls, )?; - if cls.ranges().is_empty() { - return Err(self.error( - ast.span, - ErrorKind::EmptyClassNotAllowed, - )); - } let expr = Hir::class(hir::Class::Unicode(cls)); self.push(HirFrame::Expr(expr)); } else { @@ -337,31 +407,25 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { ast.negated, &mut cls, )?; - if cls.ranges().is_empty() { - return Err(self.error( - ast.span, - ErrorKind::EmptyClassNotAllowed, - )); - } - let expr = Hir::class(hir::Class::Bytes(cls)); self.push(HirFrame::Expr(expr)); } } Ast::Repetition(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); + self.pop().unwrap().unwrap_repetition(); self.push(HirFrame::Expr(self.hir_repetition(x, expr))); } Ast::Group(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); let old_flags = self.pop().unwrap().unwrap_group(); self.trans().flags.set(old_flags); - self.push(HirFrame::Expr(self.hir_group(x, expr))); + self.push(HirFrame::Expr(self.hir_capture(x, expr))); } Ast::Concat(_) => { let mut exprs = vec![]; - while let Some(HirFrame::Expr(expr)) = self.pop() { - if !expr.kind().is_empty() { + while let Some(expr) = self.pop_concat_expr() { + if 
!matches!(*expr.kind(), HirKind::Empty) { exprs.push(expr); } } @@ -370,7 +434,8 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } Ast::Alternation(_) => { let mut exprs = vec![]; - while let Some(HirFrame::Expr(expr)) = self.pop() { + while let Some(expr) = self.pop_alt_expr() { + self.pop().unwrap().unwrap_alternation_pipe(); exprs.push(expr); } exprs.reverse(); @@ -380,6 +445,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { Ok(()) } + fn visit_alternation_in(&mut self) -> Result<()> { + self.push(HirFrame::AlternationBranch); + Ok(()) + } + fn visit_class_set_item_pre( &mut self, ast: &ast::ClassSetItem, @@ -458,7 +528,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { cls.union(&xcls); self.push(HirFrame::ClassUnicode(cls)); } else { - let xcls = self.hir_perl_byte_class(x); + let xcls = self.hir_perl_byte_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_bytes(); cls.union(&xcls); self.push(HirFrame::ClassBytes(cls)); @@ -602,11 +672,103 @@ impl<'t, 'p> TranslatorI<'t, 'p> { self.trans().stack.borrow_mut().push(frame); } + /// Push the given literal char on to the call stack. + /// + /// If the top-most element of the stack is a literal, then the char + /// is appended to the end of that literal. Otherwise, a new literal + /// containing just the given char is pushed to the top of the stack. + fn push_char(&self, ch: char) { + let mut buf = [0; 4]; + let bytes = ch.encode_utf8(&mut buf).as_bytes(); + let mut stack = self.trans().stack.borrow_mut(); + if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { + literal.extend_from_slice(bytes); + } else { + stack.push(HirFrame::Literal(bytes.to_vec())); + } + } + + /// Push the given literal byte on to the call stack. + /// + /// If the top-most element of the stack is a literal, then the byte + /// is appended to the end of that literal. Otherwise, a new literal + /// containing just the given byte is pushed to the top of the stack. 
+ fn push_byte(&self, byte: u8) { + let mut stack = self.trans().stack.borrow_mut(); + if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { + literal.push(byte); + } else { + stack.push(HirFrame::Literal(vec![byte])); + } + } + /// Pop the top of the call stack. If the call stack is empty, return None. fn pop(&self) -> Option<HirFrame> { self.trans().stack.borrow_mut().pop() } + /// Pop an HIR expression from the top of the stack for a concatenation. + /// + /// This returns None if the stack is empty or when a concat frame is seen. + /// Otherwise, it panics if it could not find an HIR expression. + fn pop_concat_expr(&self) -> Option<Hir> { + let frame = self.pop()?; + match frame { + HirFrame::Concat => None, + HirFrame::Expr(expr) => Some(expr), + HirFrame::Literal(lit) => Some(Hir::literal(lit)), + HirFrame::ClassUnicode(_) => { + unreachable!("expected expr or concat, got Unicode class") + } + HirFrame::ClassBytes(_) => { + unreachable!("expected expr or concat, got byte class") + } + HirFrame::Repetition => { + unreachable!("expected expr or concat, got repetition") + } + HirFrame::Group { .. } => { + unreachable!("expected expr or concat, got group") + } + HirFrame::Alternation => { + unreachable!("expected expr or concat, got alt marker") + } + HirFrame::AlternationBranch => { + unreachable!("expected expr or concat, got alt branch marker") + } + } + } + + /// Pop an HIR expression from the top of the stack for an alternation. + /// + /// This returns None if the stack is empty or when an alternation frame is + /// seen. Otherwise, it panics if it could not find an HIR expression. 
+ fn pop_alt_expr(&self) -> Option<Hir> { + let frame = self.pop()?; + match frame { + HirFrame::Alternation => None, + HirFrame::Expr(expr) => Some(expr), + HirFrame::Literal(lit) => Some(Hir::literal(lit)), + HirFrame::ClassUnicode(_) => { + unreachable!("expected expr or alt, got Unicode class") + } + HirFrame::ClassBytes(_) => { + unreachable!("expected expr or alt, got byte class") + } + HirFrame::Repetition => { + unreachable!("expected expr or alt, got repetition") + } + HirFrame::Group { .. } => { + unreachable!("expected expr or alt, got group") + } + HirFrame::Concat => { + unreachable!("expected expr or alt, got concat marker") + } + HirFrame::AlternationBranch => { + unreachable!("expected expr or alt, got alt branch marker") + } + } + } + /// Create a new error with the given span and error type. fn error(&self, span: Span, kind: ErrorKind) -> Error { Error { kind, pattern: self.pattern.to_string(), span } @@ -627,63 +789,48 @@ impl<'t, 'p> TranslatorI<'t, 'p> { old_flags } - fn hir_literal(&self, lit: &ast::Literal) -> Result<Hir> { - let ch = match self.literal_to_char(lit)? { - byte @ hir::Literal::Byte(_) => return Ok(Hir::literal(byte)), - hir::Literal::Unicode(ch) => ch, - }; - if self.flags().case_insensitive() { - self.hir_from_char_case_insensitive(lit.span, ch) - } else { - self.hir_from_char(lit.span, ch) - } - } - /// Convert an Ast literal to its scalar representation. /// /// When Unicode mode is enabled, then this always succeeds and returns a /// `char` (Unicode scalar value). /// - /// When Unicode mode is disabled, then a raw byte is returned. If that - /// byte is not ASCII and invalid UTF-8 is not allowed, then this returns - /// an error. - fn literal_to_char(&self, lit: &ast::Literal) -> Result<hir::Literal> { + /// When Unicode mode is disabled, then a `char` will still be returned + /// whenever possible. A byte is returned only when invalid UTF-8 is + /// allowed and when the byte is not ASCII. 
Otherwise, a non-ASCII byte + /// will result in an error when invalid UTF-8 is not allowed. + fn ast_literal_to_scalar( + &self, + lit: &ast::Literal, + ) -> Result<Either<char, u8>> { if self.flags().unicode() { - return Ok(hir::Literal::Unicode(lit.c)); + return Ok(Either::Left(lit.c)); } let byte = match lit.byte() { - None => return Ok(hir::Literal::Unicode(lit.c)), + None => return Ok(Either::Left(lit.c)), Some(byte) => byte, }; if byte <= 0x7F { - return Ok(hir::Literal::Unicode(byte as char)); + return Ok(Either::Left(char::try_from(byte).unwrap())); } - if !self.trans().allow_invalid_utf8 { + if self.trans().utf8 { return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); } - Ok(hir::Literal::Byte(byte)) + Ok(Either::Right(byte)) } - fn hir_from_char(&self, span: Span, c: char) -> Result<Hir> { - if !self.flags().unicode() && c.len_utf8() > 1 { - return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); + fn case_fold_char(&self, span: Span, c: char) -> Result<Option<Hir>> { + if !self.flags().case_insensitive() { + return Ok(None); } - Ok(Hir::literal(hir::Literal::Unicode(c))) - } - - fn hir_from_char_case_insensitive( - &self, - span: Span, - c: char, - ) -> Result<Hir> { if self.flags().unicode() { // If case folding won't do anything, then don't bother trying. 
- let map = - unicode::contains_simple_case_mapping(c, c).map_err(|_| { + let map = unicode::SimpleCaseFolder::new() + .map(|f| f.overlaps(c, c)) + .map_err(|_| { self.error(span, ErrorKind::UnicodeCaseUnavailable) })?; if !map { - return self.hir_from_char(span, c); + return Ok(None); } let mut cls = hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new( @@ -692,7 +839,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { cls.try_case_fold_simple().map_err(|_| { self.error(span, ErrorKind::UnicodeCaseUnavailable) })?; - Ok(Hir::class(hir::Class::Unicode(cls))) + Ok(Some(Hir::class(hir::Class::Unicode(cls)))) } else { if c.len_utf8() > 1 { return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); @@ -700,109 +847,102 @@ impl<'t, 'p> TranslatorI<'t, 'p> { // If case folding won't do anything, then don't bother trying. match c { 'A'..='Z' | 'a'..='z' => {} - _ => return self.hir_from_char(span, c), + _ => return Ok(None), } let mut cls = hir::ClassBytes::new(vec![hir::ClassBytesRange::new( - c as u8, c as u8, + // OK because 'c.len_utf8() == 1' which in turn implies + // that 'c' is ASCII. 
+ u8::try_from(c).unwrap(), + u8::try_from(c).unwrap(), )]); cls.case_fold_simple(); - Ok(Hir::class(hir::Class::Bytes(cls))) + Ok(Some(Hir::class(hir::Class::Bytes(cls)))) } } fn hir_dot(&self, span: Span) -> Result<Hir> { - let unicode = self.flags().unicode(); - if !unicode && !self.trans().allow_invalid_utf8 { + if !self.flags().unicode() && self.trans().utf8 { return Err(self.error(span, ErrorKind::InvalidUtf8)); } - Ok(if self.flags().dot_matches_new_line() { - Hir::any(!unicode) - } else { - Hir::dot(!unicode) - }) + Ok(Hir::dot(self.flags().dot())) } fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> { let unicode = self.flags().unicode(); let multi_line = self.flags().multi_line(); + let crlf = self.flags().crlf(); Ok(match asst.kind { - ast::AssertionKind::StartLine => Hir::anchor(if multi_line { - hir::Anchor::StartLine + ast::AssertionKind::StartLine => Hir::look(if multi_line { + if crlf { + hir::Look::StartCRLF + } else { + hir::Look::StartLF + } } else { - hir::Anchor::StartText + hir::Look::Start }), - ast::AssertionKind::EndLine => Hir::anchor(if multi_line { - hir::Anchor::EndLine + ast::AssertionKind::EndLine => Hir::look(if multi_line { + if crlf { + hir::Look::EndCRLF + } else { + hir::Look::EndLF + } } else { - hir::Anchor::EndText + hir::Look::End + }), + ast::AssertionKind::StartText => Hir::look(hir::Look::Start), + ast::AssertionKind::EndText => Hir::look(hir::Look::End), + ast::AssertionKind::WordBoundary => Hir::look(if unicode { + hir::Look::WordUnicode + } else { + hir::Look::WordAscii + }), + ast::AssertionKind::NotWordBoundary => Hir::look(if unicode { + hir::Look::WordUnicodeNegate + } else { + hir::Look::WordAsciiNegate }), - ast::AssertionKind::StartText => { - Hir::anchor(hir::Anchor::StartText) - } - ast::AssertionKind::EndText => Hir::anchor(hir::Anchor::EndText), - ast::AssertionKind::WordBoundary => { - Hir::word_boundary(if unicode { - hir::WordBoundary::Unicode - } else { - hir::WordBoundary::Ascii - }) - } - 
ast::AssertionKind::NotWordBoundary => { - Hir::word_boundary(if unicode { - hir::WordBoundary::UnicodeNegate - } else { - // It is possible for negated ASCII word boundaries to - // match at invalid UTF-8 boundaries, even when searching - // valid UTF-8. - if !self.trans().allow_invalid_utf8 { - return Err( - self.error(asst.span, ErrorKind::InvalidUtf8) - ); - } - hir::WordBoundary::AsciiNegate - }) - } }) } - fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir { - let kind = match group.kind { - ast::GroupKind::CaptureIndex(idx) => { - hir::GroupKind::CaptureIndex(idx) + fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir { + let (index, name) = match group.kind { + ast::GroupKind::CaptureIndex(index) => (index, None), + ast::GroupKind::CaptureName { ref name, .. } => { + (name.index, Some(name.name.clone().into_boxed_str())) } - ast::GroupKind::CaptureName(ref capname) => { - hir::GroupKind::CaptureName { - name: capname.name.clone(), - index: capname.index, - } - } - ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing, + // The HIR doesn't need to use non-capturing groups, since the way + // in which the data type is defined handles this automatically. 
+ ast::GroupKind::NonCapturing(_) => return expr, }; - Hir::group(hir::Group { kind, hir: Box::new(expr) }) + Hir::capture(hir::Capture { index, name, sub: Box::new(expr) }) } fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir { - let kind = match rep.op.kind { - ast::RepetitionKind::ZeroOrOne => hir::RepetitionKind::ZeroOrOne, - ast::RepetitionKind::ZeroOrMore => hir::RepetitionKind::ZeroOrMore, - ast::RepetitionKind::OneOrMore => hir::RepetitionKind::OneOrMore, + let (min, max) = match rep.op.kind { + ast::RepetitionKind::ZeroOrOne => (0, Some(1)), + ast::RepetitionKind::ZeroOrMore => (0, None), + ast::RepetitionKind::OneOrMore => (1, None), ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => { - hir::RepetitionKind::Range(hir::RepetitionRange::Exactly(m)) + (m, Some(m)) } ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => { - hir::RepetitionKind::Range(hir::RepetitionRange::AtLeast(m)) + (m, None) } ast::RepetitionKind::Range(ast::RepetitionRange::Bounded( m, n, - )) => { - hir::RepetitionKind::Range(hir::RepetitionRange::Bounded(m, n)) - } + )) => (m, Some(n)), }; let greedy = if self.flags().swap_greed() { !rep.greedy } else { rep.greedy }; - Hir::repetition(hir::Repetition { kind, greedy, hir: Box::new(expr) }) + Hir::repetition(hir::Repetition { + min, + max, + greedy, + sub: Box::new(expr), + }) } fn hir_unicode_class( @@ -834,11 +974,6 @@ impl<'t, 'p> TranslatorI<'t, 'p> { ast_class.negated, class, )?; - if class.ranges().is_empty() { - let err = self - .error(ast_class.span, ErrorKind::EmptyClassNotAllowed); - return Err(err); - } } result } @@ -848,9 +983,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { ast: &ast::ClassAscii, ) -> Result<hir::ClassUnicode> { let mut cls = hir::ClassUnicode::new( - ascii_class(&ast.kind) - .iter() - .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)), + ascii_class_as_chars(&ast.kind) + .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), ); self.unicode_fold_and_negate(&ast.span, ast.negated, 
&mut cls)?; Ok(cls) @@ -862,8 +996,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { ) -> Result<hir::ClassBytes> { let mut cls = hir::ClassBytes::new( ascii_class(&ast.kind) - .iter() - .map(|&(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)), + .map(|(s, e)| hir::ClassBytesRange::new(s, e)), ); self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?; Ok(cls) @@ -894,7 +1027,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { fn hir_perl_byte_class( &self, ast_class: &ast::ClassPerl, - ) -> hir::ClassBytes { + ) -> Result<hir::ClassBytes> { use crate::ast::ClassPerlKind::*; assert!(!self.flags().unicode()); @@ -908,7 +1041,13 @@ impl<'t, 'p> TranslatorI<'t, 'p> { if ast_class.negated { class.negate(); } - class + // Negating a Perl byte class is likely to cause it to match invalid + // UTF-8. That's only OK if the translator is configured to allow such + // things. + if self.trans().utf8 && !class.is_ascii() { + return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8)); + } + Ok(class) } /// Converts the given Unicode specific error to an HIR translation error. @@ -918,7 +1057,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { fn convert_unicode_class_error( &self, span: &Span, - result: unicode::Result<hir::ClassUnicode>, + result: core::result::Result<hir::ClassUnicode, unicode::Error>, ) -> Result<hir::ClassUnicode> { result.map_err(|err| { let sp = span.clone(); @@ -943,7 +1082,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { class: &mut hir::ClassUnicode, ) -> Result<()> { // Note that we must apply case folding before negation! - // Consider `(?i)[^x]`. If we applied negation field, then + // Consider `(?i)[^x]`. If we applied negation first, then // the result would be the character class that matched any // Unicode scalar value. 
if self.flags().case_insensitive() { @@ -973,7 +1112,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { if negated { class.negate(); } - if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() { + if self.trans().utf8 && !class.is_ascii() { return Err(self.error(span.clone(), ErrorKind::InvalidUtf8)); } Ok(()) @@ -982,11 +1121,12 @@ impl<'t, 'p> TranslatorI<'t, 'p> { /// Return a scalar byte value suitable for use as a literal in a byte /// character class. fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> { - match self.literal_to_char(ast)? { - hir::Literal::Byte(byte) => Ok(byte), - hir::Literal::Unicode(ch) => { - if ch <= 0x7F as char { - Ok(ch as u8) + match self.ast_literal_to_scalar(ast)? { + Either::Right(byte) => Ok(byte), + Either::Left(ch) => { + let cp = u32::from(ch); + if cp <= 0x7F { + Ok(u8::try_from(cp).unwrap()) } else { // We can't feasibly support Unicode in // byte oriented classes. Byte classes don't @@ -1010,6 +1150,7 @@ struct Flags { dot_matches_new_line: Option<bool>, swap_greed: Option<bool>, unicode: Option<bool>, + crlf: Option<bool>, // Note that `ignore_whitespace` is omitted here because it is handled // entirely in the parser. 
} @@ -1038,6 +1179,9 @@ impl Flags { ast::FlagsItemKind::Flag(ast::Flag::Unicode) => { flags.unicode = Some(enable); } + ast::FlagsItemKind::Flag(ast::Flag::CRLF) => { + flags.crlf = Some(enable); + } ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {} } } @@ -1060,6 +1204,33 @@ impl Flags { if self.unicode.is_none() { self.unicode = previous.unicode; } + if self.crlf.is_none() { + self.crlf = previous.crlf; + } + } + + fn dot(&self) -> hir::Dot { + if self.dot_matches_new_line() { + if self.unicode() { + hir::Dot::AnyChar + } else { + hir::Dot::AnyByte + } + } else { + if self.unicode() { + if self.crlf() { + hir::Dot::AnyCharExceptCRLF + } else { + hir::Dot::AnyCharExceptLF + } + } else { + if self.crlf() { + hir::Dot::AnyByteExceptCRLF + } else { + hir::Dot::AnyByteExceptLF + } + } + } } fn case_insensitive(&self) -> bool { @@ -1081,52 +1252,63 @@ impl Flags { fn unicode(&self) -> bool { self.unicode.unwrap_or(true) } + + fn crlf(&self) -> bool { + self.crlf.unwrap_or(false) + } } fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { let ranges: Vec<_> = ascii_class(kind) - .iter() - .cloned() - .map(|(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)) + .map(|(s, e)| hir::ClassBytesRange::new(s, e)) .collect(); hir::ClassBytes::new(ranges) } -fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] { +fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator<Item = (u8, u8)> { use crate::ast::ClassAsciiKind::*; - match *kind { - Alnum => &[('0', '9'), ('A', 'Z'), ('a', 'z')], - Alpha => &[('A', 'Z'), ('a', 'z')], - Ascii => &[('\x00', '\x7F')], - Blank => &[('\t', '\t'), (' ', ' ')], - Cntrl => &[('\x00', '\x1F'), ('\x7F', '\x7F')], - Digit => &[('0', '9')], - Graph => &[('!', '~')], - Lower => &[('a', 'z')], - Print => &[(' ', '~')], - Punct => &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')], + + let slice: &'static [(u8, u8)] = match *kind { + Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')], + Alpha => 
&[(b'A', b'Z'), (b'a', b'z')], + Ascii => &[(b'\x00', b'\x7F')], + Blank => &[(b'\t', b'\t'), (b' ', b' ')], + Cntrl => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')], + Digit => &[(b'0', b'9')], + Graph => &[(b'!', b'~')], + Lower => &[(b'a', b'z')], + Print => &[(b' ', b'~')], + Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')], Space => &[ - ('\t', '\t'), - ('\n', '\n'), - ('\x0B', '\x0B'), - ('\x0C', '\x0C'), - ('\r', '\r'), - (' ', ' '), + (b'\t', b'\t'), + (b'\n', b'\n'), + (b'\x0B', b'\x0B'), + (b'\x0C', b'\x0C'), + (b'\r', b'\r'), + (b' ', b' '), ], - Upper => &[('A', 'Z')], - Word => &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')], - Xdigit => &[('0', '9'), ('A', 'F'), ('a', 'f')], - } + Upper => &[(b'A', b'Z')], + Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')], + Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')], + }; + slice.iter().copied() +} + +fn ascii_class_as_chars( + kind: &ast::ClassAsciiKind, +) -> impl Iterator<Item = (char, char)> { + ascii_class(kind).map(|(s, e)| (char::from(s), char::from(e))) } #[cfg(test)] mod tests { - use crate::ast::parse::ParserBuilder; - use crate::ast::{self, Ast, Position, Span}; - use crate::hir::{self, Hir, HirKind}; - use crate::unicode::{self, ClassQuery}; + use crate::{ + ast::{self, parse::ParserBuilder, Ast, Position, Span}, + hir::{self, Hir, HirKind, Look, Properties}, + unicode::{self, ClassQuery}, + }; - use super::{ascii_class, TranslatorBuilder}; + use super::*; // We create these errors to compare with real hir::Errors in the tests. 
// We define equality between TestError and hir::Error to disregard the @@ -1155,7 +1337,7 @@ mod tests { fn t(pattern: &str) -> Hir { TranslatorBuilder::new() - .allow_invalid_utf8(false) + .utf8(true) .build() .translate(pattern, &parse(pattern)) .unwrap() @@ -1163,7 +1345,7 @@ mod tests { fn t_err(pattern: &str) -> hir::Error { TranslatorBuilder::new() - .allow_invalid_utf8(false) + .utf8(true) .build() .translate(pattern, &parse(pattern)) .unwrap_err() @@ -1171,95 +1353,73 @@ mod tests { fn t_bytes(pattern: &str) -> Hir { TranslatorBuilder::new() - .allow_invalid_utf8(true) + .utf8(false) .build() .translate(pattern, &parse(pattern)) .unwrap() } - fn hir_lit(s: &str) -> Hir { - match s.len() { - 0 => Hir::empty(), - _ => { - let lits = s - .chars() - .map(hir::Literal::Unicode) - .map(Hir::literal) - .collect(); - Hir::concat(lits) - } - } + fn props(pattern: &str) -> Properties { + t(pattern).properties().clone() } - fn hir_blit(s: &[u8]) -> Hir { - match s.len() { - 0 => Hir::empty(), - 1 => Hir::literal(hir::Literal::Byte(s[0])), - _ => { - let lits = s - .iter() - .cloned() - .map(hir::Literal::Byte) - .map(Hir::literal) - .collect(); - Hir::concat(lits) - } - } + fn props_bytes(pattern: &str) -> Properties { + t_bytes(pattern).properties().clone() } - fn hir_group(i: u32, expr: Hir) -> Hir { - Hir::group(hir::Group { - kind: hir::GroupKind::CaptureIndex(i), - hir: Box::new(expr), - }) + fn hir_lit(s: &str) -> Hir { + hir_blit(s.as_bytes()) } - fn hir_group_name(i: u32, name: &str, expr: Hir) -> Hir { - Hir::group(hir::Group { - kind: hir::GroupKind::CaptureName { - name: name.to_string(), - index: i, - }, - hir: Box::new(expr), - }) + fn hir_blit(s: &[u8]) -> Hir { + Hir::literal(s) + } + + fn hir_capture(index: u32, expr: Hir) -> Hir { + Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) }) } - fn hir_group_nocap(expr: Hir) -> Hir { - Hir::group(hir::Group { - kind: hir::GroupKind::NonCapturing, - hir: Box::new(expr), + fn 
hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir { + Hir::capture(hir::Capture { + index, + name: Some(name.into()), + sub: Box::new(expr), }) } fn hir_quest(greedy: bool, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrOne, + min: 0, + max: Some(1), greedy, - hir: Box::new(expr), + sub: Box::new(expr), }) } fn hir_star(greedy: bool, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, + min: 0, + max: None, greedy, - hir: Box::new(expr), + sub: Box::new(expr), }) } fn hir_plus(greedy: bool, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::OneOrMore, + min: 1, + max: None, greedy, - hir: Box::new(expr), + sub: Box::new(expr), }) } - fn hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir { + fn hir_range(greedy: bool, min: u32, max: Option<u32>, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::Range(range), + min, + max, greedy, - hir: Box::new(expr), + sub: Box::new(expr), }) } @@ -1281,32 +1441,25 @@ mod tests { Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap())) } - fn hir_uclass(ranges: &[(char, char)]) -> Hir { - let ranges: Vec<hir::ClassUnicodeRange> = ranges - .iter() - .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) - .collect(); - Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(ranges))) + fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir { + Hir::class(hir::Class::Unicode(hir::ClassUnicode::new( + ascii_class_as_chars(kind) + .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), + ))) } - fn hir_bclass(ranges: &[(u8, u8)]) -> Hir { - let ranges: Vec<hir::ClassBytesRange> = ranges - .iter() - .map(|&(s, e)| hir::ClassBytesRange::new(s, e)) - .collect(); - Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) + fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir { + Hir::class(hir::Class::Bytes(hir::ClassBytes::new( + ascii_class(kind).map(|(s, e)| 
hir::ClassBytesRange::new(s, e)), + ))) } - fn hir_bclass_from_char(ranges: &[(char, char)]) -> Hir { - let ranges: Vec<hir::ClassBytesRange> = ranges - .iter() - .map(|&(s, e)| { - assert!(s as u32 <= 0x7F); - assert!(e as u32 <= 0x7F); - hir::ClassBytesRange::new(s as u8, e as u8) - }) - .collect(); - Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) + fn hir_uclass(ranges: &[(char, char)]) -> Hir { + Hir::class(uclass(ranges)) + } + + fn hir_bclass(ranges: &[(u8, u8)]) -> Hir { + Hir::class(bclass(ranges)) } fn hir_case_fold(expr: Hir) -> Hir { @@ -1329,6 +1482,33 @@ mod tests { } } + fn uclass(ranges: &[(char, char)]) -> hir::Class { + let ranges: Vec<hir::ClassUnicodeRange> = ranges + .iter() + .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) + .collect(); + hir::Class::Unicode(hir::ClassUnicode::new(ranges)) + } + + fn bclass(ranges: &[(u8, u8)]) -> hir::Class { + let ranges: Vec<hir::ClassBytesRange> = ranges + .iter() + .map(|&(s, e)| hir::ClassBytesRange::new(s, e)) + .collect(); + hir::Class::Bytes(hir::ClassBytes::new(ranges)) + } + + #[cfg(feature = "unicode-case")] + fn class_case_fold(mut cls: hir::Class) -> Hir { + cls.case_fold_simple(); + Hir::class(cls) + } + + fn class_negate(mut cls: hir::Class) -> Hir { + cls.negate(); + Hir::class(cls) + } + #[allow(dead_code)] fn hir_union(expr1: Hir, expr2: Hir) -> Hir { use crate::hir::Class::{Bytes, Unicode}; @@ -1363,47 +1543,43 @@ mod tests { } } - fn hir_anchor(anchor: hir::Anchor) -> Hir { - Hir::anchor(anchor) - } - - fn hir_word(wb: hir::WordBoundary) -> Hir { - Hir::word_boundary(wb) + fn hir_look(look: hir::Look) -> Hir { + Hir::look(look) } #[test] fn empty() { assert_eq!(t(""), Hir::empty()); assert_eq!(t("(?i)"), Hir::empty()); - assert_eq!(t("()"), hir_group(1, Hir::empty())); - assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty())); - assert_eq!(t("(?P<wat>)"), hir_group_name(1, "wat", Hir::empty())); + assert_eq!(t("()"), hir_capture(1, Hir::empty())); + assert_eq!(t("(?:)"), 
Hir::empty()); + assert_eq!(t("(?P<wat>)"), hir_capture_name(1, "wat", Hir::empty())); assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()])); assert_eq!( t("()|()"), hir_alt(vec![ - hir_group(1, Hir::empty()), - hir_group(2, Hir::empty()), + hir_capture(1, Hir::empty()), + hir_capture(2, Hir::empty()), ]) ); assert_eq!( t("(|b)"), - hir_group(1, hir_alt(vec![Hir::empty(), hir_lit("b"),])) + hir_capture(1, hir_alt(vec![Hir::empty(), hir_lit("b"),])) ); assert_eq!( t("(a|)"), - hir_group(1, hir_alt(vec![hir_lit("a"), Hir::empty(),])) + hir_capture(1, hir_alt(vec![hir_lit("a"), Hir::empty(),])) ); assert_eq!( t("(a||c)"), - hir_group( + hir_capture( 1, hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),]) ) ); assert_eq!( t("(||)"), - hir_group( + hir_capture( 1, hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),]) ) @@ -1449,10 +1625,7 @@ mod tests { #[cfg(feature = "unicode-case")] assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),])); #[cfg(feature = "unicode-case")] - assert_eq!( - t("(?i:a)"), - hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')],)) - ); + assert_eq!(t("(?i:a)"), hir_uclass(&[('A', 'A'), ('a', 'a')])); #[cfg(feature = "unicode-case")] assert_eq!( t("a(?i)a(?-i)a"), @@ -1528,14 +1701,32 @@ mod tests { fn dot() { assert_eq!( t("."), - hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}'),]) + hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')]) + ); + assert_eq!( + t("(?R)."), + hir_uclass(&[ + ('\0', '\t'), + ('\x0B', '\x0C'), + ('\x0E', '\u{10FFFF}'), + ]) ); - assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}'),])); + assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}')])); + assert_eq!(t("(?Rs)."), hir_uclass(&[('\0', '\u{10FFFF}')])); assert_eq!( t_bytes("(?-u)."), - hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF'),]) + hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')]) + ); + assert_eq!( + t_bytes("(?R-u)."), + hir_bclass(&[ + (b'\0', b'\t'), + (b'\x0B', b'\x0C'), + (b'\x0E', b'\xFF'), + ]) ); 
assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); + assert_eq!(t_bytes("(?Rs-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed. assert_eq!( @@ -1549,7 +1740,7 @@ mod tests { } ); assert_eq!( - t_err("(?s-u)."), + t_err("(?R-u)."), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( @@ -1558,94 +1749,123 @@ mod tests { ), } ); - } - - #[test] - fn assertions() { - assert_eq!(t("^"), hir_anchor(hir::Anchor::StartText)); - assert_eq!(t("$"), hir_anchor(hir::Anchor::EndText)); - assert_eq!(t(r"\A"), hir_anchor(hir::Anchor::StartText)); - assert_eq!(t(r"\z"), hir_anchor(hir::Anchor::EndText)); - assert_eq!(t("(?m)^"), hir_anchor(hir::Anchor::StartLine)); - assert_eq!(t("(?m)$"), hir_anchor(hir::Anchor::EndLine)); - assert_eq!(t(r"(?m)\A"), hir_anchor(hir::Anchor::StartText)); - assert_eq!(t(r"(?m)\z"), hir_anchor(hir::Anchor::EndText)); - - assert_eq!(t(r"\b"), hir_word(hir::WordBoundary::Unicode)); - assert_eq!(t(r"\B"), hir_word(hir::WordBoundary::UnicodeNegate)); - assert_eq!(t(r"(?-u)\b"), hir_word(hir::WordBoundary::Ascii)); assert_eq!( - t_bytes(r"(?-u)\B"), - hir_word(hir::WordBoundary::AsciiNegate) + t_err("(?s-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(7, 1, 8) + ), + } ); - assert_eq!( - t_err(r"(?-u)\B"), + t_err("(?Rs-u)."), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( - Position::new(5, 1, 6), - Position::new(7, 1, 8) + Position::new(7, 1, 8), + Position::new(8, 1, 9) ), } ); } #[test] + fn assertions() { + assert_eq!(t("^"), hir_look(hir::Look::Start)); + assert_eq!(t("$"), hir_look(hir::Look::End)); + assert_eq!(t(r"\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF)); + assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF)); + assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start)); + 
assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End)); + + assert_eq!(t(r"\b"), hir_look(hir::Look::WordUnicode)); + assert_eq!(t(r"\B"), hir_look(hir::Look::WordUnicodeNegate)); + assert_eq!(t(r"(?-u)\b"), hir_look(hir::Look::WordAscii)); + assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate)); + } + + #[test] fn group() { - assert_eq!(t("(a)"), hir_group(1, hir_lit("a"))); + assert_eq!(t("(a)"), hir_capture(1, hir_lit("a"))); assert_eq!( t("(a)(b)"), hir_cat(vec![ - hir_group(1, hir_lit("a")), - hir_group(2, hir_lit("b")), + hir_capture(1, hir_lit("a")), + hir_capture(2, hir_lit("b")), ]) ); assert_eq!( t("(a)|(b)"), hir_alt(vec![ - hir_group(1, hir_lit("a")), - hir_group(2, hir_lit("b")), + hir_capture(1, hir_lit("a")), + hir_capture(2, hir_lit("b")), ]) ); - assert_eq!(t("(?P<foo>)"), hir_group_name(1, "foo", Hir::empty())); - assert_eq!(t("(?P<foo>a)"), hir_group_name(1, "foo", hir_lit("a"))); + assert_eq!(t("(?P<foo>)"), hir_capture_name(1, "foo", Hir::empty())); + assert_eq!(t("(?P<foo>a)"), hir_capture_name(1, "foo", hir_lit("a"))); assert_eq!( t("(?P<foo>a)(?P<bar>b)"), hir_cat(vec![ - hir_group_name(1, "foo", hir_lit("a")), - hir_group_name(2, "bar", hir_lit("b")), + hir_capture_name(1, "foo", hir_lit("a")), + hir_capture_name(2, "bar", hir_lit("b")), ]) ); - assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty())); - assert_eq!(t("(?:a)"), hir_group_nocap(hir_lit("a"))); + assert_eq!(t("(?:)"), Hir::empty()); + assert_eq!(t("(?:a)"), hir_lit("a")); assert_eq!( t("(?:a)(b)"), - hir_cat(vec![ - hir_group_nocap(hir_lit("a")), - hir_group(1, hir_lit("b")), - ]) + hir_cat(vec![hir_lit("a"), hir_capture(1, hir_lit("b")),]) ); assert_eq!( t("(a)(?:b)(c)"), hir_cat(vec![ - hir_group(1, hir_lit("a")), - hir_group_nocap(hir_lit("b")), - hir_group(2, hir_lit("c")), + hir_capture(1, hir_lit("a")), + hir_lit("b"), + hir_capture(2, hir_lit("c")), ]) ); assert_eq!( t("(a)(?P<foo>b)(c)"), hir_cat(vec![ - hir_group(1, hir_lit("a")), - hir_group_name(2, "foo", 
hir_lit("b")), - hir_group(3, hir_lit("c")), + hir_capture(1, hir_lit("a")), + hir_capture_name(2, "foo", hir_lit("b")), + hir_capture(3, hir_lit("c")), ]) ); - assert_eq!(t("()"), hir_group(1, Hir::empty())); - assert_eq!(t("((?i))"), hir_group(1, Hir::empty())); - assert_eq!(t("((?x))"), hir_group(1, Hir::empty())); - assert_eq!(t("(((?x)))"), hir_group(1, hir_group(2, Hir::empty()))); + assert_eq!(t("()"), hir_capture(1, Hir::empty())); + assert_eq!(t("((?i))"), hir_capture(1, Hir::empty())); + assert_eq!(t("((?x))"), hir_capture(1, Hir::empty())); + assert_eq!( + t("(((?x)))"), + hir_capture(1, hir_capture(2, Hir::empty())) + ); + } + + #[test] + fn line_anchors() { + assert_eq!(t("^"), hir_look(hir::Look::Start)); + assert_eq!(t("$"), hir_look(hir::Look::End)); + assert_eq!(t(r"\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"\z"), hir_look(hir::Look::End)); + + assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF)); + assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF)); + + assert_eq!(t(r"(?R)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?R)\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?R)^"), hir_look(hir::Look::Start)); + assert_eq!(t("(?R)$"), hir_look(hir::Look::End)); + + assert_eq!(t(r"(?Rm)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?Rm)\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?Rm)^"), hir_look(hir::Look::StartCRLF)); + assert_eq!(t("(?Rm)$"), hir_look(hir::Look::EndCRLF)); } #[test] @@ -1653,46 +1873,44 @@ mod tests { #[cfg(feature = "unicode-case")] assert_eq!( t("(?i:a)a"), - hir_cat(vec![ - hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')])), - hir_lit("a"), - ]) + hir_cat( + vec![hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"),] + ) ); assert_eq!( t("(?i-u:a)β"), hir_cat(vec![ - hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), hir_lit("β"), ]) ); 
assert_eq!( t("(?:(?i-u)a)b"), hir_cat(vec![ - hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), hir_lit("b"), ]) ); assert_eq!( t("((?i-u)a)b"), hir_cat(vec![ - hir_group(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_capture(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), hir_lit("b"), ]) ); #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)(?-i:a)a"), - hir_cat(vec![ - hir_group_nocap(hir_lit("a")), - hir_uclass(&[('A', 'A'), ('a', 'a')]), - ]) + hir_cat( + vec![hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]),] + ) ); #[cfg(feature = "unicode-case")] assert_eq!( t("(?im)a^"), hir_cat(vec![ hir_uclass(&[('A', 'A'), ('a', 'a')]), - hir_anchor(hir::Anchor::StartLine), + hir_look(hir::Look::StartLF), ]) ); #[cfg(feature = "unicode-case")] @@ -1700,9 +1918,9 @@ mod tests { t("(?im)a^(?i-m)a^"), hir_cat(vec![ hir_uclass(&[('A', 'A'), ('a', 'a')]), - hir_anchor(hir::Anchor::StartLine), + hir_look(hir::Look::StartLF), hir_uclass(&[('A', 'A'), ('a', 'a')]), - hir_anchor(hir::Anchor::StartText), + hir_look(hir::Look::Start), ]) ); assert_eq!( @@ -1718,10 +1936,10 @@ mod tests { assert_eq!( t("(?:a(?i)a)a"), hir_cat(vec![ - hir_group_nocap(hir_cat(vec![ + hir_cat(vec![ hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]), - ])), + ]), hir_lit("a"), ]) ); @@ -1729,10 +1947,10 @@ mod tests { assert_eq!( t("(?i)(?:a(?-i)a)a"), hir_cat(vec![ - hir_group_nocap(hir_cat(vec![ + hir_cat(vec![ hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"), - ])), + ]), hir_uclass(&[('A', 'A'), ('a', 'a')]), ]) ); @@ -1755,46 +1973,18 @@ mod tests { assert_eq!(t("a*?"), hir_star(false, hir_lit("a"))); assert_eq!(t("a+?"), hir_plus(false, hir_lit("a"))); - assert_eq!( - t("a{1}"), - hir_range(true, hir::RepetitionRange::Exactly(1), hir_lit("a"),) - ); - assert_eq!( - t("a{1,}"), - hir_range(true, hir::RepetitionRange::AtLeast(1), hir_lit("a"),) - ); - assert_eq!( - t("a{1,2}"), - hir_range(true, hir::RepetitionRange::Bounded(1, 
2), hir_lit("a"),) - ); - assert_eq!( - t("a{1}?"), - hir_range(false, hir::RepetitionRange::Exactly(1), hir_lit("a"),) - ); - assert_eq!( - t("a{1,}?"), - hir_range(false, hir::RepetitionRange::AtLeast(1), hir_lit("a"),) - ); - assert_eq!( - t("a{1,2}?"), - hir_range( - false, - hir::RepetitionRange::Bounded(1, 2), - hir_lit("a"), - ) - ); + assert_eq!(t("a{1}"), hir_range(true, 1, Some(1), hir_lit("a"),)); + assert_eq!(t("a{1,}"), hir_range(true, 1, None, hir_lit("a"),)); + assert_eq!(t("a{1,2}"), hir_range(true, 1, Some(2), hir_lit("a"),)); + assert_eq!(t("a{1}?"), hir_range(false, 1, Some(1), hir_lit("a"),)); + assert_eq!(t("a{1,}?"), hir_range(false, 1, None, hir_lit("a"),)); + assert_eq!(t("a{1,2}?"), hir_range(false, 1, Some(2), hir_lit("a"),)); assert_eq!( t("ab?"), hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) ); - assert_eq!( - t("(ab)?"), - hir_quest( - true, - hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),])) - ) - ); + assert_eq!(t("(ab)?"), hir_quest(true, hir_capture(1, hir_lit("ab")))); assert_eq!( t("a|b?"), hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) @@ -1803,48 +1993,49 @@ mod tests { #[test] fn cat_alt() { + let a = || hir_look(hir::Look::Start); + let b = || hir_look(hir::Look::End); + let c = || hir_look(hir::Look::WordUnicode); + let d = || hir_look(hir::Look::WordUnicodeNegate); + + assert_eq!(t("(^$)"), hir_capture(1, hir_cat(vec![a(), b()]))); + assert_eq!(t("^|$"), hir_alt(vec![a(), b()])); + assert_eq!(t(r"^|$|\b"), hir_alt(vec![a(), b(), c()])); assert_eq!( - t("(ab)"), - hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),])) - ); - assert_eq!(t("a|b"), hir_alt(vec![hir_lit("a"), hir_lit("b"),])); - assert_eq!( - t("a|b|c"), - hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),]) - ); - assert_eq!( - t("ab|bc|cd"), - hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),]) - ); - assert_eq!( - t("(a|b)"), - hir_group(1, hir_alt(vec![hir_lit("a"), hir_lit("b"),])) + t(r"^$|$\b|\b\B"), + 
hir_alt(vec![ + hir_cat(vec![a(), b()]), + hir_cat(vec![b(), c()]), + hir_cat(vec![c(), d()]), + ]) ); + assert_eq!(t("(^|$)"), hir_capture(1, hir_alt(vec![a(), b()]))); assert_eq!( - t("(a|b|c)"), - hir_group( - 1, - hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),]) - ) + t(r"(^|$|\b)"), + hir_capture(1, hir_alt(vec![a(), b(), c()])) ); assert_eq!( - t("(ab|bc|cd)"), - hir_group( + t(r"(^$|$\b|\b\B)"), + hir_capture( 1, - hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),]) + hir_alt(vec![ + hir_cat(vec![a(), b()]), + hir_cat(vec![b(), c()]), + hir_cat(vec![c(), d()]), + ]) ) ); assert_eq!( - t("(ab|(bc|(cd)))"), - hir_group( + t(r"(^$|($\b|(\b\B)))"), + hir_capture( 1, hir_alt(vec![ - hir_lit("ab"), - hir_group( + hir_cat(vec![a(), b()]), + hir_capture( 2, hir_alt(vec![ - hir_lit("bc"), - hir_group(3, hir_lit("cd")), + hir_cat(vec![b(), c()]), + hir_capture(3, hir_cat(vec![c(), d()])), ]) ), ]) @@ -1852,68 +2043,107 @@ mod tests { ); } + // Tests the HIR transformation of things like '[a-z]|[A-Z]' into + // '[A-Za-z]'. In other words, an alternation of just classes is always + // equivalent to a single class corresponding to the union of the branches + // in that class. (Unless some branches match invalid UTF-8 and others + // match non-ASCII Unicode.) + #[test] + fn cat_class_flattened() { + assert_eq!(t(r"[a-z]|[A-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')])); + // Combining all of the letter properties should give us the one giant + // letter property. + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"(?x) + \p{Lowercase_Letter} + |\p{Uppercase_Letter} + |\p{Titlecase_Letter} + |\p{Modifier_Letter} + |\p{Other_Letter} + "), + hir_uclass_query(ClassQuery::Binary("letter")) + ); + // Byte classes that can truly match invalid UTF-8 cannot be combined + // with Unicode classes. 
+ assert_eq!( + t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]"), + hir_alt(vec![ + hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]), + hir_bclass(&[(b'\x90', b'\xFF')]), + hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]), + ]) + ); + // Byte classes on their own can be combined, even if some are ASCII + // and others are invalid UTF-8. + assert_eq!( + t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]"), + hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]), + ); + } + #[test] fn class_ascii() { assert_eq!( t("[[:alnum:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)) + hir_ascii_uclass(&ast::ClassAsciiKind::Alnum) ); assert_eq!( t("[[:alpha:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Alpha)) + hir_ascii_uclass(&ast::ClassAsciiKind::Alpha) ); assert_eq!( t("[[:ascii:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Ascii)) + hir_ascii_uclass(&ast::ClassAsciiKind::Ascii) ); assert_eq!( t("[[:blank:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Blank)) + hir_ascii_uclass(&ast::ClassAsciiKind::Blank) ); assert_eq!( t("[[:cntrl:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Cntrl)) + hir_ascii_uclass(&ast::ClassAsciiKind::Cntrl) ); assert_eq!( t("[[:digit:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Digit)) + hir_ascii_uclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t("[[:graph:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Graph)) + hir_ascii_uclass(&ast::ClassAsciiKind::Graph) ); assert_eq!( t("[[:lower:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower)) + hir_ascii_uclass(&ast::ClassAsciiKind::Lower) ); assert_eq!( t("[[:print:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Print)) + hir_ascii_uclass(&ast::ClassAsciiKind::Print) ); assert_eq!( t("[[:punct:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Punct)) + hir_ascii_uclass(&ast::ClassAsciiKind::Punct) ); assert_eq!( t("[[:space:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Space)) + hir_ascii_uclass(&ast::ClassAsciiKind::Space) ); assert_eq!( t("[[:upper:]]"), - 
hir_uclass(ascii_class(&ast::ClassAsciiKind::Upper)) + hir_ascii_uclass(&ast::ClassAsciiKind::Upper) ); assert_eq!( t("[[:word:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Word)) + hir_ascii_uclass(&ast::ClassAsciiKind::Word) ); assert_eq!( t("[[:xdigit:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Xdigit)) + hir_ascii_uclass(&ast::ClassAsciiKind::Xdigit) ); assert_eq!( t("[[:^lower:]]"), - hir_negate(hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower))) + hir_negate(hir_ascii_uclass(&ast::ClassAsciiKind::Lower)) ); #[cfg(feature = "unicode-case")] assert_eq!( @@ -1928,13 +2158,11 @@ mod tests { assert_eq!( t("(?-u)[[:lower:]]"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Lower)) + hir_ascii_bclass(&ast::ClassAsciiKind::Lower) ); assert_eq!( t("(?i-u)[[:lower:]]"), - hir_case_fold(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Lower - ))) + hir_case_fold(hir_ascii_bclass(&ast::ClassAsciiKind::Lower)) ); assert_eq!( @@ -1965,14 +2193,14 @@ mod tests { assert_eq!( t("[[:alnum:][:^ascii:]]"), hir_union( - hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)), + hir_ascii_uclass(&ast::ClassAsciiKind::Alnum), hir_uclass(&[('\u{80}', '\u{10FFFF}')]), ), ); assert_eq!( t_bytes("(?-u)[[:alnum:][:^ascii:]]"), hir_union( - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Alnum)), + hir_ascii_bclass(&ast::ClassAsciiKind::Alnum), hir_bclass(&[(0x80, 0xFF)]), ), ); @@ -1980,7 +2208,7 @@ mod tests { #[test] #[cfg(feature = "unicode-perl")] - fn class_perl() { + fn class_perl_unicode() { // Unicode assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit"))); assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space"))); @@ -2020,69 +2248,124 @@ mod tests { ); #[cfg(feature = "unicode-case")] assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word())); + } + #[test] + fn class_perl_ascii() { // ASCII only assert_eq!( t(r"(?-u)\d"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + 
hir_ascii_bclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t(r"(?-u)\s"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space)) + hir_ascii_bclass(&ast::ClassAsciiKind::Space) ); assert_eq!( t(r"(?-u)\w"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word)) + hir_ascii_bclass(&ast::ClassAsciiKind::Word) ); assert_eq!( t(r"(?i-u)\d"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + hir_ascii_bclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t(r"(?i-u)\s"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space)) + hir_ascii_bclass(&ast::ClassAsciiKind::Space) ); assert_eq!( t(r"(?i-u)\w"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word)) + hir_ascii_bclass(&ast::ClassAsciiKind::Word) ); // ASCII only, negated assert_eq!( - t(r"(?-u)\D"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Digit - ))) + t_bytes(r"(?-u)\D"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( - t(r"(?-u)\S"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Space - ))) + t_bytes(r"(?-u)\S"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) ); assert_eq!( - t(r"(?-u)\W"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Word - ))) + t_bytes(r"(?-u)\W"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); assert_eq!( - t(r"(?i-u)\D"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Digit - ))) + t_bytes(r"(?i-u)\D"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( - t(r"(?i-u)\S"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Space - ))) + t_bytes(r"(?i-u)\S"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) ); assert_eq!( - t(r"(?i-u)\W"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Word - ))) + t_bytes(r"(?i-u)\W"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) + ); + + // ASCII only, 
negated, with UTF-8 mode enabled. + // In this case, negating any Perl class results in an error because + // all such classes can match invalid UTF-8. + assert_eq!( + t_err(r"(?-u)\D"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?-u)\S"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?-u)\W"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\D"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\S"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\W"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, ); } @@ -2360,16 +2643,7 @@ mod tests { #[test] #[cfg(feature = "unicode-gencat")] fn class_unicode_any_empty() { - assert_eq!( - t_err(r"\P{any}"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(0, 1, 1), - Position::new(7, 1, 8) - ), - } - ); + assert_eq!(t(r"\P{any}"), hir_uclass(&[]),); } #[test] @@ -2389,8 +2663,9 @@ mod tests { #[test] fn class_bracketed() { - assert_eq!(t("[a]"), hir_uclass(&[('a', 'a')])); - assert_eq!(t("[^[a]]"), hir_negate(hir_uclass(&[('a', 'a')]))); + assert_eq!(t("[a]"), hir_lit("a")); + assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')])); + assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')]))); assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')])); assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')])); assert_eq!(t("[a-fg-m]"), 
hir_uclass(&[('a', 'm')])); @@ -2453,11 +2728,11 @@ mod tests { ); assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),])); - assert_eq!(t("[^a]"), hir_negate(hir_uclass(&[('a', 'a')]))); - assert_eq!(t(r"[^\x00]"), hir_negate(hir_uclass(&[('\0', '\0')]))); + assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')]))); + assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')]))); assert_eq!( t_bytes("(?-u)[^a]"), - hir_negate(hir_bclass(&[(b'a', b'a')])) + class_negate(bclass(&[(b'a', b'a')])) ); #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] assert_eq!( @@ -2521,27 +2796,9 @@ mod tests { } ); #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] - assert_eq!( - t_err(r"[^\s\S]"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(0, 1, 1), - Position::new(7, 1, 8) - ), - } - ); + assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),); #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] - assert_eq!( - t_err(r"(?-u)[^\s\S]"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(5, 1, 6), - Position::new(12, 1, 13) - ), - } - ); + assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),); } #[test] @@ -2663,9 +2920,9 @@ mod tests { #[test] fn class_bracketed_nested() { - assert_eq!(t(r"[a[^c]]"), hir_negate(hir_uclass(&[('c', 'c')]))); - assert_eq!(t(r"[a-b[^c]]"), hir_negate(hir_uclass(&[('c', 'c')]))); - assert_eq!(t(r"[a-c[^c]]"), hir_negate(hir_uclass(&[]))); + assert_eq!(t(r"[a[^c]]"), class_negate(uclass(&[('c', 'c')]))); + assert_eq!(t(r"[a-b[^c]]"), class_negate(uclass(&[('c', 'c')]))); + assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[]))); assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')])); assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')])); @@ -2673,12 +2930,12 @@ mod tests { #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)[a[^c]]"), - hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))) + 
hir_negate(class_case_fold(uclass(&[('c', 'c')]))) ); #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)[a-b[^c]]"), - hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))) + hir_negate(class_case_fold(uclass(&[('c', 'c')]))) ); #[cfg(feature = "unicode-case")] @@ -2689,27 +2946,9 @@ mod tests { hir_uclass(&[('C', 'C'), ('c', 'c')]) ); - assert_eq!( - t_err(r"[^a-c[^c]]"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(0, 1, 1), - Position::new(10, 1, 11) - ), - } - ); + assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),); #[cfg(feature = "unicode-case")] - assert_eq!( - t_err(r"(?i)[^a-c[^c]]"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(4, 1, 5), - Position::new(14, 1, 15) - ), - } - ); + assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),); } #[test] @@ -2826,9 +3065,7 @@ mod tests { #[cfg(feature = "unicode-perl")] assert_eq!( t_bytes(r"(?-u)[^\w&&\d]"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Digit - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( t_bytes(r"(?-u)[^[a-z&&a-c]]"), @@ -2836,19 +3073,15 @@ mod tests { ); assert_eq!( t_bytes(r"(?-u)[^[\w&&\d]]"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Digit - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( t_bytes(r"(?-u)[^[^\w&&\d]]"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + hir_ascii_bclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Word - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); } @@ -2924,284 +3157,427 @@ mod tests { , # comment 10 # comment } # comment"), - hir_range( - true, - hir::RepetitionRange::Bounded(5, 10), - hir_lit("a") - ) + hir_range(true, 5, Some(10), hir_lit("a")) ); assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a ")); } 
#[test] - fn analysis_is_always_utf8() { + fn analysis_is_utf8() { // Positive examples. - assert!(t_bytes(r"a").is_always_utf8()); - assert!(t_bytes(r"ab").is_always_utf8()); - assert!(t_bytes(r"(?-u)a").is_always_utf8()); - assert!(t_bytes(r"(?-u)ab").is_always_utf8()); - assert!(t_bytes(r"\xFF").is_always_utf8()); - assert!(t_bytes(r"\xFF\xFF").is_always_utf8()); - assert!(t_bytes(r"[^a]").is_always_utf8()); - assert!(t_bytes(r"[^a][^a]").is_always_utf8()); - assert!(t_bytes(r"\b").is_always_utf8()); - assert!(t_bytes(r"\B").is_always_utf8()); - assert!(t_bytes(r"(?-u)\b").is_always_utf8()); + assert!(props_bytes(r"a").is_utf8()); + assert!(props_bytes(r"ab").is_utf8()); + assert!(props_bytes(r"(?-u)a").is_utf8()); + assert!(props_bytes(r"(?-u)ab").is_utf8()); + assert!(props_bytes(r"\xFF").is_utf8()); + assert!(props_bytes(r"\xFF\xFF").is_utf8()); + assert!(props_bytes(r"[^a]").is_utf8()); + assert!(props_bytes(r"[^a][^a]").is_utf8()); + assert!(props_bytes(r"\b").is_utf8()); + assert!(props_bytes(r"\B").is_utf8()); + assert!(props_bytes(r"(?-u)\b").is_utf8()); + assert!(props_bytes(r"(?-u)\B").is_utf8()); // Negative examples. 
- assert!(!t_bytes(r"(?-u)\xFF").is_always_utf8()); - assert!(!t_bytes(r"(?-u)\xFF\xFF").is_always_utf8()); - assert!(!t_bytes(r"(?-u)[^a]").is_always_utf8()); - assert!(!t_bytes(r"(?-u)[^a][^a]").is_always_utf8()); - assert!(!t_bytes(r"(?-u)\B").is_always_utf8()); + assert!(!props_bytes(r"(?-u)\xFF").is_utf8()); + assert!(!props_bytes(r"(?-u)\xFF\xFF").is_utf8()); + assert!(!props_bytes(r"(?-u)[^a]").is_utf8()); + assert!(!props_bytes(r"(?-u)[^a][^a]").is_utf8()); + } + + #[test] + fn analysis_captures_len() { + assert_eq!(0, props(r"a").explicit_captures_len()); + assert_eq!(0, props(r"(?:a)").explicit_captures_len()); + assert_eq!(0, props(r"(?i-u:a)").explicit_captures_len()); + assert_eq!(0, props(r"(?i-u)a").explicit_captures_len()); + assert_eq!(1, props(r"(a)").explicit_captures_len()); + assert_eq!(1, props(r"(?P<foo>a)").explicit_captures_len()); + assert_eq!(1, props(r"()").explicit_captures_len()); + assert_eq!(1, props(r"()a").explicit_captures_len()); + assert_eq!(1, props(r"(a)+").explicit_captures_len()); + assert_eq!(2, props(r"(a)(b)").explicit_captures_len()); + assert_eq!(2, props(r"(a)|(b)").explicit_captures_len()); + assert_eq!(2, props(r"((a))").explicit_captures_len()); + assert_eq!(1, props(r"([a&&b])").explicit_captures_len()); + } + + #[test] + fn analysis_static_captures_len() { + let len = |pattern| props(pattern).static_explicit_captures_len(); + assert_eq!(Some(0), len(r"")); + assert_eq!(Some(0), len(r"foo|bar")); + assert_eq!(None, len(r"(foo)|bar")); + assert_eq!(None, len(r"foo|(bar)")); + assert_eq!(Some(1), len(r"(foo|bar)")); + assert_eq!(Some(1), len(r"(a|b|c|d|e|f)")); + assert_eq!(Some(1), len(r"(a)|(b)|(c)|(d)|(e)|(f)")); + assert_eq!(Some(2), len(r"(a)(b)|(c)(d)|(e)(f)")); + assert_eq!(Some(6), len(r"(a)(b)(c)(d)(e)(f)")); + assert_eq!(Some(3), len(r"(a)(b)(extra)|(a)(b)()")); + assert_eq!(Some(3), len(r"(a)(b)((?:extra)?)")); + assert_eq!(None, len(r"(a)(b)(extra)?")); + assert_eq!(Some(1), len(r"(foo)|(bar)")); + 
assert_eq!(Some(2), len(r"(foo)(bar)")); + assert_eq!(Some(2), len(r"(foo)+(bar)")); + assert_eq!(None, len(r"(foo)*(bar)")); + assert_eq!(Some(0), len(r"(foo)?{0}")); + assert_eq!(None, len(r"(foo)?{1}")); + assert_eq!(Some(1), len(r"(foo){1}")); + assert_eq!(Some(1), len(r"(foo){1,}")); + assert_eq!(Some(1), len(r"(foo){1,}?")); + assert_eq!(None, len(r"(foo){1,}??")); + assert_eq!(None, len(r"(foo){0,}")); + assert_eq!(Some(1), len(r"(foo)(?:bar)")); + assert_eq!(Some(2), len(r"(foo(?:bar)+)(?:baz(boo))")); + assert_eq!(Some(2), len(r"(?P<bar>foo)(?:bar)(bal|loon)")); + assert_eq!( + Some(2), + len(r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#) + ); } #[test] fn analysis_is_all_assertions() { // Positive examples. - assert!(t(r"\b").is_all_assertions()); - assert!(t(r"\B").is_all_assertions()); - assert!(t(r"^").is_all_assertions()); - assert!(t(r"$").is_all_assertions()); - assert!(t(r"\A").is_all_assertions()); - assert!(t(r"\z").is_all_assertions()); - assert!(t(r"$^\z\A\b\B").is_all_assertions()); - assert!(t(r"$|^|\z|\A|\b|\B").is_all_assertions()); - assert!(t(r"^$|$^").is_all_assertions()); - assert!(t(r"((\b)+())*^").is_all_assertions()); + let p = props(r"\b"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"\B"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"^"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"$"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"\A"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"\z"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"$^\z\A\b\B"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"$|^|\z|\A|\b|\B"); + assert!(!p.look_set().is_empty()); + 
assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"^$|$^"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"((\b)+())*^"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); // Negative examples. - assert!(!t(r"^a").is_all_assertions()); + let p = props(r"^a"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(1)); + } + + #[test] + fn analysis_look_set_prefix_any() { + let p = props(r"(?-u)(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))"); + assert!(p.look_set_prefix_any().contains(Look::WordAscii)); } #[test] fn analysis_is_anchored() { + let is_start = |p| props(p).look_set_prefix().contains(Look::Start); + let is_end = |p| props(p).look_set_suffix().contains(Look::End); + // Positive examples. - assert!(t(r"^").is_anchored_start()); - assert!(t(r"$").is_anchored_end()); - assert!(t(r"^").is_line_anchored_start()); - assert!(t(r"$").is_line_anchored_end()); - - assert!(t(r"^^").is_anchored_start()); - assert!(t(r"$$").is_anchored_end()); - assert!(t(r"^^").is_line_anchored_start()); - assert!(t(r"$$").is_line_anchored_end()); - - assert!(t(r"^$").is_anchored_start()); - assert!(t(r"^$").is_anchored_end()); - assert!(t(r"^$").is_line_anchored_start()); - assert!(t(r"^$").is_line_anchored_end()); - - assert!(t(r"^foo").is_anchored_start()); - assert!(t(r"foo$").is_anchored_end()); - assert!(t(r"^foo").is_line_anchored_start()); - assert!(t(r"foo$").is_line_anchored_end()); - - assert!(t(r"^foo|^bar").is_anchored_start()); - assert!(t(r"foo$|bar$").is_anchored_end()); - assert!(t(r"^foo|^bar").is_line_anchored_start()); - assert!(t(r"foo$|bar$").is_line_anchored_end()); - - assert!(t(r"^(foo|bar)").is_anchored_start()); - assert!(t(r"(foo|bar)$").is_anchored_end()); - assert!(t(r"^(foo|bar)").is_line_anchored_start()); - assert!(t(r"(foo|bar)$").is_line_anchored_end()); - - assert!(t(r"^+").is_anchored_start()); - assert!(t(r"$+").is_anchored_end()); - 
assert!(t(r"^+").is_line_anchored_start()); - assert!(t(r"$+").is_line_anchored_end()); - assert!(t(r"^++").is_anchored_start()); - assert!(t(r"$++").is_anchored_end()); - assert!(t(r"^++").is_line_anchored_start()); - assert!(t(r"$++").is_line_anchored_end()); - assert!(t(r"(^)+").is_anchored_start()); - assert!(t(r"($)+").is_anchored_end()); - assert!(t(r"(^)+").is_line_anchored_start()); - assert!(t(r"($)+").is_line_anchored_end()); - - assert!(t(r"$^").is_anchored_start()); - assert!(t(r"$^").is_anchored_start()); - assert!(t(r"$^").is_line_anchored_end()); - assert!(t(r"$^").is_line_anchored_end()); - assert!(t(r"$^|^$").is_anchored_start()); - assert!(t(r"$^|^$").is_anchored_end()); - assert!(t(r"$^|^$").is_line_anchored_start()); - assert!(t(r"$^|^$").is_line_anchored_end()); - - assert!(t(r"\b^").is_anchored_start()); - assert!(t(r"$\b").is_anchored_end()); - assert!(t(r"\b^").is_line_anchored_start()); - assert!(t(r"$\b").is_line_anchored_end()); - assert!(t(r"^(?m:^)").is_anchored_start()); - assert!(t(r"(?m:$)$").is_anchored_end()); - assert!(t(r"^(?m:^)").is_line_anchored_start()); - assert!(t(r"(?m:$)$").is_line_anchored_end()); - assert!(t(r"(?m:^)^").is_anchored_start()); - assert!(t(r"$(?m:$)").is_anchored_end()); - assert!(t(r"(?m:^)^").is_line_anchored_start()); - assert!(t(r"$(?m:$)").is_line_anchored_end()); + assert!(is_start(r"^")); + assert!(is_end(r"$")); - // Negative examples. 
- assert!(!t(r"(?m)^").is_anchored_start()); - assert!(!t(r"(?m)$").is_anchored_end()); - assert!(!t(r"(?m:^$)|$^").is_anchored_start()); - assert!(!t(r"(?m:^$)|$^").is_anchored_end()); - assert!(!t(r"$^|(?m:^$)").is_anchored_start()); - assert!(!t(r"$^|(?m:^$)").is_anchored_end()); - - assert!(!t(r"a^").is_anchored_start()); - assert!(!t(r"$a").is_anchored_start()); - assert!(!t(r"a^").is_line_anchored_start()); - assert!(!t(r"$a").is_line_anchored_start()); - - assert!(!t(r"a^").is_anchored_end()); - assert!(!t(r"$a").is_anchored_end()); - assert!(!t(r"a^").is_line_anchored_end()); - assert!(!t(r"$a").is_line_anchored_end()); - - assert!(!t(r"^foo|bar").is_anchored_start()); - assert!(!t(r"foo|bar$").is_anchored_end()); - assert!(!t(r"^foo|bar").is_line_anchored_start()); - assert!(!t(r"foo|bar$").is_line_anchored_end()); - - assert!(!t(r"^*").is_anchored_start()); - assert!(!t(r"$*").is_anchored_end()); - assert!(!t(r"^*").is_line_anchored_start()); - assert!(!t(r"$*").is_line_anchored_end()); - assert!(!t(r"^*+").is_anchored_start()); - assert!(!t(r"$*+").is_anchored_end()); - assert!(!t(r"^*+").is_line_anchored_start()); - assert!(!t(r"$*+").is_line_anchored_end()); - assert!(!t(r"^+*").is_anchored_start()); - assert!(!t(r"$+*").is_anchored_end()); - assert!(!t(r"^+*").is_line_anchored_start()); - assert!(!t(r"$+*").is_line_anchored_end()); - assert!(!t(r"(^)*").is_anchored_start()); - assert!(!t(r"($)*").is_anchored_end()); - assert!(!t(r"(^)*").is_line_anchored_start()); - assert!(!t(r"($)*").is_line_anchored_end()); - } + assert!(is_start(r"^^")); + assert!(props(r"$$").look_set_suffix().contains(Look::End)); - #[test] - fn analysis_is_line_anchored() { - assert!(t(r"(?m)^(foo|bar)").is_line_anchored_start()); - assert!(t(r"(?m)(foo|bar)$").is_line_anchored_end()); + assert!(is_start(r"^$")); + assert!(is_end(r"^$")); - assert!(t(r"(?m)^foo|^bar").is_line_anchored_start()); - assert!(t(r"(?m)foo$|bar$").is_line_anchored_end()); + assert!(is_start(r"^foo")); 
+ assert!(is_end(r"foo$")); + + assert!(is_start(r"^foo|^bar")); + assert!(is_end(r"foo$|bar$")); + + assert!(is_start(r"^(foo|bar)")); + assert!(is_end(r"(foo|bar)$")); + + assert!(is_start(r"^+")); + assert!(is_end(r"$+")); + assert!(is_start(r"^++")); + assert!(is_end(r"$++")); + assert!(is_start(r"(^)+")); + assert!(is_end(r"($)+")); + + assert!(is_start(r"$^")); + assert!(is_start(r"$^")); + assert!(is_start(r"$^|^$")); + assert!(is_end(r"$^|^$")); + + assert!(is_start(r"\b^")); + assert!(is_end(r"$\b")); + assert!(is_start(r"^(?m:^)")); + assert!(is_end(r"(?m:$)$")); + assert!(is_start(r"(?m:^)^")); + assert!(is_end(r"$(?m:$)")); + + // Negative examples. + assert!(!is_start(r"(?m)^")); + assert!(!is_end(r"(?m)$")); + assert!(!is_start(r"(?m:^$)|$^")); + assert!(!is_end(r"(?m:^$)|$^")); + assert!(!is_start(r"$^|(?m:^$)")); + assert!(!is_end(r"$^|(?m:^$)")); - assert!(t(r"(?m)^").is_line_anchored_start()); - assert!(t(r"(?m)$").is_line_anchored_end()); + assert!(!is_start(r"a^")); + assert!(!is_start(r"$a")); - assert!(t(r"(?m:^$)|$^").is_line_anchored_start()); - assert!(t(r"(?m:^$)|$^").is_line_anchored_end()); + assert!(!is_end(r"a^")); + assert!(!is_end(r"$a")); - assert!(t(r"$^|(?m:^$)").is_line_anchored_start()); - assert!(t(r"$^|(?m:^$)").is_line_anchored_end()); + assert!(!is_start(r"^foo|bar")); + assert!(!is_end(r"foo|bar$")); + + assert!(!is_start(r"^*")); + assert!(!is_end(r"$*")); + assert!(!is_start(r"^*+")); + assert!(!is_end(r"$*+")); + assert!(!is_start(r"^+*")); + assert!(!is_end(r"$+*")); + assert!(!is_start(r"(^)*")); + assert!(!is_end(r"($)*")); } #[test] fn analysis_is_any_anchored() { + let is_start = |p| props(p).look_set().contains(Look::Start); + let is_end = |p| props(p).look_set().contains(Look::End); + // Positive examples. 
- assert!(t(r"^").is_any_anchored_start()); - assert!(t(r"$").is_any_anchored_end()); - assert!(t(r"\A").is_any_anchored_start()); - assert!(t(r"\z").is_any_anchored_end()); + assert!(is_start(r"^")); + assert!(is_end(r"$")); + assert!(is_start(r"\A")); + assert!(is_end(r"\z")); // Negative examples. - assert!(!t(r"(?m)^").is_any_anchored_start()); - assert!(!t(r"(?m)$").is_any_anchored_end()); - assert!(!t(r"$").is_any_anchored_start()); - assert!(!t(r"^").is_any_anchored_end()); + assert!(!is_start(r"(?m)^")); + assert!(!is_end(r"(?m)$")); + assert!(!is_start(r"$")); + assert!(!is_end(r"^")); } #[test] - fn analysis_is_match_empty() { + fn analysis_can_empty() { // Positive examples. - assert!(t(r"").is_match_empty()); - assert!(t(r"()").is_match_empty()); - assert!(t(r"()*").is_match_empty()); - assert!(t(r"()+").is_match_empty()); - assert!(t(r"()?").is_match_empty()); - assert!(t(r"a*").is_match_empty()); - assert!(t(r"a?").is_match_empty()); - assert!(t(r"a{0}").is_match_empty()); - assert!(t(r"a{0,}").is_match_empty()); - assert!(t(r"a{0,1}").is_match_empty()); - assert!(t(r"a{0,10}").is_match_empty()); + let assert_empty = + |p| assert_eq!(Some(0), props_bytes(p).minimum_len()); + assert_empty(r""); + assert_empty(r"()"); + assert_empty(r"()*"); + assert_empty(r"()+"); + assert_empty(r"()?"); + assert_empty(r"a*"); + assert_empty(r"a?"); + assert_empty(r"a{0}"); + assert_empty(r"a{0,}"); + assert_empty(r"a{0,1}"); + assert_empty(r"a{0,10}"); #[cfg(feature = "unicode-gencat")] - assert!(t(r"\pL*").is_match_empty()); - assert!(t(r"a*|b").is_match_empty()); - assert!(t(r"b|a*").is_match_empty()); - assert!(t(r"a|").is_match_empty()); - assert!(t(r"|a").is_match_empty()); - assert!(t(r"a||b").is_match_empty()); - assert!(t(r"a*a?(abcd)*").is_match_empty()); - assert!(t(r"^").is_match_empty()); - assert!(t(r"$").is_match_empty()); - assert!(t(r"(?m)^").is_match_empty()); - assert!(t(r"(?m)$").is_match_empty()); - assert!(t(r"\A").is_match_empty()); - 
assert!(t(r"\z").is_match_empty()); - assert!(t(r"\B").is_match_empty()); - assert!(t_bytes(r"(?-u)\B").is_match_empty()); - assert!(t(r"\b").is_match_empty()); - assert!(t(r"(?-u)\b").is_match_empty()); + assert_empty(r"\pL*"); + assert_empty(r"a*|b"); + assert_empty(r"b|a*"); + assert_empty(r"a|"); + assert_empty(r"|a"); + assert_empty(r"a||b"); + assert_empty(r"a*a?(abcd)*"); + assert_empty(r"^"); + assert_empty(r"$"); + assert_empty(r"(?m)^"); + assert_empty(r"(?m)$"); + assert_empty(r"\A"); + assert_empty(r"\z"); + assert_empty(r"\B"); + assert_empty(r"(?-u)\B"); + assert_empty(r"\b"); + assert_empty(r"(?-u)\b"); // Negative examples. - assert!(!t(r"a+").is_match_empty()); - assert!(!t(r"a{1}").is_match_empty()); - assert!(!t(r"a{1,}").is_match_empty()); - assert!(!t(r"a{1,2}").is_match_empty()); - assert!(!t(r"a{1,10}").is_match_empty()); - assert!(!t(r"b|a").is_match_empty()); - assert!(!t(r"a*a+(abcd)*").is_match_empty()); + let assert_non_empty = + |p| assert_ne!(Some(0), props_bytes(p).minimum_len()); + assert_non_empty(r"a+"); + assert_non_empty(r"a{1}"); + assert_non_empty(r"a{1,}"); + assert_non_empty(r"a{1,2}"); + assert_non_empty(r"a{1,10}"); + assert_non_empty(r"b|a"); + assert_non_empty(r"a*a+(abcd)*"); + #[cfg(feature = "unicode-gencat")] + assert_non_empty(r"\P{any}"); + assert_non_empty(r"[a--a]"); + assert_non_empty(r"[a&&b]"); } #[test] fn analysis_is_literal() { // Positive examples. - assert!(t(r"a").is_literal()); - assert!(t(r"ab").is_literal()); - assert!(t(r"abc").is_literal()); - assert!(t(r"(?m)abc").is_literal()); + assert!(props(r"a").is_literal()); + assert!(props(r"ab").is_literal()); + assert!(props(r"abc").is_literal()); + assert!(props(r"(?m)abc").is_literal()); + assert!(props(r"(?:a)").is_literal()); + assert!(props(r"foo(?:a)").is_literal()); + assert!(props(r"(?:a)foo").is_literal()); + assert!(props(r"[a]").is_literal()); // Negative examples. 
- assert!(!t(r"").is_literal()); - assert!(!t(r"^").is_literal()); - assert!(!t(r"a|b").is_literal()); - assert!(!t(r"(a)").is_literal()); - assert!(!t(r"a+").is_literal()); - assert!(!t(r"foo(a)").is_literal()); - assert!(!t(r"(a)foo").is_literal()); - assert!(!t(r"[a]").is_literal()); + assert!(!props(r"").is_literal()); + assert!(!props(r"^").is_literal()); + assert!(!props(r"a|b").is_literal()); + assert!(!props(r"(a)").is_literal()); + assert!(!props(r"a+").is_literal()); + assert!(!props(r"foo(a)").is_literal()); + assert!(!props(r"(a)foo").is_literal()); + assert!(!props(r"[ab]").is_literal()); } #[test] fn analysis_is_alternation_literal() { // Positive examples. - assert!(t(r"a").is_alternation_literal()); - assert!(t(r"ab").is_alternation_literal()); - assert!(t(r"abc").is_alternation_literal()); - assert!(t(r"(?m)abc").is_alternation_literal()); - assert!(t(r"a|b").is_alternation_literal()); - assert!(t(r"a|b|c").is_alternation_literal()); - assert!(t(r"foo|bar").is_alternation_literal()); - assert!(t(r"foo|bar|baz").is_alternation_literal()); + assert!(props(r"a").is_alternation_literal()); + assert!(props(r"ab").is_alternation_literal()); + assert!(props(r"abc").is_alternation_literal()); + assert!(props(r"(?m)abc").is_alternation_literal()); + assert!(props(r"foo|bar").is_alternation_literal()); + assert!(props(r"foo|bar|baz").is_alternation_literal()); + assert!(props(r"[a]").is_alternation_literal()); + assert!(props(r"(?:ab)|cd").is_alternation_literal()); + assert!(props(r"ab|(?:cd)").is_alternation_literal()); // Negative examples. 
- assert!(!t(r"").is_alternation_literal()); - assert!(!t(r"^").is_alternation_literal()); - assert!(!t(r"(a)").is_alternation_literal()); - assert!(!t(r"a+").is_alternation_literal()); - assert!(!t(r"foo(a)").is_alternation_literal()); - assert!(!t(r"(a)foo").is_alternation_literal()); - assert!(!t(r"[a]").is_alternation_literal()); - assert!(!t(r"[a]|b").is_alternation_literal()); - assert!(!t(r"a|[b]").is_alternation_literal()); - assert!(!t(r"(a)|b").is_alternation_literal()); - assert!(!t(r"a|(b)").is_alternation_literal()); + assert!(!props(r"").is_alternation_literal()); + assert!(!props(r"^").is_alternation_literal()); + assert!(!props(r"(a)").is_alternation_literal()); + assert!(!props(r"a+").is_alternation_literal()); + assert!(!props(r"foo(a)").is_alternation_literal()); + assert!(!props(r"(a)foo").is_alternation_literal()); + assert!(!props(r"[ab]").is_alternation_literal()); + assert!(!props(r"[ab]|b").is_alternation_literal()); + assert!(!props(r"a|[ab]").is_alternation_literal()); + assert!(!props(r"(a)|b").is_alternation_literal()); + assert!(!props(r"a|(b)").is_alternation_literal()); + assert!(!props(r"a|b").is_alternation_literal()); + assert!(!props(r"a|b|c").is_alternation_literal()); + assert!(!props(r"[a]|b").is_alternation_literal()); + assert!(!props(r"a|[b]").is_alternation_literal()); + assert!(!props(r"(?:a)|b").is_alternation_literal()); + assert!(!props(r"a|(?:b)").is_alternation_literal()); + assert!(!props(r"(?:z|xx)@|xx").is_alternation_literal()); + } + + // This tests that the smart Hir::concat constructor simplifies the given + // exprs in a way we expect. 
+ #[test] + fn smart_concat() { + assert_eq!(t(""), Hir::empty()); + assert_eq!(t("(?:)"), Hir::empty()); + assert_eq!(t("abc"), hir_lit("abc")); + assert_eq!(t("(?:foo)(?:bar)"), hir_lit("foobar")); + assert_eq!(t("quux(?:foo)(?:bar)baz"), hir_lit("quuxfoobarbaz")); + assert_eq!( + t("foo(?:bar^baz)quux"), + hir_cat(vec![ + hir_lit("foobar"), + hir_look(hir::Look::Start), + hir_lit("bazquux"), + ]) + ); + assert_eq!( + t("foo(?:ba(?:r^b)az)quux"), + hir_cat(vec![ + hir_lit("foobar"), + hir_look(hir::Look::Start), + hir_lit("bazquux"), + ]) + ); + } + + // This tests that the smart Hir::alternation constructor simplifies the + // given exprs in a way we expect. + #[test] + fn smart_alternation() { + assert_eq!( + t("(?:foo)|(?:bar)"), + hir_alt(vec![hir_lit("foo"), hir_lit("bar")]) + ); + assert_eq!( + t("quux|(?:abc|def|xyz)|baz"), + hir_alt(vec![ + hir_lit("quux"), + hir_lit("abc"), + hir_lit("def"), + hir_lit("xyz"), + hir_lit("baz"), + ]) + ); + assert_eq!( + t("quux|(?:abc|(?:def|mno)|xyz)|baz"), + hir_alt(vec![ + hir_lit("quux"), + hir_lit("abc"), + hir_lit("def"), + hir_lit("mno"), + hir_lit("xyz"), + hir_lit("baz"), + ]) + ); + assert_eq!( + t("a|b|c|d|e|f|x|y|z"), + hir_uclass(&[('a', 'f'), ('x', 'z')]), + ); + // Tests that we lift common prefixes out of an alternation. 
+ assert_eq!( + t("[A-Z]foo|[A-Z]quux"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![hir_lit("foo"), hir_lit("quux")]), + ]), + ); + assert_eq!( + t("[A-Z][A-Z]|[A-Z]quux"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![hir_uclass(&[('A', 'Z')]), hir_lit("quux")]), + ]), + ); + assert_eq!( + t("[A-Z][A-Z]|[A-Z][A-Z]quux"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![Hir::empty(), hir_lit("quux")]), + ]), + ); + assert_eq!( + t("[A-Z]foo|[A-Z]foobar"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![hir_lit("foo"), hir_lit("foobar")]), + ]), + ); } } diff --git a/vendor/regex-syntax/src/hir/visitor.rs b/vendor/regex-syntax/src/hir/visitor.rs index 4f5a70909..e5f15cf1c 100644 --- a/vendor/regex-syntax/src/hir/visitor.rs +++ b/vendor/regex-syntax/src/hir/visitor.rs @@ -1,3 +1,5 @@ +use alloc::{vec, vec::Vec}; + use crate::hir::{self, Hir, HirKind}; /// A trait for visiting the high-level IR (HIR) in depth first order. @@ -9,7 +11,7 @@ use crate::hir::{self, Hir, HirKind}; /// important since the size of an HIR may be proportional to end user input. /// /// Typical usage of this trait involves providing an implementation and then -/// running it using the [`visit`](fn.visit.html) function. +/// running it using the [`visit`] function. pub trait Visitor { /// The result of visiting an HIR. type Output; @@ -44,8 +46,7 @@ pub trait Visitor { /// Executes an implementation of `Visitor` in constant stack space. /// /// This function will visit every node in the given `Hir` while calling -/// appropriate methods provided by the -/// [`Visitor`](trait.Visitor.html) trait. +/// appropriate methods provided by the [`Visitor`] trait. 
/// /// The primary use case for this method is when one wants to perform case /// analysis over an `Hir` without using a stack size proportional to the depth @@ -74,9 +75,9 @@ enum Frame<'a> { /// A stack frame allocated just before descending into a repetition /// operator's child node. Repetition(&'a hir::Repetition), - /// A stack frame allocated just before descending into a group's child + /// A stack frame allocated just before descending into a capture's child /// node. - Group(&'a hir::Group), + Capture(&'a hir::Capture), /// The stack frame used while visiting every child node of a concatenation /// of expressions. Concat { @@ -149,7 +150,7 @@ impl<'a> HeapVisitor<'a> { fn induct(&mut self, hir: &'a Hir) -> Option<Frame<'a>> { match *hir.kind() { HirKind::Repetition(ref x) => Some(Frame::Repetition(x)), - HirKind::Group(ref x) => Some(Frame::Group(x)), + HirKind::Capture(ref x) => Some(Frame::Capture(x)), HirKind::Concat(ref x) if x.is_empty() => None, HirKind::Concat(ref x) => { Some(Frame::Concat { head: &x[0], tail: &x[1..] }) @@ -167,7 +168,7 @@ impl<'a> HeapVisitor<'a> { fn pop(&self, induct: Frame<'a>) -> Option<Frame<'a>> { match induct { Frame::Repetition(_) => None, - Frame::Group(_) => None, + Frame::Capture(_) => None, Frame::Concat { tail, .. } => { if tail.is_empty() { None @@ -194,8 +195,8 @@ impl<'a> Frame<'a> { /// child HIR node to visit. fn child(&self) -> &'a Hir { match *self { - Frame::Repetition(rep) => &rep.hir, - Frame::Group(group) => &group.hir, + Frame::Repetition(rep) => &rep.sub, + Frame::Capture(capture) => &capture.sub, Frame::Concat { head, .. } => head, Frame::Alternation { head, .. } => head, } |