diff options
Diffstat (limited to 'vendor/regex-syntax/src/hir')
-rw-r--r-- | vendor/regex-syntax/src/hir/interval.rs | 581 | ||||
-rw-r--r-- | vendor/regex-syntax/src/hir/literal.rs | 3214 | ||||
-rw-r--r-- | vendor/regex-syntax/src/hir/mod.rs | 3861 | ||||
-rw-r--r-- | vendor/regex-syntax/src/hir/print.rs | 608 | ||||
-rw-r--r-- | vendor/regex-syntax/src/hir/translate.rs | 3724 | ||||
-rw-r--r-- | vendor/regex-syntax/src/hir/visitor.rs | 215 |
6 files changed, 12203 insertions, 0 deletions
diff --git a/vendor/regex-syntax/src/hir/interval.rs b/vendor/regex-syntax/src/hir/interval.rs new file mode 100644 index 0000000..e063390 --- /dev/null +++ b/vendor/regex-syntax/src/hir/interval.rs @@ -0,0 +1,581 @@ +use core::{char, cmp, fmt::Debug, slice}; + +use alloc::vec::Vec; + +use crate::unicode; + +// This module contains an *internal* implementation of interval sets. +// +// The primary invariant that interval sets guards is canonical ordering. That +// is, every interval set contains an ordered sequence of intervals where +// no two intervals are overlapping or adjacent. While this invariant is +// occasionally broken within the implementation, it should be impossible for +// callers to observe it. +// +// Since case folding (as implemented below) breaks that invariant, we roll +// that into this API even though it is a little out of place in an otherwise +// generic interval set. (Hence the reason why the `unicode` module is imported +// here.) +// +// Some of the implementation complexity here is a result of me wanting to +// preserve the sequential representation without using additional memory. +// In many cases, we do use linear extra memory, but it is at most 2x and it +// is amortized. If we relaxed the memory requirements, this implementation +// could become much simpler. The extra memory is honestly probably OK, but +// character classes (especially of the Unicode variety) can become quite +// large, and it would be nice to keep regex compilation snappy even in debug +// builds. (In the past, I have been careless with this area of code and it has +// caused slow regex compilations in debug mode, so this isn't entirely +// unwarranted.) +// +// Tests on this are relegated to the public API of HIR in src/hir.rs. + +#[derive(Clone, Debug)] +pub struct IntervalSet<I> { + /// A sorted set of non-overlapping ranges. + ranges: Vec<I>, + /// While not required at all for correctness, we keep track of whether an + /// interval set has been case folded or not. This helps us avoid doing + /// redundant work if, for example, a set has already been cased folded. + /// And note that whether a set is folded or not is preserved through + /// all of the pairwise set operations. That is, if both interval sets + /// have been case folded, then any of difference, union, intersection or + /// symmetric difference all produce a case folded set. + /// + /// Note that when this is true, it *must* be the case that the set is case + /// folded. But when it's false, the set *may* be case folded. In other + /// words, we only set this to true when we know it to be case, but we're + /// okay with it being false if it would otherwise be costly to determine + /// whether it should be true. This means code cannot assume that a false + /// value necessarily indicates that the set is not case folded. + /// + /// Bottom line: this is a performance optimization. + folded: bool, +} + +impl<I: Interval> Eq for IntervalSet<I> {} + +// We implement PartialEq manually so that we don't consider the set's internal +// 'folded' property to be part of its identity. The 'folded' property is +// strictly an optimization. +impl<I: Interval> PartialEq for IntervalSet<I> { + fn eq(&self, other: &IntervalSet<I>) -> bool { + self.ranges.eq(&other.ranges) + } +} + +impl<I: Interval> IntervalSet<I> { + /// Create a new set from a sequence of intervals. Each interval is + /// specified as a pair of bounds, where both bounds are inclusive. + /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. + pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> { + let ranges: Vec<I> = intervals.into_iter().collect(); + // An empty set is case folded. + let folded = ranges.is_empty(); + let mut set = IntervalSet { ranges, folded }; + set.canonicalize(); + set + } + + /// Add a new interval to this set. + pub fn push(&mut self, interval: I) { + // TODO: This could be faster. e.g., Push the interval such that + // it preserves canonicalization. + self.ranges.push(interval); + self.canonicalize(); + // We don't know whether the new interval added here is considered + // case folded, so we conservatively assume that the entire set is + // no longer case folded if it was previously. + self.folded = false; + } + + /// Return an iterator over all intervals in this set. + /// + /// The iterator yields intervals in ascending order. + pub fn iter(&self) -> IntervalSetIter<'_, I> { + IntervalSetIter(self.ranges.iter()) + } + + /// Return an immutable slice of intervals in this set. + /// + /// The sequence returned is in canonical ordering. + pub fn intervals(&self) -> &[I] { + &self.ranges + } + + /// Expand this interval set such that it contains all case folded + /// characters. For example, if this class consists of the range `a-z`, + /// then applying case folding will result in the class containing both the + /// ranges `a-z` and `A-Z`. + /// + /// This returns an error if the necessary case mapping data is not + /// available. + pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> { + if self.folded { + return Ok(()); + } + let len = self.ranges.len(); + for i in 0..len { + let range = self.ranges[i]; + if let Err(err) = range.case_fold_simple(&mut self.ranges) { + self.canonicalize(); + return Err(err); + } + } + self.canonicalize(); + self.folded = true; + Ok(()) + } + + /// Union this set with the given set, in place. + pub fn union(&mut self, other: &IntervalSet<I>) { + if other.ranges.is_empty() || self.ranges == other.ranges { + return; + } + // This could almost certainly be done more efficiently. + self.ranges.extend(&other.ranges); + self.canonicalize(); + self.folded = self.folded && other.folded; + } + + /// Intersect this set with the given set, in place. + pub fn intersect(&mut self, other: &IntervalSet<I>) { + if self.ranges.is_empty() { + return; + } + if other.ranges.is_empty() { + self.ranges.clear(); + // An empty set is case folded. + self.folded = true; + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the intersection to the end of this range, and then drain it before + // we're done. + let drain_end = self.ranges.len(); + + let mut ita = 0..drain_end; + let mut itb = 0..other.ranges.len(); + let mut a = ita.next().unwrap(); + let mut b = itb.next().unwrap(); + loop { + if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) { + self.ranges.push(ab); + } + let (it, aorb) = + if self.ranges[a].upper() < other.ranges[b].upper() { + (&mut ita, &mut a) + } else { + (&mut itb, &mut b) + }; + match it.next() { + Some(v) => *aorb = v, + None => break, + } + } + self.ranges.drain(..drain_end); + self.folded = self.folded && other.folded; + } + + /// Subtract the given set from this set, in place. + pub fn difference(&mut self, other: &IntervalSet<I>) { + if self.ranges.is_empty() || other.ranges.is_empty() { + return; + } + + // This algorithm is (to me) surprisingly complex. A search of the + // interwebs indicate that this is a potentially interesting problem. + // Folks seem to suggest interval or segment trees, but I'd like to + // avoid the overhead (both runtime and conceptual) of that. + // + // The following is basically my Shitty First Draft. Therefore, in + // order to grok it, you probably need to read each line carefully. + // Simplifications are most welcome! + // + // Remember, we can assume the canonical format invariant here, which + // says that all ranges are sorted, not overlapping and not adjacent in + // each class. + let drain_end = self.ranges.len(); + let (mut a, mut b) = (0, 0); + 'LOOP: while a < drain_end && b < other.ranges.len() { + // Basically, the easy cases are when neither range overlaps with + // each other. If the `b` range is less than our current `a` + // range, then we can skip it and move on. + if other.ranges[b].upper() < self.ranges[a].lower() { + b += 1; + continue; + } + // ... similarly for the `a` range. If it's less than the smallest + // `b` range, then we can add it as-is. + if self.ranges[a].upper() < other.ranges[b].lower() { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; + continue; + } + // Otherwise, we have overlapping ranges. + assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b])); + + // This part is tricky and was non-obvious to me without looking + // at explicit examples (see the tests). The trickiness stems from + // two things: 1) subtracting a range from another range could + // yield two ranges and 2) after subtracting a range, it's possible + // that future ranges can have an impact. The loop below advances + // the `b` ranges until they can't possible impact the current + // range. + // + // For example, if our `a` range is `a-t` and our next three `b` + // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply + // subtraction three times before moving on to the next `a` range. + let mut range = self.ranges[a]; + while b < other.ranges.len() + && !range.is_intersection_empty(&other.ranges[b]) + { + let old_range = range; + range = match range.difference(&other.ranges[b]) { + (None, None) => { + // We lost the entire range, so move on to the next + // without adding this one. + a += 1; + continue 'LOOP; + } + (Some(range1), None) | (None, Some(range1)) => range1, + (Some(range1), Some(range2)) => { + self.ranges.push(range1); + range2 + } + }; + // It's possible that the `b` range has more to contribute + // here. In particular, if it is greater than the original + // range, then it might impact the next `a` range *and* it + // has impacted the current `a` range as much as possible, + // so we can quit. We don't bump `b` so that the next `a` + // range can apply it. + if other.ranges[b].upper() > old_range.upper() { + break; + } + // Otherwise, the next `b` range might apply to the current + // `a` range. + b += 1; + } + self.ranges.push(range); + a += 1; + } + while a < drain_end { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; + } + self.ranges.drain(..drain_end); + self.folded = self.folded && other.folded; + } + + /// Compute the symmetric difference of the two sets, in place. + /// + /// This computes the symmetric difference of two interval sets. This + /// removes all elements in this set that are also in the given set, + /// but also adds all elements from the given set that aren't in this + /// set. That is, the set will contain all elements in either set, + /// but will not contain any elements that are in both sets. + pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) { + // TODO(burntsushi): Fix this so that it amortizes allocation. + let mut intersection = self.clone(); + intersection.intersect(other); + self.union(other); + self.difference(&intersection); + } + + /// Negate this interval set. + /// + /// For all `x` where `x` is any element, if `x` was in this set, then it + /// will not be in this set after negation. + pub fn negate(&mut self) { + if self.ranges.is_empty() { + let (min, max) = (I::Bound::min_value(), I::Bound::max_value()); + self.ranges.push(I::create(min, max)); + // The set containing everything must case folded. + self.folded = true; + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the negation to the end of this range, and then drain it before + // we're done. + let drain_end = self.ranges.len(); + + // We do checked arithmetic below because of the canonical ordering + // invariant. + if self.ranges[0].lower() > I::Bound::min_value() { + let upper = self.ranges[0].lower().decrement(); + self.ranges.push(I::create(I::Bound::min_value(), upper)); + } + for i in 1..drain_end { + let lower = self.ranges[i - 1].upper().increment(); + let upper = self.ranges[i].lower().decrement(); + self.ranges.push(I::create(lower, upper)); + } + if self.ranges[drain_end - 1].upper() < I::Bound::max_value() { + let lower = self.ranges[drain_end - 1].upper().increment(); + self.ranges.push(I::create(lower, I::Bound::max_value())); + } + self.ranges.drain(..drain_end); + // We don't need to update whether this set is folded or not, because + // it is conservatively preserved through negation. Namely, if a set + // is not folded, then it is possible that its negation is folded, for + // example, [^☃]. But we're fine with assuming that the set is not + // folded in that case. (`folded` permits false negatives but not false + // positives.) + // + // But what about when a set is folded, is its negation also + // necessarily folded? Yes. Because if a set is folded, then for every + // character in the set, it necessarily included its equivalence class + // of case folded characters. Negating it in turn means that all + // equivalence classes in the set are negated, and any equivalence + // class that was previously not in the set is now entirely in the set. + } + + /// Converts this set into a canonical ordering. + fn canonicalize(&mut self) { + if self.is_canonical() { + return; + } + self.ranges.sort(); + assert!(!self.ranges.is_empty()); + + // Is there a way to do this in-place with constant memory? I couldn't + // figure out a way to do it. So just append the canonicalization to + // the end of this range, and then drain it before we're done. + let drain_end = self.ranges.len(); + for oldi in 0..drain_end { + // If we've added at least one new range, then check if we can + // merge this range in the previously added range. + if self.ranges.len() > drain_end { + let (last, rest) = self.ranges.split_last_mut().unwrap(); + if let Some(union) = last.union(&rest[oldi]) { + *last = union; + continue; + } + } + let range = self.ranges[oldi]; + self.ranges.push(range); + } + self.ranges.drain(..drain_end); + } + + /// Returns true if and only if this class is in a canonical ordering. + fn is_canonical(&self) -> bool { + for pair in self.ranges.windows(2) { + if pair[0] >= pair[1] { + return false; + } + if pair[0].is_contiguous(&pair[1]) { + return false; + } + } + true + } +} + +/// An iterator over intervals. +#[derive(Debug)] +pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>); + +impl<'a, I> Iterator for IntervalSetIter<'a, I> { + type Item = &'a I; + + fn next(&mut self) -> Option<&'a I> { + self.0.next() + } +} + +pub trait Interval: + Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord +{ + type Bound: Bound; + + fn lower(&self) -> Self::Bound; + fn upper(&self) -> Self::Bound; + fn set_lower(&mut self, bound: Self::Bound); + fn set_upper(&mut self, bound: Self::Bound); + fn case_fold_simple( + &self, + intervals: &mut Vec<Self>, + ) -> Result<(), unicode::CaseFoldError>; + + /// Create a new interval. + fn create(lower: Self::Bound, upper: Self::Bound) -> Self { + let mut int = Self::default(); + if lower <= upper { + int.set_lower(lower); + int.set_upper(upper); + } else { + int.set_lower(upper); + int.set_upper(lower); + } + int + } + + /// Union the given overlapping range into this range. + /// + /// If the two ranges aren't contiguous, then this returns `None`. + fn union(&self, other: &Self) -> Option<Self> { + if !self.is_contiguous(other) { + return None; + } + let lower = cmp::min(self.lower(), other.lower()); + let upper = cmp::max(self.upper(), other.upper()); + Some(Self::create(lower, upper)) + } + + /// Intersect this range with the given range and return the result. + /// + /// If the intersection is empty, then this returns `None`. + fn intersect(&self, other: &Self) -> Option<Self> { + let lower = cmp::max(self.lower(), other.lower()); + let upper = cmp::min(self.upper(), other.upper()); + if lower <= upper { + Some(Self::create(lower, upper)) + } else { + None + } + } + + /// Subtract the given range from this range and return the resulting + /// ranges. + /// + /// If subtraction would result in an empty range, then no ranges are + /// returned. + fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) { + if self.is_subset(other) { + return (None, None); + } + if self.is_intersection_empty(other) { + return (Some(self.clone()), None); + } + let add_lower = other.lower() > self.lower(); + let add_upper = other.upper() < self.upper(); + // We know this because !self.is_subset(other) and the ranges have + // a non-empty intersection. + assert!(add_lower || add_upper); + let mut ret = (None, None); + if add_lower { + let upper = other.lower().decrement(); + ret.0 = Some(Self::create(self.lower(), upper)); + } + if add_upper { + let lower = other.upper().increment(); + let range = Self::create(lower, self.upper()); + if ret.0.is_none() { + ret.0 = Some(range); + } else { + ret.1 = Some(range); + } + } + ret + } + + /// Compute the symmetric difference the given range from this range. This + /// returns the union of the two ranges minus its intersection. + fn symmetric_difference( + &self, + other: &Self, + ) -> (Option<Self>, Option<Self>) { + let union = match self.union(other) { + None => return (Some(self.clone()), Some(other.clone())), + Some(union) => union, + }; + let intersection = match self.intersect(other) { + None => return (Some(self.clone()), Some(other.clone())), + Some(intersection) => intersection, + }; + union.difference(&intersection) + } + + /// Returns true if and only if the two ranges are contiguous. Two ranges + /// are contiguous if and only if the ranges are either overlapping or + /// adjacent. + fn is_contiguous(&self, other: &Self) -> bool { + let lower1 = self.lower().as_u32(); + let upper1 = self.upper().as_u32(); + let lower2 = other.lower().as_u32(); + let upper2 = other.upper().as_u32(); + cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1) + } + + /// Returns true if and only if the intersection of this range and the + /// other range is empty. + fn is_intersection_empty(&self, other: &Self) -> bool { + let (lower1, upper1) = (self.lower(), self.upper()); + let (lower2, upper2) = (other.lower(), other.upper()); + cmp::max(lower1, lower2) > cmp::min(upper1, upper2) + } + + /// Returns true if and only if this range is a subset of the other range. + fn is_subset(&self, other: &Self) -> bool { + let (lower1, upper1) = (self.lower(), self.upper()); + let (lower2, upper2) = (other.lower(), other.upper()); + (lower2 <= lower1 && lower1 <= upper2) + && (lower2 <= upper1 && upper1 <= upper2) + } +} + +pub trait Bound: + Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord +{ + fn min_value() -> Self; + fn max_value() -> Self; + fn as_u32(self) -> u32; + fn increment(self) -> Self; + fn decrement(self) -> Self; +} + +impl Bound for u8 { + fn min_value() -> Self { + u8::MIN + } + fn max_value() -> Self { + u8::MAX + } + fn as_u32(self) -> u32 { + u32::from(self) + } + fn increment(self) -> Self { + self.checked_add(1).unwrap() + } + fn decrement(self) -> Self { + self.checked_sub(1).unwrap() + } +} + +impl Bound for char { + fn min_value() -> Self { + '\x00' + } + fn max_value() -> Self { + '\u{10FFFF}' + } + fn as_u32(self) -> u32 { + u32::from(self) + } + + fn increment(self) -> Self { + match self { + '\u{D7FF}' => '\u{E000}', + c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(), + } + } + + fn decrement(self) -> Self { + match self { + '\u{E000}' => '\u{D7FF}', + c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(), + } + } +} + +// Tests for interval sets are written in src/hir.rs against the public API. diff --git a/vendor/regex-syntax/src/hir/literal.rs b/vendor/regex-syntax/src/hir/literal.rs new file mode 100644 index 0000000..a5a3737 --- /dev/null +++ b/vendor/regex-syntax/src/hir/literal.rs @@ -0,0 +1,3214 @@ +/*! +Provides literal extraction from `Hir` expressions. + +An [`Extractor`] pulls literals out of [`Hir`] expressions and returns a +[`Seq`] of [`Literal`]s. + +The purpose of literal extraction is generally to provide avenues for +optimizing regex searches. The main idea is that substring searches can be an +order of magnitude faster than a regex search. Therefore, if one can execute +a substring search to find candidate match locations and only run the regex +search at those locations, then it is possible for huge improvements in +performance to be realized. + +With that said, literal optimizations are generally a black art because even +though substring search is generally faster, if the number of candidates +produced is high, then it can create a lot of overhead by ping-ponging between +the substring search and the regex search. + +Here are some heuristics that might be used to help increase the chances of +effective literal optimizations: + +* Stick to small [`Seq`]s. If you search for too many literals, it's likely +to lead to substring search that is only a little faster than a regex search, +and thus the overhead of using literal optimizations in the first place might +make things slower overall. +* The literals in your [`Seq`] shouldn't be too short. In general, longer is +better. A sequence corresponding to single bytes that occur frequently in the +haystack, for example, is probably a bad literal optimization because it's +likely to produce many false positive candidates. Longer literals are less +likely to match, and thus probably produce fewer false positives. +* If it's possible to estimate the approximate frequency of each byte according +to some pre-computed background distribution, it is possible to compute a score +of how "good" a `Seq` is. If a `Seq` isn't good enough, you might consider +skipping the literal optimization and just use the regex engine. + +(It should be noted that there are always pathological cases that can make +any kind of literal optimization be a net slower result. This is why it +might be a good idea to be conservative, or to even provide a means for +literal optimizations to be dynamically disabled if they are determined to be +ineffective according to some measure.) + +You're encouraged to explore the methods on [`Seq`], which permit shrinking +the size of sequences in a preference-order preserving fashion. + +Finally, note that it isn't strictly necessary to use an [`Extractor`]. Namely, +an `Extractor` only uses public APIs of the [`Seq`] and [`Literal`] types, +so it is possible to implement your own extractor. For example, for n-grams +or "inner" literals (i.e., not prefix or suffix literals). The `Extractor` +is mostly responsible for the case analysis over `Hir` expressions. Much of +the "trickier" parts are how to combine literal sequences, and that is all +implemented on [`Seq`]. +*/ + +use core::{cmp, mem, num::NonZeroUsize}; + +use alloc::{vec, vec::Vec}; + +use crate::hir::{self, Hir}; + +/// Extracts prefix or suffix literal sequences from [`Hir`] expressions. +/// +/// Literal extraction is based on the following observations: +/// +/// * Many regexes start with one or a small number of literals. +/// * Substring search for literals is often much faster (sometimes by an order +/// of magnitude) than a regex search. +/// +/// Thus, in many cases, one can search for literals to find candidate starting +/// locations of a match, and then only run the full regex engine at each such +/// location instead of over the full haystack. +/// +/// The main downside of literal extraction is that it can wind up causing a +/// search to be slower overall. For example, if there are many matches or if +/// there are many candidates that don't ultimately lead to a match, then a +/// lot of overhead will be spent in shuffing back-and-forth between substring +/// search and the regex engine. This is the fundamental reason why literal +/// optimizations for regex patterns is sometimes considered a "black art." +/// +/// # Look-around assertions +/// +/// Literal extraction treats all look-around assertions as-if they match every +/// empty string. So for example, the regex `\bquux\b` will yield a sequence +/// containing a single exact literal `quux`. However, not all occurrences +/// of `quux` correspond to a match a of the regex. For example, `\bquux\b` +/// does not match `ZquuxZ` anywhere because `quux` does not fall on a word +/// boundary. +/// +/// In effect, if your regex contains look-around assertions, then a match of +/// an exact literal does not necessarily mean the regex overall matches. So +/// you may still need to run the regex engine in such cases to confirm the +/// match. +/// +/// The precise guarantee you get from a literal sequence is: if every literal +/// in the sequence is exact and the original regex contains zero look-around +/// assertions, then a preference-order multi-substring search of those +/// literals will precisely match a preference-order search of the original +/// regex. +/// +/// # Example +/// +/// This shows how to extract prefixes: +/// +/// ``` +/// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; +/// +/// let hir = parse(r"(a|b|c)(x|y|z)[A-Z]+foo")?; +/// let got = Extractor::new().extract(&hir); +/// // All literals returned are "inexact" because none of them reach the +/// // match state. +/// let expected = Seq::from_iter([ +/// Literal::inexact("ax"), +/// Literal::inexact("ay"), +/// Literal::inexact("az"), +/// Literal::inexact("bx"), +/// Literal::inexact("by"), +/// Literal::inexact("bz"), +/// Literal::inexact("cx"), +/// Literal::inexact("cy"), +/// Literal::inexact("cz"), +/// ]); +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// This shows how to extract suffixes: +/// +/// ``` +/// use regex_syntax::{ +/// hir::literal::{Extractor, ExtractKind, Literal, Seq}, +/// parse, +/// }; +/// +/// let hir = parse(r"foo|[A-Z]+bar")?; +/// let got = Extractor::new().kind(ExtractKind::Suffix).extract(&hir); +/// // Since 'foo' gets to a match state, it is considered exact. But 'bar' +/// // does not because of the '[A-Z]+', and thus is marked inexact. +/// let expected = Seq::from_iter([ +/// Literal::exact("foo"), +/// Literal::inexact("bar"), +/// ]); +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Extractor { + kind: ExtractKind, + limit_class: usize, + limit_repeat: usize, + limit_literal_len: usize, + limit_total: usize, +} + +impl Extractor { + /// Create a new extractor with a default configuration. + /// + /// The extractor can be optionally configured before calling + /// [`Extractor::extract`] to get a literal sequence. + pub fn new() -> Extractor { + Extractor { + kind: ExtractKind::Prefix, + limit_class: 10, + limit_repeat: 10, + limit_literal_len: 100, + limit_total: 250, + } + } + + /// Execute the extractor and return a sequence of literals. + pub fn extract(&self, hir: &Hir) -> Seq { + use crate::hir::HirKind::*; + + match *hir.kind() { + Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), + Literal(hir::Literal(ref bytes)) => { + let mut seq = + Seq::singleton(self::Literal::exact(bytes.to_vec())); + self.enforce_literal_len(&mut seq); + seq + } + Class(hir::Class::Unicode(ref cls)) => { + self.extract_class_unicode(cls) + } + Class(hir::Class::Bytes(ref cls)) => self.extract_class_bytes(cls), + Repetition(ref rep) => self.extract_repetition(rep), + Capture(hir::Capture { ref sub, .. }) => self.extract(sub), + Concat(ref hirs) => match self.kind { + ExtractKind::Prefix => self.extract_concat(hirs.iter()), + ExtractKind::Suffix => self.extract_concat(hirs.iter().rev()), + }, + Alternation(ref hirs) => { + // Unlike concat, we always union starting from the beginning, + // since the beginning corresponds to the highest preference, + // which doesn't change based on forwards vs reverse. + self.extract_alternation(hirs.iter()) + } + } + } + + /// Set the kind of literal sequence to extract from an [`Hir`] expression. + /// + /// The default is to extract prefixes, but suffixes can be selected + /// instead. The contract for prefixes is that every match of the + /// corresponding `Hir` must start with one of the literals in the sequence + /// returned. Moreover, the _order_ of the sequence returned corresponds to + /// the preference order. + /// + /// Suffixes satisfy a similar contract in that every match of the + /// corresponding `Hir` must end with one of the literals in the sequence + /// returned. However, there is no guarantee that the literals are in + /// preference order. + /// + /// Remember that a sequence can be infinite. For example, unless the + /// limits are configured to be impractically large, attempting to extract + /// prefixes (or suffixes) for the pattern `[A-Z]` will return an infinite + /// sequence. Generally speaking, if the sequence returned is infinite, + /// then it is presumed to be unwise to do prefix (or suffix) optimizations + /// for the pattern. + pub fn kind(&mut self, kind: ExtractKind) -> &mut Extractor { + self.kind = kind; + self + } + + /// Configure a limit on the length of the sequence that is permitted for + /// a character class. If a character class exceeds this limit, then the + /// sequence returned for it is infinite. + /// + /// This prevents classes like `[A-Z]` or `\pL` from getting turned into + /// huge and likely unproductive sequences of literals. + /// + /// # Example + /// + /// This example shows how this limit can be lowered to decrease the tolerance + /// for character classes being turned into literal sequences. + /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Seq}, parse}; + /// + /// let hir = parse(r"[0-9]")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::new([ + /// "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", + /// ]); + /// assert_eq!(expected, got); + /// + /// // Now let's shrink the limit and see how that changes things. + /// let got = Extractor::new().limit_class(4).extract(&hir); + /// let expected = Seq::infinite(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn limit_class(&mut self, limit: usize) -> &mut Extractor { + self.limit_class = limit; + self + } + + /// Configure a limit on the total number of repetitions that is permitted + /// before literal extraction is stopped. + /// + /// This is useful for limiting things like `(abcde){50}`, or more + /// insidiously, `(?:){1000000000}`. This limit prevents any one single + /// repetition from adding too much to a literal sequence. + /// + /// With this limit set, repetitions that exceed it will be stopped and any + /// literals extracted up to that point will be made inexact. + /// + /// # Example + /// + /// This shows how to decrease the limit and compares it with the default. + /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; + /// + /// let hir = parse(r"(abc){8}")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]); + /// assert_eq!(expected, got); + /// + /// // Now let's shrink the limit and see how that changes things. + /// let got = Extractor::new().limit_repeat(4).extract(&hir); + /// let expected = Seq::from_iter([ + /// Literal::inexact("abcabcabcabc"), + /// ]); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn limit_repeat(&mut self, limit: usize) -> &mut Extractor { + self.limit_repeat = limit; + self + } + + /// Configure a limit on the maximum length of any literal in a sequence. + /// + /// This is useful for limiting things like `(abcde){5}{5}{5}{5}`. While + /// each repetition or literal in that regex is small, when all the + /// repetitions are applied, one ends up with a literal of length `5^4 = + /// 625`. + /// + /// With this limit set, literals that exceed it will be made inexact and + /// thus prevented from growing. + /// + /// # Example + /// + /// This shows how to decrease the limit and compares it with the default. + /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; + /// + /// let hir = parse(r"(abc){2}{2}{2}")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]); + /// assert_eq!(expected, got); + /// + /// // Now let's shrink the limit and see how that changes things. + /// let got = Extractor::new().limit_literal_len(14).extract(&hir); + /// let expected = Seq::from_iter([ + /// Literal::inexact("abcabcabcabcab"), + /// ]); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn limit_literal_len(&mut self, limit: usize) -> &mut Extractor { + self.limit_literal_len = limit; + self + } + + /// Configure a limit on the total number of literals that will be + /// returned. + /// + /// This is useful as a practical measure for avoiding the creation of + /// large sequences of literals. While the extractor will automatically + /// handle local creations of large sequences (for example, `[A-Z]` yields + /// an infinite sequence by default), large sequences can be created + /// through non-local means as well. + /// + /// For example, `[ab]{3}{3}` would yield a sequence of length `512 = 2^9` + /// despite each of the repetitions being small on their own. This limit + /// thus represents a "catch all" for avoiding locally small sequences from + /// combining into large sequences. + /// + /// # Example + /// + /// This example shows how reducing the limit will change the literal + /// sequence returned. + /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; + /// + /// let hir = parse(r"[ab]{2}{2}")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::new([ + /// "aaaa", "aaab", "aaba", "aabb", + /// "abaa", "abab", "abba", "abbb", + /// "baaa", "baab", "baba", "babb", + /// "bbaa", "bbab", "bbba", "bbbb", + /// ]); + /// assert_eq!(expected, got); + /// + /// // The default limit is not too big, but big enough to extract all + /// // literals from '[ab]{2}{2}'. If we shrink the limit to less than 16, + /// // then we'll get a truncated set. Notice that it returns a sequence of + /// // length 4 even though our limit was 10. This is because the sequence + /// // is difficult to increase without blowing the limit. Notice also + /// // that every literal in the sequence is now inexact because they were + /// // stripped of some suffix. + /// let got = Extractor::new().limit_total(10).extract(&hir); + /// let expected = Seq::from_iter([ + /// Literal::inexact("aa"), + /// Literal::inexact("ab"), + /// Literal::inexact("ba"), + /// Literal::inexact("bb"), + /// ]); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn limit_total(&mut self, limit: usize) -> &mut Extractor { + self.limit_total = limit; + self + } + + /// Extract a sequence from the given concatenation. Sequences from each of + /// the child HIR expressions are combined via cross product. + /// + /// This short circuits once the cross product turns into a sequence + /// containing only inexact literals. + fn extract_concat<'a, I: Iterator<Item = &'a Hir>>(&self, it: I) -> Seq { + let mut seq = Seq::singleton(self::Literal::exact(vec![])); + for hir in it { + // If every element in the sequence is inexact, then a cross + // product will always be a no-op. Thus, there is nothing else we + // can add to it and can quit early. Note that this also includes + // infinite sequences. + if seq.is_inexact() { + break; + } + // Note that 'cross' also dispatches based on whether we're + // extracting prefixes or suffixes. + seq = self.cross(seq, &mut self.extract(hir)); + } + seq + } + + /// Extract a sequence from the given alternation. + /// + /// This short circuits once the union turns into an infinite sequence. + fn extract_alternation<'a, I: Iterator<Item = &'a Hir>>( + &self, + it: I, + ) -> Seq { + let mut seq = Seq::empty(); + for hir in it { + // Once our 'seq' is infinite, every subsequent union + // operation on it will itself always result in an + // infinite sequence. Thus, it can never change and we can + // short-circuit. + if !seq.is_finite() { + break; + } + seq = self.union(seq, &mut self.extract(hir)); + } + seq + } + + /// Extract a sequence of literals from the given repetition. We do our + /// best, Some examples: + /// + /// 'a*' => [inexact(a), exact("")] + /// 'a*?' => [exact(""), inexact(a)] + /// 'a+' => [inexact(a)] + /// 'a{3}' => [exact(aaa)] + /// 'a{3,5} => [inexact(aaa)] + /// + /// The key here really is making sure we get the 'inexact' vs 'exact' + /// attributes correct on each of the literals we add. For example, the + /// fact that 'a*' gives us an inexact 'a' and an exact empty string means + /// that a regex like 'ab*c' will result in [inexact(ab), exact(ac)] + /// literals being extracted, which might actually be a better prefilter + /// than just 'a'. + fn extract_repetition(&self, rep: &hir::Repetition) -> Seq { + let mut subseq = self.extract(&rep.sub); + match *rep { + hir::Repetition { min: 0, max, greedy, .. } => { + // When 'max=1', we can retain exactness, since 'a?' is + // equivalent to 'a|'. Similarly below, 'a??' is equivalent to + // '|a'. + if max != Some(1) { + subseq.make_inexact(); + } + let mut empty = Seq::singleton(Literal::exact(vec![])); + if !greedy { + mem::swap(&mut subseq, &mut empty); + } + self.union(subseq, &mut empty) + } + hir::Repetition { min, max: Some(max), .. } if min == max => { + assert!(min > 0); // handled above + let limit = + u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); + let mut seq = Seq::singleton(Literal::exact(vec![])); + for _ in 0..cmp::min(min, limit) { + if seq.is_inexact() { + break; + } + seq = self.cross(seq, &mut subseq.clone()); + } + if usize::try_from(min).is_err() || min > limit { + seq.make_inexact(); + } + seq + } + hir::Repetition { min, .. } => { + assert!(min > 0); // handled above + let limit = + u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); + let mut seq = Seq::singleton(Literal::exact(vec![])); + for _ in 0..cmp::min(min, limit) { + if seq.is_inexact() { + break; + } + seq = self.cross(seq, &mut subseq.clone()); + } + seq.make_inexact(); + seq + } + } + } + + /// Convert the given Unicode class into a sequence of literals if the + /// class is small enough. If the class is too big, return an infinite + /// sequence. + fn extract_class_unicode(&self, cls: &hir::ClassUnicode) -> Seq { + if self.class_over_limit_unicode(cls) { + return Seq::infinite(); + } + let mut seq = Seq::empty(); + for r in cls.iter() { + for ch in r.start()..=r.end() { + seq.push(Literal::from(ch)); + } + } + self.enforce_literal_len(&mut seq); + seq + } + + /// Convert the given byte class into a sequence of literals if the class + /// is small enough. If the class is too big, return an infinite sequence. + fn extract_class_bytes(&self, cls: &hir::ClassBytes) -> Seq { + if self.class_over_limit_bytes(cls) { + return Seq::infinite(); + } + let mut seq = Seq::empty(); + for r in cls.iter() { + for b in r.start()..=r.end() { + seq.push(Literal::from(b)); + } + } + self.enforce_literal_len(&mut seq); + seq + } + + /// Returns true if the given Unicode class exceeds the configured limits + /// on this extractor. + fn class_over_limit_unicode(&self, cls: &hir::ClassUnicode) -> bool { + let mut count = 0; + for r in cls.iter() { + if count > self.limit_class { + return true; + } + count += r.len(); + } + count > self.limit_class + } + + /// Returns true if the given byte class exceeds the configured limits on + /// this extractor. + fn class_over_limit_bytes(&self, cls: &hir::ClassBytes) -> bool { + let mut count = 0; + for r in cls.iter() { + if count > self.limit_class { + return true; + } + count += r.len(); + } + count > self.limit_class + } + + /// Compute the cross product of the two sequences if the result would be + /// within configured limits. Otherwise, make `seq2` infinite and cross the + /// infinite sequence with `seq1`. + fn cross(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq { + if seq1.max_cross_len(seq2).map_or(false, |len| len > self.limit_total) + { + seq2.make_infinite(); + } + if let ExtractKind::Suffix = self.kind { + seq1.cross_reverse(seq2); + } else { + seq1.cross_forward(seq2); + } + assert!(seq1.len().map_or(true, |x| x <= self.limit_total)); + self.enforce_literal_len(&mut seq1); + seq1 + } + + /// Union the two sequences if the result would be within configured + /// limits. Otherwise, make `seq2` infinite and union the infinite sequence + /// with `seq1`. + fn union(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq { + if seq1.max_union_len(seq2).map_or(false, |len| len > self.limit_total) + { + // We try to trim our literal sequences to see if we can make + // room for more literals. The idea is that we'd rather trim down + // literals already in our sequence if it means we can add a few + // more and retain a finite sequence. Otherwise, we'll union with + // an infinite sequence and that infects everything and effectively + // stops literal extraction in its tracks. + // + // We do we keep 4 bytes here? Well, it's a bit of an abstraction + // leakage. Downstream, the literals may wind up getting fed to + // the Teddy algorithm, which supports searching literals up to + // length 4. So that's why we pick that number here. Arguably this + // should be a tuneable parameter, but it seems a little tricky to + // describe. And I'm still unsure if this is the right way to go + // about culling literal sequences. + match self.kind { + ExtractKind::Prefix => { + seq1.keep_first_bytes(4); + seq2.keep_first_bytes(4); + } + ExtractKind::Suffix => { + seq1.keep_last_bytes(4); + seq2.keep_last_bytes(4); + } + } + seq1.dedup(); + seq2.dedup(); + if seq1 + .max_union_len(seq2) + .map_or(false, |len| len > self.limit_total) + { + seq2.make_infinite(); + } + } + seq1.union(seq2); + assert!(seq1.len().map_or(true, |x| x <= self.limit_total)); + seq1 + } + + /// Applies the literal length limit to the given sequence. If none of the + /// literals in the sequence exceed the limit, then this is a no-op. + fn enforce_literal_len(&self, seq: &mut Seq) { + let len = self.limit_literal_len; + match self.kind { + ExtractKind::Prefix => seq.keep_first_bytes(len), + ExtractKind::Suffix => seq.keep_last_bytes(len), + } + } +} + +impl Default for Extractor { + fn default() -> Extractor { + Extractor::new() + } +} + +/// The kind of literals to extract from an [`Hir`] expression. +/// +/// The default extraction kind is `Prefix`. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum ExtractKind { + /// Extracts only prefix literals from a regex. + Prefix, + /// Extracts only suffix literals from a regex. + /// + /// Note that the sequence returned by suffix literals currently may + /// not correctly represent leftmost-first or "preference" order match + /// semantics. + Suffix, +} + +impl ExtractKind { + /// Returns true if this kind is the `Prefix` variant. + pub fn is_prefix(&self) -> bool { + matches!(*self, ExtractKind::Prefix) + } + + /// Returns true if this kind is the `Suffix` variant. + pub fn is_suffix(&self) -> bool { + matches!(*self, ExtractKind::Suffix) + } +} + +impl Default for ExtractKind { + fn default() -> ExtractKind { + ExtractKind::Prefix + } +} + +/// A sequence of literals. +/// +/// A `Seq` is very much like a set in that it represents a union of its +/// members. That is, it corresponds to a set of literals where at least one +/// must match in order for a particular [`Hir`] expression to match. (Whether +/// this corresponds to the entire `Hir` expression, a prefix of it or a suffix +/// of it depends on how the `Seq` was extracted from the `Hir`.) +/// +/// It is also unlike a set in that multiple identical literals may appear, +/// and that the order of the literals in the `Seq` matters. For example, if +/// the sequence is `[sam, samwise]` and leftmost-first matching is used, then +/// `samwise` can never match and the sequence is equivalent to `[sam]`. +/// +/// # States of a sequence +/// +/// A `Seq` has a few different logical states to consider: +/// +/// * The sequence can represent "any" literal. When this happens, the set does +/// not have a finite size. The purpose of this state is to inhibit callers +/// from making assumptions about what literals are required in order to match +/// a particular [`Hir`] expression. Generally speaking, when a set is in this +/// state, literal optimizations are inhibited. A good example of a regex that +/// will cause this sort of set to appear is `[A-Za-z]`. The character class +/// is just too big (and also too narrow) to be usefully expanded into 52 +/// different literals. (Note that the decision for when a seq should become +/// infinite is determined by the caller. A seq itself has no hard-coded +/// limits.) +/// * The sequence can be empty, in which case, it is an affirmative statement +/// that there are no literals that can match the corresponding `Hir`. +/// Consequently, the `Hir` never matches any input. For example, `[a&&b]`. +/// * The sequence can be non-empty, in which case, at least one of the +/// literals must match in order for the corresponding `Hir` to match. +/// +/// # Example +/// +/// This example shows how literal sequences can be simplified by stripping +/// suffixes and minimizing while maintaining preference order. +/// +/// ``` +/// use regex_syntax::hir::literal::{Literal, Seq}; +/// +/// let mut seq = Seq::new(&[ +/// "farm", +/// "appliance", +/// "faraway", +/// "apple", +/// "fare", +/// "gap", +/// "applicant", +/// "applaud", +/// ]); +/// seq.keep_first_bytes(3); +/// seq.minimize_by_preference(); +/// // Notice that 'far' comes before 'app', which matches the order in the +/// // original sequence. This guarantees that leftmost-first semantics are +/// // not altered by simplifying the set. +/// let expected = Seq::from_iter([ +/// Literal::inexact("far"), +/// Literal::inexact("app"), +/// Literal::exact("gap"), +/// ]); +/// assert_eq!(expected, seq); +/// ``` +#[derive(Clone, Eq, PartialEq)] +pub struct Seq { + /// The members of this seq. + /// + /// When `None`, the seq represents all possible literals. That is, it + /// prevents one from making assumptions about specific literals in the + /// seq, and forces one to treat it as if any literal might be in the seq. + /// + /// Note that `Some(vec![])` is valid and corresponds to the empty seq of + /// literals, i.e., a regex that can never match. For example, `[a&&b]`. + /// It is distinct from `Some(vec![""])`, which corresponds to the seq + /// containing an empty string, which matches at every position. + literals: Option<Vec<Literal>>, +} + +impl Seq { + /// Returns an empty sequence. + /// + /// An empty sequence matches zero literals, and thus corresponds to a + /// regex that itself can never match. + #[inline] + pub fn empty() -> Seq { + Seq { literals: Some(vec![]) } + } + + /// Returns a sequence of literals without a finite size and may contain + /// any literal. + /// + /// A sequence without finite size does not reveal anything about the + /// characteristics of the literals in its set. There are no fixed prefixes + /// or suffixes, nor are lower or upper bounds on the length of the literals + /// in the set known. + /// + /// This is useful to represent constructs in a regex that are "too big" + /// to useful represent as a sequence of literals. For example, `[A-Za-z]`. + /// When sequences get too big, they lose their discriminating nature and + /// are more likely to produce false positives, which in turn makes them + /// less likely to speed up searches. + /// + /// More pragmatically, for many regexes, enumerating all possible literals + /// is itself not possible or might otherwise use too many resources. So + /// constraining the size of sets during extraction is a practical trade + /// off to make. + #[inline] + pub fn infinite() -> Seq { + Seq { literals: None } + } + + /// Returns a sequence containing a single literal. + #[inline] + pub fn singleton(lit: Literal) -> Seq { + Seq { literals: Some(vec![lit]) } + } + + /// Returns a sequence of exact literals from the given byte strings. + #[inline] + pub fn new<I, B>(it: I) -> Seq + where + I: IntoIterator<Item = B>, + B: AsRef<[u8]>, + { + it.into_iter().map(|b| Literal::exact(b.as_ref())).collect() + } + + /// If this is a finite sequence, return its members as a slice of + /// literals. + /// + /// The slice returned may be empty, in which case, there are no literals + /// that can match this sequence. + #[inline] + pub fn literals(&self) -> Option<&[Literal]> { + self.literals.as_deref() + } + + /// Push a literal to the end of this sequence. + /// + /// If this sequence is not finite, then this is a no-op. + /// + /// Similarly, if the most recently added item of this sequence is + /// equivalent to the literal given, then it is not added. This reflects + /// a `Seq`'s "set like" behavior, and represents a practical trade off. + /// Namely, there is never any need to have two adjacent and equivalent + /// literals in the same sequence, _and_ it is easy to detect in some + /// cases. + #[inline] + pub fn push(&mut self, lit: Literal) { + let lits = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + if lits.last().map_or(false, |m| m == &lit) { + return; + } + lits.push(lit); + } + + /// Make all of the literals in this sequence inexact. + /// + /// This is a no-op if this sequence is not finite. + #[inline] + pub fn make_inexact(&mut self) { + let lits = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + for lit in lits.iter_mut() { + lit.make_inexact(); + } + } + + /// Converts this sequence to an infinite sequence. + /// + /// This is a no-op if the sequence is already infinite. + #[inline] + pub fn make_infinite(&mut self) { + self.literals = None; + } + + /// Modify this sequence to contain the cross product between it and the + /// sequence given. + /// + /// The cross product only considers literals in this sequence that are + /// exact. That is, inexact literals are not extended. + /// + /// The literals are always drained from `other`, even if none are used. + /// This permits callers to reuse the sequence allocation elsewhere. + /// + /// If this sequence is infinite, then this is a no-op, regardless of what + /// `other` contains (and in this case, the literals are still drained from + /// `other`). If `other` is infinite and this sequence is finite, then this + /// is a no-op, unless this sequence contains a zero-length literal. In + /// which case, the infiniteness of `other` infects this sequence, and this + /// sequence is itself made infinite. + /// + /// Like [`Seq::union`], this may attempt to deduplicate literals. See + /// [`Seq::dedup`] for how deduplication deals with exact and inexact + /// literals. + /// + /// # Example + /// + /// This example shows basic usage and how exact and inexact literals + /// interact. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::from_iter([ + /// Literal::inexact("quux"), + /// Literal::exact("baz"), + /// ]); + /// seq1.cross_forward(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// + /// let expected = Seq::from_iter([ + /// Literal::inexact("fooquux"), + /// Literal::exact("foobaz"), + /// Literal::inexact("bar"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example shows the behavior of when `other` is an infinite + /// sequence. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_forward(&mut seq2); + /// + /// // When seq2 is infinite, cross product doesn't add anything, but + /// // ensures all members of seq1 are inexact. + /// let expected = Seq::from_iter([ + /// Literal::inexact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example is like the one above, but shows what happens when this + /// sequence contains an empty string. In this case, an infinite `other` + /// sequence infects this sequence (because the empty string means that + /// there are no finite prefixes): + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::exact(""), // inexact provokes same behavior + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_forward(&mut seq2); + /// + /// // seq1 is now infinite! + /// assert!(!seq1.is_finite()); + /// ``` + /// + /// This example shows the behavior of this sequence is infinite. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::infinite(); + /// let mut seq2 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// seq1.cross_forward(&mut seq2); + /// + /// // seq1 remains unchanged. + /// assert!(!seq1.is_finite()); + /// // Even though the literals in seq2 weren't used, it was still drained. + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn cross_forward(&mut self, other: &mut Seq) { + let (lits1, lits2) = match self.cross_preamble(other) { + None => return, + Some((lits1, lits2)) => (lits1, lits2), + }; + let newcap = lits1.len().saturating_mul(lits2.len()); + for selflit in mem::replace(lits1, Vec::with_capacity(newcap)) { + if !selflit.is_exact() { + lits1.push(selflit); + continue; + } + for otherlit in lits2.iter() { + let mut newlit = Literal::exact(Vec::with_capacity( + selflit.len() + otherlit.len(), + )); + newlit.extend(&selflit); + newlit.extend(&otherlit); + if !otherlit.is_exact() { + newlit.make_inexact(); + } + lits1.push(newlit); + } + } + lits2.drain(..); + self.dedup(); + } + + /// Modify this sequence to contain the cross product between it and + /// the sequence given, where the sequences are treated as suffixes + /// instead of prefixes. Namely, the sequence `other` is *prepended* + /// to `self` (as opposed to `other` being *appended* to `self` in + /// [`Seq::cross_forward`]). + /// + /// The cross product only considers literals in this sequence that are + /// exact. That is, inexact literals are not extended. + /// + /// The literals are always drained from `other`, even if none are used. + /// This permits callers to reuse the sequence allocation elsewhere. + /// + /// If this sequence is infinite, then this is a no-op, regardless of what + /// `other` contains (and in this case, the literals are still drained from + /// `other`). If `other` is infinite and this sequence is finite, then this + /// is a no-op, unless this sequence contains a zero-length literal. In + /// which case, the infiniteness of `other` infects this sequence, and this + /// sequence is itself made infinite. + /// + /// Like [`Seq::union`], this may attempt to deduplicate literals. See + /// [`Seq::dedup`] for how deduplication deals with exact and inexact + /// literals. + /// + /// # Example + /// + /// This example shows basic usage and how exact and inexact literals + /// interact. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::from_iter([ + /// Literal::inexact("quux"), + /// Literal::exact("baz"), + /// ]); + /// seq1.cross_reverse(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// + /// let expected = Seq::from_iter([ + /// Literal::inexact("quuxfoo"), + /// Literal::inexact("bar"), + /// Literal::exact("bazfoo"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example shows the behavior of when `other` is an infinite + /// sequence. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_reverse(&mut seq2); + /// + /// // When seq2 is infinite, cross product doesn't add anything, but + /// // ensures all members of seq1 are inexact. + /// let expected = Seq::from_iter([ + /// Literal::inexact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example is like the one above, but shows what happens when this + /// sequence contains an empty string. In this case, an infinite `other` + /// sequence infects this sequence (because the empty string means that + /// there are no finite suffixes): + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::exact(""), // inexact provokes same behavior + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_reverse(&mut seq2); + /// + /// // seq1 is now infinite! + /// assert!(!seq1.is_finite()); + /// ``` + /// + /// This example shows the behavior when this sequence is infinite. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::infinite(); + /// let mut seq2 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// seq1.cross_reverse(&mut seq2); + /// + /// // seq1 remains unchanged. + /// assert!(!seq1.is_finite()); + /// // Even though the literals in seq2 weren't used, it was still drained. + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn cross_reverse(&mut self, other: &mut Seq) { + let (lits1, lits2) = match self.cross_preamble(other) { + None => return, + Some((lits1, lits2)) => (lits1, lits2), + }; + // We basically proceed as we do in 'cross_forward' at this point, + // except that the outer loop is now 'other' and the inner loop is now + // 'self'. That's because 'self' corresponds to suffixes and 'other' + // corresponds to the sequence we want to *prepend* to the suffixes. + let newcap = lits1.len().saturating_mul(lits2.len()); + let selflits = mem::replace(lits1, Vec::with_capacity(newcap)); + for (i, otherlit) in lits2.drain(..).enumerate() { + for selflit in selflits.iter() { + if !selflit.is_exact() { + // If the suffix isn't exact, then we can't prepend + // anything to it. However, we still want to keep it. But + // we only want to keep one of them, to avoid duplication. + // (The duplication is okay from a correctness perspective, + // but wasteful.) + if i == 0 { + lits1.push(selflit.clone()); + } + continue; + } + let mut newlit = Literal::exact(Vec::with_capacity( + otherlit.len() + selflit.len(), + )); + newlit.extend(&otherlit); + newlit.extend(&selflit); + if !otherlit.is_exact() { + newlit.make_inexact(); + } + lits1.push(newlit); + } + } + self.dedup(); + } + + /// A helper function the corresponds to the subtle preamble for both + /// `cross_forward` and `cross_reverse`. In effect, it handles the cases + /// of infinite sequences for both `self` and `other`, as well as ensuring + /// that literals from `other` are drained even if they aren't used. + fn cross_preamble<'a>( + &'a mut self, + other: &'a mut Seq, + ) -> Option<(&'a mut Vec<Literal>, &'a mut Vec<Literal>)> { + let lits2 = match other.literals { + None => { + // If our current seq contains the empty string and the seq + // we're adding matches any literal, then it follows that the + // current seq must now also match any literal. + // + // Otherwise, we just have to make sure everything in this + // sequence is inexact. + if self.min_literal_len() == Some(0) { + *self = Seq::infinite(); + } else { + self.make_inexact(); + } + return None; + } + Some(ref mut lits) => lits, + }; + let lits1 = match self.literals { + None => { + // If we aren't going to make it to the end of this routine + // where lits2 is drained, then we need to do it now. + lits2.drain(..); + return None; + } + Some(ref mut lits) => lits, + }; + Some((lits1, lits2)) + } + + /// Unions the `other` sequence into this one. + /// + /// The literals are always drained out of the given `other` sequence, + /// even if they are being unioned into an infinite sequence. This permits + /// the caller to reuse the `other` sequence in another context. + /// + /// Some literal deduping may be performed. If any deduping happens, + /// any leftmost-first or "preference" order match semantics will be + /// preserved. + /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::new(&["foo", "bar"]); + /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); + /// seq1.union(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// + /// // Adjacent literals are deduped, but non-adjacent literals may not be. + /// assert_eq!(Seq::new(&["foo", "bar", "quux", "foo"]), seq1); + /// ``` + /// + /// This example shows that literals are drained from `other` even when + /// they aren't necessarily used. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::infinite(); + /// // Infinite sequences have no finite length. + /// assert_eq!(None, seq1.len()); + /// + /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); + /// seq1.union(&mut seq2); + /// + /// // seq1 is still infinite and seq2 has been drained. + /// assert_eq!(None, seq1.len()); + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn union(&mut self, other: &mut Seq) { + let lits2 = match other.literals { + None => { + // Unioning with an infinite sequence always results in an + // infinite sequence. + self.make_infinite(); + return; + } + Some(ref mut lits) => lits.drain(..), + }; + let lits1 = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + lits1.extend(lits2); + self.dedup(); + } + + /// Unions the `other` sequence into this one by splice the `other` + /// sequence at the position of the first zero-length literal. + /// + /// This is useful for preserving preference order semantics when combining + /// two literal sequences. For example, in the regex `(a||f)+foo`, the + /// correct preference order prefix sequence is `[a, foo, f]`. + /// + /// The literals are always drained out of the given `other` sequence, + /// even if they are being unioned into an infinite sequence. This permits + /// the caller to reuse the `other` sequence in another context. Note that + /// the literals are drained even if no union is performed as well, i.e., + /// when this sequence does not contain a zero-length literal. + /// + /// Some literal deduping may be performed. If any deduping happens, + /// any leftmost-first or "preference" order match semantics will be + /// preserved. + /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::new(&["a", "", "f", ""]); + /// let mut seq2 = Seq::new(&["foo"]); + /// seq1.union_into_empty(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// // 'foo' gets spliced into seq1 where the first empty string occurs. + /// assert_eq!(Seq::new(&["a", "foo", "f"]), seq1); + /// ``` + /// + /// This example shows that literals are drained from `other` even when + /// they aren't necessarily used. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::new(&["foo", "bar"]); + /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); + /// seq1.union_into_empty(&mut seq2); + /// + /// // seq1 has no zero length literals, so no splicing happens. + /// assert_eq!(Seq::new(&["foo", "bar"]), seq1); + /// // Even though no splicing happens, seq2 is still drained. + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn union_into_empty(&mut self, other: &mut Seq) { + let lits2 = other.literals.as_mut().map(|lits| lits.drain(..)); + let lits1 = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + let first_empty = match lits1.iter().position(|m| m.is_empty()) { + None => return, + Some(i) => i, + }; + let lits2 = match lits2 { + None => { + // Note that we are only here if we've found an empty literal, + // which implies that an infinite sequence infects this seq and + // also turns it into an infinite sequence. + self.literals = None; + return; + } + Some(lits) => lits, + }; + // Clearing out the empties needs to come before the splice because + // the splice might add more empties that we don't want to get rid + // of. Since we're splicing into the position of the first empty, the + // 'first_empty' position computed above is still correct. + lits1.retain(|m| !m.is_empty()); + lits1.splice(first_empty..first_empty, lits2); + self.dedup(); + } + + /// Deduplicate adjacent equivalent literals in this sequence. + /// + /// If adjacent literals are equivalent strings but one is exact and the + /// other inexact, the inexact literal is kept and the exact one is + /// removed. + /// + /// Deduping an infinite sequence is a no-op. + /// + /// # Example + /// + /// This example shows how literals that are duplicate byte strings but + /// are not equivalent with respect to exactness are resolved. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("foo"), + /// ]); + /// seq.dedup(); + /// + /// assert_eq!(Seq::from_iter([Literal::inexact("foo")]), seq); + /// ``` + #[inline] + pub fn dedup(&mut self) { + if let Some(ref mut lits) = self.literals { + lits.dedup_by(|lit1, lit2| { + if lit1.as_bytes() != lit2.as_bytes() { + return false; + } + if lit1.is_exact() != lit2.is_exact() { + lit1.make_inexact(); + lit2.make_inexact(); + } + true + }); + } + } + + /// Sorts this sequence of literals lexicographically. + /// + /// Note that if, before sorting, if a literal that is a prefix of another + /// literal appears after it, then after sorting, the sequence will not + /// represent the same preference order match semantics. For example, + /// sorting the sequence `[samwise, sam]` yields the sequence `[sam, + /// samwise]`. Under preference order semantics, the latter sequence will + /// never match `samwise` where as the first sequence can. + /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq = Seq::new(&["foo", "quux", "bar"]); + /// seq.sort(); + /// + /// assert_eq!(Seq::new(&["bar", "foo", "quux"]), seq); + /// ``` + #[inline] + pub fn sort(&mut self) { + if let Some(ref mut lits) = self.literals { + lits.sort(); + } + } + + /// Reverses all of the literals in this sequence. + /// + /// The order of the sequence itself is preserved. + /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq = Seq::new(&["oof", "rab"]); + /// seq.reverse_literals(); + /// assert_eq!(Seq::new(&["foo", "bar"]), seq); + /// ``` + #[inline] + pub fn reverse_literals(&mut self) { + if let Some(ref mut lits) = self.literals { + for lit in lits.iter_mut() { + lit.reverse(); + } + } + } + + /// Shrinks this seq to its minimal size while respecting the preference + /// order of its literals. + /// + /// While this routine will remove duplicate literals from this seq, it + /// will also remove literals that can never match in a leftmost-first or + /// "preference order" search. Similar to [`Seq::dedup`], if a literal is + /// deduped, then the one that remains is made inexact. + /// + /// This is a no-op on seqs that are empty or not finite. + /// + /// # Example + /// + /// This example shows the difference between `{sam, samwise}` and + /// `{samwise, sam}`. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// // If 'sam' comes before 'samwise' and a preference order search is + /// // executed, then 'samwise' can never match. + /// let mut seq = Seq::new(&["sam", "samwise"]); + /// seq.minimize_by_preference(); + /// assert_eq!(Seq::from_iter([Literal::inexact("sam")]), seq); + /// + /// // But if they are reversed, then it's possible for 'samwise' to match + /// // since it is given higher preference. + /// let mut seq = Seq::new(&["samwise", "sam"]); + /// seq.minimize_by_preference(); + /// assert_eq!(Seq::new(&["samwise", "sam"]), seq); + /// ``` + /// + /// This example shows that if an empty string is in this seq, then + /// anything that comes after it can never match. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// // An empty string is a prefix of all strings, so it automatically + /// // inhibits any subsequent strings from matching. + /// let mut seq = Seq::new(&["foo", "bar", "", "quux", "fox"]); + /// seq.minimize_by_preference(); + /// let expected = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::exact("bar"), + /// Literal::inexact(""), + /// ]); + /// assert_eq!(expected, seq); + /// + /// // And of course, if it's at the beginning, then it makes it impossible + /// // for anything else to match. + /// let mut seq = Seq::new(&["", "foo", "quux", "fox"]); + /// seq.minimize_by_preference(); + /// assert_eq!(Seq::from_iter([Literal::inexact("")]), seq); + /// ``` + #[inline] + pub fn minimize_by_preference(&mut self) { + if let Some(ref mut lits) = self.literals { + PreferenceTrie::minimize(lits, false); + } + } + + /// Trims all literals in this seq such that only the first `len` bytes + /// remain. If a literal has less than or equal to `len` bytes, then it + /// remains unchanged. Otherwise, it is trimmed and made inexact. + /// + /// # Example + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq = Seq::new(&["a", "foo", "quux"]); + /// seq.keep_first_bytes(2); + /// + /// let expected = Seq::from_iter([ + /// Literal::exact("a"), + /// Literal::inexact("fo"), + /// Literal::inexact("qu"), + /// ]); + /// assert_eq!(expected, seq); + /// ``` + #[inline] + pub fn keep_first_bytes(&mut self, len: usize) { + if let Some(ref mut lits) = self.literals { + for m in lits.iter_mut() { + m.keep_first_bytes(len); + } + } + } + + /// Trims all literals in this seq such that only the last `len` bytes + /// remain. If a literal has less than or equal to `len` bytes, then it + /// remains unchanged. Otherwise, it is trimmed and made inexact. + /// + /// # Example + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq = Seq::new(&["a", "foo", "quux"]); + /// seq.keep_last_bytes(2); + /// + /// let expected = Seq::from_iter([ + /// Literal::exact("a"), + /// Literal::inexact("oo"), + /// Literal::inexact("ux"), + /// ]); + /// assert_eq!(expected, seq); + /// ``` + #[inline] + pub fn keep_last_bytes(&mut self, len: usize) { + if let Some(ref mut lits) = self.literals { + for m in lits.iter_mut() { + m.keep_last_bytes(len); + } + } + } + + /// Returns true if this sequence is finite. + /// + /// When false, this sequence is infinite and must be treated as if it + /// contains every possible literal. + #[inline] + pub fn is_finite(&self) -> bool { + self.literals.is_some() + } + + /// Returns true if and only if this sequence is finite and empty. + /// + /// An empty sequence never matches anything. It can only be produced by + /// literal extraction when the corresponding regex itself cannot match. + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == Some(0) + } + + /// Returns the number of literals in this sequence if the sequence is + /// finite. If the sequence is infinite, then `None` is returned. + #[inline] + pub fn len(&self) -> Option<usize> { + self.literals.as_ref().map(|lits| lits.len()) + } + + /// Returns true if and only if all literals in this sequence are exact. + /// + /// This returns false if the sequence is infinite. + #[inline] + pub fn is_exact(&self) -> bool { + self.literals().map_or(false, |lits| lits.iter().all(|x| x.is_exact())) + } + + /// Returns true if and only if all literals in this sequence are inexact. + /// + /// This returns true if the sequence is infinite. + #[inline] + pub fn is_inexact(&self) -> bool { + self.literals().map_or(true, |lits| lits.iter().all(|x| !x.is_exact())) + } + + /// Return the maximum length of the sequence that would result from + /// unioning `self` with `other`. If either set is infinite, then this + /// returns `None`. + #[inline] + pub fn max_union_len(&self, other: &Seq) -> Option<usize> { + let len1 = self.len()?; + let len2 = other.len()?; + Some(len1.saturating_add(len2)) + } + + /// Return the maximum length of the sequence that would result from the + /// cross product of `self` with `other`. If either set is infinite, then + /// this returns `None`. + #[inline] + pub fn max_cross_len(&self, other: &Seq) -> Option<usize> { + let len1 = self.len()?; + let len2 = other.len()?; + Some(len1.saturating_mul(len2)) + } + + /// Returns the length of the shortest literal in this sequence. + /// + /// If the sequence is infinite or empty, then this returns `None`. + #[inline] + pub fn min_literal_len(&self) -> Option<usize> { + self.literals.as_ref()?.iter().map(|x| x.len()).min() + } + + /// Returns the length of the longest literal in this sequence. + /// + /// If the sequence is infinite or empty, then this returns `None`. + #[inline] + pub fn max_literal_len(&self) -> Option<usize> { + self.literals.as_ref()?.iter().map(|x| x.len()).max() + } + + /// Returns the longest common prefix from this seq. + /// + /// If the seq matches any literal or other contains no literals, then + /// there is no meaningful prefix and this returns `None`. + /// + /// # Example + /// + /// This shows some example seqs and their longest common prefix. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let seq = Seq::new(&["foo", "foobar", "fo"]); + /// assert_eq!(Some(&b"fo"[..]), seq.longest_common_prefix()); + /// let seq = Seq::new(&["foo", "foo"]); + /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_prefix()); + /// let seq = Seq::new(&["foo", "bar"]); + /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix()); + /// let seq = Seq::new(&[""]); + /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix()); + /// + /// let seq = Seq::infinite(); + /// assert_eq!(None, seq.longest_common_prefix()); + /// let seq = Seq::empty(); + /// assert_eq!(None, seq.longest_common_prefix()); + /// ``` + #[inline] + pub fn longest_common_prefix(&self) -> Option<&[u8]> { + // If we match everything or match nothing, then there's no meaningful + // longest common prefix. + let lits = match self.literals { + None => return None, + Some(ref lits) => lits, + }; + if lits.len() == 0 { + return None; + } + let base = lits[0].as_bytes(); + let mut len = base.len(); + for m in lits.iter().skip(1) { + len = m + .as_bytes() + .iter() + .zip(base[..len].iter()) + .take_while(|&(a, b)| a == b) + .count(); + if len == 0 { + return Some(&[]); + } + } + Some(&base[..len]) + } + + /// Returns the longest common suffix from this seq. + /// + /// If the seq matches any literal or other contains no literals, then + /// there is no meaningful suffix and this returns `None`. + /// + /// # Example + /// + /// This shows some example seqs and their longest common suffix. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let seq = Seq::new(&["oof", "raboof", "of"]); + /// assert_eq!(Some(&b"of"[..]), seq.longest_common_suffix()); + /// let seq = Seq::new(&["foo", "foo"]); + /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_suffix()); + /// let seq = Seq::new(&["foo", "bar"]); + /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix()); + /// let seq = Seq::new(&[""]); + /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix()); + /// + /// let seq = Seq::infinite(); + /// assert_eq!(None, seq.longest_common_suffix()); + /// let seq = Seq::empty(); + /// assert_eq!(None, seq.longest_common_suffix()); + /// ``` + #[inline] + pub fn longest_common_suffix(&self) -> Option<&[u8]> { + // If we match everything or match nothing, then there's no meaningful + // longest common suffix. + let lits = match self.literals { + None => return None, + Some(ref lits) => lits, + }; + if lits.len() == 0 { + return None; + } + let base = lits[0].as_bytes(); + let mut len = base.len(); + for m in lits.iter().skip(1) { + len = m + .as_bytes() + .iter() + .rev() + .zip(base[base.len() - len..].iter().rev()) + .take_while(|&(a, b)| a == b) + .count(); + if len == 0 { + return Some(&[]); + } + } + Some(&base[base.len() - len..]) + } + + /// Optimizes this seq while treating its literals as prefixes and + /// respecting the preference order of its literals. + /// + /// The specific way "optimization" works is meant to be an implementation + /// detail, as it essentially represents a set of heuristics. The goal + /// that optimization tries to accomplish is to make the literals in this + /// set reflect inputs that will result in a more effective prefilter. + /// Principally by reducing the false positive rate of candidates found by + /// the literals in this sequence. That is, when a match of a literal is + /// found, we would like it to be a strong predictor of the overall match + /// of the regex. If it isn't, then much time will be spent starting and + /// stopping the prefilter search and attempting to confirm the match only + /// to have it fail. + /// + /// Some of those heuristics might be: + /// + /// * Identifying a common prefix from a larger sequence of literals, and + /// shrinking the sequence down to that single common prefix. + /// * Rejecting the sequence entirely if it is believed to result in very + /// high false positive rate. When this happens, the sequence is made + /// infinite. + /// * Shrinking the sequence to a smaller number of literals representing + /// prefixes, but not shrinking it so much as to make literals too short. + /// (A sequence with very short literals, of 1 or 2 bytes, will typically + /// result in a higher false positive rate.) + /// + /// Optimization should only be run once extraction is complete. Namely, + /// optimization may make assumptions that do not compose with other + /// operations in the middle of extraction. For example, optimization will + /// reduce `[E(sam), E(samwise)]` to `[E(sam)]`, but such a transformation + /// is only valid if no other extraction will occur. If other extraction + /// may occur, then the correct transformation would be to `[I(sam)]`. + /// + /// The [`Seq::optimize_for_suffix_by_preference`] does the same thing, but + /// for suffixes. + /// + /// # Example + /// + /// This shows how optimization might transform a sequence. Note that + /// the specific behavior is not a documented guarantee. The heuristics + /// used are an implementation detail and may change over time in semver + /// compatible releases. + /// + /// ``` + /// use regex_syntax::hir::literal::{Seq, Literal}; + /// + /// let mut seq = Seq::new(&[ + /// "samantha", + /// "sam", + /// "samwise", + /// "frodo", + /// ]); + /// seq.optimize_for_prefix_by_preference(); + /// assert_eq!(Seq::from_iter([ + /// Literal::exact("samantha"), + /// // Kept exact even though 'samwise' got pruned + /// // because optimization assumes literal extraction + /// // has finished. + /// Literal::exact("sam"), + /// Literal::exact("frodo"), + /// ]), seq); + /// ``` + /// + /// # Example: optimization may make the sequence infinite + /// + /// If the heuristics deem that the sequence could cause a very high false + /// positive rate, then it may make the sequence infinite, effectively + /// disabling its use as a prefilter. + /// + /// ``` + /// use regex_syntax::hir::literal::{Seq, Literal}; + /// + /// let mut seq = Seq::new(&[ + /// "samantha", + /// // An empty string matches at every position, + /// // thus rendering the prefilter completely + /// // ineffective. + /// "", + /// "sam", + /// "samwise", + /// "frodo", + /// ]); + /// seq.optimize_for_prefix_by_preference(); + /// assert!(!seq.is_finite()); + /// ``` + /// + /// Do note that just because there is a `" "` in the sequence, that + /// doesn't mean the sequence will always be made infinite after it is + /// optimized. Namely, if the sequence is considered exact (any match + /// corresponds to an overall match of the original regex), then any match + /// is an overall match, and so the false positive rate is always `0`. + /// + /// To demonstrate this, we remove `samwise` from our sequence. This + /// results in no optimization happening and all literals remain exact. + /// Thus the entire sequence is exact, and it is kept as-is, even though + /// one is an ASCII space: + /// + /// ``` + /// use regex_syntax::hir::literal::{Seq, Literal}; + /// + /// let mut seq = Seq::new(&[ + /// "samantha", + /// " ", + /// "sam", + /// "frodo", + /// ]); + /// seq.optimize_for_prefix_by_preference(); + /// assert!(seq.is_finite()); + /// ``` + #[inline] + pub fn optimize_for_prefix_by_preference(&mut self) { + self.optimize_by_preference(true); + } + + /// Optimizes this seq while treating its literals as suffixes and + /// respecting the preference order of its literals. + /// + /// Optimization should only be run once extraction is complete. + /// + /// The [`Seq::optimize_for_prefix_by_preference`] does the same thing, but + /// for prefixes. See its documentation for more explanation. + #[inline] + pub fn optimize_for_suffix_by_preference(&mut self) { + self.optimize_by_preference(false); + } + + fn optimize_by_preference(&mut self, prefix: bool) { + let origlen = match self.len() { + None => return, + Some(len) => len, + }; + // Just give up now if our sequence contains an empty string. + if self.min_literal_len().map_or(false, |len| len == 0) { + // We squash the sequence so that nobody else gets any bright + // ideas to try and use it. An empty string implies a match at + // every position. A prefilter cannot help you here. + self.make_infinite(); + return; + } + // Make sure we start with the smallest sequence possible. We use a + // special version of preference minimization that retains exactness. + // This is legal because optimization is only expected to occur once + // extraction is complete. + if prefix { + if let Some(ref mut lits) = self.literals { + PreferenceTrie::minimize(lits, true); + } + } + + // Look for a common prefix (or suffix). If we found one of those and + // it's long enough, then it's a good bet that it will be our fastest + // possible prefilter since single-substring search is so fast. + let fix = if prefix { + self.longest_common_prefix() + } else { + self.longest_common_suffix() + }; + if let Some(fix) = fix { + // As a special case, if we have a common prefix and the leading + // byte of that prefix is one that we think probably occurs rarely, + // then strip everything down to just that single byte. This should + // promote the use of memchr. + // + // ... we only do this though if our sequence has more than one + // literal. Otherwise, we'd rather just stick with a single literal + // scan. That is, using memchr is probably better than looking + // for 2 or more literals, but probably not as good as a straight + // memmem search. + // + // ... and also only do this when the prefix is short and probably + // not too discriminatory anyway. If it's longer, then it's + // probably quite discriminatory and thus is likely to have a low + // false positive rate. + if prefix + && origlen > 1 + && fix.len() >= 1 + && fix.len() <= 3 + && rank(fix[0]) < 200 + { + self.keep_first_bytes(1); + self.dedup(); + return; + } + // We only strip down to the common prefix/suffix if we think + // the existing set of literals isn't great, or if the common + // prefix/suffix is expected to be particularly discriminatory. + let isfast = + self.is_exact() && self.len().map_or(false, |len| len <= 16); + let usefix = fix.len() > 4 || (fix.len() > 1 && !isfast); + if usefix { + // If we keep exactly the number of bytes equal to the length + // of the prefix (or suffix), then by the definition of a + // prefix, every literal in the sequence will be equivalent. + // Thus, 'dedup' will leave us with one literal. + // + // We do it this way to avoid an alloc, but also to make sure + // the exactness of literals is kept (or not). + if prefix { + self.keep_first_bytes(fix.len()); + } else { + self.keep_last_bytes(fix.len()); + } + self.dedup(); + assert_eq!(Some(1), self.len()); + // We still fall through here. In particular, we want our + // longest common prefix to be subject to the poison check. + } + } + // If we have an exact sequence, we *probably* just want to keep it + // as-is. But there are some cases where we don't. So we save a copy of + // the exact sequence now, and then try to do some more optimizations + // below. If those don't work out, we go back to this exact sequence. + // + // The specific motivation for this is that we sometimes wind up with + // an exact sequence with a hefty number of literals. Say, 100. If we + // stuck with that, it would be too big for Teddy and would result in + // using Aho-Corasick. Which is fine... but the lazy DFA is plenty + // suitable in such cases. The real issue is that we will wind up not + // using a fast prefilter at all. So in cases like this, even though + // we have an exact sequence, it would be better to try and shrink the + // sequence (which we do below) and use it as a prefilter that can + // produce false positive matches. + // + // But if the shrinking below results in a sequence that "sucks," then + // we don't want to use that because we already have an exact sequence + // in hand. + let exact: Option<Seq> = + if self.is_exact() { Some(self.clone()) } else { None }; + // Now we attempt to shorten the sequence. The idea here is that we + // don't want to look for too many literals, but we want to shorten + // our sequence enough to improve our odds of using better algorithms + // downstream (such as Teddy). + // + // The pair of numbers in this list corresponds to the maximal prefix + // (in bytes) to keep for all literals and the length of the sequence + // at which to do it. + // + // So for example, the pair (3, 500) would mean, "if we have more than + // 500 literals in our sequence, then truncate all of our literals + // such that they are at most 3 bytes in length and the minimize the + // sequence." + const ATTEMPTS: [(usize, usize); 5] = + [(5, 10), (4, 10), (3, 64), (2, 64), (1, 10)]; + for (keep, limit) in ATTEMPTS { + let len = match self.len() { + None => break, + Some(len) => len, + }; + if len <= limit { + break; + } + if prefix { + self.keep_first_bytes(keep); + } else { + self.keep_last_bytes(keep); + } + if prefix { + if let Some(ref mut lits) = self.literals { + PreferenceTrie::minimize(lits, true); + } + } + } + // Check for a poison literal. A poison literal is one that is short + // and is believed to have a very high match count. These poisons + // generally lead to a prefilter with a very high false positive rate, + // and thus overall worse performance. + // + // We do this last because we could have gone from a non-poisonous + // sequence to a poisonous one. Perhaps we should add some code to + // prevent such transitions in the first place, but then again, we + // likely only made the transition in the first place if the sequence + // was itself huge. And huge sequences are themselves poisonous. So... + if let Some(lits) = self.literals() { + if lits.iter().any(|lit| lit.is_poisonous()) { + self.make_infinite(); + } + } + // OK, if we had an exact sequence before attempting more optimizations + // above and our post-optimized sequence sucks for some reason or + // another, then we go back to the exact sequence. + if let Some(exact) = exact { + // If optimizing resulted in dropping our literals, then certainly + // backup and use the exact sequence that we had. + if !self.is_finite() { + *self = exact; + return; + } + // If our optimized sequence contains a short literal, then it's + // *probably* not so great. So throw it away and revert to the + // exact sequence. + if self.min_literal_len().map_or(true, |len| len <= 2) { + *self = exact; + return; + } + // Finally, if our optimized sequence is "big" (i.e., can't use + // Teddy), then also don't use it and rely on the exact sequence. + if self.len().map_or(true, |len| len > 64) { + *self = exact; + return; + } + } + } +} + +impl core::fmt::Debug for Seq { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "Seq")?; + if let Some(lits) = self.literals() { + f.debug_list().entries(lits.iter()).finish() + } else { + write!(f, "[∞]") + } + } +} + +impl FromIterator<Literal> for Seq { + fn from_iter<T: IntoIterator<Item = Literal>>(it: T) -> Seq { + let mut seq = Seq::empty(); + for literal in it { + seq.push(literal); + } + seq + } +} + +/// A single literal extracted from an [`Hir`] expression. +/// +/// A literal is composed of two things: +/// +/// * A sequence of bytes. No guarantees with respect to UTF-8 are provided. +/// In particular, even if the regex a literal is extracted from is UTF-8, the +/// literal extracted may not be valid UTF-8. (For example, if an [`Extractor`] +/// limit resulted in trimming a literal in a way that splits a codepoint.) +/// * Whether the literal is "exact" or not. An "exact" literal means that it +/// has not been trimmed, and may continue to be extended. If a literal is +/// "exact" after visiting the entire `Hir` expression, then this implies that +/// the literal leads to a match state. (Although it doesn't necessarily imply +/// all occurrences of the literal correspond to a match of the regex, since +/// literal extraction ignores look-around assertions.) +#[derive(Clone, Eq, PartialEq, PartialOrd, Ord)] +pub struct Literal { + bytes: Vec<u8>, + exact: bool, +} + +impl Literal { + /// Returns a new exact literal containing the bytes given. + #[inline] + pub fn exact<B: Into<Vec<u8>>>(bytes: B) -> Literal { + Literal { bytes: bytes.into(), exact: true } + } + + /// Returns a new inexact literal containing the bytes given. + #[inline] + pub fn inexact<B: Into<Vec<u8>>>(bytes: B) -> Literal { + Literal { bytes: bytes.into(), exact: false } + } + + /// Returns the bytes in this literal. + #[inline] + pub fn as_bytes(&self) -> &[u8] { + &self.bytes + } + + /// Yields ownership of the bytes inside this literal. + /// + /// Note that this throws away whether the literal is "exact" or not. + #[inline] + pub fn into_bytes(self) -> Vec<u8> { + self.bytes + } + + /// Returns the length of this literal in bytes. + #[inline] + pub fn len(&self) -> usize { + self.as_bytes().len() + } + + /// Returns true if and only if this literal has zero bytes. + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns true if and only if this literal is exact. + #[inline] + pub fn is_exact(&self) -> bool { + self.exact + } + + /// Marks this literal as inexact. + /// + /// Inexact literals can never be extended. For example, + /// [`Seq::cross_forward`] will not extend inexact literals. + #[inline] + pub fn make_inexact(&mut self) { + self.exact = false; + } + + /// Reverse the bytes in this literal. + #[inline] + pub fn reverse(&mut self) { + self.bytes.reverse(); + } + + /// Extend this literal with the literal given. + /// + /// If this literal is inexact, then this is a no-op. + #[inline] + pub fn extend(&mut self, lit: &Literal) { + if !self.is_exact() { + return; + } + self.bytes.extend_from_slice(&lit.bytes); + } + + /// Trims this literal such that only the first `len` bytes remain. If + /// this literal has fewer than `len` bytes, then it remains unchanged. + /// Otherwise, the literal is marked as inexact. + #[inline] + pub fn keep_first_bytes(&mut self, len: usize) { + if len >= self.len() { + return; + } + self.make_inexact(); + self.bytes.truncate(len); + } + + /// Trims this literal such that only the last `len` bytes remain. If this + /// literal has fewer than `len` bytes, then it remains unchanged. + /// Otherwise, the literal is marked as inexact. + #[inline] + pub fn keep_last_bytes(&mut self, len: usize) { + if len >= self.len() { + return; + } + self.make_inexact(); + self.bytes.drain(..self.len() - len); + } + + /// Returns true if it is believe that this literal is likely to match very + /// frequently, and is thus not a good candidate for a prefilter. + fn is_poisonous(&self) -> bool { + self.is_empty() || (self.len() == 1 && rank(self.as_bytes()[0]) >= 250) + } +} + +impl From<u8> for Literal { + fn from(byte: u8) -> Literal { + Literal::exact(vec![byte]) + } +} + +impl From<char> for Literal { + fn from(ch: char) -> Literal { + use alloc::string::ToString; + Literal::exact(ch.encode_utf8(&mut [0; 4]).to_string()) + } +} + +impl AsRef<[u8]> for Literal { + fn as_ref(&self) -> &[u8] { + self.as_bytes() + } +} + +impl core::fmt::Debug for Literal { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let tag = if self.exact { "E" } else { "I" }; + f.debug_tuple(tag) + .field(&crate::debug::Bytes(self.as_bytes())) + .finish() + } +} + +/// A "preference" trie that rejects literals that will never match when +/// executing a leftmost first or "preference" search. +/// +/// For example, if 'sam' is inserted, then trying to insert 'samwise' will be +/// rejected because 'samwise' can never match since 'sam' will always take +/// priority. However, if 'samwise' is inserted first, then inserting 'sam' +/// after it is accepted. In this case, either 'samwise' or 'sam' can match in +/// a "preference" search. +/// +/// Note that we only use this trie as a "set." That is, given a sequence of +/// literals, we insert each one in order. An `insert` will reject a literal +/// if a prefix of that literal already exists in the trie. Thus, to rebuild +/// the "minimal" sequence, we simply only keep literals that were successfully +/// inserted. (Since we don't need traversal, one wonders whether we can make +/// some simplifications here, but I haven't given it a ton of thought and I've +/// never seen this show up on a profile. Because of the heuristic limits +/// imposed on literal extractions, the size of the inputs here is usually +/// very small.) +#[derive(Debug)] +struct PreferenceTrie { + /// The states in this trie. The index of a state in this vector is its ID. + states: Vec<State>, + /// This vec indicates which states are match states. It always has + /// the same length as `states` and is indexed by the same state ID. + /// A state with identifier `sid` is a match state if and only if + /// `matches[sid].is_some()`. The option contains the index of the literal + /// corresponding to the match. The index is offset by 1 so that it fits in + /// a NonZeroUsize. + matches: Vec<Option<NonZeroUsize>>, + /// The index to allocate to the next literal added to this trie. Starts at + /// 1 and increments by 1 for every literal successfully added to the trie. + next_literal_index: usize, +} + +/// A single state in a trie. Uses a sparse representation for its transitions. +#[derive(Debug, Default)] +struct State { + /// Sparse representation of the transitions out of this state. Transitions + /// are sorted by byte. There is at most one such transition for any + /// particular byte. + trans: Vec<(u8, usize)>, +} + +impl PreferenceTrie { + /// Minimizes the given sequence of literals while preserving preference + /// order semantics. + /// + /// When `keep_exact` is true, the exactness of every literal retained is + /// kept. This is useful when dealing with a fully extracted `Seq` that + /// only contains exact literals. In that case, we can keep all retained + /// literals as exact because we know we'll never need to match anything + /// after them and because any removed literals are guaranteed to never + /// match. + fn minimize(literals: &mut Vec<Literal>, keep_exact: bool) { + let mut trie = PreferenceTrie { + states: vec![], + matches: vec![], + next_literal_index: 1, + }; + let mut make_inexact = vec![]; + literals.retain_mut(|lit| match trie.insert(lit.as_bytes()) { + Ok(_) => true, + Err(i) => { + if !keep_exact { + make_inexact.push(i.checked_sub(1).unwrap()); + } + false + } + }); + for i in make_inexact { + literals[i].make_inexact(); + } + } + + /// Returns `Ok` if the given byte string is accepted into this trie and + /// `Err` otherwise. The index for the success case corresponds to the + /// index of the literal added. The index for the error case corresponds to + /// the index of the literal already in the trie that prevented the given + /// byte string from being added. (Which implies it is a prefix of the one + /// given.) + /// + /// In short, the byte string given is accepted into the trie if and only + /// if it is possible for it to match when executing a preference order + /// search. + fn insert(&mut self, bytes: &[u8]) -> Result<usize, usize> { + let mut prev = self.root(); + if let Some(idx) = self.matches[prev] { + return Err(idx.get()); + } + for &b in bytes.iter() { + match self.states[prev].trans.binary_search_by_key(&b, |t| t.0) { + Ok(i) => { + prev = self.states[prev].trans[i].1; + if let Some(idx) = self.matches[prev] { + return Err(idx.get()); + } + } + Err(i) => { + let next = self.create_state(); + self.states[prev].trans.insert(i, (b, next)); + prev = next; + } + } + } + let idx = self.next_literal_index; + self.next_literal_index += 1; + self.matches[prev] = NonZeroUsize::new(idx); + Ok(idx) + } + + /// Returns the root state ID, and if it doesn't exist, creates it. + fn root(&mut self) -> usize { + if !self.states.is_empty() { + 0 + } else { + self.create_state() + } + } + + /// Creates a new empty state and returns its ID. + fn create_state(&mut self) -> usize { + let id = self.states.len(); + self.states.push(State::default()); + self.matches.push(None); + id + } +} + +/// Returns the "rank" of the given byte. +/// +/// The minimum rank value is `0` and the maximum rank value is `255`. +/// +/// The rank of a byte is derived from a heuristic background distribution of +/// relative frequencies of bytes. The heuristic says that lower the rank of a +/// byte, the less likely that byte is to appear in any arbitrary haystack. +pub fn rank(byte: u8) -> u8 { + crate::rank::BYTE_FREQUENCIES[usize::from(byte)] +} + +#[cfg(test)] +mod tests { + use super::*; + + fn parse(pattern: &str) -> Hir { + crate::ParserBuilder::new().utf8(false).build().parse(pattern).unwrap() + } + + fn prefixes(pattern: &str) -> Seq { + Extractor::new().kind(ExtractKind::Prefix).extract(&parse(pattern)) + } + + fn suffixes(pattern: &str) -> Seq { + Extractor::new().kind(ExtractKind::Suffix).extract(&parse(pattern)) + } + + fn e(pattern: &str) -> (Seq, Seq) { + (prefixes(pattern), suffixes(pattern)) + } + + #[allow(non_snake_case)] + fn E(x: &str) -> Literal { + Literal::exact(x.as_bytes()) + } + + #[allow(non_snake_case)] + fn I(x: &str) -> Literal { + Literal::inexact(x.as_bytes()) + } + + fn seq<I: IntoIterator<Item = Literal>>(it: I) -> Seq { + Seq::from_iter(it) + } + + fn infinite() -> (Seq, Seq) { + (Seq::infinite(), Seq::infinite()) + } + + fn inexact<I1, I2>(it1: I1, it2: I2) -> (Seq, Seq) + where + I1: IntoIterator<Item = Literal>, + I2: IntoIterator<Item = Literal>, + { + (Seq::from_iter(it1), Seq::from_iter(it2)) + } + + fn exact<B: AsRef<[u8]>, I: IntoIterator<Item = B>>(it: I) -> (Seq, Seq) { + let s1 = Seq::new(it); + let s2 = s1.clone(); + (s1, s2) + } + + fn opt<B: AsRef<[u8]>, I: IntoIterator<Item = B>>(it: I) -> (Seq, Seq) { + let (mut p, mut s) = exact(it); + p.optimize_for_prefix_by_preference(); + s.optimize_for_suffix_by_preference(); + (p, s) + } + + #[test] + fn literal() { + assert_eq!(exact(["a"]), e("a")); + assert_eq!(exact(["aaaaa"]), e("aaaaa")); + assert_eq!(exact(["A", "a"]), e("(?i-u)a")); + assert_eq!(exact(["AB", "Ab", "aB", "ab"]), e("(?i-u)ab")); + assert_eq!(exact(["abC", "abc"]), e("ab(?i-u)c")); + + assert_eq!(exact([b"\xFF"]), e(r"(?-u:\xFF)")); + + #[cfg(feature = "unicode-case")] + { + assert_eq!(exact(["☃"]), e("☃")); + assert_eq!(exact(["☃"]), e("(?i)☃")); + assert_eq!(exact(["☃☃☃☃☃"]), e("☃☃☃☃☃")); + + assert_eq!(exact(["Δ"]), e("Δ")); + assert_eq!(exact(["δ"]), e("δ")); + assert_eq!(exact(["Δ", "δ"]), e("(?i)Δ")); + assert_eq!(exact(["Δ", "δ"]), e("(?i)δ")); + + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)S")); + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)s")); + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)ſ")); + } + + let letters = "ͱͳͷΐάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋ"; + assert_eq!(exact([letters]), e(letters)); + } + + #[test] + fn class() { + assert_eq!(exact(["a", "b", "c"]), e("[abc]")); + assert_eq!(exact(["a1b", "a2b", "a3b"]), e("a[123]b")); + assert_eq!(exact(["δ", "ε"]), e("[εδ]")); + #[cfg(feature = "unicode-case")] + { + assert_eq!(exact(["Δ", "Ε", "δ", "ε", "ϵ"]), e(r"(?i)[εδ]")); + } + } + + #[test] + fn look() { + assert_eq!(exact(["ab"]), e(r"a\Ab")); + assert_eq!(exact(["ab"]), e(r"a\zb")); + assert_eq!(exact(["ab"]), e(r"a(?m:^)b")); + assert_eq!(exact(["ab"]), e(r"a(?m:$)b")); + assert_eq!(exact(["ab"]), e(r"a\bb")); + assert_eq!(exact(["ab"]), e(r"a\Bb")); + assert_eq!(exact(["ab"]), e(r"a(?-u:\b)b")); + assert_eq!(exact(["ab"]), e(r"a(?-u:\B)b")); + + assert_eq!(exact(["ab"]), e(r"^ab")); + assert_eq!(exact(["ab"]), e(r"$ab")); + assert_eq!(exact(["ab"]), e(r"(?m:^)ab")); + assert_eq!(exact(["ab"]), e(r"(?m:$)ab")); + assert_eq!(exact(["ab"]), e(r"\bab")); + assert_eq!(exact(["ab"]), e(r"\Bab")); + assert_eq!(exact(["ab"]), e(r"(?-u:\b)ab")); + assert_eq!(exact(["ab"]), e(r"(?-u:\B)ab")); + + assert_eq!(exact(["ab"]), e(r"ab^")); + assert_eq!(exact(["ab"]), e(r"ab$")); + assert_eq!(exact(["ab"]), e(r"ab(?m:^)")); + assert_eq!(exact(["ab"]), e(r"ab(?m:$)")); + assert_eq!(exact(["ab"]), e(r"ab\b")); + assert_eq!(exact(["ab"]), e(r"ab\B")); + assert_eq!(exact(["ab"]), e(r"ab(?-u:\b)")); + assert_eq!(exact(["ab"]), e(r"ab(?-u:\B)")); + + let expected = (seq([I("aZ"), E("ab")]), seq([I("Zb"), E("ab")])); + assert_eq!(expected, e(r"^aZ*b")); + } + + #[test] + fn repetition() { + assert_eq!(exact(["a", ""]), e(r"a?")); + assert_eq!(exact(["", "a"]), e(r"a??")); + assert_eq!(inexact([I("a"), E("")], [I("a"), E("")]), e(r"a*")); + assert_eq!(inexact([E(""), I("a")], [E(""), I("a")]), e(r"a*?")); + assert_eq!(inexact([I("a")], [I("a")]), e(r"a+")); + assert_eq!(inexact([I("a")], [I("a")]), e(r"(a+)+")); + + assert_eq!(exact(["ab"]), e(r"aZ{0}b")); + assert_eq!(exact(["aZb", "ab"]), e(r"aZ?b")); + assert_eq!(exact(["ab", "aZb"]), e(r"aZ??b")); + assert_eq!( + inexact([I("aZ"), E("ab")], [I("Zb"), E("ab")]), + e(r"aZ*b") + ); + assert_eq!( + inexact([E("ab"), I("aZ")], [E("ab"), I("Zb")]), + e(r"aZ*?b") + ); + assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+b")); + assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+?b")); + + assert_eq!(exact(["aZZb"]), e(r"aZ{2}b")); + assert_eq!(inexact([I("aZZ")], [I("ZZb")]), e(r"aZ{2,3}b")); + + assert_eq!(exact(["abc", ""]), e(r"(abc)?")); + assert_eq!(exact(["", "abc"]), e(r"(abc)??")); + + assert_eq!(inexact([I("a"), E("b")], [I("ab"), E("b")]), e(r"a*b")); + assert_eq!(inexact([E("b"), I("a")], [E("b"), I("ab")]), e(r"a*?b")); + assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+")); + assert_eq!(inexact([I("a"), I("b")], [I("b")]), e(r"a*b+")); + + // FIXME: The suffixes for this don't look quite right to me. I think + // the right suffixes would be: [I(ac), I(bc), E(c)]. The main issue I + // think is that suffixes are computed by iterating over concatenations + // in reverse, and then [bc, ac, c] ordering is indeed correct from + // that perspective. We also test a few more equivalent regexes, and + // we get the same result, so it is consistent at least I suppose. + // + // The reason why this isn't an issue is that it only messes up + // preference order, and currently, suffixes are never used in a + // context where preference order matters. For prefixes it matters + // because we sometimes want to use prefilters without confirmation + // when all of the literals are exact (and there's no look-around). But + // we never do that for suffixes. Any time we use suffixes, we always + // include a confirmation step. If that ever changes, then it's likely + // this bug will need to be fixed, but last time I looked, it appears + // hard to do so. + assert_eq!( + inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), + e(r"a*b*c") + ); + assert_eq!( + inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), + e(r"(a+)?(b+)?c") + ); + assert_eq!( + inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), + e(r"(a+|)(b+|)c") + ); + // A few more similarish but not identical regexes. These may have a + // similar problem as above. + assert_eq!( + inexact( + [I("a"), I("b"), I("c"), E("")], + [I("c"), I("b"), I("a"), E("")] + ), + e(r"a*b*c*") + ); + assert_eq!(inexact([I("a"), I("b"), I("c")], [I("c")]), e(r"a*b*c+")); + assert_eq!(inexact([I("a"), I("b")], [I("bc")]), e(r"a*b+c")); + assert_eq!(inexact([I("a"), I("b")], [I("c"), I("b")]), e(r"a*b+c*")); + assert_eq!(inexact([I("ab"), E("a")], [I("b"), E("a")]), e(r"ab*")); + assert_eq!( + inexact([I("ab"), E("ac")], [I("bc"), E("ac")]), + e(r"ab*c") + ); + assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+")); + assert_eq!(inexact([I("ab")], [I("bc")]), e(r"ab+c")); + + assert_eq!( + inexact([I("z"), E("azb")], [I("zazb"), E("azb")]), + e(r"z*azb") + ); + + let expected = + exact(["aaa", "aab", "aba", "abb", "baa", "bab", "bba", "bbb"]); + assert_eq!(expected, e(r"[ab]{3}")); + let expected = inexact( + [ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb"), + ], + [ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb"), + ], + ); + assert_eq!(expected, e(r"[ab]{3,4}")); + } + + #[test] + fn concat() { + let empty: [&str; 0] = []; + + assert_eq!(exact(["abcxyz"]), e(r"abc()xyz")); + assert_eq!(exact(["abcxyz"]), e(r"(abc)(xyz)")); + assert_eq!(exact(["abcmnoxyz"]), e(r"abc()mno()xyz")); + assert_eq!(exact(empty), e(r"abc[a&&b]xyz")); + assert_eq!(exact(["abcxyz"]), e(r"abc[a&&b]*xyz")); + } + + #[test] + fn alternation() { + assert_eq!(exact(["abc", "mno", "xyz"]), e(r"abc|mno|xyz")); + assert_eq!( + inexact( + [E("abc"), I("mZ"), E("mo"), E("xyz")], + [E("abc"), I("Zo"), E("mo"), E("xyz")] + ), + e(r"abc|mZ*o|xyz") + ); + assert_eq!(exact(["abc", "xyz"]), e(r"abc|M[a&&b]N|xyz")); + assert_eq!(exact(["abc", "MN", "xyz"]), e(r"abc|M[a&&b]*N|xyz")); + + assert_eq!(exact(["aaa", "aaaaa"]), e(r"(?:|aa)aaa")); + assert_eq!( + inexact( + [I("aaa"), E(""), I("aaaaa"), E("aa")], + [I("aaa"), E(""), E("aa")] + ), + e(r"(?:|aa)(?:aaa)*") + ); + assert_eq!( + inexact( + [E(""), I("aaa"), E("aa"), I("aaaaa")], + [E(""), I("aaa"), E("aa")] + ), + e(r"(?:|aa)(?:aaa)*?") + ); + + assert_eq!( + inexact([E("a"), I("b"), E("")], [E("a"), I("b"), E("")]), + e(r"a|b*") + ); + assert_eq!(inexact([E("a"), I("b")], [E("a"), I("b")]), e(r"a|b+")); + + assert_eq!( + inexact([I("a"), E("b"), E("c")], [I("ab"), E("b"), E("c")]), + e(r"a*b|c") + ); + + assert_eq!( + inexact( + [E("a"), E("b"), I("c"), E("")], + [E("a"), E("b"), I("c"), E("")] + ), + e(r"a|(?:b|c*)") + ); + + assert_eq!( + inexact( + [I("a"), I("b"), E("c"), I("a"), I("ab"), E("c")], + [I("ac"), I("bc"), E("c"), I("ac"), I("abc"), E("c")], + ), + e(r"(a|b)*c|(a|ab)*c") + ); + + assert_eq!( + exact(["abef", "abgh", "cdef", "cdgh"]), + e(r"(ab|cd)(ef|gh)") + ); + assert_eq!( + exact([ + "abefij", "abefkl", "abghij", "abghkl", "cdefij", "cdefkl", + "cdghij", "cdghkl", + ]), + e(r"(ab|cd)(ef|gh)(ij|kl)") + ); + + assert_eq!(inexact([E("abab")], [E("abab")]), e(r"(ab){2}")); + + assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,3}")); + + assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,}")); + } + + #[test] + fn impossible() { + let empty: [&str; 0] = []; + + assert_eq!(exact(empty), e(r"[a&&b]")); + assert_eq!(exact(empty), e(r"a[a&&b]")); + assert_eq!(exact(empty), e(r"[a&&b]b")); + assert_eq!(exact(empty), e(r"a[a&&b]b")); + assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]|b")); + assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]|b")); + assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]d|b")); + assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]d|b")); + assert_eq!(exact([""]), e(r"[a&&b]*")); + assert_eq!(exact(["MN"]), e(r"M[a&&b]*N")); + } + + // This tests patterns that contain something that defeats literal + // detection, usually because it would blow some limit on the total number + // of literals that can be returned. + // + // The main idea is that when literal extraction sees something that + // it knows will blow a limit, it replaces it with a marker that says + // "any literal will match here." While not necessarily true, the + // over-estimation is just fine for the purposes of literal extraction, + // because the imprecision doesn't matter: too big is too big. + // + // This is one of the trickier parts of literal extraction, since we need + // to make sure all of our literal extraction operations correctly compose + // with the markers. + #[test] + fn anything() { + assert_eq!(infinite(), e(r".")); + assert_eq!(infinite(), e(r"(?s).")); + assert_eq!(infinite(), e(r"[A-Za-z]")); + assert_eq!(infinite(), e(r"[A-Z]")); + assert_eq!(exact([""]), e(r"[A-Z]{0}")); + assert_eq!(infinite(), e(r"[A-Z]?")); + assert_eq!(infinite(), e(r"[A-Z]*")); + assert_eq!(infinite(), e(r"[A-Z]+")); + assert_eq!((seq([I("1")]), Seq::infinite()), e(r"1[A-Z]")); + assert_eq!((seq([I("1")]), seq([I("2")])), e(r"1[A-Z]2")); + assert_eq!((Seq::infinite(), seq([I("123")])), e(r"[A-Z]+123")); + assert_eq!(infinite(), e(r"[A-Z]+123[A-Z]+")); + assert_eq!(infinite(), e(r"1|[A-Z]|3")); + assert_eq!( + (seq([E("1"), I("2"), E("3")]), Seq::infinite()), + e(r"1|2[A-Z]|3"), + ); + assert_eq!( + (Seq::infinite(), seq([E("1"), I("2"), E("3")])), + e(r"1|[A-Z]2|3"), + ); + assert_eq!( + (seq([E("1"), I("2"), E("4")]), seq([E("1"), I("3"), E("4")])), + e(r"1|2[A-Z]3|4"), + ); + assert_eq!((Seq::infinite(), seq([I("2")])), e(r"(?:|1)[A-Z]2")); + assert_eq!(inexact([I("a")], [I("z")]), e(r"a.z")); + } + + // Like the 'anything' test, but it uses smaller limits in order to test + // the logic for effectively aborting literal extraction when the seqs get + // too big. + #[test] + fn anything_small_limits() { + fn prefixes(pattern: &str) -> Seq { + Extractor::new() + .kind(ExtractKind::Prefix) + .limit_total(10) + .extract(&parse(pattern)) + } + + fn suffixes(pattern: &str) -> Seq { + Extractor::new() + .kind(ExtractKind::Suffix) + .limit_total(10) + .extract(&parse(pattern)) + } + + fn e(pattern: &str) -> (Seq, Seq) { + (prefixes(pattern), suffixes(pattern)) + } + + assert_eq!( + ( + seq([ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb") + ]), + seq([ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb") + ]) + ), + e(r"[ab]{3}{3}") + ); + + assert_eq!(infinite(), e(r"ab|cd|ef|gh|ij|kl|mn|op|qr|st|uv|wx|yz")); + } + + #[test] + fn empty() { + assert_eq!(exact([""]), e(r"")); + assert_eq!(exact([""]), e(r"^")); + assert_eq!(exact([""]), e(r"$")); + assert_eq!(exact([""]), e(r"(?m:^)")); + assert_eq!(exact([""]), e(r"(?m:$)")); + assert_eq!(exact([""]), e(r"\b")); + assert_eq!(exact([""]), e(r"\B")); + assert_eq!(exact([""]), e(r"(?-u:\b)")); + assert_eq!(exact([""]), e(r"(?-u:\B)")); + } + + #[test] + fn odds_and_ends() { + assert_eq!((Seq::infinite(), seq([I("a")])), e(r".a")); + assert_eq!((seq([I("a")]), Seq::infinite()), e(r"a.")); + assert_eq!(infinite(), e(r"a|.")); + assert_eq!(infinite(), e(r".|a")); + + let pat = r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]"; + let expected = inexact( + ["Mo'am", "Moam", "Mu'am", "Muam"].map(I), + [ + "ddafi", "ddafy", "dhafi", "dhafy", "dzafi", "dzafy", "dafi", + "dafy", "tdafi", "tdafy", "thafi", "thafy", "tzafi", "tzafy", + "tafi", "tafy", "zdafi", "zdafy", "zhafi", "zhafy", "zzafi", + "zzafy", "zafi", "zafy", + ] + .map(I), + ); + assert_eq!(expected, e(pat)); + + assert_eq!( + (seq(["fn is_", "fn as_"].map(I)), Seq::infinite()), + e(r"fn is_([A-Z]+)|fn as_([A-Z]+)"), + ); + assert_eq!( + inexact([I("foo")], [I("quux")]), + e(r"foo[A-Z]+bar[A-Z]+quux") + ); + assert_eq!(infinite(), e(r"[A-Z]+bar[A-Z]+")); + assert_eq!( + exact(["Sherlock Holmes"]), + e(r"(?m)^Sherlock Holmes|Sherlock Holmes$") + ); + + assert_eq!(exact(["sa", "sb"]), e(r"\bs(?:[ab])")); + } + + // This tests a specific regex along with some heuristic steps to reduce + // the sequences extracted. This is meant to roughly correspond to the + // types of heuristics used to shrink literal sets in practice. (Shrinking + // is done because you want to balance "spend too much work looking for + // too many literals" and "spend too much work processing false positive + // matches from short literals.") + #[test] + #[cfg(feature = "unicode-case")] + fn holmes() { + let expected = inexact( + ["HOL", "HOl", "HoL", "Hol", "hOL", "hOl", "hoL", "hol"].map(I), + [ + "MES", "MEs", "Eſ", "MeS", "Mes", "eſ", "mES", "mEs", "meS", + "mes", + ] + .map(I), + ); + let (mut prefixes, mut suffixes) = e(r"(?i)Holmes"); + prefixes.keep_first_bytes(3); + suffixes.keep_last_bytes(3); + prefixes.minimize_by_preference(); + suffixes.minimize_by_preference(); + assert_eq!(expected, (prefixes, suffixes)); + } + + // This tests that we get some kind of literals extracted for a beefier + // alternation with case insensitive mode enabled. At one point during + // development, this returned nothing, and motivated some special case + // code in Extractor::union to try and trim down the literal sequences + // if the union would blow the limits set. + #[test] + #[cfg(feature = "unicode-case")] + fn holmes_alt() { + let mut pre = + prefixes(r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker"); + assert!(pre.len().unwrap() > 0); + pre.optimize_for_prefix_by_preference(); + assert!(pre.len().unwrap() > 0); + } + + // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 + // See: CVE-2022-24713 + // + // We test this here to ensure literal extraction completes in reasonable + // time and isn't materially impacted by these sorts of pathological + // repeats. + #[test] + fn crazy_repeats() { + assert_eq!(inexact([E("")], [E("")]), e(r"(?:){4294967295}")); + assert_eq!( + inexact([E("")], [E("")]), + e(r"(?:){64}{64}{64}{64}{64}{64}") + ); + assert_eq!(inexact([E("")], [E("")]), e(r"x{0}{4294967295}")); + assert_eq!(inexact([E("")], [E("")]), e(r"(?:|){4294967295}")); + + assert_eq!( + inexact([E("")], [E("")]), + e(r"(?:){8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") + ); + let repa = "a".repeat(100); + assert_eq!( + inexact([I(&repa)], [I(&repa)]), + e(r"a{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") + ); + } + + #[test] + fn huge() { + let pat = r#"(?-u) + 2(?: + [45]\d{3}| + 7(?: + 1[0-267]| + 2[0-289]| + 3[0-29]| + 4[01]| + 5[1-3]| + 6[013]| + 7[0178]| + 91 + )| + 8(?: + 0[125]| + [139][1-6]| + 2[0157-9]| + 41| + 6[1-35]| + 7[1-5]| + 8[1-8]| + 90 + )| + 9(?: + 0[0-2]| + 1[0-4]| + 2[568]| + 3[3-6]| + 5[5-7]| + 6[0167]| + 7[15]| + 8[0146-9] + ) + )\d{4}| + 3(?: + 12?[5-7]\d{2}| + 0(?: + 2(?: + [025-79]\d| + [348]\d{1,2} + )| + 3(?: + [2-4]\d| + [56]\d? + ) + )| + 2(?: + 1\d{2}| + 2(?: + [12]\d| + [35]\d{1,2}| + 4\d? + ) + )| + 3(?: + 1\d{2}| + 2(?: + [2356]\d| + 4\d{1,2} + ) + )| + 4(?: + 1\d{2}| + 2(?: + 2\d{1,2}| + [47]| + 5\d{2} + ) + )| + 5(?: + 1\d{2}| + 29 + )| + [67]1\d{2}| + 8(?: + 1\d{2}| + 2(?: + 2\d{2}| + 3| + 4\d + ) + ) + )\d{3}| + 4(?: + 0(?: + 2(?: + [09]\d| + 7 + )| + 33\d{2} + )| + 1\d{3}| + 2(?: + 1\d{2}| + 2(?: + [25]\d?| + [348]\d| + [67]\d{1,2} + ) + )| + 3(?: + 1\d{2}(?: + \d{2} + )?| + 2(?: + [045]\d| + [236-9]\d{1,2} + )| + 32\d{2} + )| + 4(?: + [18]\d{2}| + 2(?: + [2-46]\d{2}| + 3 + )| + 5[25]\d{2} + )| + 5(?: + 1\d{2}| + 2(?: + 3\d| + 5 + ) + )| + 6(?: + [18]\d{2}| + 2(?: + 3(?: + \d{2} + )?| + [46]\d{1,2}| + 5\d{2}| + 7\d + )| + 5(?: + 3\d?| + 4\d| + [57]\d{1,2}| + 6\d{2}| + 8 + ) + )| + 71\d{2}| + 8(?: + [18]\d{2}| + 23\d{2}| + 54\d{2} + )| + 9(?: + [18]\d{2}| + 2[2-5]\d{2}| + 53\d{1,2} + ) + )\d{3}| + 5(?: + 02[03489]\d{2}| + 1\d{2}| + 2(?: + 1\d{2}| + 2(?: + 2(?: + \d{2} + )?| + [457]\d{2} + ) + )| + 3(?: + 1\d{2}| + 2(?: + [37](?: + \d{2} + )?| + [569]\d{2} + ) + )| + 4(?: + 1\d{2}| + 2[46]\d{2} + )| + 5(?: + 1\d{2}| + 26\d{1,2} + )| + 6(?: + [18]\d{2}| + 2| + 53\d{2} + )| + 7(?: + 1| + 24 + )\d{2}| + 8(?: + 1| + 26 + )\d{2}| + 91\d{2} + )\d{3}| + 6(?: + 0(?: + 1\d{2}| + 2(?: + 3\d{2}| + 4\d{1,2} + ) + )| + 2(?: + 2[2-5]\d{2}| + 5(?: + [3-5]\d{2}| + 7 + )| + 8\d{2} + )| + 3(?: + 1| + 2[3478] + )\d{2}| + 4(?: + 1| + 2[34] + )\d{2}| + 5(?: + 1| + 2[47] + )\d{2}| + 6(?: + [18]\d{2}| + 6(?: + 2(?: + 2\d| + [34]\d{2} + )| + 5(?: + [24]\d{2}| + 3\d| + 5\d{1,2} + ) + ) + )| + 72[2-5]\d{2}| + 8(?: + 1\d{2}| + 2[2-5]\d{2} + )| + 9(?: + 1\d{2}| + 2[2-6]\d{2} + ) + )\d{3}| + 7(?: + (?: + 02| + [3-589]1| + 6[12]| + 72[24] + )\d{2}| + 21\d{3}| + 32 + )\d{3}| + 8(?: + (?: + 4[12]| + [5-7]2| + 1\d? + )| + (?: + 0| + 3[12]| + [5-7]1| + 217 + )\d + )\d{4}| + 9(?: + [35]1| + (?: + [024]2| + 81 + )\d| + (?: + 1| + [24]1 + )\d{2} + )\d{3} + "#; + // TODO: This is a good candidate of a seq of literals that could be + // shrunk quite a bit and still be very productive with respect to + // literal optimizations. + let (prefixes, suffixes) = e(pat); + assert!(!suffixes.is_finite()); + assert_eq!(Some(243), prefixes.len()); + } + + #[test] + fn optimize() { + // This gets a common prefix that isn't too short. + let (p, s) = + opt(["foobarfoobar", "foobar", "foobarzfoobar", "foobarfoobar"]); + assert_eq!(seq([I("foobar")]), p); + assert_eq!(seq([I("foobar")]), s); + + // This also finds a common prefix, but since it's only one byte, it + // prefers the multiple literals. + let (p, s) = opt(["abba", "akka", "abccba"]); + assert_eq!(exact(["abba", "akka", "abccba"]), (p, s)); + + let (p, s) = opt(["sam", "samwise"]); + assert_eq!((seq([E("sam")]), seq([E("sam"), E("samwise")])), (p, s)); + + // The empty string is poisonous, so our seq becomes infinite, even + // though all literals are exact. + let (p, s) = opt(["foobarfoo", "foo", "", "foozfoo", "foofoo"]); + assert!(!p.is_finite()); + assert!(!s.is_finite()); + + // A space is also poisonous, so our seq becomes infinite. But this + // only gets triggered when we don't have a completely exact sequence. + // When the sequence is exact, spaces are okay, since we presume that + // any prefilter will match a space more quickly than the regex engine. + // (When the sequence is exact, there's a chance of the prefilter being + // used without needing the regex engine at all.) + let mut p = seq([E("foobarfoo"), I("foo"), E(" "), E("foofoo")]); + p.optimize_for_prefix_by_preference(); + assert!(!p.is_finite()); + } +} diff --git a/vendor/regex-syntax/src/hir/mod.rs b/vendor/regex-syntax/src/hir/mod.rs new file mode 100644 index 0000000..ce38ead --- /dev/null +++ b/vendor/regex-syntax/src/hir/mod.rs @@ -0,0 +1,3861 @@ +/*! +Defines a high-level intermediate (HIR) representation for regular expressions. + +The HIR is represented by the [`Hir`] type, and it principally constructed via +[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users +may use the smart constructors defined on `Hir` to build their own by hand. The +smart constructors simultaneously simplify and "optimize" the HIR, and are also +the same routines used by translation. + +Most regex engines only have an HIR like this, and usually construct it +directly from the concrete syntax. This crate however first parses the +concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`, +as mentioned above. It's done this way to facilitate better error reporting, +and to have a structured representation of a regex that faithfully represents +its concrete syntax. Namely, while an `Hir` value can be converted back to an +equivalent regex pattern string, it is unlikely to look like the original due +to its simplified structure. +*/ + +use core::{char, cmp}; + +use alloc::{ + boxed::Box, + format, + string::{String, ToString}, + vec, + vec::Vec, +}; + +use crate::{ + ast::Span, + hir::interval::{Interval, IntervalSet, IntervalSetIter}, + unicode, +}; + +pub use crate::{ + hir::visitor::{visit, Visitor}, + unicode::CaseFoldError, +}; + +mod interval; +pub mod literal; +pub mod print; +pub mod translate; +mod visitor; + +/// An error that can occur while translating an `Ast` to a `Hir`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Error { + /// The kind of error. + kind: ErrorKind, + /// The original pattern that the translator's Ast was parsed from. Every + /// span in an error is a valid range into this string. + pattern: String, + /// The span of this error, derived from the Ast given to the translator. + span: Span, +} + +impl Error { + /// Return the type of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + /// The original pattern string in which this error occurred. + /// + /// Every span reported by this error is reported in terms of this string. + pub fn pattern(&self) -> &str { + &self.pattern + } + + /// Return the span at which this error occurred. + pub fn span(&self) -> &Span { + &self.span + } +} + +/// The type of an error that occurred while building an `Hir`. +/// +/// This error type is marked as `non_exhaustive`. This means that adding a +/// new variant is not considered a breaking change. +#[non_exhaustive] +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ErrorKind { + /// This error occurs when a Unicode feature is used when Unicode + /// support is disabled. For example `(?-u:\pL)` would trigger this error. + UnicodeNotAllowed, + /// This error occurs when translating a pattern that could match a byte + /// sequence that isn't UTF-8 and `utf8` was enabled. + InvalidUtf8, + /// This error occurs when one uses a non-ASCII byte for a line terminator, + /// but where Unicode mode is enabled and UTF-8 mode is disabled. + InvalidLineTerminator, + /// This occurs when an unrecognized Unicode property name could not + /// be found. + UnicodePropertyNotFound, + /// This occurs when an unrecognized Unicode property value could not + /// be found. + UnicodePropertyValueNotFound, + /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or + /// `\d`) could not be found. This can occur when the `unicode-perl` + /// crate feature is not enabled. + UnicodePerlClassNotFound, + /// This occurs when the Unicode simple case mapping tables are not + /// available, and the regular expression required Unicode aware case + /// insensitivity. + UnicodeCaseUnavailable, +} + +#[cfg(feature = "std")] +impl std::error::Error for Error {} + +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + crate::error::Formatter::from(self).fmt(f) + } +} + +impl core::fmt::Display for ErrorKind { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + use self::ErrorKind::*; + + let msg = match *self { + UnicodeNotAllowed => "Unicode not allowed here", + InvalidUtf8 => "pattern can match invalid UTF-8", + InvalidLineTerminator => "invalid line terminator, must be ASCII", + UnicodePropertyNotFound => "Unicode property not found", + UnicodePropertyValueNotFound => "Unicode property value not found", + UnicodePerlClassNotFound => { + "Unicode-aware Perl class not found \ + (make sure the unicode-perl feature is enabled)" + } + UnicodeCaseUnavailable => { + "Unicode-aware case insensitivity matching is not available \ + (make sure the unicode-case feature is enabled)" + } + }; + f.write_str(msg) + } +} + +/// A high-level intermediate representation (HIR) for a regular expression. +/// +/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`]. +/// An `HirKind` indicates what kind of regular expression it is (a literal, +/// a repetition, a look-around assertion, etc.), where as a `Properties` +/// describes various facts about the regular expression. For example, whether +/// it matches UTF-8 or if it matches the empty string. +/// +/// The HIR of a regular expression represents an intermediate step between +/// its abstract syntax (a structured description of the concrete syntax) and +/// an actual regex matcher. The purpose of HIR is to make regular expressions +/// easier to analyze. In particular, the AST is much more complex than the +/// HIR. For example, while an AST supports arbitrarily nested character +/// classes, the HIR will flatten all nested classes into a single set. The HIR +/// will also "compile away" every flag present in the concrete syntax. For +/// example, users of HIR expressions never need to worry about case folding; +/// it is handled automatically by the translator (e.g., by translating +/// `(?i:A)` to `[aA]`). +/// +/// The specific type of an HIR expression can be accessed via its `kind` +/// or `into_kind` methods. This extra level of indirection exists for two +/// reasons: +/// +/// 1. Construction of an HIR expression *must* use the constructor methods on +/// this `Hir` type instead of building the `HirKind` values directly. This +/// permits construction to enforce invariants like "concatenations always +/// consist of two or more sub-expressions." +/// 2. Every HIR expression contains attributes that are defined inductively, +/// and can be computed cheaply during the construction process. For example, +/// one such attribute is whether the expression must match at the beginning of +/// the haystack. +/// +/// In particular, if you have an `HirKind` value, then there is intentionally +/// no way to build an `Hir` value from it. You instead need to do case +/// analysis on the `HirKind` value and build the `Hir` value using its smart +/// constructors. +/// +/// # UTF-8 +/// +/// If the HIR was produced by a translator with +/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled, +/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty +/// matches. +/// +/// For empty matches, those can occur at any position. It is the +/// responsibility of the regex engine to determine whether empty matches are +/// permitted between the code units of a single codepoint. +/// +/// # Stack space +/// +/// This type defines its own destructor that uses constant stack space and +/// heap space proportional to the size of the HIR. +/// +/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular +/// expression pattern string, and uses constant stack space and heap space +/// proportional to the size of the `Hir`. The regex it prints is guaranteed to +/// be _semantically_ equivalent to the original concrete syntax, but it may +/// look very different. (And potentially not practically readable by a human.) +/// +/// An `Hir`'s `fmt::Debug` implementation currently does not use constant +/// stack space. The implementation will also suppress some details (such as +/// the `Properties` inlined into every `Hir` value to make it less noisy). +#[derive(Clone, Eq, PartialEq)] +pub struct Hir { + /// The underlying HIR kind. + kind: HirKind, + /// Analysis info about this HIR, computed during construction. + props: Properties, +} + +/// Methods for accessing the underlying `HirKind` and `Properties`. +impl Hir { + /// Returns a reference to the underlying HIR kind. + pub fn kind(&self) -> &HirKind { + &self.kind + } + + /// Consumes ownership of this HIR expression and returns its underlying + /// `HirKind`. + pub fn into_kind(mut self) -> HirKind { + core::mem::replace(&mut self.kind, HirKind::Empty) + } + + /// Returns the properties computed for this `Hir`. + pub fn properties(&self) -> &Properties { + &self.props + } + + /// Splits this HIR into its constituent parts. + /// + /// This is useful because `let Hir { kind, props } = hir;` does not work + /// because of `Hir`'s custom `Drop` implementation. + fn into_parts(mut self) -> (HirKind, Properties) { + ( + core::mem::replace(&mut self.kind, HirKind::Empty), + core::mem::replace(&mut self.props, Properties::empty()), + ) + } +} + +/// Smart constructors for HIR values. +/// +/// These constructors are called "smart" because they do inductive work or +/// simplifications. For example, calling `Hir::repetition` with a repetition +/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind +/// since it is equivalent to an empty regex. Another example is calling +/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll +/// just get back the original `expr` since it's precisely equivalent. +/// +/// Smart constructors enable maintaining invariants about the HIR data type +/// while also simulanteously keeping the representation as simple as possible. +impl Hir { + /// Returns an empty HIR expression. + /// + /// An empty HIR expression always matches, including the empty string. + #[inline] + pub fn empty() -> Hir { + let props = Properties::empty(); + Hir { kind: HirKind::Empty, props } + } + + /// Returns an HIR expression that can never match anything. That is, + /// the size of the set of strings in the language described by the HIR + /// returned is `0`. + /// + /// This is distinct from [`Hir::empty`] in that the empty string matches + /// the HIR returned by `Hir::empty`. That is, the set of strings in the + /// language describe described by `Hir::empty` is non-empty. + /// + /// Note that currently, the HIR returned uses an empty character class to + /// indicate that nothing can match. An equivalent expression that cannot + /// match is an empty alternation, but all such "fail" expressions are + /// normalized (via smart constructors) to empty character classes. This is + /// because empty character classes can be spelled in the concrete syntax + /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but + /// empty alternations cannot. + #[inline] + pub fn fail() -> Hir { + let class = Class::Bytes(ClassBytes::empty()); + let props = Properties::class(&class); + // We can't just call Hir::class here because it defers to Hir::fail + // in order to canonicalize the Hir value used to represent "cannot + // match." + Hir { kind: HirKind::Class(class), props } + } + + /// Creates a literal HIR expression. + /// + /// This accepts anything that can be converted into a `Box<[u8]>`. + /// + /// Note that there is no mechanism for storing a `char` or a `Box<str>` + /// in an HIR. Everything is "just bytes." Whether a `Literal` (or + /// any HIR node) matches valid UTF-8 exclusively can be queried via + /// [`Properties::is_utf8`]. + /// + /// # Example + /// + /// This example shows that concatenations of `Literal` HIR values will + /// automatically get flattened and combined together. So for example, even + /// if you concat multiple `Literal` values that are themselves not valid + /// UTF-8, they might add up to valid UTF-8. This also demonstrates just + /// how "smart" Hir's smart constructors are. + /// + /// ``` + /// use regex_syntax::hir::{Hir, HirKind, Literal}; + /// + /// let literals = vec![ + /// Hir::literal([0xE2]), + /// Hir::literal([0x98]), + /// Hir::literal([0x83]), + /// ]; + /// // Each literal, on its own, is invalid UTF-8. + /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8())); + /// + /// let concat = Hir::concat(literals); + /// // But the concatenation is valid UTF-8! + /// assert!(concat.properties().is_utf8()); + /// + /// // And also notice that the literals have been concatenated into a + /// // single `Literal`, to the point where there is no explicit `Concat`! + /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); + /// assert_eq!(&expected, concat.kind()); + /// ``` + #[inline] + pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir { + let bytes = lit.into(); + if bytes.is_empty() { + return Hir::empty(); + } + + let lit = Literal(bytes); + let props = Properties::literal(&lit); + Hir { kind: HirKind::Literal(lit), props } + } + + /// Creates a class HIR expression. The class may either be defined over + /// ranges of Unicode codepoints or ranges of raw byte values. + /// + /// Note that an empty class is permitted. An empty class is equivalent to + /// `Hir::fail()`. + #[inline] + pub fn class(class: Class) -> Hir { + if class.is_empty() { + return Hir::fail(); + } else if let Some(bytes) = class.literal() { + return Hir::literal(bytes); + } + let props = Properties::class(&class); + Hir { kind: HirKind::Class(class), props } + } + + /// Creates a look-around assertion HIR expression. + #[inline] + pub fn look(look: Look) -> Hir { + let props = Properties::look(look); + Hir { kind: HirKind::Look(look), props } + } + + /// Creates a repetition HIR expression. + #[inline] + pub fn repetition(mut rep: Repetition) -> Hir { + // If the sub-expression of a repetition can only match the empty + // string, then we force its maximum to be at most 1. + if rep.sub.properties().maximum_len() == Some(0) { + rep.min = cmp::min(rep.min, 1); + rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1)); + } + // The regex 'a{0}' is always equivalent to the empty regex. This is + // true even when 'a' is an expression that never matches anything + // (like '\P{any}'). + // + // Additionally, the regex 'a{1}' is always equivalent to 'a'. + if rep.min == 0 && rep.max == Some(0) { + return Hir::empty(); + } else if rep.min == 1 && rep.max == Some(1) { + return *rep.sub; + } + let props = Properties::repetition(&rep); + Hir { kind: HirKind::Repetition(rep), props } + } + + /// Creates a capture HIR expression. + /// + /// Note that there is no explicit HIR value for a non-capturing group. + /// Since a non-capturing group only exists to override precedence in the + /// concrete syntax and since an HIR already does its own grouping based on + /// what is parsed, there is no need to explicitly represent non-capturing + /// groups in the HIR. + #[inline] + pub fn capture(capture: Capture) -> Hir { + let props = Properties::capture(&capture); + Hir { kind: HirKind::Capture(capture), props } + } + + /// Returns the concatenation of the given expressions. + /// + /// This attempts to flatten and simplify the concatenation as appropriate. + /// + /// # Example + /// + /// This shows a simple example of basic flattening of both concatenations + /// and literals. + /// + /// ``` + /// use regex_syntax::hir::Hir; + /// + /// let hir = Hir::concat(vec![ + /// Hir::concat(vec![ + /// Hir::literal([b'a']), + /// Hir::literal([b'b']), + /// Hir::literal([b'c']), + /// ]), + /// Hir::concat(vec![ + /// Hir::literal([b'x']), + /// Hir::literal([b'y']), + /// Hir::literal([b'z']), + /// ]), + /// ]); + /// let expected = Hir::literal("abcxyz".as_bytes()); + /// assert_eq!(expected, hir); + /// ``` + pub fn concat(subs: Vec<Hir>) -> Hir { + // We rebuild the concatenation by simplifying it. Would be nice to do + // it in place, but that seems a little tricky? + let mut new = vec![]; + // This gobbles up any adjacent literals in a concatenation and smushes + // them together. Basically, when we see a literal, we add its bytes + // to 'prior_lit', and whenever we see anything else, we first take + // any bytes in 'prior_lit' and add it to the 'new' concatenation. + let mut prior_lit: Option<Vec<u8>> = None; + for sub in subs { + let (kind, props) = sub.into_parts(); + match kind { + HirKind::Literal(Literal(bytes)) => { + if let Some(ref mut prior_bytes) = prior_lit { + prior_bytes.extend_from_slice(&bytes); + } else { + prior_lit = Some(bytes.to_vec()); + } + } + // We also flatten concats that are direct children of another + // concat. We only need to do this one level deep since + // Hir::concat is the only way to build concatenations, and so + // flattening happens inductively. + HirKind::Concat(subs2) => { + for sub2 in subs2 { + let (kind2, props2) = sub2.into_parts(); + match kind2 { + HirKind::Literal(Literal(bytes)) => { + if let Some(ref mut prior_bytes) = prior_lit { + prior_bytes.extend_from_slice(&bytes); + } else { + prior_lit = Some(bytes.to_vec()); + } + } + kind2 => { + if let Some(prior_bytes) = prior_lit.take() { + new.push(Hir::literal(prior_bytes)); + } + new.push(Hir { kind: kind2, props: props2 }); + } + } + } + } + // We can just skip empty HIRs. + HirKind::Empty => {} + kind => { + if let Some(prior_bytes) = prior_lit.take() { + new.push(Hir::literal(prior_bytes)); + } + new.push(Hir { kind, props }); + } + } + } + if let Some(prior_bytes) = prior_lit.take() { + new.push(Hir::literal(prior_bytes)); + } + if new.is_empty() { + return Hir::empty(); + } else if new.len() == 1 { + return new.pop().unwrap(); + } + let props = Properties::concat(&new); + Hir { kind: HirKind::Concat(new), props } + } + + /// Returns the alternation of the given expressions. + /// + /// This flattens and simplifies the alternation as appropriate. This may + /// include factoring out common prefixes or even rewriting the alternation + /// as a character class. + /// + /// Note that an empty alternation is equivalent to `Hir::fail()`. (It + /// is not possible for one to write an empty alternation, or even an + /// alternation with a single sub-expression, in the concrete syntax of a + /// regex.) + /// + /// # Example + /// + /// This is a simple example showing how an alternation might get + /// simplified. + /// + /// ``` + /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange}; + /// + /// let hir = Hir::alternation(vec![ + /// Hir::literal([b'a']), + /// Hir::literal([b'b']), + /// Hir::literal([b'c']), + /// Hir::literal([b'd']), + /// Hir::literal([b'e']), + /// Hir::literal([b'f']), + /// ]); + /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('a', 'f'), + /// ]))); + /// assert_eq!(expected, hir); + /// ``` + /// + /// And another example showing how common prefixes might get factored + /// out. + /// + /// ``` + /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange}; + /// + /// let hir = Hir::alternation(vec![ + /// Hir::concat(vec![ + /// Hir::literal("abc".as_bytes()), + /// Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('A', 'Z'), + /// ]))), + /// ]), + /// Hir::concat(vec![ + /// Hir::literal("abc".as_bytes()), + /// Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('a', 'z'), + /// ]))), + /// ]), + /// ]); + /// let expected = Hir::concat(vec![ + /// Hir::literal("abc".as_bytes()), + /// Hir::alternation(vec![ + /// Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('A', 'Z'), + /// ]))), + /// Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('a', 'z'), + /// ]))), + /// ]), + /// ]); + /// assert_eq!(expected, hir); + /// ``` + /// + /// Note that these sorts of simplifications are not guaranteed. + pub fn alternation(subs: Vec<Hir>) -> Hir { + // We rebuild the alternation by simplifying it. We proceed similarly + // as the concatenation case. But in this case, there's no literal + // simplification happening. We're just flattening alternations. + let mut new = Vec::with_capacity(subs.len()); + for sub in subs { + let (kind, props) = sub.into_parts(); + match kind { + HirKind::Alternation(subs2) => { + new.extend(subs2); + } + kind => { + new.push(Hir { kind, props }); + } + } + } + if new.is_empty() { + return Hir::fail(); + } else if new.len() == 1 { + return new.pop().unwrap(); + } + // Now that it's completely flattened, look for the special case of + // 'char1|char2|...|charN' and collapse that into a class. Note that + // we look for 'char' first and then bytes. The issue here is that if + // we find both non-ASCII codepoints and non-ASCII singleton bytes, + // then it isn't actually possible to smush them into a single class. + // (Because classes are either "all codepoints" or "all bytes." You + // can have a class that both matches non-ASCII but valid UTF-8 and + // invalid UTF-8.) So we look for all chars and then all bytes, and + // don't handle anything else. + if let Some(singletons) = singleton_chars(&new) { + let it = singletons + .into_iter() + .map(|ch| ClassUnicodeRange { start: ch, end: ch }); + return Hir::class(Class::Unicode(ClassUnicode::new(it))); + } + if let Some(singletons) = singleton_bytes(&new) { + let it = singletons + .into_iter() + .map(|b| ClassBytesRange { start: b, end: b }); + return Hir::class(Class::Bytes(ClassBytes::new(it))); + } + // Similar to singleton chars, we can also look for alternations of + // classes. Those can be smushed into a single class. + if let Some(cls) = class_chars(&new) { + return Hir::class(cls); + } + if let Some(cls) = class_bytes(&new) { + return Hir::class(cls); + } + // Factor out a common prefix if we can, which might potentially + // simplify the expression and unlock other optimizations downstream. + // It also might generally make NFA matching and DFA construction + // faster by reducing the scope of branching in the regex. + new = match lift_common_prefix(new) { + Ok(hir) => return hir, + Err(unchanged) => unchanged, + }; + let props = Properties::alternation(&new); + Hir { kind: HirKind::Alternation(new), props } + } + + /// Returns an HIR expression for `.`. + /// + /// * [`Dot::AnyChar`] maps to `(?su-R:.)`. + /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`. + /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`. + /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`. + /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`. + /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`. + /// + /// # Example + /// + /// Note that this is a convenience routine for constructing the correct + /// character class based on the value of `Dot`. There is no explicit "dot" + /// HIR value. It is just an abbreviation for a common character class. + /// + /// ``` + /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange}; + /// + /// let hir = Hir::dot(Dot::AnyByte); + /// let expected = Hir::class(Class::Bytes(ClassBytes::new([ + /// ClassBytesRange::new(0x00, 0xFF), + /// ]))); + /// assert_eq!(expected, hir); + /// ``` + #[inline] + pub fn dot(dot: Dot) -> Hir { + match dot { + Dot::AnyChar => { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyByte => { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } + Dot::AnyCharExcept(ch) => { + let mut cls = + ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]); + cls.negate(); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyCharExceptLF => { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\x09')); + cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyCharExceptCRLF => { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\x09')); + cls.push(ClassUnicodeRange::new('\x0B', '\x0C')); + cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyByteExcept(byte) => { + let mut cls = + ClassBytes::new([ClassBytesRange::new(byte, byte)]); + cls.negate(); + Hir::class(Class::Bytes(cls)) + } + Dot::AnyByteExceptLF => { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\x09')); + cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } + Dot::AnyByteExceptCRLF => { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\x09')); + cls.push(ClassBytesRange::new(b'\x0B', b'\x0C')); + cls.push(ClassBytesRange::new(b'\x0E', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } + } + } +} + +/// The underlying kind of an arbitrary [`Hir`] expression. +/// +/// An `HirKind` is principally useful for doing case analysis on the type +/// of a regular expression. If you're looking to build new `Hir` values, +/// then you _must_ use the smart constructors defined on `Hir`, like +/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does +/// not expose any way of building an `Hir` directly from an `HirKind`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum HirKind { + /// The empty regular expression, which matches everything, including the + /// empty string. + Empty, + /// A literalstring that matches exactly these bytes. + Literal(Literal), + /// A single character class that matches any of the characters in the + /// class. A class can either consist of Unicode scalar values as + /// characters, or it can use bytes. + /// + /// A class may be empty. In which case, it matches nothing. + Class(Class), + /// A look-around assertion. A look-around match always has zero length. + Look(Look), + /// A repetition operation applied to a sub-expression. + Repetition(Repetition), + /// A capturing group, which contains a sub-expression. + Capture(Capture), + /// A concatenation of expressions. + /// + /// A concatenation matches only if each of its sub-expressions match one + /// after the other. + /// + /// Concatenations are guaranteed by `Hir`'s smart constructors to always + /// have at least two sub-expressions. + Concat(Vec<Hir>), + /// An alternation of expressions. + /// + /// An alternation matches only if at least one of its sub-expressions + /// match. If multiple sub-expressions match, then the leftmost is + /// preferred. + /// + /// Alternations are guaranteed by `Hir`'s smart constructors to always + /// have at least two sub-expressions. + Alternation(Vec<Hir>), +} + +impl HirKind { + /// Returns a slice of this kind's sub-expressions, if any. + pub fn subs(&self) -> &[Hir] { + use core::slice::from_ref; + + match *self { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Look(_) => &[], + HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub), + HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub), + HirKind::Concat(ref subs) => subs, + HirKind::Alternation(ref subs) => subs, + } + } +} + +impl core::fmt::Debug for Hir { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + self.kind.fmt(f) + } +} + +/// Print a display representation of this Hir. +/// +/// The result of this is a valid regular expression pattern string. +/// +/// This implementation uses constant stack space and heap space proportional +/// to the size of the `Hir`. +impl core::fmt::Display for Hir { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + crate::hir::print::Printer::new().print(self, f) + } +} + +/// The high-level intermediate representation of a literal. +/// +/// A literal corresponds to `0` or more bytes that should be matched +/// literally. The smart constructors defined on `Hir` will automatically +/// concatenate adjacent literals into one literal, and will even automatically +/// replace empty literals with `Hir::empty()`. +/// +/// Note that despite a literal being represented by a sequence of bytes, its +/// `Debug` implementation will attempt to print it as a normal string. (That +/// is, not a sequence of decimal numbers.) +#[derive(Clone, Eq, PartialEq)] +pub struct Literal(pub Box<[u8]>); + +impl core::fmt::Debug for Literal { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + crate::debug::Bytes(&self.0).fmt(f) + } +} + +/// The high-level intermediate representation of a character class. +/// +/// A character class corresponds to a set of characters. A character is either +/// defined by a Unicode scalar value or a byte. +/// +/// A character class, regardless of its character type, is represented by a +/// sequence of non-overlapping non-adjacent ranges of characters. +/// +/// There are no guarantees about which class variant is used. Generally +/// speaking, the Unicode variat is used whenever a class needs to contain +/// non-ASCII Unicode scalar values. But the Unicode variant can be used even +/// when Unicode mode is disabled. For example, at the time of writing, the +/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class +/// `[a\u00A0]` due to optimizations. +/// +/// Note that `Bytes` variant may be produced even when it exclusively matches +/// valid UTF-8. This is because a `Bytes` variant represents an intention by +/// the author of the regular expression to disable Unicode mode, which in turn +/// impacts the semantics of case insensitive matching. For example, `(?i)k` +/// and `(?i-u)k` will not match the same set of strings. +#[derive(Clone, Eq, PartialEq)] +pub enum Class { + /// A set of characters represented by Unicode scalar values. + Unicode(ClassUnicode), + /// A set of characters represented by arbitrary bytes (one byte per + /// character). + Bytes(ClassBytes), +} + +impl Class { + /// Apply Unicode simple case folding to this character class, in place. + /// The character class will be expanded to include all simple case folded + /// character variants. + /// + /// If this is a byte oriented character class, then this will be limited + /// to the ASCII ranges `A-Z` and `a-z`. + /// + /// # Panics + /// + /// This routine panics when the case mapping data necessary for this + /// routine to complete is unavailable. This occurs when the `unicode-case` + /// feature is not enabled and the underlying class is Unicode oriented. + /// + /// Callers should prefer using `try_case_fold_simple` instead, which will + /// return an error instead of panicking. + pub fn case_fold_simple(&mut self) { + match *self { + Class::Unicode(ref mut x) => x.case_fold_simple(), + Class::Bytes(ref mut x) => x.case_fold_simple(), + } + } + + /// Apply Unicode simple case folding to this character class, in place. + /// The character class will be expanded to include all simple case folded + /// character variants. + /// + /// If this is a byte oriented character class, then this will be limited + /// to the ASCII ranges `A-Z` and `a-z`. + /// + /// # Error + /// + /// This routine returns an error when the case mapping data necessary + /// for this routine to complete is unavailable. This occurs when the + /// `unicode-case` feature is not enabled and the underlying class is + /// Unicode oriented. + pub fn try_case_fold_simple( + &mut self, + ) -> core::result::Result<(), CaseFoldError> { + match *self { + Class::Unicode(ref mut x) => x.try_case_fold_simple()?, + Class::Bytes(ref mut x) => x.case_fold_simple(), + } + Ok(()) + } + + /// Negate this character class in place. + /// + /// After completion, this character class will contain precisely the + /// characters that weren't previously in the class. + pub fn negate(&mut self) { + match *self { + Class::Unicode(ref mut x) => x.negate(), + Class::Bytes(ref mut x) => x.negate(), + } + } + + /// Returns true if and only if this character class will only ever match + /// valid UTF-8. + /// + /// A character class can match invalid UTF-8 only when the following + /// conditions are met: + /// + /// 1. The translator was configured to permit generating an expression + /// that can match invalid UTF-8. (By default, this is disabled.) + /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete + /// syntax or in the parser builder. By default, Unicode mode is + /// enabled. + pub fn is_utf8(&self) -> bool { + match *self { + Class::Unicode(_) => true, + Class::Bytes(ref x) => x.is_ascii(), + } + } + + /// Returns the length, in bytes, of the smallest string matched by this + /// character class. + /// + /// For non-empty byte oriented classes, this always returns `1`. For + /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or + /// `4`. For empty classes, `None` is returned. It is impossible for `0` to + /// be returned. + /// + /// # Example + /// + /// This example shows some examples of regexes and their corresponding + /// minimum length, if any. + /// + /// ``` + /// use regex_syntax::{hir::Properties, parse}; + /// + /// // The empty string has a min length of 0. + /// let hir = parse(r"")?; + /// assert_eq!(Some(0), hir.properties().minimum_len()); + /// // As do other types of regexes that only match the empty string. + /// let hir = parse(r"^$\b\B")?; + /// assert_eq!(Some(0), hir.properties().minimum_len()); + /// // A regex that can match the empty string but match more is still 0. + /// let hir = parse(r"a*")?; + /// assert_eq!(Some(0), hir.properties().minimum_len()); + /// // A regex that matches nothing has no minimum defined. + /// let hir = parse(r"[a&&b]")?; + /// assert_eq!(None, hir.properties().minimum_len()); + /// // Character classes usually have a minimum length of 1. + /// let hir = parse(r"\w")?; + /// assert_eq!(Some(1), hir.properties().minimum_len()); + /// // But sometimes Unicode classes might be bigger! + /// let hir = parse(r"\p{Cyrillic}")?; + /// assert_eq!(Some(2), hir.properties().minimum_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn minimum_len(&self) -> Option<usize> { + match *self { + Class::Unicode(ref x) => x.minimum_len(), + Class::Bytes(ref x) => x.minimum_len(), + } + } + + /// Returns the length, in bytes, of the longest string matched by this + /// character class. + /// + /// For non-empty byte oriented classes, this always returns `1`. For + /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or + /// `4`. For empty classes, `None` is returned. It is impossible for `0` to + /// be returned. + /// + /// # Example + /// + /// This example shows some examples of regexes and their corresponding + /// maximum length, if any. + /// + /// ``` + /// use regex_syntax::{hir::Properties, parse}; + /// + /// // The empty string has a max length of 0. + /// let hir = parse(r"")?; + /// assert_eq!(Some(0), hir.properties().maximum_len()); + /// // As do other types of regexes that only match the empty string. + /// let hir = parse(r"^$\b\B")?; + /// assert_eq!(Some(0), hir.properties().maximum_len()); + /// // A regex that matches nothing has no maximum defined. + /// let hir = parse(r"[a&&b]")?; + /// assert_eq!(None, hir.properties().maximum_len()); + /// // Bounded repeats work as you expect. + /// let hir = parse(r"x{2,10}")?; + /// assert_eq!(Some(10), hir.properties().maximum_len()); + /// // An unbounded repeat means there is no maximum. + /// let hir = parse(r"x{2,}")?; + /// assert_eq!(None, hir.properties().maximum_len()); + /// // With Unicode enabled, \w can match up to 4 bytes! + /// let hir = parse(r"\w")?; + /// assert_eq!(Some(4), hir.properties().maximum_len()); + /// // Without Unicode enabled, \w matches at most 1 byte. + /// let hir = parse(r"(?-u)\w")?; + /// assert_eq!(Some(1), hir.properties().maximum_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn maximum_len(&self) -> Option<usize> { + match *self { + Class::Unicode(ref x) => x.maximum_len(), + Class::Bytes(ref x) => x.maximum_len(), + } + } + + /// Returns true if and only if this character class is empty. That is, + /// it has no elements. + /// + /// An empty character can never match anything, including an empty string. + pub fn is_empty(&self) -> bool { + match *self { + Class::Unicode(ref x) => x.ranges().is_empty(), + Class::Bytes(ref x) => x.ranges().is_empty(), + } + } + + /// If this class consists of exactly one element (whether a codepoint or a + /// byte), then return it as a literal byte string. + /// + /// If this class is empty or contains more than one element, then `None` + /// is returned. + pub fn literal(&self) -> Option<Vec<u8>> { + match *self { + Class::Unicode(ref x) => x.literal(), + Class::Bytes(ref x) => x.literal(), + } + } +} + +impl core::fmt::Debug for Class { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use crate::debug::Byte; + + let mut fmter = f.debug_set(); + match *self { + Class::Unicode(ref cls) => { + for r in cls.ranges().iter() { + fmter.entry(&(r.start..=r.end)); + } + } + Class::Bytes(ref cls) => { + for r in cls.ranges().iter() { + fmter.entry(&(Byte(r.start)..=Byte(r.end))); + } + } + } + fmter.finish() + } +} + +/// A set of characters represented by Unicode scalar values. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassUnicode { + set: IntervalSet<ClassUnicodeRange>, +} + +impl ClassUnicode { + /// Create a new class from a sequence of ranges. + /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. Ranges will automatically be sorted into a canonical + /// non-overlapping order. + pub fn new<I>(ranges: I) -> ClassUnicode + where + I: IntoIterator<Item = ClassUnicodeRange>, + { + ClassUnicode { set: IntervalSet::new(ranges) } + } + + /// Create a new class with no ranges. + /// + /// An empty class matches nothing. That is, it is equivalent to + /// [`Hir::fail`]. + pub fn empty() -> ClassUnicode { + ClassUnicode::new(vec![]) + } + + /// Add a new range to this set. + pub fn push(&mut self, range: ClassUnicodeRange) { + self.set.push(range); + } + + /// Return an iterator over all ranges in this class. + /// + /// The iterator yields ranges in ascending order. + pub fn iter(&self) -> ClassUnicodeIter<'_> { + ClassUnicodeIter(self.set.iter()) + } + + /// Return the underlying ranges as a slice. + pub fn ranges(&self) -> &[ClassUnicodeRange] { + self.set.intervals() + } + + /// Expand this character class such that it contains all case folded + /// characters, according to Unicode's "simple" mapping. For example, if + /// this class consists of the range `a-z`, then applying case folding will + /// result in the class containing both the ranges `a-z` and `A-Z`. + /// + /// # Panics + /// + /// This routine panics when the case mapping data necessary for this + /// routine to complete is unavailable. This occurs when the `unicode-case` + /// feature is not enabled. + /// + /// Callers should prefer using `try_case_fold_simple` instead, which will + /// return an error instead of panicking. + pub fn case_fold_simple(&mut self) { + self.set + .case_fold_simple() + .expect("unicode-case feature must be enabled"); + } + + /// Expand this character class such that it contains all case folded + /// characters, according to Unicode's "simple" mapping. For example, if + /// this class consists of the range `a-z`, then applying case folding will + /// result in the class containing both the ranges `a-z` and `A-Z`. + /// + /// # Error + /// + /// This routine returns an error when the case mapping data necessary + /// for this routine to complete is unavailable. This occurs when the + /// `unicode-case` feature is not enabled. + pub fn try_case_fold_simple( + &mut self, + ) -> core::result::Result<(), CaseFoldError> { + self.set.case_fold_simple() + } + + /// Negate this character class. + /// + /// For all `c` where `c` is a Unicode scalar value, if `c` was in this + /// set, then it will not be in this set after negation. + pub fn negate(&mut self) { + self.set.negate(); + } + + /// Union this character class with the given character class, in place. + pub fn union(&mut self, other: &ClassUnicode) { + self.set.union(&other.set); + } + + /// Intersect this character class with the given character class, in + /// place. + pub fn intersect(&mut self, other: &ClassUnicode) { + self.set.intersect(&other.set); + } + + /// Subtract the given character class from this character class, in place. + pub fn difference(&mut self, other: &ClassUnicode) { + self.set.difference(&other.set); + } + + /// Compute the symmetric difference of the given character classes, in + /// place. + /// + /// This computes the symmetric difference of two character classes. This + /// removes all elements in this class that are also in the given class, + /// but all adds all elements from the given class that aren't in this + /// class. That is, the class will contain all elements in either class, + /// but will not contain any elements that are in both classes. + pub fn symmetric_difference(&mut self, other: &ClassUnicode) { + self.set.symmetric_difference(&other.set); + } + + /// Returns true if and only if this character class will either match + /// nothing or only ASCII bytes. Stated differently, this returns false + /// if and only if this class contains a non-ASCII codepoint. + pub fn is_ascii(&self) -> bool { + self.set.intervals().last().map_or(true, |r| r.end <= '\x7F') + } + + /// Returns the length, in bytes, of the smallest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn minimum_len(&self) -> Option<usize> { + let first = self.ranges().get(0)?; + // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8(). + Some(first.start.len_utf8()) + } + + /// Returns the length, in bytes, of the longest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn maximum_len(&self) -> Option<usize> { + let last = self.ranges().last()?; + // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8(). + Some(last.end.len_utf8()) + } + + /// If this class consists of exactly one codepoint, then return it as + /// a literal byte string. + /// + /// If this class is empty or contains more than one codepoint, then `None` + /// is returned. + pub fn literal(&self) -> Option<Vec<u8>> { + let rs = self.ranges(); + if rs.len() == 1 && rs[0].start == rs[0].end { + Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes()) + } else { + None + } + } + + /// If this class consists of only ASCII ranges, then return its + /// corresponding and equivalent byte class. + pub fn to_byte_class(&self) -> Option<ClassBytes> { + if !self.is_ascii() { + return None; + } + Some(ClassBytes::new(self.ranges().iter().map(|r| { + // Since we are guaranteed that our codepoint range is ASCII, the + // 'u8::try_from' calls below are guaranteed to be correct. + ClassBytesRange { + start: u8::try_from(r.start).unwrap(), + end: u8::try_from(r.end).unwrap(), + } + }))) + } +} + +/// An iterator over all ranges in a Unicode character class. +/// +/// The lifetime `'a` refers to the lifetime of the underlying class. +#[derive(Debug)] +pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>); + +impl<'a> Iterator for ClassUnicodeIter<'a> { + type Item = &'a ClassUnicodeRange; + + fn next(&mut self) -> Option<&'a ClassUnicodeRange> { + self.0.next() + } +} + +/// A single range of characters represented by Unicode scalar values. +/// +/// The range is closed. That is, the start and end of the range are included +/// in the range. +#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] +pub struct ClassUnicodeRange { + start: char, + end: char, +} + +impl core::fmt::Debug for ClassUnicodeRange { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let start = if !self.start.is_whitespace() && !self.start.is_control() + { + self.start.to_string() + } else { + format!("0x{:X}", u32::from(self.start)) + }; + let end = if !self.end.is_whitespace() && !self.end.is_control() { + self.end.to_string() + } else { + format!("0x{:X}", u32::from(self.end)) + }; + f.debug_struct("ClassUnicodeRange") + .field("start", &start) + .field("end", &end) + .finish() + } +} + +impl Interval for ClassUnicodeRange { + type Bound = char; + + #[inline] + fn lower(&self) -> char { + self.start + } + #[inline] + fn upper(&self) -> char { + self.end + } + #[inline] + fn set_lower(&mut self, bound: char) { + self.start = bound; + } + #[inline] + fn set_upper(&mut self, bound: char) { + self.end = bound; + } + + /// Apply simple case folding to this Unicode scalar value range. + /// + /// Additional ranges are appended to the given vector. Canonical ordering + /// is *not* maintained in the given vector. + fn case_fold_simple( + &self, + ranges: &mut Vec<ClassUnicodeRange>, + ) -> Result<(), unicode::CaseFoldError> { + let mut folder = unicode::SimpleCaseFolder::new()?; + if !folder.overlaps(self.start, self.end) { + return Ok(()); + } + let (start, end) = (u32::from(self.start), u32::from(self.end)); + for cp in (start..=end).filter_map(char::from_u32) { + for &cp_folded in folder.mapping(cp) { + ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded)); + } + } + Ok(()) + } +} + +impl ClassUnicodeRange { + /// Create a new Unicode scalar value range for a character class. + /// + /// The returned range is always in a canonical form. That is, the range + /// returned always satisfies the invariant that `start <= end`. + pub fn new(start: char, end: char) -> ClassUnicodeRange { + ClassUnicodeRange::create(start, end) + } + + /// Return the start of this range. + /// + /// The start of a range is always less than or equal to the end of the + /// range. + pub fn start(&self) -> char { + self.start + } + + /// Return the end of this range. + /// + /// The end of a range is always greater than or equal to the start of the + /// range. + pub fn end(&self) -> char { + self.end + } + + /// Returns the number of codepoints in this range. + pub fn len(&self) -> usize { + let diff = 1 + u32::from(self.end) - u32::from(self.start); + // This is likely to panic in 16-bit targets since a usize can only fit + // 2^16. It's not clear what to do here, other than to return an error + // when building a Unicode class that contains a range whose length + // overflows usize. (Which, to be honest, is probably quite common on + // 16-bit targets. For example, this would imply that '.' and '\p{any}' + // would be impossible to build.) + usize::try_from(diff).expect("char class len fits in usize") + } +} + +/// A set of characters represented by arbitrary bytes. +/// +/// Each byte corresponds to one character. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassBytes { + set: IntervalSet<ClassBytesRange>, +} + +impl ClassBytes { + /// Create a new class from a sequence of ranges. + /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. Ranges will automatically be sorted into a canonical + /// non-overlapping order. + pub fn new<I>(ranges: I) -> ClassBytes + where + I: IntoIterator<Item = ClassBytesRange>, + { + ClassBytes { set: IntervalSet::new(ranges) } + } + + /// Create a new class with no ranges. + /// + /// An empty class matches nothing. That is, it is equivalent to + /// [`Hir::fail`]. + pub fn empty() -> ClassBytes { + ClassBytes::new(vec![]) + } + + /// Add a new range to this set. + pub fn push(&mut self, range: ClassBytesRange) { + self.set.push(range); + } + + /// Return an iterator over all ranges in this class. + /// + /// The iterator yields ranges in ascending order. + pub fn iter(&self) -> ClassBytesIter<'_> { + ClassBytesIter(self.set.iter()) + } + + /// Return the underlying ranges as a slice. + pub fn ranges(&self) -> &[ClassBytesRange] { + self.set.intervals() + } + + /// Expand this character class such that it contains all case folded + /// characters. For example, if this class consists of the range `a-z`, + /// then applying case folding will result in the class containing both the + /// ranges `a-z` and `A-Z`. + /// + /// Note that this only applies ASCII case folding, which is limited to the + /// characters `a-z` and `A-Z`. + pub fn case_fold_simple(&mut self) { + self.set.case_fold_simple().expect("ASCII case folding never fails"); + } + + /// Negate this byte class. + /// + /// For all `b` where `b` is a any byte, if `b` was in this set, then it + /// will not be in this set after negation. + pub fn negate(&mut self) { + self.set.negate(); + } + + /// Union this byte class with the given byte class, in place. + pub fn union(&mut self, other: &ClassBytes) { + self.set.union(&other.set); + } + + /// Intersect this byte class with the given byte class, in place. + pub fn intersect(&mut self, other: &ClassBytes) { + self.set.intersect(&other.set); + } + + /// Subtract the given byte class from this byte class, in place. + pub fn difference(&mut self, other: &ClassBytes) { + self.set.difference(&other.set); + } + + /// Compute the symmetric difference of the given byte classes, in place. + /// + /// This computes the symmetric difference of two byte classes. This + /// removes all elements in this class that are also in the given class, + /// but all adds all elements from the given class that aren't in this + /// class. That is, the class will contain all elements in either class, + /// but will not contain any elements that are in both classes. + pub fn symmetric_difference(&mut self, other: &ClassBytes) { + self.set.symmetric_difference(&other.set); + } + + /// Returns true if and only if this character class will either match + /// nothing or only ASCII bytes. Stated differently, this returns false + /// if and only if this class contains a non-ASCII byte. + pub fn is_ascii(&self) -> bool { + self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) + } + + /// Returns the length, in bytes, of the smallest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn minimum_len(&self) -> Option<usize> { + if self.ranges().is_empty() { + None + } else { + Some(1) + } + } + + /// Returns the length, in bytes, of the longest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn maximum_len(&self) -> Option<usize> { + if self.ranges().is_empty() { + None + } else { + Some(1) + } + } + + /// If this class consists of exactly one byte, then return it as + /// a literal byte string. + /// + /// If this class is empty or contains more than one byte, then `None` + /// is returned. + pub fn literal(&self) -> Option<Vec<u8>> { + let rs = self.ranges(); + if rs.len() == 1 && rs[0].start == rs[0].end { + Some(vec![rs[0].start]) + } else { + None + } + } + + /// If this class consists of only ASCII ranges, then return its + /// corresponding and equivalent Unicode class. + pub fn to_unicode_class(&self) -> Option<ClassUnicode> { + if !self.is_ascii() { + return None; + } + Some(ClassUnicode::new(self.ranges().iter().map(|r| { + // Since we are guaranteed that our byte range is ASCII, the + // 'char::from' calls below are correct and will not erroneously + // convert a raw byte value into its corresponding codepoint. + ClassUnicodeRange { + start: char::from(r.start), + end: char::from(r.end), + } + }))) + } +} + +/// An iterator over all ranges in a byte character class. +/// +/// The lifetime `'a` refers to the lifetime of the underlying class. +#[derive(Debug)] +pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>); + +impl<'a> Iterator for ClassBytesIter<'a> { + type Item = &'a ClassBytesRange; + + fn next(&mut self) -> Option<&'a ClassBytesRange> { + self.0.next() + } +} + +/// A single range of characters represented by arbitrary bytes. +/// +/// The range is closed. That is, the start and end of the range are included +/// in the range. +#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] +pub struct ClassBytesRange { + start: u8, + end: u8, +} + +impl Interval for ClassBytesRange { + type Bound = u8; + + #[inline] + fn lower(&self) -> u8 { + self.start + } + #[inline] + fn upper(&self) -> u8 { + self.end + } + #[inline] + fn set_lower(&mut self, bound: u8) { + self.start = bound; + } + #[inline] + fn set_upper(&mut self, bound: u8) { + self.end = bound; + } + + /// Apply simple case folding to this byte range. Only ASCII case mappings + /// (for a-z) are applied. + /// + /// Additional ranges are appended to the given vector. Canonical ordering + /// is *not* maintained in the given vector. + fn case_fold_simple( + &self, + ranges: &mut Vec<ClassBytesRange>, + ) -> Result<(), unicode::CaseFoldError> { + if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) { + let lower = cmp::max(self.start, b'a'); + let upper = cmp::min(self.end, b'z'); + ranges.push(ClassBytesRange::new(lower - 32, upper - 32)); + } + if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) { + let lower = cmp::max(self.start, b'A'); + let upper = cmp::min(self.end, b'Z'); + ranges.push(ClassBytesRange::new(lower + 32, upper + 32)); + } + Ok(()) + } +} + +impl ClassBytesRange { + /// Create a new byte range for a character class. + /// + /// The returned range is always in a canonical form. That is, the range + /// returned always satisfies the invariant that `start <= end`. + pub fn new(start: u8, end: u8) -> ClassBytesRange { + ClassBytesRange::create(start, end) + } + + /// Return the start of this range. + /// + /// The start of a range is always less than or equal to the end of the + /// range. + pub fn start(&self) -> u8 { + self.start + } + + /// Return the end of this range. + /// + /// The end of a range is always greater than or equal to the start of the + /// range. + pub fn end(&self) -> u8 { + self.end + } + + /// Returns the number of bytes in this range. + pub fn len(&self) -> usize { + usize::from(self.end.checked_sub(self.start).unwrap()) + .checked_add(1) + .unwrap() + } +} + +impl core::fmt::Debug for ClassBytesRange { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("ClassBytesRange") + .field("start", &crate::debug::Byte(self.start)) + .field("end", &crate::debug::Byte(self.end)) + .finish() + } +} + +/// The high-level intermediate representation for a look-around assertion. +/// +/// An assertion match is always zero-length. Also called an "empty match." +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Look { + /// Match the beginning of text. Specifically, this matches at the starting + /// position of the input. + Start = 1 << 0, + /// Match the end of text. Specifically, this matches at the ending + /// position of the input. + End = 1 << 1, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following a `\n` character. + StartLF = 1 << 2, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\n` character. + EndLF = 1 << 3, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following either a `\r` or `\n` character, but never after + /// a `\r` when a `\n` follows. + StartCRLF = 1 << 4, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` + /// precedes it. + EndCRLF = 1 << 5, + /// Match an ASCII-only word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + WordAscii = 1 << 6, + /// Match an ASCII-only negation of a word boundary. + WordAsciiNegate = 1 << 7, + /// Match a Unicode-aware word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + WordUnicode = 1 << 8, + /// Match a Unicode-aware negation of a word boundary. + WordUnicodeNegate = 1 << 9, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartAscii = 1 << 10, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndAscii = 1 << 11, + /// Match the start of a Unicode word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartUnicode = 1 << 12, + /// Match the end of a Unicode word boundary. That is, this matches a + /// position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndUnicode = 1 << 13, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfAscii = 1 << 14, + /// Match the end half of an ASCII-only word boundary. That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalfAscii = 1 << 15, + /// Match the start half of a Unicode word boundary. That is, this matches + /// a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfUnicode = 1 << 16, + /// Match the end half of a Unicode word boundary. That is, this matches + /// a position at either the end of the haystack or where the following + /// character is not a word character. + WordEndHalfUnicode = 1 << 17, +} + +impl Look { + /// Flip the look-around assertion to its equivalent for reverse searches. + /// For example, `StartLF` gets translated to `EndLF`. + /// + /// Some assertions, such as `WordUnicode`, remain the same since they + /// match the same positions regardless of the direction of the search. + #[inline] + pub const fn reversed(self) -> Look { + match self { + Look::Start => Look::End, + Look::End => Look::Start, + Look::StartLF => Look::EndLF, + Look::EndLF => Look::StartLF, + Look::StartCRLF => Look::EndCRLF, + Look::EndCRLF => Look::StartCRLF, + Look::WordAscii => Look::WordAscii, + Look::WordAsciiNegate => Look::WordAsciiNegate, + Look::WordUnicode => Look::WordUnicode, + Look::WordUnicodeNegate => Look::WordUnicodeNegate, + Look::WordStartAscii => Look::WordEndAscii, + Look::WordEndAscii => Look::WordStartAscii, + Look::WordStartUnicode => Look::WordEndUnicode, + Look::WordEndUnicode => Look::WordStartUnicode, + Look::WordStartHalfAscii => Look::WordEndHalfAscii, + Look::WordEndHalfAscii => Look::WordStartHalfAscii, + Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, + Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, + } + } + + /// Return the underlying representation of this look-around enumeration + /// as an integer. Giving the return value to the [`Look::from_repr`] + /// constructor is guaranteed to return the same look-around variant that + /// one started with within a semver compatible release of this crate. + #[inline] + pub const fn as_repr(self) -> u32 { + // AFAIK, 'as' is the only way to zero-cost convert an int enum to an + // actual int. + self as u32 + } + + /// Given the underlying representation of a `Look` value, return the + /// corresponding `Look` value if the representation is valid. Otherwise + /// `None` is returned. + #[inline] + pub const fn from_repr(repr: u32) -> Option<Look> { + match repr { + 0b00_0000_0000_0000_0001 => Some(Look::Start), + 0b00_0000_0000_0000_0010 => Some(Look::End), + 0b00_0000_0000_0000_0100 => Some(Look::StartLF), + 0b00_0000_0000_0000_1000 => Some(Look::EndLF), + 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), + 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), + 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), + 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), + 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), + 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), + 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), + 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), + 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), + 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), + 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), + 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), + 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), + _ => None, + } + } + + /// Returns a convenient single codepoint representation of this + /// look-around assertion. Each assertion is guaranteed to be represented + /// by a distinct character. + /// + /// This is useful for succinctly representing a look-around assertion in + /// human friendly but succinct output intended for a programmer working on + /// regex internals. + #[inline] + pub const fn as_char(self) -> char { + match self { + Look::Start => 'A', + Look::End => 'z', + Look::StartLF => '^', + Look::EndLF => '$', + Look::StartCRLF => 'r', + Look::EndCRLF => 'R', + Look::WordAscii => 'b', + Look::WordAsciiNegate => 'B', + Look::WordUnicode => '𝛃', + Look::WordUnicodeNegate => '𝚩', + Look::WordStartAscii => '<', + Look::WordEndAscii => '>', + Look::WordStartUnicode => '〈', + Look::WordEndUnicode => '〉', + Look::WordStartHalfAscii => '◁', + Look::WordEndHalfAscii => '▷', + Look::WordStartHalfUnicode => '◀', + Look::WordEndHalfUnicode => '▶', + } + } +} + +/// The high-level intermediate representation for a capturing group. +/// +/// A capturing group always has an index and a child expression. It may +/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not +/// necessary. +/// +/// Note that there is no explicit representation of a non-capturing group +/// in a `Hir`. Instead, non-capturing grouping is handled automatically by +/// the recursive structure of the `Hir` itself. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Capture { + /// The capture index of the capture. + pub index: u32, + /// The name of the capture, if it exists. + pub name: Option<Box<str>>, + /// The expression inside the capturing group, which may be empty. + pub sub: Box<Hir>, +} + +/// The high-level intermediate representation of a repetition operator. +/// +/// A repetition operator permits the repetition of an arbitrary +/// sub-expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Repetition { + /// The minimum range of the repetition. + /// + /// Note that special cases like `?`, `+` and `*` all get translated into + /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively. + /// + /// When `min` is zero, this expression can match the empty string + /// regardless of what its sub-expression is. + pub min: u32, + /// The maximum range of the repetition. + /// + /// Note that when `max` is `None`, `min` acts as a lower bound but where + /// there is no upper bound. For something like `x{5}` where the min and + /// max are equivalent, `min` will be set to `5` and `max` will be set to + /// `Some(5)`. + pub max: Option<u32>, + /// Whether this repetition operator is greedy or not. A greedy operator + /// will match as much as it can. A non-greedy operator will match as + /// little as it can. + /// + /// Typically, operators are greedy by default and are only non-greedy when + /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is + /// not. However, this can be inverted via the `U` "ungreedy" flag. + pub greedy: bool, + /// The expression being repeated. + pub sub: Box<Hir>, +} + +impl Repetition { + /// Returns a new repetition with the same `min`, `max` and `greedy` + /// values, but with its sub-expression replaced with the one given. + pub fn with(&self, sub: Hir) -> Repetition { + Repetition { + min: self.min, + max: self.max, + greedy: self.greedy, + sub: Box::new(sub), + } + } +} + +/// A type describing the different flavors of `.`. +/// +/// This type is meant to be used with [`Hir::dot`], which is a convenience +/// routine for building HIR values derived from the `.` regex. +#[non_exhaustive] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Dot { + /// Matches the UTF-8 encoding of any Unicode scalar value. + /// + /// This is equivalent to `(?su:.)` and also `\p{any}`. + AnyChar, + /// Matches any byte value. + /// + /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`. + AnyByte, + /// Matches the UTF-8 encoding of any Unicode scalar value except for the + /// `char` given. + /// + /// This is equivalent to using `(?u-s:.)` with the line terminator set + /// to a particular ASCII byte. (Because of peculiarities in the regex + /// engines, a line terminator must be a single byte. It follows that when + /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar + /// value. That is, ti must be ASCII.) + /// + /// (This and `AnyCharExceptLF` both exist because of legacy reasons. + /// `AnyCharExceptLF` will be dropped in the next breaking change release.) + AnyCharExcept(char), + /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`. + /// + /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`. + AnyCharExceptLF, + /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r` + /// and `\n`. + /// + /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`. + AnyCharExceptCRLF, + /// Matches any byte value except for the `u8` given. + /// + /// This is equivalent to using `(?-us:.)` with the line terminator set + /// to a particular ASCII byte. (Because of peculiarities in the regex + /// engines, a line terminator must be a single byte. It follows that when + /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar + /// value. That is, ti must be ASCII.) + /// + /// (This and `AnyByteExceptLF` both exist because of legacy reasons. + /// `AnyByteExceptLF` will be dropped in the next breaking change release.) + AnyByteExcept(u8), + /// Matches any byte value except for `\n`. + /// + /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`. + AnyByteExceptLF, + /// Matches any byte value except for `\r` and `\n`. + /// + /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`. + AnyByteExceptCRLF, +} + +/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack +/// space but heap space proportional to the depth of the total `Hir`. +impl Drop for Hir { + fn drop(&mut self) { + use core::mem; + + match *self.kind() { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Look(_) => return, + HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, + HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { + return + } + HirKind::Concat(ref x) if x.is_empty() => return, + HirKind::Alternation(ref x) if x.is_empty() => return, + _ => {} + } + + let mut stack = vec![mem::replace(self, Hir::empty())]; + while let Some(mut expr) = stack.pop() { + match expr.kind { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Look(_) => {} + HirKind::Capture(ref mut x) => { + stack.push(mem::replace(&mut x.sub, Hir::empty())); + } + HirKind::Repetition(ref mut x) => { + stack.push(mem::replace(&mut x.sub, Hir::empty())); + } + HirKind::Concat(ref mut x) => { + stack.extend(x.drain(..)); + } + HirKind::Alternation(ref mut x) => { + stack.extend(x.drain(..)); + } + } + } + } +} + +/// A type that collects various properties of an HIR value. +/// +/// Properties are always scalar values and represent meta data that is +/// computed inductively on an HIR value. Properties are defined for all +/// HIR values. +/// +/// All methods on a `Properties` value take constant time and are meant to +/// be cheap to call. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Properties(Box<PropertiesI>); + +/// The property definition. It is split out so that we can box it, and +/// there by make `Properties` use less stack size. This is kind-of important +/// because every HIR value has a `Properties` attached to it. +/// +/// This does have the unfortunate consequence that creating any HIR value +/// always leads to at least one alloc for properties, but this is generally +/// true anyway (for pretty much all HirKinds except for look-arounds). +#[derive(Clone, Debug, Eq, PartialEq)] +struct PropertiesI { + minimum_len: Option<usize>, + maximum_len: Option<usize>, + look_set: LookSet, + look_set_prefix: LookSet, + look_set_suffix: LookSet, + look_set_prefix_any: LookSet, + look_set_suffix_any: LookSet, + utf8: bool, + explicit_captures_len: usize, + static_explicit_captures_len: Option<usize>, + literal: bool, + alternation_literal: bool, +} + +impl Properties { + /// Returns the length (in bytes) of the smallest string matched by this + /// HIR. + /// + /// A return value of `0` is possible and occurs when the HIR can match an + /// empty string. + /// + /// `None` is returned when there is no minimum length. This occurs in + /// precisely the cases where the HIR matches nothing. i.e., The language + /// the regex matches is empty. An example of such a regex is `\P{any}`. + #[inline] + pub fn minimum_len(&self) -> Option<usize> { + self.0.minimum_len + } + + /// Returns the length (in bytes) of the longest string matched by this + /// HIR. + /// + /// A return value of `0` is possible and occurs when nothing longer than + /// the empty string is in the language described by this HIR. + /// + /// `None` is returned when there is no longest matching string. This + /// occurs when the HIR matches nothing or when there is no upper bound on + /// the length of matching strings. Example of such regexes are `\P{any}` + /// (matches nothing) and `a+` (has no upper bound). + #[inline] + pub fn maximum_len(&self) -> Option<usize> { + self.0.maximum_len + } + + /// Returns a set of all look-around assertions that appear at least once + /// in this HIR value. + #[inline] + pub fn look_set(&self) -> LookSet { + self.0.look_set + } + + /// Returns a set of all look-around assertions that appear as a prefix for + /// this HIR value. That is, the set returned corresponds to the set of + /// assertions that must be passed before matching any bytes in a haystack. + /// + /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true + /// if and only if the HIR is fully anchored at the start. + #[inline] + pub fn look_set_prefix(&self) -> LookSet { + self.0.look_set_prefix + } + + /// Returns a set of all look-around assertions that appear as a _possible_ + /// prefix for this HIR value. That is, the set returned corresponds to the + /// set of assertions that _may_ be passed before matching any bytes in a + /// haystack. + /// + /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns + /// true if and only if it's possible for the regex to match through a + /// anchored assertion before consuming any input. + #[inline] + pub fn look_set_prefix_any(&self) -> LookSet { + self.0.look_set_prefix_any + } + + /// Returns a set of all look-around assertions that appear as a suffix for + /// this HIR value. That is, the set returned corresponds to the set of + /// assertions that must be passed in order to be considered a match after + /// all other consuming HIR expressions. + /// + /// For example, `hir.look_set_suffix().contains(Look::End)` returns true + /// if and only if the HIR is fully anchored at the end. + #[inline] + pub fn look_set_suffix(&self) -> LookSet { + self.0.look_set_suffix + } + + /// Returns a set of all look-around assertions that appear as a _possible_ + /// suffix for this HIR value. That is, the set returned corresponds to the + /// set of assertions that _may_ be passed before matching any bytes in a + /// haystack. + /// + /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns + /// true if and only if it's possible for the regex to match through a + /// anchored assertion at the end of a match without consuming any input. + #[inline] + pub fn look_set_suffix_any(&self) -> LookSet { + self.0.look_set_suffix_any + } + + /// Return true if and only if the corresponding HIR will always match + /// valid UTF-8. + /// + /// When this returns false, then it is possible for this HIR expression to + /// match invalid UTF-8, including by matching between the code units of + /// a single UTF-8 encoded codepoint. + /// + /// Note that this returns true even when the corresponding HIR can match + /// the empty string. Since an empty string can technically appear between + /// UTF-8 code units, it is possible for a match to be reported that splits + /// a codepoint which could in turn be considered matching invalid UTF-8. + /// However, it is generally assumed that such empty matches are handled + /// specially by the search routine if it is absolutely required that + /// matches not split a codepoint. + /// + /// # Example + /// + /// This code example shows the UTF-8 property of a variety of patterns. + /// + /// ``` + /// use regex_syntax::{ParserBuilder, parse}; + /// + /// // Examples of 'is_utf8() == true'. + /// assert!(parse(r"a")?.properties().is_utf8()); + /// assert!(parse(r"[^a]")?.properties().is_utf8()); + /// assert!(parse(r".")?.properties().is_utf8()); + /// assert!(parse(r"\W")?.properties().is_utf8()); + /// assert!(parse(r"\b")?.properties().is_utf8()); + /// assert!(parse(r"\B")?.properties().is_utf8()); + /// assert!(parse(r"(?-u)\b")?.properties().is_utf8()); + /// assert!(parse(r"(?-u)\B")?.properties().is_utf8()); + /// // Unicode mode is enabled by default, and in + /// // that mode, all \x hex escapes are treated as + /// // codepoints. So this actually matches the UTF-8 + /// // encoding of U+00FF. + /// assert!(parse(r"\xFF")?.properties().is_utf8()); + /// + /// // Now we show examples of 'is_utf8() == false'. + /// // The only way to do this is to force the parser + /// // to permit invalid UTF-8, otherwise all of these + /// // would fail to parse! + /// let parse = |pattern| { + /// ParserBuilder::new().utf8(false).build().parse(pattern) + /// }; + /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8()); + /// assert!(!parse(r"(?-u).")?.properties().is_utf8()); + /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8()); + /// // Conversely to the equivalent example above, + /// // when Unicode mode is disabled, \x hex escapes + /// // are treated as their raw byte values. + /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8()); + /// // Note that just because we disabled UTF-8 in the + /// // parser doesn't mean we still can't use Unicode. + /// // It is enabled by default, so \xFF is still + /// // equivalent to matching the UTF-8 encoding of + /// // U+00FF by default. + /// assert!(parse(r"\xFF")?.properties().is_utf8()); + /// // Even though we use raw bytes that individually + /// // are not valid UTF-8, when combined together, the + /// // overall expression *does* match valid UTF-8! + /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_utf8(&self) -> bool { + self.0.utf8 + } + + /// Returns the total number of explicit capturing groups in the + /// corresponding HIR. + /// + /// Note that this does not include the implicit capturing group + /// corresponding to the entire match that is typically included by regex + /// engines. + /// + /// # Example + /// + /// This method will return `0` for `a` and `1` for `(a)`: + /// + /// ``` + /// use regex_syntax::parse; + /// + /// assert_eq!(0, parse("a")?.properties().explicit_captures_len()); + /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn explicit_captures_len(&self) -> usize { + self.0.explicit_captures_len + } + + /// Returns the total number of explicit capturing groups that appear in + /// every possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that this does not include the implicit capturing group + /// corresponding to the entire match. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. + /// + /// ``` + /// use regex_syntax::parse; + /// + /// let len = |pattern| { + /// parse(pattern).map(|h| { + /// h.properties().static_explicit_captures_len() + /// }) + /// }; + /// + /// assert_eq!(Some(0), len("a")?); + /// assert_eq!(Some(1), len("(a)")?); + /// assert_eq!(Some(1), len("(a)|(b)")?); + /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(1), len("(b)+")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn static_explicit_captures_len(&self) -> Option<usize> { + self.0.static_explicit_captures_len + } + + /// Return true if and only if this HIR is a simple literal. This is + /// only true when this HIR expression is either itself a `Literal` or a + /// concatenation of only `Literal`s. + /// + /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and + /// the empty string are not (even though they contain sub-expressions that + /// are literals). + #[inline] + pub fn is_literal(&self) -> bool { + self.0.literal + } + + /// Return true if and only if this HIR is either a simple literal or an + /// alternation of simple literals. This is only + /// true when this HIR expression is either itself a `Literal` or a + /// concatenation of only `Literal`s or an alternation of only `Literal`s. + /// + /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation + /// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not + /// (even though that contain sub-expressions that are literals). + #[inline] + pub fn is_alternation_literal(&self) -> bool { + self.0.alternation_literal + } + + /// Returns the total amount of heap memory usage, in bytes, used by this + /// `Properties` value. + #[inline] + pub fn memory_usage(&self) -> usize { + core::mem::size_of::<PropertiesI>() + } + + /// Returns a new set of properties that corresponds to the union of the + /// iterator of properties given. + /// + /// This is useful when one has multiple `Hir` expressions and wants + /// to combine them into a single alternation without constructing the + /// corresponding `Hir`. This routine provides a way of combining the + /// properties of each `Hir` expression into one set of properties + /// representing the union of those expressions. + /// + /// # Example: union with HIRs that never match + /// + /// This example shows that unioning properties together with one that + /// represents a regex that never matches will "poison" certain attributes, + /// like the minimum and maximum lengths. + /// + /// ``` + /// use regex_syntax::{hir::Properties, parse}; + /// + /// let hir1 = parse("ab?c?")?; + /// assert_eq!(Some(1), hir1.properties().minimum_len()); + /// assert_eq!(Some(3), hir1.properties().maximum_len()); + /// + /// let hir2 = parse(r"[a&&b]")?; + /// assert_eq!(None, hir2.properties().minimum_len()); + /// assert_eq!(None, hir2.properties().maximum_len()); + /// + /// let hir3 = parse(r"wxy?z?")?; + /// assert_eq!(Some(2), hir3.properties().minimum_len()); + /// assert_eq!(Some(4), hir3.properties().maximum_len()); + /// + /// let unioned = Properties::union([ + /// hir1.properties(), + /// hir2.properties(), + /// hir3.properties(), + /// ]); + /// assert_eq!(None, unioned.minimum_len()); + /// assert_eq!(None, unioned.maximum_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// The maximum length can also be "poisoned" by a pattern that has no + /// upper bound on the length of a match. The minimum length remains + /// unaffected: + /// + /// ``` + /// use regex_syntax::{hir::Properties, parse}; + /// + /// let hir1 = parse("ab?c?")?; + /// assert_eq!(Some(1), hir1.properties().minimum_len()); + /// assert_eq!(Some(3), hir1.properties().maximum_len()); + /// + /// let hir2 = parse(r"a+")?; + /// assert_eq!(Some(1), hir2.properties().minimum_len()); + /// assert_eq!(None, hir2.properties().maximum_len()); + /// + /// let hir3 = parse(r"wxy?z?")?; + /// assert_eq!(Some(2), hir3.properties().minimum_len()); + /// assert_eq!(Some(4), hir3.properties().maximum_len()); + /// + /// let unioned = Properties::union([ + /// hir1.properties(), + /// hir2.properties(), + /// hir3.properties(), + /// ]); + /// assert_eq!(Some(1), unioned.minimum_len()); + /// assert_eq!(None, unioned.maximum_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn union<I, P>(props: I) -> Properties + where + I: IntoIterator<Item = P>, + P: core::borrow::Borrow<Properties>, + { + let mut it = props.into_iter().peekable(); + // While empty alternations aren't possible, we still behave as if they + // are. When we have an empty alternate, then clearly the look-around + // prefix and suffix is empty. Otherwise, it is the intersection of all + // prefixes and suffixes (respectively) of the branches. + let fix = if it.peek().is_none() { + LookSet::empty() + } else { + LookSet::full() + }; + // And also, an empty alternate means we have 0 static capture groups, + // but we otherwise start with the number corresponding to the first + // alternate. If any subsequent alternate has a different number of + // static capture groups, then we overall have a variation and not a + // static number of groups. + let static_explicit_captures_len = + it.peek().and_then(|p| p.borrow().static_explicit_captures_len()); + // The base case is an empty alternation, which matches nothing. + // Note though that empty alternations aren't possible, because the + // Hir::alternation smart constructor rewrites those as empty character + // classes. + let mut props = PropertiesI { + minimum_len: None, + maximum_len: None, + look_set: LookSet::empty(), + look_set_prefix: fix, + look_set_suffix: fix, + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), + utf8: true, + explicit_captures_len: 0, + static_explicit_captures_len, + literal: false, + alternation_literal: true, + }; + let (mut min_poisoned, mut max_poisoned) = (false, false); + // Handle properties that need to visit every child hir. + for prop in it { + let p = prop.borrow(); + props.look_set.set_union(p.look_set()); + props.look_set_prefix.set_intersect(p.look_set_prefix()); + props.look_set_suffix.set_intersect(p.look_set_suffix()); + props.look_set_prefix_any.set_union(p.look_set_prefix_any()); + props.look_set_suffix_any.set_union(p.look_set_suffix_any()); + props.utf8 = props.utf8 && p.is_utf8(); + props.explicit_captures_len = props + .explicit_captures_len + .saturating_add(p.explicit_captures_len()); + if props.static_explicit_captures_len + != p.static_explicit_captures_len() + { + props.static_explicit_captures_len = None; + } + props.alternation_literal = + props.alternation_literal && p.is_literal(); + if !min_poisoned { + if let Some(xmin) = p.minimum_len() { + if props.minimum_len.map_or(true, |pmin| xmin < pmin) { + props.minimum_len = Some(xmin); + } + } else { + props.minimum_len = None; + min_poisoned = true; + } + } + if !max_poisoned { + if let Some(xmax) = p.maximum_len() { + if props.maximum_len.map_or(true, |pmax| xmax > pmax) { + props.maximum_len = Some(xmax); + } + } else { + props.maximum_len = None; + max_poisoned = true; + } + } + } + Properties(Box::new(props)) + } +} + +impl Properties { + /// Create a new set of HIR properties for an empty regex. + fn empty() -> Properties { + let inner = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), + // It is debatable whether an empty regex always matches at valid + // UTF-8 boundaries. Strictly speaking, at a byte oriented view, + // it is clearly false. There are, for example, many empty strings + // between the bytes encoding a '☃'. + // + // However, when Unicode mode is enabled, the fundamental atom + // of matching is really a codepoint. And in that scenario, an + // empty regex is defined to only match at valid UTF-8 boundaries + // and to never split a codepoint. It just so happens that this + // enforcement is somewhat tricky to do for regexes that match + // the empty string inside regex engines themselves. It usually + // requires some layer above the regex engine to filter out such + // matches. + // + // In any case, 'true' is really the only coherent option. If it + // were false, for example, then 'a*' would also need to be false + // since it too can match the empty string. + utf8: true, + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: false, + alternation_literal: false, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a literal regex. + fn literal(lit: &Literal) -> Properties { + let inner = PropertiesI { + minimum_len: Some(lit.0.len()), + maximum_len: Some(lit.0.len()), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), + utf8: core::str::from_utf8(&lit.0).is_ok(), + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: true, + alternation_literal: true, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a character class. + fn class(class: &Class) -> Properties { + let inner = PropertiesI { + minimum_len: class.minimum_len(), + maximum_len: class.maximum_len(), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), + utf8: class.is_utf8(), + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: false, + alternation_literal: false, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a look-around assertion. + fn look(look: Look) -> Properties { + let inner = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + look_set: LookSet::singleton(look), + look_set_prefix: LookSet::singleton(look), + look_set_suffix: LookSet::singleton(look), + look_set_prefix_any: LookSet::singleton(look), + look_set_suffix_any: LookSet::singleton(look), + // This requires a little explanation. Basically, we don't consider + // matching an empty string to be equivalent to matching invalid + // UTF-8, even though technically matching every empty string will + // split the UTF-8 encoding of a single codepoint when treating a + // UTF-8 encoded string as a sequence of bytes. Our defense here is + // that in such a case, a codepoint should logically be treated as + // the fundamental atom for matching, and thus the only valid match + // points are between codepoints and not bytes. + // + // More practically, this is true here because it's also true + // for 'Hir::empty()', otherwise something like 'a*' would be + // considered to match invalid UTF-8. That in turn makes this + // property borderline useless. + utf8: true, + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: false, + alternation_literal: false, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a repetition. + fn repetition(rep: &Repetition) -> Properties { + let p = rep.sub.properties(); + let minimum_len = p.minimum_len().map(|child_min| { + let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX); + child_min.saturating_mul(rep_min) + }); + let maximum_len = rep.max.and_then(|rep_max| { + let rep_max = usize::try_from(rep_max).ok()?; + let child_max = p.maximum_len()?; + child_max.checked_mul(rep_max) + }); + + let mut inner = PropertiesI { + minimum_len, + maximum_len, + look_set: p.look_set(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + look_set_prefix_any: p.look_set_prefix_any(), + look_set_suffix_any: p.look_set_suffix_any(), + utf8: p.is_utf8(), + explicit_captures_len: p.explicit_captures_len(), + static_explicit_captures_len: p.static_explicit_captures_len(), + literal: false, + alternation_literal: false, + }; + // If the repetition operator can match the empty string, then its + // lookset prefix and suffixes themselves remain empty since they are + // no longer required to match. + if rep.min > 0 { + inner.look_set_prefix = p.look_set_prefix(); + inner.look_set_suffix = p.look_set_suffix(); + } + // If the static captures len of the sub-expression is not known or + // is greater than zero, then it automatically propagates to the + // repetition, regardless of the repetition. Otherwise, it might + // change, but only when the repetition can match 0 times. + if rep.min == 0 + && inner.static_explicit_captures_len.map_or(false, |len| len > 0) + { + // If we require a match 0 times, then our captures len is + // guaranteed to be zero. Otherwise, if we *can* match the empty + // string, then it's impossible to know how many captures will be + // in the resulting match. + if rep.max == Some(0) { + inner.static_explicit_captures_len = Some(0); + } else { + inner.static_explicit_captures_len = None; + } + } + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a capture. + fn capture(capture: &Capture) -> Properties { + let p = capture.sub.properties(); + Properties(Box::new(PropertiesI { + explicit_captures_len: p.explicit_captures_len().saturating_add(1), + static_explicit_captures_len: p + .static_explicit_captures_len() + .map(|len| len.saturating_add(1)), + literal: false, + alternation_literal: false, + ..*p.0.clone() + })) + } + + /// Create a new set of HIR properties for a concatenation. + fn concat(concat: &[Hir]) -> Properties { + // The base case is an empty concatenation, which matches the empty + // string. Note though that empty concatenations aren't possible, + // because the Hir::concat smart constructor rewrites those as + // Hir::empty. + let mut props = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), + utf8: true, + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: true, + alternation_literal: true, + }; + // Handle properties that need to visit every child hir. + for x in concat.iter() { + let p = x.properties(); + props.look_set.set_union(p.look_set()); + props.utf8 = props.utf8 && p.is_utf8(); + props.explicit_captures_len = props + .explicit_captures_len + .saturating_add(p.explicit_captures_len()); + props.static_explicit_captures_len = p + .static_explicit_captures_len() + .and_then(|len1| { + Some((len1, props.static_explicit_captures_len?)) + }) + .and_then(|(len1, len2)| Some(len1.saturating_add(len2))); + props.literal = props.literal && p.is_literal(); + props.alternation_literal = + props.alternation_literal && p.is_alternation_literal(); + if let Some(minimum_len) = props.minimum_len { + match p.minimum_len() { + None => props.minimum_len = None, + Some(len) => { + // We use saturating arithmetic here because the + // minimum is just a lower bound. We can't go any + // higher than what our number types permit. + props.minimum_len = + Some(minimum_len.saturating_add(len)); + } + } + } + if let Some(maximum_len) = props.maximum_len { + match p.maximum_len() { + None => props.maximum_len = None, + Some(len) => { + props.maximum_len = maximum_len.checked_add(len) + } + } + } + } + // Handle the prefix properties, which only requires visiting + // child exprs until one matches more than the empty string. + let mut it = concat.iter(); + while let Some(x) = it.next() { + props.look_set_prefix.set_union(x.properties().look_set_prefix()); + props + .look_set_prefix_any + .set_union(x.properties().look_set_prefix_any()); + if x.properties().maximum_len().map_or(true, |x| x > 0) { + break; + } + } + // Same thing for the suffix properties, but in reverse. + let mut it = concat.iter().rev(); + while let Some(x) = it.next() { + props.look_set_suffix.set_union(x.properties().look_set_suffix()); + props + .look_set_suffix_any + .set_union(x.properties().look_set_suffix_any()); + if x.properties().maximum_len().map_or(true, |x| x > 0) { + break; + } + } + Properties(Box::new(props)) + } + + /// Create a new set of HIR properties for a concatenation. + fn alternation(alts: &[Hir]) -> Properties { + Properties::union(alts.iter().map(|hir| hir.properties())) + } +} + +/// A set of look-around assertions. +/// +/// This is useful for efficiently tracking look-around assertions. For +/// example, an [`Hir`] provides properties that return `LookSet`s. +#[derive(Clone, Copy, Default, Eq, PartialEq)] +pub struct LookSet { + /// The underlying representation this set is exposed to make it possible + /// to store it somewhere efficiently. The representation is that + /// of a bitset, where each assertion occupies bit `i` where `i = + /// Look::as_repr()`. + /// + /// Note that users of this internal representation must permit the full + /// range of `u16` values to be represented. For example, even if the + /// current implementation only makes use of the 10 least significant bits, + /// it may use more bits in a future semver compatible release. + pub bits: u32, +} + +impl LookSet { + /// Create an empty set of look-around assertions. + #[inline] + pub fn empty() -> LookSet { + LookSet { bits: 0 } + } + + /// Create a full set of look-around assertions. + /// + /// This set contains all possible look-around assertions. + #[inline] + pub fn full() -> LookSet { + LookSet { bits: !0 } + } + + /// Create a look-around set containing the look-around assertion given. + /// + /// This is a convenience routine for creating an empty set and inserting + /// one look-around assertions. + #[inline] + pub fn singleton(look: Look) -> LookSet { + LookSet::empty().insert(look) + } + + /// Returns the total number of look-around assertions in this set. + #[inline] + pub fn len(self) -> usize { + // OK because max value always fits in a u8, which in turn always + // fits in a usize, regardless of target. + usize::try_from(self.bits.count_ones()).unwrap() + } + + /// Returns true if and only if this set is empty. + #[inline] + pub fn is_empty(self) -> bool { + self.len() == 0 + } + + /// Returns true if and only if the given look-around assertion is in this + /// set. + #[inline] + pub fn contains(self, look: Look) -> bool { + self.bits & look.as_repr() != 0 + } + + /// Returns true if and only if this set contains any anchor assertions. + /// This includes both "start/end of haystack" and "start/end of line." + #[inline] + pub fn contains_anchor(&self) -> bool { + self.contains_anchor_haystack() || self.contains_anchor_line() + } + + /// Returns true if and only if this set contains any "start/end of + /// haystack" anchors. This doesn't include "start/end of line" anchors. + #[inline] + pub fn contains_anchor_haystack(&self) -> bool { + self.contains(Look::Start) || self.contains(Look::End) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors. This doesn't include "start/end of haystack" anchors. This + /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors. + #[inline] + pub fn contains_anchor_line(&self) -> bool { + self.contains(Look::StartLF) + || self.contains(Look::EndLF) + || self.contains(Look::StartCRLF) + || self.contains(Look::EndCRLF) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors that only treat `\n` as line terminators. This does not include + /// haystack anchors or CRLF aware line anchors. + #[inline] + pub fn contains_anchor_lf(&self) -> bool { + self.contains(Look::StartLF) || self.contains(Look::EndLF) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors that are CRLF-aware. This doesn't include "start/end of + /// haystack" or "start/end of line-feed" anchors. + #[inline] + pub fn contains_anchor_crlf(&self) -> bool { + self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF) + } + + /// Returns true if and only if this set contains any word boundary or + /// negated word boundary assertions. This include both Unicode and ASCII + /// word boundaries. + #[inline] + pub fn contains_word(self) -> bool { + self.contains_word_unicode() || self.contains_word_ascii() + } + + /// Returns true if and only if this set contains any Unicode word boundary + /// or negated Unicode word boundary assertions. + #[inline] + pub fn contains_word_unicode(self) -> bool { + self.contains(Look::WordUnicode) + || self.contains(Look::WordUnicodeNegate) + || self.contains(Look::WordStartUnicode) + || self.contains(Look::WordEndUnicode) + || self.contains(Look::WordStartHalfUnicode) + || self.contains(Look::WordEndHalfUnicode) + } + + /// Returns true if and only if this set contains any ASCII word boundary + /// or negated ASCII word boundary assertions. + #[inline] + pub fn contains_word_ascii(self) -> bool { + self.contains(Look::WordAscii) + || self.contains(Look::WordAsciiNegate) + || self.contains(Look::WordStartAscii) + || self.contains(Look::WordEndAscii) + || self.contains(Look::WordStartHalfAscii) + || self.contains(Look::WordEndHalfAscii) + } + + /// Returns an iterator over all of the look-around assertions in this set. + #[inline] + pub fn iter(self) -> LookSetIter { + LookSetIter { set: self } + } + + /// Return a new set that is equivalent to the original, but with the given + /// assertion added to it. If the assertion is already in the set, then the + /// returned set is equivalent to the original. + #[inline] + pub fn insert(self, look: Look) -> LookSet { + LookSet { bits: self.bits | look.as_repr() } + } + + /// Updates this set in place with the result of inserting the given + /// assertion into this set. + #[inline] + pub fn set_insert(&mut self, look: Look) { + *self = self.insert(look); + } + + /// Return a new set that is equivalent to the original, but with the given + /// assertion removed from it. If the assertion is not in the set, then the + /// returned set is equivalent to the original. + #[inline] + pub fn remove(self, look: Look) -> LookSet { + LookSet { bits: self.bits & !look.as_repr() } + } + + /// Updates this set in place with the result of removing the given + /// assertion from this set. + #[inline] + pub fn set_remove(&mut self, look: Look) { + *self = self.remove(look); + } + + /// Returns a new set that is the result of subtracting the given set from + /// this set. + #[inline] + pub fn subtract(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits & !other.bits } + } + + /// Updates this set in place with the result of subtracting the given set + /// from this set. + #[inline] + pub fn set_subtract(&mut self, other: LookSet) { + *self = self.subtract(other); + } + + /// Returns a new set that is the union of this and the one given. + #[inline] + pub fn union(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits | other.bits } + } + + /// Updates this set in place with the result of unioning it with the one + /// given. + #[inline] + pub fn set_union(&mut self, other: LookSet) { + *self = self.union(other); + } + + /// Returns a new set that is the intersection of this and the one given. + #[inline] + pub fn intersect(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits & other.bits } + } + + /// Updates this set in place with the result of intersecting it with the + /// one given. + #[inline] + pub fn set_intersect(&mut self, other: LookSet) { + *self = self.intersect(other); + } + + /// Return a `LookSet` from the slice given as a native endian 32-bit + /// integer. + /// + /// # Panics + /// + /// This panics if `slice.len() < 4`. + #[inline] + pub fn read_repr(slice: &[u8]) -> LookSet { + let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); + LookSet { bits } + } + + /// Write a `LookSet` as a native endian 32-bit integer to the beginning + /// of the slice given. + /// + /// # Panics + /// + /// This panics if `slice.len() < 4`. + #[inline] + pub fn write_repr(self, slice: &mut [u8]) { + let raw = self.bits.to_ne_bytes(); + slice[0] = raw[0]; + slice[1] = raw[1]; + slice[2] = raw[2]; + slice[3] = raw[3]; + } +} + +impl core::fmt::Debug for LookSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + if self.is_empty() { + return write!(f, "∅"); + } + for look in self.iter() { + write!(f, "{}", look.as_char())?; + } + Ok(()) + } +} + +/// An iterator over all look-around assertions in a [`LookSet`]. +/// +/// This iterator is created by [`LookSet::iter`]. +#[derive(Clone, Debug)] +pub struct LookSetIter { + set: LookSet, +} + +impl Iterator for LookSetIter { + type Item = Look; + + #[inline] + fn next(&mut self) -> Option<Look> { + if self.set.is_empty() { + return None; + } + // We'll never have more than u8::MAX distinct look-around assertions, + // so 'bit' will always fit into a u16. + let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(1 << bit)?; + self.set = self.set.remove(look); + Some(look) + } +} + +/// Given a sequence of HIR values where each value corresponds to a Unicode +/// class (or an all-ASCII byte class), return a single Unicode class +/// corresponding to the union of the classes found. +fn class_chars(hirs: &[Hir]) -> Option<Class> { + let mut cls = ClassUnicode::new(vec![]); + for hir in hirs.iter() { + match *hir.kind() { + HirKind::Class(Class::Unicode(ref cls2)) => { + cls.union(cls2); + } + HirKind::Class(Class::Bytes(ref cls2)) => { + cls.union(&cls2.to_unicode_class()?); + } + _ => return None, + }; + } + Some(Class::Unicode(cls)) +} + +/// Given a sequence of HIR values where each value corresponds to a byte class +/// (or an all-ASCII Unicode class), return a single byte class corresponding +/// to the union of the classes found. +fn class_bytes(hirs: &[Hir]) -> Option<Class> { + let mut cls = ClassBytes::new(vec![]); + for hir in hirs.iter() { + match *hir.kind() { + HirKind::Class(Class::Unicode(ref cls2)) => { + cls.union(&cls2.to_byte_class()?); + } + HirKind::Class(Class::Bytes(ref cls2)) => { + cls.union(cls2); + } + _ => return None, + }; + } + Some(Class::Bytes(cls)) +} + +/// Given a sequence of HIR values where each value corresponds to a literal +/// that is a single `char`, return that sequence of `char`s. Otherwise return +/// None. No deduplication is done. +fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> { + let mut singletons = vec![]; + for hir in hirs.iter() { + let literal = match *hir.kind() { + HirKind::Literal(Literal(ref bytes)) => bytes, + _ => return None, + }; + let ch = match crate::debug::utf8_decode(literal) { + None => return None, + Some(Err(_)) => return None, + Some(Ok(ch)) => ch, + }; + if literal.len() != ch.len_utf8() { + return None; + } + singletons.push(ch); + } + Some(singletons) +} + +/// Given a sequence of HIR values where each value corresponds to a literal +/// that is a single byte, return that sequence of bytes. Otherwise return +/// None. No deduplication is done. +fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> { + let mut singletons = vec![]; + for hir in hirs.iter() { + let literal = match *hir.kind() { + HirKind::Literal(Literal(ref bytes)) => bytes, + _ => return None, + }; + if literal.len() != 1 { + return None; + } + singletons.push(literal[0]); + } + Some(singletons) +} + +/// Looks for a common prefix in the list of alternation branches given. If one +/// is found, then an equivalent but (hopefully) simplified Hir is returned. +/// Otherwise, the original given list of branches is returned unmodified. +/// +/// This is not quite as good as it could be. Right now, it requires that +/// all branches are 'Concat' expressions. It also doesn't do well with +/// literals. For example, given 'foofoo|foobar', it will not refactor it to +/// 'foo(?:foo|bar)' because literals are flattened into their own special +/// concatenation. (One wonders if perhaps 'Literal' should be a single atom +/// instead of a string of bytes because of this. Otherwise, handling the +/// current representation in this routine will be pretty gnarly. Sigh.) +fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> { + if hirs.len() <= 1 { + return Err(hirs); + } + let mut prefix = match hirs[0].kind() { + HirKind::Concat(ref xs) => &**xs, + _ => return Err(hirs), + }; + if prefix.is_empty() { + return Err(hirs); + } + for h in hirs.iter().skip(1) { + let concat = match h.kind() { + HirKind::Concat(ref xs) => xs, + _ => return Err(hirs), + }; + let common_len = prefix + .iter() + .zip(concat.iter()) + .take_while(|(x, y)| x == y) + .count(); + prefix = &prefix[..common_len]; + if prefix.is_empty() { + return Err(hirs); + } + } + let len = prefix.len(); + assert_ne!(0, len); + let mut prefix_concat = vec![]; + let mut suffix_alts = vec![]; + for h in hirs { + let mut concat = match h.into_kind() { + HirKind::Concat(xs) => xs, + // We required all sub-expressions to be + // concats above, so we're only here if we + // have a concat. + _ => unreachable!(), + }; + suffix_alts.push(Hir::concat(concat.split_off(len))); + if prefix_concat.is_empty() { + prefix_concat = concat; + } + } + let mut concat = prefix_concat; + concat.push(Hir::alternation(suffix_alts)); + Ok(Hir::concat(concat)) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn uclass(ranges: &[(char, char)]) -> ClassUnicode { + let ranges: Vec<ClassUnicodeRange> = ranges + .iter() + .map(|&(s, e)| ClassUnicodeRange::new(s, e)) + .collect(); + ClassUnicode::new(ranges) + } + + fn bclass(ranges: &[(u8, u8)]) -> ClassBytes { + let ranges: Vec<ClassBytesRange> = + ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect(); + ClassBytes::new(ranges) + } + + fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> { + cls.iter().map(|x| (x.start(), x.end())).collect() + } + + #[cfg(feature = "unicode-case")] + fn ucasefold(cls: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls.clone(); + cls_.case_fold_simple(); + cls_ + } + + fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.union(cls2); + cls_ + } + + fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.intersect(cls2); + cls_ + } + + fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.difference(cls2); + cls_ + } + + fn usymdifference( + cls1: &ClassUnicode, + cls2: &ClassUnicode, + ) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.symmetric_difference(cls2); + cls_ + } + + fn unegate(cls: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls.clone(); + cls_.negate(); + cls_ + } + + fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> { + cls.iter().map(|x| (x.start(), x.end())).collect() + } + + fn bcasefold(cls: &ClassBytes) -> ClassBytes { + let mut cls_ = cls.clone(); + cls_.case_fold_simple(); + cls_ + } + + fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.union(cls2); + cls_ + } + + fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.intersect(cls2); + cls_ + } + + fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.difference(cls2); + cls_ + } + + fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.symmetric_difference(cls2); + cls_ + } + + fn bnegate(cls: &ClassBytes) -> ClassBytes { + let mut cls_ = cls.clone(); + cls_.negate(); + cls_ + } + + #[test] + fn class_range_canonical_unicode() { + let range = ClassUnicodeRange::new('\u{00FF}', '\0'); + assert_eq!('\0', range.start()); + assert_eq!('\u{00FF}', range.end()); + } + + #[test] + fn class_range_canonical_bytes() { + let range = ClassBytesRange::new(b'\xFF', b'\0'); + assert_eq!(b'\0', range.start()); + assert_eq!(b'\xFF', range.end()); + } + + #[test] + fn class_canonicalize_unicode() { + let cls = uclass(&[('a', 'c'), ('x', 'z')]); + let expected = vec![('a', 'c'), ('x', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('x', 'z'), ('a', 'c')]); + let expected = vec![('a', 'c'), ('x', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('x', 'z'), ('w', 'y')]); + let expected = vec![('w', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[ + ('c', 'f'), + ('a', 'g'), + ('d', 'j'), + ('a', 'c'), + ('m', 'p'), + ('l', 's'), + ]); + let expected = vec![('a', 'j'), ('l', 's')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('x', 'z'), ('u', 'w')]); + let expected = vec![('u', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]); + let expected = vec![('\x00', '\u{10FFFF}')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('a', 'a'), ('b', 'b')]); + let expected = vec![('a', 'b')]; + assert_eq!(expected, uranges(&cls)); + } + + #[test] + fn class_canonicalize_bytes() { + let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); + let expected = vec![(b'a', b'c'), (b'x', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]); + let expected = vec![(b'a', b'c'), (b'x', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]); + let expected = vec![(b'w', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[ + (b'c', b'f'), + (b'a', b'g'), + (b'd', b'j'), + (b'a', b'c'), + (b'm', b'p'), + (b'l', b's'), + ]); + let expected = vec![(b'a', b'j'), (b'l', b's')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]); + let expected = vec![(b'u', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]); + let expected = vec![(b'\x00', b'\xFF')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); + let expected = vec![(b'a', b'b')]; + assert_eq!(expected, branges(&cls)); + } + + #[test] + #[cfg(feature = "unicode-case")] + fn class_case_fold_unicode() { + let cls = uclass(&[ + ('C', 'F'), + ('A', 'G'), + ('D', 'J'), + ('A', 'C'), + ('M', 'P'), + ('L', 'S'), + ('c', 'f'), + ]); + let expected = uclass(&[ + ('A', 'J'), + ('L', 'S'), + ('a', 'j'), + ('l', 's'), + ('\u{17F}', '\u{17F}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'Z')]); + let expected = uclass(&[ + ('A', 'Z'), + ('a', 'z'), + ('\u{17F}', '\u{17F}'), + ('\u{212A}', '\u{212A}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('a', 'z')]); + let expected = uclass(&[ + ('A', 'Z'), + ('a', 'z'), + ('\u{17F}', '\u{17F}'), + ('\u{212A}', '\u{212A}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'A'), ('_', '_')]); + let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'A'), ('=', '=')]); + let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('\x00', '\x10')]); + assert_eq!(cls, ucasefold(&cls)); + + let cls = uclass(&[('k', 'k')]); + let expected = + uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('@', '@')]); + assert_eq!(cls, ucasefold(&cls)); + } + + #[test] + #[cfg(not(feature = "unicode-case"))] + fn class_case_fold_unicode_disabled() { + let mut cls = uclass(&[ + ('C', 'F'), + ('A', 'G'), + ('D', 'J'), + ('A', 'C'), + ('M', 'P'), + ('L', 'S'), + ('c', 'f'), + ]); + assert!(cls.try_case_fold_simple().is_err()); + } + + #[test] + #[should_panic] + #[cfg(not(feature = "unicode-case"))] + fn class_case_fold_unicode_disabled_panics() { + let mut cls = uclass(&[ + ('C', 'F'), + ('A', 'G'), + ('D', 'J'), + ('A', 'C'), + ('M', 'P'), + ('L', 'S'), + ('c', 'f'), + ]); + cls.case_fold_simple(); + } + + #[test] + fn class_case_fold_bytes() { + let cls = bclass(&[ + (b'C', b'F'), + (b'A', b'G'), + (b'D', b'J'), + (b'A', b'C'), + (b'M', b'P'), + (b'L', b'S'), + (b'c', b'f'), + ]); + let expected = + bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'Z')]); + let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'a', b'z')]); + let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]); + let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]); + let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'\x00', b'\x10')]); + assert_eq!(cls, bcasefold(&cls)); + + let cls = bclass(&[(b'k', b'k')]); + let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'@', b'@')]); + assert_eq!(cls, bcasefold(&cls)); + } + + #[test] + fn class_negate_unicode() { + let cls = uclass(&[('a', 'a')]); + let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', 'a'), ('b', 'b')]); + let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', 'c'), ('x', 'z')]); + let expected = uclass(&[ + ('\x00', '\x60'), + ('\x64', '\x77'), + ('\x7B', '\u{10FFFF}'), + ]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', 'a')]); + let expected = uclass(&[('\x62', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\x60')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{10FFFF}')]); + let expected = uclass(&[]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[]); + let expected = uclass(&[('\x00', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = + uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]); + let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{D7FF}')]); + let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{D7FE}')]); + let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\u{D7FF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\u{E000}')]); + assert_eq!(expected, unegate(&cls)); + } + + #[test] + fn class_negate_bytes() { + let cls = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); + let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); + let expected = bclass(&[ + (b'\x00', b'\x60'), + (b'\x64', b'\x77'), + (b'\x7B', b'\xFF'), + ]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'a')]); + let expected = bclass(&[(b'\x62', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'a', b'\xFF')]); + let expected = bclass(&[(b'\x00', b'\x60')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFF')]); + let expected = bclass(&[]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[]); + let expected = bclass(&[(b'\x00', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]); + let expected = bclass(&[(b'\xFE', b'\xFE')]); + assert_eq!(expected, bnegate(&cls)); + } + + #[test] + fn class_union_unicode() { + let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]); + let cls2 = uclass(&[('a', 'z')]); + let expected = uclass(&[('a', 'z'), ('A', 'C')]); + assert_eq!(expected, uunion(&cls1, &cls2)); + } + + #[test] + fn class_union_bytes() { + let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]); + let cls2 = bclass(&[(b'a', b'z')]); + let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]); + assert_eq!(expected, bunion(&cls1, &cls2)); + } + + #[test] + fn class_intersect_unicode() { + let cls1 = uclass(&[]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('b', 'b')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('a', 'c')]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('a', 'c')]); + let expected = uclass(&[('a', 'b')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('b', 'c')]); + let expected = uclass(&[('b', 'b')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('c', 'd')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('b', 'c')]); + let cls2 = uclass(&[('a', 'd')]); + let expected = uclass(&[('b', 'c')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = uclass(&[('a', 'h')]); + let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('g', 'h')]); + let cls2 = uclass(&[('d', 'e'), ('k', 'l')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = uclass(&[('h', 'h')]); + let expected = uclass(&[('h', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]); + let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]); + let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]); + let expected = uclass(&[('b', 'f')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + } + + #[test] + fn class_intersect_bytes() { + let cls1 = bclass(&[]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'b', b'b')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'c')]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'a', b'c')]); + let expected = bclass(&[(b'a', b'b')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'b', b'c')]); + let expected = bclass(&[(b'b', b'b')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'c', b'd')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'b', b'c')]); + let cls2 = bclass(&[(b'a', b'd')]); + let expected = bclass(&[(b'b', b'c')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'a', b'h')]); + let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]); + let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'h', b'h')]); + let expected = bclass(&[(b'h', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]); + let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]); + let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]); + let expected = bclass(&[(b'b', b'f')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + } + + #[test] + fn class_difference_unicode() { + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[('b', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('z', 'z')]); + let expected = uclass(&[('a', 'y')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('m', 'm')]); + let expected = uclass(&[('a', 'l'), ('n', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('a', 'z')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('d', 'v')]); + let expected = uclass(&[('a', 'c')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('b', 'g'), ('s', 'u')]); + let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('x', 'z')]); + let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('x', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + } + + #[test] + fn class_difference_bytes() { + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'b', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'z', b'z')]); + let expected = bclass(&[(b'a', b'y')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'm', b'm')]); + let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'a', b'z')]); + let expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'd', b'v')]); + let expected = bclass(&[(b'a', b'c')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'x', b'z')]); + let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'x', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + } + + #[test] + fn class_symmetric_difference_unicode() { + let cls1 = uclass(&[('a', 'm')]); + let cls2 = uclass(&[('g', 't')]); + let expected = uclass(&[('a', 'f'), ('n', 't')]); + assert_eq!(expected, usymdifference(&cls1, &cls2)); + } + + #[test] + fn class_symmetric_difference_bytes() { + let cls1 = bclass(&[(b'a', b'm')]); + let cls2 = bclass(&[(b'g', b't')]); + let expected = bclass(&[(b'a', b'f'), (b'n', b't')]); + assert_eq!(expected, bsymdifference(&cls1, &cls2)); + } + + // We use a thread with an explicit stack size to test that our destructor + // for Hir can handle arbitrarily sized expressions in constant stack + // space. In case we run on a platform without threads (WASM?), we limit + // this test to Windows/Unix. + #[test] + #[cfg(any(unix, windows))] + fn no_stack_overflow_on_drop() { + use std::thread; + + let run = || { + let mut expr = Hir::empty(); + for _ in 0..100 { + expr = Hir::capture(Capture { + index: 1, + name: None, + sub: Box::new(expr), + }); + expr = Hir::repetition(Repetition { + min: 0, + max: Some(1), + greedy: true, + sub: Box::new(expr), + }); + + expr = Hir { + kind: HirKind::Concat(vec![expr]), + props: Properties::empty(), + }; + expr = Hir { + kind: HirKind::Alternation(vec![expr]), + props: Properties::empty(), + }; + } + assert!(!matches!(*expr.kind(), HirKind::Empty)); + }; + + // We run our test on a thread with a small stack size so we can + // force the issue more easily. + // + // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests' + // for context on the specific stack size chosen here. + thread::Builder::new() + .stack_size(16 << 10) + .spawn(run) + .unwrap() + .join() + .unwrap(); + } + + #[test] + fn look_set_iter() { + let set = LookSet::empty(); + assert_eq!(0, set.iter().count()); + + let set = LookSet::full(); + assert_eq!(18, set.iter().count()); + + let set = + LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); + assert_eq!(2, set.iter().count()); + + let set = LookSet::empty().insert(Look::StartLF); + assert_eq!(1, set.iter().count()); + + let set = LookSet::empty().insert(Look::WordAsciiNegate); + assert_eq!(1, set.iter().count()); + } + + #[test] + fn look_set_debug() { + let res = format!("{:?}", LookSet::empty()); + assert_eq!("∅", res); + let res = format!("{:?}", LookSet::full()); + assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res); + } +} diff --git a/vendor/regex-syntax/src/hir/print.rs b/vendor/regex-syntax/src/hir/print.rs new file mode 100644 index 0000000..dfa6d40 --- /dev/null +++ b/vendor/regex-syntax/src/hir/print.rs @@ -0,0 +1,608 @@ +/*! +This module provides a regular expression printer for `Hir`. +*/ + +use core::fmt; + +use crate::{ + hir::{ + self, + visitor::{self, Visitor}, + Hir, HirKind, + }, + is_meta_character, +}; + +/// A builder for constructing a printer. +/// +/// Note that since a printer doesn't have any configuration knobs, this type +/// remains unexported. +#[derive(Clone, Debug)] +struct PrinterBuilder { + _priv: (), +} + +impl Default for PrinterBuilder { + fn default() -> PrinterBuilder { + PrinterBuilder::new() + } +} + +impl PrinterBuilder { + fn new() -> PrinterBuilder { + PrinterBuilder { _priv: () } + } + + fn build(&self) -> Printer { + Printer { _priv: () } + } +} + +/// A printer for a regular expression's high-level intermediate +/// representation. +/// +/// A printer converts a high-level intermediate representation (HIR) to a +/// regular expression pattern string. This particular printer uses constant +/// stack space and heap space proportional to the size of the HIR. +/// +/// Since this printer is only using the HIR, the pattern it prints will likely +/// not resemble the original pattern at all. For example, a pattern like +/// `\pL` will have its entire class written out. +/// +/// The purpose of this printer is to provide a means to mutate an HIR and then +/// build a regular expression from the result of that mutation. (A regex +/// library could provide a constructor from this HIR explicitly, but that +/// creates an unnecessary public coupling between the regex library and this +/// specific HIR representation.) +#[derive(Debug)] +pub struct Printer { + _priv: (), +} + +impl Printer { + /// Create a new printer. + pub fn new() -> Printer { + PrinterBuilder::new().build() + } + + /// Print the given `Ast` to the given writer. The writer must implement + /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used + /// here are a `fmt::Formatter` (which is available in `fmt::Display` + /// implementations) or a `&mut String`. + pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result { + visitor::visit(hir, Writer { wtr }) + } +} + +#[derive(Debug)] +struct Writer<W> { + wtr: W, +} + +impl<W: fmt::Write> Visitor for Writer<W> { + type Output = (); + type Err = fmt::Error; + + fn finish(self) -> fmt::Result { + Ok(()) + } + + fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { + match *hir.kind() { + HirKind::Empty => { + // Technically an empty sub-expression could be "printed" by + // just ignoring it, but in practice, you could have a + // repetition operator attached to an empty expression, and you + // really need something in the concrete syntax to make that + // work as you'd expect. + self.wtr.write_str(r"(?:)")?; + } + // Repetition operators are strictly suffix oriented. + HirKind::Repetition(_) => {} + HirKind::Literal(hir::Literal(ref bytes)) => { + // See the comment on the 'Concat' and 'Alternation' case below + // for why we put parens here. Literals are, conceptually, + // a special case of concatenation where each element is a + // character. The HIR flattens this into a Box<[u8]>, but we + // still need to treat it like a concatenation for correct + // printing. As a special case, we don't write parens if there + // is only one character. One character means there is no + // concat so we don't need parens. Adding parens would still be + // correct, but we drop them here because it tends to create + // rather noisy regexes even in simple cases. + let result = core::str::from_utf8(bytes); + let len = result.map_or(bytes.len(), |s| s.chars().count()); + if len > 1 { + self.wtr.write_str(r"(?:")?; + } + match result { + Ok(string) => { + for c in string.chars() { + self.write_literal_char(c)?; + } + } + Err(_) => { + for &b in bytes.iter() { + self.write_literal_byte(b)?; + } + } + } + if len > 1 { + self.wtr.write_str(r")")?; + } + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + if cls.ranges().is_empty() { + return self.wtr.write_str("[a&&b]"); + } + self.wtr.write_str("[")?; + for range in cls.iter() { + if range.start() == range.end() { + self.write_literal_char(range.start())?; + } else if u32::from(range.start()) + 1 + == u32::from(range.end()) + { + self.write_literal_char(range.start())?; + self.write_literal_char(range.end())?; + } else { + self.write_literal_char(range.start())?; + self.wtr.write_str("-")?; + self.write_literal_char(range.end())?; + } + } + self.wtr.write_str("]")?; + } + HirKind::Class(hir::Class::Bytes(ref cls)) => { + if cls.ranges().is_empty() { + return self.wtr.write_str("[a&&b]"); + } + self.wtr.write_str("(?-u:[")?; + for range in cls.iter() { + if range.start() == range.end() { + self.write_literal_class_byte(range.start())?; + } else if range.start() + 1 == range.end() { + self.write_literal_class_byte(range.start())?; + self.write_literal_class_byte(range.end())?; + } else { + self.write_literal_class_byte(range.start())?; + self.wtr.write_str("-")?; + self.write_literal_class_byte(range.end())?; + } + } + self.wtr.write_str("])")?; + } + HirKind::Look(ref look) => match *look { + hir::Look::Start => { + self.wtr.write_str(r"\A")?; + } + hir::Look::End => { + self.wtr.write_str(r"\z")?; + } + hir::Look::StartLF => { + self.wtr.write_str("(?m:^)")?; + } + hir::Look::EndLF => { + self.wtr.write_str("(?m:$)")?; + } + hir::Look::StartCRLF => { + self.wtr.write_str("(?mR:^)")?; + } + hir::Look::EndCRLF => { + self.wtr.write_str("(?mR:$)")?; + } + hir::Look::WordAscii => { + self.wtr.write_str(r"(?-u:\b)")?; + } + hir::Look::WordAsciiNegate => { + self.wtr.write_str(r"(?-u:\B)")?; + } + hir::Look::WordUnicode => { + self.wtr.write_str(r"\b")?; + } + hir::Look::WordUnicodeNegate => { + self.wtr.write_str(r"\B")?; + } + hir::Look::WordStartAscii => { + self.wtr.write_str(r"(?-u:\b{start})")?; + } + hir::Look::WordEndAscii => { + self.wtr.write_str(r"(?-u:\b{end})")?; + } + hir::Look::WordStartUnicode => { + self.wtr.write_str(r"\b{start}")?; + } + hir::Look::WordEndUnicode => { + self.wtr.write_str(r"\b{end}")?; + } + hir::Look::WordStartHalfAscii => { + self.wtr.write_str(r"(?-u:\b{start-half})")?; + } + hir::Look::WordEndHalfAscii => { + self.wtr.write_str(r"(?-u:\b{end-half})")?; + } + hir::Look::WordStartHalfUnicode => { + self.wtr.write_str(r"\b{start-half}")?; + } + hir::Look::WordEndHalfUnicode => { + self.wtr.write_str(r"\b{end-half}")?; + } + }, + HirKind::Capture(hir::Capture { ref name, .. }) => { + self.wtr.write_str("(")?; + if let Some(ref name) = *name { + write!(self.wtr, "?P<{}>", name)?; + } + } + // Why do this? Wrapping concats and alts in non-capturing groups + // is not *always* necessary, but is sometimes necessary. For + // example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)' + // and not 'ab|c'. The former is clearly the intended meaning, but + // the latter is actually 'alt(concat(a, b), c)'. + // + // It would be possible to only group these things in cases where + // it's strictly necessary, but it requires knowing the parent + // expression. And since this technique is simpler and always + // correct, we take this route. More to the point, it is a non-goal + // of an HIR printer to show a nice easy-to-read regex. Indeed, + // its construction forbids it from doing so. Therefore, inserting + // extra groups where they aren't necessary is perfectly okay. + HirKind::Concat(_) | HirKind::Alternation(_) => { + self.wtr.write_str(r"(?:")?; + } + } + Ok(()) + } + + fn visit_post(&mut self, hir: &Hir) -> fmt::Result { + match *hir.kind() { + // Handled during visit_pre + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Look(_) => {} + HirKind::Repetition(ref x) => { + match (x.min, x.max) { + (0, Some(1)) => { + self.wtr.write_str("?")?; + } + (0, None) => { + self.wtr.write_str("*")?; + } + (1, None) => { + self.wtr.write_str("+")?; + } + (1, Some(1)) => { + // 'a{1}' and 'a{1}?' are exactly equivalent to 'a'. + return Ok(()); + } + (m, None) => { + write!(self.wtr, "{{{},}}", m)?; + } + (m, Some(n)) if m == n => { + write!(self.wtr, "{{{}}}", m)?; + // a{m} and a{m}? are always exactly equivalent. + return Ok(()); + } + (m, Some(n)) => { + write!(self.wtr, "{{{},{}}}", m, n)?; + } + } + if !x.greedy { + self.wtr.write_str("?")?; + } + } + HirKind::Capture(_) + | HirKind::Concat(_) + | HirKind::Alternation(_) => { + self.wtr.write_str(r")")?; + } + } + Ok(()) + } + + fn visit_alternation_in(&mut self) -> fmt::Result { + self.wtr.write_str("|") + } +} + +impl<W: fmt::Write> Writer<W> { + fn write_literal_char(&mut self, c: char) -> fmt::Result { + if is_meta_character(c) { + self.wtr.write_str("\\")?; + } + self.wtr.write_char(c) + } + + fn write_literal_byte(&mut self, b: u8) -> fmt::Result { + if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { + self.write_literal_char(char::try_from(b).unwrap()) + } else { + write!(self.wtr, "(?-u:\\x{:02X})", b) + } + } + + fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result { + if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { + self.write_literal_char(char::try_from(b).unwrap()) + } else { + write!(self.wtr, "\\x{:02X}", b) + } + } +} + +#[cfg(test)] +mod tests { + use alloc::{ + boxed::Box, + string::{String, ToString}, + }; + + use crate::ParserBuilder; + + use super::*; + + fn roundtrip(given: &str, expected: &str) { + roundtrip_with(|b| b, given, expected); + } + + fn roundtrip_bytes(given: &str, expected: &str) { + roundtrip_with(|b| b.utf8(false), given, expected); + } + + fn roundtrip_with<F>(mut f: F, given: &str, expected: &str) + where + F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder, + { + let mut builder = ParserBuilder::new(); + f(&mut builder); + let hir = builder.build().parse(given).unwrap(); + + let mut printer = Printer::new(); + let mut dst = String::new(); + printer.print(&hir, &mut dst).unwrap(); + + // Check that the result is actually valid. + builder.build().parse(&dst).unwrap(); + + assert_eq!(expected, dst); + } + + #[test] + fn print_literal() { + roundtrip("a", "a"); + roundtrip(r"\xff", "\u{FF}"); + roundtrip_bytes(r"\xff", "\u{FF}"); + roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)"); + roundtrip("☃", "☃"); + } + + #[test] + fn print_class() { + roundtrip(r"[a]", r"a"); + roundtrip(r"[ab]", r"[ab]"); + roundtrip(r"[a-z]", r"[a-z]"); + roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]"); + roundtrip(r"[^\x01-\u{10FFFF}]", "\u{0}"); + roundtrip(r"[-]", r"\-"); + roundtrip(r"[☃-⛄]", r"[☃-⛄]"); + + roundtrip(r"(?-u)[a]", r"a"); + roundtrip(r"(?-u)[ab]", r"(?-u:[ab])"); + roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])"); + roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])"); + + // The following test that the printer escapes meta characters + // in character classes. + roundtrip(r"[\[]", r"\["); + roundtrip(r"[Z-_]", r"[Z-_]"); + roundtrip(r"[Z-_--Z]", r"[\[-_]"); + + // The following test that the printer escapes meta characters + // in byte oriented character classes. + roundtrip_bytes(r"(?-u)[\[]", r"\["); + roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])"); + roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])"); + + // This tests that an empty character class is correctly roundtripped. + #[cfg(feature = "unicode-gencat")] + roundtrip(r"\P{any}", r"[a&&b]"); + roundtrip_bytes(r"(?-u)[^\x00-\xFF]", r"[a&&b]"); + } + + #[test] + fn print_anchor() { + roundtrip(r"^", r"\A"); + roundtrip(r"$", r"\z"); + roundtrip(r"(?m)^", r"(?m:^)"); + roundtrip(r"(?m)$", r"(?m:$)"); + } + + #[test] + fn print_word_boundary() { + roundtrip(r"\b", r"\b"); + roundtrip(r"\B", r"\B"); + roundtrip(r"(?-u)\b", r"(?-u:\b)"); + roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)"); + } + + #[test] + fn print_repetition() { + roundtrip("a?", "a?"); + roundtrip("a??", "a??"); + roundtrip("(?U)a?", "a??"); + + roundtrip("a*", "a*"); + roundtrip("a*?", "a*?"); + roundtrip("(?U)a*", "a*?"); + + roundtrip("a+", "a+"); + roundtrip("a+?", "a+?"); + roundtrip("(?U)a+", "a+?"); + + roundtrip("a{1}", "a"); + roundtrip("a{2}", "a{2}"); + roundtrip("a{1,}", "a+"); + roundtrip("a{1,5}", "a{1,5}"); + roundtrip("a{1}?", "a"); + roundtrip("a{2}?", "a{2}"); + roundtrip("a{1,}?", "a+?"); + roundtrip("a{1,5}?", "a{1,5}?"); + roundtrip("(?U)a{1}", "a"); + roundtrip("(?U)a{2}", "a{2}"); + roundtrip("(?U)a{1,}", "a+?"); + roundtrip("(?U)a{1,5}", "a{1,5}?"); + + // Test that various zero-length repetitions always translate to an + // empty regex. This is more a property of HIR's smart constructors + // than the printer though. + roundtrip("a{0}", "(?:)"); + roundtrip("(?:ab){0}", "(?:)"); + #[cfg(feature = "unicode-gencat")] + { + roundtrip(r"\p{any}{0}", "(?:)"); + roundtrip(r"\P{any}{0}", "(?:)"); + } + } + + #[test] + fn print_group() { + roundtrip("()", "((?:))"); + roundtrip("(?P<foo>)", "(?P<foo>(?:))"); + roundtrip("(?:)", "(?:)"); + + roundtrip("(a)", "(a)"); + roundtrip("(?P<foo>a)", "(?P<foo>a)"); + roundtrip("(?:a)", "a"); + + roundtrip("((((a))))", "((((a))))"); + } + + #[test] + fn print_alternation() { + roundtrip("|", "(?:(?:)|(?:))"); + roundtrip("||", "(?:(?:)|(?:)|(?:))"); + + roundtrip("a|b", "[ab]"); + roundtrip("ab|cd", "(?:(?:ab)|(?:cd))"); + roundtrip("a|b|c", "[a-c]"); + roundtrip("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))"); + roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))"); + } + + // This is a regression test that stresses a peculiarity of how the HIR + // is both constructed and printed. Namely, it is legal for a repetition + // to directly contain a concatenation. This particular construct isn't + // really possible to build from the concrete syntax directly, since you'd + // be forced to put the concatenation into (at least) a non-capturing + // group. Concurrently, the printer doesn't consider this case and just + // kind of naively prints the child expression and tacks on the repetition + // operator. + // + // As a result, if you attached '+' to a 'concat(a, b)', the printer gives + // you 'ab+', but clearly it really should be '(?:ab)+'. + // + // This bug isn't easy to surface because most ways of building an HIR + // come directly from the concrete syntax, and as mentioned above, it just + // isn't possible to build this kind of HIR from the concrete syntax. + // Nevertheless, this is definitely a bug. + // + // See: https://github.com/rust-lang/regex/issues/731 + #[test] + fn regression_repetition_concat() { + let expr = Hir::concat(alloc::vec![ + Hir::literal("x".as_bytes()), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::literal("ab".as_bytes())), + }), + Hir::literal("y".as_bytes()), + ]); + assert_eq!(r"(?:x(?:ab)+y)", expr.to_string()); + + let expr = Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::look(hir::Look::End), + ])), + }), + Hir::look(hir::Look::End), + ]); + assert_eq!(r"(?:\A\A\z\z)", expr.to_string()); + } + + // Just like regression_repetition_concat, but with the repetition using + // an alternation as a child expression instead. + // + // See: https://github.com/rust-lang/regex/issues/731 + #[test] + fn regression_repetition_alternation() { + let expr = Hir::concat(alloc::vec![ + Hir::literal("ab".as_bytes()), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::alternation(alloc::vec![ + Hir::literal("cd".as_bytes()), + Hir::literal("ef".as_bytes()), + ])), + }), + Hir::literal("gh".as_bytes()), + ]); + assert_eq!(r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))", expr.to_string()); + + let expr = Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::alternation(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::look(hir::Look::End), + ])), + }), + Hir::look(hir::Look::End), + ]); + assert_eq!(r"(?:\A(?:\A|\z)\z)", expr.to_string()); + } + + // This regression test is very similar in flavor to + // regression_repetition_concat in that the root of the issue lies in a + // peculiarity of how the HIR is represented and how the printer writes it + // out. Like the other regression, this one is also rooted in the fact that + // you can't produce the peculiar HIR from the concrete syntax. Namely, you + // just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally + // be in (at least) a non-capturing group. Why? Because the '|' has very + // low precedence (lower that concatenation), and so something like 'ab|c' + // is actually 'alt(ab, c)'. + // + // See: https://github.com/rust-lang/regex/issues/516 + #[test] + fn regression_alternation_concat() { + let expr = Hir::concat(alloc::vec![ + Hir::literal("ab".as_bytes()), + Hir::alternation(alloc::vec![ + Hir::literal("mn".as_bytes()), + Hir::literal("xy".as_bytes()), + ]), + ]); + assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string()); + + let expr = Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::alternation(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::look(hir::Look::End), + ]), + ]); + assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string()); + } +} diff --git a/vendor/regex-syntax/src/hir/translate.rs b/vendor/regex-syntax/src/hir/translate.rs new file mode 100644 index 0000000..313a1e9 --- /dev/null +++ b/vendor/regex-syntax/src/hir/translate.rs @@ -0,0 +1,3724 @@ +/*! +Defines a translator that converts an `Ast` to an `Hir`. +*/ + +use core::cell::{Cell, RefCell}; + +use alloc::{boxed::Box, string::ToString, vec, vec::Vec}; + +use crate::{ + ast::{self, Ast, Span, Visitor}, + either::Either, + hir::{self, Error, ErrorKind, Hir, HirKind}, + unicode::{self, ClassQuery}, +}; + +type Result<T> = core::result::Result<T, Error>; + +/// A builder for constructing an AST->HIR translator. +#[derive(Clone, Debug)] +pub struct TranslatorBuilder { + utf8: bool, + line_terminator: u8, + flags: Flags, +} + +impl Default for TranslatorBuilder { + fn default() -> TranslatorBuilder { + TranslatorBuilder::new() + } +} + +impl TranslatorBuilder { + /// Create a new translator builder with a default c onfiguration. + pub fn new() -> TranslatorBuilder { + TranslatorBuilder { + utf8: true, + line_terminator: b'\n', + flags: Flags::default(), + } + } + + /// Build a translator using the current configuration. + pub fn build(&self) -> Translator { + Translator { + stack: RefCell::new(vec![]), + flags: Cell::new(self.flags), + utf8: self.utf8, + line_terminator: self.line_terminator, + } + } + + /// When disabled, translation will permit the construction of a regular + /// expression that may match invalid UTF-8. + /// + /// When enabled (the default), the translator is guaranteed to produce an + /// expression that, for non-empty matches, will only ever produce spans + /// that are entirely valid UTF-8 (otherwise, the translator will return an + /// error). + /// + /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even + /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete + /// syntax) will be allowed even though they can produce matches that split + /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty" + /// matches, and it is expected that the regex engine itself must handle + /// these cases if necessary (perhaps by suppressing any zero-width matches + /// that split a codepoint). + pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.utf8 = yes; + self + } + + /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. + /// + /// Namely, instead of `.` (by default) matching everything except for `\n`, + /// this will cause `.` to match everything except for the byte given. + /// + /// If `.` is used in a context where Unicode mode is enabled and this byte + /// isn't ASCII, then an error will be returned. When Unicode mode is + /// disabled, then any byte is permitted, but will return an error if UTF-8 + /// mode is enabled and it is a non-ASCII byte. + /// + /// In short, any ASCII value for a line terminator is always okay. But a + /// non-ASCII byte might result in an error depending on whether Unicode + /// mode or UTF-8 mode are enabled. + /// + /// Note that if `R` mode is enabled then it always takes precedence and + /// the line terminator will be treated as `\r` and `\n` simultaneously. + /// + /// Note also that this *doesn't* impact the look-around assertions + /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional + /// configuration in the regex engine itself. + pub fn line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder { + self.line_terminator = byte; + self + } + + /// Enable or disable the case insensitive flag (`i`) by default. + pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.case_insensitive = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the multi-line matching flag (`m`) by default. + pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.multi_line = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the "dot matches any character" flag (`s`) by + /// default. + pub fn dot_matches_new_line( + &mut self, + yes: bool, + ) -> &mut TranslatorBuilder { + self.flags.dot_matches_new_line = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the CRLF mode flag (`R`) by default. + pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.crlf = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the "swap greed" flag (`U`) by default. + pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.swap_greed = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the Unicode flag (`u`) by default. + pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.unicode = if yes { None } else { Some(false) }; + self + } +} + +/// A translator maps abstract syntax to a high level intermediate +/// representation. +/// +/// A translator may be benefit from reuse. That is, a translator can translate +/// many abstract syntax trees. +/// +/// A `Translator` can be configured in more detail via a +/// [`TranslatorBuilder`]. +#[derive(Clone, Debug)] +pub struct Translator { + /// Our call stack, but on the heap. + stack: RefCell<Vec<HirFrame>>, + /// The current flag settings. + flags: Cell<Flags>, + /// Whether we're allowed to produce HIR that can match arbitrary bytes. + utf8: bool, + /// The line terminator to use for `.`. + line_terminator: u8, +} + +impl Translator { + /// Create a new translator using the default configuration. + pub fn new() -> Translator { + TranslatorBuilder::new().build() + } + + /// Translate the given abstract syntax tree (AST) into a high level + /// intermediate representation (HIR). + /// + /// If there was a problem doing the translation, then an HIR-specific + /// error is returned. + /// + /// The original pattern string used to produce the `Ast` *must* also be + /// provided. The translator does not use the pattern string during any + /// correct translation, but is used for error reporting. + pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> { + ast::visit(ast, TranslatorI::new(self, pattern)) + } +} + +/// An HirFrame is a single stack frame, represented explicitly, which is +/// created for each item in the Ast that we traverse. +/// +/// Note that technically, this type doesn't represent our entire stack +/// frame. In particular, the Ast visitor represents any state associated with +/// traversing the Ast itself. +#[derive(Clone, Debug)] +enum HirFrame { + /// An arbitrary HIR expression. These get pushed whenever we hit a base + /// case in the Ast. They get popped after an inductive (i.e., recursive) + /// step is complete. + Expr(Hir), + /// A literal that is being constructed, character by character, from the + /// AST. We need this because the AST gives each individual character its + /// own node. So as we see characters, we peek at the top-most HirFrame. + /// If it's a literal, then we add to it. Otherwise, we push a new literal. + /// When it comes time to pop it, we convert it to an Hir via Hir::literal. + Literal(Vec<u8>), + /// A Unicode character class. This frame is mutated as we descend into + /// the Ast of a character class (which is itself its own mini recursive + /// structure). + ClassUnicode(hir::ClassUnicode), + /// A byte-oriented character class. This frame is mutated as we descend + /// into the Ast of a character class (which is itself its own mini + /// recursive structure). + /// + /// Byte character classes are created when Unicode mode (`u`) is disabled. + /// If `utf8` is enabled (the default), then a byte character is only + /// permitted to match ASCII text. + ClassBytes(hir::ClassBytes), + /// This is pushed whenever a repetition is observed. After visiting every + /// sub-expression in the repetition, the translator's stack is expected to + /// have this sentinel at the top. + /// + /// This sentinel only exists to stop other things (like flattening + /// literals) from reaching across repetition operators. + Repetition, + /// This is pushed on to the stack upon first seeing any kind of capture, + /// indicated by parentheses (including non-capturing groups). It is popped + /// upon leaving a group. + Group { + /// The old active flags when this group was opened. + /// + /// If this group sets flags, then the new active flags are set to the + /// result of merging the old flags with the flags introduced by this + /// group. If the group doesn't set any flags, then this is simply + /// equivalent to whatever flags were set when the group was opened. + /// + /// When this group is popped, the active flags should be restored to + /// the flags set here. + /// + /// The "active" flags correspond to whatever flags are set in the + /// Translator. + old_flags: Flags, + }, + /// This is pushed whenever a concatenation is observed. After visiting + /// every sub-expression in the concatenation, the translator's stack is + /// popped until it sees a Concat frame. + Concat, + /// This is pushed whenever an alternation is observed. After visiting + /// every sub-expression in the alternation, the translator's stack is + /// popped until it sees an Alternation frame. + Alternation, + /// This is pushed immediately before each sub-expression in an + /// alternation. This separates the branches of an alternation on the + /// stack and prevents literal flattening from reaching across alternation + /// branches. + /// + /// It is popped after each expression in a branch until an 'Alternation' + /// frame is observed when doing a post visit on an alternation. + AlternationBranch, +} + +impl HirFrame { + /// Assert that the current stack frame is an Hir expression and return it. + fn unwrap_expr(self) -> Hir { + match self { + HirFrame::Expr(expr) => expr, + HirFrame::Literal(lit) => Hir::literal(lit), + _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self), + } + } + + /// Assert that the current stack frame is a Unicode class expression and + /// return it. + fn unwrap_class_unicode(self) -> hir::ClassUnicode { + match self { + HirFrame::ClassUnicode(cls) => cls, + _ => panic!( + "tried to unwrap Unicode class \ + from HirFrame, got: {:?}", + self + ), + } + } + + /// Assert that the current stack frame is a byte class expression and + /// return it. + fn unwrap_class_bytes(self) -> hir::ClassBytes { + match self { + HirFrame::ClassBytes(cls) => cls, + _ => panic!( + "tried to unwrap byte class \ + from HirFrame, got: {:?}", + self + ), + } + } + + /// Assert that the current stack frame is a repetition sentinel. If it + /// isn't, then panic. + fn unwrap_repetition(self) { + match self { + HirFrame::Repetition => {} + _ => { + panic!( + "tried to unwrap repetition from HirFrame, got: {:?}", + self + ) + } + } + } + + /// Assert that the current stack frame is a group indicator and return + /// its corresponding flags (the flags that were active at the time the + /// group was entered). + fn unwrap_group(self) -> Flags { + match self { + HirFrame::Group { old_flags } => old_flags, + _ => { + panic!("tried to unwrap group from HirFrame, got: {:?}", self) + } + } + } + + /// Assert that the current stack frame is an alternation pipe sentinel. If + /// it isn't, then panic. + fn unwrap_alternation_pipe(self) { + match self { + HirFrame::AlternationBranch => {} + _ => { + panic!( + "tried to unwrap alt pipe from HirFrame, got: {:?}", + self + ) + } + } + } +} + +impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { + type Output = Hir; + type Err = Error; + + fn finish(self) -> Result<Hir> { + // ... otherwise, we should have exactly one HIR on the stack. + assert_eq!(self.trans().stack.borrow().len(), 1); + Ok(self.pop().unwrap().unwrap_expr()) + } + + fn visit_pre(&mut self, ast: &Ast) -> Result<()> { + match *ast { + Ast::ClassBracketed(_) => { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + } + Ast::Repetition(_) => self.push(HirFrame::Repetition), + Ast::Group(ref x) => { + let old_flags = x + .flags() + .map(|ast| self.set_flags(ast)) + .unwrap_or_else(|| self.flags()); + self.push(HirFrame::Group { old_flags }); + } + Ast::Concat(_) => { + self.push(HirFrame::Concat); + } + Ast::Alternation(ref x) => { + self.push(HirFrame::Alternation); + if !x.asts.is_empty() { + self.push(HirFrame::AlternationBranch); + } + } + _ => {} + } + Ok(()) + } + + fn visit_post(&mut self, ast: &Ast) -> Result<()> { + match *ast { + Ast::Empty(_) => { + self.push(HirFrame::Expr(Hir::empty())); + } + Ast::Flags(ref x) => { + self.set_flags(&x.flags); + // Flags in the AST are generally considered directives and + // not actual sub-expressions. However, they can be used in + // the concrete syntax like `((?i))`, and we need some kind of + // indication of an expression there, and Empty is the correct + // choice. + // + // There can also be things like `(?i)+`, but we rule those out + // in the parser. In the future, we might allow them for + // consistency sake. + self.push(HirFrame::Expr(Hir::empty())); + } + Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? { + Either::Right(byte) => self.push_byte(byte), + Either::Left(ch) => match self.case_fold_char(x.span, ch)? { + None => self.push_char(ch), + Some(expr) => self.push(HirFrame::Expr(expr)), + }, + }, + Ast::Dot(ref span) => { + self.push(HirFrame::Expr(self.hir_dot(**span)?)); + } + Ast::Assertion(ref x) => { + self.push(HirFrame::Expr(self.hir_assertion(x)?)); + } + Ast::ClassPerl(ref x) => { + if self.flags().unicode() { + let cls = self.hir_perl_unicode_class(x)?; + let hcls = hir::Class::Unicode(cls); + self.push(HirFrame::Expr(Hir::class(hcls))); + } else { + let cls = self.hir_perl_byte_class(x)?; + let hcls = hir::Class::Bytes(cls); + self.push(HirFrame::Expr(Hir::class(hcls))); + } + } + Ast::ClassUnicode(ref x) => { + let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); + self.push(HirFrame::Expr(Hir::class(cls))); + } + Ast::ClassBracketed(ref ast) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + self.unicode_fold_and_negate( + &ast.span, + ast.negated, + &mut cls, + )?; + let expr = Hir::class(hir::Class::Unicode(cls)); + self.push(HirFrame::Expr(expr)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + self.bytes_fold_and_negate( + &ast.span, + ast.negated, + &mut cls, + )?; + let expr = Hir::class(hir::Class::Bytes(cls)); + self.push(HirFrame::Expr(expr)); + } + } + Ast::Repetition(ref x) => { + let expr = self.pop().unwrap().unwrap_expr(); + self.pop().unwrap().unwrap_repetition(); + self.push(HirFrame::Expr(self.hir_repetition(x, expr))); + } + Ast::Group(ref x) => { + let expr = self.pop().unwrap().unwrap_expr(); + let old_flags = self.pop().unwrap().unwrap_group(); + self.trans().flags.set(old_flags); + self.push(HirFrame::Expr(self.hir_capture(x, expr))); + } + Ast::Concat(_) => { + let mut exprs = vec![]; + while let Some(expr) = self.pop_concat_expr() { + if !matches!(*expr.kind(), HirKind::Empty) { + exprs.push(expr); + } + } + exprs.reverse(); + self.push(HirFrame::Expr(Hir::concat(exprs))); + } + Ast::Alternation(_) => { + let mut exprs = vec![]; + while let Some(expr) = self.pop_alt_expr() { + self.pop().unwrap().unwrap_alternation_pipe(); + exprs.push(expr); + } + exprs.reverse(); + self.push(HirFrame::Expr(Hir::alternation(exprs))); + } + } + Ok(()) + } + + fn visit_alternation_in(&mut self) -> Result<()> { + self.push(HirFrame::AlternationBranch); + Ok(()) + } + + fn visit_class_set_item_pre( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<()> { + match *ast { + ast::ClassSetItem::Bracketed(_) => { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + } + // We needn't handle the Union case here since the visitor will + // do it for us. + _ => {} + } + Ok(()) + } + + fn visit_class_set_item_post( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<()> { + match *ast { + ast::ClassSetItem::Empty(_) => {} + ast::ClassSetItem::Literal(ref x) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.push(hir::ClassUnicodeRange::new(x.c, x.c)); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + let byte = self.class_literal_byte(x)?; + cls.push(hir::ClassBytesRange::new(byte, byte)); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Range(ref x) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c)); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + let start = self.class_literal_byte(&x.start)?; + let end = self.class_literal_byte(&x.end)?; + cls.push(hir::ClassBytesRange::new(start, end)); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Ascii(ref x) => { + if self.flags().unicode() { + let xcls = self.hir_ascii_unicode_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.union(&xcls); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let xcls = self.hir_ascii_byte_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + cls.union(&xcls); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Unicode(ref x) => { + let xcls = self.hir_unicode_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.union(&xcls); + self.push(HirFrame::ClassUnicode(cls)); + } + ast::ClassSetItem::Perl(ref x) => { + if self.flags().unicode() { + let xcls = self.hir_perl_unicode_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.union(&xcls); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let xcls = self.hir_perl_byte_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + cls.union(&xcls); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Bracketed(ref ast) => { + if self.flags().unicode() { + let mut cls1 = self.pop().unwrap().unwrap_class_unicode(); + self.unicode_fold_and_negate( + &ast.span, + ast.negated, + &mut cls1, + )?; + + let mut cls2 = self.pop().unwrap().unwrap_class_unicode(); + cls2.union(&cls1); + self.push(HirFrame::ClassUnicode(cls2)); + } else { + let mut cls1 = self.pop().unwrap().unwrap_class_bytes(); + self.bytes_fold_and_negate( + &ast.span, + ast.negated, + &mut cls1, + )?; + + let mut cls2 = self.pop().unwrap().unwrap_class_bytes(); + cls2.union(&cls1); + self.push(HirFrame::ClassBytes(cls2)); + } + } + // This is handled automatically by the visitor. + ast::ClassSetItem::Union(_) => {} + } + Ok(()) + } + + fn visit_class_set_binary_op_pre( + &mut self, + _op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } + + fn visit_class_set_binary_op_in( + &mut self, + _op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } + + fn visit_class_set_binary_op_post( + &mut self, + op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + use crate::ast::ClassSetBinaryOpKind::*; + + if self.flags().unicode() { + let mut rhs = self.pop().unwrap().unwrap_class_unicode(); + let mut lhs = self.pop().unwrap().unwrap_class_unicode(); + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + if self.flags().case_insensitive() { + rhs.try_case_fold_simple().map_err(|_| { + self.error( + op.rhs.span().clone(), + ErrorKind::UnicodeCaseUnavailable, + ) + })?; + lhs.try_case_fold_simple().map_err(|_| { + self.error( + op.lhs.span().clone(), + ErrorKind::UnicodeCaseUnavailable, + ) + })?; + } + match op.kind { + Intersection => lhs.intersect(&rhs), + Difference => lhs.difference(&rhs), + SymmetricDifference => lhs.symmetric_difference(&rhs), + } + cls.union(&lhs); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut rhs = self.pop().unwrap().unwrap_class_bytes(); + let mut lhs = self.pop().unwrap().unwrap_class_bytes(); + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + if self.flags().case_insensitive() { + rhs.case_fold_simple(); + lhs.case_fold_simple(); + } + match op.kind { + Intersection => lhs.intersect(&rhs), + Difference => lhs.difference(&rhs), + SymmetricDifference => lhs.symmetric_difference(&rhs), + } + cls.union(&lhs); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } +} + +/// The internal implementation of a translator. +/// +/// This type is responsible for carrying around the original pattern string, +/// which is not tied to the internal state of a translator. +/// +/// A TranslatorI exists for the time it takes to translate a single Ast. +#[derive(Clone, Debug)] +struct TranslatorI<'t, 'p> { + trans: &'t Translator, + pattern: &'p str, +} + +impl<'t, 'p> TranslatorI<'t, 'p> { + /// Build a new internal translator. + fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> { + TranslatorI { trans, pattern } + } + + /// Return a reference to the underlying translator. + fn trans(&self) -> &Translator { + &self.trans + } + + /// Push the given frame on to the call stack. + fn push(&self, frame: HirFrame) { + self.trans().stack.borrow_mut().push(frame); + } + + /// Push the given literal char on to the call stack. + /// + /// If the top-most element of the stack is a literal, then the char + /// is appended to the end of that literal. Otherwise, a new literal + /// containing just the given char is pushed to the top of the stack. + fn push_char(&self, ch: char) { + let mut buf = [0; 4]; + let bytes = ch.encode_utf8(&mut buf).as_bytes(); + let mut stack = self.trans().stack.borrow_mut(); + if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { + literal.extend_from_slice(bytes); + } else { + stack.push(HirFrame::Literal(bytes.to_vec())); + } + } + + /// Push the given literal byte on to the call stack. + /// + /// If the top-most element of the stack is a literal, then the byte + /// is appended to the end of that literal. Otherwise, a new literal + /// containing just the given byte is pushed to the top of the stack. + fn push_byte(&self, byte: u8) { + let mut stack = self.trans().stack.borrow_mut(); + if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { + literal.push(byte); + } else { + stack.push(HirFrame::Literal(vec![byte])); + } + } + + /// Pop the top of the call stack. If the call stack is empty, return None. + fn pop(&self) -> Option<HirFrame> { + self.trans().stack.borrow_mut().pop() + } + + /// Pop an HIR expression from the top of the stack for a concatenation. + /// + /// This returns None if the stack is empty or when a concat frame is seen. + /// Otherwise, it panics if it could not find an HIR expression. + fn pop_concat_expr(&self) -> Option<Hir> { + let frame = self.pop()?; + match frame { + HirFrame::Concat => None, + HirFrame::Expr(expr) => Some(expr), + HirFrame::Literal(lit) => Some(Hir::literal(lit)), + HirFrame::ClassUnicode(_) => { + unreachable!("expected expr or concat, got Unicode class") + } + HirFrame::ClassBytes(_) => { + unreachable!("expected expr or concat, got byte class") + } + HirFrame::Repetition => { + unreachable!("expected expr or concat, got repetition") + } + HirFrame::Group { .. } => { + unreachable!("expected expr or concat, got group") + } + HirFrame::Alternation => { + unreachable!("expected expr or concat, got alt marker") + } + HirFrame::AlternationBranch => { + unreachable!("expected expr or concat, got alt branch marker") + } + } + } + + /// Pop an HIR expression from the top of the stack for an alternation. + /// + /// This returns None if the stack is empty or when an alternation frame is + /// seen. Otherwise, it panics if it could not find an HIR expression. + fn pop_alt_expr(&self) -> Option<Hir> { + let frame = self.pop()?; + match frame { + HirFrame::Alternation => None, + HirFrame::Expr(expr) => Some(expr), + HirFrame::Literal(lit) => Some(Hir::literal(lit)), + HirFrame::ClassUnicode(_) => { + unreachable!("expected expr or alt, got Unicode class") + } + HirFrame::ClassBytes(_) => { + unreachable!("expected expr or alt, got byte class") + } + HirFrame::Repetition => { + unreachable!("expected expr or alt, got repetition") + } + HirFrame::Group { .. } => { + unreachable!("expected expr or alt, got group") + } + HirFrame::Concat => { + unreachable!("expected expr or alt, got concat marker") + } + HirFrame::AlternationBranch => { + unreachable!("expected expr or alt, got alt branch marker") + } + } + } + + /// Create a new error with the given span and error type. + fn error(&self, span: Span, kind: ErrorKind) -> Error { + Error { kind, pattern: self.pattern.to_string(), span } + } + + /// Return a copy of the active flags. + fn flags(&self) -> Flags { + self.trans().flags.get() + } + + /// Set the flags of this translator from the flags set in the given AST. + /// Then, return the old flags. + fn set_flags(&self, ast_flags: &ast::Flags) -> Flags { + let old_flags = self.flags(); + let mut new_flags = Flags::from_ast(ast_flags); + new_flags.merge(&old_flags); + self.trans().flags.set(new_flags); + old_flags + } + + /// Convert an Ast literal to its scalar representation. + /// + /// When Unicode mode is enabled, then this always succeeds and returns a + /// `char` (Unicode scalar value). + /// + /// When Unicode mode is disabled, then a `char` will still be returned + /// whenever possible. A byte is returned only when invalid UTF-8 is + /// allowed and when the byte is not ASCII. Otherwise, a non-ASCII byte + /// will result in an error when invalid UTF-8 is not allowed. + fn ast_literal_to_scalar( + &self, + lit: &ast::Literal, + ) -> Result<Either<char, u8>> { + if self.flags().unicode() { + return Ok(Either::Left(lit.c)); + } + let byte = match lit.byte() { + None => return Ok(Either::Left(lit.c)), + Some(byte) => byte, + }; + if byte <= 0x7F { + return Ok(Either::Left(char::try_from(byte).unwrap())); + } + if self.trans().utf8 { + return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); + } + Ok(Either::Right(byte)) + } + + fn case_fold_char(&self, span: Span, c: char) -> Result<Option<Hir>> { + if !self.flags().case_insensitive() { + return Ok(None); + } + if self.flags().unicode() { + // If case folding won't do anything, then don't bother trying. + let map = unicode::SimpleCaseFolder::new() + .map(|f| f.overlaps(c, c)) + .map_err(|_| { + self.error(span, ErrorKind::UnicodeCaseUnavailable) + })?; + if !map { + return Ok(None); + } + let mut cls = + hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new( + c, c, + )]); + cls.try_case_fold_simple().map_err(|_| { + self.error(span, ErrorKind::UnicodeCaseUnavailable) + })?; + Ok(Some(Hir::class(hir::Class::Unicode(cls)))) + } else { + if !c.is_ascii() { + return Ok(None); + } + // If case folding won't do anything, then don't bother trying. + match c { + 'A'..='Z' | 'a'..='z' => {} + _ => return Ok(None), + } + let mut cls = + hir::ClassBytes::new(vec![hir::ClassBytesRange::new( + // OK because 'c.len_utf8() == 1' which in turn implies + // that 'c' is ASCII. + u8::try_from(c).unwrap(), + u8::try_from(c).unwrap(), + )]); + cls.case_fold_simple(); + Ok(Some(Hir::class(hir::Class::Bytes(cls)))) + } + } + + fn hir_dot(&self, span: Span) -> Result<Hir> { + let (utf8, lineterm, flags) = + (self.trans().utf8, self.trans().line_terminator, self.flags()); + if utf8 && (!flags.unicode() || !lineterm.is_ascii()) { + return Err(self.error(span, ErrorKind::InvalidUtf8)); + } + let dot = if flags.dot_matches_new_line() { + if flags.unicode() { + hir::Dot::AnyChar + } else { + hir::Dot::AnyByte + } + } else { + if flags.unicode() { + if flags.crlf() { + hir::Dot::AnyCharExceptCRLF + } else { + if !lineterm.is_ascii() { + return Err( + self.error(span, ErrorKind::InvalidLineTerminator) + ); + } + hir::Dot::AnyCharExcept(char::from(lineterm)) + } + } else { + if flags.crlf() { + hir::Dot::AnyByteExceptCRLF + } else { + hir::Dot::AnyByteExcept(lineterm) + } + } + }; + Ok(Hir::dot(dot)) + } + + fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> { + let unicode = self.flags().unicode(); + let multi_line = self.flags().multi_line(); + let crlf = self.flags().crlf(); + Ok(match asst.kind { + ast::AssertionKind::StartLine => Hir::look(if multi_line { + if crlf { + hir::Look::StartCRLF + } else { + hir::Look::StartLF + } + } else { + hir::Look::Start + }), + ast::AssertionKind::EndLine => Hir::look(if multi_line { + if crlf { + hir::Look::EndCRLF + } else { + hir::Look::EndLF + } + } else { + hir::Look::End + }), + ast::AssertionKind::StartText => Hir::look(hir::Look::Start), + ast::AssertionKind::EndText => Hir::look(hir::Look::End), + ast::AssertionKind::WordBoundary => Hir::look(if unicode { + hir::Look::WordUnicode + } else { + hir::Look::WordAscii + }), + ast::AssertionKind::NotWordBoundary => Hir::look(if unicode { + hir::Look::WordUnicodeNegate + } else { + hir::Look::WordAsciiNegate + }), + ast::AssertionKind::WordBoundaryStart + | ast::AssertionKind::WordBoundaryStartAngle => { + Hir::look(if unicode { + hir::Look::WordStartUnicode + } else { + hir::Look::WordStartAscii + }) + } + ast::AssertionKind::WordBoundaryEnd + | ast::AssertionKind::WordBoundaryEndAngle => { + Hir::look(if unicode { + hir::Look::WordEndUnicode + } else { + hir::Look::WordEndAscii + }) + } + ast::AssertionKind::WordBoundaryStartHalf => { + Hir::look(if unicode { + hir::Look::WordStartHalfUnicode + } else { + hir::Look::WordStartHalfAscii + }) + } + ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode { + hir::Look::WordEndHalfUnicode + } else { + hir::Look::WordEndHalfAscii + }), + }) + } + + fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir { + let (index, name) = match group.kind { + ast::GroupKind::CaptureIndex(index) => (index, None), + ast::GroupKind::CaptureName { ref name, .. } => { + (name.index, Some(name.name.clone().into_boxed_str())) + } + // The HIR doesn't need to use non-capturing groups, since the way + // in which the data type is defined handles this automatically. + ast::GroupKind::NonCapturing(_) => return expr, + }; + Hir::capture(hir::Capture { index, name, sub: Box::new(expr) }) + } + + fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir { + let (min, max) = match rep.op.kind { + ast::RepetitionKind::ZeroOrOne => (0, Some(1)), + ast::RepetitionKind::ZeroOrMore => (0, None), + ast::RepetitionKind::OneOrMore => (1, None), + ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => { + (m, Some(m)) + } + ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => { + (m, None) + } + ast::RepetitionKind::Range(ast::RepetitionRange::Bounded( + m, + n, + )) => (m, Some(n)), + }; + let greedy = + if self.flags().swap_greed() { !rep.greedy } else { rep.greedy }; + Hir::repetition(hir::Repetition { + min, + max, + greedy, + sub: Box::new(expr), + }) + } + + fn hir_unicode_class( + &self, + ast_class: &ast::ClassUnicode, + ) -> Result<hir::ClassUnicode> { + use crate::ast::ClassUnicodeKind::*; + + if !self.flags().unicode() { + return Err( + self.error(ast_class.span, ErrorKind::UnicodeNotAllowed) + ); + } + let query = match ast_class.kind { + OneLetter(name) => ClassQuery::OneLetter(name), + Named(ref name) => ClassQuery::Binary(name), + NamedValue { ref name, ref value, .. } => ClassQuery::ByValue { + property_name: name, + property_value: value, + }, + }; + let mut result = self.convert_unicode_class_error( + &ast_class.span, + unicode::class(query), + ); + if let Ok(ref mut class) = result { + self.unicode_fold_and_negate( + &ast_class.span, + ast_class.negated, + class, + )?; + } + result + } + + fn hir_ascii_unicode_class( + &self, + ast: &ast::ClassAscii, + ) -> Result<hir::ClassUnicode> { + let mut cls = hir::ClassUnicode::new( + ascii_class_as_chars(&ast.kind) + .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), + ); + self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?; + Ok(cls) + } + + fn hir_ascii_byte_class( + &self, + ast: &ast::ClassAscii, + ) -> Result<hir::ClassBytes> { + let mut cls = hir::ClassBytes::new( + ascii_class(&ast.kind) + .map(|(s, e)| hir::ClassBytesRange::new(s, e)), + ); + self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?; + Ok(cls) + } + + fn hir_perl_unicode_class( + &self, + ast_class: &ast::ClassPerl, + ) -> Result<hir::ClassUnicode> { + use crate::ast::ClassPerlKind::*; + + assert!(self.flags().unicode()); + let result = match ast_class.kind { + Digit => unicode::perl_digit(), + Space => unicode::perl_space(), + Word => unicode::perl_word(), + }; + let mut class = + self.convert_unicode_class_error(&ast_class.span, result)?; + // We needn't apply case folding here because the Perl Unicode classes + // are already closed under Unicode simple case folding. + if ast_class.negated { + class.negate(); + } + Ok(class) + } + + fn hir_perl_byte_class( + &self, + ast_class: &ast::ClassPerl, + ) -> Result<hir::ClassBytes> { + use crate::ast::ClassPerlKind::*; + + assert!(!self.flags().unicode()); + let mut class = match ast_class.kind { + Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit), + Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space), + Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word), + }; + // We needn't apply case folding here because the Perl ASCII classes + // are already closed (under ASCII case folding). + if ast_class.negated { + class.negate(); + } + // Negating a Perl byte class is likely to cause it to match invalid + // UTF-8. That's only OK if the translator is configured to allow such + // things. + if self.trans().utf8 && !class.is_ascii() { + return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8)); + } + Ok(class) + } + + /// Converts the given Unicode specific error to an HIR translation error. + /// + /// The span given should approximate the position at which an error would + /// occur. + fn convert_unicode_class_error( + &self, + span: &Span, + result: core::result::Result<hir::ClassUnicode, unicode::Error>, + ) -> Result<hir::ClassUnicode> { + result.map_err(|err| { + let sp = span.clone(); + match err { + unicode::Error::PropertyNotFound => { + self.error(sp, ErrorKind::UnicodePropertyNotFound) + } + unicode::Error::PropertyValueNotFound => { + self.error(sp, ErrorKind::UnicodePropertyValueNotFound) + } + unicode::Error::PerlClassNotFound => { + self.error(sp, ErrorKind::UnicodePerlClassNotFound) + } + } + }) + } + + fn unicode_fold_and_negate( + &self, + span: &Span, + negated: bool, + class: &mut hir::ClassUnicode, + ) -> Result<()> { + // Note that we must apply case folding before negation! + // Consider `(?i)[^x]`. If we applied negation first, then + // the result would be the character class that matched any + // Unicode scalar value. + if self.flags().case_insensitive() { + class.try_case_fold_simple().map_err(|_| { + self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable) + })?; + } + if negated { + class.negate(); + } + Ok(()) + } + + fn bytes_fold_and_negate( + &self, + span: &Span, + negated: bool, + class: &mut hir::ClassBytes, + ) -> Result<()> { + // Note that we must apply case folding before negation! + // Consider `(?i)[^x]`. If we applied negation first, then + // the result would be the character class that matched any + // Unicode scalar value. + if self.flags().case_insensitive() { + class.case_fold_simple(); + } + if negated { + class.negate(); + } + if self.trans().utf8 && !class.is_ascii() { + return Err(self.error(span.clone(), ErrorKind::InvalidUtf8)); + } + Ok(()) + } + + /// Return a scalar byte value suitable for use as a literal in a byte + /// character class. + fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> { + match self.ast_literal_to_scalar(ast)? { + Either::Right(byte) => Ok(byte), + Either::Left(ch) => { + if ch.is_ascii() { + Ok(u8::try_from(ch).unwrap()) + } else { + // We can't feasibly support Unicode in + // byte oriented classes. Byte classes don't + // do Unicode case folding. + Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed)) + } + } + } + } +} + +/// A translator's representation of a regular expression's flags at any given +/// moment in time. +/// +/// Each flag can be in one of three states: absent, present but disabled or +/// present but enabled. +#[derive(Clone, Copy, Debug, Default)] +struct Flags { + case_insensitive: Option<bool>, + multi_line: Option<bool>, + dot_matches_new_line: Option<bool>, + swap_greed: Option<bool>, + unicode: Option<bool>, + crlf: Option<bool>, + // Note that `ignore_whitespace` is omitted here because it is handled + // entirely in the parser. +} + +impl Flags { + fn from_ast(ast: &ast::Flags) -> Flags { + let mut flags = Flags::default(); + let mut enable = true; + for item in &ast.items { + match item.kind { + ast::FlagsItemKind::Negation => { + enable = false; + } + ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => { + flags.case_insensitive = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => { + flags.multi_line = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => { + flags.dot_matches_new_line = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => { + flags.swap_greed = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::Unicode) => { + flags.unicode = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::CRLF) => { + flags.crlf = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {} + } + } + flags + } + + fn merge(&mut self, previous: &Flags) { + if self.case_insensitive.is_none() { + self.case_insensitive = previous.case_insensitive; + } + if self.multi_line.is_none() { + self.multi_line = previous.multi_line; + } + if self.dot_matches_new_line.is_none() { + self.dot_matches_new_line = previous.dot_matches_new_line; + } + if self.swap_greed.is_none() { + self.swap_greed = previous.swap_greed; + } + if self.unicode.is_none() { + self.unicode = previous.unicode; + } + if self.crlf.is_none() { + self.crlf = previous.crlf; + } + } + + fn case_insensitive(&self) -> bool { + self.case_insensitive.unwrap_or(false) + } + + fn multi_line(&self) -> bool { + self.multi_line.unwrap_or(false) + } + + fn dot_matches_new_line(&self) -> bool { + self.dot_matches_new_line.unwrap_or(false) + } + + fn swap_greed(&self) -> bool { + self.swap_greed.unwrap_or(false) + } + + fn unicode(&self) -> bool { + self.unicode.unwrap_or(true) + } + + fn crlf(&self) -> bool { + self.crlf.unwrap_or(false) + } +} + +fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { + let ranges: Vec<_> = ascii_class(kind) + .map(|(s, e)| hir::ClassBytesRange::new(s, e)) + .collect(); + hir::ClassBytes::new(ranges) +} + +fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator<Item = (u8, u8)> { + use crate::ast::ClassAsciiKind::*; + + let slice: &'static [(u8, u8)] = match *kind { + Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')], + Alpha => &[(b'A', b'Z'), (b'a', b'z')], + Ascii => &[(b'\x00', b'\x7F')], + Blank => &[(b'\t', b'\t'), (b' ', b' ')], + Cntrl => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')], + Digit => &[(b'0', b'9')], + Graph => &[(b'!', b'~')], + Lower => &[(b'a', b'z')], + Print => &[(b' ', b'~')], + Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')], + Space => &[ + (b'\t', b'\t'), + (b'\n', b'\n'), + (b'\x0B', b'\x0B'), + (b'\x0C', b'\x0C'), + (b'\r', b'\r'), + (b' ', b' '), + ], + Upper => &[(b'A', b'Z')], + Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')], + Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')], + }; + slice.iter().copied() +} + +fn ascii_class_as_chars( + kind: &ast::ClassAsciiKind, +) -> impl Iterator<Item = (char, char)> { + ascii_class(kind).map(|(s, e)| (char::from(s), char::from(e))) +} + +#[cfg(test)] +mod tests { + use crate::{ + ast::{self, parse::ParserBuilder, Ast, Position, Span}, + hir::{self, Hir, HirKind, Look, Properties}, + unicode::{self, ClassQuery}, + }; + + use super::*; + + // We create these errors to compare with real hir::Errors in the tests. + // We define equality between TestError and hir::Error to disregard the + // pattern string in hir::Error, which is annoying to provide in tests. + #[derive(Clone, Debug)] + struct TestError { + span: Span, + kind: hir::ErrorKind, + } + + impl PartialEq<hir::Error> for TestError { + fn eq(&self, other: &hir::Error) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + impl PartialEq<TestError> for hir::Error { + fn eq(&self, other: &TestError) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + fn parse(pattern: &str) -> Ast { + ParserBuilder::new().octal(true).build().parse(pattern).unwrap() + } + + fn t(pattern: &str) -> Hir { + TranslatorBuilder::new() + .utf8(true) + .build() + .translate(pattern, &parse(pattern)) + .unwrap() + } + + fn t_err(pattern: &str) -> hir::Error { + TranslatorBuilder::new() + .utf8(true) + .build() + .translate(pattern, &parse(pattern)) + .unwrap_err() + } + + fn t_bytes(pattern: &str) -> Hir { + TranslatorBuilder::new() + .utf8(false) + .build() + .translate(pattern, &parse(pattern)) + .unwrap() + } + + fn props(pattern: &str) -> Properties { + t(pattern).properties().clone() + } + + fn props_bytes(pattern: &str) -> Properties { + t_bytes(pattern).properties().clone() + } + + fn hir_lit(s: &str) -> Hir { + hir_blit(s.as_bytes()) + } + + fn hir_blit(s: &[u8]) -> Hir { + Hir::literal(s) + } + + fn hir_capture(index: u32, expr: Hir) -> Hir { + Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) }) + } + + fn hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir { + Hir::capture(hir::Capture { + index, + name: Some(name.into()), + sub: Box::new(expr), + }) + } + + fn hir_quest(greedy: bool, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + min: 0, + max: Some(1), + greedy, + sub: Box::new(expr), + }) + } + + fn hir_star(greedy: bool, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + min: 0, + max: None, + greedy, + sub: Box::new(expr), + }) + } + + fn hir_plus(greedy: bool, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy, + sub: Box::new(expr), + }) + } + + fn hir_range(greedy: bool, min: u32, max: Option<u32>, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + min, + max, + greedy, + sub: Box::new(expr), + }) + } + + fn hir_alt(alts: Vec<Hir>) -> Hir { + Hir::alternation(alts) + } + + fn hir_cat(exprs: Vec<Hir>) -> Hir { + Hir::concat(exprs) + } + + #[allow(dead_code)] + fn hir_uclass_query(query: ClassQuery<'_>) -> Hir { + Hir::class(hir::Class::Unicode(unicode::class(query).unwrap())) + } + + #[allow(dead_code)] + fn hir_uclass_perl_word() -> Hir { + Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap())) + } + + fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir { + Hir::class(hir::Class::Unicode(hir::ClassUnicode::new( + ascii_class_as_chars(kind) + .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), + ))) + } + + fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir { + Hir::class(hir::Class::Bytes(hir::ClassBytes::new( + ascii_class(kind).map(|(s, e)| hir::ClassBytesRange::new(s, e)), + ))) + } + + fn hir_uclass(ranges: &[(char, char)]) -> Hir { + Hir::class(uclass(ranges)) + } + + fn hir_bclass(ranges: &[(u8, u8)]) -> Hir { + Hir::class(bclass(ranges)) + } + + fn hir_case_fold(expr: Hir) -> Hir { + match expr.into_kind() { + HirKind::Class(mut cls) => { + cls.case_fold_simple(); + Hir::class(cls) + } + _ => panic!("cannot case fold non-class Hir expr"), + } + } + + fn hir_negate(expr: Hir) -> Hir { + match expr.into_kind() { + HirKind::Class(mut cls) => { + cls.negate(); + Hir::class(cls) + } + _ => panic!("cannot negate non-class Hir expr"), + } + } + + fn uclass(ranges: &[(char, char)]) -> hir::Class { + let ranges: Vec<hir::ClassUnicodeRange> = ranges + .iter() + .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) + .collect(); + hir::Class::Unicode(hir::ClassUnicode::new(ranges)) + } + + fn bclass(ranges: &[(u8, u8)]) -> hir::Class { + let ranges: Vec<hir::ClassBytesRange> = ranges + .iter() + .map(|&(s, e)| hir::ClassBytesRange::new(s, e)) + .collect(); + hir::Class::Bytes(hir::ClassBytes::new(ranges)) + } + + #[cfg(feature = "unicode-case")] + fn class_case_fold(mut cls: hir::Class) -> Hir { + cls.case_fold_simple(); + Hir::class(cls) + } + + fn class_negate(mut cls: hir::Class) -> Hir { + cls.negate(); + Hir::class(cls) + } + + #[allow(dead_code)] + fn hir_union(expr1: Hir, expr2: Hir) -> Hir { + use crate::hir::Class::{Bytes, Unicode}; + + match (expr1.into_kind(), expr2.into_kind()) { + (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { + c1.union(&c2); + Hir::class(hir::Class::Unicode(c1)) + } + (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { + c1.union(&c2); + Hir::class(hir::Class::Bytes(c1)) + } + _ => panic!("cannot union non-class Hir exprs"), + } + } + + #[allow(dead_code)] + fn hir_difference(expr1: Hir, expr2: Hir) -> Hir { + use crate::hir::Class::{Bytes, Unicode}; + + match (expr1.into_kind(), expr2.into_kind()) { + (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { + c1.difference(&c2); + Hir::class(hir::Class::Unicode(c1)) + } + (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { + c1.difference(&c2); + Hir::class(hir::Class::Bytes(c1)) + } + _ => panic!("cannot difference non-class Hir exprs"), + } + } + + fn hir_look(look: hir::Look) -> Hir { + Hir::look(look) + } + + #[test] + fn empty() { + assert_eq!(t(""), Hir::empty()); + assert_eq!(t("(?i)"), Hir::empty()); + assert_eq!(t("()"), hir_capture(1, Hir::empty())); + assert_eq!(t("(?:)"), Hir::empty()); + assert_eq!(t("(?P<wat>)"), hir_capture_name(1, "wat", Hir::empty())); + assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()])); + assert_eq!( + t("()|()"), + hir_alt(vec![ + hir_capture(1, Hir::empty()), + hir_capture(2, Hir::empty()), + ]) + ); + assert_eq!( + t("(|b)"), + hir_capture(1, hir_alt(vec![Hir::empty(), hir_lit("b"),])) + ); + assert_eq!( + t("(a|)"), + hir_capture(1, hir_alt(vec![hir_lit("a"), Hir::empty(),])) + ); + assert_eq!( + t("(a||c)"), + hir_capture( + 1, + hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),]) + ) + ); + assert_eq!( + t("(||)"), + hir_capture( + 1, + hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),]) + ) + ); + } + + #[test] + fn literal() { + assert_eq!(t("a"), hir_lit("a")); + assert_eq!(t("(?-u)a"), hir_lit("a")); + assert_eq!(t("☃"), hir_lit("☃")); + assert_eq!(t("abcd"), hir_lit("abcd")); + + assert_eq!(t_bytes("(?-u)a"), hir_lit("a")); + assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a")); + assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a")); + assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF")); + + assert_eq!(t("(?-u)☃"), hir_lit("☃")); + assert_eq!( + t_err(r"(?-u)\xFF"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(9, 1, 10) + ), + } + ); + } + + #[test] + fn literal_case_insensitive() { + #[cfg(feature = "unicode-case")] + assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),])); + #[cfg(feature = "unicode-case")] + assert_eq!(t("(?i:a)"), hir_uclass(&[('A', 'A'), ('a', 'a')])); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("a(?i)a(?-i)a"), + hir_cat(vec![ + hir_lit("a"), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_lit("a"), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)ab@c"), + hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_uclass(&[('B', 'B'), ('b', 'b')]), + hir_lit("@"), + hir_uclass(&[('C', 'C'), ('c', 'c')]), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)β"), + hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),]) + ); + + assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?-u)a(?i)a(?-i)a"), + hir_cat(vec![ + hir_lit("a"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), + hir_lit("a"), + ]) + ); + assert_eq!( + t("(?i-u)ab@c"), + hir_cat(vec![ + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), + hir_bclass(&[(b'B', b'B'), (b'b', b'b')]), + hir_lit("@"), + hir_bclass(&[(b'C', b'C'), (b'c', b'c')]), + ]) + ); + + assert_eq!( + t_bytes("(?i-u)a"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) + ); + assert_eq!( + t_bytes("(?i-u)\x61"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) + ); + assert_eq!( + t_bytes(r"(?i-u)\x61"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) + ); + assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF")); + + assert_eq!(t("(?i-u)β"), hir_lit("β"),); + } + + #[test] + fn dot() { + assert_eq!( + t("."), + hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')]) + ); + assert_eq!( + t("(?R)."), + hir_uclass(&[ + ('\0', '\t'), + ('\x0B', '\x0C'), + ('\x0E', '\u{10FFFF}'), + ]) + ); + assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}')])); + assert_eq!(t("(?Rs)."), hir_uclass(&[('\0', '\u{10FFFF}')])); + assert_eq!( + t_bytes("(?-u)."), + hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')]) + ); + assert_eq!( + t_bytes("(?R-u)."), + hir_bclass(&[ + (b'\0', b'\t'), + (b'\x0B', b'\x0C'), + (b'\x0E', b'\xFF'), + ]) + ); + assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); + assert_eq!(t_bytes("(?Rs-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); + + // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed. + assert_eq!( + t_err("(?-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(6, 1, 7) + ), + } + ); + assert_eq!( + t_err("(?R-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(7, 1, 8) + ), + } + ); + assert_eq!( + t_err("(?s-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(7, 1, 8) + ), + } + ); + assert_eq!( + t_err("(?Rs-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(7, 1, 8), + Position::new(8, 1, 9) + ), + } + ); + } + + #[test] + fn assertions() { + assert_eq!(t("^"), hir_look(hir::Look::Start)); + assert_eq!(t("$"), hir_look(hir::Look::End)); + assert_eq!(t(r"\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF)); + assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF)); + assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End)); + + assert_eq!(t(r"\b"), hir_look(hir::Look::WordUnicode)); + assert_eq!(t(r"\B"), hir_look(hir::Look::WordUnicodeNegate)); + assert_eq!(t(r"(?-u)\b"), hir_look(hir::Look::WordAscii)); + assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate)); + } + + #[test] + fn group() { + assert_eq!(t("(a)"), hir_capture(1, hir_lit("a"))); + assert_eq!( + t("(a)(b)"), + hir_cat(vec![ + hir_capture(1, hir_lit("a")), + hir_capture(2, hir_lit("b")), + ]) + ); + assert_eq!( + t("(a)|(b)"), + hir_alt(vec![ + hir_capture(1, hir_lit("a")), + hir_capture(2, hir_lit("b")), + ]) + ); + assert_eq!(t("(?P<foo>)"), hir_capture_name(1, "foo", Hir::empty())); + assert_eq!(t("(?P<foo>a)"), hir_capture_name(1, "foo", hir_lit("a"))); + assert_eq!( + t("(?P<foo>a)(?P<bar>b)"), + hir_cat(vec![ + hir_capture_name(1, "foo", hir_lit("a")), + hir_capture_name(2, "bar", hir_lit("b")), + ]) + ); + assert_eq!(t("(?:)"), Hir::empty()); + assert_eq!(t("(?:a)"), hir_lit("a")); + assert_eq!( + t("(?:a)(b)"), + hir_cat(vec![hir_lit("a"), hir_capture(1, hir_lit("b")),]) + ); + assert_eq!( + t("(a)(?:b)(c)"), + hir_cat(vec![ + hir_capture(1, hir_lit("a")), + hir_lit("b"), + hir_capture(2, hir_lit("c")), + ]) + ); + assert_eq!( + t("(a)(?P<foo>b)(c)"), + hir_cat(vec![ + hir_capture(1, hir_lit("a")), + hir_capture_name(2, "foo", hir_lit("b")), + hir_capture(3, hir_lit("c")), + ]) + ); + assert_eq!(t("()"), hir_capture(1, Hir::empty())); + assert_eq!(t("((?i))"), hir_capture(1, Hir::empty())); + assert_eq!(t("((?x))"), hir_capture(1, Hir::empty())); + assert_eq!( + t("(((?x)))"), + hir_capture(1, hir_capture(2, Hir::empty())) + ); + } + + #[test] + fn line_anchors() { + assert_eq!(t("^"), hir_look(hir::Look::Start)); + assert_eq!(t("$"), hir_look(hir::Look::End)); + assert_eq!(t(r"\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"\z"), hir_look(hir::Look::End)); + + assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF)); + assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF)); + + assert_eq!(t(r"(?R)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?R)\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?R)^"), hir_look(hir::Look::Start)); + assert_eq!(t("(?R)$"), hir_look(hir::Look::End)); + + assert_eq!(t(r"(?Rm)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?Rm)\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?Rm)^"), hir_look(hir::Look::StartCRLF)); + assert_eq!(t("(?Rm)$"), hir_look(hir::Look::EndCRLF)); + } + + #[test] + fn flags() { + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i:a)a"), + hir_cat( + vec![hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"),] + ) + ); + assert_eq!( + t("(?i-u:a)β"), + hir_cat(vec![ + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), + hir_lit("β"), + ]) + ); + assert_eq!( + t("(?:(?i-u)a)b"), + hir_cat(vec![ + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), + hir_lit("b"), + ]) + ); + assert_eq!( + t("((?i-u)a)b"), + hir_cat(vec![ + hir_capture(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_lit("b"), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)(?-i:a)a"), + hir_cat( + vec![hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]),] + ) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?im)a^"), + hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_look(hir::Look::StartLF), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?im)a^(?i-m)a^"), + hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_look(hir::Look::StartLF), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_look(hir::Look::Start), + ]) + ); + assert_eq!( + t("(?U)a*a*?(?-U)a*a*?"), + hir_cat(vec![ + hir_star(false, hir_lit("a")), + hir_star(true, hir_lit("a")), + hir_star(true, hir_lit("a")), + hir_star(false, hir_lit("a")), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?:a(?i)a)a"), + hir_cat(vec![ + hir_cat(vec![ + hir_lit("a"), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + ]), + hir_lit("a"), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)(?:a(?-i)a)a"), + hir_cat(vec![ + hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_lit("a"), + ]), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + ]) + ); + } + + #[test] + fn escape() { + assert_eq!( + t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"), + hir_lit(r"\.+*?()|[]{}^$#") + ); + } + + #[test] + fn repetition() { + assert_eq!(t("a?"), hir_quest(true, hir_lit("a"))); + assert_eq!(t("a*"), hir_star(true, hir_lit("a"))); + assert_eq!(t("a+"), hir_plus(true, hir_lit("a"))); + assert_eq!(t("a??"), hir_quest(false, hir_lit("a"))); + assert_eq!(t("a*?"), hir_star(false, hir_lit("a"))); + assert_eq!(t("a+?"), hir_plus(false, hir_lit("a"))); + + assert_eq!(t("a{1}"), hir_range(true, 1, Some(1), hir_lit("a"),)); + assert_eq!(t("a{1,}"), hir_range(true, 1, None, hir_lit("a"),)); + assert_eq!(t("a{1,2}"), hir_range(true, 1, Some(2), hir_lit("a"),)); + assert_eq!(t("a{1}?"), hir_range(false, 1, Some(1), hir_lit("a"),)); + assert_eq!(t("a{1,}?"), hir_range(false, 1, None, hir_lit("a"),)); + assert_eq!(t("a{1,2}?"), hir_range(false, 1, Some(2), hir_lit("a"),)); + + assert_eq!( + t("ab?"), + hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) + ); + assert_eq!(t("(ab)?"), hir_quest(true, hir_capture(1, hir_lit("ab")))); + assert_eq!( + t("a|b?"), + hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) + ); + } + + #[test] + fn cat_alt() { + let a = || hir_look(hir::Look::Start); + let b = || hir_look(hir::Look::End); + let c = || hir_look(hir::Look::WordUnicode); + let d = || hir_look(hir::Look::WordUnicodeNegate); + + assert_eq!(t("(^$)"), hir_capture(1, hir_cat(vec![a(), b()]))); + assert_eq!(t("^|$"), hir_alt(vec![a(), b()])); + assert_eq!(t(r"^|$|\b"), hir_alt(vec![a(), b(), c()])); + assert_eq!( + t(r"^$|$\b|\b\B"), + hir_alt(vec![ + hir_cat(vec![a(), b()]), + hir_cat(vec![b(), c()]), + hir_cat(vec![c(), d()]), + ]) + ); + assert_eq!(t("(^|$)"), hir_capture(1, hir_alt(vec![a(), b()]))); + assert_eq!( + t(r"(^|$|\b)"), + hir_capture(1, hir_alt(vec![a(), b(), c()])) + ); + assert_eq!( + t(r"(^$|$\b|\b\B)"), + hir_capture( + 1, + hir_alt(vec![ + hir_cat(vec![a(), b()]), + hir_cat(vec![b(), c()]), + hir_cat(vec![c(), d()]), + ]) + ) + ); + assert_eq!( + t(r"(^$|($\b|(\b\B)))"), + hir_capture( + 1, + hir_alt(vec![ + hir_cat(vec![a(), b()]), + hir_capture( + 2, + hir_alt(vec![ + hir_cat(vec![b(), c()]), + hir_capture(3, hir_cat(vec![c(), d()])), + ]) + ), + ]) + ) + ); + } + + // Tests the HIR transformation of things like '[a-z]|[A-Z]' into + // '[A-Za-z]'. In other words, an alternation of just classes is always + // equivalent to a single class corresponding to the union of the branches + // in that class. (Unless some branches match invalid UTF-8 and others + // match non-ASCII Unicode.) + #[test] + fn cat_class_flattened() { + assert_eq!(t(r"[a-z]|[A-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')])); + // Combining all of the letter properties should give us the one giant + // letter property. + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"(?x) + \p{Lowercase_Letter} + |\p{Uppercase_Letter} + |\p{Titlecase_Letter} + |\p{Modifier_Letter} + |\p{Other_Letter} + "), + hir_uclass_query(ClassQuery::Binary("letter")) + ); + // Byte classes that can truly match invalid UTF-8 cannot be combined + // with Unicode classes. + assert_eq!( + t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]"), + hir_alt(vec![ + hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]), + hir_bclass(&[(b'\x90', b'\xFF')]), + hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]), + ]) + ); + // Byte classes on their own can be combined, even if some are ASCII + // and others are invalid UTF-8. + assert_eq!( + t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]"), + hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]), + ); + } + + #[test] + fn class_ascii() { + assert_eq!( + t("[[:alnum:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Alnum) + ); + assert_eq!( + t("[[:alpha:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Alpha) + ); + assert_eq!( + t("[[:ascii:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Ascii) + ); + assert_eq!( + t("[[:blank:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Blank) + ); + assert_eq!( + t("[[:cntrl:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Cntrl) + ); + assert_eq!( + t("[[:digit:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Digit) + ); + assert_eq!( + t("[[:graph:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Graph) + ); + assert_eq!( + t("[[:lower:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Lower) + ); + assert_eq!( + t("[[:print:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Print) + ); + assert_eq!( + t("[[:punct:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Punct) + ); + assert_eq!( + t("[[:space:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Space) + ); + assert_eq!( + t("[[:upper:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Upper) + ); + assert_eq!( + t("[[:word:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Word) + ); + assert_eq!( + t("[[:xdigit:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Xdigit) + ); + + assert_eq!( + t("[[:^lower:]]"), + hir_negate(hir_ascii_uclass(&ast::ClassAsciiKind::Lower)) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[[:lower:]]"), + hir_uclass(&[ + ('A', 'Z'), + ('a', 'z'), + ('\u{17F}', '\u{17F}'), + ('\u{212A}', '\u{212A}'), + ]) + ); + + assert_eq!( + t("(?-u)[[:lower:]]"), + hir_ascii_bclass(&ast::ClassAsciiKind::Lower) + ); + assert_eq!( + t("(?i-u)[[:lower:]]"), + hir_case_fold(hir_ascii_bclass(&ast::ClassAsciiKind::Lower)) + ); + + assert_eq!( + t_err("(?-u)[[:^lower:]]"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(16, 1, 17) + ), + } + ); + assert_eq!( + t_err("(?i-u)[[:^lower:]]"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(7, 1, 8), + Position::new(17, 1, 18) + ), + } + ); + } + + #[test] + fn class_ascii_multiple() { + // See: https://github.com/rust-lang/regex/issues/680 + assert_eq!( + t("[[:alnum:][:^ascii:]]"), + hir_union( + hir_ascii_uclass(&ast::ClassAsciiKind::Alnum), + hir_uclass(&[('\u{80}', '\u{10FFFF}')]), + ), + ); + assert_eq!( + t_bytes("(?-u)[[:alnum:][:^ascii:]]"), + hir_union( + hir_ascii_bclass(&ast::ClassAsciiKind::Alnum), + hir_bclass(&[(0x80, 0xFF)]), + ), + ); + } + + #[test] + #[cfg(feature = "unicode-perl")] + fn class_perl_unicode() { + // Unicode + assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit"))); + assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space"))); + assert_eq!(t(r"\w"), hir_uclass_perl_word()); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\d"), + hir_uclass_query(ClassQuery::Binary("digit")) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\s"), + hir_uclass_query(ClassQuery::Binary("space")) + ); + #[cfg(feature = "unicode-case")] + assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word()); + + // Unicode, negated + assert_eq!( + t(r"\D"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + assert_eq!( + t(r"\S"), + hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) + ); + assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word())); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\D"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\S"), + hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word())); + } + + #[test] + fn class_perl_ascii() { + // ASCII only + assert_eq!( + t(r"(?-u)\d"), + hir_ascii_bclass(&ast::ClassAsciiKind::Digit) + ); + assert_eq!( + t(r"(?-u)\s"), + hir_ascii_bclass(&ast::ClassAsciiKind::Space) + ); + assert_eq!( + t(r"(?-u)\w"), + hir_ascii_bclass(&ast::ClassAsciiKind::Word) + ); + assert_eq!( + t(r"(?i-u)\d"), + hir_ascii_bclass(&ast::ClassAsciiKind::Digit) + ); + assert_eq!( + t(r"(?i-u)\s"), + hir_ascii_bclass(&ast::ClassAsciiKind::Space) + ); + assert_eq!( + t(r"(?i-u)\w"), + hir_ascii_bclass(&ast::ClassAsciiKind::Word) + ); + + // ASCII only, negated + assert_eq!( + t_bytes(r"(?-u)\D"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) + ); + assert_eq!( + t_bytes(r"(?-u)\S"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) + ); + assert_eq!( + t_bytes(r"(?-u)\W"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) + ); + assert_eq!( + t_bytes(r"(?i-u)\D"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) + ); + assert_eq!( + t_bytes(r"(?i-u)\S"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) + ); + assert_eq!( + t_bytes(r"(?i-u)\W"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) + ); + + // ASCII only, negated, with UTF-8 mode enabled. + // In this case, negating any Perl class results in an error because + // all such classes can match invalid UTF-8. + assert_eq!( + t_err(r"(?-u)\D"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?-u)\S"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?-u)\W"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\D"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\S"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\W"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); + } + + #[test] + #[cfg(not(feature = "unicode-perl"))] + fn class_perl_word_disabled() { + assert_eq!( + t_err(r"\w"), + TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))] + fn class_perl_space_disabled() { + assert_eq!( + t_err(r"\s"), + TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(all( + not(feature = "unicode-perl"), + not(feature = "unicode-gencat") + ))] + fn class_perl_digit_disabled() { + assert_eq!( + t_err(r"\d"), + TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-gencat")] + fn class_unicode_gencat() { + assert_eq!(t(r"\pZ"), hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!(t(r"\pz"), hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!( + t(r"\p{Separator}"), + hir_uclass_query(ClassQuery::Binary("Z")) + ); + assert_eq!( + t(r"\p{se PaRa ToR}"), + hir_uclass_query(ClassQuery::Binary("Z")) + ); + assert_eq!( + t(r"\p{gc:Separator}"), + hir_uclass_query(ClassQuery::Binary("Z")) + ); + assert_eq!( + t(r"\p{gc=Separator}"), + hir_uclass_query(ClassQuery::Binary("Z")) + ); + assert_eq!( + t(r"\p{Other}"), + hir_uclass_query(ClassQuery::Binary("Other")) + ); + assert_eq!(t(r"\pC"), hir_uclass_query(ClassQuery::Binary("Other"))); + + assert_eq!( + t(r"\PZ"), + hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) + ); + assert_eq!( + t(r"\P{separator}"), + hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) + ); + assert_eq!( + t(r"\P{gc!=separator}"), + hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) + ); + + assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any"))); + assert_eq!( + t(r"\p{assigned}"), + hir_uclass_query(ClassQuery::Binary("Assigned")) + ); + assert_eq!( + t(r"\p{ascii}"), + hir_uclass_query(ClassQuery::Binary("ASCII")) + ); + assert_eq!( + t(r"\p{gc:any}"), + hir_uclass_query(ClassQuery::Binary("Any")) + ); + assert_eq!( + t(r"\p{gc:assigned}"), + hir_uclass_query(ClassQuery::Binary("Assigned")) + ); + assert_eq!( + t(r"\p{gc:ascii}"), + hir_uclass_query(ClassQuery::Binary("ASCII")) + ); + + assert_eq!( + t_err(r"(?-u)\pZ"), + TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new( + Position::new(5, 1, 6), + Position::new(8, 1, 9) + ), + } + ); + assert_eq!( + t_err(r"(?-u)\p{Separator}"), + TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new( + Position::new(5, 1, 6), + Position::new(18, 1, 19) + ), + } + ); + assert_eq!( + t_err(r"\pE"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(3, 1, 4) + ), + } + ); + assert_eq!( + t_err(r"\p{Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(7, 1, 8) + ), + } + ); + assert_eq!( + t_err(r"\p{gc:Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(10, 1, 11) + ), + } + ); + } + + #[test] + #[cfg(not(feature = "unicode-gencat"))] + fn class_unicode_gencat_disabled() { + assert_eq!( + t_err(r"\p{Separator}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(13, 1, 14) + ), + } + ); + + assert_eq!( + t_err(r"\p{Any}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(7, 1, 8) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-script")] + fn class_unicode_script() { + assert_eq!( + t(r"\p{Greek}"), + hir_uclass_query(ClassQuery::Binary("Greek")) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\p{Greek}"), + hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek"))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\P{Greek}"), + hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( + "Greek" + )))) + ); + + assert_eq!( + t_err(r"\p{sc:Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(10, 1, 11) + ), + } + ); + assert_eq!( + t_err(r"\p{scx:Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(11, 1, 12) + ), + } + ); + } + + #[test] + #[cfg(not(feature = "unicode-script"))] + fn class_unicode_script_disabled() { + assert_eq!( + t_err(r"\p{Greek}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(9, 1, 10) + ), + } + ); + + assert_eq!( + t_err(r"\p{scx:Greek}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(13, 1, 14) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-age")] + fn class_unicode_age() { + assert_eq!( + t_err(r"\p{age:Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(11, 1, 12) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-gencat")] + fn class_unicode_any_empty() { + assert_eq!(t(r"\P{any}"), hir_uclass(&[]),); + } + + #[test] + #[cfg(not(feature = "unicode-age"))] + fn class_unicode_age_disabled() { + assert_eq!( + t_err(r"\p{age:3.0}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(11, 1, 12) + ), + } + ); + } + + #[test] + fn class_bracketed() { + assert_eq!(t("[a]"), hir_lit("a")); + assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')])); + assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')]))); + assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')])); + assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')])); + assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')])); + assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')])); + assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')])); + assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')])); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] + assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit"))); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[\pZ]"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[\p{separator}]"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] + assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit"))); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[^\PZ]"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[^\P{separator}]"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + #[cfg(all( + feature = "unicode-case", + any(feature = "unicode-perl", feature = "unicode-gencat") + ))] + assert_eq!( + t(r"(?i)[^\D]"), + hir_uclass_query(ClassQuery::Binary("digit")) + ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] + assert_eq!( + t(r"(?i)[^\P{greek}]"), + hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek"))) + ); + + assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')])); + assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')])); + assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')])); + + #[cfg(feature = "unicode-case")] + assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')])); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[k]"), + hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[β]"), + hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),]) + ); + assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),])); + + assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')]))); + assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')]))); + assert_eq!( + t_bytes("(?-u)[^a]"), + class_negate(bclass(&[(b'a', b'a')])) + ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] + assert_eq!( + t(r"[^\d]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[^\pZ]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))) + ); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[^\p{separator}]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))) + ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] + assert_eq!( + t(r"(?i)[^\p{greek}]"), + hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( + "greek" + )))) + ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] + assert_eq!( + t(r"(?i)[\P{greek}]"), + hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( + "greek" + )))) + ); + + // Test some weird cases. + assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')])); + + assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')])); + assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')])); + + assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')])); + assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')])); + + assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')])); + assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')])); + + assert_eq!( + t_err("(?-u)[^a]"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(9, 1, 10) + ), + } + ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] + assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),); + #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] + assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),); + } + + #[test] + fn class_bracketed_union() { + assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')])); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[a\pZb]"), + hir_union( + hir_uclass(&[('a', 'b')]), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ); + #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))] + assert_eq!( + t(r"[\pZ\p{Greek}]"), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"[\p{age:3.0}\pZ\p{Greek}]"), + hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ) + ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"), + hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("cyrillic")), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ) + ) + ); + + #[cfg(all( + feature = "unicode-age", + feature = "unicode-case", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"), + hir_case_fold(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + )) + ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"[^\p{age:3.0}\pZ\p{Greek}]"), + hir_negate(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + )) + ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-case", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"), + hir_negate(hir_case_fold(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ))) + ); + } + + #[test] + fn class_bracketed_nested() { + assert_eq!(t(r"[a[^c]]"), class_negate(uclass(&[('c', 'c')]))); + assert_eq!(t(r"[a-b[^c]]"), class_negate(uclass(&[('c', 'c')]))); + assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[]))); + + assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')])); + assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')])); + + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)[a[^c]]"), + hir_negate(class_case_fold(uclass(&[('c', 'c')]))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)[a-b[^c]]"), + hir_negate(class_case_fold(uclass(&[('c', 'c')]))) + ); + + #[cfg(feature = "unicode-case")] + assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')])); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)[^a-b[^c]]"), + hir_uclass(&[('C', 'C'), ('c', 'c')]) + ); + + assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),); + #[cfg(feature = "unicode-case")] + assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),); + } + + #[test] + fn class_bracketed_intersect() { + assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')])); + assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')])); + assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')])); + assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')])); + assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')])); + assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')]))); + + assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')])); + assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')])); + assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')])); + + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[abc&&b-c]"), + hir_case_fold(hir_uclass(&[('b', 'c')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[abc&&[b-c]]"), + hir_case_fold(hir_uclass(&[('b', 'c')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[[abc]&&[b-c]]"), + hir_case_fold(hir_uclass(&[('b', 'c')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[a-z&&b-y&&c-x]"), + hir_case_fold(hir_uclass(&[('c', 'x')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[c-da-b&&a-d]"), + hir_case_fold(hir_uclass(&[('a', 'd')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[a-d&&c-da-b]"), + hir_case_fold(hir_uclass(&[('a', 'd')])) + ); + + assert_eq!( + t("(?i-u)[abc&&b-c]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')])) + ); + assert_eq!( + t("(?i-u)[abc&&[b-c]]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')])) + ); + assert_eq!( + t("(?i-u)[[abc]&&[b-c]]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')])) + ); + assert_eq!( + t("(?i-u)[a-z&&b-y&&c-x]"), + hir_case_fold(hir_bclass(&[(b'c', b'x')])) + ); + assert_eq!( + t("(?i-u)[c-da-b&&a-d]"), + hir_case_fold(hir_bclass(&[(b'a', b'd')])) + ); + assert_eq!( + t("(?i-u)[a-d&&c-da-b]"), + hir_case_fold(hir_bclass(&[(b'a', b'd')])) + ); + + // In `[a^]`, `^` does not need to be escaped, so it makes sense that + // `^` is also allowed to be unescaped after `&&`. + assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')])); + // `]` needs to be escaped after `&&` since it's not at start of class. + assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')])); + assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')])); + // Test precedence. + assert_eq!( + t(r"[a-w&&[^c-g]z]"), + hir_uclass(&[('a', 'b'), ('h', 'w')]) + ); + } + + #[test] + fn class_bracketed_intersect_negate() { + #[cfg(feature = "unicode-perl")] + assert_eq!( + t(r"[^\w&&\d]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')]))); + #[cfg(feature = "unicode-perl")] + assert_eq!( + t(r"[^[\w&&\d]]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + #[cfg(feature = "unicode-perl")] + assert_eq!( + t(r"[^[^\w&&\d]]"), + hir_uclass_query(ClassQuery::Binary("digit")) + ); + #[cfg(feature = "unicode-perl")] + assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word())); + + #[cfg(feature = "unicode-perl")] + assert_eq!( + t_bytes(r"(?-u)[^\w&&\d]"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) + ); + assert_eq!( + t_bytes(r"(?-u)[^[a-z&&a-c]]"), + hir_negate(hir_bclass(&[(b'a', b'c')])) + ); + assert_eq!( + t_bytes(r"(?-u)[^[\w&&\d]]"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) + ); + assert_eq!( + t_bytes(r"(?-u)[^[^\w&&\d]]"), + hir_ascii_bclass(&ast::ClassAsciiKind::Digit) + ); + assert_eq!( + t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) + ); + } + + #[test] + fn class_bracketed_difference() { + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[\pL--[:ascii:]]"), + hir_difference( + hir_uclass_query(ClassQuery::Binary("letter")), + hir_uclass(&[('\0', '\x7F')]) + ) + ); + + assert_eq!( + t(r"(?-u)[[:alpha:]--[:lower:]]"), + hir_bclass(&[(b'A', b'Z')]) + ); + } + + #[test] + fn class_bracketed_symmetric_difference() { + #[cfg(feature = "unicode-script")] + assert_eq!( + t(r"[\p{sc:Greek}~~\p{scx:Greek}]"), + hir_uclass(&[ + ('\u{0342}', '\u{0342}'), + ('\u{0345}', '\u{0345}'), + ('\u{1DC0}', '\u{1DC1}'), + ]) + ); + assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')])); + + assert_eq!( + t(r"(?-u)[a-g~~c-j]"), + hir_bclass(&[(b'a', b'b'), (b'h', b'j')]) + ); + } + + #[test] + fn ignore_whitespace() { + assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3")); + assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S")); + assert_eq!( + t(r"(?x)\x # comment +{ # comment + 53 # comment +} #comment"), + hir_lit("S") + ); + + assert_eq!(t(r"(?x)\x 53"), hir_lit("S")); + assert_eq!( + t(r"(?x)\x # comment + 53 # comment"), + hir_lit("S") + ); + assert_eq!(t(r"(?x)\x5 3"), hir_lit("S")); + + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"(?x)\p # comment +{ # comment + Separator # comment +} # comment"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + + assert_eq!( + t(r"(?x)a # comment +{ # comment + 5 # comment + , # comment + 10 # comment +} # comment"), + hir_range(true, 5, Some(10), hir_lit("a")) + ); + + assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a ")); + } + + #[test] + fn analysis_is_utf8() { + // Positive examples. + assert!(props_bytes(r"a").is_utf8()); + assert!(props_bytes(r"ab").is_utf8()); + assert!(props_bytes(r"(?-u)a").is_utf8()); + assert!(props_bytes(r"(?-u)ab").is_utf8()); + assert!(props_bytes(r"\xFF").is_utf8()); + assert!(props_bytes(r"\xFF\xFF").is_utf8()); + assert!(props_bytes(r"[^a]").is_utf8()); + assert!(props_bytes(r"[^a][^a]").is_utf8()); + assert!(props_bytes(r"\b").is_utf8()); + assert!(props_bytes(r"\B").is_utf8()); + assert!(props_bytes(r"(?-u)\b").is_utf8()); + assert!(props_bytes(r"(?-u)\B").is_utf8()); + + // Negative examples. + assert!(!props_bytes(r"(?-u)\xFF").is_utf8()); + assert!(!props_bytes(r"(?-u)\xFF\xFF").is_utf8()); + assert!(!props_bytes(r"(?-u)[^a]").is_utf8()); + assert!(!props_bytes(r"(?-u)[^a][^a]").is_utf8()); + } + + #[test] + fn analysis_captures_len() { + assert_eq!(0, props(r"a").explicit_captures_len()); + assert_eq!(0, props(r"(?:a)").explicit_captures_len()); + assert_eq!(0, props(r"(?i-u:a)").explicit_captures_len()); + assert_eq!(0, props(r"(?i-u)a").explicit_captures_len()); + assert_eq!(1, props(r"(a)").explicit_captures_len()); + assert_eq!(1, props(r"(?P<foo>a)").explicit_captures_len()); + assert_eq!(1, props(r"()").explicit_captures_len()); + assert_eq!(1, props(r"()a").explicit_captures_len()); + assert_eq!(1, props(r"(a)+").explicit_captures_len()); + assert_eq!(2, props(r"(a)(b)").explicit_captures_len()); + assert_eq!(2, props(r"(a)|(b)").explicit_captures_len()); + assert_eq!(2, props(r"((a))").explicit_captures_len()); + assert_eq!(1, props(r"([a&&b])").explicit_captures_len()); + } + + #[test] + fn analysis_static_captures_len() { + let len = |pattern| props(pattern).static_explicit_captures_len(); + assert_eq!(Some(0), len(r"")); + assert_eq!(Some(0), len(r"foo|bar")); + assert_eq!(None, len(r"(foo)|bar")); + assert_eq!(None, len(r"foo|(bar)")); + assert_eq!(Some(1), len(r"(foo|bar)")); + assert_eq!(Some(1), len(r"(a|b|c|d|e|f)")); + assert_eq!(Some(1), len(r"(a)|(b)|(c)|(d)|(e)|(f)")); + assert_eq!(Some(2), len(r"(a)(b)|(c)(d)|(e)(f)")); + assert_eq!(Some(6), len(r"(a)(b)(c)(d)(e)(f)")); + assert_eq!(Some(3), len(r"(a)(b)(extra)|(a)(b)()")); + assert_eq!(Some(3), len(r"(a)(b)((?:extra)?)")); + assert_eq!(None, len(r"(a)(b)(extra)?")); + assert_eq!(Some(1), len(r"(foo)|(bar)")); + assert_eq!(Some(2), len(r"(foo)(bar)")); + assert_eq!(Some(2), len(r"(foo)+(bar)")); + assert_eq!(None, len(r"(foo)*(bar)")); + assert_eq!(Some(0), len(r"(foo)?{0}")); + assert_eq!(None, len(r"(foo)?{1}")); + assert_eq!(Some(1), len(r"(foo){1}")); + assert_eq!(Some(1), len(r"(foo){1,}")); + assert_eq!(Some(1), len(r"(foo){1,}?")); + assert_eq!(None, len(r"(foo){1,}??")); + assert_eq!(None, len(r"(foo){0,}")); + assert_eq!(Some(1), len(r"(foo)(?:bar)")); + assert_eq!(Some(2), len(r"(foo(?:bar)+)(?:baz(boo))")); + assert_eq!(Some(2), len(r"(?P<bar>foo)(?:bar)(bal|loon)")); + assert_eq!( + Some(2), + len(r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#) + ); + } + + #[test] + fn analysis_is_all_assertions() { + // Positive examples. + let p = props(r"\b"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"\B"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"^"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"$"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"\A"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"\z"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"$^\z\A\b\B"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"$|^|\z|\A|\b|\B"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"^$|$^"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"((\b)+())*^"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + // Negative examples. + let p = props(r"^a"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(1)); + } + + #[test] + fn analysis_look_set_prefix_any() { + let p = props(r"(?-u)(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))"); + assert!(p.look_set_prefix_any().contains(Look::WordAscii)); + } + + #[test] + fn analysis_is_anchored() { + let is_start = |p| props(p).look_set_prefix().contains(Look::Start); + let is_end = |p| props(p).look_set_suffix().contains(Look::End); + + // Positive examples. + assert!(is_start(r"^")); + assert!(is_end(r"$")); + + assert!(is_start(r"^^")); + assert!(props(r"$$").look_set_suffix().contains(Look::End)); + + assert!(is_start(r"^$")); + assert!(is_end(r"^$")); + + assert!(is_start(r"^foo")); + assert!(is_end(r"foo$")); + + assert!(is_start(r"^foo|^bar")); + assert!(is_end(r"foo$|bar$")); + + assert!(is_start(r"^(foo|bar)")); + assert!(is_end(r"(foo|bar)$")); + + assert!(is_start(r"^+")); + assert!(is_end(r"$+")); + assert!(is_start(r"^++")); + assert!(is_end(r"$++")); + assert!(is_start(r"(^)+")); + assert!(is_end(r"($)+")); + + assert!(is_start(r"$^")); + assert!(is_start(r"$^")); + assert!(is_start(r"$^|^$")); + assert!(is_end(r"$^|^$")); + + assert!(is_start(r"\b^")); + assert!(is_end(r"$\b")); + assert!(is_start(r"^(?m:^)")); + assert!(is_end(r"(?m:$)$")); + assert!(is_start(r"(?m:^)^")); + assert!(is_end(r"$(?m:$)")); + + // Negative examples. + assert!(!is_start(r"(?m)^")); + assert!(!is_end(r"(?m)$")); + assert!(!is_start(r"(?m:^$)|$^")); + assert!(!is_end(r"(?m:^$)|$^")); + assert!(!is_start(r"$^|(?m:^$)")); + assert!(!is_end(r"$^|(?m:^$)")); + + assert!(!is_start(r"a^")); + assert!(!is_start(r"$a")); + + assert!(!is_end(r"a^")); + assert!(!is_end(r"$a")); + + assert!(!is_start(r"^foo|bar")); + assert!(!is_end(r"foo|bar$")); + + assert!(!is_start(r"^*")); + assert!(!is_end(r"$*")); + assert!(!is_start(r"^*+")); + assert!(!is_end(r"$*+")); + assert!(!is_start(r"^+*")); + assert!(!is_end(r"$+*")); + assert!(!is_start(r"(^)*")); + assert!(!is_end(r"($)*")); + } + + #[test] + fn analysis_is_any_anchored() { + let is_start = |p| props(p).look_set().contains(Look::Start); + let is_end = |p| props(p).look_set().contains(Look::End); + + // Positive examples. + assert!(is_start(r"^")); + assert!(is_end(r"$")); + assert!(is_start(r"\A")); + assert!(is_end(r"\z")); + + // Negative examples. + assert!(!is_start(r"(?m)^")); + assert!(!is_end(r"(?m)$")); + assert!(!is_start(r"$")); + assert!(!is_end(r"^")); + } + + #[test] + fn analysis_can_empty() { + // Positive examples. + let assert_empty = + |p| assert_eq!(Some(0), props_bytes(p).minimum_len()); + assert_empty(r""); + assert_empty(r"()"); + assert_empty(r"()*"); + assert_empty(r"()+"); + assert_empty(r"()?"); + assert_empty(r"a*"); + assert_empty(r"a?"); + assert_empty(r"a{0}"); + assert_empty(r"a{0,}"); + assert_empty(r"a{0,1}"); + assert_empty(r"a{0,10}"); + #[cfg(feature = "unicode-gencat")] + assert_empty(r"\pL*"); + assert_empty(r"a*|b"); + assert_empty(r"b|a*"); + assert_empty(r"a|"); + assert_empty(r"|a"); + assert_empty(r"a||b"); + assert_empty(r"a*a?(abcd)*"); + assert_empty(r"^"); + assert_empty(r"$"); + assert_empty(r"(?m)^"); + assert_empty(r"(?m)$"); + assert_empty(r"\A"); + assert_empty(r"\z"); + assert_empty(r"\B"); + assert_empty(r"(?-u)\B"); + assert_empty(r"\b"); + assert_empty(r"(?-u)\b"); + + // Negative examples. + let assert_non_empty = + |p| assert_ne!(Some(0), props_bytes(p).minimum_len()); + assert_non_empty(r"a+"); + assert_non_empty(r"a{1}"); + assert_non_empty(r"a{1,}"); + assert_non_empty(r"a{1,2}"); + assert_non_empty(r"a{1,10}"); + assert_non_empty(r"b|a"); + assert_non_empty(r"a*a+(abcd)*"); + #[cfg(feature = "unicode-gencat")] + assert_non_empty(r"\P{any}"); + assert_non_empty(r"[a--a]"); + assert_non_empty(r"[a&&b]"); + } + + #[test] + fn analysis_is_literal() { + // Positive examples. + assert!(props(r"a").is_literal()); + assert!(props(r"ab").is_literal()); + assert!(props(r"abc").is_literal()); + assert!(props(r"(?m)abc").is_literal()); + assert!(props(r"(?:a)").is_literal()); + assert!(props(r"foo(?:a)").is_literal()); + assert!(props(r"(?:a)foo").is_literal()); + assert!(props(r"[a]").is_literal()); + + // Negative examples. + assert!(!props(r"").is_literal()); + assert!(!props(r"^").is_literal()); + assert!(!props(r"a|b").is_literal()); + assert!(!props(r"(a)").is_literal()); + assert!(!props(r"a+").is_literal()); + assert!(!props(r"foo(a)").is_literal()); + assert!(!props(r"(a)foo").is_literal()); + assert!(!props(r"[ab]").is_literal()); + } + + #[test] + fn analysis_is_alternation_literal() { + // Positive examples. + assert!(props(r"a").is_alternation_literal()); + assert!(props(r"ab").is_alternation_literal()); + assert!(props(r"abc").is_alternation_literal()); + assert!(props(r"(?m)abc").is_alternation_literal()); + assert!(props(r"foo|bar").is_alternation_literal()); + assert!(props(r"foo|bar|baz").is_alternation_literal()); + assert!(props(r"[a]").is_alternation_literal()); + assert!(props(r"(?:ab)|cd").is_alternation_literal()); + assert!(props(r"ab|(?:cd)").is_alternation_literal()); + + // Negative examples. + assert!(!props(r"").is_alternation_literal()); + assert!(!props(r"^").is_alternation_literal()); + assert!(!props(r"(a)").is_alternation_literal()); + assert!(!props(r"a+").is_alternation_literal()); + assert!(!props(r"foo(a)").is_alternation_literal()); + assert!(!props(r"(a)foo").is_alternation_literal()); + assert!(!props(r"[ab]").is_alternation_literal()); + assert!(!props(r"[ab]|b").is_alternation_literal()); + assert!(!props(r"a|[ab]").is_alternation_literal()); + assert!(!props(r"(a)|b").is_alternation_literal()); + assert!(!props(r"a|(b)").is_alternation_literal()); + assert!(!props(r"a|b").is_alternation_literal()); + assert!(!props(r"a|b|c").is_alternation_literal()); + assert!(!props(r"[a]|b").is_alternation_literal()); + assert!(!props(r"a|[b]").is_alternation_literal()); + assert!(!props(r"(?:a)|b").is_alternation_literal()); + assert!(!props(r"a|(?:b)").is_alternation_literal()); + assert!(!props(r"(?:z|xx)@|xx").is_alternation_literal()); + } + + // This tests that the smart Hir::repetition constructors does some basic + // simplifications. + #[test] + fn smart_repetition() { + assert_eq!(t(r"a{0}"), Hir::empty()); + assert_eq!(t(r"a{1}"), hir_lit("a")); + assert_eq!(t(r"\B{32111}"), hir_look(hir::Look::WordUnicodeNegate)); + } + + // This tests that the smart Hir::concat constructor simplifies the given + // exprs in a way we expect. + #[test] + fn smart_concat() { + assert_eq!(t(""), Hir::empty()); + assert_eq!(t("(?:)"), Hir::empty()); + assert_eq!(t("abc"), hir_lit("abc")); + assert_eq!(t("(?:foo)(?:bar)"), hir_lit("foobar")); + assert_eq!(t("quux(?:foo)(?:bar)baz"), hir_lit("quuxfoobarbaz")); + assert_eq!( + t("foo(?:bar^baz)quux"), + hir_cat(vec![ + hir_lit("foobar"), + hir_look(hir::Look::Start), + hir_lit("bazquux"), + ]) + ); + assert_eq!( + t("foo(?:ba(?:r^b)az)quux"), + hir_cat(vec![ + hir_lit("foobar"), + hir_look(hir::Look::Start), + hir_lit("bazquux"), + ]) + ); + } + + // This tests that the smart Hir::alternation constructor simplifies the + // given exprs in a way we expect. + #[test] + fn smart_alternation() { + assert_eq!( + t("(?:foo)|(?:bar)"), + hir_alt(vec![hir_lit("foo"), hir_lit("bar")]) + ); + assert_eq!( + t("quux|(?:abc|def|xyz)|baz"), + hir_alt(vec![ + hir_lit("quux"), + hir_lit("abc"), + hir_lit("def"), + hir_lit("xyz"), + hir_lit("baz"), + ]) + ); + assert_eq!( + t("quux|(?:abc|(?:def|mno)|xyz)|baz"), + hir_alt(vec![ + hir_lit("quux"), + hir_lit("abc"), + hir_lit("def"), + hir_lit("mno"), + hir_lit("xyz"), + hir_lit("baz"), + ]) + ); + assert_eq!( + t("a|b|c|d|e|f|x|y|z"), + hir_uclass(&[('a', 'f'), ('x', 'z')]), + ); + // Tests that we lift common prefixes out of an alternation. + assert_eq!( + t("[A-Z]foo|[A-Z]quux"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![hir_lit("foo"), hir_lit("quux")]), + ]), + ); + assert_eq!( + t("[A-Z][A-Z]|[A-Z]quux"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![hir_uclass(&[('A', 'Z')]), hir_lit("quux")]), + ]), + ); + assert_eq!( + t("[A-Z][A-Z]|[A-Z][A-Z]quux"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![Hir::empty(), hir_lit("quux")]), + ]), + ); + assert_eq!( + t("[A-Z]foo|[A-Z]foobar"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![hir_lit("foo"), hir_lit("foobar")]), + ]), + ); + } + + #[test] + fn regression_alt_empty_concat() { + use crate::ast::{self, Ast}; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::alternation(ast::Alternation { + span, + asts: vec![Ast::concat(ast::Concat { span, asts: vec![] })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::empty()), t.translate("", &ast)); + } + + #[test] + fn regression_empty_alt() { + use crate::ast::{self, Ast}; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::concat(ast::Concat { + span, + asts: vec![Ast::alternation(ast::Alternation { + span, + asts: vec![], + })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::fail()), t.translate("", &ast)); + } + + #[test] + fn regression_singleton_alt() { + use crate::{ + ast::{self, Ast}, + hir::Dot, + }; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::concat(ast::Concat { + span, + asts: vec![Ast::alternation(ast::Alternation { + span, + asts: vec![Ast::dot(span)], + })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast)); + } + + // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168 + #[test] + fn regression_fuzz_match() { + let pat = "[(\u{6} \0-\u{afdf5}] \0 "; + let ast = ParserBuilder::new() + .octal(false) + .ignore_whitespace(true) + .build() + .parse(pat) + .unwrap(); + let hir = TranslatorBuilder::new() + .utf8(true) + .case_insensitive(false) + .multi_line(false) + .dot_matches_new_line(false) + .swap_greed(true) + .unicode(true) + .build() + .translate(pat, &ast) + .unwrap(); + assert_eq!( + hir, + Hir::concat(vec![ + hir_uclass(&[('\0', '\u{afdf5}')]), + hir_lit("\0"), + ]) + ); + } + + // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155 + #[cfg(feature = "unicode")] + #[test] + fn regression_fuzz_difference1() { + let pat = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*"; + let _ = t(pat); // shouldn't panic + } + + // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153 + #[test] + fn regression_fuzz_char_decrement1() { + let pat = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0]<D\0\0\0\0\0\0\u{1}]\0\0\0\0]\0\0-*\0]\0\0 "; + let _ = t(pat); // shouldn't panic + } +} diff --git a/vendor/regex-syntax/src/hir/visitor.rs b/vendor/regex-syntax/src/hir/visitor.rs new file mode 100644 index 0000000..f30f0a1 --- /dev/null +++ b/vendor/regex-syntax/src/hir/visitor.rs @@ -0,0 +1,215 @@ +use alloc::{vec, vec::Vec}; + +use crate::hir::{self, Hir, HirKind}; + +/// A trait for visiting the high-level IR (HIR) in depth first order. +/// +/// The principle aim of this trait is to enable callers to perform case +/// analysis on a high-level intermediate representation of a regular +/// expression without necessarily using recursion. In particular, this permits +/// callers to do case analysis with constant stack usage, which can be +/// important since the size of an HIR may be proportional to end user input. +/// +/// Typical usage of this trait involves providing an implementation and then +/// running it using the [`visit`] function. +pub trait Visitor { + /// The result of visiting an HIR. + type Output; + /// An error that visiting an HIR might return. + type Err; + + /// All implementors of `Visitor` must provide a `finish` method, which + /// yields the result of visiting the HIR or an error. + fn finish(self) -> Result<Self::Output, Self::Err>; + + /// This method is called before beginning traversal of the HIR. + fn start(&mut self) {} + + /// This method is called on an `Hir` before descending into child `Hir` + /// nodes. + fn visit_pre(&mut self, _hir: &Hir) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on an `Hir` after descending all of its child + /// `Hir` nodes. + fn visit_post(&mut self, _hir: &Hir) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between child nodes of an alternation. + fn visit_alternation_in(&mut self) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between child nodes of a concatenation. + fn visit_concat_in(&mut self) -> Result<(), Self::Err> { + Ok(()) + } +} + +/// Executes an implementation of `Visitor` in constant stack space. +/// +/// This function will visit every node in the given `Hir` while calling +/// appropriate methods provided by the [`Visitor`] trait. +/// +/// The primary use case for this method is when one wants to perform case +/// analysis over an `Hir` without using a stack size proportional to the depth +/// of the `Hir`. Namely, this method will instead use constant stack space, +/// but will use heap space proportional to the size of the `Hir`. This may be +/// desirable in cases where the size of `Hir` is proportional to end user +/// input. +/// +/// If the visitor returns an error at any point, then visiting is stopped and +/// the error is returned. +pub fn visit<V: Visitor>(hir: &Hir, visitor: V) -> Result<V::Output, V::Err> { + HeapVisitor::new().visit(hir, visitor) +} + +/// HeapVisitor visits every item in an `Hir` recursively using constant stack +/// size and a heap size proportional to the size of the `Hir`. +struct HeapVisitor<'a> { + /// A stack of `Hir` nodes. This is roughly analogous to the call stack + /// used in a typical recursive visitor. + stack: Vec<(&'a Hir, Frame<'a>)>, +} + +/// Represents a single stack frame while performing structural induction over +/// an `Hir`. +enum Frame<'a> { + /// A stack frame allocated just before descending into a repetition + /// operator's child node. + Repetition(&'a hir::Repetition), + /// A stack frame allocated just before descending into a capture's child + /// node. + Capture(&'a hir::Capture), + /// The stack frame used while visiting every child node of a concatenation + /// of expressions. + Concat { + /// The child node we are currently visiting. + head: &'a Hir, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Hir], + }, + /// The stack frame used while visiting every child node of an alternation + /// of expressions. + Alternation { + /// The child node we are currently visiting. + head: &'a Hir, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Hir], + }, +} + +impl<'a> HeapVisitor<'a> { + fn new() -> HeapVisitor<'a> { + HeapVisitor { stack: vec![] } + } + + fn visit<V: Visitor>( + &mut self, + mut hir: &'a Hir, + mut visitor: V, + ) -> Result<V::Output, V::Err> { + self.stack.clear(); + + visitor.start(); + loop { + visitor.visit_pre(hir)?; + if let Some(x) = self.induct(hir) { + let child = x.child(); + self.stack.push((hir, x)); + hir = child; + continue; + } + // No induction means we have a base case, so we can post visit + // it now. + visitor.visit_post(hir)?; + + // At this point, we now try to pop our call stack until it is + // either empty or we hit another inductive case. + loop { + let (post_hir, frame) = match self.stack.pop() { + None => return visitor.finish(), + Some((post_hir, frame)) => (post_hir, frame), + }; + // If this is a concat/alternate, then we might have additional + // inductive steps to process. + if let Some(x) = self.pop(frame) { + match x { + Frame::Alternation { .. } => { + visitor.visit_alternation_in()?; + } + Frame::Concat { .. } => { + visitor.visit_concat_in()?; + } + _ => {} + } + hir = x.child(); + self.stack.push((post_hir, x)); + break; + } + // Otherwise, we've finished visiting all the child nodes for + // this HIR, so we can post visit it now. + visitor.visit_post(post_hir)?; + } + } + } + + /// Build a stack frame for the given HIR if one is needed (which occurs if + /// and only if there are child nodes in the HIR). Otherwise, return None. + fn induct(&mut self, hir: &'a Hir) -> Option<Frame<'a>> { + match *hir.kind() { + HirKind::Repetition(ref x) => Some(Frame::Repetition(x)), + HirKind::Capture(ref x) => Some(Frame::Capture(x)), + HirKind::Concat(ref x) if x.is_empty() => None, + HirKind::Concat(ref x) => { + Some(Frame::Concat { head: &x[0], tail: &x[1..] }) + } + HirKind::Alternation(ref x) if x.is_empty() => None, + HirKind::Alternation(ref x) => { + Some(Frame::Alternation { head: &x[0], tail: &x[1..] }) + } + _ => None, + } + } + + /// Pops the given frame. If the frame has an additional inductive step, + /// then return it, otherwise return `None`. + fn pop(&self, induct: Frame<'a>) -> Option<Frame<'a>> { + match induct { + Frame::Repetition(_) => None, + Frame::Capture(_) => None, + Frame::Concat { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Concat { head: &tail[0], tail: &tail[1..] }) + } + } + Frame::Alternation { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Alternation { + head: &tail[0], + tail: &tail[1..], + }) + } + } + } + } +} + +impl<'a> Frame<'a> { + /// Perform the next inductive step on this frame and return the next + /// child HIR node to visit. + fn child(&self) -> &'a Hir { + match *self { + Frame::Repetition(rep) => &rep.sub, + Frame::Capture(capture) => &capture.sub, + Frame::Concat { head, .. } => head, + Frame::Alternation { head, .. } => head, + } + } +} |