summary | refs | log | tree | commit | diff | stats
path: root/vendor/regex-syntax/src/hir
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/regex-syntax/src/hir')
-rw-r--r--vendor/regex-syntax/src/hir/interval.rs581
-rw-r--r--vendor/regex-syntax/src/hir/literal.rs3214
-rw-r--r--vendor/regex-syntax/src/hir/mod.rs3861
-rw-r--r--vendor/regex-syntax/src/hir/print.rs608
-rw-r--r--vendor/regex-syntax/src/hir/translate.rs3724
-rw-r--r--vendor/regex-syntax/src/hir/visitor.rs215
6 files changed, 12203 insertions, 0 deletions
diff --git a/vendor/regex-syntax/src/hir/interval.rs b/vendor/regex-syntax/src/hir/interval.rs
new file mode 100644
index 0000000..e063390
--- /dev/null
+++ b/vendor/regex-syntax/src/hir/interval.rs
@@ -0,0 +1,581 @@
+use core::{char, cmp, fmt::Debug, slice};
+
+use alloc::vec::Vec;
+
+use crate::unicode;
+
+// This module contains an *internal* implementation of interval sets.
+//
+// The primary invariant that interval sets guard is canonical ordering. That
+// is, every interval set contains an ordered sequence of intervals where
+// no two intervals are overlapping or adjacent. While this invariant is
+// occasionally broken within the implementation, it should be impossible for
+// callers to observe it.
+//
+// Since case folding (as implemented below) breaks that invariant, we roll
+// that into this API even though it is a little out of place in an otherwise
+// generic interval set. (Hence the reason why the `unicode` module is imported
+// here.)
+//
+// Some of the implementation complexity here is a result of me wanting to
+// preserve the sequential representation without using additional memory.
+// In many cases, we do use linear extra memory, but it is at most 2x and it
+// is amortized. If we relaxed the memory requirements, this implementation
+// could become much simpler. The extra memory is honestly probably OK, but
+// character classes (especially of the Unicode variety) can become quite
+// large, and it would be nice to keep regex compilation snappy even in debug
+// builds. (In the past, I have been careless with this area of code and it has
+// caused slow regex compilations in debug mode, so this isn't entirely
+// unwarranted.)
+//
+// Tests on this are relegated to the public API of HIR in src/hir.rs.
+
+/// A sorted set of non-overlapping, non-adjacent intervals, kept in the
+/// canonical ordering described at the top of this module.
+#[derive(Clone, Debug)]
+pub struct IntervalSet<I> {
+ /// A sorted set of non-overlapping ranges.
+ ranges: Vec<I>,
+ /// While not required at all for correctness, we keep track of whether an
+ /// interval set has been case folded or not. This helps us avoid doing
+ /// redundant work if, for example, a set has already been case folded.
+ /// And note that whether a set is folded or not is preserved through
+ /// all of the pairwise set operations. That is, if both interval sets
+ /// have been case folded, then any of difference, union, intersection or
+ /// symmetric difference all produce a case folded set.
+ ///
+ /// Note that when this is true, it *must* be the case that the set is case
+ /// folded. But when it's false, the set *may* be case folded. In other
+ /// words, we only set this to true when we know it to be case, but we're
+ /// okay with it being false if it would otherwise be costly to determine
+ /// whether it should be true. This means code cannot assume that a false
+ /// value necessarily indicates that the set is not case folded.
+ ///
+ /// Bottom line: this is a performance optimization.
+ folded: bool,
+}
+
+// Eq piggybacks on the manual PartialEq below: identity is determined solely
+// by 'ranges', never by the 'folded' optimization flag.
+impl<I: Interval> Eq for IntervalSet<I> {}
+
+// We implement PartialEq manually so that we don't consider the set's internal
+// 'folded' property to be part of its identity. The 'folded' property is
+// strictly an optimization.
+impl<I: Interval> PartialEq for IntervalSet<I> {
+ fn eq(&self, other: &IntervalSet<I>) -> bool {
+ self.ranges.eq(&other.ranges)
+ }
+}
+
+impl<I: Interval> IntervalSet<I> {
+ /// Create a new set from a sequence of intervals. Each interval is
+ /// specified as a pair of bounds, where both bounds are inclusive.
+ ///
+ /// The given ranges do not need to be in any specific order, and ranges
+ /// may overlap.
+ pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
+ let ranges: Vec<I> = intervals.into_iter().collect();
+ // An empty set is case folded.
+ let folded = ranges.is_empty();
+ let mut set = IntervalSet { ranges, folded };
+ // Establish the canonical ordering invariant before handing the set out.
+ set.canonicalize();
+ set
+ }
+
+ /// Add a new interval to this set.
+ pub fn push(&mut self, interval: I) {
+ // TODO: This could be faster. e.g., Push the interval such that
+ // it preserves canonicalization.
+ self.ranges.push(interval);
+ self.canonicalize();
+ // We don't know whether the new interval added here is considered
+ // case folded, so we conservatively assume that the entire set is
+ // no longer case folded if it was previously.
+ self.folded = false;
+ }
+
+ /// Return an iterator over all intervals in this set.
+ ///
+ /// The iterator yields intervals in ascending order.
+ pub fn iter(&self) -> IntervalSetIter<'_, I> {
+ IntervalSetIter(self.ranges.iter())
+ }
+
+ /// Return an immutable slice of intervals in this set.
+ ///
+ /// The sequence returned is in canonical ordering.
+ pub fn intervals(&self) -> &[I] {
+ &self.ranges
+ }
+
+ /// Expand this interval set such that it contains all case folded
+ /// characters. For example, if this class consists of the range `a-z`,
+ /// then applying case folding will result in the class containing both the
+ /// ranges `a-z` and `A-Z`.
+ ///
+ /// This returns an error if the necessary case mapping data is not
+ /// available.
+ pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
+ if self.folded {
+ return Ok(());
+ }
+ // Only visit the ranges present before folding started: each call to
+ // 'range.case_fold_simple' may append new ranges to 'self.ranges'.
+ let len = self.ranges.len();
+ for i in 0..len {
+ let range = self.ranges[i];
+ if let Err(err) = range.case_fold_simple(&mut self.ranges) {
+ // Restore the canonical ordering invariant even on failure,
+ // since some folded ranges may already have been appended.
+ self.canonicalize();
+ return Err(err);
+ }
+ }
+ self.canonicalize();
+ self.folded = true;
+ Ok(())
+ }
+
+ /// Union this set with the given set, in place.
+ pub fn union(&mut self, other: &IntervalSet<I>) {
+ if other.ranges.is_empty() || self.ranges == other.ranges {
+ return;
+ }
+ // This could almost certainly be done more efficiently.
+ self.ranges.extend(&other.ranges);
+ self.canonicalize();
+ self.folded = self.folded && other.folded;
+ }
+
+ /// Intersect this set with the given set, in place.
+ pub fn intersect(&mut self, other: &IntervalSet<I>) {
+ if self.ranges.is_empty() {
+ return;
+ }
+ if other.ranges.is_empty() {
+ self.ranges.clear();
+ // An empty set is case folded.
+ self.folded = true;
+ return;
+ }
+
+ // There should be a way to do this in-place with constant memory,
+ // but I couldn't figure out a simple way to do it. So just append
+ // the intersection to the end of this range, and then drain it before
+ // we're done.
+ let drain_end = self.ranges.len();
+
+ // Walk both range lists by index, always advancing whichever side's
+ // current range ends first (its remaining extent can't intersect
+ // anything further).
+ let mut ita = 0..drain_end;
+ let mut itb = 0..other.ranges.len();
+ let mut a = ita.next().unwrap();
+ let mut b = itb.next().unwrap();
+ loop {
+ if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) {
+ self.ranges.push(ab);
+ }
+ let (it, aorb) =
+ if self.ranges[a].upper() < other.ranges[b].upper() {
+ (&mut ita, &mut a)
+ } else {
+ (&mut itb, &mut b)
+ };
+ match it.next() {
+ Some(v) => *aorb = v,
+ None => break,
+ }
+ }
+ self.ranges.drain(..drain_end);
+ self.folded = self.folded && other.folded;
+ }
+
+ /// Subtract the given set from this set, in place.
+ pub fn difference(&mut self, other: &IntervalSet<I>) {
+ if self.ranges.is_empty() || other.ranges.is_empty() {
+ return;
+ }
+
+ // This algorithm is (to me) surprisingly complex. A search of the
+ // interwebs indicate that this is a potentially interesting problem.
+ // Folks seem to suggest interval or segment trees, but I'd like to
+ // avoid the overhead (both runtime and conceptual) of that.
+ //
+ // The following is basically my Shitty First Draft. Therefore, in
+ // order to grok it, you probably need to read each line carefully.
+ // Simplifications are most welcome!
+ //
+ // Remember, we can assume the canonical format invariant here, which
+ // says that all ranges are sorted, not overlapping and not adjacent in
+ // each class.
+ let drain_end = self.ranges.len();
+ let (mut a, mut b) = (0, 0);
+ 'LOOP: while a < drain_end && b < other.ranges.len() {
+ // Basically, the easy cases are when neither range overlaps with
+ // each other. If the `b` range is less than our current `a`
+ // range, then we can skip it and move on.
+ if other.ranges[b].upper() < self.ranges[a].lower() {
+ b += 1;
+ continue;
+ }
+ // ... similarly for the `a` range. If it's less than the smallest
+ // `b` range, then we can add it as-is.
+ if self.ranges[a].upper() < other.ranges[b].lower() {
+ let range = self.ranges[a];
+ self.ranges.push(range);
+ a += 1;
+ continue;
+ }
+ // Otherwise, we have overlapping ranges.
+ assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b]));
+
+ // This part is tricky and was non-obvious to me without looking
+ // at explicit examples (see the tests). The trickiness stems from
+ // two things: 1) subtracting a range from another range could
+ // yield two ranges and 2) after subtracting a range, it's possible
+ // that future ranges can have an impact. The loop below advances
+ // the `b` ranges until they can't possibly impact the current
+ // range.
+ //
+ // For example, if our `a` range is `a-t` and our next three `b`
+ // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
+ // subtraction three times before moving on to the next `a` range.
+ let mut range = self.ranges[a];
+ while b < other.ranges.len()
+ && !range.is_intersection_empty(&other.ranges[b])
+ {
+ let old_range = range;
+ range = match range.difference(&other.ranges[b]) {
+ (None, None) => {
+ // We lost the entire range, so move on to the next
+ // without adding this one.
+ a += 1;
+ continue 'LOOP;
+ }
+ (Some(range1), None) | (None, Some(range1)) => range1,
+ (Some(range1), Some(range2)) => {
+ // The subtraction split the range in two: the lower
+ // piece is final (nothing later can touch it), so
+ // emit it and keep refining the upper piece.
+ self.ranges.push(range1);
+ range2
+ }
+ };
+ // It's possible that the `b` range has more to contribute
+ // here. In particular, if it is greater than the original
+ // range, then it might impact the next `a` range *and* it
+ // has impacted the current `a` range as much as possible,
+ // so we can quit. We don't bump `b` so that the next `a`
+ // range can apply it.
+ if other.ranges[b].upper() > old_range.upper() {
+ break;
+ }
+ // Otherwise, the next `b` range might apply to the current
+ // `a` range.
+ b += 1;
+ }
+ self.ranges.push(range);
+ a += 1;
+ }
+ // Any `a` ranges left over are beyond every `b` range, so keep them.
+ while a < drain_end {
+ let range = self.ranges[a];
+ self.ranges.push(range);
+ a += 1;
+ }
+ self.ranges.drain(..drain_end);
+ self.folded = self.folded && other.folded;
+ }
+
+ /// Compute the symmetric difference of the two sets, in place.
+ ///
+ /// This computes the symmetric difference of two interval sets. This
+ /// removes all elements in this set that are also in the given set,
+ /// but also adds all elements from the given set that aren't in this
+ /// set. That is, the set will contain all elements in either set,
+ /// but will not contain any elements that are in both sets.
+ pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) {
+ // TODO(burntsushi): Fix this so that it amortizes allocation.
+ // Implemented as (self ∪ other) − (self ∩ other).
+ let mut intersection = self.clone();
+ intersection.intersect(other);
+ self.union(other);
+ self.difference(&intersection);
+ }
+
+ /// Negate this interval set.
+ ///
+ /// For all `x` where `x` is any element, if `x` was in this set, then it
+ /// will not be in this set after negation.
+ pub fn negate(&mut self) {
+ if self.ranges.is_empty() {
+ let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
+ self.ranges.push(I::create(min, max));
+ // The set containing everything must be case folded.
+ self.folded = true;
+ return;
+ }
+
+ // There should be a way to do this in-place with constant memory,
+ // but I couldn't figure out a simple way to do it. So just append
+ // the negation to the end of this range, and then drain it before
+ // we're done.
+ let drain_end = self.ranges.len();
+
+ // We do checked arithmetic below because of the canonical ordering
+ // invariant.
+ if self.ranges[0].lower() > I::Bound::min_value() {
+ let upper = self.ranges[0].lower().decrement();
+ self.ranges.push(I::create(I::Bound::min_value(), upper));
+ }
+ // Emit every gap between consecutive ranges.
+ for i in 1..drain_end {
+ let lower = self.ranges[i - 1].upper().increment();
+ let upper = self.ranges[i].lower().decrement();
+ self.ranges.push(I::create(lower, upper));
+ }
+ if self.ranges[drain_end - 1].upper() < I::Bound::max_value() {
+ let lower = self.ranges[drain_end - 1].upper().increment();
+ self.ranges.push(I::create(lower, I::Bound::max_value()));
+ }
+ self.ranges.drain(..drain_end);
+ // We don't need to update whether this set is folded or not, because
+ // it is conservatively preserved through negation. Namely, if a set
+ // is not folded, then it is possible that its negation is folded, for
+ // example, [^☃]. But we're fine with assuming that the set is not
+ // folded in that case. (`folded` permits false negatives but not false
+ // positives.)
+ //
+ // But what about when a set is folded, is its negation also
+ // necessarily folded? Yes. Because if a set is folded, then for every
+ // character in the set, it necessarily included its equivalence class
+ // of case folded characters. Negating it in turn means that all
+ // equivalence classes in the set are negated, and any equivalence
+ // class that was previously not in the set is now entirely in the set.
+ }
+
+ /// Converts this set into a canonical ordering.
+ fn canonicalize(&mut self) {
+ if self.is_canonical() {
+ return;
+ }
+ self.ranges.sort();
+ // An empty set is trivially canonical, so reaching this point
+ // implies the set is non-empty.
+ assert!(!self.ranges.is_empty());
+
+ // Is there a way to do this in-place with constant memory? I couldn't
+ // figure out a way to do it. So just append the canonicalization to
+ // the end of this range, and then drain it before we're done.
+ let drain_end = self.ranges.len();
+ for oldi in 0..drain_end {
+ // If we've added at least one new range, then check if we can
+ // merge this range in the previously added range.
+ if self.ranges.len() > drain_end {
+ let (last, rest) = self.ranges.split_last_mut().unwrap();
+ if let Some(union) = last.union(&rest[oldi]) {
+ *last = union;
+ continue;
+ }
+ }
+ let range = self.ranges[oldi];
+ self.ranges.push(range);
+ }
+ self.ranges.drain(..drain_end);
+ }
+
+ /// Returns true if and only if this class is in a canonical ordering.
+ fn is_canonical(&self) -> bool {
+ for pair in self.ranges.windows(2) {
+ // Canonical form requires strictly increasing ranges...
+ if pair[0] >= pair[1] {
+ return false;
+ }
+ // ... that are neither overlapping nor adjacent.
+ if pair[0].is_contiguous(&pair[1]) {
+ return false;
+ }
+ }
+ true
+ }
+}
+
+/// An iterator over intervals.
+///
+/// This is a thin wrapper over a slice iterator, so intervals are yielded
+/// in the set's canonical (ascending) order.
+#[derive(Debug)]
+pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);
+
+impl<'a, I> Iterator for IntervalSetIter<'a, I> {
+ type Item = &'a I;
+
+ // Simply delegate to the underlying slice iterator.
+ fn next(&mut self) -> Option<&'a I> {
+ self.0.next()
+ }
+}
+
+/// A single closed interval (both bounds inclusive) over some `Bound` type,
+/// along with set operations defined pairwise on intervals. Implementations
+/// only need to provide the accessors and case folding; the set operations
+/// have default implementations.
+pub trait Interval:
+ Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord
+{
+ type Bound: Bound;
+
+ /// Returns the inclusive lower bound of this interval.
+ fn lower(&self) -> Self::Bound;
+ /// Returns the inclusive upper bound of this interval.
+ fn upper(&self) -> Self::Bound;
+ /// Sets the inclusive lower bound of this interval.
+ fn set_lower(&mut self, bound: Self::Bound);
+ /// Sets the inclusive upper bound of this interval.
+ fn set_upper(&mut self, bound: Self::Bound);
+ /// Appends the simple case folded equivalents of this interval to the
+ /// given vector, or returns an error if the necessary case mapping data
+ /// is unavailable.
+ fn case_fold_simple(
+ &self,
+ intervals: &mut Vec<Self>,
+ ) -> Result<(), unicode::CaseFoldError>;
+
+ /// Create a new interval.
+ ///
+ /// If the given bounds are reversed (lower > upper), they are swapped so
+ /// that the resulting interval is always non-empty.
+ fn create(lower: Self::Bound, upper: Self::Bound) -> Self {
+ let mut int = Self::default();
+ if lower <= upper {
+ int.set_lower(lower);
+ int.set_upper(upper);
+ } else {
+ int.set_lower(upper);
+ int.set_upper(lower);
+ }
+ int
+ }
+
+ /// Union the given overlapping range into this range.
+ ///
+ /// If the two ranges aren't contiguous, then this returns `None`.
+ fn union(&self, other: &Self) -> Option<Self> {
+ if !self.is_contiguous(other) {
+ return None;
+ }
+ let lower = cmp::min(self.lower(), other.lower());
+ let upper = cmp::max(self.upper(), other.upper());
+ Some(Self::create(lower, upper))
+ }
+
+ /// Intersect this range with the given range and return the result.
+ ///
+ /// If the intersection is empty, then this returns `None`.
+ fn intersect(&self, other: &Self) -> Option<Self> {
+ let lower = cmp::max(self.lower(), other.lower());
+ let upper = cmp::min(self.upper(), other.upper());
+ if lower <= upper {
+ Some(Self::create(lower, upper))
+ } else {
+ None
+ }
+ }
+
+ /// Subtract the given range from this range and return the resulting
+ /// ranges.
+ ///
+ /// If subtraction would result in an empty range, then no ranges are
+ /// returned.
+ fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) {
+ if self.is_subset(other) {
+ return (None, None);
+ }
+ if self.is_intersection_empty(other) {
+ return (Some(self.clone()), None);
+ }
+ // A piece below 'other' survives iff 'other' starts strictly after us;
+ // a piece above 'other' survives iff 'other' ends strictly before us.
+ let add_lower = other.lower() > self.lower();
+ let add_upper = other.upper() < self.upper();
+ // We know this because !self.is_subset(other) and the ranges have
+ // a non-empty intersection.
+ assert!(add_lower || add_upper);
+ let mut ret = (None, None);
+ if add_lower {
+ let upper = other.lower().decrement();
+ ret.0 = Some(Self::create(self.lower(), upper));
+ }
+ if add_upper {
+ let lower = other.upper().increment();
+ let range = Self::create(lower, self.upper());
+ if ret.0.is_none() {
+ ret.0 = Some(range);
+ } else {
+ ret.1 = Some(range);
+ }
+ }
+ ret
+ }
+
+ /// Compute the symmetric difference of the given range and this range.
+ /// This returns the union of the two ranges minus its intersection.
+ fn symmetric_difference(
+ &self,
+ other: &Self,
+ ) -> (Option<Self>, Option<Self>) {
+ let union = match self.union(other) {
+ // Disjoint and non-adjacent ranges are returned unchanged.
+ None => return (Some(self.clone()), Some(other.clone())),
+ Some(union) => union,
+ };
+ let intersection = match self.intersect(other) {
+ None => return (Some(self.clone()), Some(other.clone())),
+ Some(intersection) => intersection,
+ };
+ union.difference(&intersection)
+ }
+
+ /// Returns true if and only if the two ranges are contiguous. Two ranges
+ /// are contiguous if and only if the ranges are either overlapping or
+ /// adjacent.
+ fn is_contiguous(&self, other: &Self) -> bool {
+ let lower1 = self.lower().as_u32();
+ let upper1 = self.upper().as_u32();
+ let lower2 = other.lower().as_u32();
+ let upper2 = other.upper().as_u32();
+ // saturating_add avoids overflow when an upper bound is u32::MAX;
+ // the "+1" is what makes merely adjacent ranges count as contiguous.
+ cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1)
+ }
+
+ /// Returns true if and only if the intersection of this range and the
+ /// other range is empty.
+ fn is_intersection_empty(&self, other: &Self) -> bool {
+ let (lower1, upper1) = (self.lower(), self.upper());
+ let (lower2, upper2) = (other.lower(), other.upper());
+ cmp::max(lower1, lower2) > cmp::min(upper1, upper2)
+ }
+
+ /// Returns true if and only if this range is a subset of the other range.
+ fn is_subset(&self, other: &Self) -> bool {
+ let (lower1, upper1) = (self.lower(), self.upper());
+ let (lower2, upper2) = (other.lower(), other.upper());
+ (lower2 <= lower1 && lower1 <= upper2)
+ && (lower2 <= upper1 && upper1 <= upper2)
+ }
+}
+
+/// A totally ordered bound type usable as an interval endpoint.
+pub trait Bound:
+ Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord
+{
+ /// Returns the minimum value of this bound type.
+ fn min_value() -> Self;
+ /// Returns the maximum value of this bound type.
+ fn max_value() -> Self;
+ /// Converts this bound to a `u32` for arithmetic/comparison purposes.
+ fn as_u32(self) -> u32;
+ /// Returns the next greater bound value.
+ fn increment(self) -> Self;
+ /// Returns the next smaller bound value.
+ fn decrement(self) -> Self;
+}
+
+impl Bound for u8 {
+ fn min_value() -> Self {
+ u8::MIN
+ }
+ fn max_value() -> Self {
+ u8::MAX
+ }
+ fn as_u32(self) -> u32 {
+ u32::from(self)
+ }
+ // The checked arithmetic below panics on over/underflow. Callers rely on
+ // the canonical ordering invariant to never increment u8::MAX or
+ // decrement u8::MIN (see the checked-arithmetic note in 'negate').
+ fn increment(self) -> Self {
+ self.checked_add(1).unwrap()
+ }
+ fn decrement(self) -> Self {
+ self.checked_sub(1).unwrap()
+ }
+}
+
+impl Bound for char {
+ fn min_value() -> Self {
+ '\x00'
+ }
+ fn max_value() -> Self {
+ '\u{10FFFF}'
+ }
+ fn as_u32(self) -> u32 {
+ u32::from(self)
+ }
+
+ // Increment/decrement must jump over the surrogate code point range
+ // (U+D800..=U+DFFF), which is not representable as a Rust `char`.
+ fn increment(self) -> Self {
+ match self {
+ '\u{D7FF}' => '\u{E000}',
+ c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(),
+ }
+ }
+
+ fn decrement(self) -> Self {
+ match self {
+ '\u{E000}' => '\u{D7FF}',
+ c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(),
+ }
+ }
+}
+
+// Tests for interval sets are written in src/hir.rs against the public API.
diff --git a/vendor/regex-syntax/src/hir/literal.rs b/vendor/regex-syntax/src/hir/literal.rs
new file mode 100644
index 0000000..a5a3737
--- /dev/null
+++ b/vendor/regex-syntax/src/hir/literal.rs
@@ -0,0 +1,3214 @@
+/*!
+Provides literal extraction from `Hir` expressions.
+
+An [`Extractor`] pulls literals out of [`Hir`] expressions and returns a
+[`Seq`] of [`Literal`]s.
+
+The purpose of literal extraction is generally to provide avenues for
+optimizing regex searches. The main idea is that substring searches can be an
+order of magnitude faster than a regex search. Therefore, if one can execute
+a substring search to find candidate match locations and only run the regex
+search at those locations, then it is possible for huge improvements in
+performance to be realized.
+
+With that said, literal optimizations are generally a black art because even
+though substring search is generally faster, if the number of candidates
+produced is high, then it can create a lot of overhead by ping-ponging between
+the substring search and the regex search.
+
+Here are some heuristics that might be used to help increase the chances of
+effective literal optimizations:
+
+* Stick to small [`Seq`]s. If you search for too many literals, it's likely
+to lead to substring search that is only a little faster than a regex search,
+and thus the overhead of using literal optimizations in the first place might
+make things slower overall.
+* The literals in your [`Seq`] shouldn't be too short. In general, longer is
+better. A sequence corresponding to single bytes that occur frequently in the
+haystack, for example, is probably a bad literal optimization because it's
+likely to produce many false positive candidates. Longer literals are less
+likely to match, and thus probably produce fewer false positives.
+* If it's possible to estimate the approximate frequency of each byte according
+to some pre-computed background distribution, it is possible to compute a score
+of how "good" a `Seq` is. If a `Seq` isn't good enough, you might consider
+skipping the literal optimization and just use the regex engine.
+
+(It should be noted that there are always pathological cases that can make
+any kind of literal optimization be a net slower result. This is why it
+might be a good idea to be conservative, or to even provide a means for
+literal optimizations to be dynamically disabled if they are determined to be
+ineffective according to some measure.)
+
+You're encouraged to explore the methods on [`Seq`], which permit shrinking
+the size of sequences in a preference-order preserving fashion.
+
+Finally, note that it isn't strictly necessary to use an [`Extractor`]. Namely,
+an `Extractor` only uses public APIs of the [`Seq`] and [`Literal`] types,
+so it is possible to implement your own extractor. For example, for n-grams
+or "inner" literals (i.e., not prefix or suffix literals). The `Extractor`
+is mostly responsible for the case analysis over `Hir` expressions. Much of
+the "trickier" parts are how to combine literal sequences, and that is all
+implemented on [`Seq`].
+*/
+
+use core::{cmp, mem, num::NonZeroUsize};
+
+use alloc::{vec, vec::Vec};
+
+use crate::hir::{self, Hir};
+
+/// Extracts prefix or suffix literal sequences from [`Hir`] expressions.
+///
+/// Literal extraction is based on the following observations:
+///
+/// * Many regexes start with one or a small number of literals.
+/// * Substring search for literals is often much faster (sometimes by an order
+/// of magnitude) than a regex search.
+///
+/// Thus, in many cases, one can search for literals to find candidate starting
+/// locations of a match, and then only run the full regex engine at each such
+/// location instead of over the full haystack.
+///
+/// The main downside of literal extraction is that it can wind up causing a
+/// search to be slower overall. For example, if there are many matches or if
+/// there are many candidates that don't ultimately lead to a match, then a
+/// lot of overhead will be spent in shuffling back-and-forth between substring
+/// search and the regex engine. This is the fundamental reason why literal
+/// optimizations for regex patterns is sometimes considered a "black art."
+///
+/// # Look-around assertions
+///
+/// Literal extraction treats all look-around assertions as-if they match every
+/// empty string. So for example, the regex `\bquux\b` will yield a sequence
+/// containing a single exact literal `quux`. However, not all occurrences
+/// of `quux` correspond to a match of the regex. For example, `\bquux\b`
+/// does not match `ZquuxZ` anywhere because `quux` does not fall on a word
+/// boundary.
+///
+/// In effect, if your regex contains look-around assertions, then a match of
+/// an exact literal does not necessarily mean the regex overall matches. So
+/// you may still need to run the regex engine in such cases to confirm the
+/// match.
+///
+/// The precise guarantee you get from a literal sequence is: if every literal
+/// in the sequence is exact and the original regex contains zero look-around
+/// assertions, then a preference-order multi-substring search of those
+/// literals will precisely match a preference-order search of the original
+/// regex.
+///
+/// # Example
+///
+/// This shows how to extract prefixes:
+///
+/// ```
+/// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse};
+///
+/// let hir = parse(r"(a|b|c)(x|y|z)[A-Z]+foo")?;
+/// let got = Extractor::new().extract(&hir);
+/// // All literals returned are "inexact" because none of them reach the
+/// // match state.
+/// let expected = Seq::from_iter([
+/// Literal::inexact("ax"),
+/// Literal::inexact("ay"),
+/// Literal::inexact("az"),
+/// Literal::inexact("bx"),
+/// Literal::inexact("by"),
+/// Literal::inexact("bz"),
+/// Literal::inexact("cx"),
+/// Literal::inexact("cy"),
+/// Literal::inexact("cz"),
+/// ]);
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// This shows how to extract suffixes:
+///
+/// ```
+/// use regex_syntax::{
+/// hir::literal::{Extractor, ExtractKind, Literal, Seq},
+/// parse,
+/// };
+///
+/// let hir = parse(r"foo|[A-Z]+bar")?;
+/// let got = Extractor::new().kind(ExtractKind::Suffix).extract(&hir);
+/// // Since 'foo' gets to a match state, it is considered exact. But 'bar'
+/// // does not because of the '[A-Z]+', and thus is marked inexact.
+/// let expected = Seq::from_iter([
+/// Literal::exact("foo"),
+/// Literal::inexact("bar"),
+/// ]);
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Extractor {
+ // Whether to extract prefix or suffix literal sequences.
+ kind: ExtractKind,
+ // Maximum sequence length permitted for a single character class before
+ // it is treated as producing an infinite sequence.
+ limit_class: usize,
+ // Maximum number of repetitions unrolled before extraction stops and the
+ // literals gathered so far are made inexact.
+ limit_repeat: usize,
+ // Maximum length of any single literal; longer literals are made inexact
+ // and stop growing.
+ limit_literal_len: usize,
+ // Cap on the total number of literals in the returned sequence.
+ limit_total: usize,
+}
+
+impl Extractor {
+ /// Create a new extractor with a default configuration.
+ ///
+ /// The extractor can be optionally configured before calling
+ /// [`Extractor::extract`] to get a literal sequence.
+ pub fn new() -> Extractor {
+ // Default limits; each can be overridden via the corresponding
+ // builder method (see their docs for what each limit controls).
+ Extractor {
+ kind: ExtractKind::Prefix,
+ limit_class: 10,
+ limit_repeat: 10,
+ limit_literal_len: 100,
+ limit_total: 250,
+ }
+ }
+
+ /// Execute the extractor and return a sequence of literals.
+ ///
+ /// This performs a structural case analysis over the `Hir`, delegating
+ /// to per-kind helpers and combining results per the configured limits.
+ pub fn extract(&self, hir: &Hir) -> Seq {
+ use crate::hir::HirKind::*;
+
+ match *hir.kind() {
+ // Empty expressions and look-around assertions are treated as
+ // matching the empty string, so each yields a single exact empty
+ // literal (see the look-around caveat in the type docs).
+ Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])),
+ Literal(hir::Literal(ref bytes)) => {
+ let mut seq =
+ Seq::singleton(self::Literal::exact(bytes.to_vec()));
+ // Clamp overly long literals per 'limit_literal_len'.
+ self.enforce_literal_len(&mut seq);
+ seq
+ }
+ Class(hir::Class::Unicode(ref cls)) => {
+ self.extract_class_unicode(cls)
+ }
+ Class(hir::Class::Bytes(ref cls)) => self.extract_class_bytes(cls),
+ Repetition(ref rep) => self.extract_repetition(rep),
+ // Capture groups don't change what can match; recurse into the
+ // sub-expression.
+ Capture(hir::Capture { ref sub, .. }) => self.extract(sub),
+ Concat(ref hirs) => match self.kind {
+ ExtractKind::Prefix => self.extract_concat(hirs.iter()),
+ // For suffixes, walk the concatenation back-to-front.
+ ExtractKind::Suffix => self.extract_concat(hirs.iter().rev()),
+ },
+ Alternation(ref hirs) => {
+ // Unlike concat, we always union starting from the beginning,
+ // since the beginning corresponds to the highest preference,
+ // which doesn't change based on forwards vs reverse.
+ self.extract_alternation(hirs.iter())
+ }
+ }
+ }
+
+ /// Set the kind of literal sequence to extract from an [`Hir`] expression.
+ ///
+ /// The default is to extract prefixes, but suffixes can be selected
+ /// instead. The contract for prefixes is that every match of the
+ /// corresponding `Hir` must start with one of the literals in the sequence
+ /// returned. Moreover, the _order_ of the sequence returned corresponds to
+ /// the preference order.
+ ///
+ /// Suffixes satisfy a similar contract in that every match of the
+ /// corresponding `Hir` must end with one of the literals in the sequence
+ /// returned. However, there is no guarantee that the literals are in
+ /// preference order.
+ ///
+ /// Remember that a sequence can be infinite. For example, unless the
+ /// limits are configured to be impractically large, attempting to extract
+ /// prefixes (or suffixes) for the pattern `[A-Z]` will return an infinite
+ /// sequence. Generally speaking, if the sequence returned is infinite,
+ /// then it is presumed to be unwise to do prefix (or suffix) optimizations
+ /// for the pattern.
+ pub fn kind(&mut self, kind: ExtractKind) -> &mut Extractor {
+ self.kind = kind;
+ // Return self to allow builder-style method chaining.
+ self
+ }
+
+ /// Configure a limit on the length of the sequence that is permitted for
+ /// a character class. If a character class exceeds this limit, then the
+ /// sequence returned for it is infinite.
+ ///
+ /// This prevents classes like `[A-Z]` or `\pL` from getting turned into
+ /// huge and likely unproductive sequences of literals.
+ ///
+ /// # Example
+ ///
+ /// This example shows how this limit can be lowered to decrease the tolerance
+ /// for character classes being turned into literal sequences.
+ ///
+ /// ```
+ /// use regex_syntax::{hir::literal::{Extractor, Seq}, parse};
+ ///
+ /// let hir = parse(r"[0-9]")?;
+ ///
+ /// let got = Extractor::new().extract(&hir);
+ /// let expected = Seq::new([
+ /// "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
+ /// ]);
+ /// assert_eq!(expected, got);
+ ///
+ /// // Now let's shrink the limit and see how that changes things.
+ /// let got = Extractor::new().limit_class(4).extract(&hir);
+ /// let expected = Seq::infinite();
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn limit_class(&mut self, limit: usize) -> &mut Extractor {
+ self.limit_class = limit;
+ // Return self to allow builder-style method chaining.
+ self
+ }
+
+ /// Configure a limit on the total number of repetitions that is permitted
+ /// before literal extraction is stopped.
+ ///
+ /// This is useful for limiting things like `(abcde){50}`, or more
+ /// insidiously, `(?:){1000000000}`. This limit prevents any one single
+ /// repetition from adding too much to a literal sequence.
+ ///
+ /// With this limit set, repetitions that exceed it will be stopped and any
+ /// literals extracted up to that point will be made inexact.
+ ///
+ /// # Example
+ ///
+ /// This shows how to decrease the limit and compares it with the default.
+ ///
+ /// ```
+ /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse};
+ ///
+ /// let hir = parse(r"(abc){8}")?;
+ ///
+ /// let got = Extractor::new().extract(&hir);
+ /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]);
+ /// assert_eq!(expected, got);
+ ///
+ /// // Now let's shrink the limit and see how that changes things.
+ /// let got = Extractor::new().limit_repeat(4).extract(&hir);
+ /// let expected = Seq::from_iter([
+ /// Literal::inexact("abcabcabcabc"),
+ /// ]);
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn limit_repeat(&mut self, limit: usize) -> &mut Extractor {
+ self.limit_repeat = limit;
+ // Return self to allow builder-style method chaining.
+ self
+ }
+
+ /// Configure a limit on the maximum length of any literal in a sequence.
+ ///
+ /// This is useful for limiting things like `(abcde){5}{5}{5}{5}`. While
+ /// each repetition or literal in that regex is small, when all the
+ /// repetitions are applied, one ends up with a literal of length `5^4 =
+ /// 625`.
+ ///
+ /// With this limit set, literals that exceed it will be made inexact and
+ /// thus prevented from growing.
+ ///
+ /// # Example
+ ///
+ /// This shows how to decrease the limit and compares it with the default.
+ ///
+ /// ```
+ /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse};
+ ///
+ /// let hir = parse(r"(abc){2}{2}{2}")?;
+ ///
+ /// let got = Extractor::new().extract(&hir);
+ /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]);
+ /// assert_eq!(expected, got);
+ ///
+ /// // Now let's shrink the limit and see how that changes things.
+ /// let got = Extractor::new().limit_literal_len(14).extract(&hir);
+ /// let expected = Seq::from_iter([
+ /// Literal::inexact("abcabcabcabcab"),
+ /// ]);
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+    pub fn limit_literal_len(&mut self, limit: usize) -> &mut Extractor {
+        self.limit_literal_len = limit;
+        // Builder style: return &mut self so configuration calls chain.
+        self
+    }
+
+ /// Configure a limit on the total number of literals that will be
+ /// returned.
+ ///
+ /// This is useful as a practical measure for avoiding the creation of
+ /// large sequences of literals. While the extractor will automatically
+ /// handle local creations of large sequences (for example, `[A-Z]` yields
+ /// an infinite sequence by default), large sequences can be created
+ /// through non-local means as well.
+ ///
+ /// For example, `[ab]{3}{3}` would yield a sequence of length `512 = 2^9`
+ /// despite each of the repetitions being small on their own. This limit
+ /// thus represents a "catch all" for avoiding locally small sequences from
+ /// combining into large sequences.
+ ///
+ /// # Example
+ ///
+ /// This example shows how reducing the limit will change the literal
+ /// sequence returned.
+ ///
+ /// ```
+ /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse};
+ ///
+ /// let hir = parse(r"[ab]{2}{2}")?;
+ ///
+ /// let got = Extractor::new().extract(&hir);
+ /// let expected = Seq::new([
+ /// "aaaa", "aaab", "aaba", "aabb",
+ /// "abaa", "abab", "abba", "abbb",
+ /// "baaa", "baab", "baba", "babb",
+ /// "bbaa", "bbab", "bbba", "bbbb",
+ /// ]);
+ /// assert_eq!(expected, got);
+ ///
+ /// // The default limit is not too big, but big enough to extract all
+ /// // literals from '[ab]{2}{2}'. If we shrink the limit to less than 16,
+ /// // then we'll get a truncated set. Notice that it returns a sequence of
+ /// // length 4 even though our limit was 10. This is because the sequence
+ /// // is difficult to increase without blowing the limit. Notice also
+ /// // that every literal in the sequence is now inexact because they were
+ /// // stripped of some suffix.
+ /// let got = Extractor::new().limit_total(10).extract(&hir);
+ /// let expected = Seq::from_iter([
+ /// Literal::inexact("aa"),
+ /// Literal::inexact("ab"),
+ /// Literal::inexact("ba"),
+ /// Literal::inexact("bb"),
+ /// ]);
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+    pub fn limit_total(&mut self, limit: usize) -> &mut Extractor {
+        self.limit_total = limit;
+        // Builder style: return &mut self so configuration calls chain.
+        self
+    }
+
+    /// Extract a sequence from the given concatenation. Sequences from each of
+    /// the child HIR expressions are combined via cross product.
+    ///
+    /// This short circuits once the cross product turns into a sequence
+    /// containing only inexact literals.
+    fn extract_concat<'a, I: Iterator<Item = &'a Hir>>(&self, it: I) -> Seq {
+        let mut acc = Seq::singleton(self::Literal::exact(vec![]));
+        for child in it {
+            // Once every member of the accumulator is inexact (which also
+            // covers the infinite case), crossing anything more onto it is a
+            // no-op, so we can stop early.
+            if acc.is_inexact() {
+                break;
+            }
+            // 'cross' itself dispatches on whether we're extracting prefixes
+            // or suffixes.
+            acc = self.cross(acc, &mut self.extract(child));
+        }
+        acc
+    }
+
+    /// Extract a sequence from the given alternation.
+    ///
+    /// This short circuits once the union turns into an infinite sequence.
+    fn extract_alternation<'a, I: Iterator<Item = &'a Hir>>(
+        &self,
+        it: I,
+    ) -> Seq {
+        let mut acc = Seq::empty();
+        for branch in it {
+            // An infinite sequence absorbs anything unioned into it, so once
+            // the accumulator goes infinite it can never change again and we
+            // can short-circuit.
+            if !acc.is_finite() {
+                break;
+            }
+            acc = self.union(acc, &mut self.extract(branch));
+        }
+        acc
+    }
+
+ /// Extract a sequence of literals from the given repetition. We do our
+    /// best. Some examples:
+ ///
+ /// 'a*' => [inexact(a), exact("")]
+ /// 'a*?' => [exact(""), inexact(a)]
+ /// 'a+' => [inexact(a)]
+ /// 'a{3}' => [exact(aaa)]
+    /// 'a{3,5}' => [inexact(aaa)]
+ ///
+ /// The key here really is making sure we get the 'inexact' vs 'exact'
+ /// attributes correct on each of the literals we add. For example, the
+ /// fact that 'a*' gives us an inexact 'a' and an exact empty string means
+ /// that a regex like 'ab*c' will result in [inexact(ab), exact(ac)]
+ /// literals being extracted, which might actually be a better prefilter
+ /// than just 'a'.
+    fn extract_repetition(&self, rep: &hir::Repetition) -> Seq {
+        let mut subseq = self.extract(&rep.sub);
+        match *rep {
+            hir::Repetition { min: 0, max, greedy, .. } => {
+                // When 'max=1', we can retain exactness, since 'a?' is
+                // equivalent to 'a|'. Similarly below, 'a??' is equivalent to
+                // '|a'.
+                if max != Some(1) {
+                    subseq.make_inexact();
+                }
+                let mut empty = Seq::singleton(Literal::exact(vec![]));
+                if !greedy {
+                    // Non-greedy repetitions prefer the empty match first, so
+                    // swap the operands to preserve preference order in the
+                    // union below.
+                    mem::swap(&mut subseq, &mut empty);
+                }
+                self.union(subseq, &mut empty)
+            }
+            hir::Repetition { min, max: Some(max), .. } if min == max => {
+                assert!(min > 0); // handled above
+                // 'a{n}': cross the sub-sequence with itself n times, capped
+                // by the configured repetition limit.
+                let limit =
+                    u32::try_from(self.limit_repeat).unwrap_or(u32::MAX);
+                let mut seq = Seq::singleton(Literal::exact(vec![]));
+                for _ in 0..cmp::min(min, limit) {
+                    if seq.is_inexact() {
+                        break;
+                    }
+                    seq = self.cross(seq, &mut subseq.clone());
+                }
+                if usize::try_from(min).is_err() || min > limit {
+                    // The repetition was truncated, so the literals no longer
+                    // cover the full match and must be marked inexact.
+                    seq.make_inexact();
+                }
+                seq
+            }
+            hir::Repetition { min, .. } => {
+                assert!(min > 0); // handled above
+                let limit =
+                    u32::try_from(self.limit_repeat).unwrap_or(u32::MAX);
+                let mut seq = Seq::singleton(Literal::exact(vec![]));
+                for _ in 0..cmp::min(min, limit) {
+                    if seq.is_inexact() {
+                        break;
+                    }
+                    seq = self.cross(seq, &mut subseq.clone());
+                }
+                // An unbounded (or truncated) repetition can always match
+                // more, so the extracted literals are at best prefixes.
+                seq.make_inexact();
+                seq
+            }
+        }
+    }
+
+    /// Convert the given Unicode class into a sequence of literals if the
+    /// class is small enough. If the class is too big, return an infinite
+    /// sequence.
+    fn extract_class_unicode(&self, cls: &hir::ClassUnicode) -> Seq {
+        if self.class_over_limit_unicode(cls) {
+            return Seq::infinite();
+        }
+        let mut seq = Seq::empty();
+        // Enumerate every codepoint in every range of the class.
+        for ch in cls.iter().flat_map(|r| r.start()..=r.end()) {
+            seq.push(Literal::from(ch));
+        }
+        self.enforce_literal_len(&mut seq);
+        seq
+    }
+
+    /// Convert the given byte class into a sequence of literals if the class
+    /// is small enough. If the class is too big, return an infinite sequence.
+    fn extract_class_bytes(&self, cls: &hir::ClassBytes) -> Seq {
+        if self.class_over_limit_bytes(cls) {
+            return Seq::infinite();
+        }
+        let mut seq = Seq::empty();
+        // Enumerate every byte in every range of the class.
+        for byte in cls.iter().flat_map(|r| r.start()..=r.end()) {
+            seq.push(Literal::from(byte));
+        }
+        self.enforce_literal_len(&mut seq);
+        seq
+    }
+
+    /// Returns true if the given Unicode class exceeds the configured limits
+    /// on this extractor.
+    fn class_over_limit_unicode(&self, cls: &hir::ClassUnicode) -> bool {
+        let limit = self.limit_class;
+        let mut total = 0;
+        for range in cls.iter() {
+            // Bail as soon as the running total has already blown the limit;
+            // no need to finish counting.
+            if total > limit {
+                return true;
+            }
+            total += range.len();
+        }
+        total > limit
+    }
+
+    /// Returns true if the given byte class exceeds the configured limits on
+    /// this extractor.
+    fn class_over_limit_bytes(&self, cls: &hir::ClassBytes) -> bool {
+        let limit = self.limit_class;
+        let mut total = 0;
+        for range in cls.iter() {
+            // Bail as soon as the running total has already blown the limit;
+            // no need to finish counting.
+            if total > limit {
+                return true;
+            }
+            total += range.len();
+        }
+        total > limit
+    }
+
+    /// Compute the cross product of the two sequences if the result would be
+    /// within configured limits. Otherwise, make `seq2` infinite and cross the
+    /// infinite sequence with `seq1`.
+    fn cross(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq {
+        let too_big = seq1
+            .max_cross_len(seq2)
+            .map_or(false, |len| len > self.limit_total);
+        if too_big {
+            seq2.make_infinite();
+        }
+        // The direction of the cross product depends on whether we're
+        // extracting prefixes or suffixes.
+        match self.kind {
+            ExtractKind::Suffix => seq1.cross_reverse(seq2),
+            ExtractKind::Prefix => seq1.cross_forward(seq2),
+        }
+        assert!(seq1.len().map_or(true, |x| x <= self.limit_total));
+        self.enforce_literal_len(&mut seq1);
+        seq1
+    }
+
+    /// Union the two sequences if the result would be within configured
+    /// limits. Otherwise, make `seq2` infinite and union the infinite sequence
+    /// with `seq1`.
+    fn union(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq {
+        if seq1.max_union_len(seq2).map_or(false, |len| len > self.limit_total)
+        {
+            // We try to trim our literal sequences to see if we can make
+            // room for more literals. The idea is that we'd rather trim down
+            // literals already in our sequence if it means we can add a few
+            // more and retain a finite sequence. Otherwise, we'll union with
+            // an infinite sequence and that infects everything and effectively
+            // stops literal extraction in its tracks.
+            //
+            // Why do we keep 4 bytes here? Well, it's a bit of an abstraction
+            // leakage. Downstream, the literals may wind up getting fed to
+            // the Teddy algorithm, which supports searching literals up to
+            // length 4. So that's why we pick that number here. Arguably this
+            // should be a tuneable parameter, but it seems a little tricky to
+            // describe. And I'm still unsure if this is the right way to go
+            // about culling literal sequences.
+            match self.kind {
+                ExtractKind::Prefix => {
+                    seq1.keep_first_bytes(4);
+                    seq2.keep_first_bytes(4);
+                }
+                ExtractKind::Suffix => {
+                    seq1.keep_last_bytes(4);
+                    seq2.keep_last_bytes(4);
+                }
+            }
+            // Trimming may have created adjacent duplicates, so dedupe before
+            // re-checking whether the union now fits under the limit.
+            seq1.dedup();
+            seq2.dedup();
+            if seq1
+                .max_union_len(seq2)
+                .map_or(false, |len| len > self.limit_total)
+            {
+                seq2.make_infinite();
+            }
+        }
+        seq1.union(seq2);
+        assert!(seq1.len().map_or(true, |x| x <= self.limit_total));
+        seq1
+    }
+
+    /// Applies the literal length limit to the given sequence. If none of the
+    /// literals in the sequence exceed the limit, then this is a no-op.
+    fn enforce_literal_len(&self, seq: &mut Seq) {
+        let max_len = self.limit_literal_len;
+        // Prefixes keep their leading bytes; suffixes keep their trailing
+        // bytes.
+        match self.kind {
+            ExtractKind::Prefix => seq.keep_first_bytes(max_len),
+            ExtractKind::Suffix => seq.keep_last_bytes(max_len),
+        }
+    }
+}
+
+impl Default for Extractor {
+    fn default() -> Extractor {
+        // Defer to 'new' so default configuration lives in one place.
+        Extractor::new()
+    }
+}
+
+/// The kind of literals to extract from an [`Hir`] expression.
+///
+/// The default extraction kind is `Prefix`.
+// 'non_exhaustive' leaves room to add more extraction kinds later without a
+// semver-breaking change for downstream matchers on this enum.
+#[non_exhaustive]
+#[derive(Clone, Debug)]
+pub enum ExtractKind {
+    /// Extracts only prefix literals from a regex.
+    Prefix,
+    /// Extracts only suffix literals from a regex.
+    ///
+    /// Note that the sequence returned by suffix literals currently may
+    /// not correctly represent leftmost-first or "preference" order match
+    /// semantics.
+    Suffix,
+}
+
+impl ExtractKind {
+    /// Returns true if this kind is the `Prefix` variant.
+    pub fn is_prefix(&self) -> bool {
+        match *self {
+            ExtractKind::Prefix => true,
+            ExtractKind::Suffix => false,
+        }
+    }
+
+    /// Returns true if this kind is the `Suffix` variant.
+    pub fn is_suffix(&self) -> bool {
+        match *self {
+            ExtractKind::Suffix => true,
+            ExtractKind::Prefix => false,
+        }
+    }
+}
+
+impl Default for ExtractKind {
+    fn default() -> ExtractKind {
+        // Prefix extraction is the default, as documented on the enum.
+        ExtractKind::Prefix
+    }
+}
+
+/// A sequence of literals.
+///
+/// A `Seq` is very much like a set in that it represents a union of its
+/// members. That is, it corresponds to a set of literals where at least one
+/// must match in order for a particular [`Hir`] expression to match. (Whether
+/// this corresponds to the entire `Hir` expression, a prefix of it or a suffix
+/// of it depends on how the `Seq` was extracted from the `Hir`.)
+///
+/// It is also unlike a set in that multiple identical literals may appear,
+/// and that the order of the literals in the `Seq` matters. For example, if
+/// the sequence is `[sam, samwise]` and leftmost-first matching is used, then
+/// `samwise` can never match and the sequence is equivalent to `[sam]`.
+///
+/// # States of a sequence
+///
+/// A `Seq` has a few different logical states to consider:
+///
+/// * The sequence can represent "any" literal. When this happens, the set does
+/// not have a finite size. The purpose of this state is to inhibit callers
+/// from making assumptions about what literals are required in order to match
+/// a particular [`Hir`] expression. Generally speaking, when a set is in this
+/// state, literal optimizations are inhibited. A good example of a regex that
+/// will cause this sort of set to appear is `[A-Za-z]`. The character class
+/// is just too big (and also too narrow) to be usefully expanded into 52
+/// different literals. (Note that the decision for when a seq should become
+/// infinite is determined by the caller. A seq itself has no hard-coded
+/// limits.)
+/// * The sequence can be empty, in which case, it is an affirmative statement
+/// that there are no literals that can match the corresponding `Hir`.
+/// Consequently, the `Hir` never matches any input. For example, `[a&&b]`.
+/// * The sequence can be non-empty, in which case, at least one of the
+/// literals must match in order for the corresponding `Hir` to match.
+///
+/// # Example
+///
+/// This example shows how literal sequences can be simplified by stripping
+/// suffixes and minimizing while maintaining preference order.
+///
+/// ```
+/// use regex_syntax::hir::literal::{Literal, Seq};
+///
+/// let mut seq = Seq::new(&[
+/// "farm",
+/// "appliance",
+/// "faraway",
+/// "apple",
+/// "fare",
+/// "gap",
+/// "applicant",
+/// "applaud",
+/// ]);
+/// seq.keep_first_bytes(3);
+/// seq.minimize_by_preference();
+/// // Notice that 'far' comes before 'app', which matches the order in the
+/// // original sequence. This guarantees that leftmost-first semantics are
+/// // not altered by simplifying the set.
+/// let expected = Seq::from_iter([
+/// Literal::inexact("far"),
+/// Literal::inexact("app"),
+/// Literal::exact("gap"),
+/// ]);
+/// assert_eq!(expected, seq);
+/// ```
+// NOTE(review): 'Debug' is deliberately not in the derive list here —
+// presumably a manual impl exists elsewhere in this file; confirm before
+// adding it to the derives.
+#[derive(Clone, Eq, PartialEq)]
+pub struct Seq {
+    /// The members of this seq.
+    ///
+    /// When `None`, the seq represents all possible literals. That is, it
+    /// prevents one from making assumptions about specific literals in the
+    /// seq, and forces one to treat it as if any literal might be in the seq.
+    ///
+    /// Note that `Some(vec![])` is valid and corresponds to the empty seq of
+    /// literals, i.e., a regex that can never match. For example, `[a&&b]`.
+    /// It is distinct from `Some(vec![""])`, which corresponds to the seq
+    /// containing an empty string, which matches at every position.
+    literals: Option<Vec<Literal>>,
+}
+
+impl Seq {
+    /// Returns an empty sequence.
+    ///
+    /// An empty sequence contains no literals at all, and thus corresponds
+    /// to a regex that itself can never match anything.
+    #[inline]
+    pub fn empty() -> Seq {
+        Seq { literals: Some(Vec::new()) }
+    }
+
+ /// Returns a sequence of literals without a finite size and may contain
+ /// any literal.
+ ///
+ /// A sequence without finite size does not reveal anything about the
+ /// characteristics of the literals in its set. There are no fixed prefixes
+ /// or suffixes, nor are lower or upper bounds on the length of the literals
+ /// in the set known.
+ ///
+ /// This is useful to represent constructs in a regex that are "too big"
+    /// to usefully represent as a sequence of literals. For example, `[A-Za-z]`.
+ /// When sequences get too big, they lose their discriminating nature and
+ /// are more likely to produce false positives, which in turn makes them
+ /// less likely to speed up searches.
+ ///
+ /// More pragmatically, for many regexes, enumerating all possible literals
+ /// is itself not possible or might otherwise use too many resources. So
+ /// constraining the size of sets during extraction is a practical trade
+ /// off to make.
+    #[inline]
+    pub fn infinite() -> Seq {
+        // 'None' is the sentinel meaning "any literal could match".
+        Seq { literals: None }
+    }
+
+    /// Returns a sequence containing a single literal.
+    #[inline]
+    pub fn singleton(lit: Literal) -> Seq {
+        // A finite sequence with exactly one member.
+        Seq { literals: Some(vec![lit]) }
+    }
+
+    /// Returns a sequence of exact literals from the given byte strings.
+    #[inline]
+    pub fn new<I, B>(it: I) -> Seq
+    where
+        I: IntoIterator<Item = B>,
+        B: AsRef<[u8]>,
+    {
+        // Relies on Seq's 'FromIterator' impl (not shown here) to build the
+        // sequence from exact literals.
+        it.into_iter().map(|b| Literal::exact(b.as_ref())).collect()
+    }
+
+    /// If this is a finite sequence, return its members as a slice of
+    /// literals.
+    ///
+    /// The slice returned may be empty, in which case, there are no literals
+    /// that can match this sequence.
+    #[inline]
+    pub fn literals(&self) -> Option<&[Literal]> {
+        // 'as_deref' turns Option<Vec<Literal>> into Option<&[Literal]>.
+        self.literals.as_deref()
+    }
+
+    /// Appends a literal to this sequence.
+    ///
+    /// This does nothing when the sequence is infinite.
+    ///
+    /// It also does nothing when the given literal is equivalent to the
+    /// literal most recently added: adjacent duplicates never add any
+    /// information, and skipping them here is a cheap reflection of the
+    /// seq's "set like" behavior.
+    #[inline]
+    pub fn push(&mut self, lit: Literal) {
+        if let Some(ref mut lits) = self.literals {
+            let adjacent_dup = lits.last().map_or(false, |last| last == &lit);
+            if !adjacent_dup {
+                lits.push(lit);
+            }
+        }
+    }
+
+    /// Marks every literal in this sequence as inexact.
+    ///
+    /// Infinite sequences are left untouched.
+    #[inline]
+    pub fn make_inexact(&mut self) {
+        if let Some(ref mut lits) = self.literals {
+            lits.iter_mut().for_each(|lit| lit.make_inexact());
+        }
+    }
+
+    /// Converts this sequence to an infinite sequence.
+    ///
+    /// This is a no-op if the sequence is already infinite.
+    #[inline]
+    pub fn make_infinite(&mut self) {
+        // Dropping the vector discards all knowledge of specific members.
+        self.literals = None;
+    }
+
+ /// Modify this sequence to contain the cross product between it and the
+ /// sequence given.
+ ///
+ /// The cross product only considers literals in this sequence that are
+ /// exact. That is, inexact literals are not extended.
+ ///
+ /// The literals are always drained from `other`, even if none are used.
+ /// This permits callers to reuse the sequence allocation elsewhere.
+ ///
+ /// If this sequence is infinite, then this is a no-op, regardless of what
+ /// `other` contains (and in this case, the literals are still drained from
+ /// `other`). If `other` is infinite and this sequence is finite, then this
+ /// is a no-op, unless this sequence contains a zero-length literal. In
+ /// which case, the infiniteness of `other` infects this sequence, and this
+ /// sequence is itself made infinite.
+ ///
+ /// Like [`Seq::union`], this may attempt to deduplicate literals. See
+ /// [`Seq::dedup`] for how deduplication deals with exact and inexact
+ /// literals.
+ ///
+ /// # Example
+ ///
+ /// This example shows basic usage and how exact and inexact literals
+ /// interact.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq1 = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// let mut seq2 = Seq::from_iter([
+ /// Literal::inexact("quux"),
+ /// Literal::exact("baz"),
+ /// ]);
+ /// seq1.cross_forward(&mut seq2);
+ ///
+ /// // The literals are pulled out of seq2.
+ /// assert_eq!(Some(0), seq2.len());
+ ///
+ /// let expected = Seq::from_iter([
+ /// Literal::inexact("fooquux"),
+ /// Literal::exact("foobaz"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// assert_eq!(expected, seq1);
+ /// ```
+ ///
+ /// This example shows the behavior of when `other` is an infinite
+ /// sequence.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq1 = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// let mut seq2 = Seq::infinite();
+ /// seq1.cross_forward(&mut seq2);
+ ///
+ /// // When seq2 is infinite, cross product doesn't add anything, but
+ /// // ensures all members of seq1 are inexact.
+ /// let expected = Seq::from_iter([
+ /// Literal::inexact("foo"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// assert_eq!(expected, seq1);
+ /// ```
+ ///
+ /// This example is like the one above, but shows what happens when this
+ /// sequence contains an empty string. In this case, an infinite `other`
+ /// sequence infects this sequence (because the empty string means that
+ /// there are no finite prefixes):
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq1 = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::exact(""), // inexact provokes same behavior
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// let mut seq2 = Seq::infinite();
+ /// seq1.cross_forward(&mut seq2);
+ ///
+ /// // seq1 is now infinite!
+ /// assert!(!seq1.is_finite());
+ /// ```
+ ///
+    /// This example shows the behavior when this sequence is infinite.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq1 = Seq::infinite();
+ /// let mut seq2 = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// seq1.cross_forward(&mut seq2);
+ ///
+ /// // seq1 remains unchanged.
+ /// assert!(!seq1.is_finite());
+ /// // Even though the literals in seq2 weren't used, it was still drained.
+ /// assert_eq!(Some(0), seq2.len());
+ /// ```
+    #[inline]
+    pub fn cross_forward(&mut self, other: &mut Seq) {
+        // The preamble handles both infinite cases and guarantees 'other' is
+        // drained; 'None' means there is nothing further to do.
+        let (lits1, lits2) = match self.cross_preamble(other) {
+            None => return,
+            Some((lits1, lits2)) => (lits1, lits2),
+        };
+        let newcap = lits1.len().saturating_mul(lits2.len());
+        for selflit in mem::replace(lits1, Vec::with_capacity(newcap)) {
+            // Inexact literals cannot be extended, so carry them over as-is.
+            if !selflit.is_exact() {
+                lits1.push(selflit);
+                continue;
+            }
+            for otherlit in lits2.iter() {
+                let mut newlit = Literal::exact(Vec::with_capacity(
+                    selflit.len() + otherlit.len(),
+                ));
+                newlit.extend(&selflit);
+                newlit.extend(&otherlit);
+                // The concatenation is only exact if both halves are.
+                if !otherlit.is_exact() {
+                    newlit.make_inexact();
+                }
+                lits1.push(newlit);
+            }
+        }
+        lits2.drain(..);
+        self.dedup();
+    }
+
+ /// Modify this sequence to contain the cross product between it and
+ /// the sequence given, where the sequences are treated as suffixes
+ /// instead of prefixes. Namely, the sequence `other` is *prepended*
+ /// to `self` (as opposed to `other` being *appended* to `self` in
+ /// [`Seq::cross_forward`]).
+ ///
+ /// The cross product only considers literals in this sequence that are
+ /// exact. That is, inexact literals are not extended.
+ ///
+ /// The literals are always drained from `other`, even if none are used.
+ /// This permits callers to reuse the sequence allocation elsewhere.
+ ///
+ /// If this sequence is infinite, then this is a no-op, regardless of what
+ /// `other` contains (and in this case, the literals are still drained from
+ /// `other`). If `other` is infinite and this sequence is finite, then this
+ /// is a no-op, unless this sequence contains a zero-length literal. In
+ /// which case, the infiniteness of `other` infects this sequence, and this
+ /// sequence is itself made infinite.
+ ///
+ /// Like [`Seq::union`], this may attempt to deduplicate literals. See
+ /// [`Seq::dedup`] for how deduplication deals with exact and inexact
+ /// literals.
+ ///
+ /// # Example
+ ///
+ /// This example shows basic usage and how exact and inexact literals
+ /// interact.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq1 = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// let mut seq2 = Seq::from_iter([
+ /// Literal::inexact("quux"),
+ /// Literal::exact("baz"),
+ /// ]);
+ /// seq1.cross_reverse(&mut seq2);
+ ///
+ /// // The literals are pulled out of seq2.
+ /// assert_eq!(Some(0), seq2.len());
+ ///
+ /// let expected = Seq::from_iter([
+ /// Literal::inexact("quuxfoo"),
+ /// Literal::inexact("bar"),
+ /// Literal::exact("bazfoo"),
+ /// ]);
+ /// assert_eq!(expected, seq1);
+ /// ```
+ ///
+ /// This example shows the behavior of when `other` is an infinite
+ /// sequence.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq1 = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// let mut seq2 = Seq::infinite();
+ /// seq1.cross_reverse(&mut seq2);
+ ///
+ /// // When seq2 is infinite, cross product doesn't add anything, but
+ /// // ensures all members of seq1 are inexact.
+ /// let expected = Seq::from_iter([
+ /// Literal::inexact("foo"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// assert_eq!(expected, seq1);
+ /// ```
+ ///
+ /// This example is like the one above, but shows what happens when this
+ /// sequence contains an empty string. In this case, an infinite `other`
+ /// sequence infects this sequence (because the empty string means that
+ /// there are no finite suffixes):
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq1 = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::exact(""), // inexact provokes same behavior
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// let mut seq2 = Seq::infinite();
+ /// seq1.cross_reverse(&mut seq2);
+ ///
+ /// // seq1 is now infinite!
+ /// assert!(!seq1.is_finite());
+ /// ```
+ ///
+ /// This example shows the behavior when this sequence is infinite.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq1 = Seq::infinite();
+ /// let mut seq2 = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// seq1.cross_reverse(&mut seq2);
+ ///
+ /// // seq1 remains unchanged.
+ /// assert!(!seq1.is_finite());
+ /// // Even though the literals in seq2 weren't used, it was still drained.
+ /// assert_eq!(Some(0), seq2.len());
+ /// ```
+    #[inline]
+    pub fn cross_reverse(&mut self, other: &mut Seq) {
+        // The preamble handles both infinite cases and guarantees 'other' is
+        // drained; 'None' means there is nothing further to do.
+        let (lits1, lits2) = match self.cross_preamble(other) {
+            None => return,
+            Some((lits1, lits2)) => (lits1, lits2),
+        };
+        // We basically proceed as we do in 'cross_forward' at this point,
+        // except that the outer loop is now 'other' and the inner loop is now
+        // 'self'. That's because 'self' corresponds to suffixes and 'other'
+        // corresponds to the sequence we want to *prepend* to the suffixes.
+        let newcap = lits1.len().saturating_mul(lits2.len());
+        let selflits = mem::replace(lits1, Vec::with_capacity(newcap));
+        for (i, otherlit) in lits2.drain(..).enumerate() {
+            for selflit in selflits.iter() {
+                if !selflit.is_exact() {
+                    // If the suffix isn't exact, then we can't prepend
+                    // anything to it. However, we still want to keep it. But
+                    // we only want to keep one of them, to avoid duplication.
+                    // (The duplication is okay from a correctness perspective,
+                    // but wasteful.) Only the first pass of the outer loop,
+                    // i == 0, retains it.
+                    if i == 0 {
+                        lits1.push(selflit.clone());
+                    }
+                    continue;
+                }
+                let mut newlit = Literal::exact(Vec::with_capacity(
+                    otherlit.len() + selflit.len(),
+                ));
+                // Prepend: the bytes from 'other' come first, then the suffix.
+                newlit.extend(&otherlit);
+                newlit.extend(&selflit);
+                if !otherlit.is_exact() {
+                    newlit.make_inexact();
+                }
+                lits1.push(newlit);
+            }
+        }
+        self.dedup();
+    }
+
+    /// A helper function that corresponds to the subtle preamble for both
+    /// `cross_forward` and `cross_reverse`. In effect, it handles the cases
+    /// of infinite sequences for both `self` and `other`, as well as ensuring
+    /// that literals from `other` are drained even if they aren't used.
+    ///
+    /// Returns `None` when either side was infinite (with the appropriate
+    /// fix-ups already applied in place), and otherwise mutable borrows of
+    /// both finite literal vectors.
+    fn cross_preamble<'a>(
+        &'a mut self,
+        other: &'a mut Seq,
+    ) -> Option<(&'a mut Vec<Literal>, &'a mut Vec<Literal>)> {
+        let lits2 = match other.literals {
+            None => {
+                // If our current seq contains the empty string and the seq
+                // we're adding matches any literal, then it follows that the
+                // current seq must now also match any literal.
+                //
+                // Otherwise, we just have to make sure everything in this
+                // sequence is inexact.
+                if self.min_literal_len() == Some(0) {
+                    *self = Seq::infinite();
+                } else {
+                    self.make_inexact();
+                }
+                return None;
+            }
+            Some(ref mut lits) => lits,
+        };
+        let lits1 = match self.literals {
+            None => {
+                // If we aren't going to make it to the end of this routine
+                // where lits2 is drained, then we need to do it now.
+                lits2.drain(..);
+                return None;
+            }
+            Some(ref mut lits) => lits,
+        };
+        Some((lits1, lits2))
+    }
+
+ /// Unions the `other` sequence into this one.
+ ///
+ /// The literals are always drained out of the given `other` sequence,
+ /// even if they are being unioned into an infinite sequence. This permits
+ /// the caller to reuse the `other` sequence in another context.
+ ///
+ /// Some literal deduping may be performed. If any deduping happens,
+ /// any leftmost-first or "preference" order match semantics will be
+ /// preserved.
+ ///
+ /// # Example
+ ///
+ /// This example shows basic usage.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::Seq;
+ ///
+ /// let mut seq1 = Seq::new(&["foo", "bar"]);
+ /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]);
+ /// seq1.union(&mut seq2);
+ ///
+ /// // The literals are pulled out of seq2.
+ /// assert_eq!(Some(0), seq2.len());
+ ///
+ /// // Adjacent literals are deduped, but non-adjacent literals may not be.
+ /// assert_eq!(Seq::new(&["foo", "bar", "quux", "foo"]), seq1);
+ /// ```
+ ///
+ /// This example shows that literals are drained from `other` even when
+ /// they aren't necessarily used.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::Seq;
+ ///
+ /// let mut seq1 = Seq::infinite();
+ /// // Infinite sequences have no finite length.
+ /// assert_eq!(None, seq1.len());
+ ///
+ /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]);
+ /// seq1.union(&mut seq2);
+ ///
+ /// // seq1 is still infinite and seq2 has been drained.
+ /// assert_eq!(None, seq1.len());
+ /// assert_eq!(Some(0), seq2.len());
+ /// ```
+    #[inline]
+    pub fn union(&mut self, other: &mut Seq) {
+        let lits2 = match other.literals {
+            None => {
+                // Unioning with an infinite sequence always results in an
+                // infinite sequence.
+                self.make_infinite();
+                return;
+            }
+            // Creating the 'Drain' here guarantees 'other' is emptied even
+            // if we return early below: dropping a 'Drain' removes any
+            // remaining elements from the vector.
+            Some(ref mut lits) => lits.drain(..),
+        };
+        let lits1 = match self.literals {
+            None => return,
+            Some(ref mut lits) => lits,
+        };
+        lits1.extend(lits2);
+        self.dedup();
+    }
+
+ /// Unions the `other` sequence into this one by splice the `other`
+ /// sequence at the position of the first zero-length literal.
+ ///
+ /// This is useful for preserving preference order semantics when combining
+ /// two literal sequences. For example, in the regex `(a||f)+foo`, the
+ /// correct preference order prefix sequence is `[a, foo, f]`.
+ ///
+ /// The literals are always drained out of the given `other` sequence,
+ /// even if they are being unioned into an infinite sequence. This permits
+ /// the caller to reuse the `other` sequence in another context. Note that
+ /// the literals are drained even if no union is performed as well, i.e.,
+ /// when this sequence does not contain a zero-length literal.
+ ///
+ /// Some literal deduping may be performed. If any deduping happens,
+ /// any leftmost-first or "preference" order match semantics will be
+ /// preserved.
+ ///
+ /// # Example
+ ///
+ /// This example shows basic usage.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::Seq;
+ ///
+ /// let mut seq1 = Seq::new(&["a", "", "f", ""]);
+ /// let mut seq2 = Seq::new(&["foo"]);
+ /// seq1.union_into_empty(&mut seq2);
+ ///
+ /// // The literals are pulled out of seq2.
+ /// assert_eq!(Some(0), seq2.len());
+ /// // 'foo' gets spliced into seq1 where the first empty string occurs.
+ /// assert_eq!(Seq::new(&["a", "foo", "f"]), seq1);
+ /// ```
+ ///
+ /// This example shows that literals are drained from `other` even when
+ /// they aren't necessarily used.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::Seq;
+ ///
+ /// let mut seq1 = Seq::new(&["foo", "bar"]);
+ /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]);
+ /// seq1.union_into_empty(&mut seq2);
+ ///
+ /// // seq1 has no zero length literals, so no splicing happens.
+ /// assert_eq!(Seq::new(&["foo", "bar"]), seq1);
+ /// // Even though no splicing happens, seq2 is still drained.
+ /// assert_eq!(Some(0), seq2.len());
+ /// ```
+ #[inline]
+ pub fn union_into_empty(&mut self, other: &mut Seq) {
+ // Begin draining `other` immediately so that its literals are removed
+ // (when the `Drain` is dropped) on every path below, including the
+ // early returns where no splicing happens.
+ let lits2 = other.literals.as_mut().map(|lits| lits.drain(..));
+ let lits1 = match self.literals {
+ None => return,
+ Some(ref mut lits) => lits,
+ };
+ // The splice point is the position of the first zero-length literal.
+ // Without one, no union is performed at all.
+ let first_empty = match lits1.iter().position(|m| m.is_empty()) {
+ None => return,
+ Some(i) => i,
+ };
+ let lits2 = match lits2 {
+ None => {
+ // Note that we are only here if we've found an empty literal,
+ // which implies that an infinite sequence infects this seq and
+ // also turns it into an infinite sequence.
+ self.literals = None;
+ return;
+ }
+ Some(lits) => lits,
+ };
+ // Clearing out the empties needs to come before the splice because
+ // the splice might add more empties that we don't want to get rid
+ // of. Since we're splicing into the position of the first empty, the
+ // 'first_empty' position computed above is still correct.
+ lits1.retain(|m| !m.is_empty());
+ lits1.splice(first_empty..first_empty, lits2);
+ self.dedup();
+ }
+
+ /// Deduplicate adjacent equivalent literals in this sequence.
+ ///
+ /// If adjacent literals are equivalent strings but one is exact and the
+ /// other inexact, the inexact literal is kept and the exact one is
+ /// removed.
+ ///
+ /// Deduping an infinite sequence is a no-op.
+ ///
+ /// # Example
+ ///
+ /// This example shows how literals that are duplicate byte strings but
+ /// are not equivalent with respect to exactness are resolved.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::inexact("foo"),
+ /// ]);
+ /// seq.dedup();
+ ///
+ /// assert_eq!(Seq::from_iter([Literal::inexact("foo")]), seq);
+ /// ```
+ #[inline]
+ pub fn dedup(&mut self) {
+ if let Some(ref mut lits) = self.literals {
+ // `dedup_by` passes the element that would be removed as the
+ // first argument and the element that is retained as the second.
+ lits.dedup_by(|lit1, lit2| {
+ if lit1.as_bytes() != lit2.as_bytes() {
+ return false;
+ }
+ if lit1.is_exact() != lit2.is_exact() {
+ // Exactness differs, so the surviving literal must become
+ // inexact. (Marking the removed one too is harmless.)
+ lit1.make_inexact();
+ lit2.make_inexact();
+ }
+ true
+ });
+ }
+ }
+
+ /// Sorts this sequence of literals lexicographically.
+ ///
+ /// Note that if, before sorting, a literal that is a prefix of another
+ /// literal appears after it, then after sorting, the sequence will not
+ /// represent the same preference order match semantics. For example,
+ /// sorting the sequence `[samwise, sam]` yields the sequence `[sam,
+ /// samwise]`. Under preference order semantics, the latter sequence will
+ /// never match `samwise` whereas the first sequence can.
+ ///
+ /// # Example
+ ///
+ /// This example shows basic usage.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::Seq;
+ ///
+ /// let mut seq = Seq::new(&["foo", "quux", "bar"]);
+ /// seq.sort();
+ ///
+ /// assert_eq!(Seq::new(&["bar", "foo", "quux"]), seq);
+ /// ```
+ #[inline]
+ pub fn sort(&mut self) {
+ if let Some(lits) = self.literals.as_mut() {
+ lits.sort();
+ }
+ }
+
+ /// Reverses all of the literals in this sequence.
+ ///
+ /// The order of the sequence itself is preserved.
+ ///
+ /// # Example
+ ///
+ /// This example shows basic usage.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::Seq;
+ ///
+ /// let mut seq = Seq::new(&["oof", "rab"]);
+ /// seq.reverse_literals();
+ /// assert_eq!(Seq::new(&["foo", "bar"]), seq);
+ /// ```
+ #[inline]
+ pub fn reverse_literals(&mut self) {
+ if let Some(lits) = self.literals.as_mut() {
+ lits.iter_mut().for_each(|lit| lit.reverse());
+ }
+ }
+
+ /// Shrinks this seq to its minimal size while respecting the preference
+ /// order of its literals.
+ ///
+ /// While this routine will remove duplicate literals from this seq, it
+ /// will also remove literals that can never match in a leftmost-first or
+ /// "preference order" search. Similar to [`Seq::dedup`], if a literal is
+ /// deduped, then the one that remains is made inexact.
+ ///
+ /// This is a no-op on seqs that are empty or not finite.
+ ///
+ /// # Example
+ ///
+ /// This example shows the difference between `{sam, samwise}` and
+ /// `{samwise, sam}`.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// // If 'sam' comes before 'samwise' and a preference order search is
+ /// // executed, then 'samwise' can never match.
+ /// let mut seq = Seq::new(&["sam", "samwise"]);
+ /// seq.minimize_by_preference();
+ /// assert_eq!(Seq::from_iter([Literal::inexact("sam")]), seq);
+ ///
+ /// // But if they are reversed, then it's possible for 'samwise' to match
+ /// // since it is given higher preference.
+ /// let mut seq = Seq::new(&["samwise", "sam"]);
+ /// seq.minimize_by_preference();
+ /// assert_eq!(Seq::new(&["samwise", "sam"]), seq);
+ /// ```
+ ///
+ /// This example shows that if an empty string is in this seq, then
+ /// anything that comes after it can never match.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// // An empty string is a prefix of all strings, so it automatically
+ /// // inhibits any subsequent strings from matching.
+ /// let mut seq = Seq::new(&["foo", "bar", "", "quux", "fox"]);
+ /// seq.minimize_by_preference();
+ /// let expected = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::exact("bar"),
+ /// Literal::inexact(""),
+ /// ]);
+ /// assert_eq!(expected, seq);
+ ///
+ /// // And of course, if it's at the beginning, then it makes it impossible
+ /// // for anything else to match.
+ /// let mut seq = Seq::new(&["", "foo", "quux", "fox"]);
+ /// seq.minimize_by_preference();
+ /// assert_eq!(Seq::from_iter([Literal::inexact("")]), seq);
+ /// ```
+ #[inline]
+ pub fn minimize_by_preference(&mut self) {
+ if let Some(lits) = self.literals.as_mut() {
+ PreferenceTrie::minimize(lits, false);
+ }
+ }
+
+ /// Trims all literals in this seq such that only the first `len` bytes
+ /// remain. If a literal has less than or equal to `len` bytes, then it
+ /// remains unchanged. Otherwise, it is trimmed and made inexact.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq = Seq::new(&["a", "foo", "quux"]);
+ /// seq.keep_first_bytes(2);
+ ///
+ /// let expected = Seq::from_iter([
+ /// Literal::exact("a"),
+ /// Literal::inexact("fo"),
+ /// Literal::inexact("qu"),
+ /// ]);
+ /// assert_eq!(expected, seq);
+ /// ```
+ #[inline]
+ pub fn keep_first_bytes(&mut self, len: usize) {
+ if let Some(lits) = self.literals.as_mut() {
+ lits.iter_mut().for_each(|lit| lit.keep_first_bytes(len));
+ }
+ }
+
+ /// Trims all literals in this seq such that only the last `len` bytes
+ /// remain. If a literal has less than or equal to `len` bytes, then it
+ /// remains unchanged. Otherwise, it is trimmed and made inexact.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq = Seq::new(&["a", "foo", "quux"]);
+ /// seq.keep_last_bytes(2);
+ ///
+ /// let expected = Seq::from_iter([
+ /// Literal::exact("a"),
+ /// Literal::inexact("oo"),
+ /// Literal::inexact("ux"),
+ /// ]);
+ /// assert_eq!(expected, seq);
+ /// ```
+ #[inline]
+ pub fn keep_last_bytes(&mut self, len: usize) {
+ if let Some(lits) = self.literals.as_mut() {
+ lits.iter_mut().for_each(|lit| lit.keep_last_bytes(len));
+ }
+ }
+
+ /// Returns true if this sequence is finite.
+ ///
+ /// When false, this sequence is infinite and must be treated as if it
+ /// contains every possible literal.
+ #[inline]
+ pub fn is_finite(&self) -> bool {
+ matches!(self.literals, Some(_))
+ }
+
+ /// Returns true if and only if this sequence is finite and empty.
+ ///
+ /// An empty sequence never matches anything. It can only be produced by
+ /// literal extraction when the corresponding regex itself cannot match.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.len().map_or(false, |len| len == 0)
+ }
+
+ /// Returns the number of literals in this sequence if the sequence is
+ /// finite. If the sequence is infinite, then `None` is returned.
+ #[inline]
+ pub fn len(&self) -> Option<usize> {
+ match self.literals {
+ None => None,
+ Some(ref lits) => Some(lits.len()),
+ }
+ }
+
+ /// Returns true if and only if all literals in this sequence are exact.
+ ///
+ /// This returns false if the sequence is infinite.
+ #[inline]
+ pub fn is_exact(&self) -> bool {
+ match self.literals() {
+ None => false,
+ Some(lits) => lits.iter().all(Literal::is_exact),
+ }
+ }
+
+ /// Returns true if and only if all literals in this sequence are inexact.
+ ///
+ /// This returns true if the sequence is infinite.
+ #[inline]
+ pub fn is_inexact(&self) -> bool {
+ match self.literals() {
+ None => true,
+ Some(lits) => lits.iter().all(|lit| !lit.is_exact()),
+ }
+ }
+
+ /// Return the maximum length of the sequence that would result from
+ /// unioning `self` with `other`. If either set is infinite, then this
+ /// returns `None`.
+ #[inline]
+ pub fn max_union_len(&self, other: &Seq) -> Option<usize> {
+ match (self.len(), other.len()) {
+ (Some(len1), Some(len2)) => Some(len1.saturating_add(len2)),
+ _ => None,
+ }
+ }
+
+ /// Return the maximum length of the sequence that would result from the
+ /// cross product of `self` with `other`. If either set is infinite, then
+ /// this returns `None`.
+ #[inline]
+ pub fn max_cross_len(&self, other: &Seq) -> Option<usize> {
+ match (self.len(), other.len()) {
+ (Some(len1), Some(len2)) => Some(len1.saturating_mul(len2)),
+ _ => None,
+ }
+ }
+
+ /// Returns the length of the shortest literal in this sequence.
+ ///
+ /// If the sequence is infinite or empty, then this returns `None`.
+ #[inline]
+ pub fn min_literal_len(&self) -> Option<usize> {
+ let lits = self.literals.as_ref()?;
+ lits.iter().map(Literal::len).min()
+ }
+
+ /// Returns the length of the longest literal in this sequence.
+ ///
+ /// If the sequence is infinite or empty, then this returns `None`.
+ #[inline]
+ pub fn max_literal_len(&self) -> Option<usize> {
+ let lits = self.literals.as_ref()?;
+ lits.iter().map(Literal::len).max()
+ }
+
+ /// Returns the longest common prefix from this seq.
+ ///
+ /// If the seq matches any literal or otherwise contains no literals, then
+ /// there is no meaningful prefix and this returns `None`.
+ ///
+ /// # Example
+ ///
+ /// This shows some example seqs and their longest common prefix.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::Seq;
+ ///
+ /// let seq = Seq::new(&["foo", "foobar", "fo"]);
+ /// assert_eq!(Some(&b"fo"[..]), seq.longest_common_prefix());
+ /// let seq = Seq::new(&["foo", "foo"]);
+ /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_prefix());
+ /// let seq = Seq::new(&["foo", "bar"]);
+ /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix());
+ /// let seq = Seq::new(&[""]);
+ /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix());
+ ///
+ /// let seq = Seq::infinite();
+ /// assert_eq!(None, seq.longest_common_prefix());
+ /// let seq = Seq::empty();
+ /// assert_eq!(None, seq.longest_common_prefix());
+ /// ```
+ #[inline]
+ pub fn longest_common_prefix(&self) -> Option<&[u8]> {
+ // If we match everything or match nothing, then there's no meaningful
+ // longest common prefix.
+ let lits = match self.literals {
+ None => return None,
+ Some(ref lits) => lits,
+ };
+ if lits.len() == 0 {
+ return None;
+ }
+ // Use the first literal as the candidate prefix and shrink 'len'
+ // (the count of its leading bytes still common to every literal
+ // seen so far) as we compare against each subsequent literal.
+ let base = lits[0].as_bytes();
+ let mut len = base.len();
+ for m in lits.iter().skip(1) {
+ len = m
+ .as_bytes()
+ .iter()
+ .zip(base[..len].iter())
+ .take_while(|&(a, b)| a == b)
+ .count();
+ if len == 0 {
+ // Nothing in common remains, so the answer is the empty
+ // prefix; no need to inspect the rest.
+ return Some(&[]);
+ }
+ }
+ Some(&base[..len])
+ }
+
+ /// Returns the longest common suffix from this seq.
+ ///
+ /// If the seq matches any literal or otherwise contains no literals, then
+ /// there is no meaningful suffix and this returns `None`.
+ ///
+ /// # Example
+ ///
+ /// This shows some example seqs and their longest common suffix.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::Seq;
+ ///
+ /// let seq = Seq::new(&["oof", "raboof", "of"]);
+ /// assert_eq!(Some(&b"of"[..]), seq.longest_common_suffix());
+ /// let seq = Seq::new(&["foo", "foo"]);
+ /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_suffix());
+ /// let seq = Seq::new(&["foo", "bar"]);
+ /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix());
+ /// let seq = Seq::new(&[""]);
+ /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix());
+ ///
+ /// let seq = Seq::infinite();
+ /// assert_eq!(None, seq.longest_common_suffix());
+ /// let seq = Seq::empty();
+ /// assert_eq!(None, seq.longest_common_suffix());
+ /// ```
+ #[inline]
+ pub fn longest_common_suffix(&self) -> Option<&[u8]> {
+ // If we match everything or match nothing, then there's no meaningful
+ // longest common suffix.
+ let lits = match self.literals {
+ None => return None,
+ Some(ref lits) => lits,
+ };
+ if lits.len() == 0 {
+ return None;
+ }
+ // Mirror image of 'longest_common_prefix': compare byte-by-byte
+ // from the back of each literal (via 'rev') against the last 'len'
+ // bytes of the first literal, shrinking 'len' as we go.
+ let base = lits[0].as_bytes();
+ let mut len = base.len();
+ for m in lits.iter().skip(1) {
+ len = m
+ .as_bytes()
+ .iter()
+ .rev()
+ .zip(base[base.len() - len..].iter().rev())
+ .take_while(|&(a, b)| a == b)
+ .count();
+ if len == 0 {
+ // Nothing in common remains, so the answer is the empty
+ // suffix; no need to inspect the rest.
+ return Some(&[]);
+ }
+ }
+ Some(&base[base.len() - len..])
+ }
+
+ /// Optimizes this seq while treating its literals as prefixes and
+ /// respecting the preference order of its literals.
+ ///
+ /// The specific way "optimization" works is meant to be an implementation
+ /// detail, as it essentially represents a set of heuristics. The goal
+ /// that optimization tries to accomplish is to make the literals in this
+ /// set reflect inputs that will result in a more effective prefilter.
+ /// Principally by reducing the false positive rate of candidates found by
+ /// the literals in this sequence. That is, when a match of a literal is
+ /// found, we would like it to be a strong predictor of the overall match
+ /// of the regex. If it isn't, then much time will be spent starting and
+ /// stopping the prefilter search and attempting to confirm the match only
+ /// to have it fail.
+ ///
+ /// Some of those heuristics might be:
+ ///
+ /// * Identifying a common prefix from a larger sequence of literals, and
+ /// shrinking the sequence down to that single common prefix.
+ /// * Rejecting the sequence entirely if it is believed to result in very
+ /// high false positive rate. When this happens, the sequence is made
+ /// infinite.
+ /// * Shrinking the sequence to a smaller number of literals representing
+ /// prefixes, but not shrinking it so much as to make literals too short.
+ /// (A sequence with very short literals, of 1 or 2 bytes, will typically
+ /// result in a higher false positive rate.)
+ ///
+ /// Optimization should only be run once extraction is complete. Namely,
+ /// optimization may make assumptions that do not compose with other
+ /// operations in the middle of extraction. For example, optimization will
+ /// reduce `[E(sam), E(samwise)]` to `[E(sam)]`, but such a transformation
+ /// is only valid if no other extraction will occur. If other extraction
+ /// may occur, then the correct transformation would be to `[I(sam)]`.
+ ///
+ /// The [`Seq::optimize_for_suffix_by_preference`] does the same thing, but
+ /// for suffixes.
+ ///
+ /// # Example
+ ///
+ /// This shows how optimization might transform a sequence. Note that
+ /// the specific behavior is not a documented guarantee. The heuristics
+ /// used are an implementation detail and may change over time in semver
+ /// compatible releases.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Seq, Literal};
+ ///
+ /// let mut seq = Seq::new(&[
+ /// "samantha",
+ /// "sam",
+ /// "samwise",
+ /// "frodo",
+ /// ]);
+ /// seq.optimize_for_prefix_by_preference();
+ /// assert_eq!(Seq::from_iter([
+ /// Literal::exact("samantha"),
+ /// // Kept exact even though 'samwise' got pruned
+ /// // because optimization assumes literal extraction
+ /// // has finished.
+ /// Literal::exact("sam"),
+ /// Literal::exact("frodo"),
+ /// ]), seq);
+ /// ```
+ ///
+ /// # Example: optimization may make the sequence infinite
+ ///
+ /// If the heuristics deem that the sequence could cause a very high false
+ /// positive rate, then it may make the sequence infinite, effectively
+ /// disabling its use as a prefilter.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Seq, Literal};
+ ///
+ /// let mut seq = Seq::new(&[
+ /// "samantha",
+ /// // An empty string matches at every position,
+ /// // thus rendering the prefilter completely
+ /// // ineffective.
+ /// "",
+ /// "sam",
+ /// "samwise",
+ /// "frodo",
+ /// ]);
+ /// seq.optimize_for_prefix_by_preference();
+ /// assert!(!seq.is_finite());
+ /// ```
+ ///
+ /// Do note that just because there is a `" "` in the sequence, that
+ /// doesn't mean the sequence will always be made infinite after it is
+ /// optimized. Namely, if the sequence is considered exact (any match
+ /// corresponds to an overall match of the original regex), then any match
+ /// is an overall match, and so the false positive rate is always `0`.
+ ///
+ /// To demonstrate this, we remove `samwise` from our sequence. This
+ /// results in no optimization happening and all literals remain exact.
+ /// Thus the entire sequence is exact, and it is kept as-is, even though
+ /// one is an ASCII space:
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Seq, Literal};
+ ///
+ /// let mut seq = Seq::new(&[
+ /// "samantha",
+ /// " ",
+ /// "sam",
+ /// "frodo",
+ /// ]);
+ /// seq.optimize_for_prefix_by_preference();
+ /// assert!(seq.is_finite());
+ /// ```
+ #[inline]
+ pub fn optimize_for_prefix_by_preference(&mut self) {
+ // 'true' selects the prefix-oriented heuristics.
+ self.optimize_by_preference(true);
+ }
+
+ /// Optimizes this seq while treating its literals as suffixes and
+ /// respecting the preference order of its literals.
+ ///
+ /// Optimization should only be run once extraction is complete.
+ ///
+ /// The [`Seq::optimize_for_prefix_by_preference`] does the same thing, but
+ /// for prefixes. See its documentation for more explanation.
+ #[inline]
+ pub fn optimize_for_suffix_by_preference(&mut self) {
+ // 'false' selects the suffix-oriented heuristics.
+ self.optimize_by_preference(false);
+ }
+
+ /// Shared implementation of `optimize_for_prefix_by_preference` and
+ /// `optimize_for_suffix_by_preference`. When `prefix` is true, literals
+ /// are treated as prefixes; otherwise, as suffixes.
+ fn optimize_by_preference(&mut self, prefix: bool) {
+ // 'origlen' is only used below to gate the single-byte memchr
+ // heuristic.
+ let origlen = match self.len() {
+ None => return,
+ Some(len) => len,
+ };
+ // Just give up now if our sequence contains an empty string.
+ if self.min_literal_len().map_or(false, |len| len == 0) {
+ // We squash the sequence so that nobody else gets any bright
+ // ideas to try and use it. An empty string implies a match at
+ // every position. A prefilter cannot help you here.
+ self.make_infinite();
+ return;
+ }
+ // Make sure we start with the smallest sequence possible. We use a
+ // special version of preference minimization that retains exactness.
+ // This is legal because optimization is only expected to occur once
+ // extraction is complete.
+ if prefix {
+ if let Some(ref mut lits) = self.literals {
+ PreferenceTrie::minimize(lits, true);
+ }
+ }
+
+ // Look for a common prefix (or suffix). If we found one of those and
+ // it's long enough, then it's a good bet that it will be our fastest
+ // possible prefilter since single-substring search is so fast.
+ let fix = if prefix {
+ self.longest_common_prefix()
+ } else {
+ self.longest_common_suffix()
+ };
+ if let Some(fix) = fix {
+ // As a special case, if we have a common prefix and the leading
+ // byte of that prefix is one that we think probably occurs rarely,
+ // then strip everything down to just that single byte. This should
+ // promote the use of memchr.
+ //
+ // ... we only do this though if our sequence has more than one
+ // literal. Otherwise, we'd rather just stick with a single literal
+ // scan. That is, using memchr is probably better than looking
+ // for 2 or more literals, but probably not as good as a straight
+ // memmem search.
+ //
+ // ... and also only do this when the prefix is short and probably
+ // not too discriminatory anyway. If it's longer, then it's
+ // probably quite discriminatory and thus is likely to have a low
+ // false positive rate.
+ //
+ // NOTE(review): 'rank' appears to order bytes by expected
+ // frequency of occurrence (lower = rarer) — confirm against the
+ // definition of 'rank' in this module.
+ if prefix
+ && origlen > 1
+ && fix.len() >= 1
+ && fix.len() <= 3
+ && rank(fix[0]) < 200
+ {
+ self.keep_first_bytes(1);
+ self.dedup();
+ return;
+ }
+ // We only strip down to the common prefix/suffix if we think
+ // the existing set of literals isn't great, or if the common
+ // prefix/suffix is expected to be particularly discriminatory.
+ let isfast =
+ self.is_exact() && self.len().map_or(false, |len| len <= 16);
+ let usefix = fix.len() > 4 || (fix.len() > 1 && !isfast);
+ if usefix {
+ // If we keep exactly the number of bytes equal to the length
+ // of the prefix (or suffix), then by the definition of a
+ // prefix, every literal in the sequence will be equivalent.
+ // Thus, 'dedup' will leave us with one literal.
+ //
+ // We do it this way to avoid an alloc, but also to make sure
+ // the exactness of literals is kept (or not).
+ if prefix {
+ self.keep_first_bytes(fix.len());
+ } else {
+ self.keep_last_bytes(fix.len());
+ }
+ self.dedup();
+ assert_eq!(Some(1), self.len());
+ // We still fall through here. In particular, we want our
+ // longest common prefix to be subject to the poison check.
+ }
+ }
+ // If we have an exact sequence, we *probably* just want to keep it
+ // as-is. But there are some cases where we don't. So we save a copy of
+ // the exact sequence now, and then try to do some more optimizations
+ // below. If those don't work out, we go back to this exact sequence.
+ //
+ // The specific motivation for this is that we sometimes wind up with
+ // an exact sequence with a hefty number of literals. Say, 100. If we
+ // stuck with that, it would be too big for Teddy and would result in
+ // using Aho-Corasick. Which is fine... but the lazy DFA is plenty
+ // suitable in such cases. The real issue is that we will wind up not
+ // using a fast prefilter at all. So in cases like this, even though
+ // we have an exact sequence, it would be better to try and shrink the
+ // sequence (which we do below) and use it as a prefilter that can
+ // produce false positive matches.
+ //
+ // But if the shrinking below results in a sequence that "sucks," then
+ // we don't want to use that because we already have an exact sequence
+ // in hand.
+ let exact: Option<Seq> =
+ if self.is_exact() { Some(self.clone()) } else { None };
+ // Now we attempt to shorten the sequence. The idea here is that we
+ // don't want to look for too many literals, but we want to shorten
+ // our sequence enough to improve our odds of using better algorithms
+ // downstream (such as Teddy).
+ //
+ // The pair of numbers in this list corresponds to the maximal prefix
+ // (in bytes) to keep for all literals and the length of the sequence
+ // at which to do it.
+ //
+ // So for example, the pair (3, 500) would mean, "if we have more than
+ // 500 literals in our sequence, then truncate all of our literals
+ // such that they are at most 3 bytes in length and the minimize the
+ // sequence."
+ const ATTEMPTS: [(usize, usize); 5] =
+ [(5, 10), (4, 10), (3, 64), (2, 64), (1, 10)];
+ for (keep, limit) in ATTEMPTS {
+ let len = match self.len() {
+ None => break,
+ Some(len) => len,
+ };
+ if len <= limit {
+ break;
+ }
+ if prefix {
+ self.keep_first_bytes(keep);
+ } else {
+ self.keep_last_bytes(keep);
+ }
+ if prefix {
+ if let Some(ref mut lits) = self.literals {
+ PreferenceTrie::minimize(lits, true);
+ }
+ }
+ }
+ // Check for a poison literal. A poison literal is one that is short
+ // and is believed to have a very high match count. These poisons
+ // generally lead to a prefilter with a very high false positive rate,
+ // and thus overall worse performance.
+ //
+ // We do this last because we could have gone from a non-poisonous
+ // sequence to a poisonous one. Perhaps we should add some code to
+ // prevent such transitions in the first place, but then again, we
+ // likely only made the transition in the first place if the sequence
+ // was itself huge. And huge sequences are themselves poisonous. So...
+ if let Some(lits) = self.literals() {
+ if lits.iter().any(|lit| lit.is_poisonous()) {
+ self.make_infinite();
+ }
+ }
+ // OK, if we had an exact sequence before attempting more optimizations
+ // above and our post-optimized sequence sucks for some reason or
+ // another, then we go back to the exact sequence.
+ if let Some(exact) = exact {
+ // If optimizing resulted in dropping our literals, then certainly
+ // backup and use the exact sequence that we had.
+ if !self.is_finite() {
+ *self = exact;
+ return;
+ }
+ // If our optimized sequence contains a short literal, then it's
+ // *probably* not so great. So throw it away and revert to the
+ // exact sequence.
+ if self.min_literal_len().map_or(true, |len| len <= 2) {
+ *self = exact;
+ return;
+ }
+ // Finally, if our optimized sequence is "big" (i.e., can't use
+ // Teddy), then also don't use it and rely on the exact sequence.
+ if self.len().map_or(true, |len| len > 64) {
+ *self = exact;
+ return;
+ }
+ }
+ }
+}
+
+impl core::fmt::Debug for Seq {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "Seq")?;
+ match self.literals() {
+ Some(lits) => f.debug_list().entries(lits.iter()).finish(),
+ None => write!(f, "[∞]"),
+ }
+ }
+}
+
+impl FromIterator<Literal> for Seq {
+ fn from_iter<T: IntoIterator<Item = Literal>>(it: T) -> Seq {
+ let mut seq = Seq::empty();
+ it.into_iter().for_each(|lit| seq.push(lit));
+ seq
+ }
+}
+
+/// A single literal extracted from an [`Hir`] expression.
+///
+/// A literal is composed of two things:
+///
+/// * A sequence of bytes. No guarantees with respect to UTF-8 are provided.
+/// In particular, even if the regex a literal is extracted from is UTF-8, the
+/// literal extracted may not be valid UTF-8. (For example, if an [`Extractor`]
+/// limit resulted in trimming a literal in a way that splits a codepoint.)
+/// * Whether the literal is "exact" or not. An "exact" literal means that it
+/// has not been trimmed, and may continue to be extended. If a literal is
+/// "exact" after visiting the entire `Hir` expression, then this implies that
+/// the literal leads to a match state. (Although it doesn't necessarily imply
+/// all occurrences of the literal correspond to a match of the regex, since
+/// literal extraction ignores look-around assertions.)
+#[derive(Clone, Eq, PartialEq, PartialOrd, Ord)]
+pub struct Literal {
+ // The bytes of this literal. Not guaranteed to be valid UTF-8.
+ bytes: Vec<u8>,
+ // Whether this literal is exact, i.e., has not been trimmed and may
+ // still be extended.
+ exact: bool,
+}
+
+impl Literal {
+ /// Creates a new exact literal from the given bytes.
+ #[inline]
+ pub fn exact<B: Into<Vec<u8>>>(bytes: B) -> Literal {
+ Literal { bytes: bytes.into(), exact: true }
+ }
+
+ /// Creates a new inexact literal from the given bytes.
+ #[inline]
+ pub fn inexact<B: Into<Vec<u8>>>(bytes: B) -> Literal {
+ Literal { bytes: bytes.into(), exact: false }
+ }
+
+ /// Returns a borrow of the bytes in this literal.
+ #[inline]
+ pub fn as_bytes(&self) -> &[u8] {
+ self.bytes.as_slice()
+ }
+
+ /// Consumes this literal and yields ownership of its bytes.
+ ///
+ /// The exactness of the literal is discarded in the process.
+ #[inline]
+ pub fn into_bytes(self) -> Vec<u8> {
+ self.bytes
+ }
+
+ /// Returns the number of bytes in this literal.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.bytes.len()
+ }
+
+ /// Returns true if and only if this literal has zero bytes.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.bytes.is_empty()
+ }
+
+ /// Returns true if and only if this literal is exact.
+ #[inline]
+ pub fn is_exact(&self) -> bool {
+ self.exact
+ }
+
+ /// Marks this literal as inexact.
+ ///
+ /// Inexact literals can never be extended. For example,
+ /// [`Seq::cross_forward`] will not extend inexact literals.
+ #[inline]
+ pub fn make_inexact(&mut self) {
+ self.exact = false;
+ }
+
+ /// Reverses the bytes in this literal in place.
+ #[inline]
+ pub fn reverse(&mut self) {
+ self.bytes.reverse();
+ }
+
+ /// Appends the bytes of the given literal to this one.
+ ///
+ /// If this literal is inexact, then this is a no-op.
+ #[inline]
+ pub fn extend(&mut self, lit: &Literal) {
+ if self.is_exact() {
+ self.bytes.extend_from_slice(lit.as_bytes());
+ }
+ }
+
+ /// Trims this literal such that only the first `len` bytes remain. If
+ /// this literal has fewer than `len` bytes, then it remains unchanged.
+ /// Otherwise, the literal is marked as inexact.
+ #[inline]
+ pub fn keep_first_bytes(&mut self, len: usize) {
+ if self.len() > len {
+ self.make_inexact();
+ self.bytes.truncate(len);
+ }
+ }
+
+ /// Trims this literal such that only the last `len` bytes remain. If this
+ /// literal has fewer than `len` bytes, then it remains unchanged.
+ /// Otherwise, the literal is marked as inexact.
+ #[inline]
+ pub fn keep_last_bytes(&mut self, len: usize) {
+ if self.len() > len {
+ // Number of leading bytes to discard.
+ let cut = self.len() - len;
+ self.make_inexact();
+ self.bytes.drain(..cut);
+ }
+ }
+
+ /// Returns true if it is believed that this literal is likely to match
+ /// very frequently, and is thus not a good candidate for a prefilter.
+ fn is_poisonous(&self) -> bool {
+ match *self.as_bytes() {
+ // An empty literal matches everywhere.
+ [] => true,
+ // A single very common byte matches too often to be useful.
+ [byte] => rank(byte) >= 250,
+ _ => false,
+ }
+ }
+}
+
+impl From<u8> for Literal {
+ fn from(byte: u8) -> Literal {
+ Literal::exact(Vec::from([byte]))
+ }
+}
+
+impl From<char> for Literal {
+ fn from(ch: char) -> Literal {
+ use alloc::string::ToString;
+ let mut buf = [0; 4];
+ let encoded = ch.encode_utf8(&mut buf);
+ Literal::exact(encoded.to_string())
+ }
+}
+
+impl AsRef<[u8]> for Literal {
+ fn as_ref(&self) -> &[u8] {
+ &self.bytes
+ }
+}
+
+impl core::fmt::Debug for Literal {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ // Exact literals render as E(..), inexact ones as I(..).
+ let tag = if self.is_exact() { "E" } else { "I" };
+ f.debug_tuple(tag)
+ .field(&crate::debug::Bytes(self.as_bytes()))
+ .finish()
+ }
+}
+
+/// A "preference" trie that rejects literals that will never match when
+/// executing a leftmost first or "preference" search.
+///
+/// For example, if 'sam' is inserted, then trying to insert 'samwise' will be
+/// rejected because 'samwise' can never match since 'sam' will always take
+/// priority. However, if 'samwise' is inserted first, then inserting 'sam'
+/// after it is accepted. In this case, either 'samwise' or 'sam' can match in
+/// a "preference" search.
+///
+/// Note that we only use this trie as a "set." That is, given a sequence of
+/// literals, we insert each one in order. An `insert` will reject a literal
+/// if a prefix of that literal already exists in the trie. Thus, to rebuild
+/// the "minimal" sequence, we simply only keep literals that were successfully
+/// inserted. (Since we don't need traversal, one wonders whether we can make
+/// some simplifications here, but I haven't given it a ton of thought and I've
+/// never seen this show up on a profile. Because of the heuristic limits
+/// imposed on literal extractions, the size of the inputs here is usually
+/// very small.)
+#[derive(Debug)]
+struct PreferenceTrie {
+ /// The states in this trie. The index of a state in this vector is its ID.
+ states: Vec<State>,
+ /// This vec indicates which states are match states. It always has
+ /// the same length as `states` and is indexed by the same state ID.
+ /// A state with identifier `sid` is a match state if and only if
+ /// `matches[sid].is_some()`. The option contains the index of the literal
+ /// corresponding to the match. The index is offset by 1 so that it fits in
+ /// a NonZeroUsize.
+ matches: Vec<Option<NonZeroUsize>>,
+ /// The index to allocate to the next literal added to this trie. Starts at
+ /// 1 and increments by 1 for every literal successfully added to the trie.
+ /// (Starting at 1 is what permits storing these indices, offset by 1, in
+ /// the `NonZeroUsize` values of `matches`.)
+ next_literal_index: usize,
+}
+
+/// A single state in a trie. Uses a sparse representation for its transitions.
+#[derive(Debug, Default)]
+struct State {
+ /// Sparse representation of the transitions out of this state. Transitions
+ /// are sorted by byte. There is at most one such transition for any
+ /// particular byte.
+ trans: Vec<(u8, usize)>,
+}
+
+impl PreferenceTrie {
+ /// Minimizes the given sequence of literals while preserving preference
+ /// order semantics.
+ ///
+ /// When `keep_exact` is true, the exactness of every literal retained is
+ /// kept. This is useful when dealing with a fully extracted `Seq` that
+ /// only contains exact literals. In that case, we can keep all retained
+ /// literals as exact because we know we'll never need to match anything
+ /// after them and because any removed literals are guaranteed to never
+ /// match.
+ fn minimize(literals: &mut Vec<Literal>, keep_exact: bool) {
+ let mut trie = PreferenceTrie {
+ states: vec![],
+ matches: vec![],
+ next_literal_index: 1,
+ };
+ let mut make_inexact = vec![];
+ literals.retain_mut(|lit| match trie.insert(lit.as_bytes()) {
+ Ok(_) => true,
+ Err(i) => {
+ if !keep_exact {
+ make_inexact.push(i.checked_sub(1).unwrap());
+ }
+ false
+ }
+ });
+ for i in make_inexact {
+ literals[i].make_inexact();
+ }
+ }
+
+ /// Returns `Ok` if the given byte string is accepted into this trie and
+ /// `Err` otherwise. The index for the success case corresponds to the
+ /// index of the literal added. The index for the error case corresponds to
+ /// the index of the literal already in the trie that prevented the given
+ /// byte string from being added. (Which implies it is a prefix of the one
+ /// given.)
+ ///
+ /// In short, the byte string given is accepted into the trie if and only
+ /// if it is possible for it to match when executing a preference order
+ /// search.
+ fn insert(&mut self, bytes: &[u8]) -> Result<usize, usize> {
+ let mut prev = self.root();
+ if let Some(idx) = self.matches[prev] {
+ return Err(idx.get());
+ }
+ for &b in bytes.iter() {
+ match self.states[prev].trans.binary_search_by_key(&b, |t| t.0) {
+ Ok(i) => {
+ prev = self.states[prev].trans[i].1;
+ if let Some(idx) = self.matches[prev] {
+ return Err(idx.get());
+ }
+ }
+ Err(i) => {
+ let next = self.create_state();
+ self.states[prev].trans.insert(i, (b, next));
+ prev = next;
+ }
+ }
+ }
+ let idx = self.next_literal_index;
+ self.next_literal_index += 1;
+ self.matches[prev] = NonZeroUsize::new(idx);
+ Ok(idx)
+ }
+
+ /// Returns the root state ID, and if it doesn't exist, creates it.
+ fn root(&mut self) -> usize {
+ if !self.states.is_empty() {
+ 0
+ } else {
+ self.create_state()
+ }
+ }
+
+ /// Creates a new empty state and returns its ID.
+ fn create_state(&mut self) -> usize {
+ let id = self.states.len();
+ self.states.push(State::default());
+ self.matches.push(None);
+ id
+ }
+}
+
+/// Returns the "rank" of the given byte.
+///
+/// The minimum rank value is `0` and the maximum rank value is `255`.
+///
+/// The rank of a byte is derived from a heuristic background distribution of
+/// relative frequencies of bytes. The heuristic says that the lower the rank of
+/// a byte, the less likely that byte is to appear in any arbitrary haystack.
+pub fn rank(byte: u8) -> u8 {
+ crate::rank::BYTE_FREQUENCIES[usize::from(byte)]
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ fn parse(pattern: &str) -> Hir {
+ crate::ParserBuilder::new().utf8(false).build().parse(pattern).unwrap()
+ }
+
+ fn prefixes(pattern: &str) -> Seq {
+ Extractor::new().kind(ExtractKind::Prefix).extract(&parse(pattern))
+ }
+
+ fn suffixes(pattern: &str) -> Seq {
+ Extractor::new().kind(ExtractKind::Suffix).extract(&parse(pattern))
+ }
+
+ fn e(pattern: &str) -> (Seq, Seq) {
+ (prefixes(pattern), suffixes(pattern))
+ }
+
+ #[allow(non_snake_case)]
+ fn E(x: &str) -> Literal {
+ Literal::exact(x.as_bytes())
+ }
+
+ #[allow(non_snake_case)]
+ fn I(x: &str) -> Literal {
+ Literal::inexact(x.as_bytes())
+ }
+
+ fn seq<I: IntoIterator<Item = Literal>>(it: I) -> Seq {
+ Seq::from_iter(it)
+ }
+
+ fn infinite() -> (Seq, Seq) {
+ (Seq::infinite(), Seq::infinite())
+ }
+
+ fn inexact<I1, I2>(it1: I1, it2: I2) -> (Seq, Seq)
+ where
+ I1: IntoIterator<Item = Literal>,
+ I2: IntoIterator<Item = Literal>,
+ {
+ (Seq::from_iter(it1), Seq::from_iter(it2))
+ }
+
+ fn exact<B: AsRef<[u8]>, I: IntoIterator<Item = B>>(it: I) -> (Seq, Seq) {
+ let s1 = Seq::new(it);
+ let s2 = s1.clone();
+ (s1, s2)
+ }
+
+ fn opt<B: AsRef<[u8]>, I: IntoIterator<Item = B>>(it: I) -> (Seq, Seq) {
+ let (mut p, mut s) = exact(it);
+ p.optimize_for_prefix_by_preference();
+ s.optimize_for_suffix_by_preference();
+ (p, s)
+ }
+
+ #[test]
+ fn literal() {
+ assert_eq!(exact(["a"]), e("a"));
+ assert_eq!(exact(["aaaaa"]), e("aaaaa"));
+ assert_eq!(exact(["A", "a"]), e("(?i-u)a"));
+ assert_eq!(exact(["AB", "Ab", "aB", "ab"]), e("(?i-u)ab"));
+ assert_eq!(exact(["abC", "abc"]), e("ab(?i-u)c"));
+
+ assert_eq!(exact([b"\xFF"]), e(r"(?-u:\xFF)"));
+
+ #[cfg(feature = "unicode-case")]
+ {
+ assert_eq!(exact(["☃"]), e("☃"));
+ assert_eq!(exact(["☃"]), e("(?i)☃"));
+ assert_eq!(exact(["☃☃☃☃☃"]), e("☃☃☃☃☃"));
+
+ assert_eq!(exact(["Δ"]), e("Δ"));
+ assert_eq!(exact(["δ"]), e("δ"));
+ assert_eq!(exact(["Δ", "δ"]), e("(?i)Δ"));
+ assert_eq!(exact(["Δ", "δ"]), e("(?i)δ"));
+
+ assert_eq!(exact(["S", "s", "ſ"]), e("(?i)S"));
+ assert_eq!(exact(["S", "s", "ſ"]), e("(?i)s"));
+ assert_eq!(exact(["S", "s", "ſ"]), e("(?i)ſ"));
+ }
+
+ let letters = "ͱͳͷΐάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋ";
+ assert_eq!(exact([letters]), e(letters));
+ }
+
+ #[test]
+ fn class() {
+ assert_eq!(exact(["a", "b", "c"]), e("[abc]"));
+ assert_eq!(exact(["a1b", "a2b", "a3b"]), e("a[123]b"));
+ assert_eq!(exact(["δ", "ε"]), e("[εδ]"));
+ #[cfg(feature = "unicode-case")]
+ {
+ assert_eq!(exact(["Δ", "Ε", "δ", "ε", "ϵ"]), e(r"(?i)[εδ]"));
+ }
+ }
+
+ #[test]
+ fn look() {
+ assert_eq!(exact(["ab"]), e(r"a\Ab"));
+ assert_eq!(exact(["ab"]), e(r"a\zb"));
+ assert_eq!(exact(["ab"]), e(r"a(?m:^)b"));
+ assert_eq!(exact(["ab"]), e(r"a(?m:$)b"));
+ assert_eq!(exact(["ab"]), e(r"a\bb"));
+ assert_eq!(exact(["ab"]), e(r"a\Bb"));
+ assert_eq!(exact(["ab"]), e(r"a(?-u:\b)b"));
+ assert_eq!(exact(["ab"]), e(r"a(?-u:\B)b"));
+
+ assert_eq!(exact(["ab"]), e(r"^ab"));
+ assert_eq!(exact(["ab"]), e(r"$ab"));
+ assert_eq!(exact(["ab"]), e(r"(?m:^)ab"));
+ assert_eq!(exact(["ab"]), e(r"(?m:$)ab"));
+ assert_eq!(exact(["ab"]), e(r"\bab"));
+ assert_eq!(exact(["ab"]), e(r"\Bab"));
+ assert_eq!(exact(["ab"]), e(r"(?-u:\b)ab"));
+ assert_eq!(exact(["ab"]), e(r"(?-u:\B)ab"));
+
+ assert_eq!(exact(["ab"]), e(r"ab^"));
+ assert_eq!(exact(["ab"]), e(r"ab$"));
+ assert_eq!(exact(["ab"]), e(r"ab(?m:^)"));
+ assert_eq!(exact(["ab"]), e(r"ab(?m:$)"));
+ assert_eq!(exact(["ab"]), e(r"ab\b"));
+ assert_eq!(exact(["ab"]), e(r"ab\B"));
+ assert_eq!(exact(["ab"]), e(r"ab(?-u:\b)"));
+ assert_eq!(exact(["ab"]), e(r"ab(?-u:\B)"));
+
+ let expected = (seq([I("aZ"), E("ab")]), seq([I("Zb"), E("ab")]));
+ assert_eq!(expected, e(r"^aZ*b"));
+ }
+
+ #[test]
+ fn repetition() {
+ assert_eq!(exact(["a", ""]), e(r"a?"));
+ assert_eq!(exact(["", "a"]), e(r"a??"));
+ assert_eq!(inexact([I("a"), E("")], [I("a"), E("")]), e(r"a*"));
+ assert_eq!(inexact([E(""), I("a")], [E(""), I("a")]), e(r"a*?"));
+ assert_eq!(inexact([I("a")], [I("a")]), e(r"a+"));
+ assert_eq!(inexact([I("a")], [I("a")]), e(r"(a+)+"));
+
+ assert_eq!(exact(["ab"]), e(r"aZ{0}b"));
+ assert_eq!(exact(["aZb", "ab"]), e(r"aZ?b"));
+ assert_eq!(exact(["ab", "aZb"]), e(r"aZ??b"));
+ assert_eq!(
+ inexact([I("aZ"), E("ab")], [I("Zb"), E("ab")]),
+ e(r"aZ*b")
+ );
+ assert_eq!(
+ inexact([E("ab"), I("aZ")], [E("ab"), I("Zb")]),
+ e(r"aZ*?b")
+ );
+ assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+b"));
+ assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+?b"));
+
+ assert_eq!(exact(["aZZb"]), e(r"aZ{2}b"));
+ assert_eq!(inexact([I("aZZ")], [I("ZZb")]), e(r"aZ{2,3}b"));
+
+ assert_eq!(exact(["abc", ""]), e(r"(abc)?"));
+ assert_eq!(exact(["", "abc"]), e(r"(abc)??"));
+
+ assert_eq!(inexact([I("a"), E("b")], [I("ab"), E("b")]), e(r"a*b"));
+ assert_eq!(inexact([E("b"), I("a")], [E("b"), I("ab")]), e(r"a*?b"));
+ assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+"));
+ assert_eq!(inexact([I("a"), I("b")], [I("b")]), e(r"a*b+"));
+
+ // FIXME: The suffixes for this don't look quite right to me. I think
+ // the right suffixes would be: [I(ac), I(bc), E(c)]. The main issue I
+ // think is that suffixes are computed by iterating over concatenations
+ // in reverse, and then [bc, ac, c] ordering is indeed correct from
+ // that perspective. We also test a few more equivalent regexes, and
+ // we get the same result, so it is consistent at least I suppose.
+ //
+ // The reason why this isn't an issue is that it only messes up
+ // preference order, and currently, suffixes are never used in a
+ // context where preference order matters. For prefixes it matters
+ // because we sometimes want to use prefilters without confirmation
+ // when all of the literals are exact (and there's no look-around). But
+ // we never do that for suffixes. Any time we use suffixes, we always
+ // include a confirmation step. If that ever changes, then it's likely
+ // this bug will need to be fixed, but last time I looked, it appears
+ // hard to do so.
+ assert_eq!(
+ inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]),
+ e(r"a*b*c")
+ );
+ assert_eq!(
+ inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]),
+ e(r"(a+)?(b+)?c")
+ );
+ assert_eq!(
+ inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]),
+ e(r"(a+|)(b+|)c")
+ );
+ // A few more similarish but not identical regexes. These may have a
+ // similar problem as above.
+ assert_eq!(
+ inexact(
+ [I("a"), I("b"), I("c"), E("")],
+ [I("c"), I("b"), I("a"), E("")]
+ ),
+ e(r"a*b*c*")
+ );
+ assert_eq!(inexact([I("a"), I("b"), I("c")], [I("c")]), e(r"a*b*c+"));
+ assert_eq!(inexact([I("a"), I("b")], [I("bc")]), e(r"a*b+c"));
+ assert_eq!(inexact([I("a"), I("b")], [I("c"), I("b")]), e(r"a*b+c*"));
+ assert_eq!(inexact([I("ab"), E("a")], [I("b"), E("a")]), e(r"ab*"));
+ assert_eq!(
+ inexact([I("ab"), E("ac")], [I("bc"), E("ac")]),
+ e(r"ab*c")
+ );
+ assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+"));
+ assert_eq!(inexact([I("ab")], [I("bc")]), e(r"ab+c"));
+
+ assert_eq!(
+ inexact([I("z"), E("azb")], [I("zazb"), E("azb")]),
+ e(r"z*azb")
+ );
+
+ let expected =
+ exact(["aaa", "aab", "aba", "abb", "baa", "bab", "bba", "bbb"]);
+ assert_eq!(expected, e(r"[ab]{3}"));
+ let expected = inexact(
+ [
+ I("aaa"),
+ I("aab"),
+ I("aba"),
+ I("abb"),
+ I("baa"),
+ I("bab"),
+ I("bba"),
+ I("bbb"),
+ ],
+ [
+ I("aaa"),
+ I("aab"),
+ I("aba"),
+ I("abb"),
+ I("baa"),
+ I("bab"),
+ I("bba"),
+ I("bbb"),
+ ],
+ );
+ assert_eq!(expected, e(r"[ab]{3,4}"));
+ }
+
+ #[test]
+ fn concat() {
+ let empty: [&str; 0] = [];
+
+ assert_eq!(exact(["abcxyz"]), e(r"abc()xyz"));
+ assert_eq!(exact(["abcxyz"]), e(r"(abc)(xyz)"));
+ assert_eq!(exact(["abcmnoxyz"]), e(r"abc()mno()xyz"));
+ assert_eq!(exact(empty), e(r"abc[a&&b]xyz"));
+ assert_eq!(exact(["abcxyz"]), e(r"abc[a&&b]*xyz"));
+ }
+
+ #[test]
+ fn alternation() {
+ assert_eq!(exact(["abc", "mno", "xyz"]), e(r"abc|mno|xyz"));
+ assert_eq!(
+ inexact(
+ [E("abc"), I("mZ"), E("mo"), E("xyz")],
+ [E("abc"), I("Zo"), E("mo"), E("xyz")]
+ ),
+ e(r"abc|mZ*o|xyz")
+ );
+ assert_eq!(exact(["abc", "xyz"]), e(r"abc|M[a&&b]N|xyz"));
+ assert_eq!(exact(["abc", "MN", "xyz"]), e(r"abc|M[a&&b]*N|xyz"));
+
+ assert_eq!(exact(["aaa", "aaaaa"]), e(r"(?:|aa)aaa"));
+ assert_eq!(
+ inexact(
+ [I("aaa"), E(""), I("aaaaa"), E("aa")],
+ [I("aaa"), E(""), E("aa")]
+ ),
+ e(r"(?:|aa)(?:aaa)*")
+ );
+ assert_eq!(
+ inexact(
+ [E(""), I("aaa"), E("aa"), I("aaaaa")],
+ [E(""), I("aaa"), E("aa")]
+ ),
+ e(r"(?:|aa)(?:aaa)*?")
+ );
+
+ assert_eq!(
+ inexact([E("a"), I("b"), E("")], [E("a"), I("b"), E("")]),
+ e(r"a|b*")
+ );
+ assert_eq!(inexact([E("a"), I("b")], [E("a"), I("b")]), e(r"a|b+"));
+
+ assert_eq!(
+ inexact([I("a"), E("b"), E("c")], [I("ab"), E("b"), E("c")]),
+ e(r"a*b|c")
+ );
+
+ assert_eq!(
+ inexact(
+ [E("a"), E("b"), I("c"), E("")],
+ [E("a"), E("b"), I("c"), E("")]
+ ),
+ e(r"a|(?:b|c*)")
+ );
+
+ assert_eq!(
+ inexact(
+ [I("a"), I("b"), E("c"), I("a"), I("ab"), E("c")],
+ [I("ac"), I("bc"), E("c"), I("ac"), I("abc"), E("c")],
+ ),
+ e(r"(a|b)*c|(a|ab)*c")
+ );
+
+ assert_eq!(
+ exact(["abef", "abgh", "cdef", "cdgh"]),
+ e(r"(ab|cd)(ef|gh)")
+ );
+ assert_eq!(
+ exact([
+ "abefij", "abefkl", "abghij", "abghkl", "cdefij", "cdefkl",
+ "cdghij", "cdghkl",
+ ]),
+ e(r"(ab|cd)(ef|gh)(ij|kl)")
+ );
+
+ assert_eq!(inexact([E("abab")], [E("abab")]), e(r"(ab){2}"));
+
+ assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,3}"));
+
+ assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,}"));
+ }
+
+ #[test]
+ fn impossible() {
+ let empty: [&str; 0] = [];
+
+ assert_eq!(exact(empty), e(r"[a&&b]"));
+ assert_eq!(exact(empty), e(r"a[a&&b]"));
+ assert_eq!(exact(empty), e(r"[a&&b]b"));
+ assert_eq!(exact(empty), e(r"a[a&&b]b"));
+ assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]|b"));
+ assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]|b"));
+ assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]d|b"));
+ assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]d|b"));
+ assert_eq!(exact([""]), e(r"[a&&b]*"));
+ assert_eq!(exact(["MN"]), e(r"M[a&&b]*N"));
+ }
+
+ // This tests patterns that contain something that defeats literal
+ // detection, usually because it would blow some limit on the total number
+ // of literals that can be returned.
+ //
+ // The main idea is that when literal extraction sees something that
+ // it knows will blow a limit, it replaces it with a marker that says
+ // "any literal will match here." While not necessarily true, the
+ // over-estimation is just fine for the purposes of literal extraction,
+ // because the imprecision doesn't matter: too big is too big.
+ //
+ // This is one of the trickier parts of literal extraction, since we need
+ // to make sure all of our literal extraction operations correctly compose
+ // with the markers.
+ #[test]
+ fn anything() {
+ assert_eq!(infinite(), e(r"."));
+ assert_eq!(infinite(), e(r"(?s)."));
+ assert_eq!(infinite(), e(r"[A-Za-z]"));
+ assert_eq!(infinite(), e(r"[A-Z]"));
+ assert_eq!(exact([""]), e(r"[A-Z]{0}"));
+ assert_eq!(infinite(), e(r"[A-Z]?"));
+ assert_eq!(infinite(), e(r"[A-Z]*"));
+ assert_eq!(infinite(), e(r"[A-Z]+"));
+ assert_eq!((seq([I("1")]), Seq::infinite()), e(r"1[A-Z]"));
+ assert_eq!((seq([I("1")]), seq([I("2")])), e(r"1[A-Z]2"));
+ assert_eq!((Seq::infinite(), seq([I("123")])), e(r"[A-Z]+123"));
+ assert_eq!(infinite(), e(r"[A-Z]+123[A-Z]+"));
+ assert_eq!(infinite(), e(r"1|[A-Z]|3"));
+ assert_eq!(
+ (seq([E("1"), I("2"), E("3")]), Seq::infinite()),
+ e(r"1|2[A-Z]|3"),
+ );
+ assert_eq!(
+ (Seq::infinite(), seq([E("1"), I("2"), E("3")])),
+ e(r"1|[A-Z]2|3"),
+ );
+ assert_eq!(
+ (seq([E("1"), I("2"), E("4")]), seq([E("1"), I("3"), E("4")])),
+ e(r"1|2[A-Z]3|4"),
+ );
+ assert_eq!((Seq::infinite(), seq([I("2")])), e(r"(?:|1)[A-Z]2"));
+ assert_eq!(inexact([I("a")], [I("z")]), e(r"a.z"));
+ }
+
+ // Like the 'anything' test, but it uses smaller limits in order to test
+ // the logic for effectively aborting literal extraction when the seqs get
+ // too big.
+ #[test]
+ fn anything_small_limits() {
+ fn prefixes(pattern: &str) -> Seq {
+ Extractor::new()
+ .kind(ExtractKind::Prefix)
+ .limit_total(10)
+ .extract(&parse(pattern))
+ }
+
+ fn suffixes(pattern: &str) -> Seq {
+ Extractor::new()
+ .kind(ExtractKind::Suffix)
+ .limit_total(10)
+ .extract(&parse(pattern))
+ }
+
+ fn e(pattern: &str) -> (Seq, Seq) {
+ (prefixes(pattern), suffixes(pattern))
+ }
+
+ assert_eq!(
+ (
+ seq([
+ I("aaa"),
+ I("aab"),
+ I("aba"),
+ I("abb"),
+ I("baa"),
+ I("bab"),
+ I("bba"),
+ I("bbb")
+ ]),
+ seq([
+ I("aaa"),
+ I("aab"),
+ I("aba"),
+ I("abb"),
+ I("baa"),
+ I("bab"),
+ I("bba"),
+ I("bbb")
+ ])
+ ),
+ e(r"[ab]{3}{3}")
+ );
+
+ assert_eq!(infinite(), e(r"ab|cd|ef|gh|ij|kl|mn|op|qr|st|uv|wx|yz"));
+ }
+
+ #[test]
+ fn empty() {
+ assert_eq!(exact([""]), e(r""));
+ assert_eq!(exact([""]), e(r"^"));
+ assert_eq!(exact([""]), e(r"$"));
+ assert_eq!(exact([""]), e(r"(?m:^)"));
+ assert_eq!(exact([""]), e(r"(?m:$)"));
+ assert_eq!(exact([""]), e(r"\b"));
+ assert_eq!(exact([""]), e(r"\B"));
+ assert_eq!(exact([""]), e(r"(?-u:\b)"));
+ assert_eq!(exact([""]), e(r"(?-u:\B)"));
+ }
+
+ #[test]
+ fn odds_and_ends() {
+ assert_eq!((Seq::infinite(), seq([I("a")])), e(r".a"));
+ assert_eq!((seq([I("a")]), Seq::infinite()), e(r"a."));
+ assert_eq!(infinite(), e(r"a|."));
+ assert_eq!(infinite(), e(r".|a"));
+
+ let pat = r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]";
+ let expected = inexact(
+ ["Mo'am", "Moam", "Mu'am", "Muam"].map(I),
+ [
+ "ddafi", "ddafy", "dhafi", "dhafy", "dzafi", "dzafy", "dafi",
+ "dafy", "tdafi", "tdafy", "thafi", "thafy", "tzafi", "tzafy",
+ "tafi", "tafy", "zdafi", "zdafy", "zhafi", "zhafy", "zzafi",
+ "zzafy", "zafi", "zafy",
+ ]
+ .map(I),
+ );
+ assert_eq!(expected, e(pat));
+
+ assert_eq!(
+ (seq(["fn is_", "fn as_"].map(I)), Seq::infinite()),
+ e(r"fn is_([A-Z]+)|fn as_([A-Z]+)"),
+ );
+ assert_eq!(
+ inexact([I("foo")], [I("quux")]),
+ e(r"foo[A-Z]+bar[A-Z]+quux")
+ );
+ assert_eq!(infinite(), e(r"[A-Z]+bar[A-Z]+"));
+ assert_eq!(
+ exact(["Sherlock Holmes"]),
+ e(r"(?m)^Sherlock Holmes|Sherlock Holmes$")
+ );
+
+ assert_eq!(exact(["sa", "sb"]), e(r"\bs(?:[ab])"));
+ }
+
+ // This tests a specific regex along with some heuristic steps to reduce
+ // the sequences extracted. This is meant to roughly correspond to the
+ // types of heuristics used to shrink literal sets in practice. (Shrinking
+ // is done because you want to balance "spend too much work looking for
+ // too many literals" and "spend too much work processing false positive
+ // matches from short literals.")
+ #[test]
+ #[cfg(feature = "unicode-case")]
+ fn holmes() {
+ let expected = inexact(
+ ["HOL", "HOl", "HoL", "Hol", "hOL", "hOl", "hoL", "hol"].map(I),
+ [
+ "MES", "MEs", "Eſ", "MeS", "Mes", "eſ", "mES", "mEs", "meS",
+ "mes",
+ ]
+ .map(I),
+ );
+ let (mut prefixes, mut suffixes) = e(r"(?i)Holmes");
+ prefixes.keep_first_bytes(3);
+ suffixes.keep_last_bytes(3);
+ prefixes.minimize_by_preference();
+ suffixes.minimize_by_preference();
+ assert_eq!(expected, (prefixes, suffixes));
+ }
+
+ // This tests that we get some kind of literals extracted for a beefier
+ // alternation with case insensitive mode enabled. At one point during
+ // development, this returned nothing, and motivated some special case
+ // code in Extractor::union to try and trim down the literal sequences
+ // if the union would blow the limits set.
+ #[test]
+ #[cfg(feature = "unicode-case")]
+ fn holmes_alt() {
+ let mut pre =
+ prefixes(r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker");
+ assert!(pre.len().unwrap() > 0);
+ pre.optimize_for_prefix_by_preference();
+ assert!(pre.len().unwrap() > 0);
+ }
+
+ // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
+ // See: CVE-2022-24713
+ //
+ // We test this here to ensure literal extraction completes in reasonable
+ // time and isn't materially impacted by these sorts of pathological
+ // repeats.
+ #[test]
+ fn crazy_repeats() {
+ assert_eq!(inexact([E("")], [E("")]), e(r"(?:){4294967295}"));
+ assert_eq!(
+ inexact([E("")], [E("")]),
+ e(r"(?:){64}{64}{64}{64}{64}{64}")
+ );
+ assert_eq!(inexact([E("")], [E("")]), e(r"x{0}{4294967295}"));
+ assert_eq!(inexact([E("")], [E("")]), e(r"(?:|){4294967295}"));
+
+ assert_eq!(
+ inexact([E("")], [E("")]),
+ e(r"(?:){8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}")
+ );
+ let repa = "a".repeat(100);
+ assert_eq!(
+ inexact([I(&repa)], [I(&repa)]),
+ e(r"a{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}")
+ );
+ }
+
+ #[test]
+ fn huge() {
+ let pat = r#"(?-u)
+ 2(?:
+ [45]\d{3}|
+ 7(?:
+ 1[0-267]|
+ 2[0-289]|
+ 3[0-29]|
+ 4[01]|
+ 5[1-3]|
+ 6[013]|
+ 7[0178]|
+ 91
+ )|
+ 8(?:
+ 0[125]|
+ [139][1-6]|
+ 2[0157-9]|
+ 41|
+ 6[1-35]|
+ 7[1-5]|
+ 8[1-8]|
+ 90
+ )|
+ 9(?:
+ 0[0-2]|
+ 1[0-4]|
+ 2[568]|
+ 3[3-6]|
+ 5[5-7]|
+ 6[0167]|
+ 7[15]|
+ 8[0146-9]
+ )
+ )\d{4}|
+ 3(?:
+ 12?[5-7]\d{2}|
+ 0(?:
+ 2(?:
+ [025-79]\d|
+ [348]\d{1,2}
+ )|
+ 3(?:
+ [2-4]\d|
+ [56]\d?
+ )
+ )|
+ 2(?:
+ 1\d{2}|
+ 2(?:
+ [12]\d|
+ [35]\d{1,2}|
+ 4\d?
+ )
+ )|
+ 3(?:
+ 1\d{2}|
+ 2(?:
+ [2356]\d|
+ 4\d{1,2}
+ )
+ )|
+ 4(?:
+ 1\d{2}|
+ 2(?:
+ 2\d{1,2}|
+ [47]|
+ 5\d{2}
+ )
+ )|
+ 5(?:
+ 1\d{2}|
+ 29
+ )|
+ [67]1\d{2}|
+ 8(?:
+ 1\d{2}|
+ 2(?:
+ 2\d{2}|
+ 3|
+ 4\d
+ )
+ )
+ )\d{3}|
+ 4(?:
+ 0(?:
+ 2(?:
+ [09]\d|
+ 7
+ )|
+ 33\d{2}
+ )|
+ 1\d{3}|
+ 2(?:
+ 1\d{2}|
+ 2(?:
+ [25]\d?|
+ [348]\d|
+ [67]\d{1,2}
+ )
+ )|
+ 3(?:
+ 1\d{2}(?:
+ \d{2}
+ )?|
+ 2(?:
+ [045]\d|
+ [236-9]\d{1,2}
+ )|
+ 32\d{2}
+ )|
+ 4(?:
+ [18]\d{2}|
+ 2(?:
+ [2-46]\d{2}|
+ 3
+ )|
+ 5[25]\d{2}
+ )|
+ 5(?:
+ 1\d{2}|
+ 2(?:
+ 3\d|
+ 5
+ )
+ )|
+ 6(?:
+ [18]\d{2}|
+ 2(?:
+ 3(?:
+ \d{2}
+ )?|
+ [46]\d{1,2}|
+ 5\d{2}|
+ 7\d
+ )|
+ 5(?:
+ 3\d?|
+ 4\d|
+ [57]\d{1,2}|
+ 6\d{2}|
+ 8
+ )
+ )|
+ 71\d{2}|
+ 8(?:
+ [18]\d{2}|
+ 23\d{2}|
+ 54\d{2}
+ )|
+ 9(?:
+ [18]\d{2}|
+ 2[2-5]\d{2}|
+ 53\d{1,2}
+ )
+ )\d{3}|
+ 5(?:
+ 02[03489]\d{2}|
+ 1\d{2}|
+ 2(?:
+ 1\d{2}|
+ 2(?:
+ 2(?:
+ \d{2}
+ )?|
+ [457]\d{2}
+ )
+ )|
+ 3(?:
+ 1\d{2}|
+ 2(?:
+ [37](?:
+ \d{2}
+ )?|
+ [569]\d{2}
+ )
+ )|
+ 4(?:
+ 1\d{2}|
+ 2[46]\d{2}
+ )|
+ 5(?:
+ 1\d{2}|
+ 26\d{1,2}
+ )|
+ 6(?:
+ [18]\d{2}|
+ 2|
+ 53\d{2}
+ )|
+ 7(?:
+ 1|
+ 24
+ )\d{2}|
+ 8(?:
+ 1|
+ 26
+ )\d{2}|
+ 91\d{2}
+ )\d{3}|
+ 6(?:
+ 0(?:
+ 1\d{2}|
+ 2(?:
+ 3\d{2}|
+ 4\d{1,2}
+ )
+ )|
+ 2(?:
+ 2[2-5]\d{2}|
+ 5(?:
+ [3-5]\d{2}|
+ 7
+ )|
+ 8\d{2}
+ )|
+ 3(?:
+ 1|
+ 2[3478]
+ )\d{2}|
+ 4(?:
+ 1|
+ 2[34]
+ )\d{2}|
+ 5(?:
+ 1|
+ 2[47]
+ )\d{2}|
+ 6(?:
+ [18]\d{2}|
+ 6(?:
+ 2(?:
+ 2\d|
+ [34]\d{2}
+ )|
+ 5(?:
+ [24]\d{2}|
+ 3\d|
+ 5\d{1,2}
+ )
+ )
+ )|
+ 72[2-5]\d{2}|
+ 8(?:
+ 1\d{2}|
+ 2[2-5]\d{2}
+ )|
+ 9(?:
+ 1\d{2}|
+ 2[2-6]\d{2}
+ )
+ )\d{3}|
+ 7(?:
+ (?:
+ 02|
+ [3-589]1|
+ 6[12]|
+ 72[24]
+ )\d{2}|
+ 21\d{3}|
+ 32
+ )\d{3}|
+ 8(?:
+ (?:
+ 4[12]|
+ [5-7]2|
+ 1\d?
+ )|
+ (?:
+ 0|
+ 3[12]|
+ [5-7]1|
+ 217
+ )\d
+ )\d{4}|
+ 9(?:
+ [35]1|
+ (?:
+ [024]2|
+ 81
+ )\d|
+ (?:
+ 1|
+ [24]1
+ )\d{2}
+ )\d{3}
+ "#;
+ // TODO: This is a good candidate of a seq of literals that could be
+ // shrunk quite a bit and still be very productive with respect to
+ // literal optimizations.
+ let (prefixes, suffixes) = e(pat);
+ assert!(!suffixes.is_finite());
+ assert_eq!(Some(243), prefixes.len());
+ }
+
+ #[test]
+ fn optimize() {
+ // This gets a common prefix that isn't too short.
+ let (p, s) =
+ opt(["foobarfoobar", "foobar", "foobarzfoobar", "foobarfoobar"]);
+ assert_eq!(seq([I("foobar")]), p);
+ assert_eq!(seq([I("foobar")]), s);
+
+ // This also finds a common prefix, but since it's only one byte, it
+ // prefers the multiple literals.
+ let (p, s) = opt(["abba", "akka", "abccba"]);
+ assert_eq!(exact(["abba", "akka", "abccba"]), (p, s));
+
+ let (p, s) = opt(["sam", "samwise"]);
+ assert_eq!((seq([E("sam")]), seq([E("sam"), E("samwise")])), (p, s));
+
+ // The empty string is poisonous, so our seq becomes infinite, even
+ // though all literals are exact.
+ let (p, s) = opt(["foobarfoo", "foo", "", "foozfoo", "foofoo"]);
+ assert!(!p.is_finite());
+ assert!(!s.is_finite());
+
+ // A space is also poisonous, so our seq becomes infinite. But this
+ // only gets triggered when we don't have a completely exact sequence.
+ // When the sequence is exact, spaces are okay, since we presume that
+ // any prefilter will match a space more quickly than the regex engine.
+ // (When the sequence is exact, there's a chance of the prefilter being
+ // used without needing the regex engine at all.)
+ let mut p = seq([E("foobarfoo"), I("foo"), E(" "), E("foofoo")]);
+ p.optimize_for_prefix_by_preference();
+ assert!(!p.is_finite());
+ }
+}
diff --git a/vendor/regex-syntax/src/hir/mod.rs b/vendor/regex-syntax/src/hir/mod.rs
new file mode 100644
index 0000000..ce38ead
--- /dev/null
+++ b/vendor/regex-syntax/src/hir/mod.rs
@@ -0,0 +1,3861 @@
+/*!
+Defines a high-level intermediate (HIR) representation for regular expressions.
+
+The HIR is represented by the [`Hir`] type, and it is principally constructed via
+[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users
+may use the smart constructors defined on `Hir` to build their own by hand. The
+smart constructors simultaneously simplify and "optimize" the HIR, and are also
+the same routines used by translation.
+
+Most regex engines only have an HIR like this, and usually construct it
+directly from the concrete syntax. This crate however first parses the
+concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`,
+as mentioned above. It's done this way to facilitate better error reporting,
+and to have a structured representation of a regex that faithfully represents
+its concrete syntax. Namely, while an `Hir` value can be converted back to an
+equivalent regex pattern string, it is unlikely to look like the original due
+to its simplified structure.
+*/
+
+use core::{char, cmp};
+
+use alloc::{
+ boxed::Box,
+ format,
+ string::{String, ToString},
+ vec,
+ vec::Vec,
+};
+
+use crate::{
+ ast::Span,
+ hir::interval::{Interval, IntervalSet, IntervalSetIter},
+ unicode,
+};
+
+pub use crate::{
+ hir::visitor::{visit, Visitor},
+ unicode::CaseFoldError,
+};
+
+mod interval;
+pub mod literal;
+pub mod print;
+pub mod translate;
+mod visitor;
+
+/// An error that can occur while translating an `Ast` to a `Hir`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Error {
+ /// The kind of error.
+ kind: ErrorKind,
+ /// The original pattern that the translator's Ast was parsed from. Every
+ /// span in an error is a valid range into this string.
+ pattern: String,
+ /// The span of this error, derived from the Ast given to the translator.
+ span: Span,
+}
+
+impl Error {
+ /// Return the type of this error.
+ pub fn kind(&self) -> &ErrorKind {
+ &self.kind
+ }
+
+ /// The original pattern string in which this error occurred.
+ ///
+ /// Every span reported by this error is reported in terms of this string.
+ pub fn pattern(&self) -> &str {
+ &self.pattern
+ }
+
+ /// Return the span at which this error occurred.
+ pub fn span(&self) -> &Span {
+ &self.span
+ }
+}
+
+/// The type of an error that occurred while building an `Hir`.
+///
+/// This error type is marked as `non_exhaustive`. This means that adding a
+/// new variant is not considered a breaking change.
+#[non_exhaustive]
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ErrorKind {
+ /// This error occurs when a Unicode feature is used when Unicode
+ /// support is disabled. For example `(?-u:\pL)` would trigger this error.
+ UnicodeNotAllowed,
+ /// This error occurs when translating a pattern that could match a byte
+ /// sequence that isn't UTF-8 and `utf8` was enabled.
+ InvalidUtf8,
+ /// This error occurs when one uses a non-ASCII byte for a line terminator,
+ /// but where Unicode mode is enabled and UTF-8 mode is disabled.
+ InvalidLineTerminator,
+ /// This occurs when an unrecognized Unicode property name could not
+ /// be found.
+ UnicodePropertyNotFound,
+ /// This occurs when an unrecognized Unicode property value could not
+ /// be found.
+ UnicodePropertyValueNotFound,
+ /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
+ /// `\d`) could not be found. This can occur when the `unicode-perl`
+ /// crate feature is not enabled.
+ UnicodePerlClassNotFound,
+ /// This occurs when the Unicode simple case mapping tables are not
+ /// available, and the regular expression required Unicode aware case
+ /// insensitivity.
+ UnicodeCaseUnavailable,
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for Error {}
+
+impl core::fmt::Display for Error {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ crate::error::Formatter::from(self).fmt(f)
+ }
+}
+
+impl core::fmt::Display for ErrorKind {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ use self::ErrorKind::*;
+
+ let msg = match *self {
+ UnicodeNotAllowed => "Unicode not allowed here",
+ InvalidUtf8 => "pattern can match invalid UTF-8",
+ InvalidLineTerminator => "invalid line terminator, must be ASCII",
+ UnicodePropertyNotFound => "Unicode property not found",
+ UnicodePropertyValueNotFound => "Unicode property value not found",
+ UnicodePerlClassNotFound => {
+ "Unicode-aware Perl class not found \
+ (make sure the unicode-perl feature is enabled)"
+ }
+ UnicodeCaseUnavailable => {
+ "Unicode-aware case insensitivity matching is not available \
+ (make sure the unicode-case feature is enabled)"
+ }
+ };
+ f.write_str(msg)
+ }
+}
+
+/// A high-level intermediate representation (HIR) for a regular expression.
+///
+/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`].
+/// An `HirKind` indicates what kind of regular expression it is (a literal,
+/// a repetition, a look-around assertion, etc.), whereas a `Properties`
+/// describes various facts about the regular expression. For example, whether
+/// it matches UTF-8 or if it matches the empty string.
+///
+/// The HIR of a regular expression represents an intermediate step between
+/// its abstract syntax (a structured description of the concrete syntax) and
+/// an actual regex matcher. The purpose of HIR is to make regular expressions
+/// easier to analyze. In particular, the AST is much more complex than the
+/// HIR. For example, while an AST supports arbitrarily nested character
+/// classes, the HIR will flatten all nested classes into a single set. The HIR
+/// will also "compile away" every flag present in the concrete syntax. For
+/// example, users of HIR expressions never need to worry about case folding;
+/// it is handled automatically by the translator (e.g., by translating
+/// `(?i:A)` to `[aA]`).
+///
+/// The specific type of an HIR expression can be accessed via its `kind`
+/// or `into_kind` methods. This extra level of indirection exists for two
+/// reasons:
+///
+/// 1. Construction of an HIR expression *must* use the constructor methods on
+/// this `Hir` type instead of building the `HirKind` values directly. This
+/// permits construction to enforce invariants like "concatenations always
+/// consist of two or more sub-expressions."
+/// 2. Every HIR expression contains attributes that are defined inductively,
+/// and can be computed cheaply during the construction process. For example,
+/// one such attribute is whether the expression must match at the beginning of
+/// the haystack.
+///
+/// In particular, if you have an `HirKind` value, then there is intentionally
+/// no way to build an `Hir` value from it. You instead need to do case
+/// analysis on the `HirKind` value and build the `Hir` value using its smart
+/// constructors.
+///
+/// # UTF-8
+///
+/// If the HIR was produced by a translator with
+/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled,
+/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty
+/// matches.
+///
+/// For empty matches, those can occur at any position. It is the
+/// responsibility of the regex engine to determine whether empty matches are
+/// permitted between the code units of a single codepoint.
+///
+/// # Stack space
+///
+/// This type defines its own destructor that uses constant stack space and
+/// heap space proportional to the size of the HIR.
+///
+/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
+/// expression pattern string, and uses constant stack space and heap space
+/// proportional to the size of the `Hir`. The regex it prints is guaranteed to
+/// be _semantically_ equivalent to the original concrete syntax, but it may
+/// look very different. (And potentially not practically readable by a human.)
+///
+/// An `Hir`'s `fmt::Debug` implementation currently does not use constant
+/// stack space. The implementation will also suppress some details (such as
+/// the `Properties` inlined into every `Hir` value to make it less noisy).
+#[derive(Clone, Eq, PartialEq)]
+pub struct Hir {
+ /// The underlying HIR kind.
+ kind: HirKind,
+ /// Analysis info about this HIR, computed during construction.
+ props: Properties,
+}
+
+/// Methods for accessing the underlying `HirKind` and `Properties`.
+impl Hir {
+ /// Returns a reference to the underlying HIR kind.
+ pub fn kind(&self) -> &HirKind {
+ &self.kind
+ }
+
+ /// Consumes ownership of this HIR expression and returns its underlying
+ /// `HirKind`.
+ pub fn into_kind(mut self) -> HirKind {
+ core::mem::replace(&mut self.kind, HirKind::Empty)
+ }
+
+ /// Returns the properties computed for this `Hir`.
+ pub fn properties(&self) -> &Properties {
+ &self.props
+ }
+
+ /// Splits this HIR into its constituent parts.
+ ///
+ /// This is useful because `let Hir { kind, props } = hir;` does not work
+ /// because of `Hir`'s custom `Drop` implementation.
+ fn into_parts(mut self) -> (HirKind, Properties) {
+ (
+ core::mem::replace(&mut self.kind, HirKind::Empty),
+ core::mem::replace(&mut self.props, Properties::empty()),
+ )
+ }
+}
+
+/// Smart constructors for HIR values.
+///
+/// These constructors are called "smart" because they do inductive work or
+/// simplifications. For example, calling `Hir::repetition` with a repetition
+/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind
+/// since it is equivalent to an empty regex. Another example is calling
+/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll
+/// just get back the original `expr` since it's precisely equivalent.
+///
+/// Smart constructors enable maintaining invariants about the HIR data type
+/// while also simultaneously keeping the representation as simple as possible.
+impl Hir {
+ /// Returns an empty HIR expression.
+ ///
+ /// An empty HIR expression always matches, including the empty string.
+ #[inline]
+ pub fn empty() -> Hir {
+ let props = Properties::empty();
+ Hir { kind: HirKind::Empty, props }
+ }
+
+ /// Returns an HIR expression that can never match anything. That is,
+ /// the size of the set of strings in the language described by the HIR
+ /// returned is `0`.
+ ///
+ /// This is distinct from [`Hir::empty`] in that the empty string matches
+ /// the HIR returned by `Hir::empty`. That is, the set of strings in the
+/// language described by `Hir::empty` is non-empty.
+ ///
+ /// Note that currently, the HIR returned uses an empty character class to
+ /// indicate that nothing can match. An equivalent expression that cannot
+ /// match is an empty alternation, but all such "fail" expressions are
+ /// normalized (via smart constructors) to empty character classes. This is
+ /// because empty character classes can be spelled in the concrete syntax
+ /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but
+ /// empty alternations cannot.
+ #[inline]
+ pub fn fail() -> Hir {
+ let class = Class::Bytes(ClassBytes::empty());
+ let props = Properties::class(&class);
+ // We can't just call Hir::class here because it defers to Hir::fail
+ // in order to canonicalize the Hir value used to represent "cannot
+ // match."
+ Hir { kind: HirKind::Class(class), props }
+ }
+
+ /// Creates a literal HIR expression.
+ ///
+ /// This accepts anything that can be converted into a `Box<[u8]>`.
+ ///
+ /// Note that there is no mechanism for storing a `char` or a `Box<str>`
+ /// in an HIR. Everything is "just bytes." Whether a `Literal` (or
+ /// any HIR node) matches valid UTF-8 exclusively can be queried via
+ /// [`Properties::is_utf8`].
+ ///
+ /// # Example
+ ///
+ /// This example shows that concatenations of `Literal` HIR values will
+ /// automatically get flattened and combined together. So for example, even
+ /// if you concat multiple `Literal` values that are themselves not valid
+ /// UTF-8, they might add up to valid UTF-8. This also demonstrates just
+ /// how "smart" Hir's smart constructors are.
+ ///
+ /// ```
+ /// use regex_syntax::hir::{Hir, HirKind, Literal};
+ ///
+ /// let literals = vec![
+ /// Hir::literal([0xE2]),
+ /// Hir::literal([0x98]),
+ /// Hir::literal([0x83]),
+ /// ];
+ /// // Each literal, on its own, is invalid UTF-8.
+ /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8()));
+ ///
+ /// let concat = Hir::concat(literals);
+ /// // But the concatenation is valid UTF-8!
+ /// assert!(concat.properties().is_utf8());
+ ///
+ /// // And also notice that the literals have been concatenated into a
+ /// // single `Literal`, to the point where there is no explicit `Concat`!
+ /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
+ /// assert_eq!(&expected, concat.kind());
+ /// ```
+ #[inline]
+ pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
+ let bytes = lit.into();
+ if bytes.is_empty() {
+ return Hir::empty();
+ }
+
+ let lit = Literal(bytes);
+ let props = Properties::literal(&lit);
+ Hir { kind: HirKind::Literal(lit), props }
+ }
+
+ /// Creates a class HIR expression. The class may either be defined over
+ /// ranges of Unicode codepoints or ranges of raw byte values.
+ ///
+ /// Note that an empty class is permitted. An empty class is equivalent to
+ /// `Hir::fail()`.
+ #[inline]
+ pub fn class(class: Class) -> Hir {
+ if class.is_empty() {
+ return Hir::fail();
+ } else if let Some(bytes) = class.literal() {
+ return Hir::literal(bytes);
+ }
+ let props = Properties::class(&class);
+ Hir { kind: HirKind::Class(class), props }
+ }
+
+ /// Creates a look-around assertion HIR expression.
+ #[inline]
+ pub fn look(look: Look) -> Hir {
+ let props = Properties::look(look);
+ Hir { kind: HirKind::Look(look), props }
+ }
+
+ /// Creates a repetition HIR expression.
+ #[inline]
+ pub fn repetition(mut rep: Repetition) -> Hir {
+ // If the sub-expression of a repetition can only match the empty
+ // string, then we force its maximum to be at most 1.
+ if rep.sub.properties().maximum_len() == Some(0) {
+ rep.min = cmp::min(rep.min, 1);
+ rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1));
+ }
+ // The regex 'a{0}' is always equivalent to the empty regex. This is
+ // true even when 'a' is an expression that never matches anything
+ // (like '\P{any}').
+ //
+ // Additionally, the regex 'a{1}' is always equivalent to 'a'.
+ if rep.min == 0 && rep.max == Some(0) {
+ return Hir::empty();
+ } else if rep.min == 1 && rep.max == Some(1) {
+ return *rep.sub;
+ }
+ let props = Properties::repetition(&rep);
+ Hir { kind: HirKind::Repetition(rep), props }
+ }
+
+ /// Creates a capture HIR expression.
+ ///
+ /// Note that there is no explicit HIR value for a non-capturing group.
+ /// Since a non-capturing group only exists to override precedence in the
+ /// concrete syntax and since an HIR already does its own grouping based on
+ /// what is parsed, there is no need to explicitly represent non-capturing
+ /// groups in the HIR.
+ #[inline]
+ pub fn capture(capture: Capture) -> Hir {
+ let props = Properties::capture(&capture);
+ Hir { kind: HirKind::Capture(capture), props }
+ }
+
+ /// Returns the concatenation of the given expressions.
+ ///
+ /// This attempts to flatten and simplify the concatenation as appropriate.
+ ///
+ /// # Example
+ ///
+ /// This shows a simple example of basic flattening of both concatenations
+ /// and literals.
+ ///
+ /// ```
+ /// use regex_syntax::hir::Hir;
+ ///
+ /// let hir = Hir::concat(vec![
+ /// Hir::concat(vec![
+ /// Hir::literal([b'a']),
+ /// Hir::literal([b'b']),
+ /// Hir::literal([b'c']),
+ /// ]),
+ /// Hir::concat(vec![
+ /// Hir::literal([b'x']),
+ /// Hir::literal([b'y']),
+ /// Hir::literal([b'z']),
+ /// ]),
+ /// ]);
+ /// let expected = Hir::literal("abcxyz".as_bytes());
+ /// assert_eq!(expected, hir);
+ /// ```
+ pub fn concat(subs: Vec<Hir>) -> Hir {
+ // We rebuild the concatenation by simplifying it. Would be nice to do
+ // it in place, but that seems a little tricky?
+ let mut new = vec![];
+ // This gobbles up any adjacent literals in a concatenation and smushes
+ // them together. Basically, when we see a literal, we add its bytes
+ // to 'prior_lit', and whenever we see anything else, we first take
+ // any bytes in 'prior_lit' and add it to the 'new' concatenation.
+ let mut prior_lit: Option<Vec<u8>> = None;
+ for sub in subs {
+ let (kind, props) = sub.into_parts();
+ match kind {
+ HirKind::Literal(Literal(bytes)) => {
+ if let Some(ref mut prior_bytes) = prior_lit {
+ prior_bytes.extend_from_slice(&bytes);
+ } else {
+ prior_lit = Some(bytes.to_vec());
+ }
+ }
+ // We also flatten concats that are direct children of another
+ // concat. We only need to do this one level deep since
+ // Hir::concat is the only way to build concatenations, and so
+ // flattening happens inductively.
+ HirKind::Concat(subs2) => {
+ for sub2 in subs2 {
+ let (kind2, props2) = sub2.into_parts();
+ match kind2 {
+ HirKind::Literal(Literal(bytes)) => {
+ if let Some(ref mut prior_bytes) = prior_lit {
+ prior_bytes.extend_from_slice(&bytes);
+ } else {
+ prior_lit = Some(bytes.to_vec());
+ }
+ }
+ kind2 => {
+ if let Some(prior_bytes) = prior_lit.take() {
+ new.push(Hir::literal(prior_bytes));
+ }
+ new.push(Hir { kind: kind2, props: props2 });
+ }
+ }
+ }
+ }
+ // We can just skip empty HIRs.
+ HirKind::Empty => {}
+ kind => {
+ if let Some(prior_bytes) = prior_lit.take() {
+ new.push(Hir::literal(prior_bytes));
+ }
+ new.push(Hir { kind, props });
+ }
+ }
+ }
+ if let Some(prior_bytes) = prior_lit.take() {
+ new.push(Hir::literal(prior_bytes));
+ }
+ if new.is_empty() {
+ return Hir::empty();
+ } else if new.len() == 1 {
+ return new.pop().unwrap();
+ }
+ let props = Properties::concat(&new);
+ Hir { kind: HirKind::Concat(new), props }
+ }
+
+ /// Returns the alternation of the given expressions.
+ ///
+ /// This flattens and simplifies the alternation as appropriate. This may
+ /// include factoring out common prefixes or even rewriting the alternation
+ /// as a character class.
+ ///
+ /// Note that an empty alternation is equivalent to `Hir::fail()`. (It
+ /// is not possible for one to write an empty alternation, or even an
+ /// alternation with a single sub-expression, in the concrete syntax of a
+ /// regex.)
+ ///
+ /// # Example
+ ///
+ /// This is a simple example showing how an alternation might get
+ /// simplified.
+ ///
+ /// ```
+ /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
+ ///
+ /// let hir = Hir::alternation(vec![
+ /// Hir::literal([b'a']),
+ /// Hir::literal([b'b']),
+ /// Hir::literal([b'c']),
+ /// Hir::literal([b'd']),
+ /// Hir::literal([b'e']),
+ /// Hir::literal([b'f']),
+ /// ]);
+ /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([
+ /// ClassUnicodeRange::new('a', 'f'),
+ /// ])));
+ /// assert_eq!(expected, hir);
+ /// ```
+ ///
+ /// And another example showing how common prefixes might get factored
+ /// out.
+ ///
+ /// ```
+ /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
+ ///
+ /// let hir = Hir::alternation(vec![
+ /// Hir::concat(vec![
+ /// Hir::literal("abc".as_bytes()),
+ /// Hir::class(Class::Unicode(ClassUnicode::new([
+ /// ClassUnicodeRange::new('A', 'Z'),
+ /// ]))),
+ /// ]),
+ /// Hir::concat(vec![
+ /// Hir::literal("abc".as_bytes()),
+ /// Hir::class(Class::Unicode(ClassUnicode::new([
+ /// ClassUnicodeRange::new('a', 'z'),
+ /// ]))),
+ /// ]),
+ /// ]);
+ /// let expected = Hir::concat(vec![
+ /// Hir::literal("abc".as_bytes()),
+ /// Hir::alternation(vec![
+ /// Hir::class(Class::Unicode(ClassUnicode::new([
+ /// ClassUnicodeRange::new('A', 'Z'),
+ /// ]))),
+ /// Hir::class(Class::Unicode(ClassUnicode::new([
+ /// ClassUnicodeRange::new('a', 'z'),
+ /// ]))),
+ /// ]),
+ /// ]);
+ /// assert_eq!(expected, hir);
+ /// ```
+ ///
+ /// Note that these sorts of simplifications are not guaranteed.
+ pub fn alternation(subs: Vec<Hir>) -> Hir {
+ // We rebuild the alternation by simplifying it. We proceed similarly
+ // as the concatenation case. But in this case, there's no literal
+ // simplification happening. We're just flattening alternations.
+ let mut new = Vec::with_capacity(subs.len());
+ for sub in subs {
+ let (kind, props) = sub.into_parts();
+ match kind {
+ HirKind::Alternation(subs2) => {
+ new.extend(subs2);
+ }
+ kind => {
+ new.push(Hir { kind, props });
+ }
+ }
+ }
+ if new.is_empty() {
+ return Hir::fail();
+ } else if new.len() == 1 {
+ return new.pop().unwrap();
+ }
+ // Now that it's completely flattened, look for the special case of
+ // 'char1|char2|...|charN' and collapse that into a class. Note that
+ // we look for 'char' first and then bytes. The issue here is that if
+ // we find both non-ASCII codepoints and non-ASCII singleton bytes,
+ // then it isn't actually possible to smush them into a single class.
+ // (Because classes are either "all codepoints" or "all bytes." You
+ // can't have a class that both matches non-ASCII but valid UTF-8 and
+ // invalid UTF-8.) So we look for all chars and then all bytes, and
+ // don't handle anything else.
+ if let Some(singletons) = singleton_chars(&new) {
+ let it = singletons
+ .into_iter()
+ .map(|ch| ClassUnicodeRange { start: ch, end: ch });
+ return Hir::class(Class::Unicode(ClassUnicode::new(it)));
+ }
+ if let Some(singletons) = singleton_bytes(&new) {
+ let it = singletons
+ .into_iter()
+ .map(|b| ClassBytesRange { start: b, end: b });
+ return Hir::class(Class::Bytes(ClassBytes::new(it)));
+ }
+ // Similar to singleton chars, we can also look for alternations of
+ // classes. Those can be smushed into a single class.
+ if let Some(cls) = class_chars(&new) {
+ return Hir::class(cls);
+ }
+ if let Some(cls) = class_bytes(&new) {
+ return Hir::class(cls);
+ }
+ // Factor out a common prefix if we can, which might potentially
+ // simplify the expression and unlock other optimizations downstream.
+ // It also might generally make NFA matching and DFA construction
+ // faster by reducing the scope of branching in the regex.
+ new = match lift_common_prefix(new) {
+ Ok(hir) => return hir,
+ Err(unchanged) => unchanged,
+ };
+ let props = Properties::alternation(&new);
+ Hir { kind: HirKind::Alternation(new), props }
+ }
+
+ /// Returns an HIR expression for `.`.
+ ///
+ /// * [`Dot::AnyChar`] maps to `(?su-R:.)`.
+ /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`.
+ /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`.
+ /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`.
+ /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
+ /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
+ ///
+ /// # Example
+ ///
+ /// Note that this is a convenience routine for constructing the correct
+ /// character class based on the value of `Dot`. There is no explicit "dot"
+ /// HIR value. It is just an abbreviation for a common character class.
+ ///
+ /// ```
+ /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange};
+ ///
+ /// let hir = Hir::dot(Dot::AnyByte);
+ /// let expected = Hir::class(Class::Bytes(ClassBytes::new([
+ /// ClassBytesRange::new(0x00, 0xFF),
+ /// ])));
+ /// assert_eq!(expected, hir);
+ /// ```
+ #[inline]
+ pub fn dot(dot: Dot) -> Hir {
+ match dot {
+ Dot::AnyChar => {
+ let mut cls = ClassUnicode::empty();
+ cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
+ Hir::class(Class::Unicode(cls))
+ }
+ Dot::AnyByte => {
+ let mut cls = ClassBytes::empty();
+ cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
+ Hir::class(Class::Bytes(cls))
+ }
+ Dot::AnyCharExcept(ch) => {
+ let mut cls =
+ ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
+ cls.negate();
+ Hir::class(Class::Unicode(cls))
+ }
+ Dot::AnyCharExceptLF => {
+ let mut cls = ClassUnicode::empty();
+ cls.push(ClassUnicodeRange::new('\0', '\x09'));
+ cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
+ Hir::class(Class::Unicode(cls))
+ }
+ Dot::AnyCharExceptCRLF => {
+ let mut cls = ClassUnicode::empty();
+ cls.push(ClassUnicodeRange::new('\0', '\x09'));
+ cls.push(ClassUnicodeRange::new('\x0B', '\x0C'));
+ cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}'));
+ Hir::class(Class::Unicode(cls))
+ }
+ Dot::AnyByteExcept(byte) => {
+ let mut cls =
+ ClassBytes::new([ClassBytesRange::new(byte, byte)]);
+ cls.negate();
+ Hir::class(Class::Bytes(cls))
+ }
+ Dot::AnyByteExceptLF => {
+ let mut cls = ClassBytes::empty();
+ cls.push(ClassBytesRange::new(b'\0', b'\x09'));
+ cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
+ Hir::class(Class::Bytes(cls))
+ }
+ Dot::AnyByteExceptCRLF => {
+ let mut cls = ClassBytes::empty();
+ cls.push(ClassBytesRange::new(b'\0', b'\x09'));
+ cls.push(ClassBytesRange::new(b'\x0B', b'\x0C'));
+ cls.push(ClassBytesRange::new(b'\x0E', b'\xFF'));
+ Hir::class(Class::Bytes(cls))
+ }
+ }
+ }
+}
+
+/// The underlying kind of an arbitrary [`Hir`] expression.
+///
+/// An `HirKind` is principally useful for doing case analysis on the type
+/// of a regular expression. If you're looking to build new `Hir` values,
+/// then you _must_ use the smart constructors defined on `Hir`, like
+/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does
+/// not expose any way of building an `Hir` directly from an `HirKind`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum HirKind {
+ /// The empty regular expression, which matches everything, including the
+ /// empty string.
+ Empty,
+ /// A literal string that matches exactly these bytes.
+ Literal(Literal),
+ /// A single character class that matches any of the characters in the
+ /// class. A class can either consist of Unicode scalar values as
+ /// characters, or it can use bytes.
+ ///
+ /// A class may be empty. In which case, it matches nothing.
+ Class(Class),
+ /// A look-around assertion. A look-around match always has zero length.
+ Look(Look),
+ /// A repetition operation applied to a sub-expression.
+ Repetition(Repetition),
+ /// A capturing group, which contains a sub-expression.
+ Capture(Capture),
+ /// A concatenation of expressions.
+ ///
+ /// A concatenation matches only if each of its sub-expressions match one
+ /// after the other.
+ ///
+ /// Concatenations are guaranteed by `Hir`'s smart constructors to always
+ /// have at least two sub-expressions.
+ Concat(Vec<Hir>),
+ /// An alternation of expressions.
+ ///
+ /// An alternation matches only if at least one of its sub-expressions
+ /// match. If multiple sub-expressions match, then the leftmost is
+ /// preferred.
+ ///
+ /// Alternations are guaranteed by `Hir`'s smart constructors to always
+ /// have at least two sub-expressions.
+ Alternation(Vec<Hir>),
+}
+
+impl HirKind {
+ /// Returns a slice of this kind's sub-expressions, if any.
+ pub fn subs(&self) -> &[Hir] {
+ use core::slice::from_ref;
+
+ match *self {
+ HirKind::Empty
+ | HirKind::Literal(_)
+ | HirKind::Class(_)
+ | HirKind::Look(_) => &[],
+ HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub),
+ HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub),
+ HirKind::Concat(ref subs) => subs,
+ HirKind::Alternation(ref subs) => subs,
+ }
+ }
+}
+
+impl core::fmt::Debug for Hir {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ self.kind.fmt(f)
+ }
+}
+
+/// Print a display representation of this Hir.
+///
+/// The result of this is a valid regular expression pattern string.
+///
+/// This implementation uses constant stack space and heap space proportional
+/// to the size of the `Hir`.
+impl core::fmt::Display for Hir {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ crate::hir::print::Printer::new().print(self, f)
+ }
+}
+
+/// The high-level intermediate representation of a literal.
+///
+/// A literal corresponds to `0` or more bytes that should be matched
+/// literally. The smart constructors defined on `Hir` will automatically
+/// concatenate adjacent literals into one literal, and will even automatically
+/// replace empty literals with `Hir::empty()`.
+///
+/// Note that despite a literal being represented by a sequence of bytes, its
+/// `Debug` implementation will attempt to print it as a normal string. (That
+/// is, not a sequence of decimal numbers.)
+#[derive(Clone, Eq, PartialEq)]
+pub struct Literal(pub Box<[u8]>);
+
+impl core::fmt::Debug for Literal {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ crate::debug::Bytes(&self.0).fmt(f)
+ }
+}
+
+/// The high-level intermediate representation of a character class.
+///
+/// A character class corresponds to a set of characters. A character is either
+/// defined by a Unicode scalar value or a byte.
+///
+/// A character class, regardless of its character type, is represented by a
+/// sequence of non-overlapping non-adjacent ranges of characters.
+///
+/// There are no guarantees about which class variant is used. Generally
+/// speaking, the Unicode variant is used whenever a class needs to contain
+/// non-ASCII Unicode scalar values. But the Unicode variant can be used even
+/// when Unicode mode is disabled. For example, at the time of writing, the
+/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class
+/// `[a\u00A0]` due to optimizations.
+///
+/// Note that `Bytes` variant may be produced even when it exclusively matches
+/// valid UTF-8. This is because a `Bytes` variant represents an intention by
+/// the author of the regular expression to disable Unicode mode, which in turn
+/// impacts the semantics of case insensitive matching. For example, `(?i)k`
+/// and `(?i-u)k` will not match the same set of strings.
+#[derive(Clone, Eq, PartialEq)]
+pub enum Class {
+ /// A set of characters represented by Unicode scalar values.
+ Unicode(ClassUnicode),
+ /// A set of characters represented by arbitrary bytes (one byte per
+ /// character).
+ Bytes(ClassBytes),
+}
+
+impl Class {
+ /// Apply Unicode simple case folding to this character class, in place.
+ /// The character class will be expanded to include all simple case folded
+ /// character variants.
+ ///
+ /// If this is a byte oriented character class, then this will be limited
+ /// to the ASCII ranges `A-Z` and `a-z`.
+ ///
+ /// # Panics
+ ///
+ /// This routine panics when the case mapping data necessary for this
+ /// routine to complete is unavailable. This occurs when the `unicode-case`
+ /// feature is not enabled and the underlying class is Unicode oriented.
+ ///
+ /// Callers should prefer using `try_case_fold_simple` instead, which will
+ /// return an error instead of panicking.
+ pub fn case_fold_simple(&mut self) {
+ match *self {
+ Class::Unicode(ref mut x) => x.case_fold_simple(),
+ Class::Bytes(ref mut x) => x.case_fold_simple(),
+ }
+ }
+
+ /// Apply Unicode simple case folding to this character class, in place.
+ /// The character class will be expanded to include all simple case folded
+ /// character variants.
+ ///
+ /// If this is a byte oriented character class, then this will be limited
+ /// to the ASCII ranges `A-Z` and `a-z`.
+ ///
+ /// # Error
+ ///
+ /// This routine returns an error when the case mapping data necessary
+ /// for this routine to complete is unavailable. This occurs when the
+ /// `unicode-case` feature is not enabled and the underlying class is
+ /// Unicode oriented.
+ pub fn try_case_fold_simple(
+ &mut self,
+ ) -> core::result::Result<(), CaseFoldError> {
+ match *self {
+ Class::Unicode(ref mut x) => x.try_case_fold_simple()?,
+ Class::Bytes(ref mut x) => x.case_fold_simple(),
+ }
+ Ok(())
+ }
+
+ /// Negate this character class in place.
+ ///
+ /// After completion, this character class will contain precisely the
+ /// characters that weren't previously in the class.
+ pub fn negate(&mut self) {
+ match *self {
+ Class::Unicode(ref mut x) => x.negate(),
+ Class::Bytes(ref mut x) => x.negate(),
+ }
+ }
+
+    /// Returns true if and only if this character class will only ever match
+    /// valid UTF-8.
+    ///
+    /// A character class can match invalid UTF-8 only when both of the
+    /// following conditions are met:
+    ///
+    /// 1. The translator was configured to permit generating an expression
+    ///    that can match invalid UTF-8. (By default, this is disabled.)
+    /// 2. Unicode mode (via the `u` flag) was disabled either in the
+    ///    concrete syntax or in the parser builder. By default, Unicode mode
+    ///    is enabled.
+    pub fn is_utf8(&self) -> bool {
+        // A Unicode class is a set of scalar values, whose UTF-8 encodings
+        // are always valid. A byte class is UTF-8 safe only when it is
+        // entirely ASCII.
+        match self {
+            Class::Unicode(_) => true,
+            Class::Bytes(x) => x.is_ascii(),
+        }
+    }
+
+    /// Returns the length, in bytes, of the smallest string matched by this
+    /// character class.
+    ///
+    /// For non-empty byte oriented classes, this always returns `1`. For
+    /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
+    /// `4`. For empty classes, `None` is returned. It is impossible for `0`
+    /// to be returned.
+    ///
+    /// # Example
+    ///
+    /// This example shows some examples of regexes and their corresponding
+    /// minimum length, if any.
+    ///
+    /// ```
+    /// use regex_syntax::{hir::Properties, parse};
+    ///
+    /// // The empty string has a min length of 0.
+    /// let hir = parse(r"")?;
+    /// assert_eq!(Some(0), hir.properties().minimum_len());
+    /// // As do other types of regexes that only match the empty string.
+    /// let hir = parse(r"^$\b\B")?;
+    /// assert_eq!(Some(0), hir.properties().minimum_len());
+    /// // A regex that can match the empty string but match more is still 0.
+    /// let hir = parse(r"a*")?;
+    /// assert_eq!(Some(0), hir.properties().minimum_len());
+    /// // A regex that matches nothing has no minimum defined.
+    /// let hir = parse(r"[a&&b]")?;
+    /// assert_eq!(None, hir.properties().minimum_len());
+    /// // Character classes usually have a minimum length of 1.
+    /// let hir = parse(r"\w")?;
+    /// assert_eq!(Some(1), hir.properties().minimum_len());
+    /// // But sometimes Unicode classes might be bigger!
+    /// let hir = parse(r"\p{Cyrillic}")?;
+    /// assert_eq!(Some(2), hir.properties().minimum_len());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn minimum_len(&self) -> Option<usize> {
+        match self {
+            Class::Unicode(x) => x.minimum_len(),
+            Class::Bytes(x) => x.minimum_len(),
+        }
+    }
+
+    /// Returns the length, in bytes, of the longest string matched by this
+    /// character class.
+    ///
+    /// For non-empty byte oriented classes, this always returns `1`. For
+    /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
+    /// `4`. For empty classes, `None` is returned. It is impossible for `0`
+    /// to be returned.
+    ///
+    /// # Example
+    ///
+    /// This example shows some examples of regexes and their corresponding
+    /// maximum length, if any.
+    ///
+    /// ```
+    /// use regex_syntax::{hir::Properties, parse};
+    ///
+    /// // The empty string has a max length of 0.
+    /// let hir = parse(r"")?;
+    /// assert_eq!(Some(0), hir.properties().maximum_len());
+    /// // As do other types of regexes that only match the empty string.
+    /// let hir = parse(r"^$\b\B")?;
+    /// assert_eq!(Some(0), hir.properties().maximum_len());
+    /// // A regex that matches nothing has no maximum defined.
+    /// let hir = parse(r"[a&&b]")?;
+    /// assert_eq!(None, hir.properties().maximum_len());
+    /// // Bounded repeats work as you expect.
+    /// let hir = parse(r"x{2,10}")?;
+    /// assert_eq!(Some(10), hir.properties().maximum_len());
+    /// // An unbounded repeat means there is no maximum.
+    /// let hir = parse(r"x{2,}")?;
+    /// assert_eq!(None, hir.properties().maximum_len());
+    /// // With Unicode enabled, \w can match up to 4 bytes!
+    /// let hir = parse(r"\w")?;
+    /// assert_eq!(Some(4), hir.properties().maximum_len());
+    /// // Without Unicode enabled, \w matches at most 1 byte.
+    /// let hir = parse(r"(?-u)\w")?;
+    /// assert_eq!(Some(1), hir.properties().maximum_len());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn maximum_len(&self) -> Option<usize> {
+        match self {
+            Class::Unicode(x) => x.maximum_len(),
+            Class::Bytes(x) => x.maximum_len(),
+        }
+    }
+
+    /// Returns true if and only if this character class is empty. That is,
+    /// it has no elements.
+    ///
+    /// An empty character class can never match anything, including an
+    /// empty string.
+    pub fn is_empty(&self) -> bool {
+        match self {
+            Class::Unicode(x) => x.ranges().is_empty(),
+            Class::Bytes(x) => x.ranges().is_empty(),
+        }
+    }
+
+    /// If this class consists of exactly one element (whether a codepoint
+    /// or a byte), then return it as a literal byte string.
+    ///
+    /// If this class is empty or contains more than one element, then
+    /// `None` is returned.
+    pub fn literal(&self) -> Option<Vec<u8>> {
+        match self {
+            Class::Unicode(x) => x.literal(),
+            Class::Bytes(x) => x.literal(),
+        }
+    }
+}
+
+impl core::fmt::Debug for Class {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        use crate::debug::Byte;
+
+        // Render the class as a set of inclusive ranges. Byte ranges go
+        // through the `Byte` wrapper so they print in a readable form.
+        let mut set = f.debug_set();
+        match self {
+            Class::Unicode(cls) => {
+                for r in cls.ranges() {
+                    set.entry(&(r.start..=r.end));
+                }
+            }
+            Class::Bytes(cls) => {
+                for r in cls.ranges() {
+                    set.entry(&(Byte(r.start)..=Byte(r.end)));
+                }
+            }
+        }
+        set.finish()
+    }
+}
+
+/// A set of characters represented by Unicode scalar values.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassUnicode {
+ // The underlying interval set. `IntervalSet` maintains the canonical
+ // ordering invariant (sorted, non-overlapping, non-adjacent ranges)
+ // described in the `interval` module.
+ set: IntervalSet<ClassUnicodeRange>,
+}
+
+impl ClassUnicode {
+ /// Create a new class from a sequence of ranges.
+ ///
+ /// The given ranges do not need to be in any specific order, and ranges
+ /// may overlap. Ranges will automatically be sorted into a canonical
+ /// non-overlapping order.
+ pub fn new<I>(ranges: I) -> ClassUnicode
+ where
+ I: IntoIterator<Item = ClassUnicodeRange>,
+ {
+ ClassUnicode { set: IntervalSet::new(ranges) }
+ }
+
+ /// Create a new class with no ranges.
+ ///
+ /// An empty class matches nothing. That is, it is equivalent to
+ /// [`Hir::fail`].
+ pub fn empty() -> ClassUnicode {
+ ClassUnicode::new(vec![])
+ }
+
+ /// Add a new range to this set.
+ pub fn push(&mut self, range: ClassUnicodeRange) {
+ self.set.push(range);
+ }
+
+ /// Return an iterator over all ranges in this class.
+ ///
+ /// The iterator yields ranges in ascending order.
+ pub fn iter(&self) -> ClassUnicodeIter<'_> {
+ ClassUnicodeIter(self.set.iter())
+ }
+
+ /// Return the underlying ranges as a slice.
+ pub fn ranges(&self) -> &[ClassUnicodeRange] {
+ self.set.intervals()
+ }
+
+ /// Expand this character class such that it contains all case folded
+ /// characters, according to Unicode's "simple" mapping. For example, if
+ /// this class consists of the range `a-z`, then applying case folding will
+ /// result in the class containing both the ranges `a-z` and `A-Z`.
+ ///
+ /// # Panics
+ ///
+ /// This routine panics when the case mapping data necessary for this
+ /// routine to complete is unavailable. This occurs when the `unicode-case`
+ /// feature is not enabled.
+ ///
+ /// Callers should prefer using `try_case_fold_simple` instead, which will
+ /// return an error instead of panicking.
+ pub fn case_fold_simple(&mut self) {
+ self.set
+ .case_fold_simple()
+ .expect("unicode-case feature must be enabled");
+ }
+
+ /// Expand this character class such that it contains all case folded
+ /// characters, according to Unicode's "simple" mapping. For example, if
+ /// this class consists of the range `a-z`, then applying case folding will
+ /// result in the class containing both the ranges `a-z` and `A-Z`.
+ ///
+ /// # Error
+ ///
+ /// This routine returns an error when the case mapping data necessary
+ /// for this routine to complete is unavailable. This occurs when the
+ /// `unicode-case` feature is not enabled.
+ pub fn try_case_fold_simple(
+ &mut self,
+ ) -> core::result::Result<(), CaseFoldError> {
+ self.set.case_fold_simple()
+ }
+
+ /// Negate this character class.
+ ///
+ /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
+ /// set, then it will not be in this set after negation.
+ pub fn negate(&mut self) {
+ self.set.negate();
+ }
+
+ /// Union this character class with the given character class, in place.
+ pub fn union(&mut self, other: &ClassUnicode) {
+ self.set.union(&other.set);
+ }
+
+ /// Intersect this character class with the given character class, in
+ /// place.
+ pub fn intersect(&mut self, other: &ClassUnicode) {
+ self.set.intersect(&other.set);
+ }
+
+ /// Subtract the given character class from this character class, in place.
+ pub fn difference(&mut self, other: &ClassUnicode) {
+ self.set.difference(&other.set);
+ }
+
+ /// Compute the symmetric difference of the given character classes, in
+ /// place.
+ ///
+ /// This computes the symmetric difference of two character classes. This
+ /// removes all elements in this class that are also in the given class,
+ /// but also adds all elements from the given class that aren't in this
+ /// class. That is, the class will contain all elements in either class,
+ /// but will not contain any elements that are in both classes.
+ pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
+ self.set.symmetric_difference(&other.set);
+ }
+
+ /// Returns true if and only if this character class will either match
+ /// nothing or only ASCII bytes. Stated differently, this returns false
+ /// if and only if this class contains a non-ASCII codepoint.
+ pub fn is_ascii(&self) -> bool {
+ // Ranges are kept in canonical sorted order, so the last range holds
+ // the largest codepoint in the class; checking its end suffices.
+ self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
+ }
+
+ /// Returns the length, in bytes, of the smallest string matched by this
+ /// character class.
+ ///
+ /// Returns `None` when the class is empty.
+ pub fn minimum_len(&self) -> Option<usize> {
+ let first = self.ranges().get(0)?;
+ // Correct because `len_utf8` is non-decreasing: c1 <= c2 implies
+ // c1.len_utf8() <= c2.len_utf8(). So the start of the first (sorted)
+ // range has the smallest UTF-8 length in the class.
+ Some(first.start.len_utf8())
+ }
+
+ /// Returns the length, in bytes, of the longest string matched by this
+ /// character class.
+ ///
+ /// Returns `None` when the class is empty.
+ pub fn maximum_len(&self) -> Option<usize> {
+ let last = self.ranges().last()?;
+ // Correct because `len_utf8` is non-decreasing: c1 <= c2 implies
+ // c1.len_utf8() <= c2.len_utf8(). So the end of the last (sorted)
+ // range has the largest UTF-8 length in the class.
+ Some(last.end.len_utf8())
+ }
+
+ /// If this class consists of exactly one codepoint, then return it as
+ /// a literal byte string.
+ ///
+ /// If this class is empty or contains more than one codepoint, then `None`
+ /// is returned.
+ pub fn literal(&self) -> Option<Vec<u8>> {
+ let rs = self.ranges();
+ if rs.len() == 1 && rs[0].start == rs[0].end {
+ // Encode the single codepoint as UTF-8 and return its bytes.
+ Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes())
+ } else {
+ None
+ }
+ }
+
+ /// If this class consists of only ASCII ranges, then return its
+ /// corresponding and equivalent byte class.
+ pub fn to_byte_class(&self) -> Option<ClassBytes> {
+ if !self.is_ascii() {
+ return None;
+ }
+ Some(ClassBytes::new(self.ranges().iter().map(|r| {
+ // Since we are guaranteed that our codepoint range is ASCII, the
+ // 'u8::try_from' calls below are guaranteed to be correct.
+ ClassBytesRange {
+ start: u8::try_from(r.start).unwrap(),
+ end: u8::try_from(r.end).unwrap(),
+ }
+ })))
+ }
+}
+
+/// An iterator over all ranges in a Unicode character class.
+///
+/// The lifetime `'a` refers to the lifetime of the underlying class.
+#[derive(Debug)]
+// Newtype around the generic interval set iterator.
+pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
+
+impl<'a> Iterator for ClassUnicodeIter<'a> {
+    type Item = &'a ClassUnicodeRange;
+
+    /// Yields ranges in ascending order by delegating to the underlying
+    /// interval set iterator.
+    fn next(&mut self) -> Option<Self::Item> {
+        self.0.next()
+    }
+}
+
+/// A single range of characters represented by Unicode scalar values.
+///
+/// The range is closed. That is, the start and end of the range are included
+/// in the range.
+#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
+pub struct ClassUnicodeRange {
+ // Invariant: `start <= end`, established by `ClassUnicodeRange::new`.
+ start: char,
+ end: char,
+}
+
+impl core::fmt::Debug for ClassUnicodeRange {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        // Printable characters are rendered literally; whitespace and
+        // control characters are rendered as hex codepoints.
+        let pretty = |c: char| {
+            if c.is_whitespace() || c.is_control() {
+                format!("0x{:X}", u32::from(c))
+            } else {
+                c.to_string()
+            }
+        };
+        f.debug_struct("ClassUnicodeRange")
+            .field("start", &pretty(self.start))
+            .field("end", &pretty(self.end))
+            .finish()
+    }
+}
+
+impl Interval for ClassUnicodeRange {
+ type Bound = char;
+
+ #[inline]
+ fn lower(&self) -> char {
+ self.start
+ }
+ #[inline]
+ fn upper(&self) -> char {
+ self.end
+ }
+ #[inline]
+ fn set_lower(&mut self, bound: char) {
+ self.start = bound;
+ }
+ #[inline]
+ fn set_upper(&mut self, bound: char) {
+ self.end = bound;
+ }
+
+ /// Apply simple case folding to this Unicode scalar value range.
+ ///
+ /// Additional ranges are appended to the given vector. Canonical ordering
+ /// is *not* maintained in the given vector.
+ fn case_fold_simple(
+ &self,
+ ranges: &mut Vec<ClassUnicodeRange>,
+ ) -> Result<(), unicode::CaseFoldError> {
+ // Building the folder fails when case mapping data is unavailable,
+ // i.e., when the `unicode-case` feature is not enabled.
+ let mut folder = unicode::SimpleCaseFolder::new()?;
+ if !folder.overlaps(self.start, self.end) {
+ return Ok(());
+ }
+ let (start, end) = (u32::from(self.start), u32::from(self.end));
+ // `char::from_u32` returns `None` for the surrogate range, so those
+ // values are skipped rather than folded.
+ for cp in (start..=end).filter_map(char::from_u32) {
+ for &cp_folded in folder.mapping(cp) {
+ // Each folded codepoint is appended as a singleton range.
+ ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
+ }
+ }
+ Ok(())
+ }
+}
+
+impl ClassUnicodeRange {
+    /// Create a new Unicode scalar value range for a character class.
+    ///
+    /// The returned range is always in a canonical form. That is, the range
+    /// returned always satisfies the invariant that `start <= end`.
+    pub fn new(start: char, end: char) -> ClassUnicodeRange {
+        ClassUnicodeRange::create(start, end)
+    }
+
+    /// Return the start of this range.
+    ///
+    /// The start of a range is always less than or equal to the end of the
+    /// range.
+    pub fn start(&self) -> char {
+        self.start
+    }
+
+    /// Return the end of this range.
+    ///
+    /// The end of a range is always greater than or equal to the start of
+    /// the range.
+    pub fn end(&self) -> char {
+        self.end
+    }
+
+    /// Returns the number of codepoints in this range.
+    pub fn len(&self) -> usize {
+        let count = u32::from(self.end) - u32::from(self.start) + 1;
+        // This is likely to panic on 16-bit targets, since a usize can only
+        // fit 2^16 values there. It's not clear what to do about that, other
+        // than returning an error when building a Unicode class containing a
+        // range whose length overflows usize. (Which, to be honest, is
+        // probably quite common on 16-bit targets. For example, it would
+        // imply that '.' and '\p{any}' are impossible to build.)
+        usize::try_from(count).expect("char class len fits in usize")
+    }
+}
+
+/// A set of characters represented by arbitrary bytes.
+///
+/// Each byte corresponds to one character.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassBytes {
+ // The underlying interval set. `IntervalSet` maintains the canonical
+ // ordering invariant (sorted, non-overlapping, non-adjacent ranges)
+ // described in the `interval` module.
+ set: IntervalSet<ClassBytesRange>,
+}
+
+impl ClassBytes {
+ /// Create a new class from a sequence of ranges.
+ ///
+ /// The given ranges do not need to be in any specific order, and ranges
+ /// may overlap. Ranges will automatically be sorted into a canonical
+ /// non-overlapping order.
+ pub fn new<I>(ranges: I) -> ClassBytes
+ where
+ I: IntoIterator<Item = ClassBytesRange>,
+ {
+ ClassBytes { set: IntervalSet::new(ranges) }
+ }
+
+ /// Create a new class with no ranges.
+ ///
+ /// An empty class matches nothing. That is, it is equivalent to
+ /// [`Hir::fail`].
+ pub fn empty() -> ClassBytes {
+ ClassBytes::new(vec![])
+ }
+
+ /// Add a new range to this set.
+ pub fn push(&mut self, range: ClassBytesRange) {
+ self.set.push(range);
+ }
+
+ /// Return an iterator over all ranges in this class.
+ ///
+ /// The iterator yields ranges in ascending order.
+ pub fn iter(&self) -> ClassBytesIter<'_> {
+ ClassBytesIter(self.set.iter())
+ }
+
+ /// Return the underlying ranges as a slice.
+ pub fn ranges(&self) -> &[ClassBytesRange] {
+ self.set.intervals()
+ }
+
+ /// Expand this character class such that it contains all case folded
+ /// characters. For example, if this class consists of the range `a-z`,
+ /// then applying case folding will result in the class containing both the
+ /// ranges `a-z` and `A-Z`.
+ ///
+ /// Note that this only applies ASCII case folding, which is limited to the
+ /// characters `a-z` and `A-Z`.
+ pub fn case_fold_simple(&mut self) {
+ // ASCII case folding needs no Unicode tables, so it cannot error.
+ self.set.case_fold_simple().expect("ASCII case folding never fails");
+ }
+
+ /// Negate this byte class.
+ ///
+ /// For all `b` where `b` is any byte, if `b` was in this set, then it
+ /// will not be in this set after negation.
+ pub fn negate(&mut self) {
+ self.set.negate();
+ }
+
+ /// Union this byte class with the given byte class, in place.
+ pub fn union(&mut self, other: &ClassBytes) {
+ self.set.union(&other.set);
+ }
+
+ /// Intersect this byte class with the given byte class, in place.
+ pub fn intersect(&mut self, other: &ClassBytes) {
+ self.set.intersect(&other.set);
+ }
+
+ /// Subtract the given byte class from this byte class, in place.
+ pub fn difference(&mut self, other: &ClassBytes) {
+ self.set.difference(&other.set);
+ }
+
+ /// Compute the symmetric difference of the given byte classes, in place.
+ ///
+ /// This computes the symmetric difference of two byte classes. This
+ /// removes all elements in this class that are also in the given class,
+ /// but also adds all elements from the given class that aren't in this
+ /// class. That is, the class will contain all elements in either class,
+ /// but will not contain any elements that are in both classes.
+ pub fn symmetric_difference(&mut self, other: &ClassBytes) {
+ self.set.symmetric_difference(&other.set);
+ }
+
+ /// Returns true if and only if this character class will either match
+ /// nothing or only ASCII bytes. Stated differently, this returns false
+ /// if and only if this class contains a non-ASCII byte.
+ pub fn is_ascii(&self) -> bool {
+ // Ranges are kept in canonical sorted order, so the last range holds
+ // the largest byte in the class; checking its end suffices.
+ self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
+ }
+
+ /// Returns the length, in bytes, of the smallest string matched by this
+ /// character class.
+ ///
+ /// Returns `None` when the class is empty.
+ pub fn minimum_len(&self) -> Option<usize> {
+ if self.ranges().is_empty() {
+ None
+ } else {
+ Some(1)
+ }
+ }
+
+ /// Returns the length, in bytes, of the longest string matched by this
+ /// character class.
+ ///
+ /// Returns `None` when the class is empty.
+ pub fn maximum_len(&self) -> Option<usize> {
+ if self.ranges().is_empty() {
+ None
+ } else {
+ Some(1)
+ }
+ }
+
+ /// If this class consists of exactly one byte, then return it as
+ /// a literal byte string.
+ ///
+ /// If this class is empty or contains more than one byte, then `None`
+ /// is returned.
+ pub fn literal(&self) -> Option<Vec<u8>> {
+ let rs = self.ranges();
+ if rs.len() == 1 && rs[0].start == rs[0].end {
+ Some(vec![rs[0].start])
+ } else {
+ None
+ }
+ }
+
+ /// If this class consists of only ASCII ranges, then return its
+ /// corresponding and equivalent Unicode class.
+ pub fn to_unicode_class(&self) -> Option<ClassUnicode> {
+ if !self.is_ascii() {
+ return None;
+ }
+ Some(ClassUnicode::new(self.ranges().iter().map(|r| {
+ // Since we are guaranteed that our byte range is ASCII, the
+ // 'char::from' calls below are correct and will not erroneously
+ // convert a raw byte value into its corresponding codepoint.
+ ClassUnicodeRange {
+ start: char::from(r.start),
+ end: char::from(r.end),
+ }
+ })))
+ }
+}
+
+/// An iterator over all ranges in a byte character class.
+///
+/// The lifetime `'a` refers to the lifetime of the underlying class.
+#[derive(Debug)]
+// Newtype around the generic interval set iterator.
+pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
+
+impl<'a> Iterator for ClassBytesIter<'a> {
+    type Item = &'a ClassBytesRange;
+
+    /// Yields ranges in ascending order by delegating to the underlying
+    /// interval set iterator.
+    fn next(&mut self) -> Option<Self::Item> {
+        self.0.next()
+    }
+}
+
+/// A single range of characters represented by arbitrary bytes.
+///
+/// The range is closed. That is, the start and end of the range are included
+/// in the range.
+#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
+pub struct ClassBytesRange {
+ // Invariant: `start <= end`, established by `ClassBytesRange::new`.
+ start: u8,
+ end: u8,
+}
+
+impl Interval for ClassBytesRange {
+ type Bound = u8;
+
+ #[inline]
+ fn lower(&self) -> u8 {
+ self.start
+ }
+ #[inline]
+ fn upper(&self) -> u8 {
+ self.end
+ }
+ #[inline]
+ fn set_lower(&mut self, bound: u8) {
+ self.start = bound;
+ }
+ #[inline]
+ fn set_upper(&mut self, bound: u8) {
+ self.end = bound;
+ }
+
+ /// Apply simple case folding to this byte range. Only ASCII case mappings
+ /// (for a-z) are applied.
+ ///
+ /// Additional ranges are appended to the given vector. Canonical ordering
+ /// is *not* maintained in the given vector.
+ fn case_fold_simple(
+ &self,
+ ranges: &mut Vec<ClassBytesRange>,
+ ) -> Result<(), unicode::CaseFoldError> {
+ if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
+ let lower = cmp::max(self.start, b'a');
+ let upper = cmp::min(self.end, b'z');
+ // 32 == b'a' - b'A' is the ASCII case distance; subtracting it
+ // maps the overlapping a-z span onto A-Z.
+ ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
+ }
+ if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
+ let lower = cmp::max(self.start, b'A');
+ let upper = cmp::min(self.end, b'Z');
+ // Adding the ASCII case distance maps the A-Z span onto a-z.
+ ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
+ }
+ Ok(())
+ }
+}
+
+impl ClassBytesRange {
+ /// Create a new byte range for a character class.
+ ///
+ /// The returned range is always in a canonical form. That is, the range
+ /// returned always satisfies the invariant that `start <= end`.
+ pub fn new(start: u8, end: u8) -> ClassBytesRange {
+ ClassBytesRange::create(start, end)
+ }
+
+ /// Return the start of this range.
+ ///
+ /// The start of a range is always less than or equal to the end of the
+ /// range.
+ pub fn start(&self) -> u8 {
+ self.start
+ }
+
+ /// Return the end of this range.
+ ///
+ /// The end of a range is always greater than or equal to the start of the
+ /// range.
+ pub fn end(&self) -> u8 {
+ self.end
+ }
+
+ /// Returns the number of bytes in this range.
+ pub fn len(&self) -> usize {
+ // `start <= end` holds by construction, so checked_sub cannot fail;
+ // the difference is at most 255, so checked_add(1) cannot overflow.
+ // The unwraps panic loudly if that invariant is ever violated.
+ usize::from(self.end.checked_sub(self.start).unwrap())
+ .checked_add(1)
+ .unwrap()
+ }
+}
+
+impl core::fmt::Debug for ClassBytesRange {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        // `Byte` renders each byte in a human readable form.
+        use crate::debug::Byte;
+
+        f.debug_struct("ClassBytesRange")
+            .field("start", &Byte(self.start))
+            .field("end", &Byte(self.end))
+            .finish()
+    }
+}
+
+/// The high-level intermediate representation for a look-around assertion.
+///
+/// An assertion match is always zero-length. Also called an "empty match."
+///
+/// Each variant is assigned a distinct bit value (`1 << n`), which is what
+/// [`Look::as_repr`] and [`Look::from_repr`] convert to and from.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum Look {
+ /// Match the beginning of text. Specifically, this matches at the starting
+ /// position of the input.
+ Start = 1 << 0,
+ /// Match the end of text. Specifically, this matches at the ending
+ /// position of the input.
+ End = 1 << 1,
+ /// Match the beginning of a line or the beginning of text. Specifically,
+ /// this matches at the starting position of the input, or at the position
+ /// immediately following a `\n` character.
+ StartLF = 1 << 2,
+ /// Match the end of a line or the end of text. Specifically, this matches
+ /// at the end position of the input, or at the position immediately
+ /// preceding a `\n` character.
+ EndLF = 1 << 3,
+ /// Match the beginning of a line or the beginning of text. Specifically,
+ /// this matches at the starting position of the input, or at the position
+ /// immediately following either a `\r` or `\n` character, but never after
+ /// a `\r` when a `\n` follows.
+ StartCRLF = 1 << 4,
+ /// Match the end of a line or the end of text. Specifically, this matches
+ /// at the end position of the input, or at the position immediately
+ /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
+ /// precedes it.
+ EndCRLF = 1 << 5,
+ /// Match an ASCII-only word boundary. That is, this matches a position
+ /// where the left adjacent character and right adjacent character
+ /// correspond to a word and non-word or a non-word and word character.
+ WordAscii = 1 << 6,
+ /// Match an ASCII-only negation of a word boundary.
+ WordAsciiNegate = 1 << 7,
+ /// Match a Unicode-aware word boundary. That is, this matches a position
+ /// where the left adjacent character and right adjacent character
+ /// correspond to a word and non-word or a non-word and word character.
+ WordUnicode = 1 << 8,
+ /// Match a Unicode-aware negation of a word boundary.
+ WordUnicodeNegate = 1 << 9,
+ /// Match the start of an ASCII-only word boundary. That is, this matches a
+ /// position at either the beginning of the haystack or where the previous
+ /// character is not a word character and the following character is a word
+ /// character.
+ WordStartAscii = 1 << 10,
+ /// Match the end of an ASCII-only word boundary. That is, this matches
+ /// a position at either the end of the haystack or where the previous
+ /// character is a word character and the following character is not a word
+ /// character.
+ WordEndAscii = 1 << 11,
+ /// Match the start of a Unicode word boundary. That is, this matches a
+ /// position at either the beginning of the haystack or where the previous
+ /// character is not a word character and the following character is a word
+ /// character.
+ WordStartUnicode = 1 << 12,
+ /// Match the end of a Unicode word boundary. That is, this matches a
+ /// position at either the end of the haystack or where the previous
+ /// character is a word character and the following character is not a word
+ /// character.
+ WordEndUnicode = 1 << 13,
+ /// Match the start half of an ASCII-only word boundary. That is, this
+ /// matches a position at either the beginning of the haystack or where the
+ /// previous character is not a word character.
+ WordStartHalfAscii = 1 << 14,
+ /// Match the end half of an ASCII-only word boundary. That is, this
+ /// matches a position at either the end of the haystack or where the
+ /// following character is not a word character.
+ WordEndHalfAscii = 1 << 15,
+ /// Match the start half of a Unicode word boundary. That is, this matches
+ /// a position at either the beginning of the haystack or where the
+ /// previous character is not a word character.
+ WordStartHalfUnicode = 1 << 16,
+ /// Match the end half of a Unicode word boundary. That is, this matches
+ /// a position at either the end of the haystack or where the following
+ /// character is not a word character.
+ WordEndHalfUnicode = 1 << 17,
+}
+
+impl Look {
+ /// Flip the look-around assertion to its equivalent for reverse searches.
+ /// For example, `StartLF` gets translated to `EndLF`.
+ ///
+ /// Some assertions, such as `WordUnicode`, remain the same since they
+ /// match the same positions regardless of the direction of the search.
+ #[inline]
+ pub const fn reversed(self) -> Look {
+ match self {
+ Look::Start => Look::End,
+ Look::End => Look::Start,
+ Look::StartLF => Look::EndLF,
+ Look::EndLF => Look::StartLF,
+ Look::StartCRLF => Look::EndCRLF,
+ Look::EndCRLF => Look::StartCRLF,
+ // Word boundaries (and their negations) are symmetric, so they
+ // map to themselves.
+ Look::WordAscii => Look::WordAscii,
+ Look::WordAsciiNegate => Look::WordAsciiNegate,
+ Look::WordUnicode => Look::WordUnicode,
+ Look::WordUnicodeNegate => Look::WordUnicodeNegate,
+ Look::WordStartAscii => Look::WordEndAscii,
+ Look::WordEndAscii => Look::WordStartAscii,
+ Look::WordStartUnicode => Look::WordEndUnicode,
+ Look::WordEndUnicode => Look::WordStartUnicode,
+ Look::WordStartHalfAscii => Look::WordEndHalfAscii,
+ Look::WordEndHalfAscii => Look::WordStartHalfAscii,
+ Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
+ Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
+ }
+ }
+
+ /// Return the underlying representation of this look-around enumeration
+ /// as an integer. Giving the return value to the [`Look::from_repr`]
+ /// constructor is guaranteed to return the same look-around variant that
+ /// one started with within a semver compatible release of this crate.
+ #[inline]
+ pub const fn as_repr(self) -> u32 {
+ // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
+ // actual int.
+ self as u32
+ }
+
+ /// Given the underlying representation of a `Look` value, return the
+ /// corresponding `Look` value if the representation is valid. Otherwise
+ /// `None` is returned.
+ #[inline]
+ pub const fn from_repr(repr: u32) -> Option<Look> {
+ // Each bit pattern below is exactly the `1 << n` discriminant assigned
+ // to the corresponding variant in the `Look` enum.
+ match repr {
+ 0b00_0000_0000_0000_0001 => Some(Look::Start),
+ 0b00_0000_0000_0000_0010 => Some(Look::End),
+ 0b00_0000_0000_0000_0100 => Some(Look::StartLF),
+ 0b00_0000_0000_0000_1000 => Some(Look::EndLF),
+ 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
+ 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
+ 0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
+ 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
+ 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
+ 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
+ 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
+ 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
+ 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
+ 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
+ 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
+ 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
+ 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
+ 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
+ // Anything that isn't exactly one of the bits above is invalid.
+ _ => None,
+ }
+ }
+
+ /// Returns a convenient single codepoint representation of this
+ /// look-around assertion. Each assertion is guaranteed to be represented
+ /// by a distinct character.
+ ///
+ /// This is useful for succinctly representing a look-around assertion in
+ /// human friendly but succinct output intended for a programmer working on
+ /// regex internals.
+ #[inline]
+ pub const fn as_char(self) -> char {
+ match self {
+ Look::Start => 'A',
+ Look::End => 'z',
+ Look::StartLF => '^',
+ Look::EndLF => '$',
+ Look::StartCRLF => 'r',
+ Look::EndCRLF => 'R',
+ Look::WordAscii => 'b',
+ Look::WordAsciiNegate => 'B',
+ Look::WordUnicode => '𝛃',
+ Look::WordUnicodeNegate => '𝚩',
+ Look::WordStartAscii => '<',
+ Look::WordEndAscii => '>',
+ Look::WordStartUnicode => '〈',
+ Look::WordEndUnicode => '〉',
+ Look::WordStartHalfAscii => '◁',
+ Look::WordEndHalfAscii => '▷',
+ Look::WordStartHalfUnicode => '◀',
+ Look::WordEndHalfUnicode => '▶',
+ }
+ }
+}
+
+/// The high-level intermediate representation for a capturing group.
+///
+/// A capturing group always has an index and a child expression. It may
+/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
+/// necessary.
+///
+/// Note that there is no explicit representation of a non-capturing group
+/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
+/// the recursive structure of the `Hir` itself.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Capture {
+ /// The capture index of the capture.
+ pub index: u32,
+ /// The name of the capture (e.g., `foo` in `(?P<foo>\w)`), if it exists.
+ pub name: Option<Box<str>>,
+ /// The expression inside the capturing group, which may be empty.
+ pub sub: Box<Hir>,
+}
+
+/// The high-level intermediate representation of a repetition operator.
+///
+/// A repetition operator permits the repetition of an arbitrary
+/// sub-expression.
+///
+/// See [`Repetition::with`] for building a copy of a repetition with a
+/// different sub-expression.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Repetition {
+ /// The minimum range of the repetition.
+ ///
+ /// Note that special cases like `?`, `+` and `*` all get translated into
+ /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
+ ///
+ /// When `min` is zero, this expression can match the empty string
+ /// regardless of what its sub-expression is.
+ pub min: u32,
+ /// The maximum range of the repetition.
+ ///
+ /// Note that when `max` is `None`, `min` acts as a lower bound but where
+ /// there is no upper bound. For something like `x{5}` where the min and
+ /// max are equivalent, `min` will be set to `5` and `max` will be set to
+ /// `Some(5)`.
+ pub max: Option<u32>,
+ /// Whether this repetition operator is greedy or not. A greedy operator
+ /// will match as much as it can. A non-greedy operator will match as
+ /// little as it can.
+ ///
+ /// Typically, operators are greedy by default and are only non-greedy when
+ /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
+ /// not. However, this can be inverted via the `U` "ungreedy" flag.
+ pub greedy: bool,
+ /// The expression being repeated.
+ pub sub: Box<Hir>,
+}
+
+impl Repetition {
+    /// Returns a new repetition with the same `min`, `max` and `greedy`
+    /// values, but with its sub-expression replaced with the one given.
+    pub fn with(&self, sub: Hir) -> Repetition {
+        // `min`, `max` and `greedy` are all `Copy`, so a partial
+        // destructure (ignoring the boxed sub-expression) copies them out.
+        let Repetition { min, max, greedy, .. } = *self;
+        Repetition { min, max, greedy, sub: Box::new(sub) }
+    }
+}
+
+/// A type describing the different flavors of `.`.
+///
+/// This type is meant to be used with [`Hir::dot`], which is a convenience
+/// routine for building HIR values derived from the `.` regex.
+#[non_exhaustive]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum Dot {
+    /// Matches the UTF-8 encoding of any Unicode scalar value.
+    ///
+    /// This is equivalent to `(?su:.)` and also `\p{any}`.
+    AnyChar,
+    /// Matches any byte value.
+    ///
+    /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
+    AnyByte,
+    /// Matches the UTF-8 encoding of any Unicode scalar value except for the
+    /// `char` given.
+    ///
+    /// This is equivalent to using `(?u-s:.)` with the line terminator set
+    /// to a particular ASCII byte. (Because of peculiarities in the regex
+    /// engines, a line terminator must be a single byte. It follows that when
+    /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
+    /// value. That is, it must be ASCII.)
+    ///
+    /// (This and `AnyCharExceptLF` both exist because of legacy reasons.
+    /// `AnyCharExceptLF` will be dropped in the next breaking change release.)
+    AnyCharExcept(char),
+    /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
+    ///
+    /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
+    AnyCharExceptLF,
+    /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r`
+    /// and `\n`.
+    ///
+    /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
+    AnyCharExceptCRLF,
+    /// Matches any byte value except for the `u8` given.
+    ///
+    /// This is equivalent to using `(?-us:.)` with the line terminator set
+    /// to a particular ASCII byte. (Because of peculiarities in the regex
+    /// engines, a line terminator must be a single byte. It follows that when
+    /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
+    /// value. That is, it must be ASCII.)
+    ///
+    /// (This and `AnyByteExceptLF` both exist because of legacy reasons.
+    /// `AnyByteExceptLF` will be dropped in the next breaking change release.)
+    AnyByteExcept(u8),
+    /// Matches any byte value except for `\n`.
+    ///
+    /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
+    AnyByteExceptLF,
+    /// Matches any byte value except for `\r` and `\n`.
+    ///
+    /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`.
+    AnyByteExceptCRLF,
+}
+
+/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
+/// space but heap space proportional to the depth of the total `Hir`.
+impl Drop for Hir {
+    fn drop(&mut self) {
+        use core::mem;
+
+        // Fast path: if this value has no child expressions (or its single
+        // child has none), the default recursive drop cannot recurse deeply,
+        // so let it run without allocating a work stack.
+        match *self.kind() {
+            HirKind::Empty
+            | HirKind::Literal(_)
+            | HirKind::Class(_)
+            | HirKind::Look(_) => return,
+            HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
+            HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
+                return
+            }
+            HirKind::Concat(ref x) if x.is_empty() => return,
+            HirKind::Alternation(ref x) if x.is_empty() => return,
+            _ => {}
+        }
+
+        // Slow path: convert the implicit recursion of a naive drop into an
+        // explicit heap-allocated stack. Each expression is replaced with
+        // `Hir::empty()` before being pushed, so when it is later dropped
+        // (popped and falling out of scope), its children have already been
+        // detached and no recursion occurs.
+        let mut stack = vec![mem::replace(self, Hir::empty())];
+        while let Some(mut expr) = stack.pop() {
+            match expr.kind {
+                HirKind::Empty
+                | HirKind::Literal(_)
+                | HirKind::Class(_)
+                | HirKind::Look(_) => {}
+                HirKind::Capture(ref mut x) => {
+                    stack.push(mem::replace(&mut x.sub, Hir::empty()));
+                }
+                HirKind::Repetition(ref mut x) => {
+                    stack.push(mem::replace(&mut x.sub, Hir::empty()));
+                }
+                HirKind::Concat(ref mut x) => {
+                    stack.extend(x.drain(..));
+                }
+                HirKind::Alternation(ref mut x) => {
+                    stack.extend(x.drain(..));
+                }
+            }
+        }
+    }
+}
+
+/// A type that collects various properties of an HIR value.
+///
+/// Properties are always scalar values and represent meta data that is
+/// computed inductively on an HIR value. Properties are defined for all
+/// HIR values.
+///
+/// All methods on a `Properties` value take constant time and are meant to
+/// be cheap to call.
+///
+/// The actual data lives behind a `Box` (see `PropertiesI`) so that a
+/// `Properties` value itself is pointer-sized.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Properties(Box<PropertiesI>);
+
+/// The property definition. It is split out so that we can box it, and
+/// thereby make `Properties` use less stack size. This is kind-of important
+/// because every HIR value has a `Properties` attached to it.
+///
+/// This does have the unfortunate consequence that creating any HIR value
+/// always leads to at least one alloc for properties, but this is generally
+/// true anyway (for pretty much all HirKinds except for look-arounds).
+#[derive(Clone, Debug, Eq, PartialEq)]
+struct PropertiesI {
+    // Byte length of the shortest matching string, `None` if nothing matches.
+    minimum_len: Option<usize>,
+    // Byte length of the longest matching string, `None` if unbounded or
+    // nothing matches.
+    maximum_len: Option<usize>,
+    // All look-around assertions appearing anywhere in the HIR.
+    look_set: LookSet,
+    // Assertions that must hold before any byte is matched.
+    look_set_prefix: LookSet,
+    // Assertions that must hold after all bytes are matched.
+    look_set_suffix: LookSet,
+    // Assertions that may (but need not) appear as a prefix.
+    look_set_prefix_any: LookSet,
+    // Assertions that may (but need not) appear as a suffix.
+    look_set_suffix_any: LookSet,
+    // Whether every match is guaranteed to be valid UTF-8.
+    utf8: bool,
+    // Total number of explicit capture groups.
+    explicit_captures_len: usize,
+    // Number of capture groups present in every match, if invariant.
+    static_explicit_captures_len: Option<usize>,
+    // Whether this HIR is a literal or concatenation of literals.
+    literal: bool,
+    // Whether this HIR is a literal, or an alternation/concatenation of
+    // literals.
+    alternation_literal: bool,
+}
+
+impl Properties {
+    /// Returns the length (in bytes) of the smallest string matched by this
+    /// HIR.
+    ///
+    /// A return value of `0` is possible and occurs when the HIR can match an
+    /// empty string.
+    ///
+    /// `None` is returned when there is no minimum length. This occurs in
+    /// precisely the cases where the HIR matches nothing. i.e., The language
+    /// the regex matches is empty. An example of such a regex is `\P{any}`.
+    #[inline]
+    pub fn minimum_len(&self) -> Option<usize> {
+        self.0.minimum_len
+    }
+
+    /// Returns the length (in bytes) of the longest string matched by this
+    /// HIR.
+    ///
+    /// A return value of `0` is possible and occurs when nothing longer than
+    /// the empty string is in the language described by this HIR.
+    ///
+    /// `None` is returned when there is no longest matching string. This
+    /// occurs when the HIR matches nothing or when there is no upper bound on
+    /// the length of matching strings. Example of such regexes are `\P{any}`
+    /// (matches nothing) and `a+` (has no upper bound).
+    #[inline]
+    pub fn maximum_len(&self) -> Option<usize> {
+        self.0.maximum_len
+    }
+
+    /// Returns a set of all look-around assertions that appear at least once
+    /// in this HIR value.
+    #[inline]
+    pub fn look_set(&self) -> LookSet {
+        self.0.look_set
+    }
+
+    /// Returns a set of all look-around assertions that appear as a prefix for
+    /// this HIR value. That is, the set returned corresponds to the set of
+    /// assertions that must be passed before matching any bytes in a haystack.
+    ///
+    /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true
+    /// if and only if the HIR is fully anchored at the start.
+    #[inline]
+    pub fn look_set_prefix(&self) -> LookSet {
+        self.0.look_set_prefix
+    }
+
+    /// Returns a set of all look-around assertions that appear as a _possible_
+    /// prefix for this HIR value. That is, the set returned corresponds to the
+    /// set of assertions that _may_ be passed before matching any bytes in a
+    /// haystack.
+    ///
+    /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
+    /// true if and only if it's possible for the regex to match through an
+    /// anchored assertion before consuming any input.
+    #[inline]
+    pub fn look_set_prefix_any(&self) -> LookSet {
+        self.0.look_set_prefix_any
+    }
+
+    /// Returns a set of all look-around assertions that appear as a suffix for
+    /// this HIR value. That is, the set returned corresponds to the set of
+    /// assertions that must be passed in order to be considered a match after
+    /// all other consuming HIR expressions.
+    ///
+    /// For example, `hir.look_set_suffix().contains(Look::End)` returns true
+    /// if and only if the HIR is fully anchored at the end.
+    #[inline]
+    pub fn look_set_suffix(&self) -> LookSet {
+        self.0.look_set_suffix
+    }
+
+    /// Returns a set of all look-around assertions that appear as a _possible_
+    /// suffix for this HIR value. That is, the set returned corresponds to the
+    /// set of assertions that _may_ be passed before matching any bytes in a
+    /// haystack.
+    ///
+    /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
+    /// true if and only if it's possible for the regex to match through an
+    /// anchored assertion at the end of a match without consuming any input.
+    #[inline]
+    pub fn look_set_suffix_any(&self) -> LookSet {
+        self.0.look_set_suffix_any
+    }
+
+    /// Return true if and only if the corresponding HIR will always match
+    /// valid UTF-8.
+    ///
+    /// When this returns false, then it is possible for this HIR expression to
+    /// match invalid UTF-8, including by matching between the code units of
+    /// a single UTF-8 encoded codepoint.
+    ///
+    /// Note that this returns true even when the corresponding HIR can match
+    /// the empty string. Since an empty string can technically appear between
+    /// UTF-8 code units, it is possible for a match to be reported that splits
+    /// a codepoint which could in turn be considered matching invalid UTF-8.
+    /// However, it is generally assumed that such empty matches are handled
+    /// specially by the search routine if it is absolutely required that
+    /// matches not split a codepoint.
+    ///
+    /// # Example
+    ///
+    /// This code example shows the UTF-8 property of a variety of patterns.
+    ///
+    /// ```
+    /// use regex_syntax::{ParserBuilder, parse};
+    ///
+    /// // Examples of 'is_utf8() == true'.
+    /// assert!(parse(r"a")?.properties().is_utf8());
+    /// assert!(parse(r"[^a]")?.properties().is_utf8());
+    /// assert!(parse(r".")?.properties().is_utf8());
+    /// assert!(parse(r"\W")?.properties().is_utf8());
+    /// assert!(parse(r"\b")?.properties().is_utf8());
+    /// assert!(parse(r"\B")?.properties().is_utf8());
+    /// assert!(parse(r"(?-u)\b")?.properties().is_utf8());
+    /// assert!(parse(r"(?-u)\B")?.properties().is_utf8());
+    /// // Unicode mode is enabled by default, and in
+    /// // that mode, all \x hex escapes are treated as
+    /// // codepoints. So this actually matches the UTF-8
+    /// // encoding of U+00FF.
+    /// assert!(parse(r"\xFF")?.properties().is_utf8());
+    ///
+    /// // Now we show examples of 'is_utf8() == false'.
+    /// // The only way to do this is to force the parser
+    /// // to permit invalid UTF-8, otherwise all of these
+    /// // would fail to parse!
+    /// let parse = |pattern| {
+    ///     ParserBuilder::new().utf8(false).build().parse(pattern)
+    /// };
+    /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8());
+    /// assert!(!parse(r"(?-u).")?.properties().is_utf8());
+    /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8());
+    /// // Conversely to the equivalent example above,
+    /// // when Unicode mode is disabled, \x hex escapes
+    /// // are treated as their raw byte values.
+    /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8());
+    /// // Note that just because we disabled UTF-8 in the
+    /// // parser doesn't mean we still can't use Unicode.
+    /// // It is enabled by default, so \xFF is still
+    /// // equivalent to matching the UTF-8 encoding of
+    /// // U+00FF by default.
+    /// assert!(parse(r"\xFF")?.properties().is_utf8());
+    /// // Even though we use raw bytes that individually
+    /// // are not valid UTF-8, when combined together, the
+    /// // overall expression *does* match valid UTF-8!
+    /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn is_utf8(&self) -> bool {
+        self.0.utf8
+    }
+
+    /// Returns the total number of explicit capturing groups in the
+    /// corresponding HIR.
+    ///
+    /// Note that this does not include the implicit capturing group
+    /// corresponding to the entire match that is typically included by regex
+    /// engines.
+    ///
+    /// # Example
+    ///
+    /// This method will return `0` for `a` and `1` for `(a)`:
+    ///
+    /// ```
+    /// use regex_syntax::parse;
+    ///
+    /// assert_eq!(0, parse("a")?.properties().explicit_captures_len());
+    /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn explicit_captures_len(&self) -> usize {
+        self.0.explicit_captures_len
+    }
+
+    /// Returns the total number of explicit capturing groups that appear in
+    /// every possible match.
+    ///
+    /// If the number of capture groups can vary depending on the match, then
+    /// this returns `None`. That is, a value is only returned when the number
+    /// of matching groups is invariant or "static."
+    ///
+    /// Note that this does not include the implicit capturing group
+    /// corresponding to the entire match.
+    ///
+    /// # Example
+    ///
+    /// This shows a few cases where a static number of capture groups is
+    /// available and a few cases where it is not.
+    ///
+    /// ```
+    /// use regex_syntax::parse;
+    ///
+    /// let len = |pattern| {
+    ///     parse(pattern).map(|h| {
+    ///         h.properties().static_explicit_captures_len()
+    ///     })
+    /// };
+    ///
+    /// assert_eq!(Some(0), len("a")?);
+    /// assert_eq!(Some(1), len("(a)")?);
+    /// assert_eq!(Some(1), len("(a)|(b)")?);
+    /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?);
+    /// assert_eq!(None, len("(a)|b")?);
+    /// assert_eq!(None, len("a|(b)")?);
+    /// assert_eq!(None, len("(b)*")?);
+    /// assert_eq!(Some(1), len("(b)+")?);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn static_explicit_captures_len(&self) -> Option<usize> {
+        self.0.static_explicit_captures_len
+    }
+
+    /// Return true if and only if this HIR is a simple literal. This is
+    /// only true when this HIR expression is either itself a `Literal` or a
+    /// concatenation of only `Literal`s.
+    ///
+    /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and
+    /// the empty string are not (even though they contain sub-expressions that
+    /// are literals).
+    #[inline]
+    pub fn is_literal(&self) -> bool {
+        self.0.literal
+    }
+
+    /// Return true if and only if this HIR is either a simple literal or an
+    /// alternation of simple literals. This is only
+    /// true when this HIR expression is either itself a `Literal` or a
+    /// concatenation of only `Literal`s or an alternation of only `Literal`s.
+    ///
+    /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
+    /// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not
+    /// (even though that contain sub-expressions that are literals).
+    #[inline]
+    pub fn is_alternation_literal(&self) -> bool {
+        self.0.alternation_literal
+    }
+
+    /// Returns the total amount of heap memory usage, in bytes, used by this
+    /// `Properties` value.
+    #[inline]
+    pub fn memory_usage(&self) -> usize {
+        core::mem::size_of::<PropertiesI>()
+    }
+
+    /// Returns a new set of properties that corresponds to the union of the
+    /// iterator of properties given.
+    ///
+    /// This is useful when one has multiple `Hir` expressions and wants
+    /// to combine them into a single alternation without constructing the
+    /// corresponding `Hir`. This routine provides a way of combining the
+    /// properties of each `Hir` expression into one set of properties
+    /// representing the union of those expressions.
+    ///
+    /// # Example: union with HIRs that never match
+    ///
+    /// This example shows that unioning properties together with one that
+    /// represents a regex that never matches will "poison" certain attributes,
+    /// like the minimum and maximum lengths.
+    ///
+    /// ```
+    /// use regex_syntax::{hir::Properties, parse};
+    ///
+    /// let hir1 = parse("ab?c?")?;
+    /// assert_eq!(Some(1), hir1.properties().minimum_len());
+    /// assert_eq!(Some(3), hir1.properties().maximum_len());
+    ///
+    /// let hir2 = parse(r"[a&&b]")?;
+    /// assert_eq!(None, hir2.properties().minimum_len());
+    /// assert_eq!(None, hir2.properties().maximum_len());
+    ///
+    /// let hir3 = parse(r"wxy?z?")?;
+    /// assert_eq!(Some(2), hir3.properties().minimum_len());
+    /// assert_eq!(Some(4), hir3.properties().maximum_len());
+    ///
+    /// let unioned = Properties::union([
+    ///     hir1.properties(),
+    ///     hir2.properties(),
+    ///     hir3.properties(),
+    /// ]);
+    /// assert_eq!(None, unioned.minimum_len());
+    /// assert_eq!(None, unioned.maximum_len());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// The maximum length can also be "poisoned" by a pattern that has no
+    /// upper bound on the length of a match. The minimum length remains
+    /// unaffected:
+    ///
+    /// ```
+    /// use regex_syntax::{hir::Properties, parse};
+    ///
+    /// let hir1 = parse("ab?c?")?;
+    /// assert_eq!(Some(1), hir1.properties().minimum_len());
+    /// assert_eq!(Some(3), hir1.properties().maximum_len());
+    ///
+    /// let hir2 = parse(r"a+")?;
+    /// assert_eq!(Some(1), hir2.properties().minimum_len());
+    /// assert_eq!(None, hir2.properties().maximum_len());
+    ///
+    /// let hir3 = parse(r"wxy?z?")?;
+    /// assert_eq!(Some(2), hir3.properties().minimum_len());
+    /// assert_eq!(Some(4), hir3.properties().maximum_len());
+    ///
+    /// let unioned = Properties::union([
+    ///     hir1.properties(),
+    ///     hir2.properties(),
+    ///     hir3.properties(),
+    /// ]);
+    /// assert_eq!(Some(1), unioned.minimum_len());
+    /// assert_eq!(None, unioned.maximum_len());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn union<I, P>(props: I) -> Properties
+    where
+        I: IntoIterator<Item = P>,
+        P: core::borrow::Borrow<Properties>,
+    {
+        let mut it = props.into_iter().peekable();
+        // While empty alternations aren't possible, we still behave as if they
+        // are. When we have an empty alternate, then clearly the look-around
+        // prefix and suffix is empty. Otherwise, it is the intersection of all
+        // prefixes and suffixes (respectively) of the branches.
+        let fix = if it.peek().is_none() {
+            LookSet::empty()
+        } else {
+            LookSet::full()
+        };
+        // And also, an empty alternate means we have 0 static capture groups,
+        // but we otherwise start with the number corresponding to the first
+        // alternate. If any subsequent alternate has a different number of
+        // static capture groups, then we overall have a variation and not a
+        // static number of groups.
+        let static_explicit_captures_len =
+            it.peek().and_then(|p| p.borrow().static_explicit_captures_len());
+        // The base case is an empty alternation, which matches nothing.
+        // Note though that empty alternations aren't possible, because the
+        // Hir::alternation smart constructor rewrites those as empty character
+        // classes.
+        let mut props = PropertiesI {
+            minimum_len: None,
+            maximum_len: None,
+            look_set: LookSet::empty(),
+            look_set_prefix: fix,
+            look_set_suffix: fix,
+            look_set_prefix_any: LookSet::empty(),
+            look_set_suffix_any: LookSet::empty(),
+            utf8: true,
+            explicit_captures_len: 0,
+            static_explicit_captures_len,
+            literal: false,
+            alternation_literal: true,
+        };
+        // Tracks whether the minimum/maximum length has been "poisoned" by a
+        // branch that has no such bound. Once poisoned, the corresponding
+        // length stays `None` for the rest of the iteration.
+        let (mut min_poisoned, mut max_poisoned) = (false, false);
+        // Handle properties that need to visit every child hir.
+        for prop in it {
+            let p = prop.borrow();
+            props.look_set.set_union(p.look_set());
+            props.look_set_prefix.set_intersect(p.look_set_prefix());
+            props.look_set_suffix.set_intersect(p.look_set_suffix());
+            props.look_set_prefix_any.set_union(p.look_set_prefix_any());
+            props.look_set_suffix_any.set_union(p.look_set_suffix_any());
+            props.utf8 = props.utf8 && p.is_utf8();
+            props.explicit_captures_len = props
+                .explicit_captures_len
+                .saturating_add(p.explicit_captures_len());
+            if props.static_explicit_captures_len
+                != p.static_explicit_captures_len()
+            {
+                props.static_explicit_captures_len = None;
+            }
+            props.alternation_literal =
+                props.alternation_literal && p.is_literal();
+            if !min_poisoned {
+                if let Some(xmin) = p.minimum_len() {
+                    if props.minimum_len.map_or(true, |pmin| xmin < pmin) {
+                        props.minimum_len = Some(xmin);
+                    }
+                } else {
+                    props.minimum_len = None;
+                    min_poisoned = true;
+                }
+            }
+            if !max_poisoned {
+                if let Some(xmax) = p.maximum_len() {
+                    if props.maximum_len.map_or(true, |pmax| xmax > pmax) {
+                        props.maximum_len = Some(xmax);
+                    }
+                } else {
+                    props.maximum_len = None;
+                    max_poisoned = true;
+                }
+            }
+        }
+        Properties(Box::new(props))
+    }
+}
+
+impl Properties {
+    /// Create a new set of HIR properties for an empty regex.
+    fn empty() -> Properties {
+        let inner = PropertiesI {
+            minimum_len: Some(0),
+            maximum_len: Some(0),
+            look_set: LookSet::empty(),
+            look_set_prefix: LookSet::empty(),
+            look_set_suffix: LookSet::empty(),
+            look_set_prefix_any: LookSet::empty(),
+            look_set_suffix_any: LookSet::empty(),
+            // It is debatable whether an empty regex always matches at valid
+            // UTF-8 boundaries. Strictly speaking, at a byte oriented view,
+            // it is clearly false. There are, for example, many empty strings
+            // between the bytes encoding a '☃'.
+            //
+            // However, when Unicode mode is enabled, the fundamental atom
+            // of matching is really a codepoint. And in that scenario, an
+            // empty regex is defined to only match at valid UTF-8 boundaries
+            // and to never split a codepoint. It just so happens that this
+            // enforcement is somewhat tricky to do for regexes that match
+            // the empty string inside regex engines themselves. It usually
+            // requires some layer above the regex engine to filter out such
+            // matches.
+            //
+            // In any case, 'true' is really the only coherent option. If it
+            // were false, for example, then 'a*' would also need to be false
+            // since it too can match the empty string.
+            utf8: true,
+            explicit_captures_len: 0,
+            static_explicit_captures_len: Some(0),
+            literal: false,
+            alternation_literal: false,
+        };
+        Properties(Box::new(inner))
+    }
+
+    /// Create a new set of HIR properties for a literal regex.
+    fn literal(lit: &Literal) -> Properties {
+        let inner = PropertiesI {
+            minimum_len: Some(lit.0.len()),
+            maximum_len: Some(lit.0.len()),
+            look_set: LookSet::empty(),
+            look_set_prefix: LookSet::empty(),
+            look_set_suffix: LookSet::empty(),
+            look_set_prefix_any: LookSet::empty(),
+            look_set_suffix_any: LookSet::empty(),
+            utf8: core::str::from_utf8(&lit.0).is_ok(),
+            explicit_captures_len: 0,
+            static_explicit_captures_len: Some(0),
+            literal: true,
+            alternation_literal: true,
+        };
+        Properties(Box::new(inner))
+    }
+
+    /// Create a new set of HIR properties for a character class.
+    fn class(class: &Class) -> Properties {
+        let inner = PropertiesI {
+            minimum_len: class.minimum_len(),
+            maximum_len: class.maximum_len(),
+            look_set: LookSet::empty(),
+            look_set_prefix: LookSet::empty(),
+            look_set_suffix: LookSet::empty(),
+            look_set_prefix_any: LookSet::empty(),
+            look_set_suffix_any: LookSet::empty(),
+            utf8: class.is_utf8(),
+            explicit_captures_len: 0,
+            static_explicit_captures_len: Some(0),
+            literal: false,
+            alternation_literal: false,
+        };
+        Properties(Box::new(inner))
+    }
+
+    /// Create a new set of HIR properties for a look-around assertion.
+    fn look(look: Look) -> Properties {
+        let inner = PropertiesI {
+            minimum_len: Some(0),
+            maximum_len: Some(0),
+            look_set: LookSet::singleton(look),
+            look_set_prefix: LookSet::singleton(look),
+            look_set_suffix: LookSet::singleton(look),
+            look_set_prefix_any: LookSet::singleton(look),
+            look_set_suffix_any: LookSet::singleton(look),
+            // This requires a little explanation. Basically, we don't consider
+            // matching an empty string to be equivalent to matching invalid
+            // UTF-8, even though technically matching every empty string will
+            // split the UTF-8 encoding of a single codepoint when treating a
+            // UTF-8 encoded string as a sequence of bytes. Our defense here is
+            // that in such a case, a codepoint should logically be treated as
+            // the fundamental atom for matching, and thus the only valid match
+            // points are between codepoints and not bytes.
+            //
+            // More practically, this is true here because it's also true
+            // for 'Hir::empty()', otherwise something like 'a*' would be
+            // considered to match invalid UTF-8. That in turn makes this
+            // property borderline useless.
+            utf8: true,
+            explicit_captures_len: 0,
+            static_explicit_captures_len: Some(0),
+            literal: false,
+            alternation_literal: false,
+        };
+        Properties(Box::new(inner))
+    }
+
+    /// Create a new set of HIR properties for a repetition.
+    fn repetition(rep: &Repetition) -> Properties {
+        let p = rep.sub.properties();
+        // The minimum is a lower bound, so saturating multiplication is fine.
+        let minimum_len = p.minimum_len().map(|child_min| {
+            let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX);
+            child_min.saturating_mul(rep_min)
+        });
+        // The maximum must be exact, so an overflowing multiplication means
+        // "no upper bound" (None).
+        let maximum_len = rep.max.and_then(|rep_max| {
+            let rep_max = usize::try_from(rep_max).ok()?;
+            let child_max = p.maximum_len()?;
+            child_max.checked_mul(rep_max)
+        });
+
+        let mut inner = PropertiesI {
+            minimum_len,
+            maximum_len,
+            look_set: p.look_set(),
+            look_set_prefix: LookSet::empty(),
+            look_set_suffix: LookSet::empty(),
+            look_set_prefix_any: p.look_set_prefix_any(),
+            look_set_suffix_any: p.look_set_suffix_any(),
+            utf8: p.is_utf8(),
+            explicit_captures_len: p.explicit_captures_len(),
+            static_explicit_captures_len: p.static_explicit_captures_len(),
+            literal: false,
+            alternation_literal: false,
+        };
+        // If the repetition operator can match the empty string, then its
+        // lookset prefix and suffixes themselves remain empty since they are
+        // no longer required to match.
+        if rep.min > 0 {
+            inner.look_set_prefix = p.look_set_prefix();
+            inner.look_set_suffix = p.look_set_suffix();
+        }
+        // If the static captures len of the sub-expression is not known or
+        // is greater than zero, then it automatically propagates to the
+        // repetition, regardless of the repetition. Otherwise, it might
+        // change, but only when the repetition can match 0 times.
+        if rep.min == 0
+            && inner.static_explicit_captures_len.map_or(false, |len| len > 0)
+        {
+            // If we require a match 0 times, then our captures len is
+            // guaranteed to be zero. Otherwise, if we *can* match the empty
+            // string, then it's impossible to know how many captures will be
+            // in the resulting match.
+            if rep.max == Some(0) {
+                inner.static_explicit_captures_len = Some(0);
+            } else {
+                inner.static_explicit_captures_len = None;
+            }
+        }
+        Properties(Box::new(inner))
+    }
+
+    /// Create a new set of HIR properties for a capture.
+    fn capture(capture: &Capture) -> Properties {
+        let p = capture.sub.properties();
+        Properties(Box::new(PropertiesI {
+            explicit_captures_len: p.explicit_captures_len().saturating_add(1),
+            static_explicit_captures_len: p
+                .static_explicit_captures_len()
+                .map(|len| len.saturating_add(1)),
+            literal: false,
+            alternation_literal: false,
+            ..*p.0.clone()
+        }))
+    }
+
+    /// Create a new set of HIR properties for a concatenation.
+    fn concat(concat: &[Hir]) -> Properties {
+        // The base case is an empty concatenation, which matches the empty
+        // string. Note though that empty concatenations aren't possible,
+        // because the Hir::concat smart constructor rewrites those as
+        // Hir::empty.
+        let mut props = PropertiesI {
+            minimum_len: Some(0),
+            maximum_len: Some(0),
+            look_set: LookSet::empty(),
+            look_set_prefix: LookSet::empty(),
+            look_set_suffix: LookSet::empty(),
+            look_set_prefix_any: LookSet::empty(),
+            look_set_suffix_any: LookSet::empty(),
+            utf8: true,
+            explicit_captures_len: 0,
+            static_explicit_captures_len: Some(0),
+            literal: true,
+            alternation_literal: true,
+        };
+        // Handle properties that need to visit every child hir.
+        for x in concat.iter() {
+            let p = x.properties();
+            props.look_set.set_union(p.look_set());
+            props.utf8 = props.utf8 && p.is_utf8();
+            props.explicit_captures_len = props
+                .explicit_captures_len
+                .saturating_add(p.explicit_captures_len());
+            props.static_explicit_captures_len = p
+                .static_explicit_captures_len()
+                .and_then(|len1| {
+                    Some((len1, props.static_explicit_captures_len?))
+                })
+                .and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
+            props.literal = props.literal && p.is_literal();
+            props.alternation_literal =
+                props.alternation_literal && p.is_alternation_literal();
+            if let Some(minimum_len) = props.minimum_len {
+                match p.minimum_len() {
+                    None => props.minimum_len = None,
+                    Some(len) => {
+                        // We use saturating arithmetic here because the
+                        // minimum is just a lower bound. We can't go any
+                        // higher than what our number types permit.
+                        props.minimum_len =
+                            Some(minimum_len.saturating_add(len));
+                    }
+                }
+            }
+            if let Some(maximum_len) = props.maximum_len {
+                match p.maximum_len() {
+                    None => props.maximum_len = None,
+                    Some(len) => {
+                        props.maximum_len = maximum_len.checked_add(len)
+                    }
+                }
+            }
+        }
+        // Handle the prefix properties, which only requires visiting
+        // child exprs until one matches more than the empty string.
+        // (A plain `for` loop is the idiomatic form of the previous
+        // `while let Some(x) = it.next()` construction.)
+        for x in concat.iter() {
+            props.look_set_prefix.set_union(x.properties().look_set_prefix());
+            props
+                .look_set_prefix_any
+                .set_union(x.properties().look_set_prefix_any());
+            if x.properties().maximum_len().map_or(true, |x| x > 0) {
+                break;
+            }
+        }
+        // Same thing for the suffix properties, but in reverse.
+        for x in concat.iter().rev() {
+            props.look_set_suffix.set_union(x.properties().look_set_suffix());
+            props
+                .look_set_suffix_any
+                .set_union(x.properties().look_set_suffix_any());
+            if x.properties().maximum_len().map_or(true, |x| x > 0) {
+                break;
+            }
+        }
+        Properties(Box::new(props))
+    }
+
+    /// Create a new set of HIR properties for an alternation.
+    fn alternation(alts: &[Hir]) -> Properties {
+        Properties::union(alts.iter().map(|hir| hir.properties()))
+    }
+}
+
+/// A set of look-around assertions.
+///
+/// This is useful for efficiently tracking look-around assertions. For
+/// example, an [`Hir`] provides properties that return `LookSet`s.
+#[derive(Clone, Copy, Default, Eq, PartialEq)]
+pub struct LookSet {
+    /// The underlying representation this set is exposed to make it possible
+    /// to store it somewhere efficiently. The representation is that
+    /// of a bitset, where each assertion occupies bit `i` where `i =
+    /// Look::as_repr()`.
+    ///
+    /// Note that users of this internal representation must permit the full
+    /// range of `u32` values to be represented. For example, even if the
+    /// current implementation does not make use of all 32 bits, it may use
+    /// more bits in a future semver compatible release.
+    pub bits: u32,
+}
+
+impl LookSet {
+ /// Create an empty set of look-around assertions.
+ #[inline]
+ pub fn empty() -> LookSet {
+ LookSet { bits: 0 }
+ }
+
+ /// Create a full set of look-around assertions.
+ ///
+ /// This set contains all possible look-around assertions.
+ #[inline]
+ pub fn full() -> LookSet {
+ LookSet { bits: !0 }
+ }
+
+ /// Create a look-around set containing the look-around assertion given.
+ ///
+ /// This is a convenience routine for creating an empty set and inserting
+ /// one look-around assertions.
+ #[inline]
+ pub fn singleton(look: Look) -> LookSet {
+ LookSet::empty().insert(look)
+ }
+
+ /// Returns the total number of look-around assertions in this set.
+ #[inline]
+ pub fn len(self) -> usize {
+ // OK because max value always fits in a u8, which in turn always
+ // fits in a usize, regardless of target.
+ usize::try_from(self.bits.count_ones()).unwrap()
+ }
+
+ /// Returns true if and only if this set is empty.
+ #[inline]
+ pub fn is_empty(self) -> bool {
+ self.len() == 0
+ }
+
+ /// Returns true if and only if the given look-around assertion is in this
+ /// set.
+ #[inline]
+ pub fn contains(self, look: Look) -> bool {
+ self.bits & look.as_repr() != 0
+ }
+
+ /// Returns true if and only if this set contains any anchor assertions.
+ /// This includes both "start/end of haystack" and "start/end of line."
+ #[inline]
+ pub fn contains_anchor(&self) -> bool {
+ self.contains_anchor_haystack() || self.contains_anchor_line()
+ }
+
+ /// Returns true if and only if this set contains any "start/end of
+ /// haystack" anchors. This doesn't include "start/end of line" anchors.
+ #[inline]
+ pub fn contains_anchor_haystack(&self) -> bool {
+ self.contains(Look::Start) || self.contains(Look::End)
+ }
+
+ /// Returns true if and only if this set contains any "start/end of line"
+ /// anchors. This doesn't include "start/end of haystack" anchors. This
+ /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
+ #[inline]
+ pub fn contains_anchor_line(&self) -> bool {
+ self.contains(Look::StartLF)
+ || self.contains(Look::EndLF)
+ || self.contains(Look::StartCRLF)
+ || self.contains(Look::EndCRLF)
+ }
+
+ /// Returns true if and only if this set contains any "start/end of line"
+ /// anchors that only treat `\n` as line terminators. This does not include
+ /// haystack anchors or CRLF aware line anchors.
+ #[inline]
+ pub fn contains_anchor_lf(&self) -> bool {
+ self.contains(Look::StartLF) || self.contains(Look::EndLF)
+ }
+
+ /// Returns true if and only if this set contains any "start/end of line"
+ /// anchors that are CRLF-aware. This doesn't include "start/end of
+ /// haystack" or "start/end of line-feed" anchors.
+ #[inline]
+ pub fn contains_anchor_crlf(&self) -> bool {
+ self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
+ }
+
+ /// Returns true if and only if this set contains any word boundary or
+ /// negated word boundary assertions. This include both Unicode and ASCII
+ /// word boundaries.
+ #[inline]
+ pub fn contains_word(self) -> bool {
+ self.contains_word_unicode() || self.contains_word_ascii()
+ }
+
+ /// Returns true if and only if this set contains any Unicode word boundary
+ /// or negated Unicode word boundary assertions.
+ #[inline]
+ pub fn contains_word_unicode(self) -> bool {
+ self.contains(Look::WordUnicode)
+ || self.contains(Look::WordUnicodeNegate)
+ || self.contains(Look::WordStartUnicode)
+ || self.contains(Look::WordEndUnicode)
+ || self.contains(Look::WordStartHalfUnicode)
+ || self.contains(Look::WordEndHalfUnicode)
+ }
+
+ /// Returns true if and only if this set contains any ASCII word boundary
+ /// or negated ASCII word boundary assertions.
+ #[inline]
+ pub fn contains_word_ascii(self) -> bool {
+ self.contains(Look::WordAscii)
+ || self.contains(Look::WordAsciiNegate)
+ || self.contains(Look::WordStartAscii)
+ || self.contains(Look::WordEndAscii)
+ || self.contains(Look::WordStartHalfAscii)
+ || self.contains(Look::WordEndHalfAscii)
+ }
+
+ /// Returns an iterator over all of the look-around assertions in this set.
+ #[inline]
+ pub fn iter(self) -> LookSetIter {
+ LookSetIter { set: self }
+ }
+
+ /// Return a new set that is equivalent to the original, but with the given
+ /// assertion added to it. If the assertion is already in the set, then the
+ /// returned set is equivalent to the original.
+ #[inline]
+ pub fn insert(self, look: Look) -> LookSet {
+ LookSet { bits: self.bits | look.as_repr() }
+ }
+
+ /// Updates this set in place with the result of inserting the given
+ /// assertion into this set.
+ #[inline]
+ pub fn set_insert(&mut self, look: Look) {
+ *self = self.insert(look);
+ }
+
+ /// Return a new set that is equivalent to the original, but with the given
+ /// assertion removed from it. If the assertion is not in the set, then the
+ /// returned set is equivalent to the original.
+ #[inline]
+ pub fn remove(self, look: Look) -> LookSet {
+ LookSet { bits: self.bits & !look.as_repr() }
+ }
+
+ /// Updates this set in place with the result of removing the given
+ /// assertion from this set.
+ #[inline]
+ pub fn set_remove(&mut self, look: Look) {
+ *self = self.remove(look);
+ }
+
+ /// Returns a new set that is the result of subtracting the given set from
+ /// this set.
+ #[inline]
+ pub fn subtract(self, other: LookSet) -> LookSet {
+ LookSet { bits: self.bits & !other.bits }
+ }
+
+ /// Updates this set in place with the result of subtracting the given set
+ /// from this set.
+ #[inline]
+ pub fn set_subtract(&mut self, other: LookSet) {
+ *self = self.subtract(other);
+ }
+
+ /// Returns a new set that is the union of this and the one given.
+ #[inline]
+ pub fn union(self, other: LookSet) -> LookSet {
+ LookSet { bits: self.bits | other.bits }
+ }
+
+ /// Updates this set in place with the result of unioning it with the one
+ /// given.
+ #[inline]
+ pub fn set_union(&mut self, other: LookSet) {
+ *self = self.union(other);
+ }
+
+ /// Returns a new set that is the intersection of this and the one given.
+ #[inline]
+ pub fn intersect(self, other: LookSet) -> LookSet {
+ LookSet { bits: self.bits & other.bits }
+ }
+
+ /// Updates this set in place with the result of intersecting it with the
+ /// one given.
+ #[inline]
+ pub fn set_intersect(&mut self, other: LookSet) {
+ *self = self.intersect(other);
+ }
+
+ /// Return a `LookSet` from the slice given as a native endian 32-bit
+ /// integer.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `slice.len() < 4`.
+ #[inline]
+ pub fn read_repr(slice: &[u8]) -> LookSet {
+ let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
+ LookSet { bits }
+ }
+
+ /// Write a `LookSet` as a native endian 32-bit integer to the beginning
+ /// of the slice given.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `slice.len() < 4`.
+ #[inline]
+ pub fn write_repr(self, slice: &mut [u8]) {
+ let raw = self.bits.to_ne_bytes();
+ slice[0] = raw[0];
+ slice[1] = raw[1];
+ slice[2] = raw[2];
+ slice[3] = raw[3];
+ }
+}
+
+impl core::fmt::Debug for LookSet {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ if self.is_empty() {
+ return write!(f, "∅");
+ }
+ for look in self.iter() {
+ write!(f, "{}", look.as_char())?;
+ }
+ Ok(())
+ }
+}
+
/// An iterator over all look-around assertions in a [`LookSet`].
///
/// This iterator is created by [`LookSet::iter`].
#[derive(Clone, Debug)]
pub struct LookSetIter {
    // The assertions not yet yielded. Each call to `next` clears the lowest
    // set bit until the set is empty.
    set: LookSet,
}
+
+impl Iterator for LookSetIter {
+ type Item = Look;
+
+ #[inline]
+ fn next(&mut self) -> Option<Look> {
+ if self.set.is_empty() {
+ return None;
+ }
+ // We'll never have more than u8::MAX distinct look-around assertions,
+ // so 'bit' will always fit into a u16.
+ let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
+ let look = Look::from_repr(1 << bit)?;
+ self.set = self.set.remove(look);
+ Some(look)
+ }
+}
+
+/// Given a sequence of HIR values where each value corresponds to a Unicode
+/// class (or an all-ASCII byte class), return a single Unicode class
+/// corresponding to the union of the classes found.
+fn class_chars(hirs: &[Hir]) -> Option<Class> {
+ let mut cls = ClassUnicode::new(vec![]);
+ for hir in hirs.iter() {
+ match *hir.kind() {
+ HirKind::Class(Class::Unicode(ref cls2)) => {
+ cls.union(cls2);
+ }
+ HirKind::Class(Class::Bytes(ref cls2)) => {
+ cls.union(&cls2.to_unicode_class()?);
+ }
+ _ => return None,
+ };
+ }
+ Some(Class::Unicode(cls))
+}
+
+/// Given a sequence of HIR values where each value corresponds to a byte class
+/// (or an all-ASCII Unicode class), return a single byte class corresponding
+/// to the union of the classes found.
+fn class_bytes(hirs: &[Hir]) -> Option<Class> {
+ let mut cls = ClassBytes::new(vec![]);
+ for hir in hirs.iter() {
+ match *hir.kind() {
+ HirKind::Class(Class::Unicode(ref cls2)) => {
+ cls.union(&cls2.to_byte_class()?);
+ }
+ HirKind::Class(Class::Bytes(ref cls2)) => {
+ cls.union(cls2);
+ }
+ _ => return None,
+ };
+ }
+ Some(Class::Bytes(cls))
+}
+
+/// Given a sequence of HIR values where each value corresponds to a literal
+/// that is a single `char`, return that sequence of `char`s. Otherwise return
+/// None. No deduplication is done.
+fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> {
+ let mut singletons = vec![];
+ for hir in hirs.iter() {
+ let literal = match *hir.kind() {
+ HirKind::Literal(Literal(ref bytes)) => bytes,
+ _ => return None,
+ };
+ let ch = match crate::debug::utf8_decode(literal) {
+ None => return None,
+ Some(Err(_)) => return None,
+ Some(Ok(ch)) => ch,
+ };
+ if literal.len() != ch.len_utf8() {
+ return None;
+ }
+ singletons.push(ch);
+ }
+ Some(singletons)
+}
+
+/// Given a sequence of HIR values where each value corresponds to a literal
+/// that is a single byte, return that sequence of bytes. Otherwise return
+/// None. No deduplication is done.
+fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> {
+ let mut singletons = vec![];
+ for hir in hirs.iter() {
+ let literal = match *hir.kind() {
+ HirKind::Literal(Literal(ref bytes)) => bytes,
+ _ => return None,
+ };
+ if literal.len() != 1 {
+ return None;
+ }
+ singletons.push(literal[0]);
+ }
+ Some(singletons)
+}
+
/// Looks for a common prefix in the list of alternation branches given. If one
/// is found, then an equivalent but (hopefully) simplified Hir is returned.
/// Otherwise, the original given list of branches is returned unmodified.
///
/// This is not quite as good as it could be. Right now, it requires that
/// all branches are 'Concat' expressions. It also doesn't do well with
/// literals. For example, given 'foofoo|foobar', it will not refactor it to
/// 'foo(?:foo|bar)' because literals are flattened into their own special
/// concatenation. (One wonders if perhaps 'Literal' should be a single atom
/// instead of a string of bytes because of this. Otherwise, handling the
/// current representation in this routine will be pretty gnarly. Sigh.)
fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
    // With zero or one branch there is nothing to lift. Note that every
    // `Err` below returns the input unmodified so the caller can proceed
    // with the original branches.
    if hirs.len() <= 1 {
        return Err(hirs);
    }
    // Phase 1: compute the longest common prefix by starting with the whole
    // first branch and shrinking it against each remaining branch.
    let mut prefix = match hirs[0].kind() {
        HirKind::Concat(ref xs) => &**xs,
        _ => return Err(hirs),
    };
    if prefix.is_empty() {
        return Err(hirs);
    }
    for h in hirs.iter().skip(1) {
        let concat = match h.kind() {
            HirKind::Concat(ref xs) => xs,
            _ => return Err(hirs),
        };
        // Keep only the leading run of sub-expressions shared with this
        // branch. If nothing is shared, there is no common prefix at all.
        let common_len = prefix
            .iter()
            .zip(concat.iter())
            .take_while(|(x, y)| x == y)
            .count();
        prefix = &prefix[..common_len];
        if prefix.is_empty() {
            return Err(hirs);
        }
    }
    // Phase 2: rebuild. Record the prefix length now, since `prefix` borrows
    // from `hirs` and we're about to consume `hirs` by value.
    let len = prefix.len();
    assert_ne!(0, len);
    let mut prefix_concat = vec![];
    let mut suffix_alts = vec![];
    for h in hirs {
        let mut concat = match h.into_kind() {
            HirKind::Concat(xs) => xs,
            // We required all sub-expressions to be
            // concats above, so we're only here if we
            // have a concat.
            _ => unreachable!(),
        };
        // Everything after the shared prefix becomes one branch of the new
        // alternation; the prefix itself is taken (once) from the first
        // branch processed.
        suffix_alts.push(Hir::concat(concat.split_off(len)));
        if prefix_concat.is_empty() {
            prefix_concat = concat;
        }
    }
    let mut concat = prefix_concat;
    concat.push(Hir::alternation(suffix_alts));
    Ok(Hir::concat(concat))
}
+
#[cfg(test)]
mod tests {
    use super::*;

    // Convenience constructors: build a Unicode/byte class from inclusive
    // (start, end) range pairs.
    fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
        let ranges: Vec<ClassUnicodeRange> = ranges
            .iter()
            .map(|&(s, e)| ClassUnicodeRange::new(s, e))
            .collect();
        ClassUnicode::new(ranges)
    }

    fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
        let ranges: Vec<ClassBytesRange> =
            ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
        ClassBytes::new(ranges)
    }

    // Flatten a Unicode class back into (start, end) pairs for comparison.
    fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
        cls.iter().map(|x| (x.start(), x.end())).collect()
    }

    #[cfg(feature = "unicode-case")]
    fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
        let mut cls_ = cls.clone();
        cls_.case_fold_simple();
        cls_
    }

    // Non-mutating wrappers around the in-place Unicode set operations, to
    // keep the test bodies terse.
    fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
        let mut cls_ = cls1.clone();
        cls_.union(cls2);
        cls_
    }

    fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
        let mut cls_ = cls1.clone();
        cls_.intersect(cls2);
        cls_
    }

    fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
        let mut cls_ = cls1.clone();
        cls_.difference(cls2);
        cls_
    }

    fn usymdifference(
        cls1: &ClassUnicode,
        cls2: &ClassUnicode,
    ) -> ClassUnicode {
        let mut cls_ = cls1.clone();
        cls_.symmetric_difference(cls2);
        cls_
    }

    fn unegate(cls: &ClassUnicode) -> ClassUnicode {
        let mut cls_ = cls.clone();
        cls_.negate();
        cls_
    }

    // Flatten a byte class back into (start, end) pairs for comparison.
    fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
        cls.iter().map(|x| (x.start(), x.end())).collect()
    }

    fn bcasefold(cls: &ClassBytes) -> ClassBytes {
        let mut cls_ = cls.clone();
        cls_.case_fold_simple();
        cls_
    }

    // Non-mutating wrappers around the in-place byte set operations.
    fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
        let mut cls_ = cls1.clone();
        cls_.union(cls2);
        cls_
    }

    fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
        let mut cls_ = cls1.clone();
        cls_.intersect(cls2);
        cls_
    }

    fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
        let mut cls_ = cls1.clone();
        cls_.difference(cls2);
        cls_
    }

    fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
        let mut cls_ = cls1.clone();
        cls_.symmetric_difference(cls2);
        cls_
    }

    fn bnegate(cls: &ClassBytes) -> ClassBytes {
        let mut cls_ = cls.clone();
        cls_.negate();
        cls_
    }

    // Constructing a range with start > end should swap the endpoints into
    // canonical order rather than produce an invalid range.
    #[test]
    fn class_range_canonical_unicode() {
        let range = ClassUnicodeRange::new('\u{00FF}', '\0');
        assert_eq!('\0', range.start());
        assert_eq!('\u{00FF}', range.end());
    }

    #[test]
    fn class_range_canonical_bytes() {
        let range = ClassBytesRange::new(b'\xFF', b'\0');
        assert_eq!(b'\0', range.start());
        assert_eq!(b'\xFF', range.end());
    }

    // Classes must always observe canonical form: sorted, with overlapping
    // and adjacent ranges merged.
    #[test]
    fn class_canonicalize_unicode() {
        let cls = uclass(&[('a', 'c'), ('x', 'z')]);
        let expected = vec![('a', 'c'), ('x', 'z')];
        assert_eq!(expected, uranges(&cls));

        let cls = uclass(&[('x', 'z'), ('a', 'c')]);
        let expected = vec![('a', 'c'), ('x', 'z')];
        assert_eq!(expected, uranges(&cls));

        let cls = uclass(&[('x', 'z'), ('w', 'y')]);
        let expected = vec![('w', 'z')];
        assert_eq!(expected, uranges(&cls));

        let cls = uclass(&[
            ('c', 'f'),
            ('a', 'g'),
            ('d', 'j'),
            ('a', 'c'),
            ('m', 'p'),
            ('l', 's'),
        ]);
        let expected = vec![('a', 'j'), ('l', 's')];
        assert_eq!(expected, uranges(&cls));

        let cls = uclass(&[('x', 'z'), ('u', 'w')]);
        let expected = vec![('u', 'z')];
        assert_eq!(expected, uranges(&cls));

        let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
        let expected = vec![('\x00', '\u{10FFFF}')];
        assert_eq!(expected, uranges(&cls));

        // Adjacent (but non-overlapping) ranges are merged too.
        let cls = uclass(&[('a', 'a'), ('b', 'b')]);
        let expected = vec![('a', 'b')];
        assert_eq!(expected, uranges(&cls));
    }

    #[test]
    fn class_canonicalize_bytes() {
        let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
        let expected = vec![(b'a', b'c'), (b'x', b'z')];
        assert_eq!(expected, branges(&cls));

        let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
        let expected = vec![(b'a', b'c'), (b'x', b'z')];
        assert_eq!(expected, branges(&cls));

        let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
        let expected = vec![(b'w', b'z')];
        assert_eq!(expected, branges(&cls));

        let cls = bclass(&[
            (b'c', b'f'),
            (b'a', b'g'),
            (b'd', b'j'),
            (b'a', b'c'),
            (b'm', b'p'),
            (b'l', b's'),
        ]);
        let expected = vec![(b'a', b'j'), (b'l', b's')];
        assert_eq!(expected, branges(&cls));

        let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
        let expected = vec![(b'u', b'z')];
        assert_eq!(expected, branges(&cls));

        let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
        let expected = vec![(b'\x00', b'\xFF')];
        assert_eq!(expected, branges(&cls));

        let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
        let expected = vec![(b'a', b'b')];
        assert_eq!(expected, branges(&cls));
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn class_case_fold_unicode() {
        let cls = uclass(&[
            ('C', 'F'),
            ('A', 'G'),
            ('D', 'J'),
            ('A', 'C'),
            ('M', 'P'),
            ('L', 'S'),
            ('c', 'f'),
        ]);
        // U+017F is LATIN SMALL LETTER LONG S, which simple-folds with 's'.
        let expected = uclass(&[
            ('A', 'J'),
            ('L', 'S'),
            ('a', 'j'),
            ('l', 's'),
            ('\u{17F}', '\u{17F}'),
        ]);
        assert_eq!(expected, ucasefold(&cls));

        // U+212A is KELVIN SIGN, which simple-folds with 'k'.
        let cls = uclass(&[('A', 'Z')]);
        let expected = uclass(&[
            ('A', 'Z'),
            ('a', 'z'),
            ('\u{17F}', '\u{17F}'),
            ('\u{212A}', '\u{212A}'),
        ]);
        assert_eq!(expected, ucasefold(&cls));

        let cls = uclass(&[('a', 'z')]);
        let expected = uclass(&[
            ('A', 'Z'),
            ('a', 'z'),
            ('\u{17F}', '\u{17F}'),
            ('\u{212A}', '\u{212A}'),
        ]);
        assert_eq!(expected, ucasefold(&cls));

        let cls = uclass(&[('A', 'A'), ('_', '_')]);
        let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
        assert_eq!(expected, ucasefold(&cls));

        let cls = uclass(&[('A', 'A'), ('=', '=')]);
        let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
        assert_eq!(expected, ucasefold(&cls));

        // Ranges with no case-folded counterparts are unchanged.
        let cls = uclass(&[('\x00', '\x10')]);
        assert_eq!(cls, ucasefold(&cls));

        let cls = uclass(&[('k', 'k')]);
        let expected =
            uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
        assert_eq!(expected, ucasefold(&cls));

        let cls = uclass(&[('@', '@')]);
        assert_eq!(cls, ucasefold(&cls));
    }

    // Without the unicode-case feature, the fallible API reports an error...
    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn class_case_fold_unicode_disabled() {
        let mut cls = uclass(&[
            ('C', 'F'),
            ('A', 'G'),
            ('D', 'J'),
            ('A', 'C'),
            ('M', 'P'),
            ('L', 'S'),
            ('c', 'f'),
        ]);
        assert!(cls.try_case_fold_simple().is_err());
    }

    // ...while the infallible API panics.
    #[test]
    #[should_panic]
    #[cfg(not(feature = "unicode-case"))]
    fn class_case_fold_unicode_disabled_panics() {
        let mut cls = uclass(&[
            ('C', 'F'),
            ('A', 'G'),
            ('D', 'J'),
            ('A', 'C'),
            ('M', 'P'),
            ('L', 'S'),
            ('c', 'f'),
        ]);
        cls.case_fold_simple();
    }

    // Byte classes only fold within ASCII, so no U+017F/U+212A here.
    #[test]
    fn class_case_fold_bytes() {
        let cls = bclass(&[
            (b'C', b'F'),
            (b'A', b'G'),
            (b'D', b'J'),
            (b'A', b'C'),
            (b'M', b'P'),
            (b'L', b'S'),
            (b'c', b'f'),
        ]);
        let expected =
            bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
        assert_eq!(expected, bcasefold(&cls));

        let cls = bclass(&[(b'A', b'Z')]);
        let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
        assert_eq!(expected, bcasefold(&cls));

        let cls = bclass(&[(b'a', b'z')]);
        let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
        assert_eq!(expected, bcasefold(&cls));

        let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
        let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
        assert_eq!(expected, bcasefold(&cls));

        let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
        let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
        assert_eq!(expected, bcasefold(&cls));

        let cls = bclass(&[(b'\x00', b'\x10')]);
        assert_eq!(cls, bcasefold(&cls));

        let cls = bclass(&[(b'k', b'k')]);
        let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
        assert_eq!(expected, bcasefold(&cls));

        let cls = bclass(&[(b'@', b'@')]);
        assert_eq!(cls, bcasefold(&cls));
    }

    // Negation of a Unicode class skips the surrogate range
    // U+D800..=U+DFFF, since those aren't valid scalar values.
    #[test]
    fn class_negate_unicode() {
        let cls = uclass(&[('a', 'a')]);
        let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
        assert_eq!(expected, unegate(&cls));

        let cls = uclass(&[('a', 'a'), ('b', 'b')]);
        let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
        assert_eq!(expected, unegate(&cls));

        let cls = uclass(&[('a', 'c'), ('x', 'z')]);
        let expected = uclass(&[
            ('\x00', '\x60'),
            ('\x64', '\x77'),
            ('\x7B', '\u{10FFFF}'),
        ]);
        assert_eq!(expected, unegate(&cls));

        let cls = uclass(&[('\x00', 'a')]);
        let expected = uclass(&[('\x62', '\u{10FFFF}')]);
        assert_eq!(expected, unegate(&cls));

        let cls = uclass(&[('a', '\u{10FFFF}')]);
        let expected = uclass(&[('\x00', '\x60')]);
        assert_eq!(expected, unegate(&cls));

        let cls = uclass(&[('\x00', '\u{10FFFF}')]);
        let expected = uclass(&[]);
        assert_eq!(expected, unegate(&cls));

        let cls = uclass(&[]);
        let expected = uclass(&[('\x00', '\u{10FFFF}')]);
        assert_eq!(expected, unegate(&cls));

        let cls =
            uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
        let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
        assert_eq!(expected, unegate(&cls));

        // Boundaries around the surrogate gap (U+D7FF / U+E000).
        let cls = uclass(&[('\x00', '\u{D7FF}')]);
        let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
        assert_eq!(expected, unegate(&cls));

        let cls = uclass(&[('\x00', '\u{D7FE}')]);
        let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
        assert_eq!(expected, unegate(&cls));

        let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
        let expected = uclass(&[('\x00', '\u{D7FF}')]);
        assert_eq!(expected, unegate(&cls));

        let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
        let expected = uclass(&[('\x00', '\u{E000}')]);
        assert_eq!(expected, unegate(&cls));
    }

    #[test]
    fn class_negate_bytes() {
        let cls = bclass(&[(b'a', b'a')]);
        let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
        assert_eq!(expected, bnegate(&cls));

        let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
        let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
        assert_eq!(expected, bnegate(&cls));

        let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
        let expected = bclass(&[
            (b'\x00', b'\x60'),
            (b'\x64', b'\x77'),
            (b'\x7B', b'\xFF'),
        ]);
        assert_eq!(expected, bnegate(&cls));

        let cls = bclass(&[(b'\x00', b'a')]);
        let expected = bclass(&[(b'\x62', b'\xFF')]);
        assert_eq!(expected, bnegate(&cls));

        let cls = bclass(&[(b'a', b'\xFF')]);
        let expected = bclass(&[(b'\x00', b'\x60')]);
        assert_eq!(expected, bnegate(&cls));

        let cls = bclass(&[(b'\x00', b'\xFF')]);
        let expected = bclass(&[]);
        assert_eq!(expected, bnegate(&cls));

        let cls = bclass(&[]);
        let expected = bclass(&[(b'\x00', b'\xFF')]);
        assert_eq!(expected, bnegate(&cls));

        let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
        let expected = bclass(&[(b'\xFE', b'\xFE')]);
        assert_eq!(expected, bnegate(&cls));
    }

    #[test]
    fn class_union_unicode() {
        let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
        let cls2 = uclass(&[('a', 'z')]);
        let expected = uclass(&[('a', 'z'), ('A', 'C')]);
        assert_eq!(expected, uunion(&cls1, &cls2));
    }

    #[test]
    fn class_union_bytes() {
        let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
        let cls2 = bclass(&[(b'a', b'z')]);
        let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
        assert_eq!(expected, bunion(&cls1, &cls2));
    }

    #[test]
    fn class_intersect_unicode() {
        let cls1 = uclass(&[]);
        let cls2 = uclass(&[('a', 'a')]);
        let expected = uclass(&[]);
        assert_eq!(expected, uintersect(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'a')]);
        let cls2 = uclass(&[('a', 'a')]);
        let expected = uclass(&[('a', 'a')]);
        assert_eq!(expected, uintersect(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'a')]);
        let cls2 = uclass(&[('b', 'b')]);
        let expected = uclass(&[]);
        assert_eq!(expected, uintersect(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'a')]);
        let cls2 = uclass(&[('a', 'c')]);
        let expected = uclass(&[('a', 'a')]);
        assert_eq!(expected, uintersect(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'b')]);
        let cls2 = uclass(&[('a', 'c')]);
        let expected = uclass(&[('a', 'b')]);
        assert_eq!(expected, uintersect(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'b')]);
        let cls2 = uclass(&[('b', 'c')]);
        let expected = uclass(&[('b', 'b')]);
        assert_eq!(expected, uintersect(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'b')]);
        let cls2 = uclass(&[('c', 'd')]);
        let expected = uclass(&[]);
        assert_eq!(expected, uintersect(&cls1, &cls2));

        let cls1 = uclass(&[('b', 'c')]);
        let cls2 = uclass(&[('a', 'd')]);
        let expected = uclass(&[('b', 'c')]);
        assert_eq!(expected, uintersect(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
        let cls2 = uclass(&[('a', 'h')]);
        let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
        assert_eq!(expected, uintersect(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
        let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
        let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
        assert_eq!(expected, uintersect(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
        let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
        let expected = uclass(&[]);
        assert_eq!(expected, uintersect(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
        let cls2 = uclass(&[('h', 'h')]);
        let expected = uclass(&[('h', 'h')]);
        assert_eq!(expected, uintersect(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
        let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
        let expected = uclass(&[]);
        assert_eq!(expected, uintersect(&cls1, &cls2));

        // Intersection results are canonicalized too: the three pieces
        // here merge into a single contiguous range.
        let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
        let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
        let expected = uclass(&[('b', 'f')]);
        assert_eq!(expected, uintersect(&cls1, &cls2));
    }

    #[test]
    fn class_intersect_bytes() {
        let cls1 = bclass(&[]);
        let cls2 = bclass(&[(b'a', b'a')]);
        let expected = bclass(&[]);
        assert_eq!(expected, bintersect(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'a')]);
        let cls2 = bclass(&[(b'a', b'a')]);
        let expected = bclass(&[(b'a', b'a')]);
        assert_eq!(expected, bintersect(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'a')]);
        let cls2 = bclass(&[(b'b', b'b')]);
        let expected = bclass(&[]);
        assert_eq!(expected, bintersect(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'a')]);
        let cls2 = bclass(&[(b'a', b'c')]);
        let expected = bclass(&[(b'a', b'a')]);
        assert_eq!(expected, bintersect(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'b')]);
        let cls2 = bclass(&[(b'a', b'c')]);
        let expected = bclass(&[(b'a', b'b')]);
        assert_eq!(expected, bintersect(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'b')]);
        let cls2 = bclass(&[(b'b', b'c')]);
        let expected = bclass(&[(b'b', b'b')]);
        assert_eq!(expected, bintersect(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'b')]);
        let cls2 = bclass(&[(b'c', b'd')]);
        let expected = bclass(&[]);
        assert_eq!(expected, bintersect(&cls1, &cls2));

        let cls1 = bclass(&[(b'b', b'c')]);
        let cls2 = bclass(&[(b'a', b'd')]);
        let expected = bclass(&[(b'b', b'c')]);
        assert_eq!(expected, bintersect(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
        let cls2 = bclass(&[(b'a', b'h')]);
        let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
        assert_eq!(expected, bintersect(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
        let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
        let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
        assert_eq!(expected, bintersect(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
        let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
        let expected = bclass(&[]);
        assert_eq!(expected, bintersect(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
        let cls2 = bclass(&[(b'h', b'h')]);
        let expected = bclass(&[(b'h', b'h')]);
        assert_eq!(expected, bintersect(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
        let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
        let expected = bclass(&[]);
        assert_eq!(expected, bintersect(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
        let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
        let expected = bclass(&[(b'b', b'f')]);
        assert_eq!(expected, bintersect(&cls1, &cls2));
    }

    #[test]
    fn class_difference_unicode() {
        let cls1 = uclass(&[('a', 'a')]);
        let cls2 = uclass(&[('a', 'a')]);
        let expected = uclass(&[]);
        assert_eq!(expected, udifference(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'a')]);
        let cls2 = uclass(&[]);
        let expected = uclass(&[('a', 'a')]);
        assert_eq!(expected, udifference(&cls1, &cls2));

        let cls1 = uclass(&[]);
        let cls2 = uclass(&[('a', 'a')]);
        let expected = uclass(&[]);
        assert_eq!(expected, udifference(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'z')]);
        let cls2 = uclass(&[('a', 'a')]);
        let expected = uclass(&[('b', 'z')]);
        assert_eq!(expected, udifference(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'z')]);
        let cls2 = uclass(&[('z', 'z')]);
        let expected = uclass(&[('a', 'y')]);
        assert_eq!(expected, udifference(&cls1, &cls2));

        // Subtracting from the middle splits a range in two.
        let cls1 = uclass(&[('a', 'z')]);
        let cls2 = uclass(&[('m', 'm')]);
        let expected = uclass(&[('a', 'l'), ('n', 'z')]);
        assert_eq!(expected, udifference(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
        let cls2 = uclass(&[('a', 'z')]);
        let expected = uclass(&[]);
        assert_eq!(expected, udifference(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
        let cls2 = uclass(&[('d', 'v')]);
        let expected = uclass(&[('a', 'c')]);
        assert_eq!(expected, udifference(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
        let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
        let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
        assert_eq!(expected, udifference(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
        let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
        let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
        assert_eq!(expected, udifference(&cls1, &cls2));

        let cls1 = uclass(&[('x', 'z')]);
        let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
        let expected = uclass(&[('x', 'z')]);
        assert_eq!(expected, udifference(&cls1, &cls2));

        let cls1 = uclass(&[('a', 'z')]);
        let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
        let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
        assert_eq!(expected, udifference(&cls1, &cls2));
    }

    #[test]
    fn class_difference_bytes() {
        let cls1 = bclass(&[(b'a', b'a')]);
        let cls2 = bclass(&[(b'a', b'a')]);
        let expected = bclass(&[]);
        assert_eq!(expected, bdifference(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'a')]);
        let cls2 = bclass(&[]);
        let expected = bclass(&[(b'a', b'a')]);
        assert_eq!(expected, bdifference(&cls1, &cls2));

        let cls1 = bclass(&[]);
        let cls2 = bclass(&[(b'a', b'a')]);
        let expected = bclass(&[]);
        assert_eq!(expected, bdifference(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'z')]);
        let cls2 = bclass(&[(b'a', b'a')]);
        let expected = bclass(&[(b'b', b'z')]);
        assert_eq!(expected, bdifference(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'z')]);
        let cls2 = bclass(&[(b'z', b'z')]);
        let expected = bclass(&[(b'a', b'y')]);
        assert_eq!(expected, bdifference(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'z')]);
        let cls2 = bclass(&[(b'm', b'm')]);
        let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
        assert_eq!(expected, bdifference(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
        let cls2 = bclass(&[(b'a', b'z')]);
        let expected = bclass(&[]);
        assert_eq!(expected, bdifference(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
        let cls2 = bclass(&[(b'd', b'v')]);
        let expected = bclass(&[(b'a', b'c')]);
        assert_eq!(expected, bdifference(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
        let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
        let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
        assert_eq!(expected, bdifference(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
        let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
        let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
        assert_eq!(expected, bdifference(&cls1, &cls2));

        let cls1 = bclass(&[(b'x', b'z')]);
        let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
        let expected = bclass(&[(b'x', b'z')]);
        assert_eq!(expected, bdifference(&cls1, &cls2));

        let cls1 = bclass(&[(b'a', b'z')]);
        let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
        let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
        assert_eq!(expected, bdifference(&cls1, &cls2));
    }

    #[test]
    fn class_symmetric_difference_unicode() {
        let cls1 = uclass(&[('a', 'm')]);
        let cls2 = uclass(&[('g', 't')]);
        let expected = uclass(&[('a', 'f'), ('n', 't')]);
        assert_eq!(expected, usymdifference(&cls1, &cls2));
    }

    #[test]
    fn class_symmetric_difference_bytes() {
        let cls1 = bclass(&[(b'a', b'm')]);
        let cls2 = bclass(&[(b'g', b't')]);
        let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
        assert_eq!(expected, bsymdifference(&cls1, &cls2));
    }

    // We use a thread with an explicit stack size to test that our destructor
    // for Hir can handle arbitrarily sized expressions in constant stack
    // space. In case we run on a platform without threads (WASM?), we limit
    // this test to Windows/Unix.
    #[test]
    #[cfg(any(unix, windows))]
    fn no_stack_overflow_on_drop() {
        use std::thread;

        let run = || {
            // Build a deeply nested expression: each iteration wraps the
            // previous one in a capture, a repetition, a concat and an
            // alternation.
            let mut expr = Hir::empty();
            for _ in 0..100 {
                expr = Hir::capture(Capture {
                    index: 1,
                    name: None,
                    sub: Box::new(expr),
                });
                expr = Hir::repetition(Repetition {
                    min: 0,
                    max: Some(1),
                    greedy: true,
                    sub: Box::new(expr),
                });

                expr = Hir {
                    kind: HirKind::Concat(vec![expr]),
                    props: Properties::empty(),
                };
                expr = Hir {
                    kind: HirKind::Alternation(vec![expr]),
                    props: Properties::empty(),
                };
            }
            assert!(!matches!(*expr.kind(), HirKind::Empty));
        };

        // We run our test on a thread with a small stack size so we can
        // force the issue more easily.
        //
        // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
        // for context on the specific stack size chosen here.
        thread::Builder::new()
            .stack_size(16 << 10)
            .spawn(run)
            .unwrap()
            .join()
            .unwrap();
    }

    #[test]
    fn look_set_iter() {
        let set = LookSet::empty();
        assert_eq!(0, set.iter().count());

        // The full set currently has 18 distinct assertions.
        let set = LookSet::full();
        assert_eq!(18, set.iter().count());

        let set =
            LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
        assert_eq!(2, set.iter().count());

        let set = LookSet::empty().insert(Look::StartLF);
        assert_eq!(1, set.iter().count());

        let set = LookSet::empty().insert(Look::WordAsciiNegate);
        assert_eq!(1, set.iter().count());
    }

    #[test]
    fn look_set_debug() {
        let res = format!("{:?}", LookSet::empty());
        assert_eq!("∅", res);
        let res = format!("{:?}", LookSet::full());
        assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
    }
}
diff --git a/vendor/regex-syntax/src/hir/print.rs b/vendor/regex-syntax/src/hir/print.rs
new file mode 100644
index 0000000..dfa6d40
--- /dev/null
+++ b/vendor/regex-syntax/src/hir/print.rs
@@ -0,0 +1,608 @@
+/*!
+This module provides a regular expression printer for `Hir`.
+*/
+
+use core::fmt;
+
+use crate::{
+ hir::{
+ self,
+ visitor::{self, Visitor},
+ Hir, HirKind,
+ },
+ is_meta_character,
+};
+
/// A builder for constructing a printer.
///
/// Note that since a printer doesn't have any configuration knobs, this type
/// remains unexported.
#[derive(Clone, Debug)]
struct PrinterBuilder {
    // Zero-sized private field; it exists only so the struct cannot be
    // constructed with a struct literal outside this module.
    _priv: (),
}
+
+impl Default for PrinterBuilder {
+ fn default() -> PrinterBuilder {
+ PrinterBuilder::new()
+ }
+}
+
+impl PrinterBuilder {
+ fn new() -> PrinterBuilder {
+ PrinterBuilder { _priv: () }
+ }
+
+ fn build(&self) -> Printer {
+ Printer { _priv: () }
+ }
+}
+
/// A printer for a regular expression's high-level intermediate
/// representation.
///
/// A printer converts a high-level intermediate representation (HIR) to a
/// regular expression pattern string. This particular printer uses constant
/// stack space and heap space proportional to the size of the HIR.
///
/// Since this printer is only using the HIR, the pattern it prints will likely
/// not resemble the original pattern at all. For example, a pattern like
/// `\pL` will have its entire class written out.
///
/// The purpose of this printer is to provide a means to mutate an HIR and then
/// build a regular expression from the result of that mutation. (A regex
/// library could provide a constructor from this HIR explicitly, but that
/// creates an unnecessary public coupling between the regex library and this
/// specific HIR representation.)
#[derive(Debug)]
pub struct Printer {
    // Zero-sized private field preventing construction outside this module;
    // all printing state lives in the `Writer` created per `print` call.
    _priv: (),
}
+
impl Printer {
    /// Create a new printer.
    pub fn new() -> Printer {
        PrinterBuilder::new().build()
    }

    /// Print the given `Hir` to the given writer. The writer must implement
    /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
    /// here are a `fmt::Formatter` (which is available in `fmt::Display`
    /// implementations) or a `&mut String`.
    pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result {
        visitor::visit(hir, Writer { wtr })
    }
}
+
/// The visitor that does the actual printing: it receives each HIR node as
/// the tree is traversed and writes the corresponding concrete syntax.
#[derive(Debug)]
struct Writer<W> {
    // The underlying `fmt::Write` destination for the printed pattern.
    wtr: W,
}
+
impl<W: fmt::Write> Visitor for Writer<W> {
    type Output = ();
    type Err = fmt::Error;

    /// Nothing to finalize: all output is written incrementally during the
    /// traversal.
    fn finish(self) -> fmt::Result {
        Ok(())
    }

    /// Write the prefix syntax for a node before its children are visited
    /// (e.g. the opening paren of a group). Leaf nodes are written here in
    /// full.
    fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
        match *hir.kind() {
            HirKind::Empty => {
                // Technically an empty sub-expression could be "printed" by
                // just ignoring it, but in practice, you could have a
                // repetition operator attached to an empty expression, and you
                // really need something in the concrete syntax to make that
                // work as you'd expect.
                self.wtr.write_str(r"(?:)")?;
            }
            // Repetition operators are strictly suffix oriented.
            HirKind::Repetition(_) => {}
            HirKind::Literal(hir::Literal(ref bytes)) => {
                // See the comment on the 'Concat' and 'Alternation' case below
                // for why we put parens here. Literals are, conceptually,
                // a special case of concatenation where each element is a
                // character. The HIR flattens this into a Box<[u8]>, but we
                // still need to treat it like a concatenation for correct
                // printing. As a special case, we don't write parens if there
                // is only one character. One character means there is no
                // concat so we don't need parens. Adding parens would still be
                // correct, but we drop them here because it tends to create
                // rather noisy regexes even in simple cases.
                //
                // For valid UTF-8, 'len' counts chars; otherwise it counts
                // bytes, since each byte is then printed individually.
                let result = core::str::from_utf8(bytes);
                let len = result.map_or(bytes.len(), |s| s.chars().count());
                if len > 1 {
                    self.wtr.write_str(r"(?:")?;
                }
                match result {
                    Ok(string) => {
                        for c in string.chars() {
                            self.write_literal_char(c)?;
                        }
                    }
                    Err(_) => {
                        for &b in bytes.iter() {
                            self.write_literal_byte(b)?;
                        }
                    }
                }
                if len > 1 {
                    self.wtr.write_str(r")")?;
                }
            }
            HirKind::Class(hir::Class::Unicode(ref cls)) => {
                // An empty class matches nothing and has no direct concrete
                // syntax, so print an intersection of two disjoint literals,
                // which is also empty.
                if cls.ranges().is_empty() {
                    return self.wtr.write_str("[a&&b]");
                }
                self.wtr.write_str("[")?;
                for range in cls.iter() {
                    if range.start() == range.end() {
                        self.write_literal_char(range.start())?;
                    } else if u32::from(range.start()) + 1
                        == u32::from(range.end())
                    {
                        // A two-codepoint range prints as two chars, not
                        // 'a-b', to reduce noise.
                        self.write_literal_char(range.start())?;
                        self.write_literal_char(range.end())?;
                    } else {
                        self.write_literal_char(range.start())?;
                        self.wtr.write_str("-")?;
                        self.write_literal_char(range.end())?;
                    }
                }
                self.wtr.write_str("]")?;
            }
            HirKind::Class(hir::Class::Bytes(ref cls)) => {
                // Same empty-class trick as the Unicode case above.
                if cls.ranges().is_empty() {
                    return self.wtr.write_str("[a&&b]");
                }
                // Byte classes are wrapped in '(?-u:...)' so hex escapes
                // denote raw bytes rather than codepoints.
                self.wtr.write_str("(?-u:[")?;
                for range in cls.iter() {
                    if range.start() == range.end() {
                        self.write_literal_class_byte(range.start())?;
                    } else if range.start() + 1 == range.end() {
                        self.write_literal_class_byte(range.start())?;
                        self.write_literal_class_byte(range.end())?;
                    } else {
                        self.write_literal_class_byte(range.start())?;
                        self.wtr.write_str("-")?;
                        self.write_literal_class_byte(range.end())?;
                    }
                }
                self.wtr.write_str("])")?;
            }
            HirKind::Look(ref look) => match *look {
                hir::Look::Start => {
                    self.wtr.write_str(r"\A")?;
                }
                hir::Look::End => {
                    self.wtr.write_str(r"\z")?;
                }
                hir::Look::StartLF => {
                    self.wtr.write_str("(?m:^)")?;
                }
                hir::Look::EndLF => {
                    self.wtr.write_str("(?m:$)")?;
                }
                hir::Look::StartCRLF => {
                    self.wtr.write_str("(?mR:^)")?;
                }
                hir::Look::EndCRLF => {
                    self.wtr.write_str("(?mR:$)")?;
                }
                hir::Look::WordAscii => {
                    self.wtr.write_str(r"(?-u:\b)")?;
                }
                hir::Look::WordAsciiNegate => {
                    self.wtr.write_str(r"(?-u:\B)")?;
                }
                hir::Look::WordUnicode => {
                    self.wtr.write_str(r"\b")?;
                }
                hir::Look::WordUnicodeNegate => {
                    self.wtr.write_str(r"\B")?;
                }
                hir::Look::WordStartAscii => {
                    self.wtr.write_str(r"(?-u:\b{start})")?;
                }
                hir::Look::WordEndAscii => {
                    self.wtr.write_str(r"(?-u:\b{end})")?;
                }
                hir::Look::WordStartUnicode => {
                    self.wtr.write_str(r"\b{start}")?;
                }
                hir::Look::WordEndUnicode => {
                    self.wtr.write_str(r"\b{end}")?;
                }
                hir::Look::WordStartHalfAscii => {
                    self.wtr.write_str(r"(?-u:\b{start-half})")?;
                }
                hir::Look::WordEndHalfAscii => {
                    self.wtr.write_str(r"(?-u:\b{end-half})")?;
                }
                hir::Look::WordStartHalfUnicode => {
                    self.wtr.write_str(r"\b{start-half}")?;
                }
                hir::Look::WordEndHalfUnicode => {
                    self.wtr.write_str(r"\b{end-half}")?;
                }
            },
            HirKind::Capture(hir::Capture { ref name, .. }) => {
                self.wtr.write_str("(")?;
                if let Some(ref name) = *name {
                    write!(self.wtr, "?P<{}>", name)?;
                }
            }
            // Why do this? Wrapping concats and alts in non-capturing groups
            // is not *always* necessary, but is sometimes necessary. For
            // example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)'
            // and not 'ab|c'. The former is clearly the intended meaning, but
            // the latter is actually 'alt(concat(a, b), c)'.
            //
            // It would be possible to only group these things in cases where
            // it's strictly necessary, but it requires knowing the parent
            // expression. And since this technique is simpler and always
            // correct, we take this route. More to the point, it is a non-goal
            // of an HIR printer to show a nice easy-to-read regex. Indeed,
            // its construction forbids it from doing so. Therefore, inserting
            // extra groups where they aren't necessary is perfectly okay.
            HirKind::Concat(_) | HirKind::Alternation(_) => {
                self.wtr.write_str(r"(?:")?;
            }
        }
        Ok(())
    }

    /// Write the suffix syntax for a node after its children have been
    /// visited: repetition operators and closing parens.
    fn visit_post(&mut self, hir: &Hir) -> fmt::Result {
        match *hir.kind() {
            // Handled during visit_pre
            HirKind::Empty
            | HirKind::Literal(_)
            | HirKind::Class(_)
            | HirKind::Look(_) => {}
            HirKind::Repetition(ref x) => {
                // Prefer the short operators (?, *, +) over the general
                // {m,n} form where possible.
                match (x.min, x.max) {
                    (0, Some(1)) => {
                        self.wtr.write_str("?")?;
                    }
                    (0, None) => {
                        self.wtr.write_str("*")?;
                    }
                    (1, None) => {
                        self.wtr.write_str("+")?;
                    }
                    (1, Some(1)) => {
                        // 'a{1}' and 'a{1}?' are exactly equivalent to 'a'.
                        return Ok(());
                    }
                    (m, None) => {
                        write!(self.wtr, "{{{},}}", m)?;
                    }
                    (m, Some(n)) if m == n => {
                        write!(self.wtr, "{{{}}}", m)?;
                        // a{m} and a{m}? are always exactly equivalent.
                        return Ok(());
                    }
                    (m, Some(n)) => {
                        write!(self.wtr, "{{{},{}}}", m, n)?;
                    }
                }
                if !x.greedy {
                    self.wtr.write_str("?")?;
                }
            }
            HirKind::Capture(_)
            | HirKind::Concat(_)
            | HirKind::Alternation(_) => {
                self.wtr.write_str(r")")?;
            }
        }
        Ok(())
    }

    /// Write the '|' separating two branches of an alternation.
    fn visit_alternation_in(&mut self) -> fmt::Result {
        self.wtr.write_str("|")
    }
}
+
+impl<W: fmt::Write> Writer<W> {
+ fn write_literal_char(&mut self, c: char) -> fmt::Result {
+ if is_meta_character(c) {
+ self.wtr.write_str("\\")?;
+ }
+ self.wtr.write_char(c)
+ }
+
+ fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
+ if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() {
+ self.write_literal_char(char::try_from(b).unwrap())
+ } else {
+ write!(self.wtr, "(?-u:\\x{:02X})", b)
+ }
+ }
+
+ fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
+ if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() {
+ self.write_literal_char(char::try_from(b).unwrap())
+ } else {
+ write!(self.wtr, "\\x{:02X}", b)
+ }
+ }
+}
+
#[cfg(test)]
mod tests {
    use alloc::{
        boxed::Box,
        string::{String, ToString},
    };

    use crate::ParserBuilder;

    use super::*;

    // Parse 'given' with the default parser configuration, print the
    // resulting HIR, and assert the printed pattern equals 'expected'.
    fn roundtrip(given: &str, expected: &str) {
        roundtrip_with(|b| b, given, expected);
    }

    // Like 'roundtrip', but with UTF-8 mode disabled so patterns may
    // match invalid UTF-8.
    fn roundtrip_bytes(given: &str, expected: &str) {
        roundtrip_with(|b| b.utf8(false), given, expected);
    }

    // Shared driver: configure a parser via 'f', parse 'given', print the
    // HIR, verify that the printed pattern itself re-parses, and compare
    // it against 'expected'.
    fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);
        let hir = builder.build().parse(given).unwrap();

        let mut printer = Printer::new();
        let mut dst = String::new();
        printer.print(&hir, &mut dst).unwrap();

        // Check that the result is actually valid.
        builder.build().parse(&dst).unwrap();

        assert_eq!(expected, dst);
    }

    #[test]
    fn print_literal() {
        roundtrip("a", "a");
        roundtrip(r"\xff", "\u{FF}");
        roundtrip_bytes(r"\xff", "\u{FF}");
        roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)");
        roundtrip("☃", "☃");
    }

    #[test]
    fn print_class() {
        roundtrip(r"[a]", r"a");
        roundtrip(r"[ab]", r"[ab]");
        roundtrip(r"[a-z]", r"[a-z]");
        roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]");
        roundtrip(r"[^\x01-\u{10FFFF}]", "\u{0}");
        roundtrip(r"[-]", r"\-");
        roundtrip(r"[☃-⛄]", r"[☃-⛄]");

        roundtrip(r"(?-u)[a]", r"a");
        roundtrip(r"(?-u)[ab]", r"(?-u:[ab])");
        roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
        roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");

        // The following test that the printer escapes meta characters
        // in character classes.
        roundtrip(r"[\[]", r"\[");
        roundtrip(r"[Z-_]", r"[Z-_]");
        roundtrip(r"[Z-_--Z]", r"[\[-_]");

        // The following test that the printer escapes meta characters
        // in byte oriented character classes.
        roundtrip_bytes(r"(?-u)[\[]", r"\[");
        roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])");
        roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])");

        // This tests that an empty character class is correctly roundtripped.
        #[cfg(feature = "unicode-gencat")]
        roundtrip(r"\P{any}", r"[a&&b]");
        roundtrip_bytes(r"(?-u)[^\x00-\xFF]", r"[a&&b]");
    }

    #[test]
    fn print_anchor() {
        roundtrip(r"^", r"\A");
        roundtrip(r"$", r"\z");
        roundtrip(r"(?m)^", r"(?m:^)");
        roundtrip(r"(?m)$", r"(?m:$)");
    }

    #[test]
    fn print_word_boundary() {
        roundtrip(r"\b", r"\b");
        roundtrip(r"\B", r"\B");
        roundtrip(r"(?-u)\b", r"(?-u:\b)");
        roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
    }

    #[test]
    fn print_repetition() {
        roundtrip("a?", "a?");
        roundtrip("a??", "a??");
        roundtrip("(?U)a?", "a??");

        roundtrip("a*", "a*");
        roundtrip("a*?", "a*?");
        roundtrip("(?U)a*", "a*?");

        roundtrip("a+", "a+");
        roundtrip("a+?", "a+?");
        roundtrip("(?U)a+", "a+?");

        roundtrip("a{1}", "a");
        roundtrip("a{2}", "a{2}");
        roundtrip("a{1,}", "a+");
        roundtrip("a{1,5}", "a{1,5}");
        roundtrip("a{1}?", "a");
        roundtrip("a{2}?", "a{2}");
        roundtrip("a{1,}?", "a+?");
        roundtrip("a{1,5}?", "a{1,5}?");
        roundtrip("(?U)a{1}", "a");
        roundtrip("(?U)a{2}", "a{2}");
        roundtrip("(?U)a{1,}", "a+?");
        roundtrip("(?U)a{1,5}", "a{1,5}?");

        // Test that various zero-length repetitions always translate to an
        // empty regex. This is more a property of HIR's smart constructors
        // than the printer though.
        roundtrip("a{0}", "(?:)");
        roundtrip("(?:ab){0}", "(?:)");
        #[cfg(feature = "unicode-gencat")]
        {
            roundtrip(r"\p{any}{0}", "(?:)");
            roundtrip(r"\P{any}{0}", "(?:)");
        }
    }

    #[test]
    fn print_group() {
        roundtrip("()", "((?:))");
        roundtrip("(?P<foo>)", "(?P<foo>(?:))");
        roundtrip("(?:)", "(?:)");

        roundtrip("(a)", "(a)");
        roundtrip("(?P<foo>a)", "(?P<foo>a)");
        roundtrip("(?:a)", "a");

        roundtrip("((((a))))", "((((a))))");
    }

    #[test]
    fn print_alternation() {
        roundtrip("|", "(?:(?:)|(?:))");
        roundtrip("||", "(?:(?:)|(?:)|(?:))");

        // Single-char alternations are simplified to classes by the HIR
        // smart constructors before the printer ever sees them.
        roundtrip("a|b", "[ab]");
        roundtrip("ab|cd", "(?:(?:ab)|(?:cd))");
        roundtrip("a|b|c", "[a-c]");
        roundtrip("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))");
        roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))");
    }

    // This is a regression test that stresses a peculiarity of how the HIR
    // is both constructed and printed. Namely, it is legal for a repetition
    // to directly contain a concatenation. This particular construct isn't
    // really possible to build from the concrete syntax directly, since you'd
    // be forced to put the concatenation into (at least) a non-capturing
    // group. Concurrently, the printer doesn't consider this case and just
    // kind of naively prints the child expression and tacks on the repetition
    // operator.
    //
    // As a result, if you attached '+' to a 'concat(a, b)', the printer gives
    // you 'ab+', but clearly it really should be '(?:ab)+'.
    //
    // This bug isn't easy to surface because most ways of building an HIR
    // come directly from the concrete syntax, and as mentioned above, it just
    // isn't possible to build this kind of HIR from the concrete syntax.
    // Nevertheless, this is definitely a bug.
    //
    // See: https://github.com/rust-lang/regex/issues/731
    #[test]
    fn regression_repetition_concat() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("x".as_bytes()),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::literal("ab".as_bytes())),
            }),
            Hir::literal("y".as_bytes()),
        ]);
        assert_eq!(r"(?:x(?:ab)+y)", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::concat(alloc::vec![
                    Hir::look(hir::Look::Start),
                    Hir::look(hir::Look::End),
                ])),
            }),
            Hir::look(hir::Look::End),
        ]);
        assert_eq!(r"(?:\A\A\z\z)", expr.to_string());
    }

    // Just like regression_repetition_concat, but with the repetition using
    // an alternation as a child expression instead.
    //
    // See: https://github.com/rust-lang/regex/issues/731
    #[test]
    fn regression_repetition_alternation() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("ab".as_bytes()),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::alternation(alloc::vec![
                    Hir::literal("cd".as_bytes()),
                    Hir::literal("ef".as_bytes()),
                ])),
            }),
            Hir::literal("gh".as_bytes()),
        ]);
        assert_eq!(r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::alternation(alloc::vec![
                    Hir::look(hir::Look::Start),
                    Hir::look(hir::Look::End),
                ])),
            }),
            Hir::look(hir::Look::End),
        ]);
        assert_eq!(r"(?:\A(?:\A|\z)\z)", expr.to_string());
    }

    // This regression test is very similar in flavor to
    // regression_repetition_concat in that the root of the issue lies in a
    // peculiarity of how the HIR is represented and how the printer writes it
    // out. Like the other regression, this one is also rooted in the fact that
    // you can't produce the peculiar HIR from the concrete syntax. Namely, you
    // just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally
    // be in (at least) a non-capturing group. Why? Because the '|' has very
    // low precedence (lower that concatenation), and so something like 'ab|c'
    // is actually 'alt(ab, c)'.
    //
    // See: https://github.com/rust-lang/regex/issues/516
    #[test]
    fn regression_alternation_concat() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("ab".as_bytes()),
            Hir::alternation(alloc::vec![
                Hir::literal("mn".as_bytes()),
                Hir::literal("xy".as_bytes()),
            ]),
        ]);
        assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::alternation(alloc::vec![
                Hir::look(hir::Look::Start),
                Hir::look(hir::Look::End),
            ]),
        ]);
        assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string());
    }
}
diff --git a/vendor/regex-syntax/src/hir/translate.rs b/vendor/regex-syntax/src/hir/translate.rs
new file mode 100644
index 0000000..313a1e9
--- /dev/null
+++ b/vendor/regex-syntax/src/hir/translate.rs
@@ -0,0 +1,3724 @@
+/*!
+Defines a translator that converts an `Ast` to an `Hir`.
+*/
+
+use core::cell::{Cell, RefCell};
+
+use alloc::{boxed::Box, string::ToString, vec, vec::Vec};
+
+use crate::{
+ ast::{self, Ast, Span, Visitor},
+ either::Either,
+ hir::{self, Error, ErrorKind, Hir, HirKind},
+ unicode::{self, ClassQuery},
+};
+
/// A convenience alias: translation errors are always [`Error`].
type Result<T> = core::result::Result<T, Error>;
+
/// A builder for constructing an AST->HIR translator.
#[derive(Clone, Debug)]
pub struct TranslatorBuilder {
    // When true (the default), reject HIR that could match invalid UTF-8.
    utf8: bool,
    // The byte that `.` should not match (defaults to b'\n').
    line_terminator: u8,
    // The default flag settings seeded into each built `Translator`.
    flags: Flags,
}
+
+impl Default for TranslatorBuilder {
+ fn default() -> TranslatorBuilder {
+ TranslatorBuilder::new()
+ }
+}
+
impl TranslatorBuilder {
    /// Create a new translator builder with a default configuration.
    pub fn new() -> TranslatorBuilder {
        TranslatorBuilder {
            utf8: true,
            line_terminator: b'\n',
            flags: Flags::default(),
        }
    }

    /// Build a translator using the current configuration.
    pub fn build(&self) -> Translator {
        Translator {
            stack: RefCell::new(vec![]),
            flags: Cell::new(self.flags),
            utf8: self.utf8,
            line_terminator: self.line_terminator,
        }
    }

    /// When disabled, translation will permit the construction of a regular
    /// expression that may match invalid UTF-8.
    ///
    /// When enabled (the default), the translator is guaranteed to produce an
    /// expression that, for non-empty matches, will only ever produce spans
    /// that are entirely valid UTF-8 (otherwise, the translator will return an
    /// error).
    ///
    /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
    /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
    /// syntax) will be allowed even though they can produce matches that split
    /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
    /// matches, and it is expected that the regex engine itself must handle
    /// these cases if necessary (perhaps by suppressing any zero-width matches
    /// that split a codepoint).
    pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
        self.utf8 = yes;
        self
    }

    /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
    ///
    /// Namely, instead of `.` (by default) matching everything except for `\n`,
    /// this will cause `.` to match everything except for the byte given.
    ///
    /// If `.` is used in a context where Unicode mode is enabled and this byte
    /// isn't ASCII, then an error will be returned. When Unicode mode is
    /// disabled, then any byte is permitted, but will return an error if UTF-8
    /// mode is enabled and it is a non-ASCII byte.
    ///
    /// In short, any ASCII value for a line terminator is always okay. But a
    /// non-ASCII byte might result in an error depending on whether Unicode
    /// mode or UTF-8 mode are enabled.
    ///
    /// Note that if `R` mode is enabled then it always takes precedence and
    /// the line terminator will be treated as `\r` and `\n` simultaneously.
    ///
    /// Note also that this *doesn't* impact the look-around assertions
    /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
    /// configuration in the regex engine itself.
    pub fn line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder {
        self.line_terminator = byte;
        self
    }

    /// Enable or disable the case insensitive flag (`i`) by default.
    ///
    /// Note that `false` stores `None` (no default override) rather than an
    /// explicit `Some(false)`; the same convention is used by the other flag
    /// setters below.
    pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
        self.flags.case_insensitive = if yes { Some(true) } else { None };
        self
    }

    /// Enable or disable the multi-line matching flag (`m`) by default.
    pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder {
        self.flags.multi_line = if yes { Some(true) } else { None };
        self
    }

    /// Enable or disable the "dot matches any character" flag (`s`) by
    /// default.
    pub fn dot_matches_new_line(
        &mut self,
        yes: bool,
    ) -> &mut TranslatorBuilder {
        self.flags.dot_matches_new_line = if yes { Some(true) } else { None };
        self
    }

    /// Enable or disable the CRLF mode flag (`R`) by default.
    pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder {
        self.flags.crlf = if yes { Some(true) } else { None };
        self
    }

    /// Enable or disable the "swap greed" flag (`U`) by default.
    pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder {
        self.flags.swap_greed = if yes { Some(true) } else { None };
        self
    }

    /// Enable or disable the Unicode flag (`u`) by default.
    ///
    /// NOTE(review): the polarity here is inverted relative to the other
    /// setters — enabling stores `None` (no override) and disabling stores
    /// `Some(false)`, presumably because Unicode mode is on by default;
    /// confirm against how `Flags` resolves `None`.
    pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder {
        self.flags.unicode = if yes { None } else { Some(false) };
        self
    }
}
+
/// A translator maps abstract syntax to a high level intermediate
/// representation.
///
/// A translator may benefit from reuse. That is, a translator can translate
/// many abstract syntax trees.
///
/// A `Translator` can be configured in more detail via a
/// [`TranslatorBuilder`].
#[derive(Clone, Debug)]
pub struct Translator {
    /// Our call stack, but on the heap.
    stack: RefCell<Vec<HirFrame>>,
    /// The current flag settings.
    flags: Cell<Flags>,
    /// Whether we're allowed to produce HIR that can match arbitrary bytes.
    utf8: bool,
    /// The line terminator to use for `.`.
    line_terminator: u8,
}
+
impl Translator {
    /// Create a new translator using the default configuration.
    pub fn new() -> Translator {
        TranslatorBuilder::new().build()
    }

    /// Translate the given abstract syntax tree (AST) into a high level
    /// intermediate representation (HIR).
    ///
    /// If there was a problem doing the translation, then an HIR-specific
    /// error is returned.
    ///
    /// The original pattern string used to produce the `Ast` *must* also be
    /// provided. The translator does not use the pattern string during any
    /// correct translation, but it is used for error reporting.
    pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> {
        ast::visit(ast, TranslatorI::new(self, pattern))
    }
}
+
/// An HirFrame is a single stack frame, represented explicitly, which is
/// created for each item in the Ast that we traverse.
///
/// Note that technically, this type doesn't represent our entire stack
/// frame. In particular, the Ast visitor represents any state associated with
/// traversing the Ast itself.
///
/// Frames are pushed in the visitor's `visit_pre` (and
/// `visit_alternation_in`) and popped in `visit_post`.
#[derive(Clone, Debug)]
enum HirFrame {
    /// An arbitrary HIR expression. These get pushed whenever we hit a base
    /// case in the Ast. They get popped after an inductive (i.e., recursive)
    /// step is complete.
    Expr(Hir),
    /// A literal that is being constructed, character by character, from the
    /// AST. We need this because the AST gives each individual character its
    /// own node. So as we see characters, we peek at the top-most HirFrame.
    /// If it's a literal, then we add to it. Otherwise, we push a new literal.
    /// When it comes time to pop it, we convert it to an Hir via Hir::literal.
    Literal(Vec<u8>),
    /// A Unicode character class. This frame is mutated as we descend into
    /// the Ast of a character class (which is itself its own mini recursive
    /// structure).
    ClassUnicode(hir::ClassUnicode),
    /// A byte-oriented character class. This frame is mutated as we descend
    /// into the Ast of a character class (which is itself its own mini
    /// recursive structure).
    ///
    /// Byte character classes are created when Unicode mode (`u`) is disabled.
    /// If `utf8` is enabled (the default), then a byte character is only
    /// permitted to match ASCII text.
    ClassBytes(hir::ClassBytes),
    /// This is pushed whenever a repetition is observed. After visiting every
    /// sub-expression in the repetition, the translator's stack is expected to
    /// have this sentinel at the top.
    ///
    /// This sentinel only exists to stop other things (like flattening
    /// literals) from reaching across repetition operators.
    Repetition,
    /// This is pushed on to the stack upon first seeing any kind of capture,
    /// indicated by parentheses (including non-capturing groups). It is popped
    /// upon leaving a group.
    Group {
        /// The old active flags when this group was opened.
        ///
        /// If this group sets flags, then the new active flags are set to the
        /// result of merging the old flags with the flags introduced by this
        /// group. If the group doesn't set any flags, then this is simply
        /// equivalent to whatever flags were set when the group was opened.
        ///
        /// When this group is popped, the active flags should be restored to
        /// the flags set here.
        ///
        /// The "active" flags correspond to whatever flags are set in the
        /// Translator.
        old_flags: Flags,
    },
    /// This is pushed whenever a concatenation is observed. After visiting
    /// every sub-expression in the concatenation, the translator's stack is
    /// popped until it sees a Concat frame.
    Concat,
    /// This is pushed whenever an alternation is observed. After visiting
    /// every sub-expression in the alternation, the translator's stack is
    /// popped until it sees an Alternation frame.
    Alternation,
    /// This is pushed immediately before each sub-expression in an
    /// alternation. This separates the branches of an alternation on the
    /// stack and prevents literal flattening from reaching across alternation
    /// branches.
    ///
    /// It is popped after each expression in a branch until an 'Alternation'
    /// frame is observed when doing a post visit on an alternation.
    AlternationBranch,
}
+
+impl HirFrame {
+ /// Assert that the current stack frame is an Hir expression and return it.
+ fn unwrap_expr(self) -> Hir {
+ match self {
+ HirFrame::Expr(expr) => expr,
+ HirFrame::Literal(lit) => Hir::literal(lit),
+ _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self),
+ }
+ }
+
+ /// Assert that the current stack frame is a Unicode class expression and
+ /// return it.
+ fn unwrap_class_unicode(self) -> hir::ClassUnicode {
+ match self {
+ HirFrame::ClassUnicode(cls) => cls,
+ _ => panic!(
+ "tried to unwrap Unicode class \
+ from HirFrame, got: {:?}",
+ self
+ ),
+ }
+ }
+
+ /// Assert that the current stack frame is a byte class expression and
+ /// return it.
+ fn unwrap_class_bytes(self) -> hir::ClassBytes {
+ match self {
+ HirFrame::ClassBytes(cls) => cls,
+ _ => panic!(
+ "tried to unwrap byte class \
+ from HirFrame, got: {:?}",
+ self
+ ),
+ }
+ }
+
+ /// Assert that the current stack frame is a repetition sentinel. If it
+ /// isn't, then panic.
+ fn unwrap_repetition(self) {
+ match self {
+ HirFrame::Repetition => {}
+ _ => {
+ panic!(
+ "tried to unwrap repetition from HirFrame, got: {:?}",
+ self
+ )
+ }
+ }
+ }
+
+ /// Assert that the current stack frame is a group indicator and return
+ /// its corresponding flags (the flags that were active at the time the
+ /// group was entered).
+ fn unwrap_group(self) -> Flags {
+ match self {
+ HirFrame::Group { old_flags } => old_flags,
+ _ => {
+ panic!("tried to unwrap group from HirFrame, got: {:?}", self)
+ }
+ }
+ }
+
+ /// Assert that the current stack frame is an alternation pipe sentinel. If
+ /// it isn't, then panic.
+ fn unwrap_alternation_pipe(self) {
+ match self {
+ HirFrame::AlternationBranch => {}
+ _ => {
+ panic!(
+ "tried to unwrap alt pipe from HirFrame, got: {:?}",
+ self
+ )
+ }
+ }
+ }
+}
+
+impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
+ type Output = Hir;
+ type Err = Error;
+
+ fn finish(self) -> Result<Hir> {
+ // ... otherwise, we should have exactly one HIR on the stack.
+ assert_eq!(self.trans().stack.borrow().len(), 1);
+ Ok(self.pop().unwrap().unwrap_expr())
+ }
+
+ fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
+ match *ast {
+ Ast::ClassBracketed(_) => {
+ if self.flags().unicode() {
+ let cls = hir::ClassUnicode::empty();
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let cls = hir::ClassBytes::empty();
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ Ast::Repetition(_) => self.push(HirFrame::Repetition),
+ Ast::Group(ref x) => {
+ let old_flags = x
+ .flags()
+ .map(|ast| self.set_flags(ast))
+ .unwrap_or_else(|| self.flags());
+ self.push(HirFrame::Group { old_flags });
+ }
+ Ast::Concat(_) => {
+ self.push(HirFrame::Concat);
+ }
+ Ast::Alternation(ref x) => {
+ self.push(HirFrame::Alternation);
+ if !x.asts.is_empty() {
+ self.push(HirFrame::AlternationBranch);
+ }
+ }
+ _ => {}
+ }
+ Ok(())
+ }
+
+ fn visit_post(&mut self, ast: &Ast) -> Result<()> {
+ match *ast {
+ Ast::Empty(_) => {
+ self.push(HirFrame::Expr(Hir::empty()));
+ }
+ Ast::Flags(ref x) => {
+ self.set_flags(&x.flags);
+ // Flags in the AST are generally considered directives and
+ // not actual sub-expressions. However, they can be used in
+ // the concrete syntax like `((?i))`, and we need some kind of
+ // indication of an expression there, and Empty is the correct
+ // choice.
+ //
+ // There can also be things like `(?i)+`, but we rule those out
+ // in the parser. In the future, we might allow them for
+ // consistency sake.
+ self.push(HirFrame::Expr(Hir::empty()));
+ }
+ Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? {
+ Either::Right(byte) => self.push_byte(byte),
+ Either::Left(ch) => match self.case_fold_char(x.span, ch)? {
+ None => self.push_char(ch),
+ Some(expr) => self.push(HirFrame::Expr(expr)),
+ },
+ },
+ Ast::Dot(ref span) => {
+ self.push(HirFrame::Expr(self.hir_dot(**span)?));
+ }
+ Ast::Assertion(ref x) => {
+ self.push(HirFrame::Expr(self.hir_assertion(x)?));
+ }
+ Ast::ClassPerl(ref x) => {
+ if self.flags().unicode() {
+ let cls = self.hir_perl_unicode_class(x)?;
+ let hcls = hir::Class::Unicode(cls);
+ self.push(HirFrame::Expr(Hir::class(hcls)));
+ } else {
+ let cls = self.hir_perl_byte_class(x)?;
+ let hcls = hir::Class::Bytes(cls);
+ self.push(HirFrame::Expr(Hir::class(hcls)));
+ }
+ }
+ Ast::ClassUnicode(ref x) => {
+ let cls = hir::Class::Unicode(self.hir_unicode_class(x)?);
+ self.push(HirFrame::Expr(Hir::class(cls)));
+ }
+ Ast::ClassBracketed(ref ast) => {
+ if self.flags().unicode() {
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ self.unicode_fold_and_negate(
+ &ast.span,
+ ast.negated,
+ &mut cls,
+ )?;
+ let expr = Hir::class(hir::Class::Unicode(cls));
+ self.push(HirFrame::Expr(expr));
+ } else {
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ self.bytes_fold_and_negate(
+ &ast.span,
+ ast.negated,
+ &mut cls,
+ )?;
+ let expr = Hir::class(hir::Class::Bytes(cls));
+ self.push(HirFrame::Expr(expr));
+ }
+ }
+ Ast::Repetition(ref x) => {
+ let expr = self.pop().unwrap().unwrap_expr();
+ self.pop().unwrap().unwrap_repetition();
+ self.push(HirFrame::Expr(self.hir_repetition(x, expr)));
+ }
+ Ast::Group(ref x) => {
+ let expr = self.pop().unwrap().unwrap_expr();
+ let old_flags = self.pop().unwrap().unwrap_group();
+ self.trans().flags.set(old_flags);
+ self.push(HirFrame::Expr(self.hir_capture(x, expr)));
+ }
+ Ast::Concat(_) => {
+ let mut exprs = vec![];
+ while let Some(expr) = self.pop_concat_expr() {
+ if !matches!(*expr.kind(), HirKind::Empty) {
+ exprs.push(expr);
+ }
+ }
+ exprs.reverse();
+ self.push(HirFrame::Expr(Hir::concat(exprs)));
+ }
+ Ast::Alternation(_) => {
+ let mut exprs = vec![];
+ while let Some(expr) = self.pop_alt_expr() {
+ self.pop().unwrap().unwrap_alternation_pipe();
+ exprs.push(expr);
+ }
+ exprs.reverse();
+ self.push(HirFrame::Expr(Hir::alternation(exprs)));
+ }
+ }
+ Ok(())
+ }
+
+ fn visit_alternation_in(&mut self) -> Result<()> {
+ // Mark the start of a new alternation branch so that the
+ // alternation's post-visit can pop expressions branch by branch.
+ self.push(HirFrame::AlternationBranch);
+ Ok(())
+ }
+
+ fn visit_class_set_item_pre(
+ &mut self,
+ ast: &ast::ClassSetItem,
+ ) -> Result<()> {
+ // A nested bracketed class gets its own fresh (empty) class frame
+ // to accumulate into while its children are visited; every other
+ // item kind needs no setup here.
+ match *ast {
+ ast::ClassSetItem::Bracketed(_) => {
+ if self.flags().unicode() {
+ let cls = hir::ClassUnicode::empty();
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let cls = hir::ClassBytes::empty();
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ // We needn't handle the Union case here since the visitor will
+ // do it for us.
+ _ => {}
+ }
+ Ok(())
+ }
+
+ fn visit_class_set_item_post(
+ &mut self,
+ ast: &ast::ClassSetItem,
+ ) -> Result<()> {
+ // Fold the item that was just visited into the class currently
+ // under construction on top of the stack, then push that class
+ // back for the next item.
+ match *ast {
+ ast::ClassSetItem::Empty(_) => {}
+ ast::ClassSetItem::Literal(ref x) => {
+ if self.flags().unicode() {
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.push(hir::ClassUnicodeRange::new(x.c, x.c));
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ let byte = self.class_literal_byte(x)?;
+ cls.push(hir::ClassBytesRange::new(byte, byte));
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ ast::ClassSetItem::Range(ref x) => {
+ if self.flags().unicode() {
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c));
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ let start = self.class_literal_byte(&x.start)?;
+ let end = self.class_literal_byte(&x.end)?;
+ cls.push(hir::ClassBytesRange::new(start, end));
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ ast::ClassSetItem::Ascii(ref x) => {
+ if self.flags().unicode() {
+ let xcls = self.hir_ascii_unicode_class(x)?;
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let xcls = self.hir_ascii_byte_class(x)?;
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ ast::ClassSetItem::Unicode(ref x) => {
+ let xcls = self.hir_unicode_class(x)?;
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassUnicode(cls));
+ }
+ ast::ClassSetItem::Perl(ref x) => {
+ if self.flags().unicode() {
+ let xcls = self.hir_perl_unicode_class(x)?;
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let xcls = self.hir_perl_byte_class(x)?;
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ ast::ClassSetItem::Bracketed(ref ast) => {
+ // `cls1` is the nested class pushed by the pre-visit; fold
+ // and negate it first, then union it into the enclosing
+ // class `cls2` beneath it on the stack.
+ if self.flags().unicode() {
+ let mut cls1 = self.pop().unwrap().unwrap_class_unicode();
+ self.unicode_fold_and_negate(
+ &ast.span,
+ ast.negated,
+ &mut cls1,
+ )?;
+
+ let mut cls2 = self.pop().unwrap().unwrap_class_unicode();
+ cls2.union(&cls1);
+ self.push(HirFrame::ClassUnicode(cls2));
+ } else {
+ let mut cls1 = self.pop().unwrap().unwrap_class_bytes();
+ self.bytes_fold_and_negate(
+ &ast.span,
+ ast.negated,
+ &mut cls1,
+ )?;
+
+ let mut cls2 = self.pop().unwrap().unwrap_class_bytes();
+ cls2.union(&cls1);
+ self.push(HirFrame::ClassBytes(cls2));
+ }
+ }
+ // This is handled automatically by the visitor.
+ ast::ClassSetItem::Union(_) => {}
+ }
+ Ok(())
+ }
+
+ fn visit_class_set_binary_op_pre(
+ &mut self,
+ _op: &ast::ClassSetBinaryOp,
+ ) -> Result<()> {
+ // Push a fresh class frame to accumulate the left-hand side of the
+ // binary set operation (intersection/difference/symmetric diff).
+ if self.flags().unicode() {
+ let cls = hir::ClassUnicode::empty();
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let cls = hir::ClassBytes::empty();
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ Ok(())
+ }
+
+ fn visit_class_set_binary_op_in(
+ &mut self,
+ _op: &ast::ClassSetBinaryOp,
+ ) -> Result<()> {
+ // Between the two operands: push a fresh class frame to accumulate
+ // the right-hand side, leaving the finished left-hand side beneath
+ // it on the stack.
+ if self.flags().unicode() {
+ let cls = hir::ClassUnicode::empty();
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let cls = hir::ClassBytes::empty();
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ Ok(())
+ }
+
+ fn visit_class_set_binary_op_post(
+ &mut self,
+ op: &ast::ClassSetBinaryOp,
+ ) -> Result<()> {
+ use crate::ast::ClassSetBinaryOpKind::*;
+
+ // Stack order matters: the right operand is on top, then the left
+ // operand, then the class the whole operation is being built into.
+ // Under `(?i)`, both operands are case folded *before* the set
+ // operation is applied; the result is unioned into `cls`.
+ if self.flags().unicode() {
+ let mut rhs = self.pop().unwrap().unwrap_class_unicode();
+ let mut lhs = self.pop().unwrap().unwrap_class_unicode();
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ if self.flags().case_insensitive() {
+ rhs.try_case_fold_simple().map_err(|_| {
+ self.error(
+ op.rhs.span().clone(),
+ ErrorKind::UnicodeCaseUnavailable,
+ )
+ })?;
+ lhs.try_case_fold_simple().map_err(|_| {
+ self.error(
+ op.lhs.span().clone(),
+ ErrorKind::UnicodeCaseUnavailable,
+ )
+ })?;
+ }
+ match op.kind {
+ Intersection => lhs.intersect(&rhs),
+ Difference => lhs.difference(&rhs),
+ SymmetricDifference => lhs.symmetric_difference(&rhs),
+ }
+ cls.union(&lhs);
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let mut rhs = self.pop().unwrap().unwrap_class_bytes();
+ let mut lhs = self.pop().unwrap().unwrap_class_bytes();
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ if self.flags().case_insensitive() {
+ // Byte-class folding is ASCII-only and infallible.
+ rhs.case_fold_simple();
+ lhs.case_fold_simple();
+ }
+ match op.kind {
+ Intersection => lhs.intersect(&rhs),
+ Difference => lhs.difference(&rhs),
+ SymmetricDifference => lhs.symmetric_difference(&rhs),
+ }
+ cls.union(&lhs);
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ Ok(())
+ }
+}
+
+/// The internal implementation of a translator.
+///
+/// This type is responsible for carrying around the original pattern string,
+/// which is not tied to the internal state of a translator.
+///
+/// A TranslatorI exists for the time it takes to translate a single Ast.
+#[derive(Clone, Debug)]
+struct TranslatorI<'t, 'p> {
+ // The shared translator state (flags, frame stack, configuration).
+ trans: &'t Translator,
+ // The original pattern text; used here when constructing error values.
+ pattern: &'p str,
+}
+
+impl<'t, 'p> TranslatorI<'t, 'p> {
+ /// Build a new internal translator.
+ fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> {
+ TranslatorI { trans, pattern }
+ }
+
+ /// Return a reference to the underlying translator.
+ fn trans(&self) -> &Translator {
+ &self.trans
+ }
+
+ /// Push the given frame on to the call stack.
+ fn push(&self, frame: HirFrame) {
+ self.trans().stack.borrow_mut().push(frame);
+ }
+
+ /// Push the given literal char on to the call stack.
+ ///
+ /// If the top-most element of the stack is a literal, then the char
+ /// is appended to the end of that literal. Otherwise, a new literal
+ /// containing just the given char is pushed to the top of the stack.
+ fn push_char(&self, ch: char) {
+ let mut buf = [0; 4];
+ let bytes = ch.encode_utf8(&mut buf).as_bytes();
+ let mut stack = self.trans().stack.borrow_mut();
+ if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() {
+ literal.extend_from_slice(bytes);
+ } else {
+ stack.push(HirFrame::Literal(bytes.to_vec()));
+ }
+ }
+
+ /// Push the given literal byte on to the call stack.
+ ///
+ /// If the top-most element of the stack is a literal, then the byte
+ /// is appended to the end of that literal. Otherwise, a new literal
+ /// containing just the given byte is pushed to the top of the stack.
+ fn push_byte(&self, byte: u8) {
+ let mut stack = self.trans().stack.borrow_mut();
+ if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() {
+ literal.push(byte);
+ } else {
+ stack.push(HirFrame::Literal(vec![byte]));
+ }
+ }
+
+ /// Pop the top of the call stack. If the call stack is empty, return None.
+ fn pop(&self) -> Option<HirFrame> {
+ self.trans().stack.borrow_mut().pop()
+ }
+
+ /// Pop an HIR expression from the top of the stack for a concatenation.
+ ///
+ /// This returns None if the stack is empty or when a concat frame is seen.
+ /// Otherwise, it panics if it could not find an HIR expression.
+ fn pop_concat_expr(&self) -> Option<Hir> {
+ let frame = self.pop()?;
+ match frame {
+ HirFrame::Concat => None,
+ HirFrame::Expr(expr) => Some(expr),
+ HirFrame::Literal(lit) => Some(Hir::literal(lit)),
+ HirFrame::ClassUnicode(_) => {
+ unreachable!("expected expr or concat, got Unicode class")
+ }
+ HirFrame::ClassBytes(_) => {
+ unreachable!("expected expr or concat, got byte class")
+ }
+ HirFrame::Repetition => {
+ unreachable!("expected expr or concat, got repetition")
+ }
+ HirFrame::Group { .. } => {
+ unreachable!("expected expr or concat, got group")
+ }
+ HirFrame::Alternation => {
+ unreachable!("expected expr or concat, got alt marker")
+ }
+ HirFrame::AlternationBranch => {
+ unreachable!("expected expr or concat, got alt branch marker")
+ }
+ }
+ }
+
+ /// Pop an HIR expression from the top of the stack for an alternation.
+ ///
+ /// This returns None if the stack is empty or when an alternation frame is
+ /// seen. Otherwise, it panics if it could not find an HIR expression.
+ fn pop_alt_expr(&self) -> Option<Hir> {
+ let frame = self.pop()?;
+ match frame {
+ HirFrame::Alternation => None,
+ HirFrame::Expr(expr) => Some(expr),
+ HirFrame::Literal(lit) => Some(Hir::literal(lit)),
+ HirFrame::ClassUnicode(_) => {
+ unreachable!("expected expr or alt, got Unicode class")
+ }
+ HirFrame::ClassBytes(_) => {
+ unreachable!("expected expr or alt, got byte class")
+ }
+ HirFrame::Repetition => {
+ unreachable!("expected expr or alt, got repetition")
+ }
+ HirFrame::Group { .. } => {
+ unreachable!("expected expr or alt, got group")
+ }
+ HirFrame::Concat => {
+ unreachable!("expected expr or alt, got concat marker")
+ }
+ HirFrame::AlternationBranch => {
+ unreachable!("expected expr or alt, got alt branch marker")
+ }
+ }
+ }
+
+ /// Create a new error with the given span and error type.
+ fn error(&self, span: Span, kind: ErrorKind) -> Error {
+ Error { kind, pattern: self.pattern.to_string(), span }
+ }
+
+ /// Return a copy of the active flags.
+ fn flags(&self) -> Flags {
+ self.trans().flags.get()
+ }
+
+ /// Set the flags of this translator from the flags set in the given AST.
+ /// Then, return the old flags.
+ fn set_flags(&self, ast_flags: &ast::Flags) -> Flags {
+ let old_flags = self.flags();
+ let mut new_flags = Flags::from_ast(ast_flags);
+ new_flags.merge(&old_flags);
+ self.trans().flags.set(new_flags);
+ old_flags
+ }
+
+ /// Convert an Ast literal to its scalar representation.
+ ///
+ /// When Unicode mode is enabled, then this always succeeds and returns a
+ /// `char` (Unicode scalar value).
+ ///
+ /// When Unicode mode is disabled, then a `char` will still be returned
+ /// whenever possible. A byte is returned only when invalid UTF-8 is
+ /// allowed and when the byte is not ASCII. Otherwise, a non-ASCII byte
+ /// will result in an error when invalid UTF-8 is not allowed.
+ fn ast_literal_to_scalar(
+ &self,
+ lit: &ast::Literal,
+ ) -> Result<Either<char, u8>> {
+ if self.flags().unicode() {
+ return Ok(Either::Left(lit.c));
+ }
+ let byte = match lit.byte() {
+ None => return Ok(Either::Left(lit.c)),
+ Some(byte) => byte,
+ };
+ if byte <= 0x7F {
+ return Ok(Either::Left(char::try_from(byte).unwrap()));
+ }
+ if self.trans().utf8 {
+ return Err(self.error(lit.span, ErrorKind::InvalidUtf8));
+ }
+ Ok(Either::Right(byte))
+ }
+
+ /// If case-insensitive matching is enabled, return the class of all
+ /// simple case-folded variants of `c`. Returns `Ok(None)` when no
+ /// folding applies (the flag is off, or `c` has no case variants), in
+ /// which case the caller emits the literal unchanged.
+ fn case_fold_char(&self, span: Span, c: char) -> Result<Option<Hir>> {
+ if !self.flags().case_insensitive() {
+ return Ok(None);
+ }
+ if self.flags().unicode() {
+ // If case folding won't do anything, then don't bother trying.
+ let map = unicode::SimpleCaseFolder::new()
+ .map(|f| f.overlaps(c, c))
+ .map_err(|_| {
+ self.error(span, ErrorKind::UnicodeCaseUnavailable)
+ })?;
+ if !map {
+ return Ok(None);
+ }
+ let mut cls =
+ hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new(
+ c, c,
+ )]);
+ cls.try_case_fold_simple().map_err(|_| {
+ self.error(span, ErrorKind::UnicodeCaseUnavailable)
+ })?;
+ Ok(Some(Hir::class(hir::Class::Unicode(cls))))
+ } else {
+ if !c.is_ascii() {
+ return Ok(None);
+ }
+ // If case folding won't do anything, then don't bother trying.
+ match c {
+ 'A'..='Z' | 'a'..='z' => {}
+ _ => return Ok(None),
+ }
+ let mut cls =
+ hir::ClassBytes::new(vec![hir::ClassBytesRange::new(
+ // OK because 'c.len_utf8() == 1' which in turn implies
+ // that 'c' is ASCII.
+ u8::try_from(c).unwrap(),
+ u8::try_from(c).unwrap(),
+ )]);
+ cls.case_fold_simple();
+ Ok(Some(Hir::class(hir::Class::Bytes(cls))))
+ }
+ }
+
+ /// Translate `.` into the appropriate `Dot` HIR, honoring the
+ /// `unicode`, `dot_matches_new_line` and `crlf` flags and the
+ /// configured line terminator. Errors when the result could match
+ /// invalid UTF-8 while UTF-8 mode is enabled, or when a non-ASCII
+ /// line terminator is configured in Unicode mode.
+ fn hir_dot(&self, span: Span) -> Result<Hir> {
+ let (utf8, lineterm, flags) =
+ (self.trans().utf8, self.trans().line_terminator, self.flags());
+ if utf8 && (!flags.unicode() || !lineterm.is_ascii()) {
+ return Err(self.error(span, ErrorKind::InvalidUtf8));
+ }
+ let dot = if flags.dot_matches_new_line() {
+ if flags.unicode() {
+ hir::Dot::AnyChar
+ } else {
+ hir::Dot::AnyByte
+ }
+ } else {
+ if flags.unicode() {
+ if flags.crlf() {
+ hir::Dot::AnyCharExceptCRLF
+ } else {
+ if !lineterm.is_ascii() {
+ return Err(
+ self.error(span, ErrorKind::InvalidLineTerminator)
+ );
+ }
+ hir::Dot::AnyCharExcept(char::from(lineterm))
+ }
+ } else {
+ if flags.crlf() {
+ hir::Dot::AnyByteExceptCRLF
+ } else {
+ hir::Dot::AnyByteExcept(lineterm)
+ }
+ }
+ };
+ Ok(Hir::dot(dot))
+ }
+
+ /// Translate an AST assertion (anchor or word boundary) into the
+ /// corresponding look-around HIR, honoring the `multi_line`, `crlf`
+ /// and `unicode` flags.
+ fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
+ let unicode = self.flags().unicode();
+ let multi_line = self.flags().multi_line();
+ let crlf = self.flags().crlf();
+ Ok(match asst.kind {
+ ast::AssertionKind::StartLine => Hir::look(if multi_line {
+ if crlf {
+ hir::Look::StartCRLF
+ } else {
+ hir::Look::StartLF
+ }
+ } else {
+ hir::Look::Start
+ }),
+ ast::AssertionKind::EndLine => Hir::look(if multi_line {
+ if crlf {
+ hir::Look::EndCRLF
+ } else {
+ hir::Look::EndLF
+ }
+ } else {
+ hir::Look::End
+ }),
+ ast::AssertionKind::StartText => Hir::look(hir::Look::Start),
+ ast::AssertionKind::EndText => Hir::look(hir::Look::End),
+ ast::AssertionKind::WordBoundary => Hir::look(if unicode {
+ hir::Look::WordUnicode
+ } else {
+ hir::Look::WordAscii
+ }),
+ ast::AssertionKind::NotWordBoundary => Hir::look(if unicode {
+ hir::Look::WordUnicodeNegate
+ } else {
+ hir::Look::WordAsciiNegate
+ }),
+ ast::AssertionKind::WordBoundaryStart
+ | ast::AssertionKind::WordBoundaryStartAngle => {
+ Hir::look(if unicode {
+ hir::Look::WordStartUnicode
+ } else {
+ hir::Look::WordStartAscii
+ })
+ }
+ ast::AssertionKind::WordBoundaryEnd
+ | ast::AssertionKind::WordBoundaryEndAngle => {
+ Hir::look(if unicode {
+ hir::Look::WordEndUnicode
+ } else {
+ hir::Look::WordEndAscii
+ })
+ }
+ ast::AssertionKind::WordBoundaryStartHalf => {
+ Hir::look(if unicode {
+ hir::Look::WordStartHalfUnicode
+ } else {
+ hir::Look::WordStartHalfAscii
+ })
+ }
+ ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode {
+ hir::Look::WordEndHalfUnicode
+ } else {
+ hir::Look::WordEndHalfAscii
+ }),
+ })
+ }
+
+ /// Wrap `expr` in a capture HIR for the given group. Non-capturing
+ /// groups return `expr` unchanged.
+ fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir {
+ let (index, name) = match group.kind {
+ ast::GroupKind::CaptureIndex(index) => (index, None),
+ ast::GroupKind::CaptureName { ref name, .. } => {
+ (name.index, Some(name.name.clone().into_boxed_str()))
+ }
+ // The HIR doesn't need to use non-capturing groups, since the way
+ // in which the data type is defined handles this automatically.
+ ast::GroupKind::NonCapturing(_) => return expr,
+ };
+ Hir::capture(hir::Capture { index, name, sub: Box::new(expr) })
+ }
+
+ /// Translate a repetition operator into an HIR repetition around
+ /// `expr`, mapping `?`/`*`/`+`/`{m,n}` to min/max bounds and
+ /// honoring the `swap_greed` flag.
+ fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
+ let (min, max) = match rep.op.kind {
+ ast::RepetitionKind::ZeroOrOne => (0, Some(1)),
+ ast::RepetitionKind::ZeroOrMore => (0, None),
+ ast::RepetitionKind::OneOrMore => (1, None),
+ ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => {
+ (m, Some(m))
+ }
+ ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => {
+ (m, None)
+ }
+ ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(
+ m,
+ n,
+ )) => (m, Some(n)),
+ };
+ let greedy =
+ if self.flags().swap_greed() { !rep.greedy } else { rep.greedy };
+ Hir::repetition(hir::Repetition {
+ min,
+ max,
+ greedy,
+ sub: Box::new(expr),
+ })
+ }
+
+ /// Look up a Unicode class (e.g. `\p{Greek}`), then apply case
+ /// folding and negation as dictated by the flags and the AST.
+ /// Errors when the `unicode` flag is disabled.
+ fn hir_unicode_class(
+ &self,
+ ast_class: &ast::ClassUnicode,
+ ) -> Result<hir::ClassUnicode> {
+ use crate::ast::ClassUnicodeKind::*;
+
+ if !self.flags().unicode() {
+ return Err(
+ self.error(ast_class.span, ErrorKind::UnicodeNotAllowed)
+ );
+ }
+ let query = match ast_class.kind {
+ OneLetter(name) => ClassQuery::OneLetter(name),
+ Named(ref name) => ClassQuery::Binary(name),
+ NamedValue { ref name, ref value, .. } => ClassQuery::ByValue {
+ property_name: name,
+ property_value: value,
+ },
+ };
+ let mut result = self.convert_unicode_class_error(
+ &ast_class.span,
+ unicode::class(query),
+ );
+ if let Ok(ref mut class) = result {
+ self.unicode_fold_and_negate(
+ &ast_class.span,
+ ast_class.negated,
+ class,
+ )?;
+ }
+ result
+ }
+
+ /// Build a Unicode class for a POSIX-style ASCII class name (e.g.
+ /// `[:alpha:]`), folding/negating per the flags and the AST.
+ fn hir_ascii_unicode_class(
+ &self,
+ ast: &ast::ClassAscii,
+ ) -> Result<hir::ClassUnicode> {
+ let mut cls = hir::ClassUnicode::new(
+ ascii_class_as_chars(&ast.kind)
+ .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)),
+ );
+ self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
+ Ok(cls)
+ }
+
+ /// Build a byte class for a POSIX-style ASCII class name, folding
+ /// and negating per the flags and the AST.
+ fn hir_ascii_byte_class(
+ &self,
+ ast: &ast::ClassAscii,
+ ) -> Result<hir::ClassBytes> {
+ let mut cls = hir::ClassBytes::new(
+ ascii_class(&ast.kind)
+ .map(|(s, e)| hir::ClassBytesRange::new(s, e)),
+ );
+ self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
+ Ok(cls)
+ }
+
+ /// Translate `\d`, `\s` or `\w` (and negations) to a Unicode class.
+ /// Requires the `unicode` flag to be enabled.
+ fn hir_perl_unicode_class(
+ &self,
+ ast_class: &ast::ClassPerl,
+ ) -> Result<hir::ClassUnicode> {
+ use crate::ast::ClassPerlKind::*;
+
+ assert!(self.flags().unicode());
+ let result = match ast_class.kind {
+ Digit => unicode::perl_digit(),
+ Space => unicode::perl_space(),
+ Word => unicode::perl_word(),
+ };
+ let mut class =
+ self.convert_unicode_class_error(&ast_class.span, result)?;
+ // We needn't apply case folding here because the Perl Unicode classes
+ // are already closed under Unicode simple case folding.
+ if ast_class.negated {
+ class.negate();
+ }
+ Ok(class)
+ }
+
+ /// Translate `\d`, `\s` or `\w` (and negations) to an ASCII byte
+ /// class. Requires the `unicode` flag to be disabled.
+ fn hir_perl_byte_class(
+ &self,
+ ast_class: &ast::ClassPerl,
+ ) -> Result<hir::ClassBytes> {
+ use crate::ast::ClassPerlKind::*;
+
+ assert!(!self.flags().unicode());
+ let mut class = match ast_class.kind {
+ Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit),
+ Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space),
+ Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word),
+ };
+ // We needn't apply case folding here because the Perl ASCII classes
+ // are already closed (under ASCII case folding).
+ if ast_class.negated {
+ class.negate();
+ }
+ // Negating a Perl byte class is likely to cause it to match invalid
+ // UTF-8. That's only OK if the translator is configured to allow such
+ // things.
+ if self.trans().utf8 && !class.is_ascii() {
+ return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8));
+ }
+ Ok(class)
+ }
+
+ /// Converts the given Unicode specific error to an HIR translation error.
+ ///
+ /// The span given should approximate the position at which an error would
+ /// occur.
+ fn convert_unicode_class_error(
+ &self,
+ span: &Span,
+ result: core::result::Result<hir::ClassUnicode, unicode::Error>,
+ ) -> Result<hir::ClassUnicode> {
+ result.map_err(|err| {
+ let sp = span.clone();
+ match err {
+ unicode::Error::PropertyNotFound => {
+ self.error(sp, ErrorKind::UnicodePropertyNotFound)
+ }
+ unicode::Error::PropertyValueNotFound => {
+ self.error(sp, ErrorKind::UnicodePropertyValueNotFound)
+ }
+ unicode::Error::PerlClassNotFound => {
+ self.error(sp, ErrorKind::UnicodePerlClassNotFound)
+ }
+ }
+ })
+ }
+
+ /// Apply case folding (when `(?i)` is set) and then negation (when
+ /// requested) to a Unicode class, in that order.
+ fn unicode_fold_and_negate(
+ &self,
+ span: &Span,
+ negated: bool,
+ class: &mut hir::ClassUnicode,
+ ) -> Result<()> {
+ // Note that we must apply case folding before negation!
+ // Consider `(?i)[^x]`. If we applied negation first, then
+ // the result would be the character class that matched any
+ // Unicode scalar value.
+ if self.flags().case_insensitive() {
+ class.try_case_fold_simple().map_err(|_| {
+ self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable)
+ })?;
+ }
+ if negated {
+ class.negate();
+ }
+ Ok(())
+ }
+
+ /// Apply case folding (when `(?i)` is set) and then negation (when
+ /// requested) to a byte class, in that order, rejecting classes that
+ /// could match invalid UTF-8 while UTF-8 mode is enabled.
+ fn bytes_fold_and_negate(
+ &self,
+ span: &Span,
+ negated: bool,
+ class: &mut hir::ClassBytes,
+ ) -> Result<()> {
+ // Note that we must apply case folding before negation!
+ // Consider `(?i)[^x]`. If we applied negation first, then
+ // the result would be the character class that matched any
+ // Unicode scalar value.
+ if self.flags().case_insensitive() {
+ class.case_fold_simple();
+ }
+ if negated {
+ class.negate();
+ }
+ if self.trans().utf8 && !class.is_ascii() {
+ return Err(self.error(span.clone(), ErrorKind::InvalidUtf8));
+ }
+ Ok(())
+ }
+
+ /// Return a scalar byte value suitable for use as a literal in a byte
+ /// character class.
+ fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> {
+ match self.ast_literal_to_scalar(ast)? {
+ Either::Right(byte) => Ok(byte),
+ Either::Left(ch) => {
+ if ch.is_ascii() {
+ Ok(u8::try_from(ch).unwrap())
+ } else {
+ // We can't feasibly support Unicode in
+ // byte oriented classes. Byte classes don't
+ // do Unicode case folding.
+ Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed))
+ }
+ }
+ }
+ }
+}
+
+/// A translator's representation of a regular expression's flags at any given
+/// moment in time.
+///
+/// Each flag can be in one of three states: absent, present but disabled or
+/// present but enabled.
+#[derive(Clone, Copy, Debug, Default)]
+struct Flags {
+ // For every field, `None` means "not set in this flag group"; `merge`
+ // fills such fields from the enclosing scope, and the accessor methods
+ // supply the final default.
+ case_insensitive: Option<bool>,
+ multi_line: Option<bool>,
+ dot_matches_new_line: Option<bool>,
+ swap_greed: Option<bool>,
+ unicode: Option<bool>,
+ crlf: Option<bool>,
+ // Note that `ignore_whitespace` is omitted here because it is handled
+ // entirely in the parser.
+}
+
+impl Flags {
+ /// Build a `Flags` value from the items of an AST flag group. Items
+ /// appearing after a `-` (negation) are recorded as explicitly
+ /// disabled; items never mentioned stay `None`.
+ fn from_ast(ast: &ast::Flags) -> Flags {
+ let mut flags = Flags::default();
+ let mut enabled = true;
+ for item in ast.items.iter() {
+ match item.kind {
+ ast::FlagsItemKind::Negation => enabled = false,
+ ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => {
+ flags.case_insensitive = Some(enabled);
+ }
+ ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => {
+ flags.multi_line = Some(enabled);
+ }
+ ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => {
+ flags.dot_matches_new_line = Some(enabled);
+ }
+ ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => {
+ flags.swap_greed = Some(enabled);
+ }
+ ast::FlagsItemKind::Flag(ast::Flag::Unicode) => {
+ flags.unicode = Some(enabled);
+ }
+ ast::FlagsItemKind::Flag(ast::Flag::CRLF) => {
+ flags.crlf = Some(enabled);
+ }
+ // Whitespace handling lives entirely in the parser.
+ ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {}
+ }
+ }
+ flags
+ }
+
+ /// Inherit any flag not explicitly set here from the previous scope.
+ /// An explicit setting in `self` always wins.
+ fn merge(&mut self, previous: &Flags) {
+ self.case_insensitive =
+ self.case_insensitive.or(previous.case_insensitive);
+ self.multi_line = self.multi_line.or(previous.multi_line);
+ self.dot_matches_new_line =
+ self.dot_matches_new_line.or(previous.dot_matches_new_line);
+ self.swap_greed = self.swap_greed.or(previous.swap_greed);
+ self.unicode = self.unicode.or(previous.unicode);
+ self.crlf = self.crlf.or(previous.crlf);
+ }
+
+ /// `(?i)` — off unless explicitly enabled.
+ fn case_insensitive(&self) -> bool {
+ matches!(self.case_insensitive, Some(true))
+ }
+
+ /// `(?m)` — off unless explicitly enabled.
+ fn multi_line(&self) -> bool {
+ matches!(self.multi_line, Some(true))
+ }
+
+ /// `(?s)` — off unless explicitly enabled.
+ fn dot_matches_new_line(&self) -> bool {
+ matches!(self.dot_matches_new_line, Some(true))
+ }
+
+ /// `(?U)` — off unless explicitly enabled.
+ fn swap_greed(&self) -> bool {
+ matches!(self.swap_greed, Some(true))
+ }
+
+ /// `(?u)` — the one flag that defaults to *on*; only an explicit
+ /// `(?-u)` turns it off.
+ fn unicode(&self) -> bool {
+ self.unicode != Some(false)
+ }
+
+ /// `(?R)` — off unless explicitly enabled.
+ fn crlf(&self) -> bool {
+ matches!(self.crlf, Some(true))
+ }
+}
+
+fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
+ // Materialize the ASCII ranges for `kind` directly as a byte class.
+ hir::ClassBytes::new(
+ ascii_class(kind).map(|(lo, hi)| hir::ClassBytesRange::new(lo, hi)),
+ )
+}
+
+fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator<Item = (u8, u8)> {
+ use crate::ast::ClassAsciiKind::*;
+
+ // The closed byte ranges that make up each POSIX-style ASCII class.
+ let ranges: &'static [(u8, u8)] = match *kind {
+ Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')],
+ Alpha => &[(b'A', b'Z'), (b'a', b'z')],
+ Ascii => &[(b'\x00', b'\x7F')],
+ Blank => &[(b'\t', b'\t'), (b' ', b' ')],
+ Cntrl => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')],
+ Digit => &[(b'0', b'9')],
+ Graph => &[(b'!', b'~')],
+ Lower => &[(b'a', b'z')],
+ Print => &[(b' ', b'~')],
+ Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')],
+ Space => &[
+ (b'\t', b'\t'),
+ (b'\n', b'\n'),
+ (b'\x0B', b'\x0B'),
+ (b'\x0C', b'\x0C'),
+ (b'\r', b'\r'),
+ (b' ', b' '),
+ ],
+ Upper => &[(b'A', b'Z')],
+ Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')],
+ Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')],
+ };
+ ranges.iter().copied()
+}
+
+fn ascii_class_as_chars(
+ kind: &ast::ClassAsciiKind,
+) -> impl Iterator<Item = (char, char)> {
+ // Same ranges as `ascii_class`, widened from bytes to chars.
+ ascii_class(kind).map(|(lo, hi)| (char::from(lo), char::from(hi)))
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::{
+ ast::{self, parse::ParserBuilder, Ast, Position, Span},
+ hir::{self, Hir, HirKind, Look, Properties},
+ unicode::{self, ClassQuery},
+ };
+
+ use super::*;
+
+ // We create these errors to compare with real hir::Errors in the tests.
+ // We define equality between TestError and hir::Error to disregard the
+ // pattern string in hir::Error, which is annoying to provide in tests.
+ #[derive(Clone, Debug)]
+ struct TestError {
+ span: Span,
+ kind: hir::ErrorKind,
+ }
+
+ impl PartialEq<hir::Error> for TestError {
+ fn eq(&self, other: &hir::Error) -> bool {
+ self.span == other.span && self.kind == other.kind
+ }
+ }
+
+ impl PartialEq<TestError> for hir::Error {
+ fn eq(&self, other: &TestError) -> bool {
+ self.span == other.span && self.kind == other.kind
+ }
+ }
+
+ fn parse(pattern: &str) -> Ast {
+ ParserBuilder::new().octal(true).build().parse(pattern).unwrap()
+ }
+
+ fn t(pattern: &str) -> Hir {
+ TranslatorBuilder::new()
+ .utf8(true)
+ .build()
+ .translate(pattern, &parse(pattern))
+ .unwrap()
+ }
+
+ fn t_err(pattern: &str) -> hir::Error {
+ TranslatorBuilder::new()
+ .utf8(true)
+ .build()
+ .translate(pattern, &parse(pattern))
+ .unwrap_err()
+ }
+
+ fn t_bytes(pattern: &str) -> Hir {
+ TranslatorBuilder::new()
+ .utf8(false)
+ .build()
+ .translate(pattern, &parse(pattern))
+ .unwrap()
+ }
+
+ fn props(pattern: &str) -> Properties {
+ t(pattern).properties().clone()
+ }
+
+ fn props_bytes(pattern: &str) -> Properties {
+ t_bytes(pattern).properties().clone()
+ }
+
+ fn hir_lit(s: &str) -> Hir {
+ hir_blit(s.as_bytes())
+ }
+
+ fn hir_blit(s: &[u8]) -> Hir {
+ Hir::literal(s)
+ }
+
+ fn hir_capture(index: u32, expr: Hir) -> Hir {
+ Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) })
+ }
+
+ fn hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir {
+ Hir::capture(hir::Capture {
+ index,
+ name: Some(name.into()),
+ sub: Box::new(expr),
+ })
+ }
+
+ fn hir_quest(greedy: bool, expr: Hir) -> Hir {
+ Hir::repetition(hir::Repetition {
+ min: 0,
+ max: Some(1),
+ greedy,
+ sub: Box::new(expr),
+ })
+ }
+
+ fn hir_star(greedy: bool, expr: Hir) -> Hir {
+ Hir::repetition(hir::Repetition {
+ min: 0,
+ max: None,
+ greedy,
+ sub: Box::new(expr),
+ })
+ }
+
+ fn hir_plus(greedy: bool, expr: Hir) -> Hir {
+ Hir::repetition(hir::Repetition {
+ min: 1,
+ max: None,
+ greedy,
+ sub: Box::new(expr),
+ })
+ }
+
+ fn hir_range(greedy: bool, min: u32, max: Option<u32>, expr: Hir) -> Hir {
+ Hir::repetition(hir::Repetition {
+ min,
+ max,
+ greedy,
+ sub: Box::new(expr),
+ })
+ }
+
+ fn hir_alt(alts: Vec<Hir>) -> Hir {
+ Hir::alternation(alts)
+ }
+
+ fn hir_cat(exprs: Vec<Hir>) -> Hir {
+ Hir::concat(exprs)
+ }
+
+ #[allow(dead_code)]
+ fn hir_uclass_query(query: ClassQuery<'_>) -> Hir {
+ Hir::class(hir::Class::Unicode(unicode::class(query).unwrap()))
+ }
+
+ #[allow(dead_code)]
+ fn hir_uclass_perl_word() -> Hir {
+ Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap()))
+ }
+
+ fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir {
+ Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(
+ ascii_class_as_chars(kind)
+ .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)),
+ )))
+ }
+
+ fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir {
+ Hir::class(hir::Class::Bytes(hir::ClassBytes::new(
+ ascii_class(kind).map(|(s, e)| hir::ClassBytesRange::new(s, e)),
+ )))
+ }
+
+ fn hir_uclass(ranges: &[(char, char)]) -> Hir {
+ Hir::class(uclass(ranges))
+ }
+
+ fn hir_bclass(ranges: &[(u8, u8)]) -> Hir {
+ Hir::class(bclass(ranges))
+ }
+
+ fn hir_case_fold(expr: Hir) -> Hir {
+ match expr.into_kind() {
+ HirKind::Class(mut cls) => {
+ cls.case_fold_simple();
+ Hir::class(cls)
+ }
+ _ => panic!("cannot case fold non-class Hir expr"),
+ }
+ }
+
+ fn hir_negate(expr: Hir) -> Hir {
+ match expr.into_kind() {
+ HirKind::Class(mut cls) => {
+ cls.negate();
+ Hir::class(cls)
+ }
+ _ => panic!("cannot negate non-class Hir expr"),
+ }
+ }
+
+ fn uclass(ranges: &[(char, char)]) -> hir::Class {
+ let ranges: Vec<hir::ClassUnicodeRange> = ranges
+ .iter()
+ .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
+ .collect();
+ hir::Class::Unicode(hir::ClassUnicode::new(ranges))
+ }
+
+ fn bclass(ranges: &[(u8, u8)]) -> hir::Class {
+ let ranges: Vec<hir::ClassBytesRange> = ranges
+ .iter()
+ .map(|&(s, e)| hir::ClassBytesRange::new(s, e))
+ .collect();
+ hir::Class::Bytes(hir::ClassBytes::new(ranges))
+ }
+
+ #[cfg(feature = "unicode-case")]
+ fn class_case_fold(mut cls: hir::Class) -> Hir {
+ cls.case_fold_simple();
+ Hir::class(cls)
+ }
+
+ fn class_negate(mut cls: hir::Class) -> Hir {
+ cls.negate();
+ Hir::class(cls)
+ }
+
+ #[allow(dead_code)]
+ fn hir_union(expr1: Hir, expr2: Hir) -> Hir {
+ use crate::hir::Class::{Bytes, Unicode};
+
+ match (expr1.into_kind(), expr2.into_kind()) {
+ (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
+ c1.union(&c2);
+ Hir::class(hir::Class::Unicode(c1))
+ }
+ (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
+ c1.union(&c2);
+ Hir::class(hir::Class::Bytes(c1))
+ }
+ _ => panic!("cannot union non-class Hir exprs"),
+ }
+ }
+
+ #[allow(dead_code)]
+ fn hir_difference(expr1: Hir, expr2: Hir) -> Hir {
+ use crate::hir::Class::{Bytes, Unicode};
+
+ match (expr1.into_kind(), expr2.into_kind()) {
+ (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
+ c1.difference(&c2);
+ Hir::class(hir::Class::Unicode(c1))
+ }
+ (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
+ c1.difference(&c2);
+ Hir::class(hir::Class::Bytes(c1))
+ }
+ _ => panic!("cannot difference non-class Hir exprs"),
+ }
+ }
+
+ fn hir_look(look: hir::Look) -> Hir {
+ Hir::look(look)
+ }
+
+ #[test]
+ fn empty() {
+ assert_eq!(t(""), Hir::empty());
+ assert_eq!(t("(?i)"), Hir::empty());
+ assert_eq!(t("()"), hir_capture(1, Hir::empty()));
+ assert_eq!(t("(?:)"), Hir::empty());
+ assert_eq!(t("(?P<wat>)"), hir_capture_name(1, "wat", Hir::empty()));
+ assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()]));
+ assert_eq!(
+ t("()|()"),
+ hir_alt(vec![
+ hir_capture(1, Hir::empty()),
+ hir_capture(2, Hir::empty()),
+ ])
+ );
+ assert_eq!(
+ t("(|b)"),
+ hir_capture(1, hir_alt(vec![Hir::empty(), hir_lit("b"),]))
+ );
+ assert_eq!(
+ t("(a|)"),
+ hir_capture(1, hir_alt(vec![hir_lit("a"), Hir::empty(),]))
+ );
+ assert_eq!(
+ t("(a||c)"),
+ hir_capture(
+ 1,
+ hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),])
+ )
+ );
+ assert_eq!(
+ t("(||)"),
+ hir_capture(
+ 1,
+ hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),])
+ )
+ );
+ }
+
+ // Translation of literals, including hex escapes, and the InvalidUtf8
+ // error for a non-UTF-8 byte literal when UTF-8 mode is required.
+ #[test]
+ fn literal() {
+ assert_eq!(t("a"), hir_lit("a"));
+ assert_eq!(t("(?-u)a"), hir_lit("a"));
+ assert_eq!(t("☃"), hir_lit("☃"));
+ assert_eq!(t("abcd"), hir_lit("abcd"));
+
+ assert_eq!(t_bytes("(?-u)a"), hir_lit("a"));
+ assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a"));
+ assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a"));
+ assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));
+
+ assert_eq!(t("(?-u)☃"), hir_lit("☃"));
+ // \xFF is not valid UTF-8, so it is rejected when translated in
+ // UTF-8 (non-bytes) mode.
+ assert_eq!(
+ t_err(r"(?-u)\xFF"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(9, 1, 10)
+ ),
+ }
+ );
+ }
+
+ // Case-insensitive literals become case-folded classes; Unicode mode
+ // folds via Unicode simple case folding, while (?-u) folds ASCII only.
+ #[test]
+ fn literal_case_insensitive() {
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),]));
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t("(?i:a)"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("a(?i)a(?-i)a"),
+ hir_cat(vec![
+ hir_lit("a"),
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_lit("a"),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)ab@c"),
+ hir_cat(vec![
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_uclass(&[('B', 'B'), ('b', 'b')]),
+ hir_lit("@"),
+ hir_uclass(&[('C', 'C'), ('c', 'c')]),
+ ])
+ );
+ // β folds to three case variants (Β, β, ϐ) under Unicode folding.
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)β"),
+ hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
+ );
+
+ assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]));
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?-u)a(?i)a(?-i)a"),
+ hir_cat(vec![
+ hir_lit("a"),
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
+ hir_lit("a"),
+ ])
+ );
+ assert_eq!(
+ t("(?i-u)ab@c"),
+ hir_cat(vec![
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
+ hir_bclass(&[(b'B', b'B'), (b'b', b'b')]),
+ hir_lit("@"),
+ hir_bclass(&[(b'C', b'C'), (b'c', b'c')]),
+ ])
+ );
+
+ assert_eq!(
+ t_bytes("(?i-u)a"),
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
+ );
+ assert_eq!(
+ t_bytes("(?i-u)\x61"),
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
+ );
+ assert_eq!(
+ t_bytes(r"(?i-u)\x61"),
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
+ );
+ // A non-ASCII byte has no ASCII case variants, so it stays a literal.
+ assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));
+
+ // ASCII-only folding leaves non-ASCII codepoints untouched.
+ assert_eq!(t("(?i-u)β"), hir_lit("β"),);
+ }
+
+ // Translation of `.` under all combinations of the CRLF (?R),
+ // dot-matches-newline (?s) and Unicode (?-u) flags, including the
+ // InvalidUtf8 errors when a byte-oriented `.` is used in UTF-8 mode.
+ #[test]
+ fn dot() {
+ // Default: any codepoint except \n.
+ assert_eq!(
+ t("."),
+ hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')])
+ );
+ // CRLF mode: excludes both \r and \n.
+ assert_eq!(
+ t("(?R)."),
+ hir_uclass(&[
+ ('\0', '\t'),
+ ('\x0B', '\x0C'),
+ ('\x0E', '\u{10FFFF}'),
+ ])
+ );
+ assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}')]));
+ assert_eq!(t("(?Rs)."), hir_uclass(&[('\0', '\u{10FFFF}')]));
+ // Byte-oriented variants cover the full 0x00-0xFF range.
+ assert_eq!(
+ t_bytes("(?-u)."),
+ hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')])
+ );
+ assert_eq!(
+ t_bytes("(?R-u)."),
+ hir_bclass(&[
+ (b'\0', b'\t'),
+ (b'\x0B', b'\x0C'),
+ (b'\x0E', b'\xFF'),
+ ])
+ );
+ assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),]));
+ assert_eq!(t_bytes("(?Rs-u)."), hir_bclass(&[(b'\0', b'\xFF'),]));
+
+ // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
+ assert_eq!(
+ t_err("(?-u)."),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(6, 1, 7)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err("(?R-u)."),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(6, 1, 7),
+ Position::new(7, 1, 8)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err("(?s-u)."),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(6, 1, 7),
+ Position::new(7, 1, 8)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err("(?Rs-u)."),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(7, 1, 8),
+ Position::new(8, 1, 9)
+ ),
+ }
+ );
+ }
+
+ // Anchors and word-boundary assertions map to the corresponding
+ // `hir::Look` values; (?m) switches ^/$ to their line variants while
+ // \A/\z stay absolute, and (?-u) selects the ASCII word boundaries.
+ #[test]
+ fn assertions() {
+ assert_eq!(t("^"), hir_look(hir::Look::Start));
+ assert_eq!(t("$"), hir_look(hir::Look::End));
+ assert_eq!(t(r"\A"), hir_look(hir::Look::Start));
+ assert_eq!(t(r"\z"), hir_look(hir::Look::End));
+ assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF));
+ assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF));
+ assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start));
+ assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End));
+
+ assert_eq!(t(r"\b"), hir_look(hir::Look::WordUnicode));
+ assert_eq!(t(r"\B"), hir_look(hir::Look::WordUnicodeNegate));
+ assert_eq!(t(r"(?-u)\b"), hir_look(hir::Look::WordAscii));
+ assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate));
+ }
+
+ // Capture-group translation: numbered and named captures receive
+ // sequential indices, non-capturing groups disappear from the HIR, and
+ // groups containing only flag settings become empty captures.
+ #[test]
+ fn group() {
+ assert_eq!(t("(a)"), hir_capture(1, hir_lit("a")));
+ assert_eq!(
+ t("(a)(b)"),
+ hir_cat(vec![
+ hir_capture(1, hir_lit("a")),
+ hir_capture(2, hir_lit("b")),
+ ])
+ );
+ assert_eq!(
+ t("(a)|(b)"),
+ hir_alt(vec![
+ hir_capture(1, hir_lit("a")),
+ hir_capture(2, hir_lit("b")),
+ ])
+ );
+ assert_eq!(t("(?P<foo>)"), hir_capture_name(1, "foo", Hir::empty()));
+ assert_eq!(t("(?P<foo>a)"), hir_capture_name(1, "foo", hir_lit("a")));
+ assert_eq!(
+ t("(?P<foo>a)(?P<bar>b)"),
+ hir_cat(vec![
+ hir_capture_name(1, "foo", hir_lit("a")),
+ hir_capture_name(2, "bar", hir_lit("b")),
+ ])
+ );
+ assert_eq!(t("(?:)"), Hir::empty());
+ assert_eq!(t("(?:a)"), hir_lit("a"));
+ assert_eq!(
+ t("(?:a)(b)"),
+ hir_cat(vec![hir_lit("a"), hir_capture(1, hir_lit("b")),])
+ );
+ // Non-capturing groups do not consume capture indices.
+ assert_eq!(
+ t("(a)(?:b)(c)"),
+ hir_cat(vec![
+ hir_capture(1, hir_lit("a")),
+ hir_lit("b"),
+ hir_capture(2, hir_lit("c")),
+ ])
+ );
+ assert_eq!(
+ t("(a)(?P<foo>b)(c)"),
+ hir_cat(vec![
+ hir_capture(1, hir_lit("a")),
+ hir_capture_name(2, "foo", hir_lit("b")),
+ hir_capture(3, hir_lit("c")),
+ ])
+ );
+ assert_eq!(t("()"), hir_capture(1, Hir::empty()));
+ assert_eq!(t("((?i))"), hir_capture(1, Hir::empty()));
+ assert_eq!(t("((?x))"), hir_capture(1, Hir::empty()));
+ assert_eq!(
+ t("(((?x)))"),
+ hir_capture(1, hir_capture(2, Hir::empty()))
+ );
+ }
+
+ // Interaction of the multi-line (?m) and CRLF (?R) flags with ^/$:
+ // (?R) alone keeps absolute anchors, while (?Rm) yields the CRLF-aware
+ // line anchors. \A and \z are absolute under every flag combination.
+ #[test]
+ fn line_anchors() {
+ assert_eq!(t("^"), hir_look(hir::Look::Start));
+ assert_eq!(t("$"), hir_look(hir::Look::End));
+ assert_eq!(t(r"\A"), hir_look(hir::Look::Start));
+ assert_eq!(t(r"\z"), hir_look(hir::Look::End));
+
+ assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start));
+ assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End));
+ assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF));
+ assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF));
+
+ assert_eq!(t(r"(?R)\A"), hir_look(hir::Look::Start));
+ assert_eq!(t(r"(?R)\z"), hir_look(hir::Look::End));
+ assert_eq!(t("(?R)^"), hir_look(hir::Look::Start));
+ assert_eq!(t("(?R)$"), hir_look(hir::Look::End));
+
+ assert_eq!(t(r"(?Rm)\A"), hir_look(hir::Look::Start));
+ assert_eq!(t(r"(?Rm)\z"), hir_look(hir::Look::End));
+ assert_eq!(t("(?Rm)^"), hir_look(hir::Look::StartCRLF));
+ assert_eq!(t("(?Rm)$"), hir_look(hir::Look::EndCRLF));
+ }
+
+ // Flag scoping: inline flags apply to the rest of the enclosing group,
+ // scoped flags (?f:...) apply only inside, and flags are restored when
+ // a group closes. Also covers swap-greed (?U).
+ #[test]
+ fn flags() {
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i:a)a"),
+ hir_cat(
+ vec![hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"),]
+ )
+ );
+ assert_eq!(
+ t("(?i-u)β"),
+ hir_cat(vec![
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
+ hir_lit("β"),
+ ])
+ );
+ assert_eq!(
+ t("(?:(?i-u)a)b"),
+ hir_cat(vec![
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
+ hir_lit("b"),
+ ])
+ );
+ assert_eq!(
+ t("((?i-u)a)b"),
+ hir_cat(vec![
+ hir_capture(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
+ hir_lit("b"),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)(?-i:a)a"),
+ hir_cat(
+ vec![hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]),]
+ )
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?im)a^"),
+ hir_cat(vec![
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_look(hir::Look::StartLF),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?im)a^(?i-m)a^"),
+ hir_cat(vec![
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_look(hir::Look::StartLF),
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_look(hir::Look::Start),
+ ])
+ );
+ // (?U) swaps the meaning of greedy and lazy repetition.
+ assert_eq!(
+ t("(?U)a*a*?(?-U)a*a*?"),
+ hir_cat(vec![
+ hir_star(false, hir_lit("a")),
+ hir_star(true, hir_lit("a")),
+ hir_star(true, hir_lit("a")),
+ hir_star(false, hir_lit("a")),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?:a(?i)a)a"),
+ hir_cat(vec![
+ hir_cat(vec![
+ hir_lit("a"),
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ ]),
+ hir_lit("a"),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)(?:a(?-i)a)a"),
+ hir_cat(vec![
+ hir_cat(vec![
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_lit("a"),
+ ]),
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ ])
+ );
+ }
+
+ // Escaped meta-characters translate to their literal selves.
+ #[test]
+ fn escape() {
+ assert_eq!(
+ t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"),
+ hir_lit(r"\.+*?()|[]{}^$#")
+ );
+ }
+
+ // Repetition operators: ?/*/+ and {m}/{m,}/{m,n} ranges, greedy by
+ // default and lazy with a trailing `?`; repetition binds tighter than
+ // concatenation and alternation.
+ #[test]
+ fn repetition() {
+ assert_eq!(t("a?"), hir_quest(true, hir_lit("a")));
+ assert_eq!(t("a*"), hir_star(true, hir_lit("a")));
+ assert_eq!(t("a+"), hir_plus(true, hir_lit("a")));
+ assert_eq!(t("a??"), hir_quest(false, hir_lit("a")));
+ assert_eq!(t("a*?"), hir_star(false, hir_lit("a")));
+ assert_eq!(t("a+?"), hir_plus(false, hir_lit("a")));
+
+ assert_eq!(t("a{1}"), hir_range(true, 1, Some(1), hir_lit("a"),));
+ assert_eq!(t("a{1,}"), hir_range(true, 1, None, hir_lit("a"),));
+ assert_eq!(t("a{1,2}"), hir_range(true, 1, Some(2), hir_lit("a"),));
+ assert_eq!(t("a{1}?"), hir_range(false, 1, Some(1), hir_lit("a"),));
+ assert_eq!(t("a{1,}?"), hir_range(false, 1, None, hir_lit("a"),));
+ assert_eq!(t("a{1,2}?"), hir_range(false, 1, Some(2), hir_lit("a"),));
+
+ assert_eq!(
+ t("ab?"),
+ hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
+ );
+ assert_eq!(t("(ab)?"), hir_quest(true, hir_capture(1, hir_lit("ab"))));
+ assert_eq!(
+ t("a|b?"),
+ hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
+ );
+ }
+
+ // Concatenation and alternation of look-around assertions, including
+ // precedence (alternation binds loosest) and nested capture groups.
+ #[test]
+ fn cat_alt() {
+ // Shorthand constructors for the four assertions used below.
+ let a = || hir_look(hir::Look::Start);
+ let b = || hir_look(hir::Look::End);
+ let c = || hir_look(hir::Look::WordUnicode);
+ let d = || hir_look(hir::Look::WordUnicodeNegate);
+
+ assert_eq!(t("(^$)"), hir_capture(1, hir_cat(vec![a(), b()])));
+ assert_eq!(t("^|$"), hir_alt(vec![a(), b()]));
+ assert_eq!(t(r"^|$|\b"), hir_alt(vec![a(), b(), c()]));
+ assert_eq!(
+ t(r"^$|$\b|\b\B"),
+ hir_alt(vec![
+ hir_cat(vec![a(), b()]),
+ hir_cat(vec![b(), c()]),
+ hir_cat(vec![c(), d()]),
+ ])
+ );
+ assert_eq!(t("(^|$)"), hir_capture(1, hir_alt(vec![a(), b()])));
+ assert_eq!(
+ t(r"(^|$|\b)"),
+ hir_capture(1, hir_alt(vec![a(), b(), c()]))
+ );
+ assert_eq!(
+ t(r"(^$|$\b|\b\B)"),
+ hir_capture(
+ 1,
+ hir_alt(vec![
+ hir_cat(vec![a(), b()]),
+ hir_cat(vec![b(), c()]),
+ hir_cat(vec![c(), d()]),
+ ])
+ )
+ );
+ assert_eq!(
+ t(r"(^$|($\b|(\b\B)))"),
+ hir_capture(
+ 1,
+ hir_alt(vec![
+ hir_cat(vec![a(), b()]),
+ hir_capture(
+ 2,
+ hir_alt(vec![
+ hir_cat(vec![b(), c()]),
+ hir_capture(3, hir_cat(vec![c(), d()])),
+ ])
+ ),
+ ])
+ )
+ );
+ }
+
+ // Tests the HIR transformation of things like '[a-z]|[A-Z]' into
+ // '[A-Za-z]'. In other words, an alternation of just classes is always
+ // equivalent to a single class corresponding to the union of the branches
+ // in that class. (Unless some branches match invalid UTF-8 and others
+ // match non-ASCII Unicode.)
+ #[test]
+ fn cat_class_flattened() {
+ assert_eq!(t(r"[a-z]|[A-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
+ // Combining all of the letter properties should give us the one giant
+ // letter property.
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"(?x)
+ \p{Lowercase_Letter}
+ |\p{Uppercase_Letter}
+ |\p{Titlecase_Letter}
+ |\p{Modifier_Letter}
+ |\p{Other_Letter}
+ "),
+ hir_uclass_query(ClassQuery::Binary("letter"))
+ );
+ // Byte classes that can truly match invalid UTF-8 cannot be combined
+ // with Unicode classes.
+ assert_eq!(
+ t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]"),
+ hir_alt(vec![
+ hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]),
+ hir_bclass(&[(b'\x90', b'\xFF')]),
+ hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]),
+ ])
+ );
+ // Byte classes on their own can be combined, even if some are ASCII
+ // and others are invalid UTF-8.
+ assert_eq!(
+ t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]"),
+ hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]),
+ );
+ }
+
+ // POSIX [[:name:]] classes map to the corresponding ASCII class
+ // constructors; negation, case folding, and the InvalidUtf8 error for
+ // negated byte classes in UTF-8 mode are also covered.
+ #[test]
+ fn class_ascii() {
+ assert_eq!(
+ t("[[:alnum:]]"),
+ hir_ascii_uclass(&ast::ClassAsciiKind::Alnum)
+ );
+ assert_eq!(
+ t("[[:alpha:]]"),
+ hir_ascii_uclass(&ast::ClassAsciiKind::Alpha)
+ );
+ assert_eq!(
+ t("[[:ascii:]]"),
+ hir_ascii_uclass(&ast::ClassAsciiKind::Ascii)
+ );
+ assert_eq!(
+ t("[[:blank:]]"),
+ hir_ascii_uclass(&ast::ClassAsciiKind::Blank)
+ );
+ assert_eq!(
+ t("[[:cntrl:]]"),
+ hir_ascii_uclass(&ast::ClassAsciiKind::Cntrl)
+ );
+ assert_eq!(
+ t("[[:digit:]]"),
+ hir_ascii_uclass(&ast::ClassAsciiKind::Digit)
+ );
+ assert_eq!(
+ t("[[:graph:]]"),
+ hir_ascii_uclass(&ast::ClassAsciiKind::Graph)
+ );
+ assert_eq!(
+ t("[[:lower:]]"),
+ hir_ascii_uclass(&ast::ClassAsciiKind::Lower)
+ );
+ assert_eq!(
+ t("[[:print:]]"),
+ hir_ascii_uclass(&ast::ClassAsciiKind::Print)
+ );
+ assert_eq!(
+ t("[[:punct:]]"),
+ hir_ascii_uclass(&ast::ClassAsciiKind::Punct)
+ );
+ assert_eq!(
+ t("[[:space:]]"),
+ hir_ascii_uclass(&ast::ClassAsciiKind::Space)
+ );
+ assert_eq!(
+ t("[[:upper:]]"),
+ hir_ascii_uclass(&ast::ClassAsciiKind::Upper)
+ );
+ assert_eq!(
+ t("[[:word:]]"),
+ hir_ascii_uclass(&ast::ClassAsciiKind::Word)
+ );
+ assert_eq!(
+ t("[[:xdigit:]]"),
+ hir_ascii_uclass(&ast::ClassAsciiKind::Xdigit)
+ );
+
+ assert_eq!(
+ t("[[:^lower:]]"),
+ hir_negate(hir_ascii_uclass(&ast::ClassAsciiKind::Lower))
+ );
+ // Unicode case folding of [[:lower:]] pulls in long-s and Kelvin.
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[[:lower:]]"),
+ hir_uclass(&[
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('\u{17F}', '\u{17F}'),
+ ('\u{212A}', '\u{212A}'),
+ ])
+ );
+
+ assert_eq!(
+ t("(?-u)[[:lower:]]"),
+ hir_ascii_bclass(&ast::ClassAsciiKind::Lower)
+ );
+ assert_eq!(
+ t("(?i-u)[[:lower:]]"),
+ hir_case_fold(hir_ascii_bclass(&ast::ClassAsciiKind::Lower))
+ );
+
+ // A negated ASCII byte class matches non-UTF-8 bytes, so it is an
+ // error when UTF-8 mode is required.
+ assert_eq!(
+ t_err("(?-u)[[:^lower:]]"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(6, 1, 7),
+ Position::new(16, 1, 17)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err("(?i-u)[[:^lower:]]"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(7, 1, 8),
+ Position::new(17, 1, 18)
+ ),
+ }
+ );
+ }
+
+ // Multiple POSIX classes inside one bracketed class union correctly.
+ #[test]
+ fn class_ascii_multiple() {
+ // See: https://github.com/rust-lang/regex/issues/680
+ assert_eq!(
+ t("[[:alnum:][:^ascii:]]"),
+ hir_union(
+ hir_ascii_uclass(&ast::ClassAsciiKind::Alnum),
+ hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
+ ),
+ );
+ assert_eq!(
+ t_bytes("(?-u)[[:alnum:][:^ascii:]]"),
+ hir_union(
+ hir_ascii_bclass(&ast::ClassAsciiKind::Alnum),
+ hir_bclass(&[(0x80, 0xFF)]),
+ ),
+ );
+ }
+
+ // Perl classes \d, \s, \w in Unicode mode resolve to Unicode property
+ // queries; these classes are closed under case folding, so (?i) leaves
+ // them unchanged. Negated forms wrap the same class in a negation.
+ #[test]
+ #[cfg(feature = "unicode-perl")]
+ fn class_perl_unicode() {
+ // Unicode
+ assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit")));
+ assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space")));
+ assert_eq!(t(r"\w"), hir_uclass_perl_word());
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\d"),
+ hir_uclass_query(ClassQuery::Binary("digit"))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\s"),
+ hir_uclass_query(ClassQuery::Binary("space"))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word());
+
+ // Unicode, negated
+ assert_eq!(
+ t(r"\D"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+ );
+ assert_eq!(
+ t(r"\S"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
+ );
+ assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word()));
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\D"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\S"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
+ }
+
+ // Perl classes under (?-u) resolve to their ASCII equivalents; negated
+ // forms are only valid in bytes mode because they can match invalid
+ // UTF-8, and are an InvalidUtf8 error otherwise.
+ #[test]
+ fn class_perl_ascii() {
+ // ASCII only
+ assert_eq!(
+ t(r"(?-u)\d"),
+ hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
+ );
+ assert_eq!(
+ t(r"(?-u)\s"),
+ hir_ascii_bclass(&ast::ClassAsciiKind::Space)
+ );
+ assert_eq!(
+ t(r"(?-u)\w"),
+ hir_ascii_bclass(&ast::ClassAsciiKind::Word)
+ );
+ // (?i) has no effect here: the ASCII classes are already closed
+ // under ASCII case folding.
+ assert_eq!(
+ t(r"(?i-u)\d"),
+ hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
+ );
+ assert_eq!(
+ t(r"(?i-u)\s"),
+ hir_ascii_bclass(&ast::ClassAsciiKind::Space)
+ );
+ assert_eq!(
+ t(r"(?i-u)\w"),
+ hir_ascii_bclass(&ast::ClassAsciiKind::Word)
+ );
+
+ // ASCII only, negated
+ assert_eq!(
+ t_bytes(r"(?-u)\D"),
+ hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
+ );
+ assert_eq!(
+ t_bytes(r"(?-u)\S"),
+ hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
+ );
+ assert_eq!(
+ t_bytes(r"(?-u)\W"),
+ hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
+ );
+ assert_eq!(
+ t_bytes(r"(?i-u)\D"),
+ hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
+ );
+ assert_eq!(
+ t_bytes(r"(?i-u)\S"),
+ hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
+ );
+ assert_eq!(
+ t_bytes(r"(?i-u)\W"),
+ hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
+ );
+
+ // ASCII only, negated, with UTF-8 mode enabled.
+ // In this case, negating any Perl class results in an error because
+ // all such classes can match invalid UTF-8.
+ assert_eq!(
+ t_err(r"(?-u)\D"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(7, 1, 8),
+ ),
+ },
+ );
+ assert_eq!(
+ t_err(r"(?-u)\S"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(7, 1, 8),
+ ),
+ },
+ );
+ assert_eq!(
+ t_err(r"(?-u)\W"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(7, 1, 8),
+ ),
+ },
+ );
+ assert_eq!(
+ t_err(r"(?i-u)\D"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(6, 1, 7),
+ Position::new(8, 1, 9),
+ ),
+ },
+ );
+ assert_eq!(
+ t_err(r"(?i-u)\S"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(6, 1, 7),
+ Position::new(8, 1, 9),
+ ),
+ },
+ );
+ assert_eq!(
+ t_err(r"(?i-u)\W"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(6, 1, 7),
+ Position::new(8, 1, 9),
+ ),
+ },
+ );
+ }
+
+ // Without the unicode-perl feature, \w reports a missing-class error.
+ #[test]
+ #[cfg(not(feature = "unicode-perl"))]
+ fn class_perl_word_disabled() {
+ assert_eq!(
+ t_err(r"\w"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePerlClassNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(2, 1, 3)
+ ),
+ }
+ );
+ }
+
+ // \s needs either unicode-perl or unicode-bool; with both disabled it
+ // reports a missing-class error.
+ #[test]
+ #[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))]
+ fn class_perl_space_disabled() {
+ assert_eq!(
+ t_err(r"\s"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePerlClassNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(2, 1, 3)
+ ),
+ }
+ );
+ }
+
+ // \d needs either unicode-perl or unicode-gencat; with both disabled
+ // it reports a missing-class error.
+ #[test]
+ #[cfg(all(
+ not(feature = "unicode-perl"),
+ not(feature = "unicode-gencat")
+ ))]
+ fn class_perl_digit_disabled() {
+ assert_eq!(
+ t_err(r"\d"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePerlClassNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(2, 1, 3)
+ ),
+ }
+ );
+ }
+
+ // \p{...} general-category queries: loose matching of property names
+ // (case/space-insensitive), gc: and gc= forms, special properties
+ // (any/assigned/ascii), negation via \P and !=, and the errors for
+ // unknown properties or Unicode classes in (?-u) mode.
+ #[test]
+ #[cfg(feature = "unicode-gencat")]
+ fn class_unicode_gencat() {
+ assert_eq!(t(r"\pZ"), hir_uclass_query(ClassQuery::Binary("Z")));
+ assert_eq!(t(r"\pz"), hir_uclass_query(ClassQuery::Binary("Z")));
+ assert_eq!(
+ t(r"\p{Separator}"),
+ hir_uclass_query(ClassQuery::Binary("Z"))
+ );
+ // Property names are matched loosely: case and spaces are ignored.
+ assert_eq!(
+ t(r"\p{se PaRa ToR}"),
+ hir_uclass_query(ClassQuery::Binary("Z"))
+ );
+ assert_eq!(
+ t(r"\p{gc:Separator}"),
+ hir_uclass_query(ClassQuery::Binary("Z"))
+ );
+ assert_eq!(
+ t(r"\p{gc=Separator}"),
+ hir_uclass_query(ClassQuery::Binary("Z"))
+ );
+ assert_eq!(
+ t(r"\p{Other}"),
+ hir_uclass_query(ClassQuery::Binary("Other"))
+ );
+ assert_eq!(t(r"\pC"), hir_uclass_query(ClassQuery::Binary("Other")));
+
+ assert_eq!(
+ t(r"\PZ"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
+ );
+ assert_eq!(
+ t(r"\P{separator}"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
+ );
+ // Double negation: \P with != cancels out to a plain negation.
+ assert_eq!(
+ t(r"\P{gc!=separator}"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
+ );
+
+ assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any")));
+ assert_eq!(
+ t(r"\p{assigned}"),
+ hir_uclass_query(ClassQuery::Binary("Assigned"))
+ );
+ assert_eq!(
+ t(r"\p{ascii}"),
+ hir_uclass_query(ClassQuery::Binary("ASCII"))
+ );
+ assert_eq!(
+ t(r"\p{gc:any}"),
+ hir_uclass_query(ClassQuery::Binary("Any"))
+ );
+ assert_eq!(
+ t(r"\p{gc:assigned}"),
+ hir_uclass_query(ClassQuery::Binary("Assigned"))
+ );
+ assert_eq!(
+ t(r"\p{gc:ascii}"),
+ hir_uclass_query(ClassQuery::Binary("ASCII"))
+ );
+
+ // Unicode property classes are not allowed when Unicode is disabled.
+ assert_eq!(
+ t_err(r"(?-u)\pZ"),
+ TestError {
+ kind: hir::ErrorKind::UnicodeNotAllowed,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(8, 1, 9)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err(r"(?-u)\p{Separator}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodeNotAllowed,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(18, 1, 19)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err(r"\pE"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(3, 1, 4)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err(r"\p{Foo}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(7, 1, 8)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err(r"\p{gc:Foo}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyValueNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(10, 1, 11)
+ ),
+ }
+ );
+ }
+
+ // Without the unicode-gencat feature, general-category queries report
+ // UnicodePropertyNotFound.
+ #[test]
+ #[cfg(not(feature = "unicode-gencat"))]
+ fn class_unicode_gencat_disabled() {
+ assert_eq!(
+ t_err(r"\p{Separator}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(13, 1, 14)
+ ),
+ }
+ );
+
+ assert_eq!(
+ t_err(r"\p{Any}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(7, 1, 8)
+ ),
+ }
+ );
+ }
+
+ // Script property queries (\p{Greek}), their case-folded and negated
+ // forms, and the value-not-found errors for unknown sc:/scx: values.
+ #[test]
+ #[cfg(feature = "unicode-script")]
+ fn class_unicode_script() {
+ assert_eq!(
+ t(r"\p{Greek}"),
+ hir_uclass_query(ClassQuery::Binary("Greek"))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\p{Greek}"),
+ hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek")))
+ );
+ // Negation applies after case folding.
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\P{Greek}"),
+ hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
+ "Greek"
+ ))))
+ );
+
+ assert_eq!(
+ t_err(r"\p{sc:Foo}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyValueNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(10, 1, 11)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err(r"\p{scx:Foo}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyValueNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(11, 1, 12)
+ ),
+ }
+ );
+ }
+
+ // Without the unicode-script feature, script queries report
+ // UnicodePropertyNotFound.
+ #[test]
+ #[cfg(not(feature = "unicode-script"))]
+ fn class_unicode_script_disabled() {
+ assert_eq!(
+ t_err(r"\p{Greek}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(9, 1, 10)
+ ),
+ }
+ );
+
+ assert_eq!(
+ t_err(r"\p{scx:Greek}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(13, 1, 14)
+ ),
+ }
+ );
+ }
+
+ // An unknown value for the age property is a value-not-found error.
+ #[test]
+ #[cfg(feature = "unicode-age")]
+ fn class_unicode_age() {
+ assert_eq!(
+ t_err(r"\p{age:Foo}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyValueNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(11, 1, 12)
+ ),
+ }
+ );
+ }
+
+ // Negating \p{any} yields an empty class rather than an error.
+ #[test]
+ #[cfg(feature = "unicode-gencat")]
+ fn class_unicode_any_empty() {
+ assert_eq!(t(r"\P{any}"), hir_uclass(&[]),);
+ }
+
+ // Without the unicode-age feature, the age property itself reports
+ // UnicodePropertyNotFound.
+ #[test]
+ #[cfg(not(feature = "unicode-age"))]
+ fn class_unicode_age_disabled() {
+ assert_eq!(
+ t_err(r"\p{age:3.0}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(11, 1, 12)
+ ),
+ }
+ );
+ }
+
+ // Bracketed classes: range merging, nesting, negation, case folding,
+ // escapes inside classes, byte-mode classes, and the InvalidUtf8 error
+ // for a negated byte class in UTF-8 mode.
+ #[test]
+ fn class_bracketed() {
+ // A single-element class simplifies to a literal.
+ assert_eq!(t("[a]"), hir_lit("a"));
+ assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')]));
+ assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')])));
+ assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')]));
+ // Overlapping and adjacent ranges are merged.
+ assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')]));
+ assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')]));
+ assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')]));
+ assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')]));
+ assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')]));
+ #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
+ assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit")))
;
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[\pZ]"),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ );
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[\p{separator}]"),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ );
+ // A negated class inside a negated class cancels out.
+ #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
+ assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit")));
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[^\PZ]"),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ );
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[^\P{separator}]"),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ );
+ #[cfg(all(
+ feature = "unicode-case",
+ any(feature = "unicode-perl", feature = "unicode-gencat")
+ ))]
+ assert_eq!(
+ t(r"(?i)[^\D]"),
+ hir_uclass_query(ClassQuery::Binary("digit"))
+ );
+ #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
+ assert_eq!(
+ t(r"(?i)[^\P{greek}]"),
+ hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")))
+ );
+
+ assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')]));
+ assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')]));
+ assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')]));
+
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
+ // 'k' also folds to the Kelvin sign U+212A.
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[k]"),
+ hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[β]"),
+ hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
+ );
+ assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),]));
+
+ assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')])));
+ assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')])));
+ assert_eq!(
+ t_bytes("(?-u)[^a]"),
+ class_negate(bclass(&[(b'a', b'a')]))
+ );
+ #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
+ assert_eq!(
+ t(r"[^\d]"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+ );
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[^\pZ]"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
+ );
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[^\p{separator}]"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
+ );
+ #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
+ assert_eq!(
+ t(r"(?i)[^\p{greek}]"),
+ hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
+ "greek"
+ ))))
+ );
+ #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
+ assert_eq!(
+ t(r"(?i)[\P{greek}]"),
+ hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
+ "greek"
+ ))))
+ );
+
+ // Test some weird cases.
+ assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')]));
+
+ assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')]));
+ assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')]));
+ assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')]));
+ assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')]));
+ assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')]));
+
+ assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')]));
+ assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')]));
+ assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')]));
+ assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')]));
+ assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')]));
+
+ assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')]));
+ assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')]));
+ assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')]));
+ assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')]));
+ assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')]));
+
+ // A negated byte class can match invalid UTF-8, so it is an error
+ // in UTF-8 mode.
+ assert_eq!(
+ t_err("(?-u)[^a]"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(9, 1, 10)
+ ),
+ }
+ );
+ // Negating "everything" produces the empty class.
+ #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
+ assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),);
+ #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
+ assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),);
+ }
+
+ // Unions inside bracketed classes, including nested classes and
+ // Unicode property queries, with case folding and negation applied to
+ // the whole union.
+ #[test]
+ fn class_bracketed_union() {
+ assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[a\pZb]"),
+ hir_union(
+ hir_uclass(&[('a', 'b')]),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ );
+ #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))]
+ assert_eq!(
+ t(r"[\pZ\p{Greek}]"),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ );
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"[\p{age:3.0}\pZ\p{Greek}]"),
+ hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ )
+ );
+ // Nesting classes does not change the resulting union.
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"),
+ hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("cyrillic")),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ )
+ )
+ );
+
+ // Case folding applies to the entire union.
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-case",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"),
+ hir_case_fold(hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ ))
+ );
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"[^\p{age:3.0}\pZ\p{Greek}]"),
+ hir_negate(hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ ))
+ );
+ // Negation applies after case folding.
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-case",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"),
+ hir_negate(hir_case_fold(hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ )))
+ );
+ }
+
+ // Nested negated classes inside bracketed classes: a nested [^c]
+ // dominates the union, and an outer ^ then negates the whole result.
+ #[test]
+ fn class_bracketed_nested() {
+ assert_eq!(t(r"[a[^c]]"), class_negate(uclass(&[('c', 'c')])));
+ assert_eq!(t(r"[a-b[^c]]"), class_negate(uclass(&[('c', 'c')])));
+ assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[])));
+
+ assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')]));
+ assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')]));
+
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)[a[^c]]"),
+ hir_negate(class_case_fold(uclass(&[('c', 'c')])))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)[a-b[^c]]"),
+ hir_negate(class_case_fold(uclass(&[('c', 'c')])))
+ );
+
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')]));
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)[^a-b[^c]]"),
+ hir_uclass(&[('C', 'C'), ('c', 'c')])
+ );
+
+ // When the union covers everything, the nested negation is empty.
+ assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),);
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),);
+ }
+
+    #[test]
+    fn class_bracketed_intersect() {
+        // `&&` is set intersection inside bracketed classes. Unicode mode:
+        assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')]));
+        assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')]));
+        assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')]));
+        assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')]));
+        assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')]));
+        assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')]));
+        assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')]));
+        assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')]));
+        assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
+
+        // Byte-oriented (`(?-u)`) classes behave the same way.
+        assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')]));
+        assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
+        assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
+        assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')]));
+        assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')]));
+        assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')]));
+
+        // Intersection combined with Unicode case folding.
+        #[cfg(feature = "unicode-case")]
+        assert_eq!(
+            t("(?i)[abc&&b-c]"),
+            hir_case_fold(hir_uclass(&[('b', 'c')]))
+        );
+        #[cfg(feature = "unicode-case")]
+        assert_eq!(
+            t("(?i)[abc&&[b-c]]"),
+            hir_case_fold(hir_uclass(&[('b', 'c')]))
+        );
+        #[cfg(feature = "unicode-case")]
+        assert_eq!(
+            t("(?i)[[abc]&&[b-c]]"),
+            hir_case_fold(hir_uclass(&[('b', 'c')]))
+        );
+        #[cfg(feature = "unicode-case")]
+        assert_eq!(
+            t("(?i)[a-z&&b-y&&c-x]"),
+            hir_case_fold(hir_uclass(&[('c', 'x')]))
+        );
+        #[cfg(feature = "unicode-case")]
+        assert_eq!(
+            t("(?i)[c-da-b&&a-d]"),
+            hir_case_fold(hir_uclass(&[('a', 'd')]))
+        );
+        #[cfg(feature = "unicode-case")]
+        assert_eq!(
+            t("(?i)[a-d&&c-da-b]"),
+            hir_case_fold(hir_uclass(&[('a', 'd')]))
+        );
+
+        // ASCII case folding (`(?i-u)`) needs no Unicode tables.
+        assert_eq!(
+            t("(?i-u)[abc&&b-c]"),
+            hir_case_fold(hir_bclass(&[(b'b', b'c')]))
+        );
+        assert_eq!(
+            t("(?i-u)[abc&&[b-c]]"),
+            hir_case_fold(hir_bclass(&[(b'b', b'c')]))
+        );
+        assert_eq!(
+            t("(?i-u)[[abc]&&[b-c]]"),
+            hir_case_fold(hir_bclass(&[(b'b', b'c')]))
+        );
+        assert_eq!(
+            t("(?i-u)[a-z&&b-y&&c-x]"),
+            hir_case_fold(hir_bclass(&[(b'c', b'x')]))
+        );
+        assert_eq!(
+            t("(?i-u)[c-da-b&&a-d]"),
+            hir_case_fold(hir_bclass(&[(b'a', b'd')]))
+        );
+        assert_eq!(
+            t("(?i-u)[a-d&&c-da-b]"),
+            hir_case_fold(hir_bclass(&[(b'a', b'd')]))
+        );
+
+        // In `[a^]`, `^` does not need to be escaped, so it makes sense that
+        // `^` is also allowed to be unescaped after `&&`.
+        assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')]));
+        // `]` needs to be escaped after `&&` since it's not at start of class.
+        assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')]));
+        assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')]));
+        assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')]));
+        assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')]));
+        // Test precedence: union (`[^c-g]z`) binds tighter than `&&`.
+        assert_eq!(
+            t(r"[a-w&&[^c-g]z]"),
+            hir_uclass(&[('a', 'b'), ('h', 'w')])
+        );
+    }
+
+    #[test]
+    fn class_bracketed_intersect_negate() {
+        // Negation applies to the result of the intersection: `\w && \d`
+        // intersects to `digit`, then the whole thing is negated.
+        #[cfg(feature = "unicode-perl")]
+        assert_eq!(
+            t(r"[^\w&&\d]"),
+            hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+        );
+        assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
+        #[cfg(feature = "unicode-perl")]
+        assert_eq!(
+            t(r"[^[\w&&\d]]"),
+            hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+        );
+        // Double negation cancels.
+        #[cfg(feature = "unicode-perl")]
+        assert_eq!(
+            t(r"[^[^\w&&\d]]"),
+            hir_uclass_query(ClassQuery::Binary("digit"))
+        );
+        // De Morgan: `[^\w] && [^\d]` is the complement of `\w` (since
+        // digits are a subset of word characters).
+        #[cfg(feature = "unicode-perl")]
+        assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word()));
+
+        // Same cases in byte-oriented (`(?-u)`) mode against ASCII classes.
+        #[cfg(feature = "unicode-perl")]
+        assert_eq!(
+            t_bytes(r"(?-u)[^\w&&\d]"),
+            hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
+        );
+        assert_eq!(
+            t_bytes(r"(?-u)[^[a-z&&a-c]]"),
+            hir_negate(hir_bclass(&[(b'a', b'c')]))
+        );
+        assert_eq!(
+            t_bytes(r"(?-u)[^[\w&&\d]]"),
+            hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
+        );
+        assert_eq!(
+            t_bytes(r"(?-u)[^[^\w&&\d]]"),
+            hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
+        );
+        assert_eq!(
+            t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"),
+            hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
+        );
+    }
+
+    #[test]
+    fn class_bracketed_difference() {
+        // `--` is set difference: all letters minus the ASCII range.
+        #[cfg(feature = "unicode-gencat")]
+        assert_eq!(
+            t(r"[\pL--[:ascii:]]"),
+            hir_difference(
+                hir_uclass_query(ClassQuery::Binary("letter")),
+                hir_uclass(&[('\0', '\x7F')])
+            )
+        );
+
+        // Byte mode: alphabetic minus lowercase leaves the uppercase range.
+        assert_eq!(
+            t(r"(?-u)[[:alpha:]--[:lower:]]"),
+            hir_bclass(&[(b'A', b'Z')])
+        );
+    }
+
+    #[test]
+    fn class_bracketed_symmetric_difference() {
+        // `~~` is symmetric difference: codepoints in exactly one operand.
+        // Here: codepoints with Script_Extensions=Greek but Script!=Greek.
+        #[cfg(feature = "unicode-script")]
+        assert_eq!(
+            t(r"[\p{sc:Greek}~~\p{scx:Greek}]"),
+            hir_uclass(&[
+                ('\u{0342}', '\u{0342}'),
+                ('\u{0345}', '\u{0345}'),
+                ('\u{1DC0}', '\u{1DC1}'),
+            ])
+        );
+        // The overlap `c-g` is removed from the union of both ranges.
+        assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')]));
+
+        assert_eq!(
+            t(r"(?-u)[a-g~~c-j]"),
+            hir_bclass(&[(b'a', b'b'), (b'h', b'j')])
+        );
+    }
+
+    #[test]
+    fn ignore_whitespace() {
+        // In `(?x)` mode, literal whitespace and `#` comments are ignored,
+        // even in the middle of escapes like `\x{..}` and `{m,n}` counts.
+        assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3"));
+        assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S"));
+        assert_eq!(
+            t(r"(?x)\x # comment
+{ # comment
+    53 # comment
+} #comment"),
+            hir_lit("S")
+        );
+
+        // Whitespace between `\x` and its two hex digits is also ignored.
+        assert_eq!(t(r"(?x)\x 53"), hir_lit("S"));
+        assert_eq!(
+            t(r"(?x)\x # comment
+        53 # comment"),
+            hir_lit("S")
+        );
+        assert_eq!(t(r"(?x)\x5 3"), hir_lit("S"));
+
+        // The same applies inside `\p{..}` property names.
+        #[cfg(feature = "unicode-gencat")]
+        assert_eq!(
+            t(r"(?x)\p # comment
+{ # comment
+    Separator # comment
+} # comment"),
+            hir_uclass_query(ClassQuery::Binary("separator"))
+        );
+
+        // ...and inside counted repetitions.
+        assert_eq!(
+            t(r"(?x)a # comment
+{ # comment
+    5 # comment
+    , # comment
+    10 # comment
+} # comment"),
+            hir_range(true, 5, Some(10), hir_lit("a"))
+        );
+
+        // An escaped space survives `(?x)` mode.
+        assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a "));
+    }
+
+    #[test]
+    fn analysis_is_utf8() {
+        // Patterns whose HIR can only ever match valid UTF-8. Note that
+        // `\xFF` in Unicode mode is the codepoint U+00FF (encoded as UTF-8),
+        // and word boundaries are always UTF-8 safe.
+        let utf8 = [
+            r"a",
+            r"ab",
+            r"(?-u)a",
+            r"(?-u)ab",
+            r"\xFF",
+            r"\xFF\xFF",
+            r"[^a]",
+            r"[^a][^a]",
+            r"\b",
+            r"\B",
+            r"(?-u)\b",
+            r"(?-u)\B",
+        ];
+        for &pattern in utf8.iter() {
+            assert!(props_bytes(pattern).is_utf8());
+        }
+
+        // With Unicode mode disabled, `\xFF` and negated classes can match
+        // arbitrary bytes, so the HIR is no longer UTF-8 only.
+        let not_utf8 =
+            [r"(?-u)\xFF", r"(?-u)\xFF\xFF", r"(?-u)[^a]", r"(?-u)[^a][^a]"];
+        for &pattern in not_utf8.iter() {
+            assert!(!props_bytes(pattern).is_utf8());
+        }
+    }
+
+    #[test]
+    fn analysis_captures_len() {
+        // Counts explicit (parenthesized) capture groups only; non-capturing
+        // groups and inline flag settings contribute nothing. Written with a
+        // closure to mirror `analysis_static_captures_len` below.
+        let len = |pattern| props(pattern).explicit_captures_len();
+        assert_eq!(0, len(r"a"));
+        assert_eq!(0, len(r"(?:a)"));
+        assert_eq!(0, len(r"(?i-u:a)"));
+        assert_eq!(0, len(r"(?i-u)a"));
+        assert_eq!(1, len(r"(a)"));
+        assert_eq!(1, len(r"(?P<foo>a)"));
+        assert_eq!(1, len(r"()"));
+        assert_eq!(1, len(r"()a"));
+        assert_eq!(1, len(r"(a)+"));
+        assert_eq!(2, len(r"(a)(b)"));
+        assert_eq!(2, len(r"(a)|(b)"));
+        assert_eq!(2, len(r"((a))"));
+        assert_eq!(1, len(r"([a&&b])"));
+    }
+
+    #[test]
+    fn analysis_static_captures_len() {
+        // `Some(n)` means every possible match uses exactly `n` capture
+        // groups; `None` means the count varies between alternatives or
+        // depends on how a repetition matches.
+        let len = |pattern| props(pattern).static_explicit_captures_len();
+        assert_eq!(Some(0), len(r""));
+        assert_eq!(Some(0), len(r"foo|bar"));
+        assert_eq!(None, len(r"(foo)|bar"));
+        assert_eq!(None, len(r"foo|(bar)"));
+        assert_eq!(Some(1), len(r"(foo|bar)"));
+        assert_eq!(Some(1), len(r"(a|b|c|d|e|f)"));
+        assert_eq!(Some(1), len(r"(a)|(b)|(c)|(d)|(e)|(f)"));
+        assert_eq!(Some(2), len(r"(a)(b)|(c)(d)|(e)(f)"));
+        assert_eq!(Some(6), len(r"(a)(b)(c)(d)(e)(f)"));
+        assert_eq!(Some(3), len(r"(a)(b)(extra)|(a)(b)()"));
+        assert_eq!(Some(3), len(r"(a)(b)((?:extra)?)"));
+        // `(extra)?` may or may not participate, so the count is not static.
+        assert_eq!(None, len(r"(a)(b)(extra)?"));
+        assert_eq!(Some(1), len(r"(foo)|(bar)"));
+        assert_eq!(Some(2), len(r"(foo)(bar)"));
+        // `+` guarantees at least one iteration; `*` does not.
+        assert_eq!(Some(2), len(r"(foo)+(bar)"));
+        assert_eq!(None, len(r"(foo)*(bar)"));
+        assert_eq!(Some(0), len(r"(foo)?{0}"));
+        assert_eq!(None, len(r"(foo)?{1}"));
+        assert_eq!(Some(1), len(r"(foo){1}"));
+        assert_eq!(Some(1), len(r"(foo){1,}"));
+        assert_eq!(Some(1), len(r"(foo){1,}?"));
+        assert_eq!(None, len(r"(foo){1,}??"));
+        assert_eq!(None, len(r"(foo){0,}"));
+        assert_eq!(Some(1), len(r"(foo)(?:bar)"));
+        assert_eq!(Some(2), len(r"(foo(?:bar)+)(?:baz(boo))"));
+        assert_eq!(Some(2), len(r"(?P<bar>foo)(?:bar)(bal|loon)"));
+        assert_eq!(
+            Some(2),
+            len(r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#)
+        );
+    }
+
+    #[test]
+    fn analysis_is_all_assertions() {
+        // Asserts that `pattern` produces a non-empty look-around set and
+        // has the given minimum match length.
+        let check = |pattern, min| {
+            let p = props(pattern);
+            assert!(!p.look_set().is_empty());
+            assert_eq!(p.minimum_len(), Some(min));
+        };
+
+        // Positive examples: patterns built purely from zero-width
+        // assertions can match the empty string.
+        check(r"\b", 0);
+        check(r"\B", 0);
+        check(r"^", 0);
+        check(r"$", 0);
+        check(r"\A", 0);
+        check(r"\z", 0);
+        check(r"$^\z\A\b\B", 0);
+        check(r"$|^|\z|\A|\b|\B", 0);
+        check(r"^$|$^", 0);
+        check(r"((\b)+())*^", 0);
+
+        // Negative example: adding a literal forces a non-zero minimum.
+        check(r"^a", 1);
+    }
+
+    #[test]
+    fn analysis_look_set_prefix_any() {
+        // The prefix "any" look set should pick up the ASCII word boundary
+        // even though it appears inside a group and an alternation.
+        let p = props(r"(?-u)(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))");
+        assert!(p.look_set_prefix_any().contains(Look::WordAscii));
+    }
+
+    #[test]
+    fn analysis_is_anchored() {
+        // Anchored-at-start means `^` (Look::Start) is in the prefix look
+        // set; anchored-at-end means `$` (Look::End) is in the suffix set.
+        let is_start = |p| props(p).look_set_prefix().contains(Look::Start);
+        let is_end = |p| props(p).look_set_suffix().contains(Look::End);
+
+        // Positive examples.
+        assert!(is_start(r"^"));
+        assert!(is_end(r"$"));
+
+        assert!(is_start(r"^^"));
+        // NOTE(review): this spells out what `is_end(r"$$")` would do.
+        assert!(props(r"$$").look_set_suffix().contains(Look::End));
+
+        assert!(is_start(r"^$"));
+        assert!(is_end(r"^$"));
+
+        assert!(is_start(r"^foo"));
+        assert!(is_end(r"foo$"));
+
+        assert!(is_start(r"^foo|^bar"));
+        assert!(is_end(r"foo$|bar$"));
+
+        assert!(is_start(r"^(foo|bar)"));
+        assert!(is_end(r"(foo|bar)$"));
+
+        assert!(is_start(r"^+"));
+        assert!(is_end(r"$+"));
+        assert!(is_start(r"^++"));
+        assert!(is_end(r"$++"));
+        assert!(is_start(r"(^)+"));
+        assert!(is_end(r"($)+"));
+
+        assert!(is_start(r"$^"));
+        // NOTE(review): duplicated assertion; given the symmetric pairs
+        // everywhere else in this test, the second one was likely meant to
+        // be `is_end(r"$^")` — confirm against upstream before changing.
+        assert!(is_start(r"$^"));
+        assert!(is_start(r"$^|^$"));
+        assert!(is_end(r"$^|^$"));
+
+        // Zero-width neighbors don't break anchoring.
+        assert!(is_start(r"\b^"));
+        assert!(is_end(r"$\b"));
+        assert!(is_start(r"^(?m:^)"));
+        assert!(is_end(r"(?m:$)$"));
+        assert!(is_start(r"(?m:^)^"));
+        assert!(is_end(r"$(?m:$)"));
+
+        // Negative examples: multi-line anchors are distinct looks.
+        assert!(!is_start(r"(?m)^"));
+        assert!(!is_end(r"(?m)$"));
+        assert!(!is_start(r"(?m:^$)|$^"));
+        assert!(!is_end(r"(?m:^$)|$^"));
+        assert!(!is_start(r"$^|(?m:^$)"));
+        assert!(!is_end(r"$^|(?m:^$)"));
+
+        // A preceding/following literal removes the anchor from the edge.
+        assert!(!is_start(r"a^"));
+        assert!(!is_start(r"$a"));
+
+        assert!(!is_end(r"a^"));
+        assert!(!is_end(r"$a"));
+
+        // Only one alternative is anchored, so the whole pattern is not.
+        assert!(!is_start(r"^foo|bar"));
+        assert!(!is_end(r"foo|bar$"));
+
+        // `*` permits zero iterations, which drops the anchor.
+        assert!(!is_start(r"^*"));
+        assert!(!is_end(r"$*"));
+        assert!(!is_start(r"^*+"));
+        assert!(!is_end(r"$*+"));
+        assert!(!is_start(r"^+*"));
+        assert!(!is_end(r"$+*"));
+        assert!(!is_start(r"(^)*"));
+        assert!(!is_end(r"($)*"));
+    }
+
+    #[test]
+    fn analysis_is_any_anchored() {
+        // Unlike the prefix/suffix sets above, this checks the full look
+        // set: whether an anchor appears *anywhere* in the pattern.
+        let is_start = |p| props(p).look_set().contains(Look::Start);
+        let is_end = |p| props(p).look_set().contains(Look::End);
+
+        // Positive examples.
+        assert!(is_start(r"^"));
+        assert!(is_end(r"$"));
+        assert!(is_start(r"\A"));
+        assert!(is_end(r"\z"));
+
+        // Negative examples: multi-line anchors and the "wrong" anchor kind
+        // do not count.
+        assert!(!is_start(r"(?m)^"));
+        assert!(!is_end(r"(?m)$"));
+        assert!(!is_start(r"$"));
+        assert!(!is_end(r"^"));
+    }
+
+    #[test]
+    fn analysis_can_empty() {
+        // A pattern "can be empty" iff its minimum match length is zero.
+        // Positive examples.
+        let assert_empty =
+            |p| assert_eq!(Some(0), props_bytes(p).minimum_len());
+        assert_empty(r"");
+        assert_empty(r"()");
+        assert_empty(r"()*");
+        assert_empty(r"()+");
+        assert_empty(r"()?");
+        assert_empty(r"a*");
+        assert_empty(r"a?");
+        assert_empty(r"a{0}");
+        assert_empty(r"a{0,}");
+        assert_empty(r"a{0,1}");
+        assert_empty(r"a{0,10}");
+        #[cfg(feature = "unicode-gencat")]
+        assert_empty(r"\pL*");
+        // An alternation can be empty if any branch can be.
+        assert_empty(r"a*|b");
+        assert_empty(r"b|a*");
+        assert_empty(r"a|");
+        assert_empty(r"|a");
+        assert_empty(r"a||b");
+        assert_empty(r"a*a?(abcd)*");
+        // Zero-width assertions always have a zero minimum length.
+        assert_empty(r"^");
+        assert_empty(r"$");
+        assert_empty(r"(?m)^");
+        assert_empty(r"(?m)$");
+        assert_empty(r"\A");
+        assert_empty(r"\z");
+        assert_empty(r"\B");
+        assert_empty(r"(?-u)\B");
+        assert_empty(r"\b");
+        assert_empty(r"(?-u)\b");
+
+        // Negative examples. Note that empty classes (e.g. `[a--a]`) can
+        // never match at all, so their minimum length is `None`, not zero.
+        let assert_non_empty =
+            |p| assert_ne!(Some(0), props_bytes(p).minimum_len());
+        assert_non_empty(r"a+");
+        assert_non_empty(r"a{1}");
+        assert_non_empty(r"a{1,}");
+        assert_non_empty(r"a{1,2}");
+        assert_non_empty(r"a{1,10}");
+        assert_non_empty(r"b|a");
+        assert_non_empty(r"a*a+(abcd)*");
+        #[cfg(feature = "unicode-gencat")]
+        assert_non_empty(r"\P{any}");
+        assert_non_empty(r"[a--a]");
+        assert_non_empty(r"[a&&b]");
+    }
+
+    #[test]
+    fn analysis_is_literal() {
+        // Patterns that translate into a single literal string. Groups that
+        // merely wrap literals and single-element classes still count.
+        let literal = [
+            r"a",
+            r"ab",
+            r"abc",
+            r"(?m)abc",
+            r"(?:a)",
+            r"foo(?:a)",
+            r"(?:a)foo",
+            r"[a]",
+        ];
+        for &pattern in literal.iter() {
+            assert!(props(pattern).is_literal());
+        }
+
+        // Patterns that are not a single literal: empty, assertions,
+        // alternations, capture groups, repetitions and multi-char classes.
+        let not_literal = [
+            r"",
+            r"^",
+            r"a|b",
+            r"(a)",
+            r"a+",
+            r"foo(a)",
+            r"(a)foo",
+            r"[ab]",
+        ];
+        for &pattern in not_literal.iter() {
+            assert!(!props(pattern).is_literal());
+        }
+    }
+
+    #[test]
+    fn analysis_is_alternation_literal() {
+        // Patterns that are an alternation of plain literals (or a single
+        // literal). Non-capturing groups around literals are transparent.
+        let alt_literal = [
+            r"a",
+            r"ab",
+            r"abc",
+            r"(?m)abc",
+            r"foo|bar",
+            r"foo|bar|baz",
+            r"[a]",
+            r"(?:ab)|cd",
+            r"ab|(?:cd)",
+        ];
+        for &pattern in alt_literal.iter() {
+            assert!(props(pattern).is_alternation_literal());
+        }
+
+        // Patterns that are not. Note that single-char alternations like
+        // `a|b` are turned into character classes by the smart constructors,
+        // so they no longer qualify as alternations of literals.
+        let not_alt_literal = [
+            r"",
+            r"^",
+            r"(a)",
+            r"a+",
+            r"foo(a)",
+            r"(a)foo",
+            r"[ab]",
+            r"[ab]|b",
+            r"a|[ab]",
+            r"(a)|b",
+            r"a|(b)",
+            r"a|b",
+            r"a|b|c",
+            r"[a]|b",
+            r"a|[b]",
+            r"(?:a)|b",
+            r"a|(?:b)",
+            r"(?:z|xx)@|xx",
+        ];
+        for &pattern in not_alt_literal.iter() {
+            assert!(!props(pattern).is_alternation_literal());
+        }
+    }
+
+    // This tests that the smart Hir::repetition constructors does some basic
+    // simplifications.
+    #[test]
+    fn smart_repetition() {
+        // `x{0}` collapses to the empty expression and `x{1}` to `x` itself.
+        assert_eq!(t(r"a{0}"), Hir::empty());
+        assert_eq!(t(r"a{1}"), hir_lit("a"));
+        // Repeating a zero-width assertion collapses to the assertion.
+        assert_eq!(t(r"\B{32111}"), hir_look(hir::Look::WordUnicodeNegate));
+    }
+
+    // This tests that the smart Hir::concat constructor simplifies the given
+    // exprs in a way we expect.
+    #[test]
+    fn smart_concat() {
+        assert_eq!(t(""), Hir::empty());
+        assert_eq!(t("(?:)"), Hir::empty());
+        assert_eq!(t("abc"), hir_lit("abc"));
+        // Adjacent literals are flattened into one literal, even across
+        // non-capturing group boundaries.
+        assert_eq!(t("(?:foo)(?:bar)"), hir_lit("foobar"));
+        assert_eq!(t("quux(?:foo)(?:bar)baz"), hir_lit("quuxfoobarbaz"));
+        // Literals merge around a zero-width assertion, which remains as a
+        // separate element of the concatenation.
+        assert_eq!(
+            t("foo(?:bar^baz)quux"),
+            hir_cat(vec![
+                hir_lit("foobar"),
+                hir_look(hir::Look::Start),
+                hir_lit("bazquux"),
+            ])
+        );
+        // The same holds for arbitrarily nested groups.
+        assert_eq!(
+            t("foo(?:ba(?:r^b)az)quux"),
+            hir_cat(vec![
+                hir_lit("foobar"),
+                hir_look(hir::Look::Start),
+                hir_lit("bazquux"),
+            ])
+        );
+    }
+
+    // This tests that the smart Hir::alternation constructor simplifies the
+    // given exprs in a way we expect.
+    #[test]
+    fn smart_alternation() {
+        assert_eq!(
+            t("(?:foo)|(?:bar)"),
+            hir_alt(vec![hir_lit("foo"), hir_lit("bar")])
+        );
+        // Nested alternations are flattened into one.
+        assert_eq!(
+            t("quux|(?:abc|def|xyz)|baz"),
+            hir_alt(vec![
+                hir_lit("quux"),
+                hir_lit("abc"),
+                hir_lit("def"),
+                hir_lit("xyz"),
+                hir_lit("baz"),
+            ])
+        );
+        assert_eq!(
+            t("quux|(?:abc|(?:def|mno)|xyz)|baz"),
+            hir_alt(vec![
+                hir_lit("quux"),
+                hir_lit("abc"),
+                hir_lit("def"),
+                hir_lit("mno"),
+                hir_lit("xyz"),
+                hir_lit("baz"),
+            ])
+        );
+        // An alternation of single characters becomes a character class.
+        assert_eq!(
+            t("a|b|c|d|e|f|x|y|z"),
+            hir_uclass(&[('a', 'f'), ('x', 'z')]),
+        );
+        // Tests that we lift common prefixes out of an alternation.
+        assert_eq!(
+            t("[A-Z]foo|[A-Z]quux"),
+            hir_cat(vec![
+                hir_uclass(&[('A', 'Z')]),
+                hir_alt(vec![hir_lit("foo"), hir_lit("quux")]),
+            ]),
+        );
+        assert_eq!(
+            t("[A-Z][A-Z]|[A-Z]quux"),
+            hir_cat(vec![
+                hir_uclass(&[('A', 'Z')]),
+                hir_alt(vec![hir_uclass(&[('A', 'Z')]), hir_lit("quux")]),
+            ]),
+        );
+        // A fully-shared prefix leaves an empty branch behind.
+        assert_eq!(
+            t("[A-Z][A-Z]|[A-Z][A-Z]quux"),
+            hir_cat(vec![
+                hir_uclass(&[('A', 'Z')]),
+                hir_uclass(&[('A', 'Z')]),
+                hir_alt(vec![Hir::empty(), hir_lit("quux")]),
+            ]),
+        );
+        // Only the class is lifted; the shared literal text stays in the
+        // branches.
+        assert_eq!(
+            t("[A-Z]foo|[A-Z]foobar"),
+            hir_cat(vec![
+                hir_uclass(&[('A', 'Z')]),
+                hir_alt(vec![hir_lit("foo"), hir_lit("foobar")]),
+            ]),
+        );
+    }
+
+    #[test]
+    fn regression_alt_empty_concat() {
+        use crate::ast::{self, Ast};
+
+        // An alternation with a single empty-concat branch, built directly
+        // rather than parsed, must translate to the empty HIR.
+        let span = Span::splat(Position::new(0, 0, 0));
+        let ast = Ast::alternation(ast::Alternation {
+            span,
+            asts: vec![Ast::concat(ast::Concat { span, asts: vec![] })],
+        });
+
+        let mut t = Translator::new();
+        assert_eq!(Ok(Hir::empty()), t.translate("", &ast));
+    }
+
+    #[test]
+    fn regression_empty_alt() {
+        use crate::ast::{self, Ast};
+
+        // An alternation with zero branches can never match anything, so it
+        // must translate to the "fail" HIR rather than panic.
+        let span = Span::splat(Position::new(0, 0, 0));
+        let ast = Ast::concat(ast::Concat {
+            span,
+            asts: vec![Ast::alternation(ast::Alternation {
+                span,
+                asts: vec![],
+            })],
+        });
+
+        let mut t = Translator::new();
+        assert_eq!(Ok(Hir::fail()), t.translate("", &ast));
+    }
+
+    #[test]
+    fn regression_singleton_alt() {
+        use crate::{
+            ast::{self, Ast},
+            hir::Dot,
+        };
+
+        // An alternation with exactly one branch must collapse to that
+        // branch's translation instead of wrapping it.
+        let span = Span::splat(Position::new(0, 0, 0));
+        let ast = Ast::concat(ast::Concat {
+            span,
+            asts: vec![Ast::alternation(ast::Alternation {
+                span,
+                asts: vec![Ast::dot(span)],
+            })],
+        });
+
+        let mut t = Translator::new();
+        assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast));
+    }
+
+    // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168
+    #[test]
+    fn regression_fuzz_match() {
+        // Fuzzer-found pattern; exercises whitespace-insensitive parsing
+        // combined with swapped greediness during translation.
+        let pat = "[(\u{6} \0-\u{afdf5}] \0 ";
+        let ast = ParserBuilder::new()
+            .octal(false)
+            .ignore_whitespace(true)
+            .build()
+            .parse(pat)
+            .unwrap();
+        let hir = TranslatorBuilder::new()
+            .utf8(true)
+            .case_insensitive(false)
+            .multi_line(false)
+            .dot_matches_new_line(false)
+            .swap_greed(true)
+            .unicode(true)
+            .build()
+            .translate(pat, &ast)
+            .unwrap();
+        assert_eq!(
+            hir,
+            Hir::concat(vec![
+                hir_uclass(&[('\0', '\u{afdf5}')]),
+                hir_lit("\0"),
+            ])
+        );
+    }
+
+    // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155
+    #[cfg(feature = "unicode")]
+    #[test]
+    fn regression_fuzz_difference1() {
+        // Fuzzer-found pattern mixing class difference (`--`) with Unicode
+        // property classes; only checks translation completes without panic.
+        let pat = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*";
+        let _ = t(pat); // shouldn't panic
+    }
+
+    // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153
+    #[test]
+    fn regression_fuzz_char_decrement1() {
+        // Fuzzer-found pattern that previously hit a character-decrement
+        // bug in class handling; only checks translation does not panic.
+        let pat = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0]<D\0\0\0\0\0\0\u{1}]\0\0\0\0]\0\0-*\0]\0\0 ";
+        let _ = t(pat); // shouldn't panic
+    }
+}
diff --git a/vendor/regex-syntax/src/hir/visitor.rs b/vendor/regex-syntax/src/hir/visitor.rs
new file mode 100644
index 0000000..f30f0a1
--- /dev/null
+++ b/vendor/regex-syntax/src/hir/visitor.rs
@@ -0,0 +1,215 @@
+use alloc::{vec, vec::Vec};
+
+use crate::hir::{self, Hir, HirKind};
+
+/// A trait for visiting the high-level IR (HIR) in depth first order.
+///
+/// The principle aim of this trait is to enable callers to perform case
+/// analysis on a high-level intermediate representation of a regular
+/// expression without necessarily using recursion. In particular, this permits
+/// callers to do case analysis with constant stack usage, which can be
+/// important since the size of an HIR may be proportional to end user input.
+///
+/// Typical usage of this trait involves providing an implementation and then
+/// running it using the [`visit`] function.
+pub trait Visitor {
+    /// The result of visiting an HIR.
+    type Output;
+    /// An error that visiting an HIR might return.
+    type Err;
+
+    /// All implementors of `Visitor` must provide a `finish` method, which
+    /// yields the result of visiting the HIR or an error.
+    fn finish(self) -> Result<Self::Output, Self::Err>;
+
+    /// This method is called before beginning traversal of the HIR.
+    ///
+    /// The default implementation does nothing.
+    fn start(&mut self) {}
+
+    /// This method is called on an `Hir` before descending into child `Hir`
+    /// nodes.
+    ///
+    /// The default implementation does nothing and returns `Ok(())`.
+    fn visit_pre(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
+        Ok(())
+    }
+
+    /// This method is called on an `Hir` after descending all of its child
+    /// `Hir` nodes.
+    ///
+    /// The default implementation does nothing and returns `Ok(())`.
+    fn visit_post(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
+        Ok(())
+    }
+
+    /// This method is called between child nodes of an alternation.
+    ///
+    /// The default implementation does nothing and returns `Ok(())`.
+    fn visit_alternation_in(&mut self) -> Result<(), Self::Err> {
+        Ok(())
+    }
+
+    /// This method is called between child nodes of a concatenation.
+    ///
+    /// The default implementation does nothing and returns `Ok(())`.
+    fn visit_concat_in(&mut self) -> Result<(), Self::Err> {
+        Ok(())
+    }
+}
+
+/// Executes an implementation of `Visitor` in constant stack space.
+///
+/// This function will visit every node in the given `Hir` while calling
+/// appropriate methods provided by the [`Visitor`] trait.
+///
+/// The primary use case for this method is when one wants to perform case
+/// analysis over an `Hir` without using a stack size proportional to the depth
+/// of the `Hir`. Namely, this method will instead use constant stack space,
+/// but will use heap space proportional to the size of the `Hir`. This may be
+/// desirable in cases where the size of `Hir` is proportional to end user
+/// input.
+///
+/// If the visitor returns an error at any point, then visiting is stopped and
+/// the error is returned.
+pub fn visit<V: Visitor>(hir: &Hir, visitor: V) -> Result<V::Output, V::Err> {
+    // All of the traversal state lives on the heap inside the driver.
+    let mut driver = HeapVisitor::new();
+    driver.visit(hir, visitor)
+}
+
+/// HeapVisitor visits every item in an `Hir` recursively using constant stack
+/// size and a heap size proportional to the size of the `Hir`.
+struct HeapVisitor<'a> {
+    /// A stack of `Hir` nodes. This is roughly analogous to the call stack
+    /// used in a typical recursive visitor. Each entry pairs a node with the
+    /// `Frame` describing how far we have progressed through its children.
+    stack: Vec<(&'a Hir, Frame<'a>)>,
+}
+
+/// Represents a single stack frame while performing structural induction over
+/// an `Hir`. The lifetime `'a` ties each frame to the `Hir` being visited.
+enum Frame<'a> {
+    /// A stack frame allocated just before descending into a repetition
+    /// operator's child node.
+    Repetition(&'a hir::Repetition),
+    /// A stack frame allocated just before descending into a capture's child
+    /// node.
+    Capture(&'a hir::Capture),
+    /// The stack frame used while visiting every child node of a concatenation
+    /// of expressions.
+    Concat {
+        /// The child node we are currently visiting.
+        head: &'a Hir,
+        /// The remaining child nodes to visit (which may be empty).
+        tail: &'a [Hir],
+    },
+    /// The stack frame used while visiting every child node of an alternation
+    /// of expressions.
+    Alternation {
+        /// The child node we are currently visiting.
+        head: &'a Hir,
+        /// The remaining child nodes to visit (which may be empty).
+        tail: &'a [Hir],
+    },
+}
+
+impl<'a> HeapVisitor<'a> {
+    /// Create a visitor driver with an empty traversal stack.
+    fn new() -> HeapVisitor<'a> {
+        HeapVisitor { stack: vec![] }
+    }
+
+    /// Drive `visitor` over every node of `hir` in depth-first order,
+    /// using `self.stack` in place of the call stack. Errors from any
+    /// visitor callback abort the traversal immediately (via `?`).
+    fn visit<V: Visitor>(
+        &mut self,
+        mut hir: &'a Hir,
+        mut visitor: V,
+    ) -> Result<V::Output, V::Err> {
+        self.stack.clear();
+
+        visitor.start();
+        loop {
+            visitor.visit_pre(hir)?;
+            // Descend into the first child, remembering our position in
+            // `hir`'s children on the stack.
+            if let Some(x) = self.induct(hir) {
+                let child = x.child();
+                self.stack.push((hir, x));
+                hir = child;
+                continue;
+            }
+            // No induction means we have a base case, so we can post visit
+            // it now.
+            visitor.visit_post(hir)?;
+
+            // At this point, we now try to pop our call stack until it is
+            // either empty or we hit another inductive case.
+            loop {
+                let (post_hir, frame) = match self.stack.pop() {
+                    None => return visitor.finish(),
+                    Some((post_hir, frame)) => (post_hir, frame),
+                };
+                // If this is a concat/alternate, then we might have additional
+                // inductive steps to process.
+                if let Some(x) = self.pop(frame) {
+                    match x {
+                        Frame::Alternation { .. } => {
+                            visitor.visit_alternation_in()?;
+                        }
+                        Frame::Concat { .. } => {
+                            visitor.visit_concat_in()?;
+                        }
+                        _ => {}
+                    }
+                    hir = x.child();
+                    self.stack.push((post_hir, x));
+                    break;
+                }
+                // Otherwise, we've finished visiting all the child nodes for
+                // this HIR, so we can post visit it now.
+                visitor.visit_post(post_hir)?;
+            }
+        }
+    }
+
+    /// Build a stack frame for the given HIR if one is needed (which occurs if
+    /// and only if there are child nodes in the HIR). Otherwise, return None.
+    fn induct(&mut self, hir: &'a Hir) -> Option<Frame<'a>> {
+        match *hir.kind() {
+            HirKind::Repetition(ref x) => Some(Frame::Repetition(x)),
+            HirKind::Capture(ref x) => Some(Frame::Capture(x)),
+            // Empty concats/alternations have no children to descend into.
+            HirKind::Concat(ref x) if x.is_empty() => None,
+            HirKind::Concat(ref x) => {
+                Some(Frame::Concat { head: &x[0], tail: &x[1..] })
+            }
+            HirKind::Alternation(ref x) if x.is_empty() => None,
+            HirKind::Alternation(ref x) => {
+                Some(Frame::Alternation { head: &x[0], tail: &x[1..] })
+            }
+            _ => None,
+        }
+    }
+
+    /// Pops the given frame. If the frame has an additional inductive step,
+    /// then return it, otherwise return `None`.
+    fn pop(&self, induct: Frame<'a>) -> Option<Frame<'a>> {
+        match induct {
+            // Repetitions and captures have exactly one child, so popping
+            // them never yields another step.
+            Frame::Repetition(_) => None,
+            Frame::Capture(_) => None,
+            Frame::Concat { tail, .. } => {
+                if tail.is_empty() {
+                    None
+                } else {
+                    Some(Frame::Concat { head: &tail[0], tail: &tail[1..] })
+                }
+            }
+            Frame::Alternation { tail, .. } => {
+                if tail.is_empty() {
+                    None
+                } else {
+                    Some(Frame::Alternation {
+                        head: &tail[0],
+                        tail: &tail[1..],
+                    })
+                }
+            }
+        }
+    }
+}
+
+ /// Pops the given frame. If the frame has an additional inductive step,
+ /// then return it, otherwise return `None`.
+ fn pop(&self, induct: Frame<'a>) -> Option<Frame<'a>> {
+ match induct {
+ Frame::Repetition(_) => None,
+ Frame::Capture(_) => None,
+ Frame::Concat { tail, .. } => {
+ if tail.is_empty() {
+ None
+ } else {
+ Some(Frame::Concat { head: &tail[0], tail: &tail[1..] })
+ }
+ }
+ Frame::Alternation { tail, .. } => {
+ if tail.is_empty() {
+ None
+ } else {
+ Some(Frame::Alternation {
+ head: &tail[0],
+ tail: &tail[1..],
+ })
+ }
+ }
+ }
+ }
+}
+
+impl<'a> Frame<'a> {
+    /// Perform the next inductive step on this frame and return the next
+    /// child HIR node to visit.
+    fn child(&self) -> &'a Hir {
+        match *self {
+            Frame::Repetition(rep) => &rep.sub,
+            Frame::Capture(cap) => &cap.sub,
+            // Concat and alternation frames both track their current child
+            // in `head`, so a single or-pattern covers them.
+            Frame::Concat { head, .. } | Frame::Alternation { head, .. } => {
+                head
+            }
+        }
+    }
+}