summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex-syntax/src/hir
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 09:22:09 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 09:22:09 +0000
commit43a97878ce14b72f0981164f87f2e35e14151312 (patch)
tree620249daf56c0258faa40cbdcf9cfba06de2a846 /third_party/rust/regex-syntax/src/hir
parentInitial commit. (diff)
downloadfirefox-upstream.tar.xz
firefox-upstream.zip
Adding upstream version 110.0.1.upstream/110.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/regex-syntax/src/hir')
-rw-r--r--third_party/rust/regex-syntax/src/hir/interval.rs520
-rw-r--r--third_party/rust/regex-syntax/src/hir/literal/mod.rs1686
-rw-r--r--third_party/rust/regex-syntax/src/hir/mod.rs2296
-rw-r--r--third_party/rust/regex-syntax/src/hir/print.rs367
-rw-r--r--third_party/rust/regex-syntax/src/hir/translate.rs3207
-rw-r--r--third_party/rust/regex-syntax/src/hir/visitor.rs203
6 files changed, 8279 insertions, 0 deletions
diff --git a/third_party/rust/regex-syntax/src/hir/interval.rs b/third_party/rust/regex-syntax/src/hir/interval.rs
new file mode 100644
index 0000000000..56698c53af
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/hir/interval.rs
@@ -0,0 +1,520 @@
+use std::char;
+use std::cmp;
+use std::fmt::Debug;
+use std::slice;
+use std::u8;
+
+use crate::unicode;
+
+// This module contains an *internal* implementation of interval sets.
+//
+// The primary invariant that interval sets guards is canonical ordering. That
+// is, every interval set contains an ordered sequence of intervals where
+// no two intervals are overlapping or adjacent. While this invariant is
+// occasionally broken within the implementation, it should be impossible for
+// callers to observe it.
+//
+// Since case folding (as implemented below) breaks that invariant, we roll
+// that into this API even though it is a little out of place in an otherwise
+// generic interval set. (Hence the reason why the `unicode` module is imported
+// here.)
+//
+// Some of the implementation complexity here is a result of me wanting to
+// preserve the sequential representation without using additional memory.
+// In many cases, we do use linear extra memory, but it is at most 2x and it
+// is amortized. If we relaxed the memory requirements, this implementation
+// could become much simpler. The extra memory is honestly probably OK, but
+// character classes (especially of the Unicode variety) can become quite
+// large, and it would be nice to keep regex compilation snappy even in debug
+// builds. (In the past, I have been careless with this area of code and it has
+// caused slow regex compilations in debug mode, so this isn't entirely
+// unwarranted.)
+//
+// Tests on this are relegated to the public API of HIR in src/hir.rs.
+
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct IntervalSet<I> {
+ ranges: Vec<I>,
+}
+
+impl<I: Interval> IntervalSet<I> {
+ /// Create a new set from a sequence of intervals. Each interval is
+ /// specified as a pair of bounds, where both bounds are inclusive.
+ ///
+ /// The given ranges do not need to be in any specific order, and ranges
+ /// may overlap.
+ pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
+ let mut set = IntervalSet { ranges: intervals.into_iter().collect() };
+ set.canonicalize();
+ set
+ }
+
+ /// Add a new interval to this set.
+ pub fn push(&mut self, interval: I) {
+ // TODO: This could be faster. e.g., Push the interval such that
+ // it preserves canonicalization.
+ self.ranges.push(interval);
+ self.canonicalize();
+ }
+
+ /// Return an iterator over all intervals in this set.
+ ///
+ /// The iterator yields intervals in ascending order.
+ pub fn iter(&self) -> IntervalSetIter<'_, I> {
+ IntervalSetIter(self.ranges.iter())
+ }
+
+ /// Return an immutable slice of intervals in this set.
+ ///
+ /// The sequence returned is in canonical ordering.
+ pub fn intervals(&self) -> &[I] {
+ &self.ranges
+ }
+
+ /// Expand this interval set such that it contains all case folded
+ /// characters. For example, if this class consists of the range `a-z`,
+ /// then applying case folding will result in the class containing both the
+ /// ranges `a-z` and `A-Z`.
+ ///
+ /// This returns an error if the necessary case mapping data is not
+ /// available.
+ pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
+ let len = self.ranges.len();
+ for i in 0..len {
+ let range = self.ranges[i];
+ if let Err(err) = range.case_fold_simple(&mut self.ranges) {
+ self.canonicalize();
+ return Err(err);
+ }
+ }
+ self.canonicalize();
+ Ok(())
+ }
+
+ /// Union this set with the given set, in place.
+ pub fn union(&mut self, other: &IntervalSet<I>) {
+ // This could almost certainly be done more efficiently.
+ self.ranges.extend(&other.ranges);
+ self.canonicalize();
+ }
+
+ /// Intersect this set with the given set, in place.
+ pub fn intersect(&mut self, other: &IntervalSet<I>) {
+ if self.ranges.is_empty() {
+ return;
+ }
+ if other.ranges.is_empty() {
+ self.ranges.clear();
+ return;
+ }
+
+ // There should be a way to do this in-place with constant memory,
+ // but I couldn't figure out a simple way to do it. So just append
+ // the intersection to the end of this range, and then drain it before
+ // we're done.
+ let drain_end = self.ranges.len();
+
+ let mut ita = 0..drain_end;
+ let mut itb = 0..other.ranges.len();
+ let mut a = ita.next().unwrap();
+ let mut b = itb.next().unwrap();
+ loop {
+ if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) {
+ self.ranges.push(ab);
+ }
+ let (it, aorb) =
+ if self.ranges[a].upper() < other.ranges[b].upper() {
+ (&mut ita, &mut a)
+ } else {
+ (&mut itb, &mut b)
+ };
+ match it.next() {
+ Some(v) => *aorb = v,
+ None => break,
+ }
+ }
+ self.ranges.drain(..drain_end);
+ }
+
+ /// Subtract the given set from this set, in place.
+ pub fn difference(&mut self, other: &IntervalSet<I>) {
+ if self.ranges.is_empty() || other.ranges.is_empty() {
+ return;
+ }
+
+ // This algorithm is (to me) surprisingly complex. A search of the
+ // interwebs indicate that this is a potentially interesting problem.
+ // Folks seem to suggest interval or segment trees, but I'd like to
+ // avoid the overhead (both runtime and conceptual) of that.
+ //
+ // The following is basically my Shitty First Draft. Therefore, in
+ // order to grok it, you probably need to read each line carefully.
+ // Simplifications are most welcome!
+ //
+ // Remember, we can assume the canonical format invariant here, which
+ // says that all ranges are sorted, not overlapping and not adjacent in
+ // each class.
+ let drain_end = self.ranges.len();
+ let (mut a, mut b) = (0, 0);
+ 'LOOP: while a < drain_end && b < other.ranges.len() {
+ // Basically, the easy cases are when neither range overlaps with
+ // each other. If the `b` range is less than our current `a`
+ // range, then we can skip it and move on.
+ if other.ranges[b].upper() < self.ranges[a].lower() {
+ b += 1;
+ continue;
+ }
+ // ... similarly for the `a` range. If it's less than the smallest
+ // `b` range, then we can add it as-is.
+ if self.ranges[a].upper() < other.ranges[b].lower() {
+ let range = self.ranges[a];
+ self.ranges.push(range);
+ a += 1;
+ continue;
+ }
+ // Otherwise, we have overlapping ranges.
+ assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b]));
+
+ // This part is tricky and was non-obvious to me without looking
+ // at explicit examples (see the tests). The trickiness stems from
+ // two things: 1) subtracting a range from another range could
+ // yield two ranges and 2) after subtracting a range, it's possible
+ // that future ranges can have an impact. The loop below advances
+ // the `b` ranges until they can't possible impact the current
+ // range.
+ //
+ // For example, if our `a` range is `a-t` and our next three `b`
+ // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
+ // subtraction three times before moving on to the next `a` range.
+ let mut range = self.ranges[a];
+ while b < other.ranges.len()
+ && !range.is_intersection_empty(&other.ranges[b])
+ {
+ let old_range = range;
+ range = match range.difference(&other.ranges[b]) {
+ (None, None) => {
+ // We lost the entire range, so move on to the next
+ // without adding this one.
+ a += 1;
+ continue 'LOOP;
+ }
+ (Some(range1), None) | (None, Some(range1)) => range1,
+ (Some(range1), Some(range2)) => {
+ self.ranges.push(range1);
+ range2
+ }
+ };
+ // It's possible that the `b` range has more to contribute
+ // here. In particular, if it is greater than the original
+ // range, then it might impact the next `a` range *and* it
+ // has impacted the current `a` range as much as possible,
+ // so we can quit. We don't bump `b` so that the next `a`
+ // range can apply it.
+ if other.ranges[b].upper() > old_range.upper() {
+ break;
+ }
+ // Otherwise, the next `b` range might apply to the current
+ // `a` range.
+ b += 1;
+ }
+ self.ranges.push(range);
+ a += 1;
+ }
+ while a < drain_end {
+ let range = self.ranges[a];
+ self.ranges.push(range);
+ a += 1;
+ }
+ self.ranges.drain(..drain_end);
+ }
+
+ /// Compute the symmetric difference of the two sets, in place.
+ ///
+ /// This computes the symmetric difference of two interval sets. This
+ /// removes all elements in this set that are also in the given set,
+ /// but also adds all elements from the given set that aren't in this
+ /// set. That is, the set will contain all elements in either set,
+ /// but will not contain any elements that are in both sets.
+ pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) {
+ // TODO(burntsushi): Fix this so that it amortizes allocation.
+ let mut intersection = self.clone();
+ intersection.intersect(other);
+ self.union(other);
+ self.difference(&intersection);
+ }
+
+ /// Negate this interval set.
+ ///
+ /// For all `x` where `x` is any element, if `x` was in this set, then it
+ /// will not be in this set after negation.
+ pub fn negate(&mut self) {
+ if self.ranges.is_empty() {
+ let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
+ self.ranges.push(I::create(min, max));
+ return;
+ }
+
+ // There should be a way to do this in-place with constant memory,
+ // but I couldn't figure out a simple way to do it. So just append
+ // the negation to the end of this range, and then drain it before
+ // we're done.
+ let drain_end = self.ranges.len();
+
+ // We do checked arithmetic below because of the canonical ordering
+ // invariant.
+ if self.ranges[0].lower() > I::Bound::min_value() {
+ let upper = self.ranges[0].lower().decrement();
+ self.ranges.push(I::create(I::Bound::min_value(), upper));
+ }
+ for i in 1..drain_end {
+ let lower = self.ranges[i - 1].upper().increment();
+ let upper = self.ranges[i].lower().decrement();
+ self.ranges.push(I::create(lower, upper));
+ }
+ if self.ranges[drain_end - 1].upper() < I::Bound::max_value() {
+ let lower = self.ranges[drain_end - 1].upper().increment();
+ self.ranges.push(I::create(lower, I::Bound::max_value()));
+ }
+ self.ranges.drain(..drain_end);
+ }
+
+ /// Converts this set into a canonical ordering.
+ fn canonicalize(&mut self) {
+ if self.is_canonical() {
+ return;
+ }
+ self.ranges.sort();
+ assert!(!self.ranges.is_empty());
+
+ // Is there a way to do this in-place with constant memory? I couldn't
+ // figure out a way to do it. So just append the canonicalization to
+ // the end of this range, and then drain it before we're done.
+ let drain_end = self.ranges.len();
+ for oldi in 0..drain_end {
+ // If we've added at least one new range, then check if we can
+ // merge this range in the previously added range.
+ if self.ranges.len() > drain_end {
+ let (last, rest) = self.ranges.split_last_mut().unwrap();
+ if let Some(union) = last.union(&rest[oldi]) {
+ *last = union;
+ continue;
+ }
+ }
+ let range = self.ranges[oldi];
+ self.ranges.push(range);
+ }
+ self.ranges.drain(..drain_end);
+ }
+
+ /// Returns true if and only if this class is in a canonical ordering.
+ fn is_canonical(&self) -> bool {
+ for pair in self.ranges.windows(2) {
+ if pair[0] >= pair[1] {
+ return false;
+ }
+ if pair[0].is_contiguous(&pair[1]) {
+ return false;
+ }
+ }
+ true
+ }
+}
+
+/// An iterator over intervals.
+#[derive(Debug)]
+pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);
+
+impl<'a, I> Iterator for IntervalSetIter<'a, I> {
+ type Item = &'a I;
+
+ fn next(&mut self) -> Option<&'a I> {
+ self.0.next()
+ }
+}
+
+pub trait Interval:
+ Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord
+{
+ type Bound: Bound;
+
+ fn lower(&self) -> Self::Bound;
+ fn upper(&self) -> Self::Bound;
+ fn set_lower(&mut self, bound: Self::Bound);
+ fn set_upper(&mut self, bound: Self::Bound);
+ fn case_fold_simple(
+ &self,
+ intervals: &mut Vec<Self>,
+ ) -> Result<(), unicode::CaseFoldError>;
+
+ /// Create a new interval.
+ fn create(lower: Self::Bound, upper: Self::Bound) -> Self {
+ let mut int = Self::default();
+ if lower <= upper {
+ int.set_lower(lower);
+ int.set_upper(upper);
+ } else {
+ int.set_lower(upper);
+ int.set_upper(lower);
+ }
+ int
+ }
+
+ /// Union the given overlapping range into this range.
+ ///
+ /// If the two ranges aren't contiguous, then this returns `None`.
+ fn union(&self, other: &Self) -> Option<Self> {
+ if !self.is_contiguous(other) {
+ return None;
+ }
+ let lower = cmp::min(self.lower(), other.lower());
+ let upper = cmp::max(self.upper(), other.upper());
+ Some(Self::create(lower, upper))
+ }
+
+ /// Intersect this range with the given range and return the result.
+ ///
+ /// If the intersection is empty, then this returns `None`.
+ fn intersect(&self, other: &Self) -> Option<Self> {
+ let lower = cmp::max(self.lower(), other.lower());
+ let upper = cmp::min(self.upper(), other.upper());
+ if lower <= upper {
+ Some(Self::create(lower, upper))
+ } else {
+ None
+ }
+ }
+
+ /// Subtract the given range from this range and return the resulting
+ /// ranges.
+ ///
+ /// If subtraction would result in an empty range, then no ranges are
+ /// returned.
+ fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) {
+ if self.is_subset(other) {
+ return (None, None);
+ }
+ if self.is_intersection_empty(other) {
+ return (Some(self.clone()), None);
+ }
+ let add_lower = other.lower() > self.lower();
+ let add_upper = other.upper() < self.upper();
+ // We know this because !self.is_subset(other) and the ranges have
+ // a non-empty intersection.
+ assert!(add_lower || add_upper);
+ let mut ret = (None, None);
+ if add_lower {
+ let upper = other.lower().decrement();
+ ret.0 = Some(Self::create(self.lower(), upper));
+ }
+ if add_upper {
+ let lower = other.upper().increment();
+ let range = Self::create(lower, self.upper());
+ if ret.0.is_none() {
+ ret.0 = Some(range);
+ } else {
+ ret.1 = Some(range);
+ }
+ }
+ ret
+ }
+
+ /// Compute the symmetric difference the given range from this range. This
+ /// returns the union of the two ranges minus its intersection.
+ fn symmetric_difference(
+ &self,
+ other: &Self,
+ ) -> (Option<Self>, Option<Self>) {
+ let union = match self.union(other) {
+ None => return (Some(self.clone()), Some(other.clone())),
+ Some(union) => union,
+ };
+ let intersection = match self.intersect(other) {
+ None => return (Some(self.clone()), Some(other.clone())),
+ Some(intersection) => intersection,
+ };
+ union.difference(&intersection)
+ }
+
+ /// Returns true if and only if the two ranges are contiguous. Two ranges
+ /// are contiguous if and only if the ranges are either overlapping or
+ /// adjacent.
+ fn is_contiguous(&self, other: &Self) -> bool {
+ let lower1 = self.lower().as_u32();
+ let upper1 = self.upper().as_u32();
+ let lower2 = other.lower().as_u32();
+ let upper2 = other.upper().as_u32();
+ cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1)
+ }
+
+ /// Returns true if and only if the intersection of this range and the
+ /// other range is empty.
+ fn is_intersection_empty(&self, other: &Self) -> bool {
+ let (lower1, upper1) = (self.lower(), self.upper());
+ let (lower2, upper2) = (other.lower(), other.upper());
+ cmp::max(lower1, lower2) > cmp::min(upper1, upper2)
+ }
+
+ /// Returns true if and only if this range is a subset of the other range.
+ fn is_subset(&self, other: &Self) -> bool {
+ let (lower1, upper1) = (self.lower(), self.upper());
+ let (lower2, upper2) = (other.lower(), other.upper());
+ (lower2 <= lower1 && lower1 <= upper2)
+ && (lower2 <= upper1 && upper1 <= upper2)
+ }
+}
+
+pub trait Bound:
+ Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord
+{
+ fn min_value() -> Self;
+ fn max_value() -> Self;
+ fn as_u32(self) -> u32;
+ fn increment(self) -> Self;
+ fn decrement(self) -> Self;
+}
+
+impl Bound for u8 {
+ fn min_value() -> Self {
+ u8::MIN
+ }
+ fn max_value() -> Self {
+ u8::MAX
+ }
+ fn as_u32(self) -> u32 {
+ self as u32
+ }
+ fn increment(self) -> Self {
+ self.checked_add(1).unwrap()
+ }
+ fn decrement(self) -> Self {
+ self.checked_sub(1).unwrap()
+ }
+}
+
+impl Bound for char {
+ fn min_value() -> Self {
+ '\x00'
+ }
+ fn max_value() -> Self {
+ '\u{10FFFF}'
+ }
+ fn as_u32(self) -> u32 {
+ self as u32
+ }
+
+ fn increment(self) -> Self {
+ match self {
+ '\u{D7FF}' => '\u{E000}',
+ c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(),
+ }
+ }
+
+ fn decrement(self) -> Self {
+ match self {
+ '\u{E000}' => '\u{D7FF}',
+ c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(),
+ }
+ }
+}
+
+// Tests for interval sets are written in src/hir.rs against the public API.
diff --git a/third_party/rust/regex-syntax/src/hir/literal/mod.rs b/third_party/rust/regex-syntax/src/hir/literal/mod.rs
new file mode 100644
index 0000000000..fbc5d3c975
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/hir/literal/mod.rs
@@ -0,0 +1,1686 @@
+/*!
+Provides routines for extracting literal prefixes and suffixes from an `Hir`.
+*/
+
+use std::cmp;
+use std::fmt;
+use std::iter;
+use std::mem;
+use std::ops;
+
+use crate::hir::{self, Hir, HirKind};
+
+/// A set of literal byte strings extracted from a regular expression.
+///
+/// Every member of the set is a `Literal`, which is represented by a
+/// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is
+/// said to be either *complete* or *cut*. A complete literal means that
+/// it extends until the beginning (or end) of the regular expression. In
+/// some circumstances, this can be used to indicate a match in the regular
+/// expression.
+///
+/// A key aspect of literal extraction is knowing when to stop. It is not
+/// feasible to blindly extract all literals from a regular expression, even if
+/// there are finitely many. For example, the regular expression `[0-9]{10}`
+/// has `10^10` distinct literals. For this reason, literal extraction is
+/// bounded to some low number by default using heuristics, but the limits can
+/// be tweaked.
+///
+/// **WARNING**: Literal extraction uses stack space proportional to the size
+/// of the `Hir` expression. At some point, this drawback will be eliminated.
+/// To protect yourself, set a reasonable
+/// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit).
+/// This is done for you by default.
+#[derive(Clone, Eq, PartialEq)]
+pub struct Literals {
+ lits: Vec<Literal>,
+ limit_size: usize,
+ limit_class: usize,
+}
+
+/// A single member of a set of literals extracted from a regular expression.
+///
+/// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice
+/// and `Vec` operations are available.
+#[derive(Clone, Eq, Ord)]
+pub struct Literal {
+ v: Vec<u8>,
+ cut: bool,
+}
+
+impl Literals {
+ /// Returns a new empty set of literals using default limits.
+ pub fn empty() -> Literals {
+ Literals { lits: vec![], limit_size: 250, limit_class: 10 }
+ }
+
+ /// Returns a set of literal prefixes extracted from the given `Hir`.
+ pub fn prefixes(expr: &Hir) -> Literals {
+ let mut lits = Literals::empty();
+ lits.union_prefixes(expr);
+ lits
+ }
+
+ /// Returns a set of literal suffixes extracted from the given `Hir`.
+ pub fn suffixes(expr: &Hir) -> Literals {
+ let mut lits = Literals::empty();
+ lits.union_suffixes(expr);
+ lits
+ }
+
+ /// Get the approximate size limit (in bytes) of this set.
+ pub fn limit_size(&self) -> usize {
+ self.limit_size
+ }
+
+ /// Set the approximate size limit (in bytes) of this set.
+ ///
+ /// If extracting a literal would put the set over this limit, then
+ /// extraction stops.
+ ///
+ /// The new limits will only apply to additions to this set. Existing
+ /// members remain unchanged, even if the set exceeds the new limit.
+ pub fn set_limit_size(&mut self, size: usize) -> &mut Literals {
+ self.limit_size = size;
+ self
+ }
+
+ /// Get the character class size limit for this set.
+ pub fn limit_class(&self) -> usize {
+ self.limit_class
+ }
+
+ /// Limits the size of character(or byte) classes considered.
+ ///
+ /// A value of `0` prevents all character classes from being considered.
+ ///
+ /// This limit also applies to case insensitive literals, since each
+ /// character in the case insensitive literal is converted to a class, and
+ /// then case folded.
+ ///
+ /// The new limits will only apply to additions to this set. Existing
+ /// members remain unchanged, even if the set exceeds the new limit.
+ pub fn set_limit_class(&mut self, size: usize) -> &mut Literals {
+ self.limit_class = size;
+ self
+ }
+
+ /// Returns the set of literals as a slice. Its order is unspecified.
+ pub fn literals(&self) -> &[Literal] {
+ &self.lits
+ }
+
+ /// Returns the length of the smallest literal.
+ ///
+ /// Returns None is there are no literals in the set.
+ pub fn min_len(&self) -> Option<usize> {
+ let mut min = None;
+ for lit in &self.lits {
+ match min {
+ None => min = Some(lit.len()),
+ Some(m) if lit.len() < m => min = Some(lit.len()),
+ _ => {}
+ }
+ }
+ min
+ }
+
+ /// Returns true if all members in this set are complete.
+ pub fn all_complete(&self) -> bool {
+ !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut())
+ }
+
+ /// Returns true if any member in this set is complete.
+ pub fn any_complete(&self) -> bool {
+ self.lits.iter().any(|lit| !lit.is_cut())
+ }
+
+ /// Returns true if this set contains an empty literal.
+ pub fn contains_empty(&self) -> bool {
+ self.lits.iter().any(|lit| lit.is_empty())
+ }
+
+ /// Returns true if this set is empty or if all of its members is empty.
+ pub fn is_empty(&self) -> bool {
+ self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty())
+ }
+
+ /// Returns a new empty set of literals using this set's limits.
+ pub fn to_empty(&self) -> Literals {
+ let mut lits = Literals::empty();
+ lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class);
+ lits
+ }
+
+ /// Returns the longest common prefix of all members in this set.
+ pub fn longest_common_prefix(&self) -> &[u8] {
+ if self.is_empty() {
+ return &[];
+ }
+ let lit0 = &*self.lits[0];
+ let mut len = lit0.len();
+ for lit in &self.lits[1..] {
+ len = cmp::min(
+ len,
+ lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(),
+ );
+ }
+ &self.lits[0][..len]
+ }
+
+ /// Returns the longest common suffix of all members in this set.
+ pub fn longest_common_suffix(&self) -> &[u8] {
+ if self.is_empty() {
+ return &[];
+ }
+ let lit0 = &*self.lits[0];
+ let mut len = lit0.len();
+ for lit in &self.lits[1..] {
+ len = cmp::min(
+ len,
+ lit.iter()
+ .rev()
+ .zip(lit0.iter().rev())
+ .take_while(|&(a, b)| a == b)
+ .count(),
+ );
+ }
+ &self.lits[0][self.lits[0].len() - len..]
+ }
+
+ /// Returns a new set of literals with the given number of bytes trimmed
+ /// from the suffix of each literal.
+ ///
+ /// If any literal would be cut out completely by trimming, then None is
+ /// returned.
+ ///
+ /// Any duplicates that are created as a result of this transformation are
+ /// removed.
+ pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> {
+ if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) {
+ return None;
+ }
+ let mut new = self.to_empty();
+ for mut lit in self.lits.iter().cloned() {
+ let new_len = lit.len() - num_bytes;
+ lit.truncate(new_len);
+ lit.cut();
+ new.lits.push(lit);
+ }
+ new.lits.sort();
+ new.lits.dedup();
+ Some(new)
+ }
+
+ /// Returns a new set of prefixes of this set of literals that are
+ /// guaranteed to be unambiguous.
+ ///
+ /// Any substring match with a member of the set is returned is guaranteed
+ /// to never overlap with a substring match of another member of the set
+ /// at the same starting position.
+ ///
+ /// Given any two members of the returned set, neither is a substring of
+ /// the other.
+ pub fn unambiguous_prefixes(&self) -> Literals {
+ if self.lits.is_empty() {
+ return self.to_empty();
+ }
+ let mut old = self.lits.to_vec();
+ let mut new = self.to_empty();
+ 'OUTER: while let Some(mut candidate) = old.pop() {
+ if candidate.is_empty() {
+ continue;
+ }
+ if new.lits.is_empty() {
+ new.lits.push(candidate);
+ continue;
+ }
+ for lit2 in &mut new.lits {
+ if lit2.is_empty() {
+ continue;
+ }
+ if &candidate == lit2 {
+ // If the literal is already in the set, then we can
+ // just drop it. But make sure that cut literals are
+ // infectious!
+ candidate.cut = candidate.cut || lit2.cut;
+ lit2.cut = candidate.cut;
+ continue 'OUTER;
+ }
+ if candidate.len() < lit2.len() {
+ if let Some(i) = position(&candidate, &lit2) {
+ candidate.cut();
+ let mut lit3 = lit2.clone();
+ lit3.truncate(i);
+ lit3.cut();
+ old.push(lit3);
+ lit2.clear();
+ }
+ } else if let Some(i) = position(&lit2, &candidate) {
+ lit2.cut();
+ let mut new_candidate = candidate.clone();
+ new_candidate.truncate(i);
+ new_candidate.cut();
+ old.push(new_candidate);
+ candidate.clear();
+ }
+ // Oops, the candidate is already represented in the set.
+ if candidate.is_empty() {
+ continue 'OUTER;
+ }
+ }
+ new.lits.push(candidate);
+ }
+ new.lits.retain(|lit| !lit.is_empty());
+ new.lits.sort();
+ new.lits.dedup();
+ new
+ }
+
+ /// Returns a new set of suffixes of this set of literals that are
+ /// guaranteed to be unambiguous.
+ ///
+ /// Any substring match with a member of the set is returned is guaranteed
+ /// to never overlap with a substring match of another member of the set
+ /// at the same ending position.
+ ///
+ /// Given any two members of the returned set, neither is a substring of
+ /// the other.
+ pub fn unambiguous_suffixes(&self) -> Literals {
+ // This is a touch wasteful...
+ let mut lits = self.clone();
+ lits.reverse();
+ let mut unamb = lits.unambiguous_prefixes();
+ unamb.reverse();
+ unamb
+ }
+
+ /// Unions the prefixes from the given expression to this set.
+ ///
+ /// If prefixes could not be added (for example, this set would exceed its
+ /// size limits or the set of prefixes from `expr` includes the empty
+ /// string), then false is returned.
+ ///
+ /// Note that prefix literals extracted from `expr` are said to be complete
+ /// if and only if the literal extends from the beginning of `expr` to the
+ /// end of `expr`.
+ pub fn union_prefixes(&mut self, expr: &Hir) -> bool {
+ let mut lits = self.to_empty();
+ prefixes(expr, &mut lits);
+ !lits.is_empty() && !lits.contains_empty() && self.union(lits)
+ }
+
+ /// Unions the suffixes from the given expression to this set.
+ ///
+ /// If suffixes could not be added (for example, this set would exceed its
+ /// size limits or the set of suffixes from `expr` includes the empty
+ /// string), then false is returned.
+ ///
+ /// Note that prefix literals extracted from `expr` are said to be complete
+ /// if and only if the literal extends from the end of `expr` to the
+ /// beginning of `expr`.
+ pub fn union_suffixes(&mut self, expr: &Hir) -> bool {
+ let mut lits = self.to_empty();
+ suffixes(expr, &mut lits);
+ lits.reverse();
+ !lits.is_empty() && !lits.contains_empty() && self.union(lits)
+ }
+
+ /// Unions this set with another set.
+ ///
+ /// If the union would cause the set to exceed its limits, then the union
+ /// is skipped and it returns false. Otherwise, if the union succeeds, it
+ /// returns true.
+ pub fn union(&mut self, lits: Literals) -> bool {
+ if self.num_bytes() + lits.num_bytes() > self.limit_size {
+ return false;
+ }
+ if lits.is_empty() {
+ self.lits.push(Literal::empty());
+ } else {
+ self.lits.extend(lits.lits);
+ }
+ true
+ }
+
+ /// Extends this set with another set.
+ ///
+ /// The set of literals is extended via a cross product.
+ ///
+ /// If a cross product would cause this set to exceed its limits, then the
+ /// cross product is skipped and it returns false. Otherwise, if the cross
+ /// product succeeds, it returns true.
+ pub fn cross_product(&mut self, lits: &Literals) -> bool {
+ if lits.is_empty() {
+ return true;
+ }
+ // Check that we make sure we stay in our limits.
+ let mut size_after;
+ if self.is_empty() || !self.any_complete() {
+ size_after = self.num_bytes();
+ for lits_lit in lits.literals() {
+ size_after += lits_lit.len();
+ }
+ } else {
+ size_after = self.lits.iter().fold(0, |accum, lit| {
+ accum + if lit.is_cut() { lit.len() } else { 0 }
+ });
+ for lits_lit in lits.literals() {
+ for self_lit in self.literals() {
+ if !self_lit.is_cut() {
+ size_after += self_lit.len() + lits_lit.len();
+ }
+ }
+ }
+ }
+ if size_after > self.limit_size {
+ return false;
+ }
+
+ let mut base = self.remove_complete();
+ if base.is_empty() {
+ base = vec![Literal::empty()];
+ }
+ for lits_lit in lits.literals() {
+ for mut self_lit in base.clone() {
+ self_lit.extend(&**lits_lit);
+ self_lit.cut = lits_lit.cut;
+ self.lits.push(self_lit);
+ }
+ }
+ true
+ }
+
+ /// Extends each literal in this set with the bytes given.
+ ///
+ /// If the set is empty, then the given literal is added to the set.
+ ///
+ /// If adding any number of bytes to all members of this set causes a limit
+ /// to be exceeded, then no bytes are added and false is returned. If a
+ /// prefix of `bytes` can be fit into this set, then it is used and all
+ /// resulting literals are cut.
+ pub fn cross_add(&mut self, bytes: &[u8]) -> bool {
+ // N.B. This could be implemented by simply calling cross_product with
+ // a literal set containing just `bytes`, but we can be smarter about
+ // taking shorter prefixes of `bytes` if they'll fit.
+ if bytes.is_empty() {
+ return true;
+ }
+ if self.lits.is_empty() {
+ let i = cmp::min(self.limit_size, bytes.len());
+ self.lits.push(Literal::new(bytes[..i].to_owned()));
+ self.lits[0].cut = i < bytes.len();
+ return !self.lits[0].is_cut();
+ }
+ let size = self.num_bytes();
+ if size + self.lits.len() >= self.limit_size {
+ return false;
+ }
+ let mut i = 1;
+ while size + (i * self.lits.len()) <= self.limit_size
+ && i < bytes.len()
+ {
+ i += 1;
+ }
+ for lit in &mut self.lits {
+ if !lit.is_cut() {
+ lit.extend(&bytes[..i]);
+ if i < bytes.len() {
+ lit.cut();
+ }
+ }
+ }
+ true
+ }
+
+ /// Adds the given literal to this set.
+ ///
+ /// Returns false if adding this literal would cause the class to be too
+ /// big.
+ pub fn add(&mut self, lit: Literal) -> bool {
+ if self.num_bytes() + lit.len() > self.limit_size {
+ return false;
+ }
+ self.lits.push(lit);
+ true
+ }
+
+ /// Extends each literal in this set with the character class given.
+ ///
+ /// Returns false if the character class was too big to add.
+ pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool {
+ self._add_char_class(cls, false)
+ }
+
+ /// Extends each literal in this set with the character class given,
+ /// writing the bytes of each character in reverse.
+ ///
+ /// Returns false if the character class was too big to add.
+ fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool {
+ self._add_char_class(cls, true)
+ }
+
+ fn _add_char_class(
+ &mut self,
+ cls: &hir::ClassUnicode,
+ reverse: bool,
+ ) -> bool {
+ use std::char;
+
+ if self.class_exceeds_limits(cls_char_count(cls)) {
+ return false;
+ }
+ let mut base = self.remove_complete();
+ if base.is_empty() {
+ base = vec![Literal::empty()];
+ }
+ for r in cls.iter() {
+ let (s, e) = (r.start as u32, r.end as u32 + 1);
+ for c in (s..e).filter_map(char::from_u32) {
+ for mut lit in base.clone() {
+ let mut bytes = c.to_string().into_bytes();
+ if reverse {
+ bytes.reverse();
+ }
+ lit.extend(&bytes);
+ self.lits.push(lit);
+ }
+ }
+ }
+ true
+ }
+
+ /// Extends each literal in this set with the byte class given.
+ ///
+ /// Returns false if the byte class was too big to add.
+ pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool {
+ if self.class_exceeds_limits(cls_byte_count(cls)) {
+ return false;
+ }
+ let mut base = self.remove_complete();
+ if base.is_empty() {
+ base = vec![Literal::empty()];
+ }
+ for r in cls.iter() {
+ let (s, e) = (r.start as u32, r.end as u32 + 1);
+ for b in (s..e).map(|b| b as u8) {
+ for mut lit in base.clone() {
+ lit.push(b);
+ self.lits.push(lit);
+ }
+ }
+ }
+ true
+ }
+
+ /// Cuts every member of this set. When a member is cut, it can never
+ /// be extended.
+ pub fn cut(&mut self) {
+ for lit in &mut self.lits {
+ lit.cut();
+ }
+ }
+
+ /// Reverses all members in place.
+ pub fn reverse(&mut self) {
+ for lit in &mut self.lits {
+ lit.reverse();
+ }
+ }
+
+ /// Clears this set of all members.
+ pub fn clear(&mut self) {
+ self.lits.clear();
+ }
+
+ /// Pops all complete literals out of this set.
+ fn remove_complete(&mut self) -> Vec<Literal> {
+ let mut base = vec![];
+ for lit in mem::replace(&mut self.lits, vec![]) {
+ if lit.is_cut() {
+ self.lits.push(lit);
+ } else {
+ base.push(lit);
+ }
+ }
+ base
+ }
+
+ /// Returns the total number of bytes in this set.
+ fn num_bytes(&self) -> usize {
+ self.lits.iter().fold(0, |accum, lit| accum + lit.len())
+ }
+
+ /// Returns true if a character class with the given size would cause this
+ /// set to exceed its limits.
+ ///
+ /// The size given should correspond to the number of items in the class.
+ fn class_exceeds_limits(&self, size: usize) -> bool {
+ if size > self.limit_class {
+ return true;
+ }
+ // This is an approximation since codepoints in a char class can encode
+ // to 1-4 bytes.
+ let new_byte_count = if self.lits.is_empty() {
+ size
+ } else {
+ self.lits.iter().fold(0, |accum, lit| {
+ accum
+ + if lit.is_cut() {
+ // If the literal is cut, then we'll never add
+ // anything to it, so don't count it.
+ 0
+ } else {
+ (lit.len() + 1) * size
+ }
+ })
+ };
+ new_byte_count > self.limit_size
+ }
+}
+
+fn prefixes(expr: &Hir, lits: &mut Literals) {
+ match *expr.kind() {
+ HirKind::Literal(hir::Literal::Unicode(c)) => {
+ let mut buf = [0; 4];
+ lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
+ }
+ HirKind::Literal(hir::Literal::Byte(b)) => {
+ lits.cross_add(&[b]);
+ }
+ HirKind::Class(hir::Class::Unicode(ref cls)) => {
+ if !lits.add_char_class(cls) {
+ lits.cut();
+ }
+ }
+ HirKind::Class(hir::Class::Bytes(ref cls)) => {
+ if !lits.add_byte_class(cls) {
+ lits.cut();
+ }
+ }
+ HirKind::Group(hir::Group { ref hir, .. }) => {
+ prefixes(&**hir, lits);
+ }
+ HirKind::Repetition(ref x) => match x.kind {
+ hir::RepetitionKind::ZeroOrOne => {
+ repeat_zero_or_one_literals(&x.hir, lits, prefixes);
+ }
+ hir::RepetitionKind::ZeroOrMore => {
+ repeat_zero_or_more_literals(&x.hir, lits, prefixes);
+ }
+ hir::RepetitionKind::OneOrMore => {
+ repeat_one_or_more_literals(&x.hir, lits, prefixes);
+ }
+ hir::RepetitionKind::Range(ref rng) => {
+ let (min, max) = match *rng {
+ hir::RepetitionRange::Exactly(m) => (m, Some(m)),
+ hir::RepetitionRange::AtLeast(m) => (m, None),
+ hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
+ };
+ repeat_range_literals(
+ &x.hir, min, max, x.greedy, lits, prefixes,
+ )
+ }
+ },
+ HirKind::Concat(ref es) if es.is_empty() => {}
+ HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
+ HirKind::Concat(ref es) => {
+ for e in es {
+ if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() {
+ if !lits.is_empty() {
+ lits.cut();
+ break;
+ }
+ lits.add(Literal::empty());
+ continue;
+ }
+ let mut lits2 = lits.to_empty();
+ prefixes(e, &mut lits2);
+ if !lits.cross_product(&lits2) || !lits2.any_complete() {
+ // If this expression couldn't yield any literal that
+ // could be extended, then we need to quit. Since we're
+ // short-circuiting, we also need to freeze every member.
+ lits.cut();
+ break;
+ }
+ }
+ }
+ HirKind::Alternation(ref es) => {
+ alternate_literals(es, lits, prefixes);
+ }
+ _ => lits.cut(),
+ }
+}
+
+fn suffixes(expr: &Hir, lits: &mut Literals) {
+ match *expr.kind() {
+ HirKind::Literal(hir::Literal::Unicode(c)) => {
+ let mut buf = [0u8; 4];
+ let i = c.encode_utf8(&mut buf).len();
+ let buf = &mut buf[..i];
+ buf.reverse();
+ lits.cross_add(buf);
+ }
+ HirKind::Literal(hir::Literal::Byte(b)) => {
+ lits.cross_add(&[b]);
+ }
+ HirKind::Class(hir::Class::Unicode(ref cls)) => {
+ if !lits.add_char_class_reverse(cls) {
+ lits.cut();
+ }
+ }
+ HirKind::Class(hir::Class::Bytes(ref cls)) => {
+ if !lits.add_byte_class(cls) {
+ lits.cut();
+ }
+ }
+ HirKind::Group(hir::Group { ref hir, .. }) => {
+ suffixes(&**hir, lits);
+ }
+ HirKind::Repetition(ref x) => match x.kind {
+ hir::RepetitionKind::ZeroOrOne => {
+ repeat_zero_or_one_literals(&x.hir, lits, suffixes);
+ }
+ hir::RepetitionKind::ZeroOrMore => {
+ repeat_zero_or_more_literals(&x.hir, lits, suffixes);
+ }
+ hir::RepetitionKind::OneOrMore => {
+ repeat_one_or_more_literals(&x.hir, lits, suffixes);
+ }
+ hir::RepetitionKind::Range(ref rng) => {
+ let (min, max) = match *rng {
+ hir::RepetitionRange::Exactly(m) => (m, Some(m)),
+ hir::RepetitionRange::AtLeast(m) => (m, None),
+ hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
+ };
+ repeat_range_literals(
+ &x.hir, min, max, x.greedy, lits, suffixes,
+ )
+ }
+ },
+ HirKind::Concat(ref es) if es.is_empty() => {}
+ HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
+ HirKind::Concat(ref es) => {
+ for e in es.iter().rev() {
+ if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() {
+ if !lits.is_empty() {
+ lits.cut();
+ break;
+ }
+ lits.add(Literal::empty());
+ continue;
+ }
+ let mut lits2 = lits.to_empty();
+ suffixes(e, &mut lits2);
+ if !lits.cross_product(&lits2) || !lits2.any_complete() {
+ // If this expression couldn't yield any literal that
+ // could be extended, then we need to quit. Since we're
+ // short-circuiting, we also need to freeze every member.
+ lits.cut();
+ break;
+ }
+ }
+ }
+ HirKind::Alternation(ref es) => {
+ alternate_literals(es, lits, suffixes);
+ }
+ _ => lits.cut(),
+ }
+}
+
+fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
+ e: &Hir,
+ lits: &mut Literals,
+ mut f: F,
+) {
+ f(
+ &Hir::repetition(hir::Repetition {
+ kind: hir::RepetitionKind::ZeroOrMore,
+ // FIXME: Our literal extraction doesn't care about greediness.
+ // Which is partially why we're treating 'e?' as 'e*'. Namely,
+ // 'ab??' yields [Complete(ab), Complete(a)], but it should yield
+ // [Complete(a), Complete(ab)] because of the non-greediness.
+ greedy: true,
+ hir: Box::new(e.clone()),
+ }),
+ lits,
+ );
+}
+
+fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
+ e: &Hir,
+ lits: &mut Literals,
+ mut f: F,
+) {
+ let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
+ lits3.set_limit_size(lits.limit_size() / 2);
+ f(e, &mut lits3);
+
+ if lits3.is_empty() || !lits2.cross_product(&lits3) {
+ lits.cut();
+ return;
+ }
+ lits2.cut();
+ lits2.add(Literal::empty());
+ if !lits.union(lits2) {
+ lits.cut();
+ }
+}
+
+fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
+ e: &Hir,
+ lits: &mut Literals,
+ mut f: F,
+) {
+ f(e, lits);
+ lits.cut();
+}
+
+fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
+ e: &Hir,
+ min: u32,
+ max: Option<u32>,
+ greedy: bool,
+ lits: &mut Literals,
+ mut f: F,
+) {
+ if min == 0 {
+ // This is a bit conservative. If `max` is set, then we could
+ // treat this as a finite set of alternations. For now, we
+ // just treat it as `e*`.
+ f(
+ &Hir::repetition(hir::Repetition {
+ kind: hir::RepetitionKind::ZeroOrMore,
+ greedy,
+ hir: Box::new(e.clone()),
+ }),
+ lits,
+ );
+ } else {
+ if min > 0 {
+ let n = cmp::min(lits.limit_size, min as usize);
+ let es = iter::repeat(e.clone()).take(n).collect();
+ f(&Hir::concat(es), lits);
+ if n < min as usize || lits.contains_empty() {
+ lits.cut();
+ }
+ }
+ if max.map_or(true, |max| min < max) {
+ lits.cut();
+ }
+ }
+}
+
+fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
+ es: &[Hir],
+ lits: &mut Literals,
+ mut f: F,
+) {
+ let mut lits2 = lits.to_empty();
+ for e in es {
+ let mut lits3 = lits.to_empty();
+ lits3.set_limit_size(lits.limit_size() / 5);
+ f(e, &mut lits3);
+ if lits3.is_empty() || !lits2.union(lits3) {
+ // If we couldn't find suffixes for *any* of the
+ // alternates, then the entire alternation has to be thrown
+ // away and any existing members must be frozen. Similarly,
+ // if the union couldn't complete, stop and freeze.
+ lits.cut();
+ return;
+ }
+ }
+ if !lits.cross_product(&lits2) {
+ lits.cut();
+ }
+}
+
+impl fmt::Debug for Literals {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ f.debug_struct("Literals")
+ .field("lits", &self.lits)
+ .field("limit_size", &self.limit_size)
+ .field("limit_class", &self.limit_class)
+ .finish()
+ }
+}
+
+impl Literal {
+ /// Returns a new complete literal with the bytes given.
+ pub fn new(bytes: Vec<u8>) -> Literal {
+ Literal { v: bytes, cut: false }
+ }
+
+ /// Returns a new complete empty literal.
+ pub fn empty() -> Literal {
+ Literal { v: vec![], cut: false }
+ }
+
+ /// Returns true if this literal was "cut."
+ pub fn is_cut(&self) -> bool {
+ self.cut
+ }
+
+ /// Cuts this literal.
+ pub fn cut(&mut self) {
+ self.cut = true;
+ }
+}
+
+impl PartialEq for Literal {
+ fn eq(&self, other: &Literal) -> bool {
+ self.v == other.v
+ }
+}
+
+impl PartialOrd for Literal {
+ fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> {
+ self.v.partial_cmp(&other.v)
+ }
+}
+
+impl fmt::Debug for Literal {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ if self.is_cut() {
+ write!(f, "Cut({})", escape_unicode(&self.v))
+ } else {
+ write!(f, "Complete({})", escape_unicode(&self.v))
+ }
+ }
+}
+
+impl AsRef<[u8]> for Literal {
+ fn as_ref(&self) -> &[u8] {
+ &self.v
+ }
+}
+
+impl ops::Deref for Literal {
+ type Target = Vec<u8>;
+ fn deref(&self) -> &Vec<u8> {
+ &self.v
+ }
+}
+
+impl ops::DerefMut for Literal {
+ fn deref_mut(&mut self) -> &mut Vec<u8> {
+ &mut self.v
+ }
+}
+
+fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
+ let mut i = 0;
+ while haystack.len() >= needle.len() {
+ if needle == &haystack[..needle.len()] {
+ return Some(i);
+ }
+ i += 1;
+ haystack = &haystack[1..];
+ }
+ None
+}
+
+fn escape_unicode(bytes: &[u8]) -> String {
+ let show = match ::std::str::from_utf8(bytes) {
+ Ok(v) => v.to_string(),
+ Err(_) => escape_bytes(bytes),
+ };
+ let mut space_escaped = String::new();
+ for c in show.chars() {
+ if c.is_whitespace() {
+ let escaped = if c as u32 <= 0x7F {
+ escape_byte(c as u8)
+ } else if c as u32 <= 0xFFFF {
+ format!(r"\u{{{:04x}}}", c as u32)
+ } else {
+ format!(r"\U{{{:08x}}}", c as u32)
+ };
+ space_escaped.push_str(&escaped);
+ } else {
+ space_escaped.push(c);
+ }
+ }
+ space_escaped
+}
+
+fn escape_bytes(bytes: &[u8]) -> String {
+ let mut s = String::new();
+ for &b in bytes {
+ s.push_str(&escape_byte(b));
+ }
+ s
+}
+
+fn escape_byte(byte: u8) -> String {
+ use std::ascii::escape_default;
+
+ let escaped: Vec<u8> = escape_default(byte).collect();
+ String::from_utf8_lossy(&escaped).into_owned()
+}
+
+fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
+ cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
+ as usize
+}
+
+fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
+ cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
+ as usize
+}
+
+#[cfg(test)]
+mod tests {
+ use std::fmt;
+
+ use super::{escape_bytes, Literal, Literals};
+ use crate::hir::Hir;
+ use crate::ParserBuilder;
+
+ // To make test failures easier to read.
+ #[derive(Debug, Eq, PartialEq)]
+ struct Bytes(Vec<ULiteral>);
+ #[derive(Debug, Eq, PartialEq)]
+ struct Unicode(Vec<ULiteral>);
+
+ fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> {
+ let mut ulits = vec![];
+ for blit in blits {
+ ulits
+ .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() });
+ }
+ ulits
+ }
+
+ fn create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals {
+ Literals {
+ lits: it.into_iter().collect(),
+ limit_size: 0,
+ limit_class: 0,
+ }
+ }
+
+ // Needs to be pub for 1.3?
+ #[derive(Clone, Eq, PartialEq)]
+ pub struct ULiteral {
+ v: String,
+ cut: bool,
+ }
+
+ impl ULiteral {
+ fn is_cut(&self) -> bool {
+ self.cut
+ }
+ }
+
+ impl fmt::Debug for ULiteral {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ if self.is_cut() {
+ write!(f, "Cut({})", self.v)
+ } else {
+ write!(f, "Complete({})", self.v)
+ }
+ }
+ }
+
+ impl PartialEq<Literal> for ULiteral {
+ fn eq(&self, other: &Literal) -> bool {
+ self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut()
+ }
+ }
+
+ impl PartialEq<ULiteral> for Literal {
+ fn eq(&self, other: &ULiteral) -> bool {
+ &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut()
+ }
+ }
+
+ #[allow(non_snake_case)]
+ fn C(s: &'static str) -> ULiteral {
+ ULiteral { v: s.to_owned(), cut: true }
+ }
+ #[allow(non_snake_case)]
+ fn M(s: &'static str) -> ULiteral {
+ ULiteral { v: s.to_owned(), cut: false }
+ }
+
+ fn prefixes(lits: &mut Literals, expr: &Hir) {
+ lits.union_prefixes(expr);
+ }
+
+ fn suffixes(lits: &mut Literals, expr: &Hir) {
+ lits.union_suffixes(expr);
+ }
+
+ macro_rules! assert_lit_eq {
+ ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{
+ let expected: Vec<ULiteral> = vec![$($expected_lit),*];
+ let lits = $got_lits;
+ assert_eq!(
+ $which(expected.clone()),
+ $which(escape_lits(lits.literals())));
+ assert_eq!(
+ !expected.is_empty() && expected.iter().all(|l| !l.is_cut()),
+ lits.all_complete());
+ assert_eq!(
+ expected.iter().any(|l| !l.is_cut()),
+ lits.any_complete());
+ }};
+ }
+
+ macro_rules! test_lit {
+ ($name:ident, $which:ident, $re:expr) => {
+ test_lit!($name, $which, $re,);
+ };
+ ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
+ #[test]
+ fn $name() {
+ let expr = ParserBuilder::new()
+ .build()
+ .parse($re)
+ .unwrap();
+ let lits = Literals::$which(&expr);
+ assert_lit_eq!(Unicode, lits, $($lit),*);
+
+ let expr = ParserBuilder::new()
+ .allow_invalid_utf8(true)
+ .unicode(false)
+ .build()
+ .parse($re)
+ .unwrap();
+ let lits = Literals::$which(&expr);
+ assert_lit_eq!(Bytes, lits, $($lit),*);
+ }
+ };
+ }
+
+ // ************************************************************************
+ // Tests for prefix literal extraction.
+ // ************************************************************************
+
+ // Elementary tests.
+ test_lit!(pfx_one_lit1, prefixes, "a", M("a"));
+ test_lit!(pfx_one_lit2, prefixes, "abc", M("abc"));
+ test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83"));
+ #[cfg(feature = "unicode-case")]
+ test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
+ test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
+ test_lit!(
+ pfx_class2,
+ prefixes,
+ "(?u)[☃Ⅰ]",
+ M("\\xe2\\x85\\xa0"),
+ M("\\xe2\\x98\\x83")
+ );
+ #[cfg(feature = "unicode-case")]
+ test_lit!(
+ pfx_class3,
+ prefixes,
+ "(?ui)[☃Ⅰ]",
+ M("\\xe2\\x85\\xa0"),
+ M("\\xe2\\x85\\xb0"),
+ M("\\xe2\\x98\\x83")
+ );
+ test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a"));
+ test_lit!(
+ pfx_one_lit_casei2,
+ prefixes,
+ "(?i-u)abc",
+ M("ABC"),
+ M("aBC"),
+ M("AbC"),
+ M("abC"),
+ M("ABc"),
+ M("aBc"),
+ M("Abc"),
+ M("abc")
+ );
+ test_lit!(pfx_group1, prefixes, "(a)", M("a"));
+ test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
+ test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
+ test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a"));
+ // FIXME: This should return [M("a"), M("ab")] because of the non-greedy
+ // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get
+ // a cut literal.
+ test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a"));
+ test_lit!(pfx_rep_zero_or_more1, prefixes, "a*");
+ test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*");
+ test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a"));
+ test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc"));
+ test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a"));
+ test_lit!(pfx_rep_range1, prefixes, "a{0}");
+ test_lit!(pfx_rep_range2, prefixes, "a{0,}");
+ test_lit!(pfx_rep_range3, prefixes, "a{0,1}");
+ test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a"));
+ test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa"));
+ test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a"));
+ test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa"));
+
+ // Test regexes with concatenations.
+ test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab"));
+ test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz"));
+ test_lit!(
+ pfx_cat3,
+ prefixes,
+ "(?i-u)[ab]z",
+ M("AZ"),
+ M("BZ"),
+ M("aZ"),
+ M("bZ"),
+ M("Az"),
+ M("Bz"),
+ M("az"),
+ M("bz")
+ );
+ test_lit!(
+ pfx_cat4,
+ prefixes,
+ "[ab][yz]",
+ M("ay"),
+ M("by"),
+ M("az"),
+ M("bz")
+ );
+ test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b"));
+ test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c"));
+ test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c"));
+ test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b"));
+ test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b"));
+ test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a"));
+ test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac"));
+ test_lit!(pfx_cat12, prefixes, "ab+", C("ab"));
+ test_lit!(pfx_cat13, prefixes, "ab+c", C("ab"));
+ test_lit!(pfx_cat14, prefixes, "a^", C("a"));
+ test_lit!(pfx_cat15, prefixes, "$a");
+ test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac"));
+ test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab"));
+ test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb"));
+ test_lit!(pfx_cat19, prefixes, "a.z", C("a"));
+
+ // Test regexes with alternations.
+ test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b"));
+ test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
+ test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz"));
+ test_lit!(pfx_alt4, prefixes, "a|b*");
+ test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b"));
+ test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)");
+ test_lit!(
+ pfx_alt7,
+ prefixes,
+ "(a|b)*c|(a|ab)*c",
+ C("a"),
+ C("b"),
+ M("c"),
+ C("a"),
+ C("ab"),
+ M("c")
+ );
+ test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c"));
+
+ // Test regexes with empty assertions.
+ test_lit!(pfx_empty1, prefixes, "^a", M("a"));
+ test_lit!(pfx_empty2, prefixes, "a${2}", C("a"));
+ test_lit!(pfx_empty3, prefixes, "^abc", M("abc"));
+ test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z"));
+
+ // Make sure some curious regexes have no prefixes.
+ test_lit!(pfx_nothing1, prefixes, ".");
+ test_lit!(pfx_nothing2, prefixes, "(?s).");
+ test_lit!(pfx_nothing3, prefixes, "^");
+ test_lit!(pfx_nothing4, prefixes, "$");
+ test_lit!(pfx_nothing6, prefixes, "(?m)$");
+ test_lit!(pfx_nothing7, prefixes, r"\b");
+ test_lit!(pfx_nothing8, prefixes, r"\B");
+
+ // Test a few regexes that defeat any prefix literal detection.
+ test_lit!(pfx_defeated1, prefixes, ".a");
+ test_lit!(pfx_defeated2, prefixes, "(?s).a");
+ test_lit!(pfx_defeated3, prefixes, "a*b*c*");
+ test_lit!(pfx_defeated4, prefixes, "a|.");
+ test_lit!(pfx_defeated5, prefixes, ".|a");
+ test_lit!(pfx_defeated6, prefixes, "a|^");
+ test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))");
+ test_lit!(pfx_defeated8, prefixes, "$a");
+ test_lit!(pfx_defeated9, prefixes, "(?m)$a");
+ test_lit!(pfx_defeated10, prefixes, r"\ba");
+ test_lit!(pfx_defeated11, prefixes, r"\Ba");
+ test_lit!(pfx_defeated12, prefixes, "^*a");
+ test_lit!(pfx_defeated13, prefixes, "^+a");
+
+ test_lit!(
+ pfx_crazy1,
+ prefixes,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ C("Mo\\'"),
+ C("Mu\\'"),
+ C("Moam"),
+ C("Muam")
+ );
+
+ // ************************************************************************
+ // Tests for quiting prefix literal search.
+ // ************************************************************************
+
+ macro_rules! test_exhausted {
+ ($name:ident, $which:ident, $re:expr) => {
+ test_exhausted!($name, $which, $re,);
+ };
+ ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
+ #[test]
+ fn $name() {
+ let expr = ParserBuilder::new()
+ .build()
+ .parse($re)
+ .unwrap();
+ let mut lits = Literals::empty();
+ lits.set_limit_size(20).set_limit_class(10);
+ $which(&mut lits, &expr);
+ assert_lit_eq!(Unicode, lits, $($lit),*);
+
+ let expr = ParserBuilder::new()
+ .allow_invalid_utf8(true)
+ .unicode(false)
+ .build()
+ .parse($re)
+ .unwrap();
+ let mut lits = Literals::empty();
+ lits.set_limit_size(20).set_limit_class(10);
+ $which(&mut lits, &expr);
+ assert_lit_eq!(Bytes, lits, $($lit),*);
+ }
+ };
+ }
+
+ // These test use a much lower limit than the default so that we can
+ // write test cases of reasonable size.
+ test_exhausted!(pfx_exhausted1, prefixes, "[a-z]");
+ test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A");
+ test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A"));
+ test_exhausted!(
+ pfx_exhausted4,
+ prefixes,
+ "(?i-u)foobar",
+ C("FO"),
+ C("fO"),
+ C("Fo"),
+ C("fo")
+ );
+ test_exhausted!(
+ pfx_exhausted5,
+ prefixes,
+ "(?:ab){100}",
+ C("abababababababababab")
+ );
+ test_exhausted!(
+ pfx_exhausted6,
+ prefixes,
+ "(?:(?:ab){100})*cd",
+ C("ababababab"),
+ M("cd")
+ );
+ test_exhausted!(
+ pfx_exhausted7,
+ prefixes,
+ "z(?:(?:ab){100})*cd",
+ C("zababababab"),
+ M("zcd")
+ );
+ test_exhausted!(
+ pfx_exhausted8,
+ prefixes,
+ "aaaaaaaaaaaaaaaaaaaaz",
+ C("aaaaaaaaaaaaaaaaaaaa")
+ );
+
+ // ************************************************************************
+ // Tests for suffix literal extraction.
+ // ************************************************************************
+
+ // Elementary tests.
+ test_lit!(sfx_one_lit1, suffixes, "a", M("a"));
+ test_lit!(sfx_one_lit2, suffixes, "abc", M("abc"));
+ test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83"));
+ #[cfg(feature = "unicode-case")]
+ test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
+ test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
+ test_lit!(
+ sfx_class2,
+ suffixes,
+ "(?u)[☃Ⅰ]",
+ M("\\xe2\\x85\\xa0"),
+ M("\\xe2\\x98\\x83")
+ );
+ #[cfg(feature = "unicode-case")]
+ test_lit!(
+ sfx_class3,
+ suffixes,
+ "(?ui)[☃Ⅰ]",
+ M("\\xe2\\x85\\xa0"),
+ M("\\xe2\\x85\\xb0"),
+ M("\\xe2\\x98\\x83")
+ );
+ test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a"));
+ test_lit!(
+ sfx_one_lit_casei2,
+ suffixes,
+ "(?i-u)abc",
+ M("ABC"),
+ M("ABc"),
+ M("AbC"),
+ M("Abc"),
+ M("aBC"),
+ M("aBc"),
+ M("abC"),
+ M("abc")
+ );
+ test_lit!(sfx_group1, suffixes, "(a)", M("a"));
+ test_lit!(sfx_rep_zero_or_one1, suffixes, "a?");
+ test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?");
+ test_lit!(sfx_rep_zero_or_more1, suffixes, "a*");
+ test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*");
+ test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a"));
+ test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc"));
+ test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a"));
+ test_lit!(sfx_rep_range1, suffixes, "a{0}");
+ test_lit!(sfx_rep_range2, suffixes, "a{0,}");
+ test_lit!(sfx_rep_range3, suffixes, "a{0,1}");
+ test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a"));
+ test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa"));
+ test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a"));
+ test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa"));
+
+ // Test regexes with concatenations.
+ test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab"));
+ test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz"));
+ test_lit!(
+ sfx_cat3,
+ suffixes,
+ "(?i-u)[ab]z",
+ M("AZ"),
+ M("Az"),
+ M("BZ"),
+ M("Bz"),
+ M("aZ"),
+ M("az"),
+ M("bZ"),
+ M("bz")
+ );
+ test_lit!(
+ sfx_cat4,
+ suffixes,
+ "[ab][yz]",
+ M("ay"),
+ M("az"),
+ M("by"),
+ M("bz")
+ );
+ test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b"));
+ test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c"));
+ test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c"));
+ test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc"));
+ test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b"));
+ test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a"));
+ test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac"));
+ test_lit!(sfx_cat12, suffixes, "ab+", C("b"));
+ test_lit!(sfx_cat13, suffixes, "ab+c", C("bc"));
+ test_lit!(sfx_cat14, suffixes, "a^");
+ test_lit!(sfx_cat15, suffixes, "$a", C("a"));
+ test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac"));
+ test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc"));
+ test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb"));
+ test_lit!(sfx_cat19, suffixes, "a.z", C("z"));
+
+ // Test regexes with alternations.
+ test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b"));
+ test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
+ test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz"));
+ test_lit!(sfx_alt4, suffixes, "a|b*");
+ test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b"));
+ test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)");
+ test_lit!(
+ sfx_alt7,
+ suffixes,
+ "(a|b)*c|(a|ab)*c",
+ C("ac"),
+ C("bc"),
+ M("c"),
+ C("ac"),
+ C("abc"),
+ M("c")
+ );
+ test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c"));
+
+ // Test regexes with empty assertions.
+ test_lit!(sfx_empty1, suffixes, "a$", M("a"));
+ test_lit!(sfx_empty2, suffixes, "${2}a", C("a"));
+
+ // Make sure some curious regexes have no suffixes.
+ test_lit!(sfx_nothing1, suffixes, ".");
+ test_lit!(sfx_nothing2, suffixes, "(?s).");
+ test_lit!(sfx_nothing3, suffixes, "^");
+ test_lit!(sfx_nothing4, suffixes, "$");
+ test_lit!(sfx_nothing6, suffixes, "(?m)$");
+ test_lit!(sfx_nothing7, suffixes, r"\b");
+ test_lit!(sfx_nothing8, suffixes, r"\B");
+
+ // Test a few regexes that defeat any suffix literal detection.
+ test_lit!(sfx_defeated1, suffixes, "a.");
+ test_lit!(sfx_defeated2, suffixes, "(?s)a.");
+ test_lit!(sfx_defeated3, suffixes, "a*b*c*");
+ test_lit!(sfx_defeated4, suffixes, "a|.");
+ test_lit!(sfx_defeated5, suffixes, ".|a");
+ test_lit!(sfx_defeated6, suffixes, "a|^");
+ test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c)).");
+ test_lit!(sfx_defeated8, suffixes, "a^");
+ test_lit!(sfx_defeated9, suffixes, "(?m)a$");
+ test_lit!(sfx_defeated10, suffixes, r"a\b");
+ test_lit!(sfx_defeated11, suffixes, r"a\B");
+ test_lit!(sfx_defeated12, suffixes, "a^*");
+ test_lit!(sfx_defeated13, suffixes, "a^+");
+
+ // These test use a much lower limit than the default so that we can
+ // write test cases of reasonable size.
+ test_exhausted!(sfx_exhausted1, suffixes, "[a-z]");
+ test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*");
+ test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z"));
+ test_exhausted!(
+ sfx_exhausted4,
+ suffixes,
+ "(?i-u)foobar",
+ C("AR"),
+ C("Ar"),
+ C("aR"),
+ C("ar")
+ );
+ test_exhausted!(
+ sfx_exhausted5,
+ suffixes,
+ "(?:ab){100}",
+ C("abababababababababab")
+ );
+ test_exhausted!(
+ sfx_exhausted6,
+ suffixes,
+ "cd(?:(?:ab){100})*",
+ C("ababababab"),
+ M("cd")
+ );
+ test_exhausted!(
+ sfx_exhausted7,
+ suffixes,
+ "cd(?:(?:ab){100})*z",
+ C("abababababz"),
+ M("cdz")
+ );
+ test_exhausted!(
+ sfx_exhausted8,
+ suffixes,
+ "zaaaaaaaaaaaaaaaaaaaa",
+ C("aaaaaaaaaaaaaaaaaaaa")
+ );
+
+ // ************************************************************************
+ // Tests for generating unambiguous literal sets.
+ // ************************************************************************
+
+ macro_rules! test_unamb {
+ ($name:ident, $given:expr, $expected:expr) => {
+ #[test]
+ fn $name() {
+ let given: Vec<Literal> = $given
+ .into_iter()
+ .map(|ul| {
+ let cut = ul.is_cut();
+ Literal { v: ul.v.into_bytes(), cut: cut }
+ })
+ .collect();
+ let lits = create_lits(given);
+ let got = lits.unambiguous_prefixes();
+ assert_eq!($expected, escape_lits(got.literals()));
+ }
+ };
+ }
+
+ test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]);
+ test_unamb!(
+ unambiguous2,
+ vec![M("zaaaaaa"), M("aa")],
+ vec![C("aa"), C("z")]
+ );
+ test_unamb!(
+ unambiguous3,
+ vec![M("Sherlock"), M("Watson")],
+ vec![M("Sherlock"), M("Watson")]
+ );
+ test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]);
+ test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]);
+ test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]);
+ test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]);
+ test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]);
+ test_unamb!(
+ unambiguous9,
+ vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
+ vec![C("a"), C("b"), C("c")]
+ );
+ test_unamb!(
+ unambiguous10,
+ vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
+ vec![C("Mo"), C("Mu")]
+ );
+ test_unamb!(
+ unambiguous11,
+ vec![M("zazb"), M("azb")],
+ vec![C("a"), C("z")]
+ );
+ test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]);
+ test_unamb!(
+ unambiguous13,
+ vec![M("ABCX"), M("CDAX"), M("BCX")],
+ vec![C("A"), C("BCX"), C("CD")]
+ );
+ test_unamb!(
+ unambiguous14,
+ vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
+ vec![M("DSX"), C("I"), C("MGX"), C("MV")]
+ );
+ test_unamb!(
+ unambiguous15,
+ vec![M("IMG_"), M("MG_"), M("CIMG")],
+ vec![C("C"), C("I"), C("MG_")]
+ );
+
+ // ************************************************************************
+ // Tests for suffix trimming.
+ // ************************************************************************
+ macro_rules! test_trim {
+ ($name:ident, $trim:expr, $given:expr, $expected:expr) => {
+ #[test]
+ fn $name() {
+ let given: Vec<Literal> = $given
+ .into_iter()
+ .map(|ul| {
+ let cut = ul.is_cut();
+ Literal { v: ul.v.into_bytes(), cut: cut }
+ })
+ .collect();
+ let lits = create_lits(given);
+ let got = lits.trim_suffix($trim).unwrap();
+ assert_eq!($expected, escape_lits(got.literals()));
+ }
+ };
+ }
+
+ test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]);
+ test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]);
+ test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]);
+ test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]);
+
+ // ************************************************************************
+ // Tests for longest common prefix.
+ // ************************************************************************
+
+ macro_rules! test_lcp {
+ ($name:ident, $given:expr, $expected:expr) => {
+ #[test]
+ fn $name() {
+ let given: Vec<Literal> = $given
+ .into_iter()
+ .map(|s: &str| Literal {
+ v: s.to_owned().into_bytes(),
+ cut: false,
+ })
+ .collect();
+ let lits = create_lits(given);
+ let got = lits.longest_common_prefix();
+ assert_eq!($expected, escape_bytes(got));
+ }
+ };
+ }
+
+ test_lcp!(lcp1, vec!["a"], "a");
+ test_lcp!(lcp2, vec![], "");
+ test_lcp!(lcp3, vec!["a", "b"], "");
+ test_lcp!(lcp4, vec!["ab", "ab"], "ab");
+ test_lcp!(lcp5, vec!["ab", "a"], "a");
+ test_lcp!(lcp6, vec!["a", "ab"], "a");
+ test_lcp!(lcp7, vec!["ab", "b"], "");
+ test_lcp!(lcp8, vec!["b", "ab"], "");
+ test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba");
+ test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], "");
+ test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], "");
+ test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f");
+
+ // ************************************************************************
+ // Tests for longest common suffix.
+ // ************************************************************************
+
+ macro_rules! test_lcs {
+ ($name:ident, $given:expr, $expected:expr) => {
+ #[test]
+ fn $name() {
+ let given: Vec<Literal> = $given
+ .into_iter()
+ .map(|s: &str| Literal {
+ v: s.to_owned().into_bytes(),
+ cut: false,
+ })
+ .collect();
+ let lits = create_lits(given);
+ let got = lits.longest_common_suffix();
+ assert_eq!($expected, escape_bytes(got));
+ }
+ };
+ }
+
+ test_lcs!(lcs1, vec!["a"], "a");
+ test_lcs!(lcs2, vec![], "");
+ test_lcs!(lcs3, vec!["a", "b"], "");
+ test_lcs!(lcs4, vec!["ab", "ab"], "ab");
+ test_lcs!(lcs5, vec!["ab", "a"], "");
+ test_lcs!(lcs6, vec!["a", "ab"], "");
+ test_lcs!(lcs7, vec!["ab", "b"], "b");
+ test_lcs!(lcs8, vec!["b", "ab"], "b");
+ test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo");
+ test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], "");
+ test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], "");
+ test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b");
+}
diff --git a/third_party/rust/regex-syntax/src/hir/mod.rs b/third_party/rust/regex-syntax/src/hir/mod.rs
new file mode 100644
index 0000000000..1096e9f05a
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/hir/mod.rs
@@ -0,0 +1,2296 @@
+/*!
+Defines a high-level intermediate representation for regular expressions.
+*/
+use std::char;
+use std::cmp;
+use std::error;
+use std::fmt;
+use std::result;
+use std::u8;
+
+use crate::ast::Span;
+use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter};
+use crate::unicode;
+
+pub use crate::hir::visitor::{visit, Visitor};
+pub use crate::unicode::CaseFoldError;
+
+mod interval;
+pub mod literal;
+pub mod print;
+pub mod translate;
+mod visitor;
+
+/// An error that can occur while translating an `Ast` to a `Hir`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Error {
+ /// The kind of error.
+ kind: ErrorKind,
+ /// The original pattern that the translator's Ast was parsed from. Every
+ /// span in an error is a valid range into this string.
+ pattern: String,
+ /// The span of this error, derived from the Ast given to the translator.
+ span: Span,
+}
+
+impl Error {
+ /// Return the type of this error.
+ pub fn kind(&self) -> &ErrorKind {
+ &self.kind
+ }
+
+ /// The original pattern string in which this error occurred.
+ ///
+ /// Every span reported by this error is reported in terms of this string.
+ pub fn pattern(&self) -> &str {
+ &self.pattern
+ }
+
+ /// Return the span at which this error occurred.
+ pub fn span(&self) -> &Span {
+ &self.span
+ }
+}
+
+/// The type of an error that occurred while building an `Hir`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ErrorKind {
+ /// This error occurs when a Unicode feature is used when Unicode
+ /// support is disabled. For example `(?-u:\pL)` would trigger this error.
+ UnicodeNotAllowed,
+ /// This error occurs when translating a pattern that could match a byte
+ /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled.
+ InvalidUtf8,
+ /// This occurs when an unrecognized Unicode property name could not
+ /// be found.
+ UnicodePropertyNotFound,
+ /// This occurs when an unrecognized Unicode property value could not
+ /// be found.
+ UnicodePropertyValueNotFound,
+ /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
+ /// `\d`) could not be found. This can occur when the `unicode-perl`
+ /// crate feature is not enabled.
+ UnicodePerlClassNotFound,
+ /// This occurs when the Unicode simple case mapping tables are not
+ /// available, and the regular expression required Unicode aware case
+ /// insensitivity.
+ UnicodeCaseUnavailable,
+ /// This occurs when the translator attempts to construct a character class
+ /// that is empty.
+ ///
+ /// Note that this restriction in the translator may be removed in the
+ /// future.
+ EmptyClassNotAllowed,
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ #[doc(hidden)]
+ __Nonexhaustive,
+}
+
+impl ErrorKind {
+ // TODO: Remove this method entirely on the next breaking semver release.
+ #[allow(deprecated)]
+ fn description(&self) -> &str {
+ use self::ErrorKind::*;
+ match *self {
+ UnicodeNotAllowed => "Unicode not allowed here",
+ InvalidUtf8 => "pattern can match invalid UTF-8",
+ UnicodePropertyNotFound => "Unicode property not found",
+ UnicodePropertyValueNotFound => "Unicode property value not found",
+ UnicodePerlClassNotFound => {
+ "Unicode-aware Perl class not found \
+ (make sure the unicode-perl feature is enabled)"
+ }
+ UnicodeCaseUnavailable => {
+ "Unicode-aware case insensitivity matching is not available \
+ (make sure the unicode-case feature is enabled)"
+ }
+ EmptyClassNotAllowed => "empty character classes are not allowed",
+ __Nonexhaustive => unreachable!(),
+ }
+ }
+}
+
+impl error::Error for Error {
+ // TODO: Remove this method entirely on the next breaking semver release.
+ #[allow(deprecated)]
+ fn description(&self) -> &str {
+ self.kind.description()
+ }
+}
+
+impl fmt::Display for Error {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ crate::error::Formatter::from(self).fmt(f)
+ }
+}
+
+impl fmt::Display for ErrorKind {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ // TODO: Remove this on the next breaking semver release.
+ #[allow(deprecated)]
+ f.write_str(self.description())
+ }
+}
+
+/// A high-level intermediate representation (HIR) for a regular expression.
+///
+/// The HIR of a regular expression represents an intermediate step between its
+/// abstract syntax (a structured description of the concrete syntax) and
+/// compiled byte codes. The purpose of HIR is to make regular expressions
+/// easier to analyze. In particular, the AST is much more complex than the
+/// HIR. For example, while an AST supports arbitrarily nested character
+/// classes, the HIR will flatten all nested classes into a single set. The HIR
+/// will also "compile away" every flag present in the concrete syntax. For
+/// example, users of HIR expressions never need to worry about case folding;
+/// it is handled automatically by the translator (e.g., by translating `(?i)A`
+/// to `[aA]`).
+///
+/// If the HIR was produced by a translator that disallows invalid UTF-8, then
+/// the HIR is guaranteed to match UTF-8 exclusively.
+///
+/// This type defines its own destructor that uses constant stack space and
+/// heap space proportional to the size of the HIR.
+///
+/// The specific type of an HIR expression can be accessed via its `kind`
+/// or `into_kind` methods. This extra level of indirection exists for two
+/// reasons:
+///
+/// 1. Construction of an HIR expression *must* use the constructor methods
+/// on this `Hir` type instead of building the `HirKind` values directly.
+/// This permits construction to enforce invariants like "concatenations
+/// always consist of two or more sub-expressions."
+/// 2. Every HIR expression contains attributes that are defined inductively,
+/// and can be computed cheaply during the construction process. For
+/// example, one such attribute is whether the expression must match at the
+/// beginning of the text.
+///
+/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
+/// expression pattern string, and uses constant stack space and heap space
+/// proportional to the size of the `Hir`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Hir {
+ /// The underlying HIR kind.
+ kind: HirKind,
+ /// Analysis info about this HIR, computed during construction.
+ info: HirInfo,
+}
+
+/// The kind of an arbitrary `Hir` expression.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum HirKind {
+ /// The empty regular expression, which matches everything, including the
+ /// empty string.
+ Empty,
+ /// A single literal character that matches exactly this character.
+ Literal(Literal),
+ /// A single character class that matches any of the characters in the
+ /// class. A class can either consist of Unicode scalar values as
+ /// characters, or it can use bytes.
+ Class(Class),
+ /// An anchor assertion. An anchor assertion match always has zero length.
+ Anchor(Anchor),
+ /// A word boundary assertion, which may or may not be Unicode aware. A
+ /// word boundary assertion match always has zero length.
+ WordBoundary(WordBoundary),
+ /// A repetition operation applied to a child expression.
+ Repetition(Repetition),
+ /// A possibly capturing group, which contains a child expression.
+ Group(Group),
+ /// A concatenation of expressions. A concatenation always has at least two
+ /// child expressions.
+ ///
+ /// A concatenation matches only if each of its child expression matches
+ /// one after the other.
+ Concat(Vec<Hir>),
+ /// An alternation of expressions. An alternation always has at least two
+ /// child expressions.
+ ///
+ /// An alternation matches only if at least one of its child expression
+ /// matches. If multiple expressions match, then the leftmost is preferred.
+ Alternation(Vec<Hir>),
+}
+
+impl Hir {
+ /// Returns a reference to the underlying HIR kind.
+ pub fn kind(&self) -> &HirKind {
+ &self.kind
+ }
+
+ /// Consumes ownership of this HIR expression and returns its underlying
+ /// `HirKind`.
+ pub fn into_kind(mut self) -> HirKind {
+ use std::mem;
+ mem::replace(&mut self.kind, HirKind::Empty)
+ }
+
+ /// Returns an empty HIR expression.
+ ///
+ /// An empty HIR expression always matches, including the empty string.
+ pub fn empty() -> Hir {
+ let mut info = HirInfo::new();
+ info.set_always_utf8(true);
+ info.set_all_assertions(true);
+ info.set_anchored_start(false);
+ info.set_anchored_end(false);
+ info.set_line_anchored_start(false);
+ info.set_line_anchored_end(false);
+ info.set_any_anchored_start(false);
+ info.set_any_anchored_end(false);
+ info.set_match_empty(true);
+ info.set_literal(false);
+ info.set_alternation_literal(false);
+ Hir { kind: HirKind::Empty, info }
+ }
+
+ /// Creates a literal HIR expression.
+ ///
+ /// If the given literal has a `Byte` variant with an ASCII byte, then this
+ /// method panics. This enforces the invariant that `Byte` variants are
+ /// only used to express matching of invalid UTF-8.
+ pub fn literal(lit: Literal) -> Hir {
+ if let Literal::Byte(b) = lit {
+ assert!(b > 0x7F);
+ }
+
+ let mut info = HirInfo::new();
+ info.set_always_utf8(lit.is_unicode());
+ info.set_all_assertions(false);
+ info.set_anchored_start(false);
+ info.set_anchored_end(false);
+ info.set_line_anchored_start(false);
+ info.set_line_anchored_end(false);
+ info.set_any_anchored_start(false);
+ info.set_any_anchored_end(false);
+ info.set_match_empty(false);
+ info.set_literal(true);
+ info.set_alternation_literal(true);
+ Hir { kind: HirKind::Literal(lit), info }
+ }
+
+ /// Creates a class HIR expression.
+ pub fn class(class: Class) -> Hir {
+ let mut info = HirInfo::new();
+ info.set_always_utf8(class.is_always_utf8());
+ info.set_all_assertions(false);
+ info.set_anchored_start(false);
+ info.set_anchored_end(false);
+ info.set_line_anchored_start(false);
+ info.set_line_anchored_end(false);
+ info.set_any_anchored_start(false);
+ info.set_any_anchored_end(false);
+ info.set_match_empty(false);
+ info.set_literal(false);
+ info.set_alternation_literal(false);
+ Hir { kind: HirKind::Class(class), info }
+ }
+
+ /// Creates an anchor assertion HIR expression.
+ pub fn anchor(anchor: Anchor) -> Hir {
+ let mut info = HirInfo::new();
+ info.set_always_utf8(true);
+ info.set_all_assertions(true);
+ info.set_anchored_start(false);
+ info.set_anchored_end(false);
+ info.set_line_anchored_start(false);
+ info.set_line_anchored_end(false);
+ info.set_any_anchored_start(false);
+ info.set_any_anchored_end(false);
+ info.set_match_empty(true);
+ info.set_literal(false);
+ info.set_alternation_literal(false);
+ if let Anchor::StartText = anchor {
+ info.set_anchored_start(true);
+ info.set_line_anchored_start(true);
+ info.set_any_anchored_start(true);
+ }
+ if let Anchor::EndText = anchor {
+ info.set_anchored_end(true);
+ info.set_line_anchored_end(true);
+ info.set_any_anchored_end(true);
+ }
+ if let Anchor::StartLine = anchor {
+ info.set_line_anchored_start(true);
+ }
+ if let Anchor::EndLine = anchor {
+ info.set_line_anchored_end(true);
+ }
+ Hir { kind: HirKind::Anchor(anchor), info }
+ }
+
+ /// Creates a word boundary assertion HIR expression.
+ pub fn word_boundary(word_boundary: WordBoundary) -> Hir {
+ let mut info = HirInfo::new();
+ info.set_always_utf8(true);
+ info.set_all_assertions(true);
+ info.set_anchored_start(false);
+ info.set_anchored_end(false);
+ info.set_line_anchored_start(false);
+ info.set_line_anchored_end(false);
+ info.set_any_anchored_start(false);
+ info.set_any_anchored_end(false);
+ info.set_literal(false);
+ info.set_alternation_literal(false);
+ // A negated word boundary matches '', so that's fine. But \b does not
+ // match \b, so why do we say it can match the empty string? Well,
+ // because, if you search for \b against 'a', it will report [0, 0) and
+ // [1, 1) as matches, and both of those matches correspond to the empty
+ // string. Thus, only *certain* empty strings match \b, which similarly
+ // applies to \B.
+ info.set_match_empty(true);
+ // Negated ASCII word boundaries can match invalid UTF-8.
+ if let WordBoundary::AsciiNegate = word_boundary {
+ info.set_always_utf8(false);
+ }
+ Hir { kind: HirKind::WordBoundary(word_boundary), info }
+ }
+
+ /// Creates a repetition HIR expression.
+ pub fn repetition(rep: Repetition) -> Hir {
+ let mut info = HirInfo::new();
+ info.set_always_utf8(rep.hir.is_always_utf8());
+ info.set_all_assertions(rep.hir.is_all_assertions());
+ // If this operator can match the empty string, then it can never
+ // be anchored.
+ info.set_anchored_start(
+ !rep.is_match_empty() && rep.hir.is_anchored_start(),
+ );
+ info.set_anchored_end(
+ !rep.is_match_empty() && rep.hir.is_anchored_end(),
+ );
+ info.set_line_anchored_start(
+ !rep.is_match_empty() && rep.hir.is_anchored_start(),
+ );
+ info.set_line_anchored_end(
+ !rep.is_match_empty() && rep.hir.is_anchored_end(),
+ );
+ info.set_any_anchored_start(rep.hir.is_any_anchored_start());
+ info.set_any_anchored_end(rep.hir.is_any_anchored_end());
+ info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty());
+ info.set_literal(false);
+ info.set_alternation_literal(false);
+ Hir { kind: HirKind::Repetition(rep), info }
+ }
+
+ /// Creates a group HIR expression.
+ pub fn group(group: Group) -> Hir {
+ let mut info = HirInfo::new();
+ info.set_always_utf8(group.hir.is_always_utf8());
+ info.set_all_assertions(group.hir.is_all_assertions());
+ info.set_anchored_start(group.hir.is_anchored_start());
+ info.set_anchored_end(group.hir.is_anchored_end());
+ info.set_line_anchored_start(group.hir.is_line_anchored_start());
+ info.set_line_anchored_end(group.hir.is_line_anchored_end());
+ info.set_any_anchored_start(group.hir.is_any_anchored_start());
+ info.set_any_anchored_end(group.hir.is_any_anchored_end());
+ info.set_match_empty(group.hir.is_match_empty());
+ info.set_literal(false);
+ info.set_alternation_literal(false);
+ Hir { kind: HirKind::Group(group), info }
+ }
+
+ /// Returns the concatenation of the given expressions.
+ ///
+ /// This flattens the concatenation as appropriate.
+ pub fn concat(mut exprs: Vec<Hir>) -> Hir {
+ match exprs.len() {
+ 0 => Hir::empty(),
+ 1 => exprs.pop().unwrap(),
+ _ => {
+ let mut info = HirInfo::new();
+ info.set_always_utf8(true);
+ info.set_all_assertions(true);
+ info.set_any_anchored_start(false);
+ info.set_any_anchored_end(false);
+ info.set_match_empty(true);
+ info.set_literal(true);
+ info.set_alternation_literal(true);
+
+ // Some attributes require analyzing all sub-expressions.
+ for e in &exprs {
+ let x = info.is_always_utf8() && e.is_always_utf8();
+ info.set_always_utf8(x);
+
+ let x = info.is_all_assertions() && e.is_all_assertions();
+ info.set_all_assertions(x);
+
+ let x = info.is_any_anchored_start()
+ || e.is_any_anchored_start();
+ info.set_any_anchored_start(x);
+
+ let x =
+ info.is_any_anchored_end() || e.is_any_anchored_end();
+ info.set_any_anchored_end(x);
+
+ let x = info.is_match_empty() && e.is_match_empty();
+ info.set_match_empty(x);
+
+ let x = info.is_literal() && e.is_literal();
+ info.set_literal(x);
+
+ let x = info.is_alternation_literal()
+ && e.is_alternation_literal();
+ info.set_alternation_literal(x);
+ }
+ // Anchored attributes require something slightly more
+ // sophisticated. Normally, WLOG, to determine whether an
+ // expression is anchored to the start, we'd only need to check
+ // the first expression of a concatenation. However,
+ // expressions like `$\b^` are still anchored to the start,
+ // but the first expression in the concatenation *isn't*
+ // anchored to the start. So the "first" expression to look at
+ // is actually one that is either not an assertion or is
+ // specifically the StartText assertion.
+ info.set_anchored_start(
+ exprs
+ .iter()
+ .take_while(|e| {
+ e.is_anchored_start() || e.is_all_assertions()
+ })
+ .any(|e| e.is_anchored_start()),
+ );
+ // Similarly for the end anchor, but in reverse.
+ info.set_anchored_end(
+ exprs
+ .iter()
+ .rev()
+ .take_while(|e| {
+ e.is_anchored_end() || e.is_all_assertions()
+ })
+ .any(|e| e.is_anchored_end()),
+ );
+ // Repeat the process for line anchors.
+ info.set_line_anchored_start(
+ exprs
+ .iter()
+ .take_while(|e| {
+ e.is_line_anchored_start() || e.is_all_assertions()
+ })
+ .any(|e| e.is_line_anchored_start()),
+ );
+ info.set_line_anchored_end(
+ exprs
+ .iter()
+ .rev()
+ .take_while(|e| {
+ e.is_line_anchored_end() || e.is_all_assertions()
+ })
+ .any(|e| e.is_line_anchored_end()),
+ );
+ Hir { kind: HirKind::Concat(exprs), info }
+ }
+ }
+ }
+
+ /// Returns the alternation of the given expressions.
+ ///
+ /// This flattens the alternation as appropriate.
+ pub fn alternation(mut exprs: Vec<Hir>) -> Hir {
+ match exprs.len() {
+ 0 => Hir::empty(),
+ 1 => exprs.pop().unwrap(),
+ _ => {
+ let mut info = HirInfo::new();
+ info.set_always_utf8(true);
+ info.set_all_assertions(true);
+ info.set_anchored_start(true);
+ info.set_anchored_end(true);
+ info.set_line_anchored_start(true);
+ info.set_line_anchored_end(true);
+ info.set_any_anchored_start(false);
+ info.set_any_anchored_end(false);
+ info.set_match_empty(false);
+ info.set_literal(false);
+ info.set_alternation_literal(true);
+
+ // Some attributes require analyzing all sub-expressions.
+ for e in &exprs {
+ let x = info.is_always_utf8() && e.is_always_utf8();
+ info.set_always_utf8(x);
+
+ let x = info.is_all_assertions() && e.is_all_assertions();
+ info.set_all_assertions(x);
+
+ let x = info.is_anchored_start() && e.is_anchored_start();
+ info.set_anchored_start(x);
+
+ let x = info.is_anchored_end() && e.is_anchored_end();
+ info.set_anchored_end(x);
+
+ let x = info.is_line_anchored_start()
+ && e.is_line_anchored_start();
+ info.set_line_anchored_start(x);
+
+ let x = info.is_line_anchored_end()
+ && e.is_line_anchored_end();
+ info.set_line_anchored_end(x);
+
+ let x = info.is_any_anchored_start()
+ || e.is_any_anchored_start();
+ info.set_any_anchored_start(x);
+
+ let x =
+ info.is_any_anchored_end() || e.is_any_anchored_end();
+ info.set_any_anchored_end(x);
+
+ let x = info.is_match_empty() || e.is_match_empty();
+ info.set_match_empty(x);
+
+ let x = info.is_alternation_literal() && e.is_literal();
+ info.set_alternation_literal(x);
+ }
+ Hir { kind: HirKind::Alternation(exprs), info }
+ }
+ }
+ }
+
+ /// Build an HIR expression for `.`.
+ ///
+ /// A `.` expression matches any character except for `\n`. To build an
+ /// expression that matches any character, including `\n`, use the `any`
+ /// method.
+ ///
+ /// If `bytes` is `true`, then this assumes characters are limited to a
+ /// single byte.
+ pub fn dot(bytes: bool) -> Hir {
+ if bytes {
+ let mut cls = ClassBytes::empty();
+ cls.push(ClassBytesRange::new(b'\0', b'\x09'));
+ cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
+ Hir::class(Class::Bytes(cls))
+ } else {
+ let mut cls = ClassUnicode::empty();
+ cls.push(ClassUnicodeRange::new('\0', '\x09'));
+ cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
+ Hir::class(Class::Unicode(cls))
+ }
+ }
+
+ /// Build an HIR expression for `(?s).`.
+ ///
+ /// A `(?s).` expression matches any character, including `\n`. To build an
+ /// expression that matches any character except for `\n`, then use the
+ /// `dot` method.
+ ///
+ /// If `bytes` is `true`, then this assumes characters are limited to a
+ /// single byte.
+ pub fn any(bytes: bool) -> Hir {
+ if bytes {
+ let mut cls = ClassBytes::empty();
+ cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
+ Hir::class(Class::Bytes(cls))
+ } else {
+ let mut cls = ClassUnicode::empty();
+ cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
+ Hir::class(Class::Unicode(cls))
+ }
+ }
+
+ /// Return true if and only if this HIR will always match valid UTF-8.
+ ///
+ /// When this returns false, then it is possible for this HIR expression
+ /// to match invalid UTF-8.
+ pub fn is_always_utf8(&self) -> bool {
+ self.info.is_always_utf8()
+ }
+
+ /// Returns true if and only if this entire HIR expression is made up of
+ /// zero-width assertions.
+ ///
+ /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but
+ /// not `^a`.
+ pub fn is_all_assertions(&self) -> bool {
+ self.info.is_all_assertions()
+ }
+
+ /// Return true if and only if this HIR is required to match from the
+ /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`,
+ /// `^foo|^bar` but not `^foo|bar`.
+ pub fn is_anchored_start(&self) -> bool {
+ self.info.is_anchored_start()
+ }
+
+ /// Return true if and only if this HIR is required to match at the end
+ /// of text. This includes expressions like `foo$`, `(foo|bar)$`,
+ /// `foo$|bar$` but not `foo$|bar`.
+ pub fn is_anchored_end(&self) -> bool {
+ self.info.is_anchored_end()
+ }
+
+ /// Return true if and only if this HIR is required to match from the
+ /// beginning of text or the beginning of a line. This includes expressions
+ /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `^(foo|bar)`, `(?m)^foo|^bar`
+ /// but not `^foo|bar` or `(?m)^foo|bar`.
+ ///
+ /// Note that if `is_anchored_start` is `true`, then
+ /// `is_line_anchored_start` will also be `true`. The reverse implication
+ /// is not true. For example, `(?m)^foo` is line anchored, but not
+ /// `is_anchored_start`.
+ pub fn is_line_anchored_start(&self) -> bool {
+ self.info.is_line_anchored_start()
+ }
+
+ /// Return true if and only if this HIR is required to match at the
+ /// end of text or the end of a line. This includes expressions like
+ /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`,
+ /// `(?m)(foo|bar)$`, but not `foo$|bar` or `(?m)foo$|bar`.
+ ///
+ /// Note that if `is_anchored_end` is `true`, then
+ /// `is_line_anchored_end` will also be `true`. The reverse implication
+ /// is not true. For example, `(?m)foo$` is line anchored, but not
+ /// `is_anchored_end`.
+ pub fn is_line_anchored_end(&self) -> bool {
+ self.info.is_line_anchored_end()
+ }
+
+ /// Return true if and only if this HIR contains any sub-expression that
+ /// is required to match at the beginning of text. Specifically, this
+ /// returns true if the `^` symbol (when multiline mode is disabled) or the
+ /// `\A` escape appear anywhere in the regex.
+ pub fn is_any_anchored_start(&self) -> bool {
+ self.info.is_any_anchored_start()
+ }
+
+ /// Return true if and only if this HIR contains any sub-expression that is
+ /// required to match at the end of text. Specifically, this returns true
+ /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape
+ /// appear anywhere in the regex.
+ pub fn is_any_anchored_end(&self) -> bool {
+ self.info.is_any_anchored_end()
+ }
+
+ /// Return true if and only if the empty string is part of the language
+ /// matched by this regular expression.
+ ///
+ /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b`
+ /// and `\B`, but not `a` or `a+`.
+ pub fn is_match_empty(&self) -> bool {
+ self.info.is_match_empty()
+ }
+
+ /// Return true if and only if this HIR is a simple literal. This is only
+ /// true when this HIR expression is either itself a `Literal` or a
+ /// concatenation of only `Literal`s.
+ ///
+ /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`,
+ /// `` are not (even though that contain sub-expressions that are literals).
+ pub fn is_literal(&self) -> bool {
+ self.info.is_literal()
+ }
+
+ /// Return true if and only if this HIR is either a simple literal or an
+ /// alternation of simple literals. This is only
+ /// true when this HIR expression is either itself a `Literal` or a
+ /// concatenation of only `Literal`s or an alternation of only `Literal`s.
+ ///
+ /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
+ /// literals, but `f+`, `(foo)`, `foo()`, ``
+ /// are not (even though that contain sub-expressions that are literals).
+ pub fn is_alternation_literal(&self) -> bool {
+ self.info.is_alternation_literal()
+ }
+}
+
+impl HirKind {
+ /// Return true if and only if this HIR is the empty regular expression.
+ ///
+ /// Note that this is not defined inductively. That is, it only tests if
+ /// this kind is the `Empty` variant. To get the inductive definition,
+ /// use the `is_match_empty` method on [`Hir`](struct.Hir.html).
+ pub fn is_empty(&self) -> bool {
+ match *self {
+ HirKind::Empty => true,
+ _ => false,
+ }
+ }
+
+ /// Returns true if and only if this kind has any (including possibly
+ /// empty) subexpressions.
+ pub fn has_subexprs(&self) -> bool {
+ match *self {
+ HirKind::Empty
+ | HirKind::Literal(_)
+ | HirKind::Class(_)
+ | HirKind::Anchor(_)
+ | HirKind::WordBoundary(_) => false,
+ HirKind::Group(_)
+ | HirKind::Repetition(_)
+ | HirKind::Concat(_)
+ | HirKind::Alternation(_) => true,
+ }
+ }
+}
+
+/// Print a display representation of this Hir.
+///
+/// The result of this is a valid regular expression pattern string.
+///
+/// This implementation uses constant stack space and heap space proportional
+/// to the size of the `Hir`.
+impl fmt::Display for Hir {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ use crate::hir::print::Printer;
+ Printer::new().print(self, f)
+ }
+}
+
+/// The high-level intermediate representation of a literal.
+///
+/// A literal corresponds to a single character, where a character is either
+/// defined by a Unicode scalar value or an arbitrary byte. Unicode characters
+/// are preferred whenever possible. In particular, a `Byte` variant is only
+/// ever produced when it could match invalid UTF-8.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum Literal {
+ /// A single character represented by a Unicode scalar value.
+ Unicode(char),
+ /// A single character represented by an arbitrary byte.
+ Byte(u8),
+}
+
+impl Literal {
+ /// Returns true if and only if this literal corresponds to a Unicode
+ /// scalar value.
+ pub fn is_unicode(&self) -> bool {
+ match *self {
+ Literal::Unicode(_) => true,
+ Literal::Byte(b) if b <= 0x7F => true,
+ Literal::Byte(_) => false,
+ }
+ }
+}
+
+/// The high-level intermediate representation of a character class.
+///
+/// A character class corresponds to a set of characters. A character is either
+/// defined by a Unicode scalar value or a byte. Unicode characters are used
+/// by default, while bytes are used when Unicode mode (via the `u` flag) is
+/// disabled.
+///
+/// A character class, regardless of its character type, is represented by a
+/// sequence of non-overlapping non-adjacent ranges of characters.
+///
+/// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may
+/// be produced even when it exclusively matches valid UTF-8. This is because
+/// a `Bytes` variant represents an intention by the author of the regular
+/// expression to disable Unicode mode, which in turn impacts the semantics of
+/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
+/// match the same set of strings.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum Class {
+ /// A set of characters represented by Unicode scalar values.
+ Unicode(ClassUnicode),
+ /// A set of characters represented by arbitrary bytes (one byte per
+ /// character).
+ Bytes(ClassBytes),
+}
+
+impl Class {
+ /// Apply Unicode simple case folding to this character class, in place.
+ /// The character class will be expanded to include all simple case folded
+ /// character variants.
+ ///
+ /// If this is a byte oriented character class, then this will be limited
+ /// to the ASCII ranges `A-Z` and `a-z`.
+ pub fn case_fold_simple(&mut self) {
+ match *self {
+ Class::Unicode(ref mut x) => x.case_fold_simple(),
+ Class::Bytes(ref mut x) => x.case_fold_simple(),
+ }
+ }
+
+ /// Negate this character class in place.
+ ///
+ /// After completion, this character class will contain precisely the
+ /// characters that weren't previously in the class.
+ pub fn negate(&mut self) {
+ match *self {
+ Class::Unicode(ref mut x) => x.negate(),
+ Class::Bytes(ref mut x) => x.negate(),
+ }
+ }
+
+ /// Returns true if and only if this character class will only ever match
+ /// valid UTF-8.
+ ///
+ /// A character class can match invalid UTF-8 only when the following
+ /// conditions are met:
+ ///
+ /// 1. The translator was configured to permit generating an expression
+ /// that can match invalid UTF-8. (By default, this is disabled.)
+ /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
+ /// syntax or in the parser builder. By default, Unicode mode is
+ /// enabled.
+ pub fn is_always_utf8(&self) -> bool {
+ match *self {
+ Class::Unicode(_) => true,
+ Class::Bytes(ref x) => x.is_all_ascii(),
+ }
+ }
+}
+
+/// A set of characters represented by Unicode scalar values.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassUnicode {
+ set: IntervalSet<ClassUnicodeRange>,
+}
+
+impl ClassUnicode {
+ /// Create a new class from a sequence of ranges.
+ ///
+ /// The given ranges do not need to be in any specific order, and ranges
+ /// may overlap.
+ pub fn new<I>(ranges: I) -> ClassUnicode
+ where
+ I: IntoIterator<Item = ClassUnicodeRange>,
+ {
+ ClassUnicode { set: IntervalSet::new(ranges) }
+ }
+
+ /// Create a new class with no ranges.
+ pub fn empty() -> ClassUnicode {
+ ClassUnicode::new(vec![])
+ }
+
+ /// Add a new range to this set.
+ pub fn push(&mut self, range: ClassUnicodeRange) {
+ self.set.push(range);
+ }
+
+ /// Return an iterator over all ranges in this class.
+ ///
+ /// The iterator yields ranges in ascending order.
+ pub fn iter(&self) -> ClassUnicodeIter<'_> {
+ ClassUnicodeIter(self.set.iter())
+ }
+
+ /// Return the underlying ranges as a slice.
+ pub fn ranges(&self) -> &[ClassUnicodeRange] {
+ self.set.intervals()
+ }
+
+ /// Expand this character class such that it contains all case folded
+ /// characters, according to Unicode's "simple" mapping. For example, if
+ /// this class consists of the range `a-z`, then applying case folding will
+ /// result in the class containing both the ranges `a-z` and `A-Z`.
+ ///
+ /// # Panics
+ ///
+ /// This routine panics when the case mapping data necessary for this
+ /// routine to complete is unavailable. This occurs when the `unicode-case`
+ /// feature is not enabled.
+ ///
+ /// Callers should prefer using `try_case_fold_simple` instead, which will
+ /// return an error instead of panicking.
+ pub fn case_fold_simple(&mut self) {
+ self.set
+ .case_fold_simple()
+ .expect("unicode-case feature must be enabled");
+ }
+
+ /// Expand this character class such that it contains all case folded
+ /// characters, according to Unicode's "simple" mapping. For example, if
+ /// this class consists of the range `a-z`, then applying case folding will
+ /// result in the class containing both the ranges `a-z` and `A-Z`.
+ ///
+ /// # Error
+ ///
+ /// This routine returns an error when the case mapping data necessary
+ /// for this routine to complete is unavailable. This occurs when the
+ /// `unicode-case` feature is not enabled.
+ pub fn try_case_fold_simple(
+ &mut self,
+ ) -> result::Result<(), CaseFoldError> {
+ self.set.case_fold_simple()
+ }
+
+ /// Negate this character class.
+ ///
+ /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
+ /// set, then it will not be in this set after negation.
+ pub fn negate(&mut self) {
+ self.set.negate();
+ }
+
+ /// Union this character class with the given character class, in place.
+ pub fn union(&mut self, other: &ClassUnicode) {
+ self.set.union(&other.set);
+ }
+
+ /// Intersect this character class with the given character class, in
+ /// place.
+ pub fn intersect(&mut self, other: &ClassUnicode) {
+ self.set.intersect(&other.set);
+ }
+
+ /// Subtract the given character class from this character class, in place.
+ pub fn difference(&mut self, other: &ClassUnicode) {
+ self.set.difference(&other.set);
+ }
+
+ /// Compute the symmetric difference of the given character classes, in
+ /// place.
+ ///
+ /// This computes the symmetric difference of two character classes. This
+ /// removes all elements in this class that are also in the given class,
+ /// but all adds all elements from the given class that aren't in this
+ /// class. That is, the class will contain all elements in either class,
+ /// but will not contain any elements that are in both classes.
+ pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
+ self.set.symmetric_difference(&other.set);
+ }
+
+ /// Returns true if and only if this character class will either match
+ /// nothing or only ASCII bytes. Stated differently, this returns false
+ /// if and only if this class contains a non-ASCII codepoint.
+ pub fn is_all_ascii(&self) -> bool {
+ self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
+ }
+}
+
+/// An iterator over all ranges in a Unicode character class.
+///
+/// The lifetime `'a` refers to the lifetime of the underlying class.
+#[derive(Debug)]
+pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
+
+impl<'a> Iterator for ClassUnicodeIter<'a> {
+ type Item = &'a ClassUnicodeRange;
+
+ fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
+ self.0.next()
+ }
+}
+
+/// A single range of characters represented by Unicode scalar values.
+///
+/// The range is closed. That is, the start and end of the range are included
+/// in the range.
+#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
+pub struct ClassUnicodeRange {
+ start: char,
+ end: char,
+}
+
+impl fmt::Debug for ClassUnicodeRange {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ let start = if !self.start.is_whitespace() && !self.start.is_control()
+ {
+ self.start.to_string()
+ } else {
+ format!("0x{:X}", self.start as u32)
+ };
+ let end = if !self.end.is_whitespace() && !self.end.is_control() {
+ self.end.to_string()
+ } else {
+ format!("0x{:X}", self.end as u32)
+ };
+ f.debug_struct("ClassUnicodeRange")
+ .field("start", &start)
+ .field("end", &end)
+ .finish()
+ }
+}
+
+impl Interval for ClassUnicodeRange {
+ type Bound = char;
+
+ #[inline]
+ fn lower(&self) -> char {
+ self.start
+ }
+ #[inline]
+ fn upper(&self) -> char {
+ self.end
+ }
+ #[inline]
+ fn set_lower(&mut self, bound: char) {
+ self.start = bound;
+ }
+ #[inline]
+ fn set_upper(&mut self, bound: char) {
+ self.end = bound;
+ }
+
+ /// Apply simple case folding to this Unicode scalar value range.
+ ///
+ /// Additional ranges are appended to the given vector. Canonical ordering
+ /// is *not* maintained in the given vector.
+ fn case_fold_simple(
+ &self,
+ ranges: &mut Vec<ClassUnicodeRange>,
+ ) -> Result<(), unicode::CaseFoldError> {
+ if !unicode::contains_simple_case_mapping(self.start, self.end)? {
+ return Ok(());
+ }
+ let start = self.start as u32;
+ let end = (self.end as u32).saturating_add(1);
+ let mut next_simple_cp = None;
+ for cp in (start..end).filter_map(char::from_u32) {
+ if next_simple_cp.map_or(false, |next| cp < next) {
+ continue;
+ }
+ let it = match unicode::simple_fold(cp)? {
+ Ok(it) => it,
+ Err(next) => {
+ next_simple_cp = next;
+ continue;
+ }
+ };
+ for cp_folded in it {
+ ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
+ }
+ }
+ Ok(())
+ }
+}
+
+impl ClassUnicodeRange {
+ /// Create a new Unicode scalar value range for a character class.
+ ///
+ /// The returned range is always in a canonical form. That is, the range
+ /// returned always satisfies the invariant that `start <= end`.
+ pub fn new(start: char, end: char) -> ClassUnicodeRange {
+ ClassUnicodeRange::create(start, end)
+ }
+
+ /// Return the start of this range.
+ ///
+ /// The start of a range is always less than or equal to the end of the
+ /// range.
+ pub fn start(&self) -> char {
+ self.start
+ }
+
+ /// Return the end of this range.
+ ///
+ /// The end of a range is always greater than or equal to the start of the
+ /// range.
+ pub fn end(&self) -> char {
+ self.end
+ }
+}
+
+/// A set of characters represented by arbitrary bytes (where one byte
+/// corresponds to one character).
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassBytes {
+ set: IntervalSet<ClassBytesRange>,
+}
+
+impl ClassBytes {
+ /// Create a new class from a sequence of ranges.
+ ///
+ /// The given ranges do not need to be in any specific order, and ranges
+ /// may overlap.
+ pub fn new<I>(ranges: I) -> ClassBytes
+ where
+ I: IntoIterator<Item = ClassBytesRange>,
+ {
+ ClassBytes { set: IntervalSet::new(ranges) }
+ }
+
+ /// Create a new class with no ranges.
+ pub fn empty() -> ClassBytes {
+ ClassBytes::new(vec![])
+ }
+
+ /// Add a new range to this set.
+ pub fn push(&mut self, range: ClassBytesRange) {
+ self.set.push(range);
+ }
+
+ /// Return an iterator over all ranges in this class.
+ ///
+ /// The iterator yields ranges in ascending order.
+ pub fn iter(&self) -> ClassBytesIter<'_> {
+ ClassBytesIter(self.set.iter())
+ }
+
+ /// Return the underlying ranges as a slice.
+ pub fn ranges(&self) -> &[ClassBytesRange] {
+ self.set.intervals()
+ }
+
+ /// Expand this character class such that it contains all case folded
+ /// characters. For example, if this class consists of the range `a-z`,
+ /// then applying case folding will result in the class containing both the
+ /// ranges `a-z` and `A-Z`.
+ ///
+ /// Note that this only applies ASCII case folding, which is limited to the
+ /// characters `a-z` and `A-Z`.
+ pub fn case_fold_simple(&mut self) {
+ self.set.case_fold_simple().expect("ASCII case folding never fails");
+ }
+
+ /// Negate this byte class.
+ ///
+ /// For all `b` where `b` is a any byte, if `b` was in this set, then it
+ /// will not be in this set after negation.
+ pub fn negate(&mut self) {
+ self.set.negate();
+ }
+
+ /// Union this byte class with the given byte class, in place.
+ pub fn union(&mut self, other: &ClassBytes) {
+ self.set.union(&other.set);
+ }
+
+ /// Intersect this byte class with the given byte class, in place.
+ pub fn intersect(&mut self, other: &ClassBytes) {
+ self.set.intersect(&other.set);
+ }
+
+ /// Subtract the given byte class from this byte class, in place.
+ pub fn difference(&mut self, other: &ClassBytes) {
+ self.set.difference(&other.set);
+ }
+
+ /// Compute the symmetric difference of the given byte classes, in place.
+ ///
+ /// This computes the symmetric difference of two byte classes. This
+ /// removes all elements in this class that are also in the given class,
+ /// but all adds all elements from the given class that aren't in this
+ /// class. That is, the class will contain all elements in either class,
+ /// but will not contain any elements that are in both classes.
+ pub fn symmetric_difference(&mut self, other: &ClassBytes) {
+ self.set.symmetric_difference(&other.set);
+ }
+
+ /// Returns true if and only if this character class will either match
+ /// nothing or only ASCII bytes. Stated differently, this returns false
+ /// if and only if this class contains a non-ASCII byte.
+ pub fn is_all_ascii(&self) -> bool {
+ self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
+ }
+}
+
+/// An iterator over all ranges in a byte character class.
+///
+/// The lifetime `'a` refers to the lifetime of the underlying class.
+#[derive(Debug)]
+pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
+
+impl<'a> Iterator for ClassBytesIter<'a> {
+ type Item = &'a ClassBytesRange;
+
+ fn next(&mut self) -> Option<&'a ClassBytesRange> {
+ self.0.next()
+ }
+}
+
+/// A single range of characters represented by arbitrary bytes.
+///
+/// The range is closed. That is, the start and end of the range are included
+/// in the range.
+#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
+pub struct ClassBytesRange {
+ start: u8,
+ end: u8,
+}
+
+impl Interval for ClassBytesRange {
+ type Bound = u8;
+
+ #[inline]
+ fn lower(&self) -> u8 {
+ self.start
+ }
+ #[inline]
+ fn upper(&self) -> u8 {
+ self.end
+ }
+ #[inline]
+ fn set_lower(&mut self, bound: u8) {
+ self.start = bound;
+ }
+ #[inline]
+ fn set_upper(&mut self, bound: u8) {
+ self.end = bound;
+ }
+
+ /// Apply simple case folding to this byte range. Only ASCII case mappings
+ /// (for a-z) are applied.
+ ///
+ /// Additional ranges are appended to the given vector. Canonical ordering
+ /// is *not* maintained in the given vector.
+ fn case_fold_simple(
+ &self,
+ ranges: &mut Vec<ClassBytesRange>,
+ ) -> Result<(), unicode::CaseFoldError> {
+ if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
+ let lower = cmp::max(self.start, b'a');
+ let upper = cmp::min(self.end, b'z');
+ ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
+ }
+ if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
+ let lower = cmp::max(self.start, b'A');
+ let upper = cmp::min(self.end, b'Z');
+ ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
+ }
+ Ok(())
+ }
+}
+
+impl ClassBytesRange {
+ /// Create a new byte range for a character class.
+ ///
+ /// The returned range is always in a canonical form. That is, the range
+ /// returned always satisfies the invariant that `start <= end`.
+ pub fn new(start: u8, end: u8) -> ClassBytesRange {
+ ClassBytesRange::create(start, end)
+ }
+
+ /// Return the start of this range.
+ ///
+ /// The start of a range is always less than or equal to the end of the
+ /// range.
+ pub fn start(&self) -> u8 {
+ self.start
+ }
+
+ /// Return the end of this range.
+ ///
+ /// The end of a range is always greater than or equal to the start of the
+ /// range.
+ pub fn end(&self) -> u8 {
+ self.end
+ }
+}
+
+impl fmt::Debug for ClassBytesRange {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ let mut debug = f.debug_struct("ClassBytesRange");
+ if self.start <= 0x7F {
+ debug.field("start", &(self.start as char));
+ } else {
+ debug.field("start", &self.start);
+ }
+ if self.end <= 0x7F {
+ debug.field("end", &(self.end as char));
+ } else {
+ debug.field("end", &self.end);
+ }
+ debug.finish()
+ }
+}
+
+/// The high-level intermediate representation for an anchor assertion.
+///
+/// A matching anchor assertion is always zero-length.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum Anchor {
+ /// Match the beginning of a line or the beginning of text. Specifically,
+ /// this matches at the starting position of the input, or at the position
+ /// immediately following a `\n` character.
+ StartLine,
+ /// Match the end of a line or the end of text. Specifically,
+ /// this matches at the end position of the input, or at the position
+ /// immediately preceding a `\n` character.
+ EndLine,
+ /// Match the beginning of text. Specifically, this matches at the starting
+ /// position of the input.
+ StartText,
+ /// Match the end of text. Specifically, this matches at the ending
+ /// position of the input.
+ EndText,
+}
+
+/// The high-level intermediate representation for a word-boundary assertion.
+///
+/// A matching word boundary assertion is always zero-length.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum WordBoundary {
+ /// Match a Unicode-aware word boundary. That is, this matches a position
+ /// where the left adjacent character and right adjacent character
+ /// correspond to a word and non-word or a non-word and word character.
+ Unicode,
+ /// Match a Unicode-aware negation of a word boundary.
+ UnicodeNegate,
+ /// Match an ASCII-only word boundary. That is, this matches a position
+ /// where the left adjacent character and right adjacent character
+ /// correspond to a word and non-word or a non-word and word character.
+ Ascii,
+ /// Match an ASCII-only negation of a word boundary.
+ AsciiNegate,
+}
+
+impl WordBoundary {
+ /// Returns true if and only if this word boundary assertion is negated.
+ pub fn is_negated(&self) -> bool {
+ match *self {
+ WordBoundary::Unicode | WordBoundary::Ascii => false,
+ WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true,
+ }
+ }
+}
+
+/// The high-level intermediate representation for a group.
+///
+/// This represents one of three possible group types:
+///
+/// 1. A non-capturing group (e.g., `(?:expr)`).
+/// 2. A capturing group (e.g., `(expr)`).
+/// 3. A named capturing group (e.g., `(?P<name>expr)`).
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Group {
+ /// The kind of this group. If it is a capturing group, then the kind
+ /// contains the capture group index (and the name, if it is a named
+ /// group).
+ pub kind: GroupKind,
+ /// The expression inside the capturing group, which may be empty.
+ pub hir: Box<Hir>,
+}
+
+/// The kind of group.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum GroupKind {
+ /// A normal unnamed capturing group.
+ ///
+ /// The value is the capture index of the group.
+ CaptureIndex(u32),
+ /// A named capturing group.
+ CaptureName {
+ /// The name of the group.
+ name: String,
+ /// The capture index of the group.
+ index: u32,
+ },
+ /// A non-capturing group.
+ NonCapturing,
+}
+
+/// The high-level intermediate representation of a repetition operator.
+///
+/// A repetition operator permits the repetition of an arbitrary
+/// sub-expression.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Repetition {
+ /// The kind of this repetition operator.
+ pub kind: RepetitionKind,
+ /// Whether this repetition operator is greedy or not. A greedy operator
+ /// will match as much as it can. A non-greedy operator will match as
+ /// little as it can.
+ ///
+ /// Typically, operators are greedy by default and are only non-greedy when
+ /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
+ /// not. However, this can be inverted via the `U` "ungreedy" flag.
+ pub greedy: bool,
+ /// The expression being repeated.
+ pub hir: Box<Hir>,
+}
+
+impl Repetition {
+ /// Returns true if and only if this repetition operator makes it possible
+ /// to match the empty string.
+ ///
+ /// Note that this is not defined inductively. For example, while `a*`
+ /// will report `true`, `()+` will not, even though `()` matches the empty
+ /// string and one or more occurrences of something that matches the empty
+ /// string will always match the empty string. In order to get the
+ /// inductive definition, see the corresponding method on
+ /// [`Hir`](struct.Hir.html).
+ pub fn is_match_empty(&self) -> bool {
+ match self.kind {
+ RepetitionKind::ZeroOrOne => true,
+ RepetitionKind::ZeroOrMore => true,
+ RepetitionKind::OneOrMore => false,
+ RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0,
+ RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0,
+ RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0,
+ }
+ }
+}
+
+/// The kind of a repetition operator.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum RepetitionKind {
+ /// Matches a sub-expression zero or one times.
+ ZeroOrOne,
+ /// Matches a sub-expression zero or more times.
+ ZeroOrMore,
+ /// Matches a sub-expression one or more times.
+ OneOrMore,
+ /// Matches a sub-expression within a bounded range of times.
+ Range(RepetitionRange),
+}
+
+/// The kind of a counted repetition operator.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum RepetitionRange {
+ /// Matches a sub-expression exactly this many times.
+ Exactly(u32),
+ /// Matches a sub-expression at least this many times.
+ AtLeast(u32),
+ /// Matches a sub-expression at least `m` times and at most `n` times.
+ Bounded(u32, u32),
+}
+
+/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
+/// space but heap space proportional to the depth of the total `Hir`.
+impl Drop for Hir {
+ fn drop(&mut self) {
+ use std::mem;
+
+ match *self.kind() {
+ HirKind::Empty
+ | HirKind::Literal(_)
+ | HirKind::Class(_)
+ | HirKind::Anchor(_)
+ | HirKind::WordBoundary(_) => return,
+ HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return,
+ HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return,
+ HirKind::Concat(ref x) if x.is_empty() => return,
+ HirKind::Alternation(ref x) if x.is_empty() => return,
+ _ => {}
+ }
+
+ let mut stack = vec![mem::replace(self, Hir::empty())];
+ while let Some(mut expr) = stack.pop() {
+ match expr.kind {
+ HirKind::Empty
+ | HirKind::Literal(_)
+ | HirKind::Class(_)
+ | HirKind::Anchor(_)
+ | HirKind::WordBoundary(_) => {}
+ HirKind::Group(ref mut x) => {
+ stack.push(mem::replace(&mut x.hir, Hir::empty()));
+ }
+ HirKind::Repetition(ref mut x) => {
+ stack.push(mem::replace(&mut x.hir, Hir::empty()));
+ }
+ HirKind::Concat(ref mut x) => {
+ stack.extend(x.drain(..));
+ }
+ HirKind::Alternation(ref mut x) => {
+ stack.extend(x.drain(..));
+ }
+ }
+ }
+ }
+}
+
+/// A type that documents various attributes of an HIR expression.
+///
+/// These attributes are typically defined inductively on the HIR.
+#[derive(Clone, Debug, Eq, PartialEq)]
+struct HirInfo {
+ /// Represent yes/no questions by a bitfield to conserve space, since
+ /// this is included in every HIR expression.
+ ///
+ /// If more attributes need to be added, it is OK to increase the size of
+ /// this as appropriate.
+ bools: u16,
+}
+
+// A simple macro for defining bitfield accessors/mutators.
+macro_rules! define_bool {
+ ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
+ fn $is_fn_name(&self) -> bool {
+ self.bools & (0b1 << $bit) > 0
+ }
+
+ fn $set_fn_name(&mut self, yes: bool) {
+ if yes {
+ self.bools |= 1 << $bit;
+ } else {
+ self.bools &= !(1 << $bit);
+ }
+ }
+ };
+}
+
+impl HirInfo {
+ fn new() -> HirInfo {
+ HirInfo { bools: 0 }
+ }
+
+ define_bool!(0, is_always_utf8, set_always_utf8);
+ define_bool!(1, is_all_assertions, set_all_assertions);
+ define_bool!(2, is_anchored_start, set_anchored_start);
+ define_bool!(3, is_anchored_end, set_anchored_end);
+ define_bool!(4, is_line_anchored_start, set_line_anchored_start);
+ define_bool!(5, is_line_anchored_end, set_line_anchored_end);
+ define_bool!(6, is_any_anchored_start, set_any_anchored_start);
+ define_bool!(7, is_any_anchored_end, set_any_anchored_end);
+ define_bool!(8, is_match_empty, set_match_empty);
+ define_bool!(9, is_literal, set_literal);
+ define_bool!(10, is_alternation_literal, set_alternation_literal);
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
+ let ranges: Vec<ClassUnicodeRange> = ranges
+ .iter()
+ .map(|&(s, e)| ClassUnicodeRange::new(s, e))
+ .collect();
+ ClassUnicode::new(ranges)
+ }
+
+ fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
+ let ranges: Vec<ClassBytesRange> =
+ ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
+ ClassBytes::new(ranges)
+ }
+
+ fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
+ cls.iter().map(|x| (x.start(), x.end())).collect()
+ }
+
+ #[cfg(feature = "unicode-case")]
+ fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
+ let mut cls_ = cls.clone();
+ cls_.case_fold_simple();
+ cls_
+ }
+
+ fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
+ let mut cls_ = cls1.clone();
+ cls_.union(cls2);
+ cls_
+ }
+
+ fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
+ let mut cls_ = cls1.clone();
+ cls_.intersect(cls2);
+ cls_
+ }
+
+ fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
+ let mut cls_ = cls1.clone();
+ cls_.difference(cls2);
+ cls_
+ }
+
+ fn usymdifference(
+ cls1: &ClassUnicode,
+ cls2: &ClassUnicode,
+ ) -> ClassUnicode {
+ let mut cls_ = cls1.clone();
+ cls_.symmetric_difference(cls2);
+ cls_
+ }
+
+ fn unegate(cls: &ClassUnicode) -> ClassUnicode {
+ let mut cls_ = cls.clone();
+ cls_.negate();
+ cls_
+ }
+
+ fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
+ cls.iter().map(|x| (x.start(), x.end())).collect()
+ }
+
+ fn bcasefold(cls: &ClassBytes) -> ClassBytes {
+ let mut cls_ = cls.clone();
+ cls_.case_fold_simple();
+ cls_
+ }
+
+ fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
+ let mut cls_ = cls1.clone();
+ cls_.union(cls2);
+ cls_
+ }
+
+ fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
+ let mut cls_ = cls1.clone();
+ cls_.intersect(cls2);
+ cls_
+ }
+
+ fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
+ let mut cls_ = cls1.clone();
+ cls_.difference(cls2);
+ cls_
+ }
+
+ fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
+ let mut cls_ = cls1.clone();
+ cls_.symmetric_difference(cls2);
+ cls_
+ }
+
+ fn bnegate(cls: &ClassBytes) -> ClassBytes {
+ let mut cls_ = cls.clone();
+ cls_.negate();
+ cls_
+ }
+
+ #[test]
+ fn class_range_canonical_unicode() {
+ let range = ClassUnicodeRange::new('\u{00FF}', '\0');
+ assert_eq!('\0', range.start());
+ assert_eq!('\u{00FF}', range.end());
+ }
+
+ #[test]
+ fn class_range_canonical_bytes() {
+ let range = ClassBytesRange::new(b'\xFF', b'\0');
+ assert_eq!(b'\0', range.start());
+ assert_eq!(b'\xFF', range.end());
+ }
+
+ #[test]
+ fn class_canonicalize_unicode() {
+ let cls = uclass(&[('a', 'c'), ('x', 'z')]);
+ let expected = vec![('a', 'c'), ('x', 'z')];
+ assert_eq!(expected, uranges(&cls));
+
+ let cls = uclass(&[('x', 'z'), ('a', 'c')]);
+ let expected = vec![('a', 'c'), ('x', 'z')];
+ assert_eq!(expected, uranges(&cls));
+
+ let cls = uclass(&[('x', 'z'), ('w', 'y')]);
+ let expected = vec![('w', 'z')];
+ assert_eq!(expected, uranges(&cls));
+
+ let cls = uclass(&[
+ ('c', 'f'),
+ ('a', 'g'),
+ ('d', 'j'),
+ ('a', 'c'),
+ ('m', 'p'),
+ ('l', 's'),
+ ]);
+ let expected = vec![('a', 'j'), ('l', 's')];
+ assert_eq!(expected, uranges(&cls));
+
+ let cls = uclass(&[('x', 'z'), ('u', 'w')]);
+ let expected = vec![('u', 'z')];
+ assert_eq!(expected, uranges(&cls));
+
+ let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
+ let expected = vec![('\x00', '\u{10FFFF}')];
+ assert_eq!(expected, uranges(&cls));
+
+ let cls = uclass(&[('a', 'a'), ('b', 'b')]);
+ let expected = vec![('a', 'b')];
+ assert_eq!(expected, uranges(&cls));
+ }
+
+ #[test]
+ fn class_canonicalize_bytes() {
+ let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
+ let expected = vec![(b'a', b'c'), (b'x', b'z')];
+ assert_eq!(expected, branges(&cls));
+
+ let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
+ let expected = vec![(b'a', b'c'), (b'x', b'z')];
+ assert_eq!(expected, branges(&cls));
+
+ let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
+ let expected = vec![(b'w', b'z')];
+ assert_eq!(expected, branges(&cls));
+
+ let cls = bclass(&[
+ (b'c', b'f'),
+ (b'a', b'g'),
+ (b'd', b'j'),
+ (b'a', b'c'),
+ (b'm', b'p'),
+ (b'l', b's'),
+ ]);
+ let expected = vec![(b'a', b'j'), (b'l', b's')];
+ assert_eq!(expected, branges(&cls));
+
+ let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
+ let expected = vec![(b'u', b'z')];
+ assert_eq!(expected, branges(&cls));
+
+ let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
+ let expected = vec![(b'\x00', b'\xFF')];
+ assert_eq!(expected, branges(&cls));
+
+ let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
+ let expected = vec![(b'a', b'b')];
+ assert_eq!(expected, branges(&cls));
+ }
+
+ #[test]
+ #[cfg(feature = "unicode-case")]
+ fn class_case_fold_unicode() {
+ let cls = uclass(&[
+ ('C', 'F'),
+ ('A', 'G'),
+ ('D', 'J'),
+ ('A', 'C'),
+ ('M', 'P'),
+ ('L', 'S'),
+ ('c', 'f'),
+ ]);
+ let expected = uclass(&[
+ ('A', 'J'),
+ ('L', 'S'),
+ ('a', 'j'),
+ ('l', 's'),
+ ('\u{17F}', '\u{17F}'),
+ ]);
+ assert_eq!(expected, ucasefold(&cls));
+
+ let cls = uclass(&[('A', 'Z')]);
+ let expected = uclass(&[
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('\u{17F}', '\u{17F}'),
+ ('\u{212A}', '\u{212A}'),
+ ]);
+ assert_eq!(expected, ucasefold(&cls));
+
+ let cls = uclass(&[('a', 'z')]);
+ let expected = uclass(&[
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('\u{17F}', '\u{17F}'),
+ ('\u{212A}', '\u{212A}'),
+ ]);
+ assert_eq!(expected, ucasefold(&cls));
+
+ let cls = uclass(&[('A', 'A'), ('_', '_')]);
+ let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
+ assert_eq!(expected, ucasefold(&cls));
+
+ let cls = uclass(&[('A', 'A'), ('=', '=')]);
+ let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
+ assert_eq!(expected, ucasefold(&cls));
+
+ let cls = uclass(&[('\x00', '\x10')]);
+ assert_eq!(cls, ucasefold(&cls));
+
+ let cls = uclass(&[('k', 'k')]);
+ let expected =
+ uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
+ assert_eq!(expected, ucasefold(&cls));
+
+ let cls = uclass(&[('@', '@')]);
+ assert_eq!(cls, ucasefold(&cls));
+ }
+
+ #[test]
+ #[cfg(not(feature = "unicode-case"))]
+ fn class_case_fold_unicode_disabled() {
+ let mut cls = uclass(&[
+ ('C', 'F'),
+ ('A', 'G'),
+ ('D', 'J'),
+ ('A', 'C'),
+ ('M', 'P'),
+ ('L', 'S'),
+ ('c', 'f'),
+ ]);
+ assert!(cls.try_case_fold_simple().is_err());
+ }
+
+ #[test]
+ #[should_panic]
+ #[cfg(not(feature = "unicode-case"))]
+ fn class_case_fold_unicode_disabled_panics() {
+ let mut cls = uclass(&[
+ ('C', 'F'),
+ ('A', 'G'),
+ ('D', 'J'),
+ ('A', 'C'),
+ ('M', 'P'),
+ ('L', 'S'),
+ ('c', 'f'),
+ ]);
+ cls.case_fold_simple();
+ }
+
+ #[test]
+ fn class_case_fold_bytes() {
+ let cls = bclass(&[
+ (b'C', b'F'),
+ (b'A', b'G'),
+ (b'D', b'J'),
+ (b'A', b'C'),
+ (b'M', b'P'),
+ (b'L', b'S'),
+ (b'c', b'f'),
+ ]);
+ let expected =
+ bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
+ assert_eq!(expected, bcasefold(&cls));
+
+ let cls = bclass(&[(b'A', b'Z')]);
+ let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
+ assert_eq!(expected, bcasefold(&cls));
+
+ let cls = bclass(&[(b'a', b'z')]);
+ let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
+ assert_eq!(expected, bcasefold(&cls));
+
+ let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
+ let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
+ assert_eq!(expected, bcasefold(&cls));
+
+ let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
+ let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
+ assert_eq!(expected, bcasefold(&cls));
+
+ let cls = bclass(&[(b'\x00', b'\x10')]);
+ assert_eq!(cls, bcasefold(&cls));
+
+ let cls = bclass(&[(b'k', b'k')]);
+ let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
+ assert_eq!(expected, bcasefold(&cls));
+
+ let cls = bclass(&[(b'@', b'@')]);
+ assert_eq!(cls, bcasefold(&cls));
+ }
+
+ #[test]
+ fn class_negate_unicode() {
+ let cls = uclass(&[('a', 'a')]);
+ let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('a', 'a'), ('b', 'b')]);
+ let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('a', 'c'), ('x', 'z')]);
+ let expected = uclass(&[
+ ('\x00', '\x60'),
+ ('\x64', '\x77'),
+ ('\x7B', '\u{10FFFF}'),
+ ]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('\x00', 'a')]);
+ let expected = uclass(&[('\x62', '\u{10FFFF}')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('a', '\u{10FFFF}')]);
+ let expected = uclass(&[('\x00', '\x60')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('\x00', '\u{10FFFF}')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[]);
+ let expected = uclass(&[('\x00', '\u{10FFFF}')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls =
+ uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
+ let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('\x00', '\u{D7FF}')]);
+ let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('\x00', '\u{D7FE}')]);
+ let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
+ let expected = uclass(&[('\x00', '\u{D7FF}')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
+ let expected = uclass(&[('\x00', '\u{E000}')]);
+ assert_eq!(expected, unegate(&cls));
+ }
+
+ #[test]
+ fn class_negate_bytes() {
+ let cls = bclass(&[(b'a', b'a')]);
+ let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
+ assert_eq!(expected, bnegate(&cls));
+
+ let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
+ let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
+ assert_eq!(expected, bnegate(&cls));
+
+ let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
+ let expected = bclass(&[
+ (b'\x00', b'\x60'),
+ (b'\x64', b'\x77'),
+ (b'\x7B', b'\xFF'),
+ ]);
+ assert_eq!(expected, bnegate(&cls));
+
+ let cls = bclass(&[(b'\x00', b'a')]);
+ let expected = bclass(&[(b'\x62', b'\xFF')]);
+ assert_eq!(expected, bnegate(&cls));
+
+ let cls = bclass(&[(b'a', b'\xFF')]);
+ let expected = bclass(&[(b'\x00', b'\x60')]);
+ assert_eq!(expected, bnegate(&cls));
+
+ let cls = bclass(&[(b'\x00', b'\xFF')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bnegate(&cls));
+
+ let cls = bclass(&[]);
+ let expected = bclass(&[(b'\x00', b'\xFF')]);
+ assert_eq!(expected, bnegate(&cls));
+
+ let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
+ let expected = bclass(&[(b'\xFE', b'\xFE')]);
+ assert_eq!(expected, bnegate(&cls));
+ }
+
+ #[test]
+ fn class_union_unicode() {
+ let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
+ let cls2 = uclass(&[('a', 'z')]);
+ let expected = uclass(&[('a', 'z'), ('A', 'C')]);
+ assert_eq!(expected, uunion(&cls1, &cls2));
+ }
+
+ #[test]
+ fn class_union_bytes() {
+ let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
+ let cls2 = bclass(&[(b'a', b'z')]);
+ let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
+ assert_eq!(expected, bunion(&cls1, &cls2));
+ }
+
+ #[test]
+ fn class_intersect_unicode() {
+ let cls1 = uclass(&[]);
+ let cls2 = uclass(&[('a', 'a')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'a')]);
+ let cls2 = uclass(&[('a', 'a')]);
+ let expected = uclass(&[('a', 'a')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'a')]);
+ let cls2 = uclass(&[('b', 'b')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'a')]);
+ let cls2 = uclass(&[('a', 'c')]);
+ let expected = uclass(&[('a', 'a')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b')]);
+ let cls2 = uclass(&[('a', 'c')]);
+ let expected = uclass(&[('a', 'b')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b')]);
+ let cls2 = uclass(&[('b', 'c')]);
+ let expected = uclass(&[('b', 'b')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b')]);
+ let cls2 = uclass(&[('c', 'd')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('b', 'c')]);
+ let cls2 = uclass(&[('a', 'd')]);
+ let expected = uclass(&[('b', 'c')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
+ let cls2 = uclass(&[('a', 'h')]);
+ let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
+ let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
+ let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
+ let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
+ let cls2 = uclass(&[('h', 'h')]);
+ let expected = uclass(&[('h', 'h')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
+ let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
+ let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
+ let expected = uclass(&[('b', 'f')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+ }
+
+ #[test]
+ fn class_intersect_bytes() {
+ let cls1 = bclass(&[]);
+ let cls2 = bclass(&[(b'a', b'a')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'a')]);
+ let cls2 = bclass(&[(b'a', b'a')]);
+ let expected = bclass(&[(b'a', b'a')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'a')]);
+ let cls2 = bclass(&[(b'b', b'b')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'a')]);
+ let cls2 = bclass(&[(b'a', b'c')]);
+ let expected = bclass(&[(b'a', b'a')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b')]);
+ let cls2 = bclass(&[(b'a', b'c')]);
+ let expected = bclass(&[(b'a', b'b')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b')]);
+ let cls2 = bclass(&[(b'b', b'c')]);
+ let expected = bclass(&[(b'b', b'b')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b')]);
+ let cls2 = bclass(&[(b'c', b'd')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'b', b'c')]);
+ let cls2 = bclass(&[(b'a', b'd')]);
+ let expected = bclass(&[(b'b', b'c')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
+ let cls2 = bclass(&[(b'a', b'h')]);
+ let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
+ let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
+ let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
+ let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
+ let cls2 = bclass(&[(b'h', b'h')]);
+ let expected = bclass(&[(b'h', b'h')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
+ let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
+ let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
+ let expected = bclass(&[(b'b', b'f')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+ }
+
+ #[test]
+ fn class_difference_unicode() {
+ let cls1 = uclass(&[('a', 'a')]);
+ let cls2 = uclass(&[('a', 'a')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'a')]);
+ let cls2 = uclass(&[]);
+ let expected = uclass(&[('a', 'a')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[]);
+ let cls2 = uclass(&[('a', 'a')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'z')]);
+ let cls2 = uclass(&[('a', 'a')]);
+ let expected = uclass(&[('b', 'z')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'z')]);
+ let cls2 = uclass(&[('z', 'z')]);
+ let expected = uclass(&[('a', 'y')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'z')]);
+ let cls2 = uclass(&[('m', 'm')]);
+ let expected = uclass(&[('a', 'l'), ('n', 'z')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
+ let cls2 = uclass(&[('a', 'z')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
+ let cls2 = uclass(&[('d', 'v')]);
+ let expected = uclass(&[('a', 'c')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
+ let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
+ let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
+ let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
+ let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('x', 'z')]);
+ let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
+ let expected = uclass(&[('x', 'z')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'z')]);
+ let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
+ let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+ }
+
+ #[test]
+ fn class_difference_bytes() {
+ let cls1 = bclass(&[(b'a', b'a')]);
+ let cls2 = bclass(&[(b'a', b'a')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'a')]);
+ let cls2 = bclass(&[]);
+ let expected = bclass(&[(b'a', b'a')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[]);
+ let cls2 = bclass(&[(b'a', b'a')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'z')]);
+ let cls2 = bclass(&[(b'a', b'a')]);
+ let expected = bclass(&[(b'b', b'z')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'z')]);
+ let cls2 = bclass(&[(b'z', b'z')]);
+ let expected = bclass(&[(b'a', b'y')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'z')]);
+ let cls2 = bclass(&[(b'm', b'm')]);
+ let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
+ let cls2 = bclass(&[(b'a', b'z')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
+ let cls2 = bclass(&[(b'd', b'v')]);
+ let expected = bclass(&[(b'a', b'c')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
+ let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
+ let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
+ let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
+ let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'x', b'z')]);
+ let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
+ let expected = bclass(&[(b'x', b'z')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'z')]);
+ let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
+ let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+ }
+
+ #[test]
+ fn class_symmetric_difference_unicode() {
+ let cls1 = uclass(&[('a', 'm')]);
+ let cls2 = uclass(&[('g', 't')]);
+ let expected = uclass(&[('a', 'f'), ('n', 't')]);
+ assert_eq!(expected, usymdifference(&cls1, &cls2));
+ }
+
+ #[test]
+ fn class_symmetric_difference_bytes() {
+ let cls1 = bclass(&[(b'a', b'm')]);
+ let cls2 = bclass(&[(b'g', b't')]);
+ let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
+ assert_eq!(expected, bsymdifference(&cls1, &cls2));
+ }
+
+ #[test]
+ #[should_panic]
+ fn hir_byte_literal_non_ascii() {
+ Hir::literal(Literal::Byte(b'a'));
+ }
+
+ // We use a thread with an explicit stack size to test that our destructor
+ // for Hir can handle arbitrarily sized expressions in constant stack
+ // space. In case we run on a platform without threads (WASM?), we limit
+ // this test to Windows/Unix.
+ #[test]
+ #[cfg(any(unix, windows))]
+ fn no_stack_overflow_on_drop() {
+ use std::thread;
+
+ let run = || {
+ let mut expr = Hir::empty();
+ for _ in 0..100 {
+ expr = Hir::group(Group {
+ kind: GroupKind::NonCapturing,
+ hir: Box::new(expr),
+ });
+ expr = Hir::repetition(Repetition {
+ kind: RepetitionKind::ZeroOrOne,
+ greedy: true,
+ hir: Box::new(expr),
+ });
+
+ expr = Hir {
+ kind: HirKind::Concat(vec![expr]),
+ info: HirInfo::new(),
+ };
+ expr = Hir {
+ kind: HirKind::Alternation(vec![expr]),
+ info: HirInfo::new(),
+ };
+ }
+ assert!(!expr.kind.is_empty());
+ };
+
+ // We run our test on a thread with a small stack size so we can
+ // force the issue more easily.
+ thread::Builder::new()
+ .stack_size(1 << 10)
+ .spawn(run)
+ .unwrap()
+ .join()
+ .unwrap();
+ }
+}
diff --git a/third_party/rust/regex-syntax/src/hir/print.rs b/third_party/rust/regex-syntax/src/hir/print.rs
new file mode 100644
index 0000000000..b71f3897cf
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/hir/print.rs
@@ -0,0 +1,367 @@
+/*!
+This module provides a regular expression printer for `Hir`.
+*/
+
+use std::fmt;
+
+use crate::hir::visitor::{self, Visitor};
+use crate::hir::{self, Hir, HirKind};
+use crate::is_meta_character;
+
+/// A builder for constructing a printer.
+///
+/// Note that since a printer doesn't have any configuration knobs, this type
+/// remains unexported.
+#[derive(Clone, Debug)]
+struct PrinterBuilder {
+ _priv: (),
+}
+
+impl Default for PrinterBuilder {
+ fn default() -> PrinterBuilder {
+ PrinterBuilder::new()
+ }
+}
+
+impl PrinterBuilder {
+ fn new() -> PrinterBuilder {
+ PrinterBuilder { _priv: () }
+ }
+
+ fn build(&self) -> Printer {
+ Printer { _priv: () }
+ }
+}
+
+/// A printer for a regular expression's high-level intermediate
+/// representation.
+///
+/// A printer converts a high-level intermediate representation (HIR) to a
+/// regular expression pattern string. This particular printer uses constant
+/// stack space and heap space proportional to the size of the HIR.
+///
+/// Since this printer is only using the HIR, the pattern it prints will likely
+/// not resemble the original pattern at all. For example, a pattern like
+/// `\pL` will have its entire class written out.
+///
+/// The purpose of this printer is to provide a means to mutate an HIR and then
+/// build a regular expression from the result of that mutation. (A regex
+/// library could provide a constructor from this HIR explicitly, but that
+/// creates an unnecessary public coupling between the regex library and this
+/// specific HIR representation.)
+#[derive(Debug)]
+pub struct Printer {
+ _priv: (),
+}
+
+impl Printer {
+ /// Create a new printer.
+ pub fn new() -> Printer {
+ PrinterBuilder::new().build()
+ }
+
+ /// Print the given `Ast` to the given writer. The writer must implement
+ /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
+ /// here are a `fmt::Formatter` (which is available in `fmt::Display`
+ /// implementations) or a `&mut String`.
+ pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result {
+ visitor::visit(hir, Writer { wtr })
+ }
+}
+
+#[derive(Debug)]
+struct Writer<W> {
+ wtr: W,
+}
+
+impl<W: fmt::Write> Visitor for Writer<W> {
+ type Output = ();
+ type Err = fmt::Error;
+
+ fn finish(self) -> fmt::Result {
+ Ok(())
+ }
+
+ fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
+ match *hir.kind() {
+ HirKind::Empty
+ | HirKind::Repetition(_)
+ | HirKind::Concat(_)
+ | HirKind::Alternation(_) => {}
+ HirKind::Literal(hir::Literal::Unicode(c)) => {
+ self.write_literal_char(c)?;
+ }
+ HirKind::Literal(hir::Literal::Byte(b)) => {
+ self.write_literal_byte(b)?;
+ }
+ HirKind::Class(hir::Class::Unicode(ref cls)) => {
+ self.wtr.write_str("[")?;
+ for range in cls.iter() {
+ if range.start() == range.end() {
+ self.write_literal_char(range.start())?;
+ } else {
+ self.write_literal_char(range.start())?;
+ self.wtr.write_str("-")?;
+ self.write_literal_char(range.end())?;
+ }
+ }
+ self.wtr.write_str("]")?;
+ }
+ HirKind::Class(hir::Class::Bytes(ref cls)) => {
+ self.wtr.write_str("(?-u:[")?;
+ for range in cls.iter() {
+ if range.start() == range.end() {
+ self.write_literal_class_byte(range.start())?;
+ } else {
+ self.write_literal_class_byte(range.start())?;
+ self.wtr.write_str("-")?;
+ self.write_literal_class_byte(range.end())?;
+ }
+ }
+ self.wtr.write_str("])")?;
+ }
+ HirKind::Anchor(hir::Anchor::StartLine) => {
+ self.wtr.write_str("(?m:^)")?;
+ }
+ HirKind::Anchor(hir::Anchor::EndLine) => {
+ self.wtr.write_str("(?m:$)")?;
+ }
+ HirKind::Anchor(hir::Anchor::StartText) => {
+ self.wtr.write_str(r"\A")?;
+ }
+ HirKind::Anchor(hir::Anchor::EndText) => {
+ self.wtr.write_str(r"\z")?;
+ }
+ HirKind::WordBoundary(hir::WordBoundary::Unicode) => {
+ self.wtr.write_str(r"\b")?;
+ }
+ HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => {
+ self.wtr.write_str(r"\B")?;
+ }
+ HirKind::WordBoundary(hir::WordBoundary::Ascii) => {
+ self.wtr.write_str(r"(?-u:\b)")?;
+ }
+ HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => {
+ self.wtr.write_str(r"(?-u:\B)")?;
+ }
+ HirKind::Group(ref x) => match x.kind {
+ hir::GroupKind::CaptureIndex(_) => {
+ self.wtr.write_str("(")?;
+ }
+ hir::GroupKind::CaptureName { ref name, .. } => {
+ write!(self.wtr, "(?P<{}>", name)?;
+ }
+ hir::GroupKind::NonCapturing => {
+ self.wtr.write_str("(?:")?;
+ }
+ },
+ }
+ Ok(())
+ }
+
+ fn visit_post(&mut self, hir: &Hir) -> fmt::Result {
+ match *hir.kind() {
+ // Handled during visit_pre
+ HirKind::Empty
+ | HirKind::Literal(_)
+ | HirKind::Class(_)
+ | HirKind::Anchor(_)
+ | HirKind::WordBoundary(_)
+ | HirKind::Concat(_)
+ | HirKind::Alternation(_) => {}
+ HirKind::Repetition(ref x) => {
+ match x.kind {
+ hir::RepetitionKind::ZeroOrOne => {
+ self.wtr.write_str("?")?;
+ }
+ hir::RepetitionKind::ZeroOrMore => {
+ self.wtr.write_str("*")?;
+ }
+ hir::RepetitionKind::OneOrMore => {
+ self.wtr.write_str("+")?;
+ }
+ hir::RepetitionKind::Range(ref x) => match *x {
+ hir::RepetitionRange::Exactly(m) => {
+ write!(self.wtr, "{{{}}}", m)?;
+ }
+ hir::RepetitionRange::AtLeast(m) => {
+ write!(self.wtr, "{{{},}}", m)?;
+ }
+ hir::RepetitionRange::Bounded(m, n) => {
+ write!(self.wtr, "{{{},{}}}", m, n)?;
+ }
+ },
+ }
+ if !x.greedy {
+ self.wtr.write_str("?")?;
+ }
+ }
+ HirKind::Group(_) => {
+ self.wtr.write_str(")")?;
+ }
+ }
+ Ok(())
+ }
+
+ fn visit_alternation_in(&mut self) -> fmt::Result {
+ self.wtr.write_str("|")
+ }
+}
+
+impl<W: fmt::Write> Writer<W> {
+ fn write_literal_char(&mut self, c: char) -> fmt::Result {
+ if is_meta_character(c) {
+ self.wtr.write_str("\\")?;
+ }
+ self.wtr.write_char(c)
+ }
+
+ fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
+ let c = b as char;
+ if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
+ self.write_literal_char(c)
+ } else {
+ write!(self.wtr, "(?-u:\\x{:02X})", b)
+ }
+ }
+
+ fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
+ let c = b as char;
+ if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
+ self.write_literal_char(c)
+ } else {
+ write!(self.wtr, "\\x{:02X}", b)
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::Printer;
+ use crate::ParserBuilder;
+
+ fn roundtrip(given: &str, expected: &str) {
+ roundtrip_with(|b| b, given, expected);
+ }
+
+ fn roundtrip_bytes(given: &str, expected: &str) {
+ roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected);
+ }
+
+ fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
+ where
+ F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
+ {
+ let mut builder = ParserBuilder::new();
+ f(&mut builder);
+ let hir = builder.build().parse(given).unwrap();
+
+ let mut printer = Printer::new();
+ let mut dst = String::new();
+ printer.print(&hir, &mut dst).unwrap();
+
+ // Check that the result is actually valid.
+ builder.build().parse(&dst).unwrap();
+
+ assert_eq!(expected, dst);
+ }
+
+ #[test]
+ fn print_literal() {
+ roundtrip("a", "a");
+ roundtrip(r"\xff", "\u{FF}");
+ roundtrip_bytes(r"\xff", "\u{FF}");
+ roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)");
+ roundtrip("☃", "☃");
+ }
+
+ #[test]
+ fn print_class() {
+ roundtrip(r"[a]", r"[a]");
+ roundtrip(r"[a-z]", r"[a-z]");
+ roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]");
+ roundtrip(r"[^\x01-\u{10FFFF}]", "[\u{0}]");
+ roundtrip(r"[-]", r"[\-]");
+ roundtrip(r"[☃-⛄]", r"[☃-⛄]");
+
+ roundtrip(r"(?-u)[a]", r"(?-u:[a])");
+ roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
+ roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");
+
+ // The following test that the printer escapes meta characters
+ // in character classes.
+ roundtrip(r"[\[]", r"[\[]");
+ roundtrip(r"[Z-_]", r"[Z-_]");
+ roundtrip(r"[Z-_--Z]", r"[\[-_]");
+
+ // The following test that the printer escapes meta characters
+ // in byte oriented character classes.
+ roundtrip_bytes(r"(?-u)[\[]", r"(?-u:[\[])");
+ roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])");
+ roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])");
+ }
+
+ #[test]
+ fn print_anchor() {
+ roundtrip(r"^", r"\A");
+ roundtrip(r"$", r"\z");
+ roundtrip(r"(?m)^", r"(?m:^)");
+ roundtrip(r"(?m)$", r"(?m:$)");
+ }
+
+ #[test]
+ fn print_word_boundary() {
+ roundtrip(r"\b", r"\b");
+ roundtrip(r"\B", r"\B");
+ roundtrip(r"(?-u)\b", r"(?-u:\b)");
+ roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
+ }
+
+ #[test]
+ fn print_repetition() {
+ roundtrip("a?", "a?");
+ roundtrip("a??", "a??");
+ roundtrip("(?U)a?", "a??");
+
+ roundtrip("a*", "a*");
+ roundtrip("a*?", "a*?");
+ roundtrip("(?U)a*", "a*?");
+
+ roundtrip("a+", "a+");
+ roundtrip("a+?", "a+?");
+ roundtrip("(?U)a+", "a+?");
+
+ roundtrip("a{1}", "a{1}");
+ roundtrip("a{1,}", "a{1,}");
+ roundtrip("a{1,5}", "a{1,5}");
+ roundtrip("a{1}?", "a{1}?");
+ roundtrip("a{1,}?", "a{1,}?");
+ roundtrip("a{1,5}?", "a{1,5}?");
+ roundtrip("(?U)a{1}", "a{1}?");
+ roundtrip("(?U)a{1,}", "a{1,}?");
+ roundtrip("(?U)a{1,5}", "a{1,5}?");
+ }
+
+ #[test]
+ fn print_group() {
+ roundtrip("()", "()");
+ roundtrip("(?P<foo>)", "(?P<foo>)");
+ roundtrip("(?:)", "(?:)");
+
+ roundtrip("(a)", "(a)");
+ roundtrip("(?P<foo>a)", "(?P<foo>a)");
+ roundtrip("(?:a)", "(?:a)");
+
+ roundtrip("((((a))))", "((((a))))");
+ }
+
+ #[test]
+ fn print_alternation() {
+ roundtrip("|", "|");
+ roundtrip("||", "||");
+
+ roundtrip("a|b", "a|b");
+ roundtrip("a|b|c", "a|b|c");
+ roundtrip("foo|bar|quux", "foo|bar|quux");
+ }
+}
diff --git a/third_party/rust/regex-syntax/src/hir/translate.rs b/third_party/rust/regex-syntax/src/hir/translate.rs
new file mode 100644
index 0000000000..890e1608b3
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/hir/translate.rs
@@ -0,0 +1,3207 @@
+/*!
+Defines a translator that converts an `Ast` to an `Hir`.
+*/
+
+use std::cell::{Cell, RefCell};
+use std::result;
+
+use crate::ast::{self, Ast, Span, Visitor};
+use crate::hir::{self, Error, ErrorKind, Hir};
+use crate::unicode::{self, ClassQuery};
+
+type Result<T> = result::Result<T, Error>;
+
+/// A builder for constructing an AST->HIR translator.
+#[derive(Clone, Debug)]
+pub struct TranslatorBuilder {
+ allow_invalid_utf8: bool,
+ flags: Flags,
+}
+
+impl Default for TranslatorBuilder {
+ fn default() -> TranslatorBuilder {
+ TranslatorBuilder::new()
+ }
+}
+
+impl TranslatorBuilder {
+ /// Create a new translator builder with a default c onfiguration.
+ pub fn new() -> TranslatorBuilder {
+ TranslatorBuilder {
+ allow_invalid_utf8: false,
+ flags: Flags::default(),
+ }
+ }
+
+ /// Build a translator using the current configuration.
+ pub fn build(&self) -> Translator {
+ Translator {
+ stack: RefCell::new(vec![]),
+ flags: Cell::new(self.flags),
+ allow_invalid_utf8: self.allow_invalid_utf8,
+ }
+ }
+
+ /// When enabled, translation will permit the construction of a regular
+ /// expression that may match invalid UTF-8.
+ ///
+ /// When disabled (the default), the translator is guaranteed to produce
+ /// an expression that will only ever match valid UTF-8 (otherwise, the
+ /// translator will return an error).
+ ///
+ /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII
+ /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause
+ /// the parser to return an error. Namely, a negated ASCII word boundary
+ /// can result in matching positions that aren't valid UTF-8 boundaries.
+ pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
+ self.allow_invalid_utf8 = yes;
+ self
+ }
+
+ /// Enable or disable the case insensitive flag (`i`) by default.
+ pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
+ self.flags.case_insensitive = if yes { Some(true) } else { None };
+ self
+ }
+
+ /// Enable or disable the multi-line matching flag (`m`) by default.
+ pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder {
+ self.flags.multi_line = if yes { Some(true) } else { None };
+ self
+ }
+
+ /// Enable or disable the "dot matches any character" flag (`s`) by
+ /// default.
+ pub fn dot_matches_new_line(
+ &mut self,
+ yes: bool,
+ ) -> &mut TranslatorBuilder {
+ self.flags.dot_matches_new_line = if yes { Some(true) } else { None };
+ self
+ }
+
+ /// Enable or disable the "swap greed" flag (`U`) by default.
+ pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder {
+ self.flags.swap_greed = if yes { Some(true) } else { None };
+ self
+ }
+
+ /// Enable or disable the Unicode flag (`u`) by default.
+ pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder {
+ self.flags.unicode = if yes { None } else { Some(false) };
+ self
+ }
+}
+
+/// A translator maps abstract syntax to a high level intermediate
+/// representation.
+///
+/// A translator may be benefit from reuse. That is, a translator can translate
+/// many abstract syntax trees.
+///
+/// A `Translator` can be configured in more detail via a
+/// [`TranslatorBuilder`](struct.TranslatorBuilder.html).
+#[derive(Clone, Debug)]
+pub struct Translator {
+ /// Our call stack, but on the heap.
+ stack: RefCell<Vec<HirFrame>>,
+ /// The current flag settings.
+ flags: Cell<Flags>,
+ /// Whether we're allowed to produce HIR that can match arbitrary bytes.
+ allow_invalid_utf8: bool,
+}
+
+impl Translator {
+ /// Create a new translator using the default configuration.
+ pub fn new() -> Translator {
+ TranslatorBuilder::new().build()
+ }
+
+ /// Translate the given abstract syntax tree (AST) into a high level
+ /// intermediate representation (HIR).
+ ///
+ /// If there was a problem doing the translation, then an HIR-specific
+ /// error is returned.
+ ///
+ /// The original pattern string used to produce the `Ast` *must* also be
+ /// provided. The translator does not use the pattern string during any
+ /// correct translation, but is used for error reporting.
+ pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> {
+ ast::visit(ast, TranslatorI::new(self, pattern))
+ }
+}
+
+/// An HirFrame is a single stack frame, represented explicitly, which is
+/// created for each item in the Ast that we traverse.
+///
+/// Note that technically, this type doesn't represent our entire stack
+/// frame. In particular, the Ast visitor represents any state associated with
+/// traversing the Ast itself.
+#[derive(Clone, Debug)]
+enum HirFrame {
+ /// An arbitrary HIR expression. These get pushed whenever we hit a base
+ /// case in the Ast. They get popped after an inductive (i.e., recursive)
+ /// step is complete.
+ Expr(Hir),
+ /// A Unicode character class. This frame is mutated as we descend into
+ /// the Ast of a character class (which is itself its own mini recursive
+ /// structure).
+ ClassUnicode(hir::ClassUnicode),
+ /// A byte-oriented character class. This frame is mutated as we descend
+ /// into the Ast of a character class (which is itself its own mini
+ /// recursive structure).
+ ///
+ /// Byte character classes are created when Unicode mode (`u`) is disabled.
+ /// If `allow_invalid_utf8` is disabled (the default), then a byte
+ /// character is only permitted to match ASCII text.
+ ClassBytes(hir::ClassBytes),
+ /// This is pushed on to the stack upon first seeing any kind of group,
+ /// indicated by parentheses (including non-capturing groups). It is popped
+ /// upon leaving a group.
+ Group {
+ /// The old active flags when this group was opened.
+ ///
+ /// If this group sets flags, then the new active flags are set to the
+ /// result of merging the old flags with the flags introduced by this
+ /// group. If the group doesn't set any flags, then this is simply
+ /// equivalent to whatever flags were set when the group was opened.
+ ///
+ /// When this group is popped, the active flags should be restored to
+ /// the flags set here.
+ ///
+ /// The "active" flags correspond to whatever flags are set in the
+ /// Translator.
+ old_flags: Flags,
+ },
+ /// This is pushed whenever a concatenation is observed. After visiting
+ /// every sub-expression in the concatenation, the translator's stack is
+ /// popped until it sees a Concat frame.
+ Concat,
+ /// This is pushed whenever an alternation is observed. After visiting
+ /// every sub-expression in the alternation, the translator's stack is
+ /// popped until it sees an Alternation frame.
+ Alternation,
+}
+
+impl HirFrame {
+ /// Assert that the current stack frame is an Hir expression and return it.
+ fn unwrap_expr(self) -> Hir {
+ match self {
+ HirFrame::Expr(expr) => expr,
+ _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self),
+ }
+ }
+
+ /// Assert that the current stack frame is a Unicode class expression and
+ /// return it.
+ fn unwrap_class_unicode(self) -> hir::ClassUnicode {
+ match self {
+ HirFrame::ClassUnicode(cls) => cls,
+ _ => panic!(
+ "tried to unwrap Unicode class \
+ from HirFrame, got: {:?}",
+ self
+ ),
+ }
+ }
+
+ /// Assert that the current stack frame is a byte class expression and
+ /// return it.
+ fn unwrap_class_bytes(self) -> hir::ClassBytes {
+ match self {
+ HirFrame::ClassBytes(cls) => cls,
+ _ => panic!(
+ "tried to unwrap byte class \
+ from HirFrame, got: {:?}",
+ self
+ ),
+ }
+ }
+
+ /// Assert that the current stack frame is a group indicator and return
+ /// its corresponding flags (the flags that were active at the time the
+ /// group was entered).
+ fn unwrap_group(self) -> Flags {
+ match self {
+ HirFrame::Group { old_flags } => old_flags,
+ _ => {
+ panic!("tried to unwrap group from HirFrame, got: {:?}", self)
+ }
+ }
+ }
+}
+
+impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
+ type Output = Hir;
+ type Err = Error;
+
+ fn finish(self) -> Result<Hir> {
+ // ... otherwise, we should have exactly one HIR on the stack.
+ assert_eq!(self.trans().stack.borrow().len(), 1);
+ Ok(self.pop().unwrap().unwrap_expr())
+ }
+
+ fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
+ match *ast {
+ Ast::Class(ast::Class::Bracketed(_)) => {
+ if self.flags().unicode() {
+ let cls = hir::ClassUnicode::empty();
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let cls = hir::ClassBytes::empty();
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ Ast::Group(ref x) => {
+ let old_flags = x
+ .flags()
+ .map(|ast| self.set_flags(ast))
+ .unwrap_or_else(|| self.flags());
+ self.push(HirFrame::Group { old_flags });
+ }
+ Ast::Concat(ref x) if x.asts.is_empty() => {}
+ Ast::Concat(_) => {
+ self.push(HirFrame::Concat);
+ }
+ Ast::Alternation(ref x) if x.asts.is_empty() => {}
+ Ast::Alternation(_) => {
+ self.push(HirFrame::Alternation);
+ }
+ _ => {}
+ }
+ Ok(())
+ }
+
+ fn visit_post(&mut self, ast: &Ast) -> Result<()> {
+ match *ast {
+ Ast::Empty(_) => {
+ self.push(HirFrame::Expr(Hir::empty()));
+ }
+ Ast::Flags(ref x) => {
+ self.set_flags(&x.flags);
+ // Flags in the AST are generally considered directives and
+ // not actual sub-expressions. However, they can be used in
+ // the concrete syntax like `((?i))`, and we need some kind of
+ // indication of an expression there, and Empty is the correct
+ // choice.
+ //
+ // There can also be things like `(?i)+`, but we rule those out
+ // in the parser. In the future, we might allow them for
+ // consistency sake.
+ self.push(HirFrame::Expr(Hir::empty()));
+ }
+ Ast::Literal(ref x) => {
+ self.push(HirFrame::Expr(self.hir_literal(x)?));
+ }
+ Ast::Dot(span) => {
+ self.push(HirFrame::Expr(self.hir_dot(span)?));
+ }
+ Ast::Assertion(ref x) => {
+ self.push(HirFrame::Expr(self.hir_assertion(x)?));
+ }
+ Ast::Class(ast::Class::Perl(ref x)) => {
+ if self.flags().unicode() {
+ let cls = self.hir_perl_unicode_class(x)?;
+ let hcls = hir::Class::Unicode(cls);
+ self.push(HirFrame::Expr(Hir::class(hcls)));
+ } else {
+ let cls = self.hir_perl_byte_class(x);
+ let hcls = hir::Class::Bytes(cls);
+ self.push(HirFrame::Expr(Hir::class(hcls)));
+ }
+ }
+ Ast::Class(ast::Class::Unicode(ref x)) => {
+ let cls = hir::Class::Unicode(self.hir_unicode_class(x)?);
+ self.push(HirFrame::Expr(Hir::class(cls)));
+ }
+ Ast::Class(ast::Class::Bracketed(ref ast)) => {
+ if self.flags().unicode() {
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ self.unicode_fold_and_negate(
+ &ast.span,
+ ast.negated,
+ &mut cls,
+ )?;
+ if cls.ranges().is_empty() {
+ return Err(self.error(
+ ast.span,
+ ErrorKind::EmptyClassNotAllowed,
+ ));
+ }
+ let expr = Hir::class(hir::Class::Unicode(cls));
+ self.push(HirFrame::Expr(expr));
+ } else {
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ self.bytes_fold_and_negate(
+ &ast.span,
+ ast.negated,
+ &mut cls,
+ )?;
+ if cls.ranges().is_empty() {
+ return Err(self.error(
+ ast.span,
+ ErrorKind::EmptyClassNotAllowed,
+ ));
+ }
+
+ let expr = Hir::class(hir::Class::Bytes(cls));
+ self.push(HirFrame::Expr(expr));
+ }
+ }
+ Ast::Repetition(ref x) => {
+ let expr = self.pop().unwrap().unwrap_expr();
+ self.push(HirFrame::Expr(self.hir_repetition(x, expr)));
+ }
+ Ast::Group(ref x) => {
+ let expr = self.pop().unwrap().unwrap_expr();
+ let old_flags = self.pop().unwrap().unwrap_group();
+ self.trans().flags.set(old_flags);
+ self.push(HirFrame::Expr(self.hir_group(x, expr)));
+ }
+ Ast::Concat(_) => {
+ let mut exprs = vec![];
+ while let Some(HirFrame::Expr(expr)) = self.pop() {
+ if !expr.kind().is_empty() {
+ exprs.push(expr);
+ }
+ }
+ exprs.reverse();
+ self.push(HirFrame::Expr(Hir::concat(exprs)));
+ }
+ Ast::Alternation(_) => {
+ let mut exprs = vec![];
+ while let Some(HirFrame::Expr(expr)) = self.pop() {
+ exprs.push(expr);
+ }
+ exprs.reverse();
+ self.push(HirFrame::Expr(Hir::alternation(exprs)));
+ }
+ }
+ Ok(())
+ }
+
+ fn visit_class_set_item_pre(
+ &mut self,
+ ast: &ast::ClassSetItem,
+ ) -> Result<()> {
+ match *ast {
+ ast::ClassSetItem::Bracketed(_) => {
+ if self.flags().unicode() {
+ let cls = hir::ClassUnicode::empty();
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let cls = hir::ClassBytes::empty();
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ // We needn't handle the Union case here since the visitor will
+ // do it for us.
+ _ => {}
+ }
+ Ok(())
+ }
+
+ fn visit_class_set_item_post(
+ &mut self,
+ ast: &ast::ClassSetItem,
+ ) -> Result<()> {
+ match *ast {
+ ast::ClassSetItem::Empty(_) => {}
+ ast::ClassSetItem::Literal(ref x) => {
+ if self.flags().unicode() {
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.push(hir::ClassUnicodeRange::new(x.c, x.c));
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ let byte = self.class_literal_byte(x)?;
+ cls.push(hir::ClassBytesRange::new(byte, byte));
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ ast::ClassSetItem::Range(ref x) => {
+ if self.flags().unicode() {
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c));
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ let start = self.class_literal_byte(&x.start)?;
+ let end = self.class_literal_byte(&x.end)?;
+ cls.push(hir::ClassBytesRange::new(start, end));
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ ast::ClassSetItem::Ascii(ref x) => {
+ if self.flags().unicode() {
+ let xcls = self.hir_ascii_unicode_class(x)?;
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let xcls = self.hir_ascii_byte_class(x)?;
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ ast::ClassSetItem::Unicode(ref x) => {
+ let xcls = self.hir_unicode_class(x)?;
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassUnicode(cls));
+ }
+ ast::ClassSetItem::Perl(ref x) => {
+ if self.flags().unicode() {
+ let xcls = self.hir_perl_unicode_class(x)?;
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let xcls = self.hir_perl_byte_class(x);
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ ast::ClassSetItem::Bracketed(ref ast) => {
+ if self.flags().unicode() {
+ let mut cls1 = self.pop().unwrap().unwrap_class_unicode();
+ self.unicode_fold_and_negate(
+ &ast.span,
+ ast.negated,
+ &mut cls1,
+ )?;
+
+ let mut cls2 = self.pop().unwrap().unwrap_class_unicode();
+ cls2.union(&cls1);
+ self.push(HirFrame::ClassUnicode(cls2));
+ } else {
+ let mut cls1 = self.pop().unwrap().unwrap_class_bytes();
+ self.bytes_fold_and_negate(
+ &ast.span,
+ ast.negated,
+ &mut cls1,
+ )?;
+
+ let mut cls2 = self.pop().unwrap().unwrap_class_bytes();
+ cls2.union(&cls1);
+ self.push(HirFrame::ClassBytes(cls2));
+ }
+ }
+ // This is handled automatically by the visitor.
+ ast::ClassSetItem::Union(_) => {}
+ }
+ Ok(())
+ }
+
+ fn visit_class_set_binary_op_pre(
+ &mut self,
+ _op: &ast::ClassSetBinaryOp,
+ ) -> Result<()> {
+ if self.flags().unicode() {
+ let cls = hir::ClassUnicode::empty();
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let cls = hir::ClassBytes::empty();
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ Ok(())
+ }
+
+ fn visit_class_set_binary_op_in(
+ &mut self,
+ _op: &ast::ClassSetBinaryOp,
+ ) -> Result<()> {
+ if self.flags().unicode() {
+ let cls = hir::ClassUnicode::empty();
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let cls = hir::ClassBytes::empty();
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ Ok(())
+ }
+
+ fn visit_class_set_binary_op_post(
+ &mut self,
+ op: &ast::ClassSetBinaryOp,
+ ) -> Result<()> {
+ use crate::ast::ClassSetBinaryOpKind::*;
+
+ if self.flags().unicode() {
+ let mut rhs = self.pop().unwrap().unwrap_class_unicode();
+ let mut lhs = self.pop().unwrap().unwrap_class_unicode();
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ if self.flags().case_insensitive() {
+ rhs.try_case_fold_simple().map_err(|_| {
+ self.error(
+ op.rhs.span().clone(),
+ ErrorKind::UnicodeCaseUnavailable,
+ )
+ })?;
+ lhs.try_case_fold_simple().map_err(|_| {
+ self.error(
+ op.lhs.span().clone(),
+ ErrorKind::UnicodeCaseUnavailable,
+ )
+ })?;
+ }
+ match op.kind {
+ Intersection => lhs.intersect(&rhs),
+ Difference => lhs.difference(&rhs),
+ SymmetricDifference => lhs.symmetric_difference(&rhs),
+ }
+ cls.union(&lhs);
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let mut rhs = self.pop().unwrap().unwrap_class_bytes();
+ let mut lhs = self.pop().unwrap().unwrap_class_bytes();
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ if self.flags().case_insensitive() {
+ rhs.case_fold_simple();
+ lhs.case_fold_simple();
+ }
+ match op.kind {
+ Intersection => lhs.intersect(&rhs),
+ Difference => lhs.difference(&rhs),
+ SymmetricDifference => lhs.symmetric_difference(&rhs),
+ }
+ cls.union(&lhs);
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ Ok(())
+ }
+}
+
+/// The internal implementation of a translator.
+///
+/// This type is responsible for carrying around the original pattern string,
+/// which is not tied to the internal state of a translator.
+///
+/// A TranslatorI exists for the time it takes to translate a single Ast.
+#[derive(Clone, Debug)]
+struct TranslatorI<'t, 'p> {
+ trans: &'t Translator,
+ pattern: &'p str,
+}
+
+impl<'t, 'p> TranslatorI<'t, 'p> {
+ /// Build a new internal translator.
+ fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> {
+ TranslatorI { trans, pattern }
+ }
+
+ /// Return a reference to the underlying translator.
+ fn trans(&self) -> &Translator {
+ &self.trans
+ }
+
+ /// Push the given frame on to the call stack.
+ fn push(&self, frame: HirFrame) {
+ self.trans().stack.borrow_mut().push(frame);
+ }
+
+ /// Pop the top of the call stack. If the call stack is empty, return None.
+ fn pop(&self) -> Option<HirFrame> {
+ self.trans().stack.borrow_mut().pop()
+ }
+
+ /// Create a new error with the given span and error type.
+ fn error(&self, span: Span, kind: ErrorKind) -> Error {
+ Error { kind, pattern: self.pattern.to_string(), span }
+ }
+
+ /// Return a copy of the active flags.
+ fn flags(&self) -> Flags {
+ self.trans().flags.get()
+ }
+
+ /// Set the flags of this translator from the flags set in the given AST.
+ /// Then, return the old flags.
+ fn set_flags(&self, ast_flags: &ast::Flags) -> Flags {
+ let old_flags = self.flags();
+ let mut new_flags = Flags::from_ast(ast_flags);
+ new_flags.merge(&old_flags);
+ self.trans().flags.set(new_flags);
+ old_flags
+ }
+
+ fn hir_literal(&self, lit: &ast::Literal) -> Result<Hir> {
+ let ch = match self.literal_to_char(lit)? {
+ byte @ hir::Literal::Byte(_) => return Ok(Hir::literal(byte)),
+ hir::Literal::Unicode(ch) => ch,
+ };
+ if self.flags().case_insensitive() {
+ self.hir_from_char_case_insensitive(lit.span, ch)
+ } else {
+ self.hir_from_char(lit.span, ch)
+ }
+ }
+
+ /// Convert an Ast literal to its scalar representation.
+ ///
+ /// When Unicode mode is enabled, then this always succeeds and returns a
+ /// `char` (Unicode scalar value).
+ ///
+ /// When Unicode mode is disabled, then a raw byte is returned. If that
+ /// byte is not ASCII and invalid UTF-8 is not allowed, then this returns
+ /// an error.
+ fn literal_to_char(&self, lit: &ast::Literal) -> Result<hir::Literal> {
+ if self.flags().unicode() {
+ return Ok(hir::Literal::Unicode(lit.c));
+ }
+ let byte = match lit.byte() {
+ None => return Ok(hir::Literal::Unicode(lit.c)),
+ Some(byte) => byte,
+ };
+ if byte <= 0x7F {
+ return Ok(hir::Literal::Unicode(byte as char));
+ }
+ if !self.trans().allow_invalid_utf8 {
+ return Err(self.error(lit.span, ErrorKind::InvalidUtf8));
+ }
+ Ok(hir::Literal::Byte(byte))
+ }
+
+ fn hir_from_char(&self, span: Span, c: char) -> Result<Hir> {
+ if !self.flags().unicode() && c.len_utf8() > 1 {
+ return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
+ }
+ Ok(Hir::literal(hir::Literal::Unicode(c)))
+ }
+
+ fn hir_from_char_case_insensitive(
+ &self,
+ span: Span,
+ c: char,
+ ) -> Result<Hir> {
+ if self.flags().unicode() {
+ // If case folding won't do anything, then don't bother trying.
+ let map =
+ unicode::contains_simple_case_mapping(c, c).map_err(|_| {
+ self.error(span, ErrorKind::UnicodeCaseUnavailable)
+ })?;
+ if !map {
+ return self.hir_from_char(span, c);
+ }
+ let mut cls =
+ hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new(
+ c, c,
+ )]);
+ cls.try_case_fold_simple().map_err(|_| {
+ self.error(span, ErrorKind::UnicodeCaseUnavailable)
+ })?;
+ Ok(Hir::class(hir::Class::Unicode(cls)))
+ } else {
+ if c.len_utf8() > 1 {
+ return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
+ }
+ // If case folding won't do anything, then don't bother trying.
+ match c {
+ 'A'..='Z' | 'a'..='z' => {}
+ _ => return self.hir_from_char(span, c),
+ }
+ let mut cls =
+ hir::ClassBytes::new(vec![hir::ClassBytesRange::new(
+ c as u8, c as u8,
+ )]);
+ cls.case_fold_simple();
+ Ok(Hir::class(hir::Class::Bytes(cls)))
+ }
+ }
+
+ fn hir_dot(&self, span: Span) -> Result<Hir> {
+ let unicode = self.flags().unicode();
+ if !unicode && !self.trans().allow_invalid_utf8 {
+ return Err(self.error(span, ErrorKind::InvalidUtf8));
+ }
+ Ok(if self.flags().dot_matches_new_line() {
+ Hir::any(!unicode)
+ } else {
+ Hir::dot(!unicode)
+ })
+ }
+
+ fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
+ let unicode = self.flags().unicode();
+ let multi_line = self.flags().multi_line();
+ Ok(match asst.kind {
+ ast::AssertionKind::StartLine => Hir::anchor(if multi_line {
+ hir::Anchor::StartLine
+ } else {
+ hir::Anchor::StartText
+ }),
+ ast::AssertionKind::EndLine => Hir::anchor(if multi_line {
+ hir::Anchor::EndLine
+ } else {
+ hir::Anchor::EndText
+ }),
+ ast::AssertionKind::StartText => {
+ Hir::anchor(hir::Anchor::StartText)
+ }
+ ast::AssertionKind::EndText => Hir::anchor(hir::Anchor::EndText),
+ ast::AssertionKind::WordBoundary => {
+ Hir::word_boundary(if unicode {
+ hir::WordBoundary::Unicode
+ } else {
+ hir::WordBoundary::Ascii
+ })
+ }
+ ast::AssertionKind::NotWordBoundary => {
+ Hir::word_boundary(if unicode {
+ hir::WordBoundary::UnicodeNegate
+ } else {
+ // It is possible for negated ASCII word boundaries to
+ // match at invalid UTF-8 boundaries, even when searching
+ // valid UTF-8.
+ if !self.trans().allow_invalid_utf8 {
+ return Err(
+ self.error(asst.span, ErrorKind::InvalidUtf8)
+ );
+ }
+ hir::WordBoundary::AsciiNegate
+ })
+ }
+ })
+ }
+
+ fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir {
+ let kind = match group.kind {
+ ast::GroupKind::CaptureIndex(idx) => {
+ hir::GroupKind::CaptureIndex(idx)
+ }
+ ast::GroupKind::CaptureName(ref capname) => {
+ hir::GroupKind::CaptureName {
+ name: capname.name.clone(),
+ index: capname.index,
+ }
+ }
+ ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing,
+ };
+ Hir::group(hir::Group { kind, hir: Box::new(expr) })
+ }
+
+ fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
+ let kind = match rep.op.kind {
+ ast::RepetitionKind::ZeroOrOne => hir::RepetitionKind::ZeroOrOne,
+ ast::RepetitionKind::ZeroOrMore => hir::RepetitionKind::ZeroOrMore,
+ ast::RepetitionKind::OneOrMore => hir::RepetitionKind::OneOrMore,
+ ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => {
+ hir::RepetitionKind::Range(hir::RepetitionRange::Exactly(m))
+ }
+ ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => {
+ hir::RepetitionKind::Range(hir::RepetitionRange::AtLeast(m))
+ }
+ ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(
+ m,
+ n,
+ )) => {
+ hir::RepetitionKind::Range(hir::RepetitionRange::Bounded(m, n))
+ }
+ };
+ let greedy =
+ if self.flags().swap_greed() { !rep.greedy } else { rep.greedy };
+ Hir::repetition(hir::Repetition { kind, greedy, hir: Box::new(expr) })
+ }
+
+ fn hir_unicode_class(
+ &self,
+ ast_class: &ast::ClassUnicode,
+ ) -> Result<hir::ClassUnicode> {
+ use crate::ast::ClassUnicodeKind::*;
+
+ if !self.flags().unicode() {
+ return Err(
+ self.error(ast_class.span, ErrorKind::UnicodeNotAllowed)
+ );
+ }
+ let query = match ast_class.kind {
+ OneLetter(name) => ClassQuery::OneLetter(name),
+ Named(ref name) => ClassQuery::Binary(name),
+ NamedValue { ref name, ref value, .. } => ClassQuery::ByValue {
+ property_name: name,
+ property_value: value,
+ },
+ };
+ let mut result = self.convert_unicode_class_error(
+ &ast_class.span,
+ unicode::class(query),
+ );
+ if let Ok(ref mut class) = result {
+ self.unicode_fold_and_negate(
+ &ast_class.span,
+ ast_class.negated,
+ class,
+ )?;
+ if class.ranges().is_empty() {
+ let err = self
+ .error(ast_class.span, ErrorKind::EmptyClassNotAllowed);
+ return Err(err);
+ }
+ }
+ result
+ }
+
+ fn hir_ascii_unicode_class(
+ &self,
+ ast: &ast::ClassAscii,
+ ) -> Result<hir::ClassUnicode> {
+ let mut cls = hir::ClassUnicode::new(
+ ascii_class(&ast.kind)
+ .iter()
+ .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)),
+ );
+ self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
+ Ok(cls)
+ }
+
+ fn hir_ascii_byte_class(
+ &self,
+ ast: &ast::ClassAscii,
+ ) -> Result<hir::ClassBytes> {
+ let mut cls = hir::ClassBytes::new(
+ ascii_class(&ast.kind)
+ .iter()
+ .map(|&(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)),
+ );
+ self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
+ Ok(cls)
+ }
+
+ fn hir_perl_unicode_class(
+ &self,
+ ast_class: &ast::ClassPerl,
+ ) -> Result<hir::ClassUnicode> {
+ use crate::ast::ClassPerlKind::*;
+
+ assert!(self.flags().unicode());
+ let result = match ast_class.kind {
+ Digit => unicode::perl_digit(),
+ Space => unicode::perl_space(),
+ Word => unicode::perl_word(),
+ };
+ let mut class =
+ self.convert_unicode_class_error(&ast_class.span, result)?;
+ // We needn't apply case folding here because the Perl Unicode classes
+ // are already closed under Unicode simple case folding.
+ if ast_class.negated {
+ class.negate();
+ }
+ Ok(class)
+ }
+
+ fn hir_perl_byte_class(
+ &self,
+ ast_class: &ast::ClassPerl,
+ ) -> hir::ClassBytes {
+ use crate::ast::ClassPerlKind::*;
+
+ assert!(!self.flags().unicode());
+ let mut class = match ast_class.kind {
+ Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit),
+ Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space),
+ Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word),
+ };
+ // We needn't apply case folding here because the Perl ASCII classes
+ // are already closed (under ASCII case folding).
+ if ast_class.negated {
+ class.negate();
+ }
+ class
+ }
+
+ /// Converts the given Unicode specific error to an HIR translation error.
+ ///
+ /// The span given should approximate the position at which an error would
+ /// occur.
+ fn convert_unicode_class_error(
+ &self,
+ span: &Span,
+ result: unicode::Result<hir::ClassUnicode>,
+ ) -> Result<hir::ClassUnicode> {
+ result.map_err(|err| {
+ let sp = span.clone();
+ match err {
+ unicode::Error::PropertyNotFound => {
+ self.error(sp, ErrorKind::UnicodePropertyNotFound)
+ }
+ unicode::Error::PropertyValueNotFound => {
+ self.error(sp, ErrorKind::UnicodePropertyValueNotFound)
+ }
+ unicode::Error::PerlClassNotFound => {
+ self.error(sp, ErrorKind::UnicodePerlClassNotFound)
+ }
+ }
+ })
+ }
+
+ fn unicode_fold_and_negate(
+ &self,
+ span: &Span,
+ negated: bool,
+ class: &mut hir::ClassUnicode,
+ ) -> Result<()> {
+ // Note that we must apply case folding before negation!
+ // Consider `(?i)[^x]`. If we applied negation field, then
+ // the result would be the character class that matched any
+ // Unicode scalar value.
+ if self.flags().case_insensitive() {
+ class.try_case_fold_simple().map_err(|_| {
+ self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable)
+ })?;
+ }
+ if negated {
+ class.negate();
+ }
+ Ok(())
+ }
+
+ fn bytes_fold_and_negate(
+ &self,
+ span: &Span,
+ negated: bool,
+ class: &mut hir::ClassBytes,
+ ) -> Result<()> {
+ // Note that we must apply case folding before negation!
+ // Consider `(?i)[^x]`. If we applied negation first, then
+ // the result would be the character class that matched any
+ // Unicode scalar value.
+ if self.flags().case_insensitive() {
+ class.case_fold_simple();
+ }
+ if negated {
+ class.negate();
+ }
+ if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() {
+ return Err(self.error(span.clone(), ErrorKind::InvalidUtf8));
+ }
+ Ok(())
+ }
+
+ /// Return a scalar byte value suitable for use as a literal in a byte
+ /// character class.
+ fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> {
+ match self.literal_to_char(ast)? {
+ hir::Literal::Byte(byte) => Ok(byte),
+ hir::Literal::Unicode(ch) => {
+ if ch <= 0x7F as char {
+ Ok(ch as u8)
+ } else {
+ // We can't feasibly support Unicode in
+ // byte oriented classes. Byte classes don't
+ // do Unicode case folding.
+ Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed))
+ }
+ }
+ }
+ }
+}
+
+/// A translator's representation of a regular expression's flags at any given
+/// moment in time.
+///
+/// Each flag can be in one of three states: absent, present but disabled or
+/// present but enabled.
+#[derive(Clone, Copy, Debug, Default)]
+struct Flags {
+ case_insensitive: Option<bool>,
+ multi_line: Option<bool>,
+ dot_matches_new_line: Option<bool>,
+ swap_greed: Option<bool>,
+ unicode: Option<bool>,
+ // Note that `ignore_whitespace` is omitted here because it is handled
+ // entirely in the parser.
+}
+
+impl Flags {
+ fn from_ast(ast: &ast::Flags) -> Flags {
+ let mut flags = Flags::default();
+ let mut enable = true;
+ for item in &ast.items {
+ match item.kind {
+ ast::FlagsItemKind::Negation => {
+ enable = false;
+ }
+ ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => {
+ flags.case_insensitive = Some(enable);
+ }
+ ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => {
+ flags.multi_line = Some(enable);
+ }
+ ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => {
+ flags.dot_matches_new_line = Some(enable);
+ }
+ ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => {
+ flags.swap_greed = Some(enable);
+ }
+ ast::FlagsItemKind::Flag(ast::Flag::Unicode) => {
+ flags.unicode = Some(enable);
+ }
+ ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {}
+ }
+ }
+ flags
+ }
+
+ fn merge(&mut self, previous: &Flags) {
+ if self.case_insensitive.is_none() {
+ self.case_insensitive = previous.case_insensitive;
+ }
+ if self.multi_line.is_none() {
+ self.multi_line = previous.multi_line;
+ }
+ if self.dot_matches_new_line.is_none() {
+ self.dot_matches_new_line = previous.dot_matches_new_line;
+ }
+ if self.swap_greed.is_none() {
+ self.swap_greed = previous.swap_greed;
+ }
+ if self.unicode.is_none() {
+ self.unicode = previous.unicode;
+ }
+ }
+
+ fn case_insensitive(&self) -> bool {
+ self.case_insensitive.unwrap_or(false)
+ }
+
+ fn multi_line(&self) -> bool {
+ self.multi_line.unwrap_or(false)
+ }
+
+ fn dot_matches_new_line(&self) -> bool {
+ self.dot_matches_new_line.unwrap_or(false)
+ }
+
+ fn swap_greed(&self) -> bool {
+ self.swap_greed.unwrap_or(false)
+ }
+
+ fn unicode(&self) -> bool {
+ self.unicode.unwrap_or(true)
+ }
+}
+
+fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
+ let ranges: Vec<_> = ascii_class(kind)
+ .iter()
+ .cloned()
+ .map(|(s, e)| hir::ClassBytesRange::new(s as u8, e as u8))
+ .collect();
+ hir::ClassBytes::new(ranges)
+}
+
+fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] {
+ use crate::ast::ClassAsciiKind::*;
+ match *kind {
+ Alnum => &[('0', '9'), ('A', 'Z'), ('a', 'z')],
+ Alpha => &[('A', 'Z'), ('a', 'z')],
+ Ascii => &[('\x00', '\x7F')],
+ Blank => &[('\t', '\t'), (' ', ' ')],
+ Cntrl => &[('\x00', '\x1F'), ('\x7F', '\x7F')],
+ Digit => &[('0', '9')],
+ Graph => &[('!', '~')],
+ Lower => &[('a', 'z')],
+ Print => &[(' ', '~')],
+ Punct => &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')],
+ Space => &[
+ ('\t', '\t'),
+ ('\n', '\n'),
+ ('\x0B', '\x0B'),
+ ('\x0C', '\x0C'),
+ ('\r', '\r'),
+ (' ', ' '),
+ ],
+ Upper => &[('A', 'Z')],
+ Word => &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')],
+ Xdigit => &[('0', '9'), ('A', 'F'), ('a', 'f')],
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::ast::parse::ParserBuilder;
+ use crate::ast::{self, Ast, Position, Span};
+ use crate::hir::{self, Hir, HirKind};
+ use crate::unicode::{self, ClassQuery};
+
+ use super::{ascii_class, TranslatorBuilder};
+
+ // We create these errors to compare with real hir::Errors in the tests.
+ // We define equality between TestError and hir::Error to disregard the
+ // pattern string in hir::Error, which is annoying to provide in tests.
+ #[derive(Clone, Debug)]
+ struct TestError {
+ span: Span,
+ kind: hir::ErrorKind,
+ }
+
+ impl PartialEq<hir::Error> for TestError {
+ fn eq(&self, other: &hir::Error) -> bool {
+ self.span == other.span && self.kind == other.kind
+ }
+ }
+
+ impl PartialEq<TestError> for hir::Error {
+ fn eq(&self, other: &TestError) -> bool {
+ self.span == other.span && self.kind == other.kind
+ }
+ }
+
+ fn parse(pattern: &str) -> Ast {
+ ParserBuilder::new().octal(true).build().parse(pattern).unwrap()
+ }
+
+ fn t(pattern: &str) -> Hir {
+ TranslatorBuilder::new()
+ .allow_invalid_utf8(false)
+ .build()
+ .translate(pattern, &parse(pattern))
+ .unwrap()
+ }
+
+ fn t_err(pattern: &str) -> hir::Error {
+ TranslatorBuilder::new()
+ .allow_invalid_utf8(false)
+ .build()
+ .translate(pattern, &parse(pattern))
+ .unwrap_err()
+ }
+
+ fn t_bytes(pattern: &str) -> Hir {
+ TranslatorBuilder::new()
+ .allow_invalid_utf8(true)
+ .build()
+ .translate(pattern, &parse(pattern))
+ .unwrap()
+ }
+
+ fn hir_lit(s: &str) -> Hir {
+ match s.len() {
+ 0 => Hir::empty(),
+ _ => {
+ let lits = s
+ .chars()
+ .map(hir::Literal::Unicode)
+ .map(Hir::literal)
+ .collect();
+ Hir::concat(lits)
+ }
+ }
+ }
+
+ fn hir_blit(s: &[u8]) -> Hir {
+ match s.len() {
+ 0 => Hir::empty(),
+ 1 => Hir::literal(hir::Literal::Byte(s[0])),
+ _ => {
+ let lits = s
+ .iter()
+ .cloned()
+ .map(hir::Literal::Byte)
+ .map(Hir::literal)
+ .collect();
+ Hir::concat(lits)
+ }
+ }
+ }
+
+ fn hir_group(i: u32, expr: Hir) -> Hir {
+ Hir::group(hir::Group {
+ kind: hir::GroupKind::CaptureIndex(i),
+ hir: Box::new(expr),
+ })
+ }
+
+ fn hir_group_name(i: u32, name: &str, expr: Hir) -> Hir {
+ Hir::group(hir::Group {
+ kind: hir::GroupKind::CaptureName {
+ name: name.to_string(),
+ index: i,
+ },
+ hir: Box::new(expr),
+ })
+ }
+
+ fn hir_group_nocap(expr: Hir) -> Hir {
+ Hir::group(hir::Group {
+ kind: hir::GroupKind::NonCapturing,
+ hir: Box::new(expr),
+ })
+ }
+
+ fn hir_quest(greedy: bool, expr: Hir) -> Hir {
+ Hir::repetition(hir::Repetition {
+ kind: hir::RepetitionKind::ZeroOrOne,
+ greedy,
+ hir: Box::new(expr),
+ })
+ }
+
+ fn hir_star(greedy: bool, expr: Hir) -> Hir {
+ Hir::repetition(hir::Repetition {
+ kind: hir::RepetitionKind::ZeroOrMore,
+ greedy,
+ hir: Box::new(expr),
+ })
+ }
+
+ fn hir_plus(greedy: bool, expr: Hir) -> Hir {
+ Hir::repetition(hir::Repetition {
+ kind: hir::RepetitionKind::OneOrMore,
+ greedy,
+ hir: Box::new(expr),
+ })
+ }
+
+ fn hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir {
+ Hir::repetition(hir::Repetition {
+ kind: hir::RepetitionKind::Range(range),
+ greedy,
+ hir: Box::new(expr),
+ })
+ }
+
+ fn hir_alt(alts: Vec<Hir>) -> Hir {
+ Hir::alternation(alts)
+ }
+
+ fn hir_cat(exprs: Vec<Hir>) -> Hir {
+ Hir::concat(exprs)
+ }
+
+ #[allow(dead_code)]
+ fn hir_uclass_query(query: ClassQuery<'_>) -> Hir {
+ Hir::class(hir::Class::Unicode(unicode::class(query).unwrap()))
+ }
+
+ #[allow(dead_code)]
+ fn hir_uclass_perl_word() -> Hir {
+ Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap()))
+ }
+
+ fn hir_uclass(ranges: &[(char, char)]) -> Hir {
+ let ranges: Vec<hir::ClassUnicodeRange> = ranges
+ .iter()
+ .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
+ .collect();
+ Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(ranges)))
+ }
+
+ fn hir_bclass(ranges: &[(u8, u8)]) -> Hir {
+ let ranges: Vec<hir::ClassBytesRange> = ranges
+ .iter()
+ .map(|&(s, e)| hir::ClassBytesRange::new(s, e))
+ .collect();
+ Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges)))
+ }
+
+ fn hir_bclass_from_char(ranges: &[(char, char)]) -> Hir {
+ let ranges: Vec<hir::ClassBytesRange> = ranges
+ .iter()
+ .map(|&(s, e)| {
+ assert!(s as u32 <= 0x7F);
+ assert!(e as u32 <= 0x7F);
+ hir::ClassBytesRange::new(s as u8, e as u8)
+ })
+ .collect();
+ Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges)))
+ }
+
+ fn hir_case_fold(expr: Hir) -> Hir {
+ match expr.into_kind() {
+ HirKind::Class(mut cls) => {
+ cls.case_fold_simple();
+ Hir::class(cls)
+ }
+ _ => panic!("cannot case fold non-class Hir expr"),
+ }
+ }
+
+ fn hir_negate(expr: Hir) -> Hir {
+ match expr.into_kind() {
+ HirKind::Class(mut cls) => {
+ cls.negate();
+ Hir::class(cls)
+ }
+ _ => panic!("cannot negate non-class Hir expr"),
+ }
+ }
+
+ #[allow(dead_code)]
+ fn hir_union(expr1: Hir, expr2: Hir) -> Hir {
+ use crate::hir::Class::{Bytes, Unicode};
+
+ match (expr1.into_kind(), expr2.into_kind()) {
+ (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
+ c1.union(&c2);
+ Hir::class(hir::Class::Unicode(c1))
+ }
+ (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
+ c1.union(&c2);
+ Hir::class(hir::Class::Bytes(c1))
+ }
+ _ => panic!("cannot union non-class Hir exprs"),
+ }
+ }
+
+ #[allow(dead_code)]
+ fn hir_difference(expr1: Hir, expr2: Hir) -> Hir {
+ use crate::hir::Class::{Bytes, Unicode};
+
+ match (expr1.into_kind(), expr2.into_kind()) {
+ (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
+ c1.difference(&c2);
+ Hir::class(hir::Class::Unicode(c1))
+ }
+ (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
+ c1.difference(&c2);
+ Hir::class(hir::Class::Bytes(c1))
+ }
+ _ => panic!("cannot difference non-class Hir exprs"),
+ }
+ }
+
+ fn hir_anchor(anchor: hir::Anchor) -> Hir {
+ Hir::anchor(anchor)
+ }
+
+ fn hir_word(wb: hir::WordBoundary) -> Hir {
+ Hir::word_boundary(wb)
+ }
+
+ #[test]
+ fn empty() {
+ assert_eq!(t(""), Hir::empty());
+ assert_eq!(t("(?i)"), Hir::empty());
+ assert_eq!(t("()"), hir_group(1, Hir::empty()));
+ assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty()));
+ assert_eq!(t("(?P<wat>)"), hir_group_name(1, "wat", Hir::empty()));
+ assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()]));
+ assert_eq!(
+ t("()|()"),
+ hir_alt(vec![
+ hir_group(1, Hir::empty()),
+ hir_group(2, Hir::empty()),
+ ])
+ );
+ assert_eq!(
+ t("(|b)"),
+ hir_group(1, hir_alt(vec![Hir::empty(), hir_lit("b"),]))
+ );
+ assert_eq!(
+ t("(a|)"),
+ hir_group(1, hir_alt(vec![hir_lit("a"), Hir::empty(),]))
+ );
+ assert_eq!(
+ t("(a||c)"),
+ hir_group(
+ 1,
+ hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),])
+ )
+ );
+ assert_eq!(
+ t("(||)"),
+ hir_group(
+ 1,
+ hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),])
+ )
+ );
+ }
+
+ #[test]
+ fn literal() {
+ assert_eq!(t("a"), hir_lit("a"));
+ assert_eq!(t("(?-u)a"), hir_lit("a"));
+ assert_eq!(t("☃"), hir_lit("☃"));
+ assert_eq!(t("abcd"), hir_lit("abcd"));
+
+ assert_eq!(t_bytes("(?-u)a"), hir_lit("a"));
+ assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a"));
+ assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a"));
+ assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));
+
+ assert_eq!(
+ t_err("(?-u)☃"),
+ TestError {
+ kind: hir::ErrorKind::UnicodeNotAllowed,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(8, 1, 7)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err(r"(?-u)\xFF"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(9, 1, 10)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ fn literal_case_insensitive() {
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),]));
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i:a)"),
+ hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')],))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("a(?i)a(?-i)a"),
+ hir_cat(vec![
+ hir_lit("a"),
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_lit("a"),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)ab@c"),
+ hir_cat(vec![
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_uclass(&[('B', 'B'), ('b', 'b')]),
+ hir_lit("@"),
+ hir_uclass(&[('C', 'C'), ('c', 'c')]),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)β"),
+ hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
+ );
+
+ assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]));
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?-u)a(?i)a(?-i)a"),
+ hir_cat(vec![
+ hir_lit("a"),
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
+ hir_lit("a"),
+ ])
+ );
+ assert_eq!(
+ t("(?i-u)ab@c"),
+ hir_cat(vec![
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
+ hir_bclass(&[(b'B', b'B'), (b'b', b'b')]),
+ hir_lit("@"),
+ hir_bclass(&[(b'C', b'C'), (b'c', b'c')]),
+ ])
+ );
+
+ assert_eq!(
+ t_bytes("(?i-u)a"),
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
+ );
+ assert_eq!(
+ t_bytes("(?i-u)\x61"),
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
+ );
+ assert_eq!(
+ t_bytes(r"(?i-u)\x61"),
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
+ );
+ assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));
+
+ assert_eq!(
+ t_err("(?i-u)β"),
+ TestError {
+ kind: hir::ErrorKind::UnicodeNotAllowed,
+ span: Span::new(
+ Position::new(6, 1, 7),
+ Position::new(8, 1, 8),
+ ),
+ }
+ );
+ }
+
+ #[test]
+ fn dot() {
+ assert_eq!(
+ t("."),
+ hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}'),])
+ );
+ assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}'),]));
+ assert_eq!(
+ t_bytes("(?-u)."),
+ hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF'),])
+ );
+ assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),]));
+
+ // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
+ assert_eq!(
+ t_err("(?-u)."),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(6, 1, 7)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err("(?s-u)."),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(6, 1, 7),
+ Position::new(7, 1, 8)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ fn assertions() {
+ assert_eq!(t("^"), hir_anchor(hir::Anchor::StartText));
+ assert_eq!(t("$"), hir_anchor(hir::Anchor::EndText));
+ assert_eq!(t(r"\A"), hir_anchor(hir::Anchor::StartText));
+ assert_eq!(t(r"\z"), hir_anchor(hir::Anchor::EndText));
+ assert_eq!(t("(?m)^"), hir_anchor(hir::Anchor::StartLine));
+ assert_eq!(t("(?m)$"), hir_anchor(hir::Anchor::EndLine));
+ assert_eq!(t(r"(?m)\A"), hir_anchor(hir::Anchor::StartText));
+ assert_eq!(t(r"(?m)\z"), hir_anchor(hir::Anchor::EndText));
+
+ assert_eq!(t(r"\b"), hir_word(hir::WordBoundary::Unicode));
+ assert_eq!(t(r"\B"), hir_word(hir::WordBoundary::UnicodeNegate));
+ assert_eq!(t(r"(?-u)\b"), hir_word(hir::WordBoundary::Ascii));
+ assert_eq!(
+ t_bytes(r"(?-u)\B"),
+ hir_word(hir::WordBoundary::AsciiNegate)
+ );
+
+ assert_eq!(
+ t_err(r"(?-u)\B"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(7, 1, 8)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ fn group() {
+ assert_eq!(t("(a)"), hir_group(1, hir_lit("a")));
+ assert_eq!(
+ t("(a)(b)"),
+ hir_cat(vec![
+ hir_group(1, hir_lit("a")),
+ hir_group(2, hir_lit("b")),
+ ])
+ );
+ assert_eq!(
+ t("(a)|(b)"),
+ hir_alt(vec![
+ hir_group(1, hir_lit("a")),
+ hir_group(2, hir_lit("b")),
+ ])
+ );
+ assert_eq!(t("(?P<foo>)"), hir_group_name(1, "foo", Hir::empty()));
+ assert_eq!(t("(?P<foo>a)"), hir_group_name(1, "foo", hir_lit("a")));
+ assert_eq!(
+ t("(?P<foo>a)(?P<bar>b)"),
+ hir_cat(vec![
+ hir_group_name(1, "foo", hir_lit("a")),
+ hir_group_name(2, "bar", hir_lit("b")),
+ ])
+ );
+ assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty()));
+ assert_eq!(t("(?:a)"), hir_group_nocap(hir_lit("a")));
+ assert_eq!(
+ t("(?:a)(b)"),
+ hir_cat(vec![
+ hir_group_nocap(hir_lit("a")),
+ hir_group(1, hir_lit("b")),
+ ])
+ );
+ assert_eq!(
+ t("(a)(?:b)(c)"),
+ hir_cat(vec![
+ hir_group(1, hir_lit("a")),
+ hir_group_nocap(hir_lit("b")),
+ hir_group(2, hir_lit("c")),
+ ])
+ );
+ assert_eq!(
+ t("(a)(?P<foo>b)(c)"),
+ hir_cat(vec![
+ hir_group(1, hir_lit("a")),
+ hir_group_name(2, "foo", hir_lit("b")),
+ hir_group(3, hir_lit("c")),
+ ])
+ );
+ assert_eq!(t("()"), hir_group(1, Hir::empty()));
+ assert_eq!(t("((?i))"), hir_group(1, Hir::empty()));
+ assert_eq!(t("((?x))"), hir_group(1, Hir::empty()));
+ assert_eq!(t("(((?x)))"), hir_group(1, hir_group(2, Hir::empty())));
+ }
+
+ #[test]
+ fn flags() {
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i:a)a"),
+ hir_cat(vec![
+ hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')])),
+ hir_lit("a"),
+ ])
+ );
+ assert_eq!(
+ t("(?i-u:a)β"),
+ hir_cat(vec![
+ hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
+ hir_lit("β"),
+ ])
+ );
+ assert_eq!(
+ t("(?:(?i-u)a)b"),
+ hir_cat(vec![
+ hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
+ hir_lit("b"),
+ ])
+ );
+ assert_eq!(
+ t("((?i-u)a)b"),
+ hir_cat(vec![
+ hir_group(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
+ hir_lit("b"),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)(?-i:a)a"),
+ hir_cat(vec![
+ hir_group_nocap(hir_lit("a")),
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?im)a^"),
+ hir_cat(vec![
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_anchor(hir::Anchor::StartLine),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?im)a^(?i-m)a^"),
+ hir_cat(vec![
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_anchor(hir::Anchor::StartLine),
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_anchor(hir::Anchor::StartText),
+ ])
+ );
+ assert_eq!(
+ t("(?U)a*a*?(?-U)a*a*?"),
+ hir_cat(vec![
+ hir_star(false, hir_lit("a")),
+ hir_star(true, hir_lit("a")),
+ hir_star(true, hir_lit("a")),
+ hir_star(false, hir_lit("a")),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?:a(?i)a)a"),
+ hir_cat(vec![
+ hir_group_nocap(hir_cat(vec![
+ hir_lit("a"),
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ ])),
+ hir_lit("a"),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)(?:a(?-i)a)a"),
+ hir_cat(vec![
+ hir_group_nocap(hir_cat(vec![
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_lit("a"),
+ ])),
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ ])
+ );
+ }
+
+ #[test]
+ fn escape() {
+ assert_eq!(
+ t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"),
+ hir_lit(r"\.+*?()|[]{}^$#")
+ );
+ }
+
+ #[test]
+ fn repetition() {
+ assert_eq!(t("a?"), hir_quest(true, hir_lit("a")));
+ assert_eq!(t("a*"), hir_star(true, hir_lit("a")));
+ assert_eq!(t("a+"), hir_plus(true, hir_lit("a")));
+ assert_eq!(t("a??"), hir_quest(false, hir_lit("a")));
+ assert_eq!(t("a*?"), hir_star(false, hir_lit("a")));
+ assert_eq!(t("a+?"), hir_plus(false, hir_lit("a")));
+
+ assert_eq!(
+ t("a{1}"),
+ hir_range(true, hir::RepetitionRange::Exactly(1), hir_lit("a"),)
+ );
+ assert_eq!(
+ t("a{1,}"),
+ hir_range(true, hir::RepetitionRange::AtLeast(1), hir_lit("a"),)
+ );
+ assert_eq!(
+ t("a{1,2}"),
+ hir_range(true, hir::RepetitionRange::Bounded(1, 2), hir_lit("a"),)
+ );
+ assert_eq!(
+ t("a{1}?"),
+ hir_range(false, hir::RepetitionRange::Exactly(1), hir_lit("a"),)
+ );
+ assert_eq!(
+ t("a{1,}?"),
+ hir_range(false, hir::RepetitionRange::AtLeast(1), hir_lit("a"),)
+ );
+ assert_eq!(
+ t("a{1,2}?"),
+ hir_range(
+ false,
+ hir::RepetitionRange::Bounded(1, 2),
+ hir_lit("a"),
+ )
+ );
+
+ assert_eq!(
+ t("ab?"),
+ hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
+ );
+ assert_eq!(
+ t("(ab)?"),
+ hir_quest(
+ true,
+ hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),]))
+ )
+ );
+ assert_eq!(
+ t("a|b?"),
+ hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
+ );
+ }
+
+ #[test]
+ fn cat_alt() {
+ assert_eq!(
+ t("(ab)"),
+ hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),]))
+ );
+ assert_eq!(t("a|b"), hir_alt(vec![hir_lit("a"), hir_lit("b"),]));
+ assert_eq!(
+ t("a|b|c"),
+ hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),])
+ );
+ assert_eq!(
+ t("ab|bc|cd"),
+ hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),])
+ );
+ assert_eq!(
+ t("(a|b)"),
+ hir_group(1, hir_alt(vec![hir_lit("a"), hir_lit("b"),]))
+ );
+ assert_eq!(
+ t("(a|b|c)"),
+ hir_group(
+ 1,
+ hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),])
+ )
+ );
+ assert_eq!(
+ t("(ab|bc|cd)"),
+ hir_group(
+ 1,
+ hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),])
+ )
+ );
+ assert_eq!(
+ t("(ab|(bc|(cd)))"),
+ hir_group(
+ 1,
+ hir_alt(vec![
+ hir_lit("ab"),
+ hir_group(
+ 2,
+ hir_alt(vec![
+ hir_lit("bc"),
+ hir_group(3, hir_lit("cd")),
+ ])
+ ),
+ ])
+ )
+ );
+ }
+
+ #[test]
+ fn class_ascii() {
+ assert_eq!(
+ t("[[:alnum:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum))
+ );
+ assert_eq!(
+ t("[[:alpha:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Alpha))
+ );
+ assert_eq!(
+ t("[[:ascii:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Ascii))
+ );
+ assert_eq!(
+ t("[[:blank:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Blank))
+ );
+ assert_eq!(
+ t("[[:cntrl:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Cntrl))
+ );
+ assert_eq!(
+ t("[[:digit:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Digit))
+ );
+ assert_eq!(
+ t("[[:graph:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Graph))
+ );
+ assert_eq!(
+ t("[[:lower:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower))
+ );
+ assert_eq!(
+ t("[[:print:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Print))
+ );
+ assert_eq!(
+ t("[[:punct:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Punct))
+ );
+ assert_eq!(
+ t("[[:space:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Space))
+ );
+ assert_eq!(
+ t("[[:upper:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Upper))
+ );
+ assert_eq!(
+ t("[[:word:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Word))
+ );
+ assert_eq!(
+ t("[[:xdigit:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Xdigit))
+ );
+
+ assert_eq!(
+ t("[[:^lower:]]"),
+ hir_negate(hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower)))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[[:lower:]]"),
+ hir_uclass(&[
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('\u{17F}', '\u{17F}'),
+ ('\u{212A}', '\u{212A}'),
+ ])
+ );
+
+ assert_eq!(
+ t("(?-u)[[:lower:]]"),
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Lower))
+ );
+ assert_eq!(
+ t("(?i-u)[[:lower:]]"),
+ hir_case_fold(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Lower
+ )))
+ );
+
+ assert_eq!(
+ t_err("(?-u)[[:^lower:]]"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(6, 1, 7),
+ Position::new(16, 1, 17)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err("(?i-u)[[:^lower:]]"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(7, 1, 8),
+ Position::new(17, 1, 18)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ fn class_ascii_multiple() {
+ // See: https://github.com/rust-lang/regex/issues/680
+ assert_eq!(
+ t("[[:alnum:][:^ascii:]]"),
+ hir_union(
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)),
+ hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
+ ),
+ );
+ assert_eq!(
+ t_bytes("(?-u)[[:alnum:][:^ascii:]]"),
+ hir_union(
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Alnum)),
+ hir_bclass(&[(0x80, 0xFF)]),
+ ),
+ );
+ }
+
+ #[test]
+ #[cfg(feature = "unicode-perl")]
+ fn class_perl() {
+ // Unicode
+ assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit")));
+ assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space")));
+ assert_eq!(t(r"\w"), hir_uclass_perl_word());
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\d"),
+ hir_uclass_query(ClassQuery::Binary("digit"))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\s"),
+ hir_uclass_query(ClassQuery::Binary("space"))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word());
+
+ // Unicode, negated
+ assert_eq!(
+ t(r"\D"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+ );
+ assert_eq!(
+ t(r"\S"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
+ );
+ assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word()));
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\D"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\S"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
+
+ // ASCII only
+ assert_eq!(
+ t(r"(?-u)\d"),
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))
+ );
+ assert_eq!(
+ t(r"(?-u)\s"),
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space))
+ );
+ assert_eq!(
+ t(r"(?-u)\w"),
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word))
+ );
+ assert_eq!(
+ t(r"(?i-u)\d"),
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))
+ );
+ assert_eq!(
+ t(r"(?i-u)\s"),
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space))
+ );
+ assert_eq!(
+ t(r"(?i-u)\w"),
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word))
+ );
+
+ // ASCII only, negated
+ assert_eq!(
+ t(r"(?-u)\D"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Digit
+ )))
+ );
+ assert_eq!(
+ t(r"(?-u)\S"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Space
+ )))
+ );
+ assert_eq!(
+ t(r"(?-u)\W"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Word
+ )))
+ );
+ assert_eq!(
+ t(r"(?i-u)\D"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Digit
+ )))
+ );
+ assert_eq!(
+ t(r"(?i-u)\S"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Space
+ )))
+ );
+ assert_eq!(
+ t(r"(?i-u)\W"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Word
+ )))
+ );
+ }
+
+ #[test]
+ #[cfg(not(feature = "unicode-perl"))]
+ fn class_perl_word_disabled() {
+ assert_eq!(
+ t_err(r"\w"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePerlClassNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(2, 1, 3)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ #[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))]
+ fn class_perl_space_disabled() {
+ assert_eq!(
+ t_err(r"\s"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePerlClassNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(2, 1, 3)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ #[cfg(all(
+ not(feature = "unicode-perl"),
+ not(feature = "unicode-gencat")
+ ))]
+ fn class_perl_digit_disabled() {
+ assert_eq!(
+ t_err(r"\d"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePerlClassNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(2, 1, 3)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ #[cfg(feature = "unicode-gencat")]
+ fn class_unicode_gencat() {
+ assert_eq!(t(r"\pZ"), hir_uclass_query(ClassQuery::Binary("Z")));
+ assert_eq!(t(r"\pz"), hir_uclass_query(ClassQuery::Binary("Z")));
+ assert_eq!(
+ t(r"\p{Separator}"),
+ hir_uclass_query(ClassQuery::Binary("Z"))
+ );
+ assert_eq!(
+ t(r"\p{se PaRa ToR}"),
+ hir_uclass_query(ClassQuery::Binary("Z"))
+ );
+ assert_eq!(
+ t(r"\p{gc:Separator}"),
+ hir_uclass_query(ClassQuery::Binary("Z"))
+ );
+ assert_eq!(
+ t(r"\p{gc=Separator}"),
+ hir_uclass_query(ClassQuery::Binary("Z"))
+ );
+ assert_eq!(
+ t(r"\p{Other}"),
+ hir_uclass_query(ClassQuery::Binary("Other"))
+ );
+ assert_eq!(t(r"\pC"), hir_uclass_query(ClassQuery::Binary("Other")));
+
+ assert_eq!(
+ t(r"\PZ"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
+ );
+ assert_eq!(
+ t(r"\P{separator}"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
+ );
+ assert_eq!(
+ t(r"\P{gc!=separator}"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
+ );
+
+ assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any")));
+ assert_eq!(
+ t(r"\p{assigned}"),
+ hir_uclass_query(ClassQuery::Binary("Assigned"))
+ );
+ assert_eq!(
+ t(r"\p{ascii}"),
+ hir_uclass_query(ClassQuery::Binary("ASCII"))
+ );
+ assert_eq!(
+ t(r"\p{gc:any}"),
+ hir_uclass_query(ClassQuery::Binary("Any"))
+ );
+ assert_eq!(
+ t(r"\p{gc:assigned}"),
+ hir_uclass_query(ClassQuery::Binary("Assigned"))
+ );
+ assert_eq!(
+ t(r"\p{gc:ascii}"),
+ hir_uclass_query(ClassQuery::Binary("ASCII"))
+ );
+
+ assert_eq!(
+ t_err(r"(?-u)\pZ"),
+ TestError {
+ kind: hir::ErrorKind::UnicodeNotAllowed,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(8, 1, 9)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err(r"(?-u)\p{Separator}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodeNotAllowed,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(18, 1, 19)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err(r"\pE"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(3, 1, 4)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err(r"\p{Foo}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(7, 1, 8)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err(r"\p{gc:Foo}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyValueNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(10, 1, 11)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ #[cfg(not(feature = "unicode-gencat"))]
+ fn class_unicode_gencat_disabled() {
+ assert_eq!(
+ t_err(r"\p{Separator}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(13, 1, 14)
+ ),
+ }
+ );
+
+ assert_eq!(
+ t_err(r"\p{Any}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(7, 1, 8)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ #[cfg(feature = "unicode-script")]
+ fn class_unicode_script() {
+ assert_eq!(
+ t(r"\p{Greek}"),
+ hir_uclass_query(ClassQuery::Binary("Greek"))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\p{Greek}"),
+ hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek")))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\P{Greek}"),
+ hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
+ "Greek"
+ ))))
+ );
+
+ assert_eq!(
+ t_err(r"\p{sc:Foo}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyValueNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(10, 1, 11)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err(r"\p{scx:Foo}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyValueNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(11, 1, 12)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ #[cfg(not(feature = "unicode-script"))]
+ fn class_unicode_script_disabled() {
+ assert_eq!(
+ t_err(r"\p{Greek}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(9, 1, 10)
+ ),
+ }
+ );
+
+ assert_eq!(
+ t_err(r"\p{scx:Greek}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(13, 1, 14)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ #[cfg(feature = "unicode-age")]
+ fn class_unicode_age() {
+ assert_eq!(
+ t_err(r"\p{age:Foo}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyValueNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(11, 1, 12)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ #[cfg(feature = "unicode-gencat")]
+ fn class_unicode_any_empty() {
+ assert_eq!(
+ t_err(r"\P{any}"),
+ TestError {
+ kind: hir::ErrorKind::EmptyClassNotAllowed,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(7, 1, 8)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ #[cfg(not(feature = "unicode-age"))]
+ fn class_unicode_age_disabled() {
+ assert_eq!(
+ t_err(r"\p{age:3.0}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(11, 1, 12)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ fn class_bracketed() {
+ assert_eq!(t("[a]"), hir_uclass(&[('a', 'a')]));
+ assert_eq!(t("[^[a]]"), hir_negate(hir_uclass(&[('a', 'a')])));
+ assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')]));
+ assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')]));
+ assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')]));
+ assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')]));
+ assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')]));
+ assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')]));
+ #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
+ assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit")));
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[\pZ]"),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ );
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[\p{separator}]"),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ );
+ #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
+ assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit")));
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[^\PZ]"),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ );
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[^\P{separator}]"),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ );
+ #[cfg(all(
+ feature = "unicode-case",
+ any(feature = "unicode-perl", feature = "unicode-gencat")
+ ))]
+ assert_eq!(
+ t(r"(?i)[^\D]"),
+ hir_uclass_query(ClassQuery::Binary("digit"))
+ );
+ #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
+ assert_eq!(
+ t(r"(?i)[^\P{greek}]"),
+ hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")))
+ );
+
+ assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')]));
+ assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')]));
+ assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')]));
+
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[k]"),
+ hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[β]"),
+ hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
+ );
+ assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),]));
+
+ assert_eq!(t("[^a]"), hir_negate(hir_uclass(&[('a', 'a')])));
+ assert_eq!(t(r"[^\x00]"), hir_negate(hir_uclass(&[('\0', '\0')])));
+ assert_eq!(
+ t_bytes("(?-u)[^a]"),
+ hir_negate(hir_bclass(&[(b'a', b'a')]))
+ );
+ #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
+ assert_eq!(
+ t(r"[^\d]"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+ );
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[^\pZ]"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
+ );
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[^\p{separator}]"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
+ );
+ #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
+ assert_eq!(
+ t(r"(?i)[^\p{greek}]"),
+ hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
+ "greek"
+ ))))
+ );
+ #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
+ assert_eq!(
+ t(r"(?i)[\P{greek}]"),
+ hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
+ "greek"
+ ))))
+ );
+
+ // Test some weird cases.
+ assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')]));
+
+ assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')]));
+ assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')]));
+ assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')]));
+ assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')]));
+ assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')]));
+
+ assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')]));
+ assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')]));
+ assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')]));
+ assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')]));
+ assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')]));
+
+ assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')]));
+ assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')]));
+ assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')]));
+ assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')]));
+ assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')]));
+
+ assert_eq!(
+ t_err("(?-u)[^a]"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(9, 1, 10)
+ ),
+ }
+ );
+ #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
+ assert_eq!(
+ t_err(r"[^\s\S]"),
+ TestError {
+ kind: hir::ErrorKind::EmptyClassNotAllowed,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(7, 1, 8)
+ ),
+ }
+ );
+ #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
+ assert_eq!(
+ t_err(r"(?-u)[^\s\S]"),
+ TestError {
+ kind: hir::ErrorKind::EmptyClassNotAllowed,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(12, 1, 13)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ fn class_bracketed_union() {
+ assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[a\pZb]"),
+ hir_union(
+ hir_uclass(&[('a', 'b')]),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ );
+ #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))]
+ assert_eq!(
+ t(r"[\pZ\p{Greek}]"),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ );
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"[\p{age:3.0}\pZ\p{Greek}]"),
+ hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ )
+ );
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"),
+ hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("cyrillic")),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ )
+ )
+ );
+
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-case",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"),
+ hir_case_fold(hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ ))
+ );
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"[^\p{age:3.0}\pZ\p{Greek}]"),
+ hir_negate(hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ ))
+ );
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-case",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"),
+ hir_negate(hir_case_fold(hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ )))
+ );
+ }
+
+ #[test]
+ fn class_bracketed_nested() {
+ assert_eq!(t(r"[a[^c]]"), hir_negate(hir_uclass(&[('c', 'c')])));
+ assert_eq!(t(r"[a-b[^c]]"), hir_negate(hir_uclass(&[('c', 'c')])));
+ assert_eq!(t(r"[a-c[^c]]"), hir_negate(hir_uclass(&[])));
+
+ assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')]));
+ assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')]));
+
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)[a[^c]]"),
+ hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')])))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)[a-b[^c]]"),
+ hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')])))
+ );
+
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')]));
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)[^a-b[^c]]"),
+ hir_uclass(&[('C', 'C'), ('c', 'c')])
+ );
+
+ assert_eq!(
+ t_err(r"[^a-c[^c]]"),
+ TestError {
+ kind: hir::ErrorKind::EmptyClassNotAllowed,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(10, 1, 11)
+ ),
+ }
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t_err(r"(?i)[^a-c[^c]]"),
+ TestError {
+ kind: hir::ErrorKind::EmptyClassNotAllowed,
+ span: Span::new(
+ Position::new(4, 1, 5),
+ Position::new(14, 1, 15)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ fn class_bracketed_intersect() {
+ assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')]));
+ assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')]));
+ assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')]));
+ assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')]));
+ assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')]));
+ assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')]));
+ assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')]));
+ assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')]));
+ assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
+
+ assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')]));
+ assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
+ assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
+ assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')]));
+ assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')]));
+ assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')]));
+
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[abc&&b-c]"),
+ hir_case_fold(hir_uclass(&[('b', 'c')]))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[abc&&[b-c]]"),
+ hir_case_fold(hir_uclass(&[('b', 'c')]))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[[abc]&&[b-c]]"),
+ hir_case_fold(hir_uclass(&[('b', 'c')]))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[a-z&&b-y&&c-x]"),
+ hir_case_fold(hir_uclass(&[('c', 'x')]))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[c-da-b&&a-d]"),
+ hir_case_fold(hir_uclass(&[('a', 'd')]))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[a-d&&c-da-b]"),
+ hir_case_fold(hir_uclass(&[('a', 'd')]))
+ );
+
+ assert_eq!(
+ t("(?i-u)[abc&&b-c]"),
+ hir_case_fold(hir_bclass(&[(b'b', b'c')]))
+ );
+ assert_eq!(
+ t("(?i-u)[abc&&[b-c]]"),
+ hir_case_fold(hir_bclass(&[(b'b', b'c')]))
+ );
+ assert_eq!(
+ t("(?i-u)[[abc]&&[b-c]]"),
+ hir_case_fold(hir_bclass(&[(b'b', b'c')]))
+ );
+ assert_eq!(
+ t("(?i-u)[a-z&&b-y&&c-x]"),
+ hir_case_fold(hir_bclass(&[(b'c', b'x')]))
+ );
+ assert_eq!(
+ t("(?i-u)[c-da-b&&a-d]"),
+ hir_case_fold(hir_bclass(&[(b'a', b'd')]))
+ );
+ assert_eq!(
+ t("(?i-u)[a-d&&c-da-b]"),
+ hir_case_fold(hir_bclass(&[(b'a', b'd')]))
+ );
+
+ // In `[a^]`, `^` does not need to be escaped, so it makes sense that
+ // `^` is also allowed to be unescaped after `&&`.
+ assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')]));
+ // `]` needs to be escaped after `&&` since it's not at start of class.
+ assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')]));
+ assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')]));
+ assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')]));
+ assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')]));
+ // Test precedence.
+ assert_eq!(
+ t(r"[a-w&&[^c-g]z]"),
+ hir_uclass(&[('a', 'b'), ('h', 'w')])
+ );
+ }
+
+ #[test]
+ fn class_bracketed_intersect_negate() {
+ #[cfg(feature = "unicode-perl")]
+ assert_eq!(
+ t(r"[^\w&&\d]"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+ );
+ assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
+ #[cfg(feature = "unicode-perl")]
+ assert_eq!(
+ t(r"[^[\w&&\d]]"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+ );
+ #[cfg(feature = "unicode-perl")]
+ assert_eq!(
+ t(r"[^[^\w&&\d]]"),
+ hir_uclass_query(ClassQuery::Binary("digit"))
+ );
+ #[cfg(feature = "unicode-perl")]
+ assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word()));
+
+ #[cfg(feature = "unicode-perl")]
+ assert_eq!(
+ t_bytes(r"(?-u)[^\w&&\d]"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Digit
+ )))
+ );
+ assert_eq!(
+ t_bytes(r"(?-u)[^[a-z&&a-c]]"),
+ hir_negate(hir_bclass(&[(b'a', b'c')]))
+ );
+ assert_eq!(
+ t_bytes(r"(?-u)[^[\w&&\d]]"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Digit
+ )))
+ );
+ assert_eq!(
+ t_bytes(r"(?-u)[^[^\w&&\d]]"),
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))
+ );
+ assert_eq!(
+ t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Word
+ )))
+ );
+ }
+
+ #[test]
+ fn class_bracketed_difference() {
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[\pL--[:ascii:]]"),
+ hir_difference(
+ hir_uclass_query(ClassQuery::Binary("letter")),
+ hir_uclass(&[('\0', '\x7F')])
+ )
+ );
+
+ assert_eq!(
+ t(r"(?-u)[[:alpha:]--[:lower:]]"),
+ hir_bclass(&[(b'A', b'Z')])
+ );
+ }
+
+ #[test]
+ fn class_bracketed_symmetric_difference() {
+ #[cfg(feature = "unicode-script")]
+ assert_eq!(
+ t(r"[\p{sc:Greek}~~\p{scx:Greek}]"),
+ hir_uclass(&[
+ ('\u{0342}', '\u{0342}'),
+ ('\u{0345}', '\u{0345}'),
+ ('\u{1DC0}', '\u{1DC1}'),
+ ])
+ );
+ assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')]));
+
+ assert_eq!(
+ t(r"(?-u)[a-g~~c-j]"),
+ hir_bclass(&[(b'a', b'b'), (b'h', b'j')])
+ );
+ }
+
+ #[test]
+ fn ignore_whitespace() {
+ assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3"));
+ assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S"));
+ assert_eq!(
+ t(r"(?x)\x # comment
+{ # comment
+ 53 # comment
+} #comment"),
+ hir_lit("S")
+ );
+
+ assert_eq!(t(r"(?x)\x 53"), hir_lit("S"));
+ assert_eq!(
+ t(r"(?x)\x # comment
+ 53 # comment"),
+ hir_lit("S")
+ );
+ assert_eq!(t(r"(?x)\x5 3"), hir_lit("S"));
+
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"(?x)\p # comment
+{ # comment
+ Separator # comment
+} # comment"),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ );
+
+ assert_eq!(
+ t(r"(?x)a # comment
+{ # comment
+ 5 # comment
+ , # comment
+ 10 # comment
+} # comment"),
+ hir_range(
+ true,
+ hir::RepetitionRange::Bounded(5, 10),
+ hir_lit("a")
+ )
+ );
+
+ assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a "));
+ }
+
+ #[test]
+ fn analysis_is_always_utf8() {
+ // Positive examples.
+ assert!(t_bytes(r"a").is_always_utf8());
+ assert!(t_bytes(r"ab").is_always_utf8());
+ assert!(t_bytes(r"(?-u)a").is_always_utf8());
+ assert!(t_bytes(r"(?-u)ab").is_always_utf8());
+ assert!(t_bytes(r"\xFF").is_always_utf8());
+ assert!(t_bytes(r"\xFF\xFF").is_always_utf8());
+ assert!(t_bytes(r"[^a]").is_always_utf8());
+ assert!(t_bytes(r"[^a][^a]").is_always_utf8());
+ assert!(t_bytes(r"\b").is_always_utf8());
+ assert!(t_bytes(r"\B").is_always_utf8());
+ assert!(t_bytes(r"(?-u)\b").is_always_utf8());
+
+ // Negative examples.
+ assert!(!t_bytes(r"(?-u)\xFF").is_always_utf8());
+ assert!(!t_bytes(r"(?-u)\xFF\xFF").is_always_utf8());
+ assert!(!t_bytes(r"(?-u)[^a]").is_always_utf8());
+ assert!(!t_bytes(r"(?-u)[^a][^a]").is_always_utf8());
+ assert!(!t_bytes(r"(?-u)\B").is_always_utf8());
+ }
+
+ #[test]
+ fn analysis_is_all_assertions() {
+ // Positive examples.
+ assert!(t(r"\b").is_all_assertions());
+ assert!(t(r"\B").is_all_assertions());
+ assert!(t(r"^").is_all_assertions());
+ assert!(t(r"$").is_all_assertions());
+ assert!(t(r"\A").is_all_assertions());
+ assert!(t(r"\z").is_all_assertions());
+ assert!(t(r"$^\z\A\b\B").is_all_assertions());
+ assert!(t(r"$|^|\z|\A|\b|\B").is_all_assertions());
+ assert!(t(r"^$|$^").is_all_assertions());
+ assert!(t(r"((\b)+())*^").is_all_assertions());
+
+ // Negative examples.
+ assert!(!t(r"^a").is_all_assertions());
+ }
+
+ #[test]
+ fn analysis_is_anchored() {
+ // Positive examples.
+ assert!(t(r"^").is_anchored_start());
+ assert!(t(r"$").is_anchored_end());
+ assert!(t(r"^").is_line_anchored_start());
+ assert!(t(r"$").is_line_anchored_end());
+
+ assert!(t(r"^^").is_anchored_start());
+ assert!(t(r"$$").is_anchored_end());
+ assert!(t(r"^^").is_line_anchored_start());
+ assert!(t(r"$$").is_line_anchored_end());
+
+ assert!(t(r"^$").is_anchored_start());
+ assert!(t(r"^$").is_anchored_end());
+ assert!(t(r"^$").is_line_anchored_start());
+ assert!(t(r"^$").is_line_anchored_end());
+
+ assert!(t(r"^foo").is_anchored_start());
+ assert!(t(r"foo$").is_anchored_end());
+ assert!(t(r"^foo").is_line_anchored_start());
+ assert!(t(r"foo$").is_line_anchored_end());
+
+ assert!(t(r"^foo|^bar").is_anchored_start());
+ assert!(t(r"foo$|bar$").is_anchored_end());
+ assert!(t(r"^foo|^bar").is_line_anchored_start());
+ assert!(t(r"foo$|bar$").is_line_anchored_end());
+
+ assert!(t(r"^(foo|bar)").is_anchored_start());
+ assert!(t(r"(foo|bar)$").is_anchored_end());
+ assert!(t(r"^(foo|bar)").is_line_anchored_start());
+ assert!(t(r"(foo|bar)$").is_line_anchored_end());
+
+ assert!(t(r"^+").is_anchored_start());
+ assert!(t(r"$+").is_anchored_end());
+ assert!(t(r"^+").is_line_anchored_start());
+ assert!(t(r"$+").is_line_anchored_end());
+ assert!(t(r"^++").is_anchored_start());
+ assert!(t(r"$++").is_anchored_end());
+ assert!(t(r"^++").is_line_anchored_start());
+ assert!(t(r"$++").is_line_anchored_end());
+ assert!(t(r"(^)+").is_anchored_start());
+ assert!(t(r"($)+").is_anchored_end());
+ assert!(t(r"(^)+").is_line_anchored_start());
+ assert!(t(r"($)+").is_line_anchored_end());
+
+ assert!(t(r"$^").is_anchored_start());
+ assert!(t(r"$^").is_anchored_start());
+ assert!(t(r"$^").is_line_anchored_end());
+ assert!(t(r"$^").is_line_anchored_end());
+ assert!(t(r"$^|^$").is_anchored_start());
+ assert!(t(r"$^|^$").is_anchored_end());
+ assert!(t(r"$^|^$").is_line_anchored_start());
+ assert!(t(r"$^|^$").is_line_anchored_end());
+
+ assert!(t(r"\b^").is_anchored_start());
+ assert!(t(r"$\b").is_anchored_end());
+ assert!(t(r"\b^").is_line_anchored_start());
+ assert!(t(r"$\b").is_line_anchored_end());
+ assert!(t(r"^(?m:^)").is_anchored_start());
+ assert!(t(r"(?m:$)$").is_anchored_end());
+ assert!(t(r"^(?m:^)").is_line_anchored_start());
+ assert!(t(r"(?m:$)$").is_line_anchored_end());
+ assert!(t(r"(?m:^)^").is_anchored_start());
+ assert!(t(r"$(?m:$)").is_anchored_end());
+ assert!(t(r"(?m:^)^").is_line_anchored_start());
+ assert!(t(r"$(?m:$)").is_line_anchored_end());
+
+ // Negative examples.
+ assert!(!t(r"(?m)^").is_anchored_start());
+ assert!(!t(r"(?m)$").is_anchored_end());
+ assert!(!t(r"(?m:^$)|$^").is_anchored_start());
+ assert!(!t(r"(?m:^$)|$^").is_anchored_end());
+ assert!(!t(r"$^|(?m:^$)").is_anchored_start());
+ assert!(!t(r"$^|(?m:^$)").is_anchored_end());
+
+ assert!(!t(r"a^").is_anchored_start());
+ assert!(!t(r"$a").is_anchored_start());
+ assert!(!t(r"a^").is_line_anchored_start());
+ assert!(!t(r"$a").is_line_anchored_start());
+
+ assert!(!t(r"a^").is_anchored_end());
+ assert!(!t(r"$a").is_anchored_end());
+ assert!(!t(r"a^").is_line_anchored_end());
+ assert!(!t(r"$a").is_line_anchored_end());
+
+ assert!(!t(r"^foo|bar").is_anchored_start());
+ assert!(!t(r"foo|bar$").is_anchored_end());
+ assert!(!t(r"^foo|bar").is_line_anchored_start());
+ assert!(!t(r"foo|bar$").is_line_anchored_end());
+
+ assert!(!t(r"^*").is_anchored_start());
+ assert!(!t(r"$*").is_anchored_end());
+ assert!(!t(r"^*").is_line_anchored_start());
+ assert!(!t(r"$*").is_line_anchored_end());
+ assert!(!t(r"^*+").is_anchored_start());
+ assert!(!t(r"$*+").is_anchored_end());
+ assert!(!t(r"^*+").is_line_anchored_start());
+ assert!(!t(r"$*+").is_line_anchored_end());
+ assert!(!t(r"^+*").is_anchored_start());
+ assert!(!t(r"$+*").is_anchored_end());
+ assert!(!t(r"^+*").is_line_anchored_start());
+ assert!(!t(r"$+*").is_line_anchored_end());
+ assert!(!t(r"(^)*").is_anchored_start());
+ assert!(!t(r"($)*").is_anchored_end());
+ assert!(!t(r"(^)*").is_line_anchored_start());
+ assert!(!t(r"($)*").is_line_anchored_end());
+ }
+
+ #[test]
+ fn analysis_is_line_anchored() {
+ assert!(t(r"(?m)^(foo|bar)").is_line_anchored_start());
+ assert!(t(r"(?m)(foo|bar)$").is_line_anchored_end());
+
+ assert!(t(r"(?m)^foo|^bar").is_line_anchored_start());
+ assert!(t(r"(?m)foo$|bar$").is_line_anchored_end());
+
+ assert!(t(r"(?m)^").is_line_anchored_start());
+ assert!(t(r"(?m)$").is_line_anchored_end());
+
+ assert!(t(r"(?m:^$)|$^").is_line_anchored_start());
+ assert!(t(r"(?m:^$)|$^").is_line_anchored_end());
+
+ assert!(t(r"$^|(?m:^$)").is_line_anchored_start());
+ assert!(t(r"$^|(?m:^$)").is_line_anchored_end());
+ }
+
+ #[test]
+ fn analysis_is_any_anchored() {
+ // Positive examples.
+ assert!(t(r"^").is_any_anchored_start());
+ assert!(t(r"$").is_any_anchored_end());
+ assert!(t(r"\A").is_any_anchored_start());
+ assert!(t(r"\z").is_any_anchored_end());
+
+ // Negative examples.
+ assert!(!t(r"(?m)^").is_any_anchored_start());
+ assert!(!t(r"(?m)$").is_any_anchored_end());
+ assert!(!t(r"$").is_any_anchored_start());
+ assert!(!t(r"^").is_any_anchored_end());
+ }
+
+ #[test]
+ fn analysis_is_match_empty() {
+ // Positive examples.
+ assert!(t(r"").is_match_empty());
+ assert!(t(r"()").is_match_empty());
+ assert!(t(r"()*").is_match_empty());
+ assert!(t(r"()+").is_match_empty());
+ assert!(t(r"()?").is_match_empty());
+ assert!(t(r"a*").is_match_empty());
+ assert!(t(r"a?").is_match_empty());
+ assert!(t(r"a{0}").is_match_empty());
+ assert!(t(r"a{0,}").is_match_empty());
+ assert!(t(r"a{0,1}").is_match_empty());
+ assert!(t(r"a{0,10}").is_match_empty());
+ #[cfg(feature = "unicode-gencat")]
+ assert!(t(r"\pL*").is_match_empty());
+ assert!(t(r"a*|b").is_match_empty());
+ assert!(t(r"b|a*").is_match_empty());
+ assert!(t(r"a|").is_match_empty());
+ assert!(t(r"|a").is_match_empty());
+ assert!(t(r"a||b").is_match_empty());
+ assert!(t(r"a*a?(abcd)*").is_match_empty());
+ assert!(t(r"^").is_match_empty());
+ assert!(t(r"$").is_match_empty());
+ assert!(t(r"(?m)^").is_match_empty());
+ assert!(t(r"(?m)$").is_match_empty());
+ assert!(t(r"\A").is_match_empty());
+ assert!(t(r"\z").is_match_empty());
+ assert!(t(r"\B").is_match_empty());
+ assert!(t_bytes(r"(?-u)\B").is_match_empty());
+ assert!(t(r"\b").is_match_empty());
+ assert!(t(r"(?-u)\b").is_match_empty());
+
+ // Negative examples.
+ assert!(!t(r"a+").is_match_empty());
+ assert!(!t(r"a{1}").is_match_empty());
+ assert!(!t(r"a{1,}").is_match_empty());
+ assert!(!t(r"a{1,2}").is_match_empty());
+ assert!(!t(r"a{1,10}").is_match_empty());
+ assert!(!t(r"b|a").is_match_empty());
+ assert!(!t(r"a*a+(abcd)*").is_match_empty());
+ }
+
+ #[test]
+ fn analysis_is_literal() {
+ // Positive examples.
+ assert!(t(r"a").is_literal());
+ assert!(t(r"ab").is_literal());
+ assert!(t(r"abc").is_literal());
+ assert!(t(r"(?m)abc").is_literal());
+
+ // Negative examples.
+ assert!(!t(r"").is_literal());
+ assert!(!t(r"^").is_literal());
+ assert!(!t(r"a|b").is_literal());
+ assert!(!t(r"(a)").is_literal());
+ assert!(!t(r"a+").is_literal());
+ assert!(!t(r"foo(a)").is_literal());
+ assert!(!t(r"(a)foo").is_literal());
+ assert!(!t(r"[a]").is_literal());
+ }
+
+ #[test]
+ fn analysis_is_alternation_literal() {
+ // Positive examples.
+ assert!(t(r"a").is_alternation_literal());
+ assert!(t(r"ab").is_alternation_literal());
+ assert!(t(r"abc").is_alternation_literal());
+ assert!(t(r"(?m)abc").is_alternation_literal());
+ assert!(t(r"a|b").is_alternation_literal());
+ assert!(t(r"a|b|c").is_alternation_literal());
+ assert!(t(r"foo|bar").is_alternation_literal());
+ assert!(t(r"foo|bar|baz").is_alternation_literal());
+
+ // Negative examples.
+ assert!(!t(r"").is_alternation_literal());
+ assert!(!t(r"^").is_alternation_literal());
+ assert!(!t(r"(a)").is_alternation_literal());
+ assert!(!t(r"a+").is_alternation_literal());
+ assert!(!t(r"foo(a)").is_alternation_literal());
+ assert!(!t(r"(a)foo").is_alternation_literal());
+ assert!(!t(r"[a]").is_alternation_literal());
+ assert!(!t(r"[a]|b").is_alternation_literal());
+ assert!(!t(r"a|[b]").is_alternation_literal());
+ assert!(!t(r"(a)|b").is_alternation_literal());
+ assert!(!t(r"a|(b)").is_alternation_literal());
+ }
+}
diff --git a/third_party/rust/regex-syntax/src/hir/visitor.rs b/third_party/rust/regex-syntax/src/hir/visitor.rs
new file mode 100644
index 0000000000..4f5a70909c
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/hir/visitor.rs
@@ -0,0 +1,203 @@
+use crate::hir::{self, Hir, HirKind};
+
+/// A trait for visiting the high-level IR (HIR) in depth first order.
+///
+/// The principle aim of this trait is to enable callers to perform case
+/// analysis on a high-level intermediate representation of a regular
+/// expression without necessarily using recursion. In particular, this permits
+/// callers to do case analysis with constant stack usage, which can be
+/// important since the size of an HIR may be proportional to end user input.
+///
+/// Typical usage of this trait involves providing an implementation and then
+/// running it using the [`visit`](fn.visit.html) function.
+pub trait Visitor {
+ /// The result of visiting an HIR.
+ type Output;
+ /// An error that visiting an HIR might return.
+ type Err;
+
+ /// All implementors of `Visitor` must provide a `finish` method, which
+ /// yields the result of visiting the HIR or an error.
+ fn finish(self) -> Result<Self::Output, Self::Err>;
+
+ /// This method is called before beginning traversal of the HIR.
+ fn start(&mut self) {}
+
+ /// This method is called on an `Hir` before descending into child `Hir`
+ /// nodes.
+ fn visit_pre(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
+ Ok(())
+ }
+
+ /// This method is called on an `Hir` after descending all of its child
+ /// `Hir` nodes.
+ fn visit_post(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
+ Ok(())
+ }
+
+ /// This method is called between child nodes of an alternation.
+ fn visit_alternation_in(&mut self) -> Result<(), Self::Err> {
+ Ok(())
+ }
+}
+
+/// Executes an implementation of `Visitor` in constant stack space.
+///
+/// This function will visit every node in the given `Hir` while calling
+/// appropriate methods provided by the
+/// [`Visitor`](trait.Visitor.html) trait.
+///
+/// The primary use case for this method is when one wants to perform case
+/// analysis over an `Hir` without using a stack size proportional to the depth
+/// of the `Hir`. Namely, this method will instead use constant stack space,
+/// but will use heap space proportional to the size of the `Hir`. This may be
+/// desirable in cases where the size of `Hir` is proportional to end user
+/// input.
+///
+/// If the visitor returns an error at any point, then visiting is stopped and
+/// the error is returned.
+pub fn visit<V: Visitor>(hir: &Hir, visitor: V) -> Result<V::Output, V::Err> {
+ HeapVisitor::new().visit(hir, visitor)
+}
+
+/// HeapVisitor visits every item in an `Hir` recursively using constant stack
+/// size and a heap size proportional to the size of the `Hir`.
+struct HeapVisitor<'a> {
+ /// A stack of `Hir` nodes. This is roughly analogous to the call stack
+ /// used in a typical recursive visitor.
+ stack: Vec<(&'a Hir, Frame<'a>)>,
+}
+
+/// Represents a single stack frame while performing structural induction over
+/// an `Hir`.
+enum Frame<'a> {
+ /// A stack frame allocated just before descending into a repetition
+ /// operator's child node.
+ Repetition(&'a hir::Repetition),
+ /// A stack frame allocated just before descending into a group's child
+ /// node.
+ Group(&'a hir::Group),
+ /// The stack frame used while visiting every child node of a concatenation
+ /// of expressions.
+ Concat {
+ /// The child node we are currently visiting.
+ head: &'a Hir,
+ /// The remaining child nodes to visit (which may be empty).
+ tail: &'a [Hir],
+ },
+ /// The stack frame used while visiting every child node of an alternation
+ /// of expressions.
+ Alternation {
+ /// The child node we are currently visiting.
+ head: &'a Hir,
+ /// The remaining child nodes to visit (which may be empty).
+ tail: &'a [Hir],
+ },
+}
+
+impl<'a> HeapVisitor<'a> {
+ fn new() -> HeapVisitor<'a> {
+ HeapVisitor { stack: vec![] }
+ }
+
+ fn visit<V: Visitor>(
+ &mut self,
+ mut hir: &'a Hir,
+ mut visitor: V,
+ ) -> Result<V::Output, V::Err> {
+ self.stack.clear();
+
+ visitor.start();
+ loop {
+ visitor.visit_pre(hir)?;
+ if let Some(x) = self.induct(hir) {
+ let child = x.child();
+ self.stack.push((hir, x));
+ hir = child;
+ continue;
+ }
+ // No induction means we have a base case, so we can post visit
+ // it now.
+ visitor.visit_post(hir)?;
+
+ // At this point, we now try to pop our call stack until it is
+ // either empty or we hit another inductive case.
+ loop {
+ let (post_hir, frame) = match self.stack.pop() {
+ None => return visitor.finish(),
+ Some((post_hir, frame)) => (post_hir, frame),
+ };
+ // If this is a concat/alternate, then we might have additional
+ // inductive steps to process.
+ if let Some(x) = self.pop(frame) {
+ if let Frame::Alternation { .. } = x {
+ visitor.visit_alternation_in()?;
+ }
+ hir = x.child();
+ self.stack.push((post_hir, x));
+ break;
+ }
+ // Otherwise, we've finished visiting all the child nodes for
+ // this HIR, so we can post visit it now.
+ visitor.visit_post(post_hir)?;
+ }
+ }
+ }
+
+ /// Build a stack frame for the given HIR if one is needed (which occurs if
+ /// and only if there are child nodes in the HIR). Otherwise, return None.
+ fn induct(&mut self, hir: &'a Hir) -> Option<Frame<'a>> {
+ match *hir.kind() {
+ HirKind::Repetition(ref x) => Some(Frame::Repetition(x)),
+ HirKind::Group(ref x) => Some(Frame::Group(x)),
+ HirKind::Concat(ref x) if x.is_empty() => None,
+ HirKind::Concat(ref x) => {
+ Some(Frame::Concat { head: &x[0], tail: &x[1..] })
+ }
+ HirKind::Alternation(ref x) if x.is_empty() => None,
+ HirKind::Alternation(ref x) => {
+ Some(Frame::Alternation { head: &x[0], tail: &x[1..] })
+ }
+ _ => None,
+ }
+ }
+
+ /// Pops the given frame. If the frame has an additional inductive step,
+ /// then return it, otherwise return `None`.
+ fn pop(&self, induct: Frame<'a>) -> Option<Frame<'a>> {
+ match induct {
+ Frame::Repetition(_) => None,
+ Frame::Group(_) => None,
+ Frame::Concat { tail, .. } => {
+ if tail.is_empty() {
+ None
+ } else {
+ Some(Frame::Concat { head: &tail[0], tail: &tail[1..] })
+ }
+ }
+ Frame::Alternation { tail, .. } => {
+ if tail.is_empty() {
+ None
+ } else {
+ Some(Frame::Alternation {
+ head: &tail[0],
+ tail: &tail[1..],
+ })
+ }
+ }
+ }
+ }
+}
+
+impl<'a> Frame<'a> {
+ /// Perform the next inductive step on this frame and return the next
+ /// child HIR node to visit.
+ fn child(&self) -> &'a Hir {
+ match *self {
+ Frame::Repetition(rep) => &rep.hir,
+ Frame::Group(group) => &group.hir,
+ Frame::Concat { head, .. } => head,
+ Frame::Alternation { head, .. } => head,
+ }
+ }
+}