summaryrefslogtreecommitdiffstats
path: root/vendor/regex-syntax/src/hir
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 02:49:50 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 02:49:50 +0000
commit9835e2ae736235810b4ea1c162ca5e65c547e770 (patch)
tree3fcebf40ed70e581d776a8a4c65923e8ec20e026 /vendor/regex-syntax/src/hir
parentReleasing progress-linux version 1.70.0+dfsg2-1~progress7.99u1. (diff)
downloadrustc-9835e2ae736235810b4ea1c162ca5e65c547e770.tar.xz
rustc-9835e2ae736235810b4ea1c162ca5e65c547e770.zip
Merging upstream version 1.71.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-syntax/src/hir')
-rw-r--r--vendor/regex-syntax/src/hir/interval.rs83
-rw-r--r--vendor/regex-syntax/src/hir/literal.rs3165
-rw-r--r--vendor/regex-syntax/src/hir/literal/mod.rs1686
-rw-r--r--vendor/regex-syntax/src/hir/mod.rs3021
-rw-r--r--vendor/regex-syntax/src/hir/print.rs394
-rw-r--r--vendor/regex-syntax/src/hir/translate.rs1920
-rw-r--r--vendor/regex-syntax/src/hir/visitor.rs19
7 files changed, 6925 insertions, 3363 deletions
diff --git a/vendor/regex-syntax/src/hir/interval.rs b/vendor/regex-syntax/src/hir/interval.rs
index 56698c53a..e063390a8 100644
--- a/vendor/regex-syntax/src/hir/interval.rs
+++ b/vendor/regex-syntax/src/hir/interval.rs
@@ -1,8 +1,6 @@
-use std::char;
-use std::cmp;
-use std::fmt::Debug;
-use std::slice;
-use std::u8;
+use core::{char, cmp, fmt::Debug, slice};
+
+use alloc::vec::Vec;
use crate::unicode;
@@ -32,9 +30,38 @@ use crate::unicode;
//
// Tests on this are relegated to the public API of HIR in src/hir.rs.
-#[derive(Clone, Debug, Eq, PartialEq)]
+#[derive(Clone, Debug)]
pub struct IntervalSet<I> {
+ /// A sorted set of non-overlapping ranges.
ranges: Vec<I>,
+ /// While not required at all for correctness, we keep track of whether an
+ /// interval set has been case folded or not. This helps us avoid doing
+ /// redundant work if, for example, a set has already been case folded.
+ /// And note that whether a set is folded or not is preserved through
+ /// all of the pairwise set operations. That is, if both interval sets
+ /// have been case folded, then any of difference, union, intersection or
+ /// symmetric difference all produce a case folded set.
+ ///
+ /// Note that when this is true, it *must* be the case that the set is case
+ /// folded. But when it's false, the set *may* be case folded. In other
+ /// words, we only set this to true when we know it to be the case, but we're
+ /// okay with it being false if it would otherwise be costly to determine
+ /// whether it should be true. This means code cannot assume that a false
+ /// value necessarily indicates that the set is not case folded.
+ ///
+ /// Bottom line: this is a performance optimization.
+ folded: bool,
+}
+
+impl<I: Interval> Eq for IntervalSet<I> {}
+
+// We implement PartialEq manually so that we don't consider the set's internal
+// 'folded' property to be part of its identity. The 'folded' property is
+// strictly an optimization.
+impl<I: Interval> PartialEq for IntervalSet<I> {
+ fn eq(&self, other: &IntervalSet<I>) -> bool {
+ self.ranges.eq(&other.ranges)
+ }
}
impl<I: Interval> IntervalSet<I> {
@@ -44,7 +71,10 @@ impl<I: Interval> IntervalSet<I> {
/// The given ranges do not need to be in any specific order, and ranges
/// may overlap.
pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
- let mut set = IntervalSet { ranges: intervals.into_iter().collect() };
+ let ranges: Vec<I> = intervals.into_iter().collect();
+ // An empty set is case folded.
+ let folded = ranges.is_empty();
+ let mut set = IntervalSet { ranges, folded };
set.canonicalize();
set
}
@@ -55,6 +85,10 @@ impl<I: Interval> IntervalSet<I> {
// it preserves canonicalization.
self.ranges.push(interval);
self.canonicalize();
+ // We don't know whether the new interval added here is considered
+ // case folded, so we conservatively assume that the entire set is
+ // no longer case folded if it was previously.
+ self.folded = false;
}
/// Return an iterator over all intervals in this set.
@@ -79,6 +113,9 @@ impl<I: Interval> IntervalSet<I> {
/// This returns an error if the necessary case mapping data is not
/// available.
pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
+ if self.folded {
+ return Ok(());
+ }
let len = self.ranges.len();
for i in 0..len {
let range = self.ranges[i];
@@ -88,14 +125,19 @@ impl<I: Interval> IntervalSet<I> {
}
}
self.canonicalize();
+ self.folded = true;
Ok(())
}
/// Union this set with the given set, in place.
pub fn union(&mut self, other: &IntervalSet<I>) {
+ if other.ranges.is_empty() || self.ranges == other.ranges {
+ return;
+ }
// This could almost certainly be done more efficiently.
self.ranges.extend(&other.ranges);
self.canonicalize();
+ self.folded = self.folded && other.folded;
}
/// Intersect this set with the given set, in place.
@@ -105,6 +147,8 @@ impl<I: Interval> IntervalSet<I> {
}
if other.ranges.is_empty() {
self.ranges.clear();
+ // An empty set is case folded.
+ self.folded = true;
return;
}
@@ -134,6 +178,7 @@ impl<I: Interval> IntervalSet<I> {
}
}
self.ranges.drain(..drain_end);
+ self.folded = self.folded && other.folded;
}
/// Subtract the given set from this set, in place.
@@ -226,6 +271,7 @@ impl<I: Interval> IntervalSet<I> {
a += 1;
}
self.ranges.drain(..drain_end);
+ self.folded = self.folded && other.folded;
}
/// Compute the symmetric difference of the two sets, in place.
@@ -251,6 +297,8 @@ impl<I: Interval> IntervalSet<I> {
if self.ranges.is_empty() {
let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
self.ranges.push(I::create(min, max));
+ // The set containing everything must be case folded.
+ self.folded = true;
return;
}
@@ -276,6 +324,19 @@ impl<I: Interval> IntervalSet<I> {
self.ranges.push(I::create(lower, I::Bound::max_value()));
}
self.ranges.drain(..drain_end);
+ // We don't need to update whether this set is folded or not, because
+ // it is conservatively preserved through negation. Namely, if a set
+ // is not folded, then it is possible that its negation is folded, for
+ // example, [^☃]. But we're fine with assuming that the set is not
+ // folded in that case. (`folded` permits false negatives but not false
+ // positives.)
+ //
+ // But what about when a set is folded, is its negation also
+ // necessarily folded? Yes. Because if a set is folded, then for every
+ // character in the set, it necessarily included its equivalence class
+ // of case folded characters. Negating it in turn means that all
+ // equivalence classes in the set are negated, and any equivalence
+ // class that was previously not in the set is now entirely in the set.
}
/// Converts this set into a canonical ordering.
@@ -481,7 +542,7 @@ impl Bound for u8 {
u8::MAX
}
fn as_u32(self) -> u32 {
- self as u32
+ u32::from(self)
}
fn increment(self) -> Self {
self.checked_add(1).unwrap()
@@ -499,20 +560,20 @@ impl Bound for char {
'\u{10FFFF}'
}
fn as_u32(self) -> u32 {
- self as u32
+ u32::from(self)
}
fn increment(self) -> Self {
match self {
'\u{D7FF}' => '\u{E000}',
- c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(),
+ c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(),
}
}
fn decrement(self) -> Self {
match self {
'\u{E000}' => '\u{D7FF}',
- c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(),
+ c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(),
}
}
}
diff --git a/vendor/regex-syntax/src/hir/literal.rs b/vendor/regex-syntax/src/hir/literal.rs
new file mode 100644
index 000000000..bd3a2d143
--- /dev/null
+++ b/vendor/regex-syntax/src/hir/literal.rs
@@ -0,0 +1,3165 @@
+/*!
+Provides literal extraction from `Hir` expressions.
+
+An [`Extractor`] pulls literals out of [`Hir`] expressions and returns a
+[`Seq`] of [`Literal`]s.
+
+The purpose of literal extraction is generally to provide avenues for
+optimizing regex searches. The main idea is that substring searches can be an
+order of magnitude faster than a regex search. Therefore, if one can execute
+a substring search to find candidate match locations and only run the regex
+search at those locations, then it is possible for huge improvements in
+performance to be realized.
+
+With that said, literal optimizations are generally a black art because even
+though substring search is generally faster, if the number of candidates
+produced is high, then it can create a lot of overhead by ping-ponging between
+the substring search and the regex search.
+
+Here are some heuristics that might be used to help increase the chances of
+effective literal optimizations:
+
+* Stick to small [`Seq`]s. If you search for too many literals, it's likely
+to lead to substring search that is only a little faster than a regex search,
+and thus the overhead of using literal optimizations in the first place might
+make things slower overall.
+* The literals in your [`Seq`] shouldn't be too short. In general, longer is
+better. A sequence corresponding to single bytes that occur frequently in the
+haystack, for example, is probably a bad literal optimization because it's
+likely to produce many false positive candidates. Longer literals are less
+likely to match, and thus probably produce fewer false positives.
+* If it's possible to estimate the approximate frequency of each byte according
+to some pre-computed background distribution, it is possible to compute a score
+of how "good" a `Seq` is. If a `Seq` isn't good enough, you might consider
+skipping the literal optimization and just use the regex engine.
+
+(It should be noted that there are always pathological cases that can make
+any kind of literal optimization be a net slower result. This is why it
+might be a good idea to be conservative, or to even provide a means for
+literal optimizations to be dynamically disabled if they are determined to be
+ineffective according to some measure.)
+
+You're encouraged to explore the methods on [`Seq`], which permit shrinking
+the size of sequences in a preference-order preserving fashion.
+
+Finally, note that it isn't strictly necessary to use an [`Extractor`]. Namely,
+an `Extractor` only uses public APIs of the [`Seq`] and [`Literal`] types,
+so it is possible to implement your own extractor. For example, for n-grams
+or "inner" literals (i.e., not prefix or suffix literals). The `Extractor`
+is mostly responsible for the case analysis over `Hir` expressions. Much of
+the "trickier" parts are how to combine literal sequences, and that is all
+implemented on [`Seq`].
+*/
+
+use core::{cmp, mem};
+
+use alloc::{vec, vec::Vec};
+
+use crate::hir::{self, Hir};
+
+/// Extracts prefix or suffix literal sequences from [`Hir`] expressions.
+///
+/// Literal extraction is based on the following observations:
+///
+/// * Many regexes start with one or a small number of literals.
+/// * Substring search for literals is often much faster (sometimes by an order
+/// of magnitude) than a regex search.
+///
+/// Thus, in many cases, one can search for literals to find candidate starting
+/// locations of a match, and then only run the full regex engine at each such
+/// location instead of over the full haystack.
+///
+/// The main downside of literal extraction is that it can wind up causing a
+/// search to be slower overall. For example, if there are many matches or if
+/// there are many candidates that don't ultimately lead to a match, then a
+/// lot of overhead will be spent in shuffling back-and-forth between substring
+/// search and the regex engine. This is the fundamental reason why literal
+/// optimizations for regex patterns are sometimes considered a "black art."
+///
+/// # Look-around assertions
+///
+/// Literal extraction treats all look-around assertions as-if they match every
+/// empty string. So for example, the regex `\bquux\b` will yield a sequence
+/// containing a single exact literal `quux`. However, not all occurrences
+/// of `quux` correspond to a match of the regex. For example, `\bquux\b`
+/// does not match `ZquuxZ` anywhere because `quux` does not fall on a word
+/// boundary.
+///
+/// In effect, if your regex contains look-around assertions, then a match of
+/// an exact literal does not necessarily mean the regex overall matches. So
+/// you may still need to run the regex engine in such cases to confirm the
+/// match.
+///
+/// The precise guarantee you get from a literal sequence is: if every literal
+/// in the sequence is exact and the original regex contains zero look-around
+/// assertions, then a preference-order multi-substring search of those
+/// literals will precisely match a preference-order search of the original
+/// regex.
+///
+/// # Example
+///
+/// This shows how to extract prefixes:
+///
+/// ```
+/// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse};
+///
+/// let hir = parse(r"(a|b|c)(x|y|z)[A-Z]+foo")?;
+/// let got = Extractor::new().extract(&hir);
+/// // All literals returned are "inexact" because none of them reach the
+/// // match state.
+/// let expected = Seq::from_iter([
+/// Literal::inexact("ax"),
+/// Literal::inexact("ay"),
+/// Literal::inexact("az"),
+/// Literal::inexact("bx"),
+/// Literal::inexact("by"),
+/// Literal::inexact("bz"),
+/// Literal::inexact("cx"),
+/// Literal::inexact("cy"),
+/// Literal::inexact("cz"),
+/// ]);
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// This shows how to extract suffixes:
+///
+/// ```
+/// use regex_syntax::{
+/// hir::literal::{Extractor, ExtractKind, Literal, Seq},
+/// parse,
+/// };
+///
+/// let hir = parse(r"foo|[A-Z]+bar")?;
+/// let got = Extractor::new().kind(ExtractKind::Suffix).extract(&hir);
+/// // Since 'foo' gets to a match state, it is considered exact. But 'bar'
+/// // does not because of the '[A-Z]+', and thus is marked inexact.
+/// let expected = Seq::from_iter([
+/// Literal::exact("foo"),
+/// Literal::inexact("bar"),
+/// ]);
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Extractor {
+ kind: ExtractKind,
+ limit_class: usize,
+ limit_repeat: usize,
+ limit_literal_len: usize,
+ limit_total: usize,
+}
+
+impl Extractor {
+ /// Create a new extractor with a default configuration.
+ ///
+ /// The extractor can be optionally configured before calling
+ /// [`Extractor::extract`] to get a literal sequence.
+ pub fn new() -> Extractor {
+ Extractor {
+ kind: ExtractKind::Prefix,
+ limit_class: 10,
+ limit_repeat: 10,
+ limit_literal_len: 100,
+ limit_total: 250,
+ }
+ }
+
+ /// Execute the extractor and return a sequence of literals.
+ pub fn extract(&self, hir: &Hir) -> Seq {
+ use crate::hir::HirKind::*;
+
+ match *hir.kind() {
+ Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])),
+ Literal(hir::Literal(ref bytes)) => {
+ let mut seq =
+ Seq::singleton(self::Literal::exact(bytes.to_vec()));
+ self.enforce_literal_len(&mut seq);
+ seq
+ }
+ Class(hir::Class::Unicode(ref cls)) => {
+ self.extract_class_unicode(cls)
+ }
+ Class(hir::Class::Bytes(ref cls)) => self.extract_class_bytes(cls),
+ Repetition(ref rep) => self.extract_repetition(rep),
+ Capture(hir::Capture { ref sub, .. }) => self.extract(sub),
+ Concat(ref hirs) => match self.kind {
+ ExtractKind::Prefix => self.extract_concat(hirs.iter()),
+ ExtractKind::Suffix => self.extract_concat(hirs.iter().rev()),
+ },
+ Alternation(ref hirs) => {
+ // Unlike concat, we always union starting from the beginning,
+ // since the beginning corresponds to the highest preference,
+ // which doesn't change based on forwards vs reverse.
+ self.extract_alternation(hirs.iter())
+ }
+ }
+ }
+
+ /// Set the kind of literal sequence to extract from an [`Hir`] expression.
+ ///
+ /// The default is to extract prefixes, but suffixes can be selected
+ /// instead. The contract for prefixes is that every match of the
+ /// corresponding `Hir` must start with one of the literals in the sequence
+ /// returned. Moreover, the _order_ of the sequence returned corresponds to
+ /// the preference order.
+ ///
+ /// Suffixes satisfy a similar contract in that every match of the
+ /// corresponding `Hir` must end with one of the literals in the sequence
+ /// returned. However, there is no guarantee that the literals are in
+ /// preference order.
+ ///
+ /// Remember that a sequence can be infinite. For example, unless the
+ /// limits are configured to be impractically large, attempting to extract
+ /// prefixes (or suffixes) for the pattern `[A-Z]` will return an infinite
+ /// sequence. Generally speaking, if the sequence returned is infinite,
+ /// then it is presumed to be unwise to do prefix (or suffix) optimizations
+ /// for the pattern.
+ pub fn kind(&mut self, kind: ExtractKind) -> &mut Extractor {
+ self.kind = kind;
+ self
+ }
+
+ /// Configure a limit on the length of the sequence that is permitted for
+ /// a character class. If a character class exceeds this limit, then the
+ /// sequence returned for it is infinite.
+ ///
+ /// This prevents classes like `[A-Z]` or `\pL` from getting turned into
+ /// huge and likely unproductive sequences of literals.
+ ///
+ /// # Example
+ ///
+ /// This example shows how this limit can be lowered to decrease the tolerance
+ /// for character classes being turned into literal sequences.
+ ///
+ /// ```
+ /// use regex_syntax::{hir::literal::{Extractor, Seq}, parse};
+ ///
+ /// let hir = parse(r"[0-9]")?;
+ ///
+ /// let got = Extractor::new().extract(&hir);
+ /// let expected = Seq::new([
+ /// "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
+ /// ]);
+ /// assert_eq!(expected, got);
+ ///
+ /// // Now let's shrink the limit and see how that changes things.
+ /// let got = Extractor::new().limit_class(4).extract(&hir);
+ /// let expected = Seq::infinite();
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn limit_class(&mut self, limit: usize) -> &mut Extractor {
+ self.limit_class = limit;
+ self
+ }
+
+ /// Configure a limit on the total number of repetitions that is permitted
+ /// before literal extraction is stopped.
+ ///
+ /// This is useful for limiting things like `(abcde){50}`, or more
+ /// insidiously, `(?:){1000000000}`. This limit prevents any one single
+ /// repetition from adding too much to a literal sequence.
+ ///
+ /// With this limit set, repetitions that exceed it will be stopped and any
+ /// literals extracted up to that point will be made inexact.
+ ///
+ /// # Example
+ ///
+ /// This shows how to decrease the limit and compares it with the default.
+ ///
+ /// ```
+ /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse};
+ ///
+ /// let hir = parse(r"(abc){8}")?;
+ ///
+ /// let got = Extractor::new().extract(&hir);
+ /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]);
+ /// assert_eq!(expected, got);
+ ///
+ /// // Now let's shrink the limit and see how that changes things.
+ /// let got = Extractor::new().limit_repeat(4).extract(&hir);
+ /// let expected = Seq::from_iter([
+ /// Literal::inexact("abcabcabcabc"),
+ /// ]);
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn limit_repeat(&mut self, limit: usize) -> &mut Extractor {
+ self.limit_repeat = limit;
+ self
+ }
+
+ /// Configure a limit on the maximum length of any literal in a sequence.
+ ///
+ /// This is useful for limiting things like `(abcde){5}{5}{5}{5}`. While
+ /// each repetition or literal in that regex is small, when all the
+ /// repetitions are applied, one ends up with a literal of length `5^5 =
+ /// 3125`.
+ ///
+ /// With this limit set, literals that exceed it will be made inexact and
+ /// thus prevented from growing.
+ ///
+ /// # Example
+ ///
+ /// This shows how to decrease the limit and compares it with the default.
+ ///
+ /// ```
+ /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse};
+ ///
+ /// let hir = parse(r"(abc){2}{2}{2}")?;
+ ///
+ /// let got = Extractor::new().extract(&hir);
+ /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]);
+ /// assert_eq!(expected, got);
+ ///
+ /// // Now let's shrink the limit and see how that changes things.
+ /// let got = Extractor::new().limit_literal_len(14).extract(&hir);
+ /// let expected = Seq::from_iter([
+ /// Literal::inexact("abcabcabcabcab"),
+ /// ]);
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn limit_literal_len(&mut self, limit: usize) -> &mut Extractor {
+ self.limit_literal_len = limit;
+ self
+ }
+
+ /// Configure a limit on the total number of literals that will be
+ /// returned.
+ ///
+ /// This is useful as a practical measure for avoiding the creation of
+ /// large sequences of literals. While the extractor will automatically
+ /// handle local creations of large sequences (for example, `[A-Z]` yields
+ /// an infinite sequence by default), large sequences can be created
+ /// through non-local means as well.
+ ///
+ /// For example, `[ab]{3}{3}` would yield a sequence of length `512 = 2^9`
+ /// despite each of the repetitions being small on their own. This limit
+ /// thus represents a "catch all" for avoiding locally small sequences from
+ /// combining into large sequences.
+ ///
+ /// # Example
+ ///
+ /// This example shows how reducing the limit will change the literal
+ /// sequence returned.
+ ///
+ /// ```
+ /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse};
+ ///
+ /// let hir = parse(r"[ab]{2}{2}")?;
+ ///
+ /// let got = Extractor::new().extract(&hir);
+ /// let expected = Seq::new([
+ /// "aaaa", "aaab", "aaba", "aabb",
+ /// "abaa", "abab", "abba", "abbb",
+ /// "baaa", "baab", "baba", "babb",
+ /// "bbaa", "bbab", "bbba", "bbbb",
+ /// ]);
+ /// assert_eq!(expected, got);
+ ///
+ /// // The default limit is not too big, but big enough to extract all
+ /// // literals from '[ab]{2}{2}'. If we shrink the limit to less than 16,
+ /// // then we'll get a truncated set. Notice that it returns a sequence of
+ /// // length 4 even though our limit was 10. This is because the sequence
+ /// // is difficult to increase without blowing the limit. Notice also
+ /// // that every literal in the sequence is now inexact because they were
+ /// // stripped of some suffix.
+ /// let got = Extractor::new().limit_total(10).extract(&hir);
+ /// let expected = Seq::from_iter([
+ /// Literal::inexact("aa"),
+ /// Literal::inexact("ab"),
+ /// Literal::inexact("ba"),
+ /// Literal::inexact("bb"),
+ /// ]);
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn limit_total(&mut self, limit: usize) -> &mut Extractor {
+ self.limit_total = limit;
+ self
+ }
+
+ /// Extract a sequence from the given concatenation. Sequences from each of
+ /// the child HIR expressions are combined via cross product.
+ ///
+ /// This short circuits once the cross product turns into a sequence
+ /// containing only inexact literals.
+ fn extract_concat<'a, I: Iterator<Item = &'a Hir>>(&self, it: I) -> Seq {
+ let mut seq = Seq::singleton(self::Literal::exact(vec![]));
+ for hir in it {
+ // If every element in the sequence is inexact, then a cross
+ // product will always be a no-op. Thus, there is nothing else we
+ // can add to it and can quit early. Note that this also includes
+ // infinite sequences.
+ if seq.is_inexact() {
+ break;
+ }
+ // Note that 'cross' also dispatches based on whether we're
+ // extracting prefixes or suffixes.
+ seq = self.cross(seq, &mut self.extract(hir));
+ }
+ seq
+ }
+
+ /// Extract a sequence from the given alternation.
+ ///
+ /// This short circuits once the union turns into an infinite sequence.
+ fn extract_alternation<'a, I: Iterator<Item = &'a Hir>>(
+ &self,
+ it: I,
+ ) -> Seq {
+ let mut seq = Seq::empty();
+ for hir in it {
+ // Once our 'seq' is infinite, every subsequent union
+ // operation on it will itself always result in an
+ // infinite sequence. Thus, it can never change and we can
+ // short-circuit.
+ if !seq.is_finite() {
+ break;
+ }
+ seq = self.union(seq, &mut self.extract(hir));
+ }
+ seq
+ }
+
+ /// Extract a sequence of literals from the given repetition. We do our
+ /// best. Some examples:
+ ///
+ /// 'a*' => [inexact(a), exact("")]
+ /// 'a*?' => [exact(""), inexact(a)]
+ /// 'a+' => [inexact(a)]
+ /// 'a{3}' => [exact(aaa)]
+ /// 'a{3,5}' => [inexact(aaa)]
+ ///
+ /// The key here really is making sure we get the 'inexact' vs 'exact'
+ /// attributes correct on each of the literals we add. For example, the
+ /// fact that 'a*' gives us an inexact 'a' and an exact empty string means
+ /// that a regex like 'ab*c' will result in [inexact(ab), exact(ac)]
+ /// literals being extracted, which might actually be a better prefilter
+ /// than just 'a'.
+ fn extract_repetition(&self, rep: &hir::Repetition) -> Seq {
+ let mut subseq = self.extract(&rep.sub);
+ match *rep {
+ hir::Repetition { min: 0, max, greedy, .. } => {
+ // When 'max=1', we can retain exactness, since 'a?' is
+ // equivalent to 'a|'. Similarly below, 'a??' is equivalent to
+ // '|a'.
+ if max != Some(1) {
+ subseq.make_inexact();
+ }
+ let mut empty = Seq::singleton(Literal::exact(vec![]));
+ if !greedy {
+ mem::swap(&mut subseq, &mut empty);
+ }
+ self.union(subseq, &mut empty)
+ }
+ hir::Repetition { min, max: Some(max), .. } if min == max => {
+ assert!(min > 0); // handled above
+ let limit =
+ u32::try_from(self.limit_repeat).unwrap_or(u32::MAX);
+ let mut seq = Seq::singleton(Literal::exact(vec![]));
+ for _ in 0..cmp::min(min, limit) {
+ if seq.is_inexact() {
+ break;
+ }
+ seq = self.cross(seq, &mut subseq.clone());
+ }
+ if usize::try_from(min).is_err() || min > limit {
+ seq.make_inexact();
+ }
+ seq
+ }
+ hir::Repetition { min, max: Some(max), .. } if min < max => {
+ assert!(min > 0); // handled above
+ let limit =
+ u32::try_from(self.limit_repeat).unwrap_or(u32::MAX);
+ let mut seq = Seq::singleton(Literal::exact(vec![]));
+ for _ in 0..cmp::min(min, limit) {
+ if seq.is_inexact() {
+ break;
+ }
+ seq = self.cross(seq, &mut subseq.clone());
+ }
+ seq.make_inexact();
+ seq
+ }
+ hir::Repetition { .. } => {
+ subseq.make_inexact();
+ subseq
+ }
+ }
+ }
+
+ /// Convert the given Unicode class into a sequence of literals if the
+ /// class is small enough. If the class is too big, return an infinite
+ /// sequence.
+ fn extract_class_unicode(&self, cls: &hir::ClassUnicode) -> Seq {
+ if self.class_over_limit_unicode(cls) {
+ return Seq::infinite();
+ }
+ let mut seq = Seq::empty();
+ for r in cls.iter() {
+ for ch in r.start()..=r.end() {
+ seq.push(Literal::from(ch));
+ }
+ }
+ self.enforce_literal_len(&mut seq);
+ seq
+ }
+
+ /// Convert the given byte class into a sequence of literals if the class
+ /// is small enough. If the class is too big, return an infinite sequence.
+ fn extract_class_bytes(&self, cls: &hir::ClassBytes) -> Seq {
+ if self.class_over_limit_bytes(cls) {
+ return Seq::infinite();
+ }
+ let mut seq = Seq::empty();
+ for r in cls.iter() {
+ for b in r.start()..=r.end() {
+ seq.push(Literal::from(b));
+ }
+ }
+ self.enforce_literal_len(&mut seq);
+ seq
+ }
+
+ /// Returns true if the given Unicode class exceeds the configured limits
+ /// on this extractor.
+ fn class_over_limit_unicode(&self, cls: &hir::ClassUnicode) -> bool {
+ let mut count = 0;
+ for r in cls.iter() {
+ if count > self.limit_class {
+ return true;
+ }
+ count += r.len();
+ }
+ count > self.limit_class
+ }
+
+ /// Returns true if the given byte class exceeds the configured limits on
+ /// this extractor.
+ fn class_over_limit_bytes(&self, cls: &hir::ClassBytes) -> bool {
+ let mut count = 0;
+ for r in cls.iter() {
+ if count > self.limit_class {
+ return true;
+ }
+ count += r.len();
+ }
+ count > self.limit_class
+ }
+
+ /// Compute the cross product of the two sequences if the result would be
+ /// within configured limits. Otherwise, make `seq2` infinite and cross the
+ /// infinite sequence with `seq1`.
+ fn cross(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq {
+ if seq1.max_cross_len(seq2).map_or(false, |len| len > self.limit_total)
+ {
+ seq2.make_infinite();
+ }
+ if let ExtractKind::Suffix = self.kind {
+ seq1.cross_reverse(seq2);
+ } else {
+ seq1.cross_forward(seq2);
+ }
+ assert!(seq1.len().map_or(true, |x| x <= self.limit_total));
+ self.enforce_literal_len(&mut seq1);
+ seq1
+ }
+
+ /// Union the two sequences if the result would be within configured
+ /// limits. Otherwise, make `seq2` infinite and union the infinite sequence
+ /// with `seq1`.
+ fn union(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq {
+ if seq1.max_union_len(seq2).map_or(false, |len| len > self.limit_total)
+ {
+ // We try to trim our literal sequences to see if we can make
+ // room for more literals. The idea is that we'd rather trim down
+ // literals already in our sequence if it means we can add a few
+ // more and retain a finite sequence. Otherwise, we'll union with
+ // an infinite sequence and that infects everything and effectively
+ // stops literal extraction in its tracks.
+ //
+ // Why do we keep 4 bytes here? Well, it's a bit of an abstraction
+ // leakage. Downstream, the literals may wind up getting fed to
+ // the Teddy algorithm, which supports searching literals up to
+ // length 4. So that's why we pick that number here. Arguably this
+ // should be a tuneable parameter, but it seems a little tricky to
+ // describe. And I'm still unsure if this is the right way to go
+ // about culling literal sequences.
+ match self.kind {
+ ExtractKind::Prefix => {
+ seq1.keep_first_bytes(4);
+ seq2.keep_first_bytes(4);
+ }
+ ExtractKind::Suffix => {
+ seq1.keep_last_bytes(4);
+ seq2.keep_last_bytes(4);
+ }
+ }
+ seq1.dedup();
+ seq2.dedup();
+ if seq1
+ .max_union_len(seq2)
+ .map_or(false, |len| len > self.limit_total)
+ {
+ seq2.make_infinite();
+ }
+ }
+ seq1.union(seq2);
+ assert!(seq1.len().map_or(true, |x| x <= self.limit_total));
+ seq1
+ }
+
+ /// Applies the literal length limit to the given sequence. If none of the
+ /// literals in the sequence exceed the limit, then this is a no-op.
+ fn enforce_literal_len(&self, seq: &mut Seq) {
+ let len = self.limit_literal_len;
+ match self.kind {
+ ExtractKind::Prefix => seq.keep_first_bytes(len),
+ ExtractKind::Suffix => seq.keep_last_bytes(len),
+ }
+ }
+}
+
+impl Default for Extractor {
+ fn default() -> Extractor {
+ Extractor::new()
+ }
+}
+
+/// The kind of literals to extract from an [`Hir`] expression.
+///
+/// The default extraction kind is `Prefix`.
+#[non_exhaustive]
+#[derive(Clone, Debug)]
+pub enum ExtractKind {
+ /// Extracts only prefix literals from a regex.
+ Prefix,
+ /// Extracts only suffix literals from a regex.
+ ///
+ /// Note that the sequence returned by suffix literals currently may
+ /// not correctly represent leftmost-first or "preference" order match
+ /// semantics.
+ Suffix,
+}
+
+impl ExtractKind {
+ /// Returns true if this kind is the `Prefix` variant.
+ pub fn is_prefix(&self) -> bool {
+ matches!(*self, ExtractKind::Prefix)
+ }
+
+ /// Returns true if this kind is the `Suffix` variant.
+ pub fn is_suffix(&self) -> bool {
+ matches!(*self, ExtractKind::Suffix)
+ }
+}
+
+impl Default for ExtractKind {
+ fn default() -> ExtractKind {
+ ExtractKind::Prefix
+ }
+}
+
+/// A sequence of literals.
+///
+/// A `Seq` is very much like a set in that it represents a union of its
+/// members. That is, it corresponds to a set of literals where at least one
+/// must match in order for a particular [`Hir`] expression to match. (Whether
+/// this corresponds to the entire `Hir` expression, a prefix of it or a suffix
+/// of it depends on how the `Seq` was extracted from the `Hir`.)
+///
+/// It is also unlike a set in that multiple identical literals may appear,
+/// and that the order of the literals in the `Seq` matters. For example, if
+/// the sequence is `[sam, samwise]` and leftmost-first matching is used, then
+/// `samwise` can never match and the sequence is equivalent to `[sam]`.
+///
+/// # States of a sequence
+///
+/// A `Seq` has a few different logical states to consider:
+///
+/// * The sequence can represent "any" literal. When this happens, the set does
+/// not have a finite size. The purpose of this state is to inhibit callers
+/// from making assumptions about what literals are required in order to match
+/// a particular [`Hir`] expression. Generally speaking, when a set is in this
+/// state, literal optimizations are inhibited. A good example of a regex that
+ /// will cause this sort of set to appear is `[A-Za-z]`. The character class
+/// is just too big (and also too narrow) to be usefully expanded into 52
+/// different literals. (Note that the decision for when a seq should become
+/// infinite is determined by the caller. A seq itself has no hard-coded
+/// limits.)
+/// * The sequence can be empty, in which case, it is an affirmative statement
+/// that there are no literals that can match the corresponding `Hir`.
+/// Consequently, the `Hir` never matches any input. For example, `[a&&b]`.
+/// * The sequence can be non-empty, in which case, at least one of the
+/// literals must match in order for the corresponding `Hir` to match.
+///
+/// # Example
+///
+/// This example shows how literal sequences can be simplified by stripping
+/// suffixes and minimizing while maintaining preference order.
+///
+/// ```
+/// use regex_syntax::hir::literal::{Literal, Seq};
+///
+/// let mut seq = Seq::new(&[
+/// "farm",
+/// "appliance",
+/// "faraway",
+/// "apple",
+/// "fare",
+/// "gap",
+/// "applicant",
+/// "applaud",
+/// ]);
+/// seq.keep_first_bytes(3);
+/// seq.minimize_by_preference();
+/// // Notice that 'far' comes before 'app', which matches the order in the
+/// // original sequence. This guarantees that leftmost-first semantics are
+/// // not altered by simplifying the set.
+/// let expected = Seq::from_iter([
+/// Literal::inexact("far"),
+/// Literal::inexact("app"),
+/// Literal::exact("gap"),
+/// ]);
+/// assert_eq!(expected, seq);
+/// ```
+#[derive(Clone, Eq, PartialEq)]
+pub struct Seq {
+ /// The literals held by this seq, in preference order.
+ ///
+ /// A value of `None` means the seq stands for every possible literal:
+ /// nothing may be assumed about which specific literals it contains,
+ /// and callers must behave as if any literal could be a member.
+ ///
+ /// `Some(vec![])` is a valid value too. It is the empty seq of
+ /// literals, i.e., a regex that can never match. For example, `[a&&b]`.
+ /// Do not confuse it with `Some(vec![""])`, which is the seq holding
+ /// just the empty string and therefore matches at every position.
+ literals: Option<Vec<Literal>>, // None = infinite; Some(vec![]) = matches nothing
+}
+
+impl Seq {
+ /// Returns an empty sequence.
+ ///
+ /// An empty sequence matches zero literals, and thus corresponds to a
+ /// regex that itself can never match.
+ #[inline]
+ pub fn empty() -> Seq {
+     // Finite, but with zero members: no literal can match.
+     let members: Vec<Literal> = Vec::new();
+     Seq { literals: Some(members) }
+ }
+
+ /// Returns a sequence of literals without a finite size and may contain
+ /// any literal.
+ ///
+ /// A sequence without finite size does not reveal anything about the
+ /// characteristics of the literals in its set. There are no fixed prefixes
+ /// or suffixes, nor are lower or upper bounds on the length of the literals
+ /// in the set known.
+ ///
+ /// This is useful to represent constructs in a regex that are "too big"
+ /// to usefully represent as a sequence of literals. For example, `[A-Za-z]`.
+ /// When sequences get too big, they lose their discriminating nature and
+ /// are more likely to produce false positives, which in turn makes them
+ /// less likely to speed up searches.
+ ///
+ /// More pragmatically, for many regexes, enumerating all possible literals
+ /// is itself not possible or might otherwise use too many resources. So
+ /// constraining the size of sets during extraction is a practical trade
+ /// off to make.
+ #[inline]
+ pub fn infinite() -> Seq {
+     // The absence of a member list is what marks a seq as infinite.
+     let literals = None;
+     Seq { literals }
+ }
+
+ /// Returns a sequence containing a single literal.
+ #[inline]
+ pub fn singleton(lit: Literal) -> Seq {
+     // A finite seq whose only member is `lit`, exactness untouched.
+     let members = vec![lit];
+     Seq { literals: Some(members) }
+ }
+
+ /// Returns a sequence of exact literals from the given byte strings.
+ #[inline]
+ pub fn new<I, B>(it: I) -> Seq
+ where
+     I: IntoIterator<Item = B>,
+     B: AsRef<[u8]>,
+ {
+     // Each byte string is turned into an exact literal, and the literals
+     // are then collected into a `Seq`.
+     let to_exact = |bytes: B| Literal::exact(bytes.as_ref());
+     it.into_iter().map(to_exact).collect()
+ }
+
+ /// If this is a finite sequence, return its members as a slice of
+ /// literals.
+ ///
+ /// The slice returned may be empty, in which case, there are no literals
+ /// that can match this sequence.
+ #[inline]
+ pub fn literals(&self) -> Option<&[Literal]> {
+     // Borrow the member vector (if any) and view it as a slice.
+     self.literals.as_ref().map(|lits| lits.as_slice())
+ }
+
+ /// Push a literal to the end of this sequence.
+ ///
+ /// If this sequence is not finite, then this is a no-op.
+ ///
+ /// Similarly, if the most recently added item of this sequence is
+ /// equivalent to the literal given, then it is not added. This reflects
+ /// a `Seq`'s "set like" behavior, and represents a practical trade off.
+ /// Namely, there is never any need to have two adjacent and equivalent
+ /// literals in the same sequence, _and_ it is easy to detect in some
+ /// cases.
+ #[inline]
+ pub fn push(&mut self, lit: Literal) {
+     // Pushing onto an infinite seq does nothing.
+     if let Some(lits) = self.literals.as_mut() {
+         // Only the current tail is compared; an equal literal further
+         // back in the seq does not block the push.
+         let repeats_tail = lits.last() == Some(&lit);
+         if !repeats_tail {
+             lits.push(lit);
+         }
+     }
+ }
+
+ /// Make all of the literals in this sequence inexact.
+ ///
+ /// This is a no-op if this sequence is not finite.
+ #[inline]
+ pub fn make_inexact(&mut self) {
+     // An infinite seq has no members to downgrade.
+     if let Some(lits) = self.literals.as_mut() {
+         for member in lits {
+             member.make_inexact();
+         }
+     }
+ }
+
+ /// Converts this sequence to an infinite sequence.
+ ///
+ /// This is a no-op if the sequence is already infinite.
+ #[inline]
+ pub fn make_infinite(&mut self) {
+     // Replacing the whole value drops any members that were held.
+     *self = Seq { literals: None };
+ }
+
+ /// Modify this sequence to contain the cross product between it and the
+ /// sequence given.
+ ///
+ /// The cross product only considers literals in this sequence that are
+ /// exact. That is, inexact literals are not extended.
+ ///
+ /// The literals are always drained from `other`, even if none are used.
+ /// This permits callers to reuse the sequence allocation elsewhere.
+ ///
+ /// If this sequence is infinite, then this is a no-op, regardless of what
+ /// `other` contains (and in this case, the literals are still drained from
+ /// `other`). If `other` is infinite and this sequence is finite, then this
+ /// is a no-op, unless this sequence contains a zero-length literal. In
+ /// which case, the infiniteness of `other` infects this sequence, and this
+ /// sequence is itself made infinite.
+ ///
+ /// Like [`Seq::union`], this may attempt to deduplicate literals. See
+ /// [`Seq::dedup`] for how deduplication deals with exact and inexact
+ /// literals.
+ ///
+ /// # Example
+ ///
+ /// This example shows basic usage and how exact and inexact literals
+ /// interact.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq1 = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// let mut seq2 = Seq::from_iter([
+ /// Literal::inexact("quux"),
+ /// Literal::exact("baz"),
+ /// ]);
+ /// seq1.cross_forward(&mut seq2);
+ ///
+ /// // The literals are pulled out of seq2.
+ /// assert_eq!(Some(0), seq2.len());
+ ///
+ /// let expected = Seq::from_iter([
+ /// Literal::inexact("fooquux"),
+ /// Literal::exact("foobaz"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// assert_eq!(expected, seq1);
+ /// ```
+ ///
+ /// This example shows the behavior of when `other` is an infinite
+ /// sequence.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq1 = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// let mut seq2 = Seq::infinite();
+ /// seq1.cross_forward(&mut seq2);
+ ///
+ /// // When seq2 is infinite, cross product doesn't add anything, but
+ /// // ensures all members of seq1 are inexact.
+ /// let expected = Seq::from_iter([
+ /// Literal::inexact("foo"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// assert_eq!(expected, seq1);
+ /// ```
+ ///
+ /// This example is like the one above, but shows what happens when this
+ /// sequence contains an empty string. In this case, an infinite `other`
+ /// sequence infects this sequence (because the empty string means that
+ /// there are no finite prefixes):
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq1 = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::exact(""), // inexact provokes same behavior
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// let mut seq2 = Seq::infinite();
+ /// seq1.cross_forward(&mut seq2);
+ ///
+ /// // seq1 is now infinite!
+ /// assert!(!seq1.is_finite());
+ /// ```
+ ///
+ /// This example shows the behavior when this sequence is infinite.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq1 = Seq::infinite();
+ /// let mut seq2 = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// seq1.cross_forward(&mut seq2);
+ ///
+ /// // seq1 remains unchanged.
+ /// assert!(!seq1.is_finite());
+ /// // Even though the literals in seq2 weren't used, it was still drained.
+ /// assert_eq!(Some(0), seq2.len());
+ /// ```
+ #[inline]
+ pub fn cross_forward(&mut self, other: &mut Seq) {
+     // The preamble settles every case involving an infinite sequence. If
+     // it yields nothing, there is no cross product left to compute.
+     let (lits1, lits2) = match self.cross_preamble(other) {
+         Some(pair) => pair,
+         None => return,
+     };
+     let capacity = lits1.len().saturating_mul(lits2.len());
+     let prefixes = mem::replace(lits1, Vec::with_capacity(capacity));
+     for prefix in prefixes {
+         // An inexact literal cannot be extended; carry it over unchanged.
+         if !prefix.is_exact() {
+             lits1.push(prefix);
+             continue;
+         }
+         for tail in lits2.iter() {
+             let mut joined = Literal::exact(Vec::with_capacity(
+                 prefix.len() + tail.len(),
+             ));
+             joined.extend(&prefix);
+             joined.extend(&tail);
+             // The result is only as exact as the piece appended to it.
+             if !tail.is_exact() {
+                 joined.make_inexact();
+             }
+             lits1.push(joined);
+         }
+     }
+     // `other` is always left empty so its allocation can be reused.
+     lits2.clear();
+     self.dedup();
+ }
+
+ /// Modify this sequence to contain the cross product between it and
+ /// the sequence given, where the sequences are treated as suffixes
+ /// instead of prefixes. Namely, the sequence `other` is *prepended*
+ /// to `self` (as opposed to `other` being *appended* to `self` in
+ /// [`Seq::cross_forward`]).
+ ///
+ /// The cross product only considers literals in this sequence that are
+ /// exact. That is, inexact literals are not extended.
+ ///
+ /// The literals are always drained from `other`, even if none are used.
+ /// This permits callers to reuse the sequence allocation elsewhere.
+ ///
+ /// If this sequence is infinite, then this is a no-op, regardless of what
+ /// `other` contains (and in this case, the literals are still drained from
+ /// `other`). If `other` is infinite and this sequence is finite, then this
+ /// is a no-op, unless this sequence contains a zero-length literal. In
+ /// which case, the infiniteness of `other` infects this sequence, and this
+ /// sequence is itself made infinite.
+ ///
+ /// Like [`Seq::union`], this may attempt to deduplicate literals. See
+ /// [`Seq::dedup`] for how deduplication deals with exact and inexact
+ /// literals.
+ ///
+ /// # Example
+ ///
+ /// This example shows basic usage and how exact and inexact literals
+ /// interact.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq1 = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// let mut seq2 = Seq::from_iter([
+ /// Literal::inexact("quux"),
+ /// Literal::exact("baz"),
+ /// ]);
+ /// seq1.cross_reverse(&mut seq2);
+ ///
+ /// // The literals are pulled out of seq2.
+ /// assert_eq!(Some(0), seq2.len());
+ ///
+ /// let expected = Seq::from_iter([
+ /// Literal::inexact("quuxfoo"),
+ /// Literal::inexact("bar"),
+ /// Literal::exact("bazfoo"),
+ /// ]);
+ /// assert_eq!(expected, seq1);
+ /// ```
+ ///
+ /// This example shows the behavior of when `other` is an infinite
+ /// sequence.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq1 = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// let mut seq2 = Seq::infinite();
+ /// seq1.cross_reverse(&mut seq2);
+ ///
+ /// // When seq2 is infinite, cross product doesn't add anything, but
+ /// // ensures all members of seq1 are inexact.
+ /// let expected = Seq::from_iter([
+ /// Literal::inexact("foo"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// assert_eq!(expected, seq1);
+ /// ```
+ ///
+ /// This example is like the one above, but shows what happens when this
+ /// sequence contains an empty string. In this case, an infinite `other`
+ /// sequence infects this sequence (because the empty string means that
+ /// there are no finite suffixes):
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq1 = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::exact(""), // inexact provokes same behavior
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// let mut seq2 = Seq::infinite();
+ /// seq1.cross_reverse(&mut seq2);
+ ///
+ /// // seq1 is now infinite!
+ /// assert!(!seq1.is_finite());
+ /// ```
+ ///
+ /// This example shows the behavior when this sequence is infinite.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq1 = Seq::infinite();
+ /// let mut seq2 = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::inexact("bar"),
+ /// ]);
+ /// seq1.cross_reverse(&mut seq2);
+ ///
+ /// // seq1 remains unchanged.
+ /// assert!(!seq1.is_finite());
+ /// // Even though the literals in seq2 weren't used, it was still drained.
+ /// assert_eq!(Some(0), seq2.len());
+ /// ```
+ #[inline]
+ pub fn cross_reverse(&mut self, other: &mut Seq) {
+     let (lits1, lits2) = match self.cross_preamble(other) {
+         Some(pair) => pair,
+         None => return,
+     };
+     // This mirrors 'cross_forward' with the loops swapped: 'other' drives
+     // the outer loop because its literals get *prepended* to the suffixes
+     // held by 'self'.
+     let capacity = lits1.len().saturating_mul(lits2.len());
+     let suffixes = mem::replace(lits1, Vec::with_capacity(capacity));
+     for (round, head) in lits2.drain(..).enumerate() {
+         for suffix in suffixes.iter() {
+             if suffix.is_exact() {
+                 let mut joined = Literal::exact(Vec::with_capacity(
+                     head.len() + suffix.len(),
+                 ));
+                 joined.extend(&head);
+                 joined.extend(&suffix);
+                 // The result is only as exact as the piece prepended.
+                 if !head.is_exact() {
+                     joined.make_inexact();
+                 }
+                 lits1.push(joined);
+             } else if round == 0 {
+                 // Nothing can be prepended to an inexact suffix, but it
+                 // still has to be kept. Copying it during the first round
+                 // only avoids piling up duplicates, which would be
+                 // correct but wasteful.
+                 lits1.push(suffix.clone());
+             }
+         }
+     }
+     self.dedup();
+ }
+
+ /// A helper function that corresponds to the subtle preamble for both
+ /// `cross_forward` and `cross_reverse`. In effect, it handles the cases
+ /// of infinite sequences for both `self` and `other`, as well as ensuring
+ /// that literals from `other` are drained even if they aren't used.
+ fn cross_preamble<'a>(
+     &'a mut self,
+     other: &'a mut Seq,
+ ) -> Option<(&'a mut Vec<Literal>, &'a mut Vec<Literal>)> {
+     let lits2 = match other.literals.as_mut() {
+         Some(lits) => lits,
+         None => {
+             // 'other' matches any literal. If this seq contains the empty
+             // string, then it must now match any literal as well.
+             // Otherwise it is enough to mark every member inexact.
+             match self.min_literal_len() {
+                 Some(0) => *self = Seq::infinite(),
+                 _ => self.make_inexact(),
+             }
+             return None;
+         }
+     };
+     match self.literals.as_mut() {
+         Some(lits1) => Some((lits1, lits2)),
+         None => {
+             // The caller will never get to drain 'other' in this case, so
+             // it has to be emptied here.
+             lits2.clear();
+             None
+         }
+     }
+ }
+
+ /// Unions the `other` sequence into this one.
+ ///
+ /// The literals are always drained out of the given `other` sequence,
+ /// even if they are being unioned into an infinite sequence. This permits
+ /// the caller to reuse the `other` sequence in another context.
+ ///
+ /// Some literal deduping may be performed. If any deduping happens,
+ /// any leftmost-first or "preference" order match semantics will be
+ /// preserved.
+ ///
+ /// # Example
+ ///
+ /// This example shows basic usage.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::Seq;
+ ///
+ /// let mut seq1 = Seq::new(&["foo", "bar"]);
+ /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]);
+ /// seq1.union(&mut seq2);
+ ///
+ /// // The literals are pulled out of seq2.
+ /// assert_eq!(Some(0), seq2.len());
+ ///
+ /// // Adjacent literals are deduped, but non-adjacent literals may not be.
+ /// assert_eq!(Seq::new(&["foo", "bar", "quux", "foo"]), seq1);
+ /// ```
+ ///
+ /// This example shows that literals are drained from `other` even when
+ /// they aren't necessarily used.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::Seq;
+ ///
+ /// let mut seq1 = Seq::infinite();
+ /// // Infinite sequences have no finite length.
+ /// assert_eq!(None, seq1.len());
+ ///
+ /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]);
+ /// seq1.union(&mut seq2);
+ ///
+ /// // seq1 is still infinite and seq2 has been drained.
+ /// assert_eq!(None, seq1.len());
+ /// assert_eq!(Some(0), seq2.len());
+ /// ```
+ #[inline]
+ pub fn union(&mut self, other: &mut Seq) {
+     let incoming = match other.literals.as_mut() {
+         Some(lits) => lits.drain(..),
+         None => {
+             // The union with an infinite sequence is itself infinite.
+             self.make_infinite();
+             return;
+         }
+     };
+     // If this seq is infinite, `incoming` is simply dropped unused at the
+     // end of the function, which still empties `other`.
+     if let Some(lits1) = self.literals.as_mut() {
+         lits1.extend(incoming);
+         self.dedup();
+     }
+ }
+
+ /// Unions the `other` sequence into this one by splicing the `other`
+ /// sequence at the position of the first zero-length literal.
+ ///
+ /// This is useful for preserving preference order semantics when combining
+ /// two literal sequences. For example, in the regex `(a||f)+foo`, the
+ /// correct preference order prefix sequence is `[a, foo, f]`.
+ ///
+ /// The literals are always drained out of the given `other` sequence,
+ /// even if they are being unioned into an infinite sequence. This permits
+ /// the caller to reuse the `other` sequence in another context. Note that
+ /// the literals are drained even if no union is performed as well, i.e.,
+ /// when this sequence does not contain a zero-length literal.
+ ///
+ /// Some literal deduping may be performed. If any deduping happens,
+ /// any leftmost-first or "preference" order match semantics will be
+ /// preserved.
+ ///
+ /// # Example
+ ///
+ /// This example shows basic usage.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::Seq;
+ ///
+ /// let mut seq1 = Seq::new(&["a", "", "f", ""]);
+ /// let mut seq2 = Seq::new(&["foo"]);
+ /// seq1.union_into_empty(&mut seq2);
+ ///
+ /// // The literals are pulled out of seq2.
+ /// assert_eq!(Some(0), seq2.len());
+ /// // 'foo' gets spliced into seq1 where the first empty string occurs.
+ /// assert_eq!(Seq::new(&["a", "foo", "f"]), seq1);
+ /// ```
+ ///
+ /// This example shows that literals are drained from `other` even when
+ /// they aren't necessarily used.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::Seq;
+ ///
+ /// let mut seq1 = Seq::new(&["foo", "bar"]);
+ /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]);
+ /// seq1.union_into_empty(&mut seq2);
+ ///
+ /// // seq1 has no zero length literals, so no splicing happens.
+ /// assert_eq!(Seq::new(&["foo", "bar"]), seq1);
+ /// // Even though no splicing happens, seq2 is still drained.
+ /// assert_eq!(Some(0), seq2.len());
+ /// ```
+ #[inline]
+ pub fn union_into_empty(&mut self, other: &mut Seq) {
+ let lits2 = other.literals.as_mut().map(|lits| lits.drain(..)); // made first: dropping the `Drain` on any exit path below empties `other`
+ let lits1 = match self.literals {
+ None => return, // this seq is infinite: nothing to splice into
+ Some(ref mut lits) => lits,
+ };
+ let first_empty = match lits1.iter().position(|m| m.is_empty()) {
+ None => return, // no zero-length literal: this seq is left untouched
+ Some(i) => i,
+ };
+ let lits2 = match lits2 {
+ None => {
+ // We only get here once an empty literal has been found in
+ // this seq, and `other` is infinite. The infinite sequence
+ // therefore infects this seq, making it infinite as well.
+ self.literals = None;
+ return;
+ }
+ Some(lits) => lits,
+ };
+ // The empties must be cleared out before the splice, because the
+ // splice may itself bring in empties that have to be kept. Nothing
+ // before the first empty is removed, so the 'first_empty' position
+ // computed above is still the right place to splice into.
+ lits1.retain(|m| !m.is_empty());
+ lits1.splice(first_empty..first_empty, lits2); // zero-width range: a pure insertion at 'first_empty'
+ self.dedup();
+ }
+
+ /// Deduplicate adjacent equivalent literals in this sequence.
+ ///
+ /// If adjacent literals are equivalent strings but one is exact and the
+ /// other inexact, the inexact literal is kept and the exact one is
+ /// removed.
+ ///
+ /// Deduping an infinite sequence is a no-op.
+ ///
+ /// # Example
+ ///
+ /// This example shows how literals that are duplicate byte strings but
+ /// are not equivalent with respect to exactness are resolved.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::inexact("foo"),
+ /// ]);
+ /// seq.dedup();
+ ///
+ /// assert_eq!(Seq::from_iter([Literal::inexact("foo")]), seq);
+ /// ```
+ #[inline]
+ pub fn dedup(&mut self) {
+     let lits = match self.literals {
+         Some(ref mut lits) => lits,
+         None => return,
+     };
+     // `dedup_by` hands over the later element first and removes it when
+     // the closure returns true.
+     lits.dedup_by(|later, earlier| {
+         let same_bytes = later.as_bytes() == earlier.as_bytes();
+         if same_bytes && later.is_exact() != earlier.is_exact() {
+             // The two disagree on exactness, so the survivor must be
+             // inexact. Downgrading both keeps this independent of which
+             // one is retained.
+             later.make_inexact();
+             earlier.make_inexact();
+         }
+         same_bytes
+     });
+ }
+
+ /// Sorts this sequence of literals lexicographically.
+ ///
+ /// Note that if, before sorting, a literal that is a prefix of another
+ /// literal appears after it, then after sorting, the sequence will not
+ /// represent the same preference order match semantics. For example,
+ /// sorting the sequence `[samwise, sam]` yields the sequence `[sam,
+ /// samwise]`. Under preference order semantics, the latter sequence will
+ /// never match `samwise` whereas the first sequence can.
+ ///
+ /// # Example
+ ///
+ /// This example shows basic usage.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::Seq;
+ ///
+ /// let mut seq = Seq::new(&["foo", "quux", "bar"]);
+ /// seq.sort();
+ ///
+ /// assert_eq!(Seq::new(&["bar", "foo", "quux"]), seq);
+ /// ```
+ #[inline]
+ pub fn sort(&mut self) {
+     // An infinite seq has nothing to sort.
+     let lits = match self.literals.as_mut() {
+         Some(lits) => lits,
+         None => return,
+     };
+     lits.sort();
+ }
+
+ /// Reverses all of the literals in this sequence.
+ ///
+ /// The order of the sequence itself is preserved.
+ ///
+ /// # Example
+ ///
+ /// This example shows basic usage.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::Seq;
+ ///
+ /// let mut seq = Seq::new(&["oof", "rab"]);
+ /// seq.reverse_literals();
+ /// assert_eq!(Seq::new(&["foo", "bar"]), seq);
+ /// ```
+ #[inline]
+ pub fn reverse_literals(&mut self) {
+     // Flattening the optional vector visits every member of a finite seq
+     // and nothing at all for an infinite one.
+     for lit in self.literals.iter_mut().flatten() {
+         lit.reverse();
+     }
+ }
+
+ /// Shrinks this seq to its minimal size while respecting the preference
+ /// order of its literals.
+ ///
+ /// While this routine will remove duplicate literals from this seq, it
+ /// will also remove literals that can never match in a leftmost-first or
+ /// "preference order" search. Similar to [`Seq::dedup`], if a literal is
+ /// deduped, then the one that remains is made inexact.
+ ///
+ /// This is a no-op on seqs that are empty or not finite.
+ ///
+ /// # Example
+ ///
+ /// This example shows the difference between `{sam, samwise}` and
+ /// `{samwise, sam}`.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// // If 'sam' comes before 'samwise' and a preference order search is
+ /// // executed, then 'samwise' can never match.
+ /// let mut seq = Seq::new(&["sam", "samwise"]);
+ /// seq.minimize_by_preference();
+ /// assert_eq!(Seq::from_iter([Literal::inexact("sam")]), seq);
+ ///
+ /// // But if they are reversed, then it's possible for 'samwise' to match
+ /// // since it is given higher preference.
+ /// let mut seq = Seq::new(&["samwise", "sam"]);
+ /// seq.minimize_by_preference();
+ /// assert_eq!(Seq::new(&["samwise", "sam"]), seq);
+ /// ```
+ ///
+ /// This example shows that if an empty string is in this seq, then
+ /// anything that comes after it can never match.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// // An empty string is a prefix of all strings, so it automatically
+ /// // inhibits any subsequent strings from matching.
+ /// let mut seq = Seq::new(&["foo", "bar", "", "quux", "fox"]);
+ /// seq.minimize_by_preference();
+ /// let expected = Seq::from_iter([
+ /// Literal::exact("foo"),
+ /// Literal::exact("bar"),
+ /// Literal::inexact(""),
+ /// ]);
+ /// assert_eq!(expected, seq);
+ ///
+ /// // And of course, if it's at the beginning, then it makes it impossible
+ /// // for anything else to match.
+ /// let mut seq = Seq::new(&["", "foo", "quux", "fox"]);
+ /// seq.minimize_by_preference();
+ /// assert_eq!(Seq::from_iter([Literal::inexact("")]), seq);
+ /// ```
+ #[inline]
+ pub fn minimize_by_preference(&mut self) {
+     let lits = match self.literals.as_mut() {
+         Some(lits) => lits,
+         None => return,
+     };
+     // NOTE(review): the `false` flag presumably opts out of preserving
+     // exactness of the retained literals -- confirm against
+     // `PreferenceTrie::minimize`.
+     PreferenceTrie::minimize(lits, false);
+ }
+
+ /// Trims all literals in this seq such that only the first `len` bytes
+ /// remain. If a literal has less than or equal to `len` bytes, then it
+ /// remains unchanged. Otherwise, it is trimmed and made inexact.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq = Seq::new(&["a", "foo", "quux"]);
+ /// seq.keep_first_bytes(2);
+ ///
+ /// let expected = Seq::from_iter([
+ /// Literal::exact("a"),
+ /// Literal::inexact("fo"),
+ /// Literal::inexact("qu"),
+ /// ]);
+ /// assert_eq!(expected, seq);
+ /// ```
+ #[inline]
+ pub fn keep_first_bytes(&mut self, len: usize) {
+     // Each member trims itself; an infinite seq has no members to visit.
+     for lit in self.literals.iter_mut().flatten() {
+         lit.keep_first_bytes(len);
+     }
+ }
+
+ /// Trims all literals in this seq such that only the last `len` bytes
+ /// remain. If a literal has less than or equal to `len` bytes, then it
+ /// remains unchanged. Otherwise, it is trimmed and made inexact.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Literal, Seq};
+ ///
+ /// let mut seq = Seq::new(&["a", "foo", "quux"]);
+ /// seq.keep_last_bytes(2);
+ ///
+ /// let expected = Seq::from_iter([
+ /// Literal::exact("a"),
+ /// Literal::inexact("oo"),
+ /// Literal::inexact("ux"),
+ /// ]);
+ /// assert_eq!(expected, seq);
+ /// ```
+ #[inline]
+ pub fn keep_last_bytes(&mut self, len: usize) {
+     // Each member trims itself; an infinite seq has no members to visit.
+     for lit in self.literals.iter_mut().flatten() {
+         lit.keep_last_bytes(len);
+     }
+ }
+
+ /// Returns true if this sequence is finite.
+ ///
+ /// When false, this sequence is infinite and must be treated as if it
+ /// contains every possible literal.
+ #[inline]
+ pub fn is_finite(&self) -> bool {
+     // A seq is finite exactly when it has a member list to expose.
+     self.literals().is_some()
+ }
+
+ /// Returns true if and only if this sequence is finite and empty.
+ ///
+ /// An empty sequence never matches anything. It can only be produced by
+ /// literal extraction when the corresponding regex itself cannot match.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+     // An infinite seq is never considered empty.
+     self.literals().map_or(false, |lits| lits.is_empty())
+ }
+
+ /// Returns the number of literals in this sequence if the sequence is
+ /// finite. If the sequence is infinite, then `None` is returned.
+ #[inline]
+ pub fn len(&self) -> Option<usize> {
+     // `?` bails out with `None` for an infinite seq.
+     Some(self.literals.as_ref()?.len())
+ }
+
+ /// Returns true if and only if all literals in this sequence are exact.
+ ///
+ /// This returns false if the sequence is infinite.
+ #[inline]
+ pub fn is_exact(&self) -> bool {
+     match self.literals() {
+         Some(lits) => lits.iter().all(|lit| lit.is_exact()),
+         // An infinite seq is never exact.
+         None => false,
+     }
+ }
+
+ /// Returns true if and only if all literals in this sequence are inexact.
+ ///
+ /// This returns true if the sequence is infinite.
+ #[inline]
+ pub fn is_inexact(&self) -> bool {
+     match self.literals() {
+         // "All inexact" is the same as "no exact member".
+         Some(lits) => !lits.iter().any(|lit| lit.is_exact()),
+         // An infinite seq counts as inexact.
+         None => true,
+     }
+ }
+
+ /// Return the maximum length of the sequence that would result from
+ /// unioning `self` with `other`. If either set is infinite, then this
+ /// returns `None`.
+ #[inline]
+ fn max_union_len(&self, other: &Seq) -> Option<usize> {
+     match (self.len(), other.len()) {
+         // The sum saturates rather than overflowing.
+         (Some(mine), Some(theirs)) => Some(mine.saturating_add(theirs)),
+         _ => None,
+     }
+ }
+
+ /// Return the maximum length of the sequence that would result from the
+ /// cross product of `self` with `other`. If either set is infinite, then
+ /// this returns `None`.
+ #[inline]
+ fn max_cross_len(&self, other: &Seq) -> Option<usize> {
+     match (self.len(), other.len()) {
+         // The product saturates rather than overflowing.
+         (Some(mine), Some(theirs)) => Some(mine.saturating_mul(theirs)),
+         _ => None,
+     }
+ }
+
+ /// Returns the length of the shortest literal in this sequence.
+ ///
+ /// If the sequence is infinite or empty, then this returns `None`.
+ #[inline]
+ pub fn min_literal_len(&self) -> Option<usize> {
+     match self.literals {
+         None => None,
+         // `min` over zero members is `None`, covering the empty seq.
+         Some(ref lits) => lits.iter().map(|lit| lit.len()).min(),
+     }
+ }
+
+ /// Returns the length of the longest literal in this sequence.
+ ///
+ /// If the sequence is infinite or empty, then this returns `None`.
+ #[inline]
+ pub fn max_literal_len(&self) -> Option<usize> {
+     match self.literals {
+         None => None,
+         // `max` over zero members is `None`, covering the empty seq.
+         Some(ref lits) => lits.iter().map(|lit| lit.len()).max(),
+     }
+ }
+
+ /// Returns the longest common prefix from this seq.
+ ///
+ /// If the seq matches any literal or otherwise contains no literals, then
+ /// there is no meaningful prefix and this returns `None`.
+ ///
+ /// # Example
+ ///
+ /// This shows some example seqs and their longest common prefix.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::Seq;
+ ///
+ /// let seq = Seq::new(&["foo", "foobar", "fo"]);
+ /// assert_eq!(Some(&b"fo"[..]), seq.longest_common_prefix());
+ /// let seq = Seq::new(&["foo", "foo"]);
+ /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_prefix());
+ /// let seq = Seq::new(&["foo", "bar"]);
+ /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix());
+ /// let seq = Seq::new(&[""]);
+ /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix());
+ ///
+ /// let seq = Seq::infinite();
+ /// assert_eq!(None, seq.longest_common_prefix());
+ /// let seq = Seq::empty();
+ /// assert_eq!(None, seq.longest_common_prefix());
+ /// ```
+ #[inline]
+ pub fn longest_common_prefix(&self) -> Option<&[u8]> {
+     // A seq that matches everything (infinite) or nothing (no members)
+     // has no meaningful common prefix; both cases bail out via `?`.
+     let (first, rest) = self.literals.as_ref()?.split_first()?;
+     let base = first.as_bytes();
+     let mut len = base.len();
+     for lit in rest {
+         // Shrink the candidate to the part it shares with this literal.
+         len = lit
+             .as_bytes()
+             .iter()
+             .zip(&base[..len])
+             .take_while(|(a, b)| a == b)
+             .count();
+         if len == 0 {
+             return Some(&[]);
+         }
+     }
+     Some(&base[..len])
+ }
+
+ /// Returns the longest common suffix from this seq.
+ ///
+ /// If the seq matches any literal or otherwise contains no literals, then
+ /// there is no meaningful suffix and this returns `None`.
+ ///
+ /// # Example
+ ///
+ /// This shows some example seqs and their longest common suffix.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::Seq;
+ ///
+ /// let seq = Seq::new(&["oof", "raboof", "of"]);
+ /// assert_eq!(Some(&b"of"[..]), seq.longest_common_suffix());
+ /// let seq = Seq::new(&["foo", "foo"]);
+ /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_suffix());
+ /// let seq = Seq::new(&["foo", "bar"]);
+ /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix());
+ /// let seq = Seq::new(&[""]);
+ /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix());
+ ///
+ /// let seq = Seq::infinite();
+ /// assert_eq!(None, seq.longest_common_suffix());
+ /// let seq = Seq::empty();
+ /// assert_eq!(None, seq.longest_common_suffix());
+ /// ```
+ #[inline]
+ pub fn longest_common_suffix(&self) -> Option<&[u8]> {
+ // If we match everything or match nothing, then there's no meaningful
+ // longest common suffix.
+ let lits = match self.literals {
+ None => return None,
+ Some(ref lits) => lits,
+ };
+ if lits.len() == 0 {
+ return None;
+ }
+ let base = lits[0].as_bytes();
+ let mut len = base.len();
+ for m in lits.iter().skip(1) {
+ len = m
+ .as_bytes()
+ .iter()
+ .rev()
+ .zip(base[base.len() - len..].iter().rev())
+ .take_while(|&(a, b)| a == b)
+ .count();
+ if len == 0 {
+ return Some(&[]);
+ }
+ }
+ Some(&base[base.len() - len..])
+ }
+
+ /// Optimizes this seq while treating its literals as prefixes and
+ /// respecting the preference order of its literals.
+ ///
+ /// The specific way "optimization" works is meant to be an implementation
+ /// detail, as it essentially represents a set of heuristics. The goal
+ /// that optimization tries to accomplish is to make the literals in this
+ /// set reflect inputs that will result in a more effective prefilter.
+ /// Principally by reducing the false positive rate of candidates found by
+ /// the literals in this sequence. That is, when a match of a literal is
+ /// found, we would like it to be a strong predictor of the overall match
+ /// of the regex. If it isn't, then much time will be spent starting and
+ /// stopping the prefilter search and attempting to confirm the match only
+ /// to have it fail.
+ ///
+ /// Some of those heuristics might be:
+ ///
+ /// * Identifying a common prefix from a larger sequence of literals, and
+ /// shrinking the sequence down to that single common prefix.
+ /// * Rejecting the sequence entirely if it is believed to result in very
+ /// high false positive rate. When this happens, the sequence is made
+ /// infinite.
+ /// * Shrinking the sequence to a smaller number of literals representing
+ /// prefixes, but not shrinking it so much as to make literals too short.
+ /// (A sequence with very short literals, of 1 or 2 bytes, will typically
+ /// result in a higher false positive rate.)
+ ///
+ /// Optimization should only be run once extraction is complete. Namely,
+ /// optimization may make assumptions that do not compose with other
+ /// operations in the middle of extraction. For example, optimization will
+ /// reduce `[E(sam), E(samwise)]` to `[E(sam)]`, but such a transformation
+ /// is only valid if no other extraction will occur. If other extraction
+ /// may occur, then the correct transformation would be to `[I(sam)]`.
+ ///
+ /// The [`Seq::optimize_for_suffix_by_preference`] does the same thing, but
+ /// for suffixes.
+ ///
+ /// # Example
+ ///
+ /// This shows how optimization might transform a sequence. Note that
+ /// the specific behavior is not a documented guarantee. The heuristics
+ /// used are an implementation detail and may change over time in semver
+ /// compatible releases.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Seq, Literal};
+ ///
+ /// let mut seq = Seq::new(&[
+ /// "samantha",
+ /// "sam",
+ /// "samwise",
+ /// "frodo",
+ /// ]);
+ /// seq.optimize_for_prefix_by_preference();
+ /// assert_eq!(Seq::from_iter([
+ /// Literal::exact("samantha"),
+ /// // Kept exact even though 'samwise' got pruned
+ /// // because optimization assumes literal extraction
+ /// // has finished.
+ /// Literal::exact("sam"),
+ /// Literal::exact("frodo"),
+ /// ]), seq);
+ /// ```
+ ///
+ /// # Example: optimization may make the sequence infinite
+ ///
+ /// If the heuristics deem that the sequence could cause a very high false
+ /// positive rate, then it may make the sequence infinite, effectively
+ /// disabling its use as a prefilter.
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Seq, Literal};
+ ///
+ /// let mut seq = Seq::new(&[
+ /// "samantha",
+ /// // An empty string matches at every position,
+ /// // thus rendering the prefilter completely
+ /// // ineffective.
+ /// "",
+ /// "sam",
+ /// "samwise",
+ /// "frodo",
+ /// ]);
+ /// seq.optimize_for_prefix_by_preference();
+ /// assert!(!seq.is_finite());
+ /// ```
+ ///
+ /// Do note that just because there is a `" "` in the sequence, that
+ /// doesn't mean the sequence will always be made infinite after it is
+ /// optimized. Namely, if the sequence is considered exact (any match
+ /// corresponds to an overall match of the original regex), then any match
+ /// is an overall match, and so the false positive rate is always `0`.
+ ///
+ /// To demonstrate this, we remove `samwise` from our sequence. This
+ /// results in no optimization happening and all literals remain exact.
+ /// Thus the entire sequence is exact, and it is kept as-is, even though
+ /// one is an ASCII space:
+ ///
+ /// ```
+ /// use regex_syntax::hir::literal::{Seq, Literal};
+ ///
+ /// let mut seq = Seq::new(&[
+ /// "samantha",
+ /// " ",
+ /// "sam",
+ /// "frodo",
+ /// ]);
+ /// seq.optimize_for_prefix_by_preference();
+ /// assert!(seq.is_finite());
+ /// ```
+ #[inline]
+ pub fn optimize_for_prefix_by_preference(&mut self) {
+ // `true` selects the prefix flavor of the shared optimization routine.
+ self.optimize_by_preference(true);
+ }
+
+ /// Optimizes this seq while treating its literals as suffixes and
+ /// respecting the preference order of its literals.
+ ///
+ /// Optimization should only be run once extraction is complete.
+ ///
+ /// [`Seq::optimize_for_prefix_by_preference`] does the same thing, but
+ /// for prefixes. See its documentation for more explanation.
+ #[inline]
+ pub fn optimize_for_suffix_by_preference(&mut self) {
+ // `false` selects the suffix flavor of the shared optimization routine.
+ self.optimize_by_preference(false);
+ }
+
+ /// Shared implementation behind the prefix and suffix optimization entry
+ /// points. `prefix` selects whether the literals are treated as prefixes
+ /// (`true`) or as suffixes (`false`).
+ ///
+ /// The steps run in this order: preference-minimize (prefixes only), try
+ /// to collapse the seq to a common prefix/suffix, stop early if the seq is
+ /// exact and has no empty literal, shorten oversized seqs, and finally
+ /// make the seq infinite if any remaining literal is poisonous. A seq for
+ /// which `self.len()` is `None` is left untouched.
+ fn optimize_by_preference(&mut self, prefix: bool) {
+ let origlen = match self.len() {
+ None => return,
+ Some(len) => len,
+ };
+ // Make sure we start with the smallest sequence possible. We use a
+ // special version of preference minimization that retains exactness.
+ // This is legal because optimization is only expected to occur once
+ // extraction is complete.
+ //
+ // NOTE(review): this minimization is skipped for suffixes, presumably
+ // because the preference trie rejects literals by prefix -- confirm.
+ if prefix {
+ if let Some(ref mut lits) = self.literals {
+ PreferenceTrie::minimize(lits, true);
+ }
+ }
+
+ // Look for a common prefix (or suffix). If we found one of those and
+ // it's long enough, then it's a good bet that it will be our fastest
+ // possible prefilter since single-substring search is so fast.
+ let fix = if prefix {
+ self.longest_common_prefix()
+ } else {
+ self.longest_common_suffix()
+ };
+ if let Some(fix) = fix {
+ // As a special case, if we have a common prefix and the leading
+ // byte of that prefix is one that we think probably occurs rarely,
+ // then strip everything down to just that single byte. This should
+ // promote the use of memchr.
+ //
+ // ... we only do this though if our sequence has more than one
+ // literal. Otherwise, we'd rather just stick with a single literal
+ // scan. That is, using memchr is probably better than looking
+ // for 2 or more literals, but probably not as good as a straight
+ // memmem search.
+ //
+ // ... and also only do this when the prefix is short and probably
+ // not too discriminatory anyway. If it's longer, then it's
+ // probably quite discriminatory and thus is likely to have a low
+ // false positive rate.
+ //
+ // Note that `origlen` is the literal count from before the
+ // minimization above, not the current one.
+ if prefix
+ && origlen > 1
+ && fix.len() >= 1
+ && fix.len() <= 3
+ && rank(fix[0]) < 200
+ {
+ self.keep_first_bytes(1);
+ self.dedup();
+ return;
+ }
+ // We only strip down to the common prefix/suffix if we think
+ // the existing set of literals isn't great, or if the common
+ // prefix/suffix is expected to be particularly discriminatory.
+ let isfast =
+ self.is_exact() && self.len().map_or(false, |len| len <= 16);
+ let usefix = fix.len() > 4 || (fix.len() > 1 && !isfast);
+ if usefix {
+ // If we keep exactly the number of bytes equal to the length
+ // of the prefix (or suffix), then by the definition of a
+ // prefix, every literal in the sequence will be equivalent.
+ // Thus, 'dedup' will leave us with one literal.
+ //
+ // We do it this way to avoid an alloc, but also to make sure
+ // the exactness of literals is kept (or not).
+ if prefix {
+ self.keep_first_bytes(fix.len());
+ } else {
+ self.keep_last_bytes(fix.len());
+ }
+ self.dedup();
+ assert_eq!(Some(1), self.len());
+ // We still fall through here. In particular, we want our
+ // longest common prefix to be subject to the poison check.
+ }
+ }
+ // Everything below this check is more-or-less about trying to
+ // heuristically reduce the false positive rate of a prefilter. But
+ // if our sequence is completely exact, then it's possible the regex
+ // engine can be skipped entirely. In this case, the false positive
+ // rate is zero because every literal match corresponds to a regex
+ // match.
+ //
+ // This is OK even if the sequence contains a poison literal. Remember,
+ // a literal is only poisonous because of what we assume about its
+ // impact on the false positive rate. However, we do still check for
+ // an empty string. Empty strings are weird and it's best to let the
+ // regex engine handle those.
+ //
+ // We currently do this check after the longest common prefix (or
+ // suffix) check, under the theory that single-substring search is so
+ // fast that we want that even if we'd end up turning an exact sequence
+ // into an inexact one. But this might be wrong...
+ if self.is_exact()
+ && self.min_literal_len().map_or(false, |len| len > 0)
+ {
+ return;
+ }
+ // Now we attempt to shorten the sequence. The idea here is that we
+ // don't want to look for too many literals, but we want to shorten
+ // our sequence enough to improve our odds of using better algorithms
+ // downstream (such as Teddy).
+ //
+ // Each attempt is `(keep, limit)`: stop as soon as the seq has at most
+ // `limit` literals; otherwise trim every literal to `keep` bytes and
+ // minimize again before trying the next, more aggressive, attempt.
+ const ATTEMPTS: [(usize, usize); 5] =
+ [(5, 64), (4, 64), (3, 64), (2, 64), (1, 10)];
+ for (keep, limit) in ATTEMPTS {
+ let len = match self.len() {
+ None => break,
+ Some(len) => len,
+ };
+ if len <= limit {
+ break;
+ }
+ if prefix {
+ self.keep_first_bytes(keep);
+ } else {
+ self.keep_last_bytes(keep);
+ }
+ self.minimize_by_preference();
+ }
+ // Check for a poison literal. A poison literal is one that is short
+ // and is believed to have a very high match count. These poisons
+ // generally lead to a prefilter with a very high false positive rate,
+ // and thus overall worse performance.
+ //
+ // We do this last because we could have gone from a non-poisonous
+ // sequence to a poisonous one. Perhaps we should add some code to
+ // prevent such transitions in the first place, but then again, we
+ // likely only made the transition in the first place if the sequence
+ // was itself huge. And huge sequences are themselves poisonous. So...
+ if let Some(lits) = self.literals() {
+ if lits.iter().any(|lit| lit.is_poisonous()) {
+ self.make_infinite();
+ }
+ }
+ }
+}
+
+impl core::fmt::Debug for Seq {
+    /// Writes `Seq` followed by the debug list of its literals, or `Seq[∅]`
+    /// when `literals()` yields `None`.
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        f.write_str("Seq")?;
+        match self.literals() {
+            Some(lits) => f.debug_list().entries(lits.iter()).finish(),
+            None => f.write_str("[∅]"),
+        }
+    }
+}
+
+impl FromIterator<Literal> for Seq {
+    /// Builds a seq by pushing every yielded literal, in iteration order,
+    /// onto an initially empty seq.
+    fn from_iter<T: IntoIterator<Item = Literal>>(it: T) -> Seq {
+        it.into_iter().fold(Seq::empty(), |mut seq, literal| {
+            seq.push(literal);
+            seq
+        })
+    }
+}
+
+/// A single literal extracted from an [`Hir`] expression.
+///
+/// A literal is composed of two things:
+///
+/// * A sequence of bytes. No guarantees with respect to UTF-8 are provided.
+/// In particular, even if the regex a literal is extracted from is UTF-8, the
+/// literal extracted may not be valid UTF-8. (For example, if an [`Extractor`]
+/// limit resulted in trimming a literal in a way that splits a codepoint.)
+/// * Whether the literal is "exact" or not. An "exact" literal means that it
+/// has not been trimmed, and may continue to be extended. If a literal is
+/// "exact" after visiting the entire `Hir` expression, then this implies that
+/// the literal leads to a match state. (Although it doesn't necessarily imply
+/// all occurrences of the literal correspond to a match of the regex, since
+/// literal extraction ignores look-around assertions.)
+///
+/// The derived ordering compares the bytes first and the exactness flag
+/// second, following field declaration order.
+#[derive(Clone, Eq, PartialEq, PartialOrd, Ord)]
+pub struct Literal {
+ /// The raw bytes of this literal; not necessarily valid UTF-8.
+ bytes: Vec<u8>,
+ /// Whether this literal is exact. Cleared by `make_inexact` and by the
+ /// trimming methods whenever they actually remove bytes.
+ exact: bool,
+}
+
+impl Literal {
+    /// Creates an exact literal from the given bytes.
+    #[inline]
+    pub fn exact<B: Into<Vec<u8>>>(bytes: B) -> Literal {
+        let bytes = bytes.into();
+        Literal { bytes, exact: true }
+    }
+
+    /// Creates an inexact literal from the given bytes.
+    #[inline]
+    pub fn inexact<B: Into<Vec<u8>>>(bytes: B) -> Literal {
+        let bytes = bytes.into();
+        Literal { bytes, exact: false }
+    }
+
+    /// Borrows the bytes that make up this literal.
+    #[inline]
+    pub fn as_bytes(&self) -> &[u8] {
+        self.bytes.as_slice()
+    }
+
+    /// Consumes this literal and hands back its bytes.
+    ///
+    /// Note that whether the literal was "exact" or not is discarded.
+    #[inline]
+    pub fn into_bytes(self) -> Vec<u8> {
+        self.bytes
+    }
+
+    /// Returns the number of bytes in this literal.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.bytes.len()
+    }
+
+    /// Returns true if and only if this literal contains no bytes.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.bytes.is_empty()
+    }
+
+    /// Returns true if and only if this literal is exact.
+    #[inline]
+    pub fn is_exact(&self) -> bool {
+        self.exact
+    }
+
+    /// Marks this literal as inexact.
+    ///
+    /// Inexact literals can never be extended. For example,
+    /// [`Seq::cross_forward`] will not extend inexact literals.
+    #[inline]
+    pub fn make_inexact(&mut self) {
+        self.exact = false;
+    }
+
+    /// Reverses the order of the bytes in this literal, in place.
+    #[inline]
+    pub fn reverse(&mut self) {
+        self.bytes.reverse();
+    }
+
+    /// Appends the bytes of `lit` to this literal.
+    ///
+    /// This is a no-op when this literal is inexact.
+    #[inline]
+    pub fn extend(&mut self, lit: &Literal) {
+        if self.exact {
+            self.bytes.extend_from_slice(lit.as_bytes());
+        }
+    }
+
+    /// Trims this literal so that only its first `len` bytes remain. If the
+    /// literal already has `len` or fewer bytes, it is left unchanged.
+    /// Otherwise, the literal is also marked as inexact.
+    #[inline]
+    pub fn keep_first_bytes(&mut self, len: usize) {
+        if len < self.bytes.len() {
+            self.exact = false;
+            self.bytes.truncate(len);
+        }
+    }
+
+    /// Trims this literal so that only its last `len` bytes remain. If the
+    /// literal already has `len` or fewer bytes, it is left unchanged.
+    /// Otherwise, the literal is also marked as inexact.
+    #[inline]
+    pub fn keep_last_bytes(&mut self, len: usize) {
+        let total = self.bytes.len();
+        if len < total {
+            self.exact = false;
+            // Drop the leading bytes so that exactly `len` remain.
+            self.bytes.drain(..total - len);
+        }
+    }
+
+    /// Returns true if it is believed that this literal is likely to match
+    /// very frequently, and is thus not a good candidate for a prefilter.
+    ///
+    /// That is the case for the empty literal and for a single byte whose
+    /// rank is at least 250.
+    fn is_poisonous(&self) -> bool {
+        match self.bytes.as_slice() {
+            [] => true,
+            [byte] => rank(*byte) >= 250,
+            _ => false,
+        }
+    }
+}
+
+impl From<u8> for Literal {
+    /// Wraps a single byte as an exact, one-byte literal.
+    fn from(byte: u8) -> Literal {
+        let bytes = vec![byte];
+        Literal::exact(bytes)
+    }
+}
+
+impl From<char> for Literal {
+    /// Converts a `char` into an exact literal holding its UTF-8 encoding.
+    fn from(ch: char) -> Literal {
+        // Four bytes is the longest UTF-8 encoding of any `char`.
+        let mut buf = [0; 4];
+        let encoded = ch.encode_utf8(&mut buf);
+        Literal::exact(encoded.as_bytes())
+    }
+}
+
+impl AsRef<[u8]> for Literal {
+    /// Borrows the bytes of this literal.
+    fn as_ref(&self) -> &[u8] {
+        &self.bytes
+    }
+}
+
+impl core::fmt::Debug for Literal {
+    /// Formats this literal as a one-field tuple named `E` when it is exact
+    /// and `I` when it is inexact.
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        let tag = match self.exact {
+            true => "E",
+            false => "I",
+        };
+        let bytes = crate::debug::Bytes(self.as_bytes());
+        f.debug_tuple(tag).field(&bytes).finish()
+    }
+}
+
+/// A "preference" trie that rejects literals that will never match when
+/// executing a leftmost first or "preference" search.
+///
+/// For example, if 'sam' is inserted, then trying to insert 'samwise' will be
+/// rejected because 'samwise' can never match since 'sam' will always take
+/// priority. However, if 'samwise' is inserted first, then inserting 'sam'
+/// after it is accepted. In this case, either 'samwise' or 'sam' can match in
+/// a "preference" search.
+///
+/// Note that we only use this trie as a "set." That is, given a sequence of
+/// literals, we insert each one in order. An `insert` will reject a literal
+/// if a prefix of that literal already exists in the trie. Thus, to rebuild
+/// the "minimal" sequence, we simply only keep literals that were successfully
+/// inserted. (Since we don't need traversal, one wonders whether we can make
+/// some simplifications here, but I haven't given it a ton of thought and I've
+/// never seen this show up on a profile. Because of the heuristic limits
+/// imposed on literal extractions, the size of the inputs here is usually
+/// very small.)
+#[derive(Debug, Default)]
+struct PreferenceTrie {
+ /// The states in this trie. The index of a state in this vector is its ID.
+ /// The root state is created lazily by `root` and always has ID 0.
+ states: Vec<State>,
+ /// The index to allocate to the next literal added to this trie. Starts at
+ /// 0 and increments by 1 for every literal successfully added to the trie.
+ next_literal_index: usize,
+}
+
+/// A single state in a trie. Uses a sparse representation for its transitions.
+#[derive(Debug, Default)]
+struct State {
+ /// Sparse representation of the transitions out of this state. Transitions
+ /// are sorted by byte. There is at most one such transition for any
+ /// particular byte. Each entry is `(byte, ID of the next state)`.
+ trans: Vec<(u8, usize)>,
+ /// Whether this is a matching state or not. If it is, then it contains the
+ /// index to the matching literal.
+ literal_index: Option<usize>,
+}
+
+impl PreferenceTrie {
+    /// Shrinks `literals` in place to the literals that can still match in a
+    /// preference order search, keeping their relative order.
+    ///
+    /// When `keep_exact` is true, every retained literal keeps its
+    /// exactness. This is useful when dealing with a fully extracted `Seq`
+    /// that only contains exact literals: nothing will ever need to match
+    /// after them, and any removed literal is guaranteed to never match.
+    /// When `keep_exact` is false, each retained literal that caused another
+    /// literal to be removed is marked inexact.
+    fn minimize(literals: &mut Vec<Literal>, keep_exact: bool) {
+        let mut trie = PreferenceTrie::default();
+        // Trie indices of the literals that blocked an insertion. Accepted
+        // literals are numbered in insertion order and are exactly the ones
+        // retained, so these are also positions in the retained vector.
+        let mut blockers = vec![];
+        literals.retain(|lit| match trie.insert(lit.as_bytes()) {
+            Ok(_) => true,
+            Err(blocker) => {
+                if !keep_exact {
+                    blockers.push(blocker);
+                }
+                false
+            }
+        });
+        for blocker in blockers {
+            literals[blocker].make_inexact();
+        }
+    }
+
+    /// Attempts to add `bytes` to this trie.
+    ///
+    /// `Ok` carries the index assigned to the newly added literal. `Err`
+    /// carries the index of a literal already in the trie that prevented the
+    /// insertion, which implies that literal is a prefix of `bytes`.
+    ///
+    /// In short, the byte string given is accepted into the trie if and only
+    /// if it is possible for it to match when executing a preference order
+    /// search.
+    fn insert(&mut self, bytes: &[u8]) -> Result<usize, usize> {
+        let mut cur = self.root();
+        // A matching root means the empty literal was inserted earlier, and
+        // it is a prefix of everything.
+        if let Some(blocker) = self.states[cur].literal_index {
+            return Err(blocker);
+        }
+        for &byte in bytes {
+            let slot = self.states[cur]
+                .trans
+                .binary_search_by_key(&byte, |&(b, _)| b);
+            cur = match slot {
+                Ok(pos) => {
+                    let next = self.states[cur].trans[pos].1;
+                    if let Some(blocker) = self.states[next].literal_index {
+                        return Err(blocker);
+                    }
+                    next
+                }
+                Err(pos) => {
+                    // No transition on this byte yet: add one at the sorted
+                    // position reported by the binary search.
+                    let fresh = self.create_state();
+                    self.states[cur].trans.insert(pos, (byte, fresh));
+                    fresh
+                }
+            };
+        }
+        let assigned = self.next_literal_index;
+        self.next_literal_index = assigned + 1;
+        self.states[cur].literal_index = Some(assigned);
+        Ok(assigned)
+    }
+
+    /// Returns the ID of the root state, creating the root first if the trie
+    /// has no states yet.
+    fn root(&mut self) -> usize {
+        if self.states.is_empty() {
+            self.create_state()
+        } else {
+            0
+        }
+    }
+
+    /// Appends a new empty state and returns its ID.
+    fn create_state(&mut self) -> usize {
+        self.states.push(State::default());
+        self.states.len() - 1
+    }
+}
+
+/// Looks up the "rank" of the given byte.
+///
+/// The minimum rank value is `0` and the maximum rank value is `255`.
+///
+/// The rank of a byte is derived from a heuristic background distribution of
+/// relative frequencies of bytes. The heuristic says that the lower the rank
+/// of a byte, the less likely that byte is to appear in any arbitrary
+/// haystack.
+pub fn rank(byte: u8) -> u8 {
+    let index = usize::from(byte);
+    crate::rank::BYTE_FREQUENCIES[index]
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ // Parses `pattern` into an `Hir` with the parser's `utf8` option turned
+ // off, panicking if the pattern does not parse.
+ fn parse(pattern: &str) -> Hir {
+ crate::ParserBuilder::new().utf8(false).build().parse(pattern).unwrap()
+ }
+
+ // Extracts the prefix literal seq for `pattern` with a default extractor.
+ fn prefixes(pattern: &str) -> Seq {
+ Extractor::new().kind(ExtractKind::Prefix).extract(&parse(pattern))
+ }
+
+ // Extracts the suffix literal seq for `pattern` with a default extractor.
+ fn suffixes(pattern: &str) -> Seq {
+ Extractor::new().kind(ExtractKind::Suffix).extract(&parse(pattern))
+ }
+
+ // Returns `(prefixes, suffixes)` extracted from `pattern`.
+ fn e(pattern: &str) -> (Seq, Seq) {
+ (prefixes(pattern), suffixes(pattern))
+ }
+
+ // Shorthand for an exact literal. The upper-case name mirrors the `E` tag
+ // used by `Literal`'s `Debug` output, hence the lint allowance.
+ #[allow(non_snake_case)]
+ fn E(x: &str) -> Literal {
+ Literal::exact(x.as_bytes())
+ }
+
+ // Shorthand for an inexact literal. The upper-case name mirrors the `I` tag
+ // used by `Literal`'s `Debug` output, hence the lint allowance.
+ #[allow(non_snake_case)]
+ fn I(x: &str) -> Literal {
+ Literal::inexact(x.as_bytes())
+ }
+
+ // Builds a seq from the given literals, in order.
+ fn seq<I: IntoIterator<Item = Literal>>(it: I) -> Seq {
+ Seq::from_iter(it)
+ }
+
+ // The expected `(prefixes, suffixes)` pair when both seqs are infinite.
+ fn infinite() -> (Seq, Seq) {
+ (Seq::infinite(), Seq::infinite())
+ }
+
+ // Builds an expected `(prefixes, suffixes)` pair from two separate lists of
+ // literals; `it1` becomes the prefix seq and `it2` the suffix seq.
+ fn inexact<I1, I2>(it1: I1, it2: I2) -> (Seq, Seq)
+ where
+ I1: IntoIterator<Item = Literal>,
+ I2: IntoIterator<Item = Literal>,
+ {
+ (Seq::from_iter(it1), Seq::from_iter(it2))
+ }
+
+ // Builds an expected `(prefixes, suffixes)` pair where both seqs are the
+ // same seq made via `Seq::new` from the given byte strings.
+ fn exact<B: AsRef<[u8]>, I: IntoIterator<Item = B>>(it: I) -> (Seq, Seq) {
+ let s1 = Seq::new(it);
+ let s2 = s1.clone();
+ (s1, s2)
+ }
+
+ // Like `exact`, but the first seq is then optimized as prefixes and the
+ // second as suffixes.
+ fn opt<B: AsRef<[u8]>, I: IntoIterator<Item = B>>(it: I) -> (Seq, Seq) {
+ let (mut p, mut s) = exact(it);
+ p.optimize_for_prefix_by_preference();
+ s.optimize_for_suffix_by_preference();
+ (p, s)
+ }
+
+ // Extraction from plain literals, including case-insensitive ones and
+ // non-UTF-8 bytes.
+ #[test]
+ fn literal() {
+ assert_eq!(exact(["a"]), e("a"));
+ assert_eq!(exact(["aaaaa"]), e("aaaaa"));
+ assert_eq!(exact(["A", "a"]), e("(?i-u)a"));
+ assert_eq!(exact(["AB", "Ab", "aB", "ab"]), e("(?i-u)ab"));
+ assert_eq!(exact(["abC", "abc"]), e("ab(?i-u)c"));
+
+ assert_eq!(exact([b"\xFF"]), e(r"(?-u:\xFF)"));
+
+ // These cases only run when the `unicode-case` feature is enabled.
+ #[cfg(feature = "unicode-case")]
+ {
+ assert_eq!(exact(["☃"]), e("☃"));
+ assert_eq!(exact(["☃"]), e("(?i)☃"));
+ assert_eq!(exact(["☃☃☃☃☃"]), e("☃☃☃☃☃"));
+
+ assert_eq!(exact(["Δ"]), e("Δ"));
+ assert_eq!(exact(["δ"]), e("δ"));
+ assert_eq!(exact(["Δ", "δ"]), e("(?i)Δ"));
+ assert_eq!(exact(["Δ", "δ"]), e("(?i)δ"));
+
+ assert_eq!(exact(["S", "s", "ſ"]), e("(?i)S"));
+ assert_eq!(exact(["S", "s", "ſ"]), e("(?i)s"));
+ assert_eq!(exact(["S", "s", "ſ"]), e("(?i)ſ"));
+ }
+
+ let letters = "ͱͳͷΐάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋ";
+ assert_eq!(exact([letters]), e(letters));
+ }
+
+ // Small character classes are expanded into one literal per member.
+ #[test]
+ fn class() {
+ assert_eq!(exact(["a", "b", "c"]), e("[abc]"));
+ assert_eq!(exact(["a1b", "a2b", "a3b"]), e("a[123]b"));
+ assert_eq!(exact(["δ", "ε"]), e("[εδ]"));
+ #[cfg(feature = "unicode-case")]
+ {
+ assert_eq!(exact(["Δ", "Ε", "δ", "ε", "ϵ"]), e(r"(?i)[εδ]"));
+ }
+ }
+
+ // Look-around assertions are ignored by extraction: each pattern below
+ // yields the same literals as it would without the assertion.
+ #[test]
+ fn look() {
+ // Assertion in the middle of the literal.
+ assert_eq!(exact(["ab"]), e(r"a\Ab"));
+ assert_eq!(exact(["ab"]), e(r"a\zb"));
+ assert_eq!(exact(["ab"]), e(r"a(?m:^)b"));
+ assert_eq!(exact(["ab"]), e(r"a(?m:$)b"));
+ assert_eq!(exact(["ab"]), e(r"a\bb"));
+ assert_eq!(exact(["ab"]), e(r"a\Bb"));
+ assert_eq!(exact(["ab"]), e(r"a(?-u:\b)b"));
+ assert_eq!(exact(["ab"]), e(r"a(?-u:\B)b"));
+
+ // Assertion before the literal.
+ assert_eq!(exact(["ab"]), e(r"^ab"));
+ assert_eq!(exact(["ab"]), e(r"$ab"));
+ assert_eq!(exact(["ab"]), e(r"(?m:^)ab"));
+ assert_eq!(exact(["ab"]), e(r"(?m:$)ab"));
+ assert_eq!(exact(["ab"]), e(r"\bab"));
+ assert_eq!(exact(["ab"]), e(r"\Bab"));
+ assert_eq!(exact(["ab"]), e(r"(?-u:\b)ab"));
+ assert_eq!(exact(["ab"]), e(r"(?-u:\B)ab"));
+
+ // Assertion after the literal.
+ assert_eq!(exact(["ab"]), e(r"ab^"));
+ assert_eq!(exact(["ab"]), e(r"ab$"));
+ assert_eq!(exact(["ab"]), e(r"ab(?m:^)"));
+ assert_eq!(exact(["ab"]), e(r"ab(?m:$)"));
+ assert_eq!(exact(["ab"]), e(r"ab\b"));
+ assert_eq!(exact(["ab"]), e(r"ab\B"));
+ assert_eq!(exact(["ab"]), e(r"ab(?-u:\b)"));
+ assert_eq!(exact(["ab"]), e(r"ab(?-u:\B)"));
+
+ let expected = (seq([I("aZ"), E("ab")]), seq([I("Zb"), E("ab")]));
+ assert_eq!(expected, e(r"^aZ*b"));
+ }
+
+ // Extraction through repetition operators. Greedy and lazy forms differ
+ // only in the order of the extracted literals.
+ #[test]
+ fn repetition() {
+ assert_eq!(exact(["a", ""]), e(r"a?"));
+ assert_eq!(exact(["", "a"]), e(r"a??"));
+ assert_eq!(inexact([I("a"), E("")], [I("a"), E("")]), e(r"a*"));
+ assert_eq!(inexact([E(""), I("a")], [E(""), I("a")]), e(r"a*?"));
+ assert_eq!(inexact([I("a")], [I("a")]), e(r"a+"));
+ assert_eq!(inexact([I("a")], [I("a")]), e(r"(a+)+"));
+
+ assert_eq!(exact(["ab"]), e(r"aZ{0}b"));
+ assert_eq!(exact(["aZb", "ab"]), e(r"aZ?b"));
+ assert_eq!(exact(["ab", "aZb"]), e(r"aZ??b"));
+ assert_eq!(
+ inexact([I("aZ"), E("ab")], [I("Zb"), E("ab")]),
+ e(r"aZ*b")
+ );
+ assert_eq!(
+ inexact([E("ab"), I("aZ")], [E("ab"), I("Zb")]),
+ e(r"aZ*?b")
+ );
+ assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+b"));
+ assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+?b"));
+
+ assert_eq!(exact(["aZZb"]), e(r"aZ{2}b"));
+ assert_eq!(inexact([I("aZZ")], [I("ZZb")]), e(r"aZ{2,3}b"));
+
+ assert_eq!(exact(["abc", ""]), e(r"(abc)?"));
+ assert_eq!(exact(["", "abc"]), e(r"(abc)??"));
+
+ assert_eq!(inexact([I("a"), E("b")], [I("ab"), E("b")]), e(r"a*b"));
+ assert_eq!(inexact([E("b"), I("a")], [E("b"), I("ab")]), e(r"a*?b"));
+ assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+"));
+ assert_eq!(inexact([I("a"), I("b")], [I("b")]), e(r"a*b+"));
+
+ // FIXME: The suffixes for this don't look quite right to me. I think
+ // the right suffixes would be: [I(ac), I(bc), E(c)]. The main issue I
+ // think is that suffixes are computed by iterating over concatenations
+ // in reverse, and then [bc, ac, c] ordering is indeed correct from
+ // that perspective. We also test a few more equivalent regexes, and
+ // we get the same result, so it is consistent at least I suppose.
+ //
+ // The reason why this isn't an issue is that it only messes up
+ // preference order, and currently, suffixes are never used in a
+ // context where preference order matters. For prefixes it matters
+ // because we sometimes want to use prefilters without confirmation
+ // when all of the literals are exact (and there's no look-around). But
+ // we never do that for suffixes. Any time we use suffixes, we always
+ // include a confirmation step. If that ever changes, then it's likely
+ // this bug will need to be fixed, but last time I looked, it appears
+ // hard to do so.
+ assert_eq!(
+ inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]),
+ e(r"a*b*c")
+ );
+ assert_eq!(
+ inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]),
+ e(r"(a+)?(b+)?c")
+ );
+ assert_eq!(
+ inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]),
+ e(r"(a+|)(b+|)c")
+ );
+ // A few more similarish but not identical regexes. These may have a
+ // similar problem as above.
+ assert_eq!(
+ inexact(
+ [I("a"), I("b"), I("c"), E("")],
+ [I("c"), I("b"), I("a"), E("")]
+ ),
+ e(r"a*b*c*")
+ );
+ assert_eq!(inexact([I("a"), I("b"), I("c")], [I("c")]), e(r"a*b*c+"));
+ assert_eq!(inexact([I("a"), I("b")], [I("bc")]), e(r"a*b+c"));
+ assert_eq!(inexact([I("a"), I("b")], [I("c"), I("b")]), e(r"a*b+c*"));
+ assert_eq!(inexact([I("ab"), E("a")], [I("b"), E("a")]), e(r"ab*"));
+ assert_eq!(
+ inexact([I("ab"), E("ac")], [I("bc"), E("ac")]),
+ e(r"ab*c")
+ );
+ assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+"));
+ assert_eq!(inexact([I("ab")], [I("bc")]), e(r"ab+c"));
+
+ assert_eq!(
+ inexact([I("z"), E("azb")], [I("zazb"), E("azb")]),
+ e(r"z*azb")
+ );
+
+ // A bounded repetition of a small class is fully enumerated; once the
+ // upper bound exceeds the lower bound, the literals become inexact.
+ let expected =
+ exact(["aaa", "aab", "aba", "abb", "baa", "bab", "bba", "bbb"]);
+ assert_eq!(expected, e(r"[ab]{3}"));
+ let expected = inexact(
+ [
+ I("aaa"),
+ I("aab"),
+ I("aba"),
+ I("abb"),
+ I("baa"),
+ I("bab"),
+ I("bba"),
+ I("bbb"),
+ ],
+ [
+ I("aaa"),
+ I("aab"),
+ I("aba"),
+ I("abb"),
+ I("baa"),
+ I("bab"),
+ I("bba"),
+ I("bbb"),
+ ],
+ );
+ assert_eq!(expected, e(r"[ab]{3,4}"));
+ }
+
+ // Concatenations are joined across empty groups, and an unmatchable class
+ // (`[a&&b]`) inside a concatenation yields an empty seq.
+ #[test]
+ fn concat() {
+ let empty: [&str; 0] = [];
+
+ assert_eq!(exact(["abcxyz"]), e(r"abc()xyz"));
+ assert_eq!(exact(["abcxyz"]), e(r"(abc)(xyz)"));
+ assert_eq!(exact(["abcmnoxyz"]), e(r"abc()mno()xyz"));
+ assert_eq!(exact(empty), e(r"abc[a&&b]xyz"));
+ assert_eq!(exact(["abcxyz"]), e(r"abc[a&&b]*xyz"));
+ }
+
+ // Alternations contribute the literals of each branch in branch order,
+ // and compose with concatenation via a cross product.
+ #[test]
+ fn alternation() {
+ assert_eq!(exact(["abc", "mno", "xyz"]), e(r"abc|mno|xyz"));
+ assert_eq!(
+ inexact(
+ [E("abc"), I("mZ"), E("mo"), E("xyz")],
+ [E("abc"), I("Zo"), E("mo"), E("xyz")]
+ ),
+ e(r"abc|mZ*o|xyz")
+ );
+ assert_eq!(exact(["abc", "xyz"]), e(r"abc|M[a&&b]N|xyz"));
+ assert_eq!(exact(["abc", "MN", "xyz"]), e(r"abc|M[a&&b]*N|xyz"));
+
+ assert_eq!(exact(["aaa", "aaaaa"]), e(r"(?:|aa)aaa"));
+ assert_eq!(
+ inexact(
+ [I("aaa"), E(""), I("aaaaa"), E("aa")],
+ [I("aaa"), E(""), E("aa")]
+ ),
+ e(r"(?:|aa)(?:aaa)*")
+ );
+ assert_eq!(
+ inexact(
+ [E(""), I("aaa"), E("aa"), I("aaaaa")],
+ [E(""), I("aaa"), E("aa")]
+ ),
+ e(r"(?:|aa)(?:aaa)*?")
+ );
+
+ assert_eq!(
+ inexact([E("a"), I("b"), E("")], [E("a"), I("b"), E("")]),
+ e(r"a|b*")
+ );
+ assert_eq!(inexact([E("a"), I("b")], [E("a"), I("b")]), e(r"a|b+"));
+
+ assert_eq!(
+ inexact([I("a"), E("b"), E("c")], [I("ab"), E("b"), E("c")]),
+ e(r"a*b|c")
+ );
+
+ assert_eq!(
+ inexact(
+ [E("a"), E("b"), I("c"), E("")],
+ [E("a"), E("b"), I("c"), E("")]
+ ),
+ e(r"a|(?:b|c*)")
+ );
+
+ assert_eq!(
+ inexact(
+ [I("a"), I("b"), E("c"), I("a"), I("ab"), E("c")],
+ [I("ac"), I("bc"), E("c"), I("ac"), I("abc"), E("c")],
+ ),
+ e(r"(a|b)*c|(a|ab)*c")
+ );
+
+ // Concatenated alternations expand to the cross product of branches.
+ assert_eq!(
+ exact(["abef", "abgh", "cdef", "cdgh"]),
+ e(r"(ab|cd)(ef|gh)")
+ );
+ assert_eq!(
+ exact([
+ "abefij", "abefkl", "abghij", "abghkl", "cdefij", "cdefkl",
+ "cdghij", "cdghkl",
+ ]),
+ e(r"(ab|cd)(ef|gh)(ij|kl)")
+ );
+ }
+
+ // Patterns containing the unmatchable class `[a&&b]`: a branch that can
+ // never match contributes no literals, while a zero-or-more repetition of
+ // it behaves like the empty string.
+ #[test]
+ fn impossible() {
+ let empty: [&str; 0] = [];
+
+ assert_eq!(exact(empty), e(r"[a&&b]"));
+ assert_eq!(exact(empty), e(r"a[a&&b]"));
+ assert_eq!(exact(empty), e(r"[a&&b]b"));
+ assert_eq!(exact(empty), e(r"a[a&&b]b"));
+ assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]|b"));
+ assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]|b"));
+ assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]d|b"));
+ assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]d|b"));
+ assert_eq!(exact([""]), e(r"[a&&b]*"));
+ assert_eq!(exact(["MN"]), e(r"M[a&&b]*N"));
+ }
+
+ // This tests patterns that contain something that defeats literal
+ // detection, usually because it would blow some limit on the total number
+ // of literals that can be returned.
+ //
+ // The main idea is that when literal extraction sees something that
+ // it knows will blow a limit, it replaces it with a marker that says
+ // "any literal will match here." While not necessarily true, the
+ // over-estimation is just fine for the purposes of literal extraction,
+ // because the imprecision doesn't matter: too big is too big.
+ //
+ // This is one of the trickier parts of literal extraction, since we need
+ // to make sure all of our literal extraction operations correctly compose
+ // with the markers.
+ #[test]
+ fn anything() {
+ assert_eq!(infinite(), e(r"."));
+ assert_eq!(infinite(), e(r"(?s)."));
+ assert_eq!(infinite(), e(r"[A-Za-z]"));
+ assert_eq!(infinite(), e(r"[A-Z]"));
+ assert_eq!(exact([""]), e(r"[A-Z]{0}"));
+ assert_eq!(infinite(), e(r"[A-Z]?"));
+ assert_eq!(infinite(), e(r"[A-Z]*"));
+ assert_eq!(infinite(), e(r"[A-Z]+"));
+ // A literal next to the marker survives, but only as inexact, and only
+ // on the side (prefix or suffix) where it is not hidden by the marker.
+ assert_eq!((seq([I("1")]), Seq::infinite()), e(r"1[A-Z]"));
+ assert_eq!((seq([I("1")]), seq([I("2")])), e(r"1[A-Z]2"));
+ assert_eq!((Seq::infinite(), seq([I("123")])), e(r"[A-Z]+123"));
+ assert_eq!(infinite(), e(r"[A-Z]+123[A-Z]+"));
+ // One infinite branch makes the whole alternation infinite.
+ assert_eq!(infinite(), e(r"1|[A-Z]|3"));
+ assert_eq!(
+ (seq([E("1"), I("2"), E("3")]), Seq::infinite()),
+ e(r"1|2[A-Z]|3"),
+ );
+ assert_eq!(
+ (Seq::infinite(), seq([E("1"), I("2"), E("3")])),
+ e(r"1|[A-Z]2|3"),
+ );
+ assert_eq!(
+ (seq([E("1"), I("2"), E("4")]), seq([E("1"), I("3"), E("4")])),
+ e(r"1|2[A-Z]3|4"),
+ );
+ assert_eq!((Seq::infinite(), seq([I("2")])), e(r"(?:|1)[A-Z]2"));
+ assert_eq!(inexact([I("a")], [I("z")]), e(r"a.z"));
+ }
+
+ // Like the 'anything' test, but it uses smaller limits in order to test
+ // the logic for effectively aborting literal extraction when the seqs get
+ // too big.
+ #[test]
+ fn anything_small_limits() {
+ fn prefixes(pattern: &str) -> Seq {
+ Extractor::new()
+ .kind(ExtractKind::Prefix)
+ .limit_total(10)
+ .extract(&parse(pattern))
+ }
+
+ fn suffixes(pattern: &str) -> Seq {
+ Extractor::new()
+ .kind(ExtractKind::Suffix)
+ .limit_total(10)
+ .extract(&parse(pattern))
+ }
+
+ fn e(pattern: &str) -> (Seq, Seq) {
+ (prefixes(pattern), suffixes(pattern))
+ }
+
+ assert_eq!(
+ (
+ seq([
+ I("aaa"),
+ I("aab"),
+ I("aba"),
+ I("abb"),
+ I("baa"),
+ I("bab"),
+ I("bba"),
+ I("bbb")
+ ]),
+ seq([
+ I("aaa"),
+ I("aab"),
+ I("aba"),
+ I("abb"),
+ I("baa"),
+ I("bab"),
+ I("bba"),
+ I("bbb")
+ ])
+ ),
+ e(r"[ab]{3}{3}")
+ );
+
+ assert_eq!(infinite(), e(r"ab|cd|ef|gh|ij|kl|mn|op|qr|st|uv|wx|yz"));
+ }
+
+ #[test]
+ fn empty() {
+ assert_eq!(exact([""]), e(r""));
+ assert_eq!(exact([""]), e(r"^"));
+ assert_eq!(exact([""]), e(r"$"));
+ assert_eq!(exact([""]), e(r"(?m:^)"));
+ assert_eq!(exact([""]), e(r"(?m:$)"));
+ assert_eq!(exact([""]), e(r"\b"));
+ assert_eq!(exact([""]), e(r"\B"));
+ assert_eq!(exact([""]), e(r"(?-u:\b)"));
+ assert_eq!(exact([""]), e(r"(?-u:\B)"));
+ }
+
+ #[test]
+ fn odds_and_ends() {
+ assert_eq!((Seq::infinite(), seq([I("a")])), e(r".a"));
+ assert_eq!((seq([I("a")]), Seq::infinite()), e(r"a."));
+ assert_eq!(infinite(), e(r"a|."));
+ assert_eq!(infinite(), e(r".|a"));
+
+ let pat = r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]";
+ let expected = inexact(
+ ["Mo'am", "Moam", "Mu'am", "Muam"].map(I),
+ [
+ "ddafi", "ddafy", "dhafi", "dhafy", "dzafi", "dzafy", "dafi",
+ "dafy", "tdafi", "tdafy", "thafi", "thafy", "tzafi", "tzafy",
+ "tafi", "tafy", "zdafi", "zdafy", "zhafi", "zhafy", "zzafi",
+ "zzafy", "zafi", "zafy",
+ ]
+ .map(I),
+ );
+ assert_eq!(expected, e(pat));
+
+ assert_eq!(
+ (seq(["fn is_", "fn as_"].map(I)), Seq::infinite()),
+ e(r"fn is_([A-Z]+)|fn as_([A-Z]+)"),
+ );
+ assert_eq!(
+ inexact([I("foo")], [I("quux")]),
+ e(r"foo[A-Z]+bar[A-Z]+quux")
+ );
+ assert_eq!(infinite(), e(r"[A-Z]+bar[A-Z]+"));
+ assert_eq!(
+ exact(["Sherlock Holmes"]),
+ e(r"(?m)^Sherlock Holmes|Sherlock Holmes$")
+ );
+
+ assert_eq!(exact(["sa", "sb"]), e(r"\bs(?:[ab])"));
+ }
+
+ // This tests a specific regex along with some heuristic steps to reduce
+ // the sequences extracted. This is meant to roughly correspond to the
+ // types of heuristics used to shrink literal sets in practice. (Shrinking
+ // is done because you want to balance "spend too much work looking for
+ // too many literals" and "spend too much work processing false positive
+ // matches from short literals.")
+ #[test]
+ #[cfg(feature = "unicode-case")]
+ fn holmes() {
+ let expected = inexact(
+ ["HOL", "HOl", "HoL", "Hol", "hOL", "hOl", "hoL", "hol"].map(I),
+ [
+ "MES", "MEs", "Eſ", "MeS", "Mes", "eſ", "mES", "mEs", "meS",
+ "mes",
+ ]
+ .map(I),
+ );
+ let (mut prefixes, mut suffixes) = e(r"(?i)Holmes");
+ prefixes.keep_first_bytes(3);
+ suffixes.keep_last_bytes(3);
+ prefixes.minimize_by_preference();
+ suffixes.minimize_by_preference();
+ assert_eq!(expected, (prefixes, suffixes));
+ }
+
+ // This tests that we get some kind of literals extracted for a beefier
+ // alternation with case insensitive mode enabled. At one point during
+ // development, this returned nothing, and motivated some special case
+ // code in Extractor::union to try and trim down the literal sequences
+ // if the union would blow the limits set.
+ #[test]
+ #[cfg(feature = "unicode-case")]
+ fn holmes_alt() {
+ let mut pre =
+ prefixes(r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker");
+ assert!(pre.len().unwrap() > 0);
+ pre.optimize_for_prefix_by_preference();
+ assert!(pre.len().unwrap() > 0);
+ }
+
+ // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
+ // See: CVE-2022-24713
+ //
+ // We test this here to ensure literal extraction completes in reasonable
+ // time and isn't materially impacted by these sorts of pathological
+ // repeats.
+ #[test]
+ fn crazy_repeats() {
+ assert_eq!(inexact([I("")], [I("")]), e(r"(?:){4294967295}"));
+ assert_eq!(
+ inexact([I("")], [I("")]),
+ e(r"(?:){64}{64}{64}{64}{64}{64}")
+ );
+ assert_eq!(inexact([I("")], [I("")]), e(r"x{0}{4294967295}"));
+ assert_eq!(inexact([I("")], [I("")]), e(r"(?:|){4294967295}"));
+
+ assert_eq!(
+ inexact([E("")], [E("")]),
+ e(r"(?:){8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}")
+ );
+ let repa = "a".repeat(100);
+ assert_eq!(
+ inexact([I(&repa)], [I(&repa)]),
+ e(r"a{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}")
+ );
+ }
+
+ #[test]
+ fn huge() {
+ let pat = r#"(?-u)
+ 2(?:
+ [45]\d{3}|
+ 7(?:
+ 1[0-267]|
+ 2[0-289]|
+ 3[0-29]|
+ 4[01]|
+ 5[1-3]|
+ 6[013]|
+ 7[0178]|
+ 91
+ )|
+ 8(?:
+ 0[125]|
+ [139][1-6]|
+ 2[0157-9]|
+ 41|
+ 6[1-35]|
+ 7[1-5]|
+ 8[1-8]|
+ 90
+ )|
+ 9(?:
+ 0[0-2]|
+ 1[0-4]|
+ 2[568]|
+ 3[3-6]|
+ 5[5-7]|
+ 6[0167]|
+ 7[15]|
+ 8[0146-9]
+ )
+ )\d{4}|
+ 3(?:
+ 12?[5-7]\d{2}|
+ 0(?:
+ 2(?:
+ [025-79]\d|
+ [348]\d{1,2}
+ )|
+ 3(?:
+ [2-4]\d|
+ [56]\d?
+ )
+ )|
+ 2(?:
+ 1\d{2}|
+ 2(?:
+ [12]\d|
+ [35]\d{1,2}|
+ 4\d?
+ )
+ )|
+ 3(?:
+ 1\d{2}|
+ 2(?:
+ [2356]\d|
+ 4\d{1,2}
+ )
+ )|
+ 4(?:
+ 1\d{2}|
+ 2(?:
+ 2\d{1,2}|
+ [47]|
+ 5\d{2}
+ )
+ )|
+ 5(?:
+ 1\d{2}|
+ 29
+ )|
+ [67]1\d{2}|
+ 8(?:
+ 1\d{2}|
+ 2(?:
+ 2\d{2}|
+ 3|
+ 4\d
+ )
+ )
+ )\d{3}|
+ 4(?:
+ 0(?:
+ 2(?:
+ [09]\d|
+ 7
+ )|
+ 33\d{2}
+ )|
+ 1\d{3}|
+ 2(?:
+ 1\d{2}|
+ 2(?:
+ [25]\d?|
+ [348]\d|
+ [67]\d{1,2}
+ )
+ )|
+ 3(?:
+ 1\d{2}(?:
+ \d{2}
+ )?|
+ 2(?:
+ [045]\d|
+ [236-9]\d{1,2}
+ )|
+ 32\d{2}
+ )|
+ 4(?:
+ [18]\d{2}|
+ 2(?:
+ [2-46]\d{2}|
+ 3
+ )|
+ 5[25]\d{2}
+ )|
+ 5(?:
+ 1\d{2}|
+ 2(?:
+ 3\d|
+ 5
+ )
+ )|
+ 6(?:
+ [18]\d{2}|
+ 2(?:
+ 3(?:
+ \d{2}
+ )?|
+ [46]\d{1,2}|
+ 5\d{2}|
+ 7\d
+ )|
+ 5(?:
+ 3\d?|
+ 4\d|
+ [57]\d{1,2}|
+ 6\d{2}|
+ 8
+ )
+ )|
+ 71\d{2}|
+ 8(?:
+ [18]\d{2}|
+ 23\d{2}|
+ 54\d{2}
+ )|
+ 9(?:
+ [18]\d{2}|
+ 2[2-5]\d{2}|
+ 53\d{1,2}
+ )
+ )\d{3}|
+ 5(?:
+ 02[03489]\d{2}|
+ 1\d{2}|
+ 2(?:
+ 1\d{2}|
+ 2(?:
+ 2(?:
+ \d{2}
+ )?|
+ [457]\d{2}
+ )
+ )|
+ 3(?:
+ 1\d{2}|
+ 2(?:
+ [37](?:
+ \d{2}
+ )?|
+ [569]\d{2}
+ )
+ )|
+ 4(?:
+ 1\d{2}|
+ 2[46]\d{2}
+ )|
+ 5(?:
+ 1\d{2}|
+ 26\d{1,2}
+ )|
+ 6(?:
+ [18]\d{2}|
+ 2|
+ 53\d{2}
+ )|
+ 7(?:
+ 1|
+ 24
+ )\d{2}|
+ 8(?:
+ 1|
+ 26
+ )\d{2}|
+ 91\d{2}
+ )\d{3}|
+ 6(?:
+ 0(?:
+ 1\d{2}|
+ 2(?:
+ 3\d{2}|
+ 4\d{1,2}
+ )
+ )|
+ 2(?:
+ 2[2-5]\d{2}|
+ 5(?:
+ [3-5]\d{2}|
+ 7
+ )|
+ 8\d{2}
+ )|
+ 3(?:
+ 1|
+ 2[3478]
+ )\d{2}|
+ 4(?:
+ 1|
+ 2[34]
+ )\d{2}|
+ 5(?:
+ 1|
+ 2[47]
+ )\d{2}|
+ 6(?:
+ [18]\d{2}|
+ 6(?:
+ 2(?:
+ 2\d|
+ [34]\d{2}
+ )|
+ 5(?:
+ [24]\d{2}|
+ 3\d|
+ 5\d{1,2}
+ )
+ )
+ )|
+ 72[2-5]\d{2}|
+ 8(?:
+ 1\d{2}|
+ 2[2-5]\d{2}
+ )|
+ 9(?:
+ 1\d{2}|
+ 2[2-6]\d{2}
+ )
+ )\d{3}|
+ 7(?:
+ (?:
+ 02|
+ [3-589]1|
+ 6[12]|
+ 72[24]
+ )\d{2}|
+ 21\d{3}|
+ 32
+ )\d{3}|
+ 8(?:
+ (?:
+ 4[12]|
+ [5-7]2|
+ 1\d?
+ )|
+ (?:
+ 0|
+ 3[12]|
+ [5-7]1|
+ 217
+ )\d
+ )\d{4}|
+ 9(?:
+ [35]1|
+ (?:
+ [024]2|
+ 81
+ )\d|
+ (?:
+ 1|
+ [24]1
+ )\d{2}
+ )\d{3}
+ "#;
+ // TODO: This is a good candidate of a seq of literals that could be
+ // shrunk quite a bit and still be very productive with respect to
+ // literal optimizations.
+ let (prefixes, suffixes) = e(pat);
+ assert!(!suffixes.is_finite());
+ assert_eq!(Some(243), prefixes.len());
+ }
+
+ #[test]
+ fn optimize() {
+ // This gets a common prefix that isn't too short.
+ let (p, s) =
+ opt(["foobarfoobar", "foobar", "foobarzfoobar", "foobarfoobar"]);
+ assert_eq!(seq([I("foobar")]), p);
+ assert_eq!(seq([I("foobar")]), s);
+
+ // This also finds a common prefix, but since it's only one byte, it
+ // prefers the multiple literals.
+ let (p, s) = opt(["abba", "akka", "abccba"]);
+ assert_eq!(exact(["abba", "akka", "abccba"]), (p, s));
+
+ let (p, s) = opt(["sam", "samwise"]);
+ assert_eq!((seq([E("sam")]), seq([E("sam"), E("samwise")])), (p, s));
+
+ // The empty string is poisonous, so our seq becomes infinite, even
+ // though all literals are exact.
+ let (p, s) = opt(["foobarfoo", "foo", "", "foozfoo", "foofoo"]);
+ assert!(!p.is_finite());
+ assert!(!s.is_finite());
+
+ // A space is also poisonous, so our seq becomes infinite. But this
+ // only gets triggered when we don't have a completely exact sequence.
+ // When the sequence is exact, spaces are okay, since we presume that
+ // any prefilter will match a space more quickly than the regex engine.
+ // (When the sequence is exact, there's a chance of the prefilter being
+ // used without needing the regex engine at all.)
+ let mut p = seq([E("foobarfoo"), I("foo"), E(" "), E("foofoo")]);
+ p.optimize_for_prefix_by_preference();
+ assert!(!p.is_finite());
+ }
+}
diff --git a/vendor/regex-syntax/src/hir/literal/mod.rs b/vendor/regex-syntax/src/hir/literal/mod.rs
deleted file mode 100644
index fbc5d3c97..000000000
--- a/vendor/regex-syntax/src/hir/literal/mod.rs
+++ /dev/null
@@ -1,1686 +0,0 @@
-/*!
-Provides routines for extracting literal prefixes and suffixes from an `Hir`.
-*/
-
-use std::cmp;
-use std::fmt;
-use std::iter;
-use std::mem;
-use std::ops;
-
-use crate::hir::{self, Hir, HirKind};
-
-/// A set of literal byte strings extracted from a regular expression.
-///
-/// Every member of the set is a `Literal`, which is represented by a
-/// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is
-/// said to be either *complete* or *cut*. A complete literal means that
-/// it extends until the beginning (or end) of the regular expression. In
-/// some circumstances, this can be used to indicate a match in the regular
-/// expression.
-///
-/// A key aspect of literal extraction is knowing when to stop. It is not
-/// feasible to blindly extract all literals from a regular expression, even if
-/// there are finitely many. For example, the regular expression `[0-9]{10}`
-/// has `10^10` distinct literals. For this reason, literal extraction is
-/// bounded to some low number by default using heuristics, but the limits can
-/// be tweaked.
-///
-/// **WARNING**: Literal extraction uses stack space proportional to the size
-/// of the `Hir` expression. At some point, this drawback will be eliminated.
-/// To protect yourself, set a reasonable
-/// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit).
-/// This is done for you by default.
-#[derive(Clone, Eq, PartialEq)]
-pub struct Literals {
- lits: Vec<Literal>,
- limit_size: usize,
- limit_class: usize,
-}
-
-/// A single member of a set of literals extracted from a regular expression.
-///
-/// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice
-/// and `Vec` operations are available.
-#[derive(Clone, Eq, Ord)]
-pub struct Literal {
- v: Vec<u8>,
- cut: bool,
-}
-
-impl Literals {
- /// Returns a new empty set of literals using default limits.
- pub fn empty() -> Literals {
- Literals { lits: vec![], limit_size: 250, limit_class: 10 }
- }
-
- /// Returns a set of literal prefixes extracted from the given `Hir`.
- pub fn prefixes(expr: &Hir) -> Literals {
- let mut lits = Literals::empty();
- lits.union_prefixes(expr);
- lits
- }
-
- /// Returns a set of literal suffixes extracted from the given `Hir`.
- pub fn suffixes(expr: &Hir) -> Literals {
- let mut lits = Literals::empty();
- lits.union_suffixes(expr);
- lits
- }
-
- /// Get the approximate size limit (in bytes) of this set.
- pub fn limit_size(&self) -> usize {
- self.limit_size
- }
-
- /// Set the approximate size limit (in bytes) of this set.
- ///
- /// If extracting a literal would put the set over this limit, then
- /// extraction stops.
- ///
- /// The new limits will only apply to additions to this set. Existing
- /// members remain unchanged, even if the set exceeds the new limit.
- pub fn set_limit_size(&mut self, size: usize) -> &mut Literals {
- self.limit_size = size;
- self
- }
-
- /// Get the character class size limit for this set.
- pub fn limit_class(&self) -> usize {
- self.limit_class
- }
-
- /// Limits the size of character(or byte) classes considered.
- ///
- /// A value of `0` prevents all character classes from being considered.
- ///
- /// This limit also applies to case insensitive literals, since each
- /// character in the case insensitive literal is converted to a class, and
- /// then case folded.
- ///
- /// The new limits will only apply to additions to this set. Existing
- /// members remain unchanged, even if the set exceeds the new limit.
- pub fn set_limit_class(&mut self, size: usize) -> &mut Literals {
- self.limit_class = size;
- self
- }
-
- /// Returns the set of literals as a slice. Its order is unspecified.
- pub fn literals(&self) -> &[Literal] {
- &self.lits
- }
-
- /// Returns the length of the smallest literal.
- ///
- /// Returns None is there are no literals in the set.
- pub fn min_len(&self) -> Option<usize> {
- let mut min = None;
- for lit in &self.lits {
- match min {
- None => min = Some(lit.len()),
- Some(m) if lit.len() < m => min = Some(lit.len()),
- _ => {}
- }
- }
- min
- }
-
- /// Returns true if all members in this set are complete.
- pub fn all_complete(&self) -> bool {
- !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut())
- }
-
- /// Returns true if any member in this set is complete.
- pub fn any_complete(&self) -> bool {
- self.lits.iter().any(|lit| !lit.is_cut())
- }
-
- /// Returns true if this set contains an empty literal.
- pub fn contains_empty(&self) -> bool {
- self.lits.iter().any(|lit| lit.is_empty())
- }
-
- /// Returns true if this set is empty or if all of its members is empty.
- pub fn is_empty(&self) -> bool {
- self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty())
- }
-
- /// Returns a new empty set of literals using this set's limits.
- pub fn to_empty(&self) -> Literals {
- let mut lits = Literals::empty();
- lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class);
- lits
- }
-
- /// Returns the longest common prefix of all members in this set.
- pub fn longest_common_prefix(&self) -> &[u8] {
- if self.is_empty() {
- return &[];
- }
- let lit0 = &*self.lits[0];
- let mut len = lit0.len();
- for lit in &self.lits[1..] {
- len = cmp::min(
- len,
- lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(),
- );
- }
- &self.lits[0][..len]
- }
-
- /// Returns the longest common suffix of all members in this set.
- pub fn longest_common_suffix(&self) -> &[u8] {
- if self.is_empty() {
- return &[];
- }
- let lit0 = &*self.lits[0];
- let mut len = lit0.len();
- for lit in &self.lits[1..] {
- len = cmp::min(
- len,
- lit.iter()
- .rev()
- .zip(lit0.iter().rev())
- .take_while(|&(a, b)| a == b)
- .count(),
- );
- }
- &self.lits[0][self.lits[0].len() - len..]
- }
-
- /// Returns a new set of literals with the given number of bytes trimmed
- /// from the suffix of each literal.
- ///
- /// If any literal would be cut out completely by trimming, then None is
- /// returned.
- ///
- /// Any duplicates that are created as a result of this transformation are
- /// removed.
- pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> {
- if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) {
- return None;
- }
- let mut new = self.to_empty();
- for mut lit in self.lits.iter().cloned() {
- let new_len = lit.len() - num_bytes;
- lit.truncate(new_len);
- lit.cut();
- new.lits.push(lit);
- }
- new.lits.sort();
- new.lits.dedup();
- Some(new)
- }
-
- /// Returns a new set of prefixes of this set of literals that are
- /// guaranteed to be unambiguous.
- ///
- /// Any substring match with a member of the set is returned is guaranteed
- /// to never overlap with a substring match of another member of the set
- /// at the same starting position.
- ///
- /// Given any two members of the returned set, neither is a substring of
- /// the other.
- pub fn unambiguous_prefixes(&self) -> Literals {
- if self.lits.is_empty() {
- return self.to_empty();
- }
- let mut old = self.lits.to_vec();
- let mut new = self.to_empty();
- 'OUTER: while let Some(mut candidate) = old.pop() {
- if candidate.is_empty() {
- continue;
- }
- if new.lits.is_empty() {
- new.lits.push(candidate);
- continue;
- }
- for lit2 in &mut new.lits {
- if lit2.is_empty() {
- continue;
- }
- if &candidate == lit2 {
- // If the literal is already in the set, then we can
- // just drop it. But make sure that cut literals are
- // infectious!
- candidate.cut = candidate.cut || lit2.cut;
- lit2.cut = candidate.cut;
- continue 'OUTER;
- }
- if candidate.len() < lit2.len() {
- if let Some(i) = position(&candidate, &lit2) {
- candidate.cut();
- let mut lit3 = lit2.clone();
- lit3.truncate(i);
- lit3.cut();
- old.push(lit3);
- lit2.clear();
- }
- } else if let Some(i) = position(&lit2, &candidate) {
- lit2.cut();
- let mut new_candidate = candidate.clone();
- new_candidate.truncate(i);
- new_candidate.cut();
- old.push(new_candidate);
- candidate.clear();
- }
- // Oops, the candidate is already represented in the set.
- if candidate.is_empty() {
- continue 'OUTER;
- }
- }
- new.lits.push(candidate);
- }
- new.lits.retain(|lit| !lit.is_empty());
- new.lits.sort();
- new.lits.dedup();
- new
- }
-
- /// Returns a new set of suffixes of this set of literals that are
- /// guaranteed to be unambiguous.
- ///
- /// Any substring match with a member of the set is returned is guaranteed
- /// to never overlap with a substring match of another member of the set
- /// at the same ending position.
- ///
- /// Given any two members of the returned set, neither is a substring of
- /// the other.
- pub fn unambiguous_suffixes(&self) -> Literals {
- // This is a touch wasteful...
- let mut lits = self.clone();
- lits.reverse();
- let mut unamb = lits.unambiguous_prefixes();
- unamb.reverse();
- unamb
- }
-
- /// Unions the prefixes from the given expression to this set.
- ///
- /// If prefixes could not be added (for example, this set would exceed its
- /// size limits or the set of prefixes from `expr` includes the empty
- /// string), then false is returned.
- ///
- /// Note that prefix literals extracted from `expr` are said to be complete
- /// if and only if the literal extends from the beginning of `expr` to the
- /// end of `expr`.
- pub fn union_prefixes(&mut self, expr: &Hir) -> bool {
- let mut lits = self.to_empty();
- prefixes(expr, &mut lits);
- !lits.is_empty() && !lits.contains_empty() && self.union(lits)
- }
-
- /// Unions the suffixes from the given expression to this set.
- ///
- /// If suffixes could not be added (for example, this set would exceed its
- /// size limits or the set of suffixes from `expr` includes the empty
- /// string), then false is returned.
- ///
- /// Note that prefix literals extracted from `expr` are said to be complete
- /// if and only if the literal extends from the end of `expr` to the
- /// beginning of `expr`.
- pub fn union_suffixes(&mut self, expr: &Hir) -> bool {
- let mut lits = self.to_empty();
- suffixes(expr, &mut lits);
- lits.reverse();
- !lits.is_empty() && !lits.contains_empty() && self.union(lits)
- }
-
- /// Unions this set with another set.
- ///
- /// If the union would cause the set to exceed its limits, then the union
- /// is skipped and it returns false. Otherwise, if the union succeeds, it
- /// returns true.
- pub fn union(&mut self, lits: Literals) -> bool {
- if self.num_bytes() + lits.num_bytes() > self.limit_size {
- return false;
- }
- if lits.is_empty() {
- self.lits.push(Literal::empty());
- } else {
- self.lits.extend(lits.lits);
- }
- true
- }
-
- /// Extends this set with another set.
- ///
- /// The set of literals is extended via a cross product.
- ///
- /// If a cross product would cause this set to exceed its limits, then the
- /// cross product is skipped and it returns false. Otherwise, if the cross
- /// product succeeds, it returns true.
- pub fn cross_product(&mut self, lits: &Literals) -> bool {
- if lits.is_empty() {
- return true;
- }
- // Check that we make sure we stay in our limits.
- let mut size_after;
- if self.is_empty() || !self.any_complete() {
- size_after = self.num_bytes();
- for lits_lit in lits.literals() {
- size_after += lits_lit.len();
- }
- } else {
- size_after = self.lits.iter().fold(0, |accum, lit| {
- accum + if lit.is_cut() { lit.len() } else { 0 }
- });
- for lits_lit in lits.literals() {
- for self_lit in self.literals() {
- if !self_lit.is_cut() {
- size_after += self_lit.len() + lits_lit.len();
- }
- }
- }
- }
- if size_after > self.limit_size {
- return false;
- }
-
- let mut base = self.remove_complete();
- if base.is_empty() {
- base = vec![Literal::empty()];
- }
- for lits_lit in lits.literals() {
- for mut self_lit in base.clone() {
- self_lit.extend(&**lits_lit);
- self_lit.cut = lits_lit.cut;
- self.lits.push(self_lit);
- }
- }
- true
- }
-
- /// Extends each literal in this set with the bytes given.
- ///
- /// If the set is empty, then the given literal is added to the set.
- ///
- /// If adding any number of bytes to all members of this set causes a limit
- /// to be exceeded, then no bytes are added and false is returned. If a
- /// prefix of `bytes` can be fit into this set, then it is used and all
- /// resulting literals are cut.
- pub fn cross_add(&mut self, bytes: &[u8]) -> bool {
- // N.B. This could be implemented by simply calling cross_product with
- // a literal set containing just `bytes`, but we can be smarter about
- // taking shorter prefixes of `bytes` if they'll fit.
- if bytes.is_empty() {
- return true;
- }
- if self.lits.is_empty() {
- let i = cmp::min(self.limit_size, bytes.len());
- self.lits.push(Literal::new(bytes[..i].to_owned()));
- self.lits[0].cut = i < bytes.len();
- return !self.lits[0].is_cut();
- }
- let size = self.num_bytes();
- if size + self.lits.len() >= self.limit_size {
- return false;
- }
- let mut i = 1;
- while size + (i * self.lits.len()) <= self.limit_size
- && i < bytes.len()
- {
- i += 1;
- }
- for lit in &mut self.lits {
- if !lit.is_cut() {
- lit.extend(&bytes[..i]);
- if i < bytes.len() {
- lit.cut();
- }
- }
- }
- true
- }
-
- /// Adds the given literal to this set.
- ///
- /// Returns false if adding this literal would cause the class to be too
- /// big.
- pub fn add(&mut self, lit: Literal) -> bool {
- if self.num_bytes() + lit.len() > self.limit_size {
- return false;
- }
- self.lits.push(lit);
- true
- }
-
- /// Extends each literal in this set with the character class given.
- ///
- /// Returns false if the character class was too big to add.
- pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool {
- self._add_char_class(cls, false)
- }
-
- /// Extends each literal in this set with the character class given,
- /// writing the bytes of each character in reverse.
- ///
- /// Returns false if the character class was too big to add.
- fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool {
- self._add_char_class(cls, true)
- }
-
- fn _add_char_class(
- &mut self,
- cls: &hir::ClassUnicode,
- reverse: bool,
- ) -> bool {
- use std::char;
-
- if self.class_exceeds_limits(cls_char_count(cls)) {
- return false;
- }
- let mut base = self.remove_complete();
- if base.is_empty() {
- base = vec![Literal::empty()];
- }
- for r in cls.iter() {
- let (s, e) = (r.start as u32, r.end as u32 + 1);
- for c in (s..e).filter_map(char::from_u32) {
- for mut lit in base.clone() {
- let mut bytes = c.to_string().into_bytes();
- if reverse {
- bytes.reverse();
- }
- lit.extend(&bytes);
- self.lits.push(lit);
- }
- }
- }
- true
- }
-
- /// Extends each literal in this set with the byte class given.
- ///
- /// Returns false if the byte class was too big to add.
- pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool {
- if self.class_exceeds_limits(cls_byte_count(cls)) {
- return false;
- }
- let mut base = self.remove_complete();
- if base.is_empty() {
- base = vec![Literal::empty()];
- }
- for r in cls.iter() {
- let (s, e) = (r.start as u32, r.end as u32 + 1);
- for b in (s..e).map(|b| b as u8) {
- for mut lit in base.clone() {
- lit.push(b);
- self.lits.push(lit);
- }
- }
- }
- true
- }
-
- /// Cuts every member of this set. When a member is cut, it can never
- /// be extended.
- pub fn cut(&mut self) {
- for lit in &mut self.lits {
- lit.cut();
- }
- }
-
- /// Reverses all members in place.
- pub fn reverse(&mut self) {
- for lit in &mut self.lits {
- lit.reverse();
- }
- }
-
- /// Clears this set of all members.
- pub fn clear(&mut self) {
- self.lits.clear();
- }
-
- /// Pops all complete literals out of this set.
- fn remove_complete(&mut self) -> Vec<Literal> {
- let mut base = vec![];
- for lit in mem::replace(&mut self.lits, vec![]) {
- if lit.is_cut() {
- self.lits.push(lit);
- } else {
- base.push(lit);
- }
- }
- base
- }
-
- /// Returns the total number of bytes in this set.
- fn num_bytes(&self) -> usize {
- self.lits.iter().fold(0, |accum, lit| accum + lit.len())
- }
-
- /// Returns true if a character class with the given size would cause this
- /// set to exceed its limits.
- ///
- /// The size given should correspond to the number of items in the class.
- fn class_exceeds_limits(&self, size: usize) -> bool {
- if size > self.limit_class {
- return true;
- }
- // This is an approximation since codepoints in a char class can encode
- // to 1-4 bytes.
- let new_byte_count = if self.lits.is_empty() {
- size
- } else {
- self.lits.iter().fold(0, |accum, lit| {
- accum
- + if lit.is_cut() {
- // If the literal is cut, then we'll never add
- // anything to it, so don't count it.
- 0
- } else {
- (lit.len() + 1) * size
- }
- })
- };
- new_byte_count > self.limit_size
- }
-}
-
-fn prefixes(expr: &Hir, lits: &mut Literals) {
- match *expr.kind() {
- HirKind::Literal(hir::Literal::Unicode(c)) => {
- let mut buf = [0; 4];
- lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
- }
- HirKind::Literal(hir::Literal::Byte(b)) => {
- lits.cross_add(&[b]);
- }
- HirKind::Class(hir::Class::Unicode(ref cls)) => {
- if !lits.add_char_class(cls) {
- lits.cut();
- }
- }
- HirKind::Class(hir::Class::Bytes(ref cls)) => {
- if !lits.add_byte_class(cls) {
- lits.cut();
- }
- }
- HirKind::Group(hir::Group { ref hir, .. }) => {
- prefixes(&**hir, lits);
- }
- HirKind::Repetition(ref x) => match x.kind {
- hir::RepetitionKind::ZeroOrOne => {
- repeat_zero_or_one_literals(&x.hir, lits, prefixes);
- }
- hir::RepetitionKind::ZeroOrMore => {
- repeat_zero_or_more_literals(&x.hir, lits, prefixes);
- }
- hir::RepetitionKind::OneOrMore => {
- repeat_one_or_more_literals(&x.hir, lits, prefixes);
- }
- hir::RepetitionKind::Range(ref rng) => {
- let (min, max) = match *rng {
- hir::RepetitionRange::Exactly(m) => (m, Some(m)),
- hir::RepetitionRange::AtLeast(m) => (m, None),
- hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
- };
- repeat_range_literals(
- &x.hir, min, max, x.greedy, lits, prefixes,
- )
- }
- },
- HirKind::Concat(ref es) if es.is_empty() => {}
- HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
- HirKind::Concat(ref es) => {
- for e in es {
- if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() {
- if !lits.is_empty() {
- lits.cut();
- break;
- }
- lits.add(Literal::empty());
- continue;
- }
- let mut lits2 = lits.to_empty();
- prefixes(e, &mut lits2);
- if !lits.cross_product(&lits2) || !lits2.any_complete() {
- // If this expression couldn't yield any literal that
- // could be extended, then we need to quit. Since we're
- // short-circuiting, we also need to freeze every member.
- lits.cut();
- break;
- }
- }
- }
- HirKind::Alternation(ref es) => {
- alternate_literals(es, lits, prefixes);
- }
- _ => lits.cut(),
- }
-}
-
-fn suffixes(expr: &Hir, lits: &mut Literals) {
- match *expr.kind() {
- HirKind::Literal(hir::Literal::Unicode(c)) => {
- let mut buf = [0u8; 4];
- let i = c.encode_utf8(&mut buf).len();
- let buf = &mut buf[..i];
- buf.reverse();
- lits.cross_add(buf);
- }
- HirKind::Literal(hir::Literal::Byte(b)) => {
- lits.cross_add(&[b]);
- }
- HirKind::Class(hir::Class::Unicode(ref cls)) => {
- if !lits.add_char_class_reverse(cls) {
- lits.cut();
- }
- }
- HirKind::Class(hir::Class::Bytes(ref cls)) => {
- if !lits.add_byte_class(cls) {
- lits.cut();
- }
- }
- HirKind::Group(hir::Group { ref hir, .. }) => {
- suffixes(&**hir, lits);
- }
- HirKind::Repetition(ref x) => match x.kind {
- hir::RepetitionKind::ZeroOrOne => {
- repeat_zero_or_one_literals(&x.hir, lits, suffixes);
- }
- hir::RepetitionKind::ZeroOrMore => {
- repeat_zero_or_more_literals(&x.hir, lits, suffixes);
- }
- hir::RepetitionKind::OneOrMore => {
- repeat_one_or_more_literals(&x.hir, lits, suffixes);
- }
- hir::RepetitionKind::Range(ref rng) => {
- let (min, max) = match *rng {
- hir::RepetitionRange::Exactly(m) => (m, Some(m)),
- hir::RepetitionRange::AtLeast(m) => (m, None),
- hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
- };
- repeat_range_literals(
- &x.hir, min, max, x.greedy, lits, suffixes,
- )
- }
- },
- HirKind::Concat(ref es) if es.is_empty() => {}
- HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
- HirKind::Concat(ref es) => {
- for e in es.iter().rev() {
- if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() {
- if !lits.is_empty() {
- lits.cut();
- break;
- }
- lits.add(Literal::empty());
- continue;
- }
- let mut lits2 = lits.to_empty();
- suffixes(e, &mut lits2);
- if !lits.cross_product(&lits2) || !lits2.any_complete() {
- // If this expression couldn't yield any literal that
- // could be extended, then we need to quit. Since we're
- // short-circuiting, we also need to freeze every member.
- lits.cut();
- break;
- }
- }
- }
- HirKind::Alternation(ref es) => {
- alternate_literals(es, lits, suffixes);
- }
- _ => lits.cut(),
- }
-}
-
-fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
- e: &Hir,
- lits: &mut Literals,
- mut f: F,
-) {
- f(
- &Hir::repetition(hir::Repetition {
- kind: hir::RepetitionKind::ZeroOrMore,
- // FIXME: Our literal extraction doesn't care about greediness.
- // Which is partially why we're treating 'e?' as 'e*'. Namely,
- // 'ab??' yields [Complete(ab), Complete(a)], but it should yield
- // [Complete(a), Complete(ab)] because of the non-greediness.
- greedy: true,
- hir: Box::new(e.clone()),
- }),
- lits,
- );
-}
-
-fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
- e: &Hir,
- lits: &mut Literals,
- mut f: F,
-) {
- let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
- lits3.set_limit_size(lits.limit_size() / 2);
- f(e, &mut lits3);
-
- if lits3.is_empty() || !lits2.cross_product(&lits3) {
- lits.cut();
- return;
- }
- lits2.cut();
- lits2.add(Literal::empty());
- if !lits.union(lits2) {
- lits.cut();
- }
-}
-
-fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
- e: &Hir,
- lits: &mut Literals,
- mut f: F,
-) {
- f(e, lits);
- lits.cut();
-}
-
-fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
- e: &Hir,
- min: u32,
- max: Option<u32>,
- greedy: bool,
- lits: &mut Literals,
- mut f: F,
-) {
- if min == 0 {
- // This is a bit conservative. If `max` is set, then we could
- // treat this as a finite set of alternations. For now, we
- // just treat it as `e*`.
- f(
- &Hir::repetition(hir::Repetition {
- kind: hir::RepetitionKind::ZeroOrMore,
- greedy,
- hir: Box::new(e.clone()),
- }),
- lits,
- );
- } else {
- if min > 0 {
- let n = cmp::min(lits.limit_size, min as usize);
- let es = iter::repeat(e.clone()).take(n).collect();
- f(&Hir::concat(es), lits);
- if n < min as usize || lits.contains_empty() {
- lits.cut();
- }
- }
- if max.map_or(true, |max| min < max) {
- lits.cut();
- }
- }
-}
-
-fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
- es: &[Hir],
- lits: &mut Literals,
- mut f: F,
-) {
- let mut lits2 = lits.to_empty();
- for e in es {
- let mut lits3 = lits.to_empty();
- lits3.set_limit_size(lits.limit_size() / 5);
- f(e, &mut lits3);
- if lits3.is_empty() || !lits2.union(lits3) {
- // If we couldn't find suffixes for *any* of the
- // alternates, then the entire alternation has to be thrown
- // away and any existing members must be frozen. Similarly,
- // if the union couldn't complete, stop and freeze.
- lits.cut();
- return;
- }
- }
- if !lits.cross_product(&lits2) {
- lits.cut();
- }
-}
-
-impl fmt::Debug for Literals {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- f.debug_struct("Literals")
- .field("lits", &self.lits)
- .field("limit_size", &self.limit_size)
- .field("limit_class", &self.limit_class)
- .finish()
- }
-}
-
-impl Literal {
- /// Returns a new complete literal with the bytes given.
- pub fn new(bytes: Vec<u8>) -> Literal {
- Literal { v: bytes, cut: false }
- }
-
- /// Returns a new complete empty literal.
- pub fn empty() -> Literal {
- Literal { v: vec![], cut: false }
- }
-
- /// Returns true if this literal was "cut."
- pub fn is_cut(&self) -> bool {
- self.cut
- }
-
- /// Cuts this literal.
- pub fn cut(&mut self) {
- self.cut = true;
- }
-}
-
-impl PartialEq for Literal {
- fn eq(&self, other: &Literal) -> bool {
- self.v == other.v
- }
-}
-
-impl PartialOrd for Literal {
- fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> {
- self.v.partial_cmp(&other.v)
- }
-}
-
-impl fmt::Debug for Literal {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- if self.is_cut() {
- write!(f, "Cut({})", escape_unicode(&self.v))
- } else {
- write!(f, "Complete({})", escape_unicode(&self.v))
- }
- }
-}
-
-impl AsRef<[u8]> for Literal {
- fn as_ref(&self) -> &[u8] {
- &self.v
- }
-}
-
-impl ops::Deref for Literal {
- type Target = Vec<u8>;
- fn deref(&self) -> &Vec<u8> {
- &self.v
- }
-}
-
-impl ops::DerefMut for Literal {
- fn deref_mut(&mut self) -> &mut Vec<u8> {
- &mut self.v
- }
-}
-
-fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
- let mut i = 0;
- while haystack.len() >= needle.len() {
- if needle == &haystack[..needle.len()] {
- return Some(i);
- }
- i += 1;
- haystack = &haystack[1..];
- }
- None
-}
-
-fn escape_unicode(bytes: &[u8]) -> String {
- let show = match ::std::str::from_utf8(bytes) {
- Ok(v) => v.to_string(),
- Err(_) => escape_bytes(bytes),
- };
- let mut space_escaped = String::new();
- for c in show.chars() {
- if c.is_whitespace() {
- let escaped = if c as u32 <= 0x7F {
- escape_byte(c as u8)
- } else if c as u32 <= 0xFFFF {
- format!(r"\u{{{:04x}}}", c as u32)
- } else {
- format!(r"\U{{{:08x}}}", c as u32)
- };
- space_escaped.push_str(&escaped);
- } else {
- space_escaped.push(c);
- }
- }
- space_escaped
-}
-
-fn escape_bytes(bytes: &[u8]) -> String {
- let mut s = String::new();
- for &b in bytes {
- s.push_str(&escape_byte(b));
- }
- s
-}
-
-fn escape_byte(byte: u8) -> String {
- use std::ascii::escape_default;
-
- let escaped: Vec<u8> = escape_default(byte).collect();
- String::from_utf8_lossy(&escaped).into_owned()
-}
-
-fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
- cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
- as usize
-}
-
-fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
- cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
- as usize
-}
-
-#[cfg(test)]
-mod tests {
- use std::fmt;
-
- use super::{escape_bytes, Literal, Literals};
- use crate::hir::Hir;
- use crate::ParserBuilder;
-
- // To make test failures easier to read.
- #[derive(Debug, Eq, PartialEq)]
- struct Bytes(Vec<ULiteral>);
- #[derive(Debug, Eq, PartialEq)]
- struct Unicode(Vec<ULiteral>);
-
- fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> {
- let mut ulits = vec![];
- for blit in blits {
- ulits
- .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() });
- }
- ulits
- }
-
- fn create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals {
- Literals {
- lits: it.into_iter().collect(),
- limit_size: 0,
- limit_class: 0,
- }
- }
-
- // Needs to be pub for 1.3?
- #[derive(Clone, Eq, PartialEq)]
- pub struct ULiteral {
- v: String,
- cut: bool,
- }
-
- impl ULiteral {
- fn is_cut(&self) -> bool {
- self.cut
- }
- }
-
- impl fmt::Debug for ULiteral {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- if self.is_cut() {
- write!(f, "Cut({})", self.v)
- } else {
- write!(f, "Complete({})", self.v)
- }
- }
- }
-
- impl PartialEq<Literal> for ULiteral {
- fn eq(&self, other: &Literal) -> bool {
- self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut()
- }
- }
-
- impl PartialEq<ULiteral> for Literal {
- fn eq(&self, other: &ULiteral) -> bool {
- &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut()
- }
- }
-
- #[allow(non_snake_case)]
- fn C(s: &'static str) -> ULiteral {
- ULiteral { v: s.to_owned(), cut: true }
- }
- #[allow(non_snake_case)]
- fn M(s: &'static str) -> ULiteral {
- ULiteral { v: s.to_owned(), cut: false }
- }
-
- fn prefixes(lits: &mut Literals, expr: &Hir) {
- lits.union_prefixes(expr);
- }
-
- fn suffixes(lits: &mut Literals, expr: &Hir) {
- lits.union_suffixes(expr);
- }
-
- macro_rules! assert_lit_eq {
- ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{
- let expected: Vec<ULiteral> = vec![$($expected_lit),*];
- let lits = $got_lits;
- assert_eq!(
- $which(expected.clone()),
- $which(escape_lits(lits.literals())));
- assert_eq!(
- !expected.is_empty() && expected.iter().all(|l| !l.is_cut()),
- lits.all_complete());
- assert_eq!(
- expected.iter().any(|l| !l.is_cut()),
- lits.any_complete());
- }};
- }
-
- macro_rules! test_lit {
- ($name:ident, $which:ident, $re:expr) => {
- test_lit!($name, $which, $re,);
- };
- ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
- #[test]
- fn $name() {
- let expr = ParserBuilder::new()
- .build()
- .parse($re)
- .unwrap();
- let lits = Literals::$which(&expr);
- assert_lit_eq!(Unicode, lits, $($lit),*);
-
- let expr = ParserBuilder::new()
- .allow_invalid_utf8(true)
- .unicode(false)
- .build()
- .parse($re)
- .unwrap();
- let lits = Literals::$which(&expr);
- assert_lit_eq!(Bytes, lits, $($lit),*);
- }
- };
- }
-
- // ************************************************************************
- // Tests for prefix literal extraction.
- // ************************************************************************
-
- // Elementary tests.
- test_lit!(pfx_one_lit1, prefixes, "a", M("a"));
- test_lit!(pfx_one_lit2, prefixes, "abc", M("abc"));
- test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83"));
- #[cfg(feature = "unicode-case")]
- test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
- test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
- test_lit!(
- pfx_class2,
- prefixes,
- "(?u)[☃Ⅰ]",
- M("\\xe2\\x85\\xa0"),
- M("\\xe2\\x98\\x83")
- );
- #[cfg(feature = "unicode-case")]
- test_lit!(
- pfx_class3,
- prefixes,
- "(?ui)[☃Ⅰ]",
- M("\\xe2\\x85\\xa0"),
- M("\\xe2\\x85\\xb0"),
- M("\\xe2\\x98\\x83")
- );
- test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a"));
- test_lit!(
- pfx_one_lit_casei2,
- prefixes,
- "(?i-u)abc",
- M("ABC"),
- M("aBC"),
- M("AbC"),
- M("abC"),
- M("ABc"),
- M("aBc"),
- M("Abc"),
- M("abc")
- );
- test_lit!(pfx_group1, prefixes, "(a)", M("a"));
- test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
- test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
- test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a"));
- // FIXME: This should return [M("a"), M("ab")] because of the non-greedy
- // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get
- // a cut literal.
- test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a"));
- test_lit!(pfx_rep_zero_or_more1, prefixes, "a*");
- test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*");
- test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a"));
- test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc"));
- test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a"));
- test_lit!(pfx_rep_range1, prefixes, "a{0}");
- test_lit!(pfx_rep_range2, prefixes, "a{0,}");
- test_lit!(pfx_rep_range3, prefixes, "a{0,1}");
- test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a"));
- test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa"));
- test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a"));
- test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa"));
-
- // Test regexes with concatenations.
- test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab"));
- test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz"));
- test_lit!(
- pfx_cat3,
- prefixes,
- "(?i-u)[ab]z",
- M("AZ"),
- M("BZ"),
- M("aZ"),
- M("bZ"),
- M("Az"),
- M("Bz"),
- M("az"),
- M("bz")
- );
- test_lit!(
- pfx_cat4,
- prefixes,
- "[ab][yz]",
- M("ay"),
- M("by"),
- M("az"),
- M("bz")
- );
- test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b"));
- test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c"));
- test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c"));
- test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b"));
- test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b"));
- test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a"));
- test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac"));
- test_lit!(pfx_cat12, prefixes, "ab+", C("ab"));
- test_lit!(pfx_cat13, prefixes, "ab+c", C("ab"));
- test_lit!(pfx_cat14, prefixes, "a^", C("a"));
- test_lit!(pfx_cat15, prefixes, "$a");
- test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac"));
- test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab"));
- test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb"));
- test_lit!(pfx_cat19, prefixes, "a.z", C("a"));
-
- // Test regexes with alternations.
- test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b"));
- test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
- test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz"));
- test_lit!(pfx_alt4, prefixes, "a|b*");
- test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b"));
- test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)");
- test_lit!(
- pfx_alt7,
- prefixes,
- "(a|b)*c|(a|ab)*c",
- C("a"),
- C("b"),
- M("c"),
- C("a"),
- C("ab"),
- M("c")
- );
- test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c"));
-
- // Test regexes with empty assertions.
- test_lit!(pfx_empty1, prefixes, "^a", M("a"));
- test_lit!(pfx_empty2, prefixes, "a${2}", C("a"));
- test_lit!(pfx_empty3, prefixes, "^abc", M("abc"));
- test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z"));
-
- // Make sure some curious regexes have no prefixes.
- test_lit!(pfx_nothing1, prefixes, ".");
- test_lit!(pfx_nothing2, prefixes, "(?s).");
- test_lit!(pfx_nothing3, prefixes, "^");
- test_lit!(pfx_nothing4, prefixes, "$");
- test_lit!(pfx_nothing6, prefixes, "(?m)$");
- test_lit!(pfx_nothing7, prefixes, r"\b");
- test_lit!(pfx_nothing8, prefixes, r"\B");
-
- // Test a few regexes that defeat any prefix literal detection.
- test_lit!(pfx_defeated1, prefixes, ".a");
- test_lit!(pfx_defeated2, prefixes, "(?s).a");
- test_lit!(pfx_defeated3, prefixes, "a*b*c*");
- test_lit!(pfx_defeated4, prefixes, "a|.");
- test_lit!(pfx_defeated5, prefixes, ".|a");
- test_lit!(pfx_defeated6, prefixes, "a|^");
- test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))");
- test_lit!(pfx_defeated8, prefixes, "$a");
- test_lit!(pfx_defeated9, prefixes, "(?m)$a");
- test_lit!(pfx_defeated10, prefixes, r"\ba");
- test_lit!(pfx_defeated11, prefixes, r"\Ba");
- test_lit!(pfx_defeated12, prefixes, "^*a");
- test_lit!(pfx_defeated13, prefixes, "^+a");
-
- test_lit!(
- pfx_crazy1,
- prefixes,
- r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
- C("Mo\\'"),
- C("Mu\\'"),
- C("Moam"),
- C("Muam")
- );
-
- // ************************************************************************
- // Tests for quiting prefix literal search.
- // ************************************************************************
-
- macro_rules! test_exhausted {
- ($name:ident, $which:ident, $re:expr) => {
- test_exhausted!($name, $which, $re,);
- };
- ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
- #[test]
- fn $name() {
- let expr = ParserBuilder::new()
- .build()
- .parse($re)
- .unwrap();
- let mut lits = Literals::empty();
- lits.set_limit_size(20).set_limit_class(10);
- $which(&mut lits, &expr);
- assert_lit_eq!(Unicode, lits, $($lit),*);
-
- let expr = ParserBuilder::new()
- .allow_invalid_utf8(true)
- .unicode(false)
- .build()
- .parse($re)
- .unwrap();
- let mut lits = Literals::empty();
- lits.set_limit_size(20).set_limit_class(10);
- $which(&mut lits, &expr);
- assert_lit_eq!(Bytes, lits, $($lit),*);
- }
- };
- }
-
- // These test use a much lower limit than the default so that we can
- // write test cases of reasonable size.
- test_exhausted!(pfx_exhausted1, prefixes, "[a-z]");
- test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A");
- test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A"));
- test_exhausted!(
- pfx_exhausted4,
- prefixes,
- "(?i-u)foobar",
- C("FO"),
- C("fO"),
- C("Fo"),
- C("fo")
- );
- test_exhausted!(
- pfx_exhausted5,
- prefixes,
- "(?:ab){100}",
- C("abababababababababab")
- );
- test_exhausted!(
- pfx_exhausted6,
- prefixes,
- "(?:(?:ab){100})*cd",
- C("ababababab"),
- M("cd")
- );
- test_exhausted!(
- pfx_exhausted7,
- prefixes,
- "z(?:(?:ab){100})*cd",
- C("zababababab"),
- M("zcd")
- );
- test_exhausted!(
- pfx_exhausted8,
- prefixes,
- "aaaaaaaaaaaaaaaaaaaaz",
- C("aaaaaaaaaaaaaaaaaaaa")
- );
-
- // ************************************************************************
- // Tests for suffix literal extraction.
- // ************************************************************************
-
- // Elementary tests.
- test_lit!(sfx_one_lit1, suffixes, "a", M("a"));
- test_lit!(sfx_one_lit2, suffixes, "abc", M("abc"));
- test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83"));
- #[cfg(feature = "unicode-case")]
- test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
- test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
- test_lit!(
- sfx_class2,
- suffixes,
- "(?u)[☃Ⅰ]",
- M("\\xe2\\x85\\xa0"),
- M("\\xe2\\x98\\x83")
- );
- #[cfg(feature = "unicode-case")]
- test_lit!(
- sfx_class3,
- suffixes,
- "(?ui)[☃Ⅰ]",
- M("\\xe2\\x85\\xa0"),
- M("\\xe2\\x85\\xb0"),
- M("\\xe2\\x98\\x83")
- );
- test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a"));
- test_lit!(
- sfx_one_lit_casei2,
- suffixes,
- "(?i-u)abc",
- M("ABC"),
- M("ABc"),
- M("AbC"),
- M("Abc"),
- M("aBC"),
- M("aBc"),
- M("abC"),
- M("abc")
- );
- test_lit!(sfx_group1, suffixes, "(a)", M("a"));
- test_lit!(sfx_rep_zero_or_one1, suffixes, "a?");
- test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?");
- test_lit!(sfx_rep_zero_or_more1, suffixes, "a*");
- test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*");
- test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a"));
- test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc"));
- test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a"));
- test_lit!(sfx_rep_range1, suffixes, "a{0}");
- test_lit!(sfx_rep_range2, suffixes, "a{0,}");
- test_lit!(sfx_rep_range3, suffixes, "a{0,1}");
- test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a"));
- test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa"));
- test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a"));
- test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa"));
-
- // Test regexes with concatenations.
- test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab"));
- test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz"));
- test_lit!(
- sfx_cat3,
- suffixes,
- "(?i-u)[ab]z",
- M("AZ"),
- M("Az"),
- M("BZ"),
- M("Bz"),
- M("aZ"),
- M("az"),
- M("bZ"),
- M("bz")
- );
- test_lit!(
- sfx_cat4,
- suffixes,
- "[ab][yz]",
- M("ay"),
- M("az"),
- M("by"),
- M("bz")
- );
- test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b"));
- test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c"));
- test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c"));
- test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc"));
- test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b"));
- test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a"));
- test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac"));
- test_lit!(sfx_cat12, suffixes, "ab+", C("b"));
- test_lit!(sfx_cat13, suffixes, "ab+c", C("bc"));
- test_lit!(sfx_cat14, suffixes, "a^");
- test_lit!(sfx_cat15, suffixes, "$a", C("a"));
- test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac"));
- test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc"));
- test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb"));
- test_lit!(sfx_cat19, suffixes, "a.z", C("z"));
-
- // Test regexes with alternations.
- test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b"));
- test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
- test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz"));
- test_lit!(sfx_alt4, suffixes, "a|b*");
- test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b"));
- test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)");
- test_lit!(
- sfx_alt7,
- suffixes,
- "(a|b)*c|(a|ab)*c",
- C("ac"),
- C("bc"),
- M("c"),
- C("ac"),
- C("abc"),
- M("c")
- );
- test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c"));
-
- // Test regexes with empty assertions.
- test_lit!(sfx_empty1, suffixes, "a$", M("a"));
- test_lit!(sfx_empty2, suffixes, "${2}a", C("a"));
-
- // Make sure some curious regexes have no suffixes.
- test_lit!(sfx_nothing1, suffixes, ".");
- test_lit!(sfx_nothing2, suffixes, "(?s).");
- test_lit!(sfx_nothing3, suffixes, "^");
- test_lit!(sfx_nothing4, suffixes, "$");
- test_lit!(sfx_nothing6, suffixes, "(?m)$");
- test_lit!(sfx_nothing7, suffixes, r"\b");
- test_lit!(sfx_nothing8, suffixes, r"\B");
-
- // Test a few regexes that defeat any suffix literal detection.
- test_lit!(sfx_defeated1, suffixes, "a.");
- test_lit!(sfx_defeated2, suffixes, "(?s)a.");
- test_lit!(sfx_defeated3, suffixes, "a*b*c*");
- test_lit!(sfx_defeated4, suffixes, "a|.");
- test_lit!(sfx_defeated5, suffixes, ".|a");
- test_lit!(sfx_defeated6, suffixes, "a|^");
- test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c)).");
- test_lit!(sfx_defeated8, suffixes, "a^");
- test_lit!(sfx_defeated9, suffixes, "(?m)a$");
- test_lit!(sfx_defeated10, suffixes, r"a\b");
- test_lit!(sfx_defeated11, suffixes, r"a\B");
- test_lit!(sfx_defeated12, suffixes, "a^*");
- test_lit!(sfx_defeated13, suffixes, "a^+");
-
- // These test use a much lower limit than the default so that we can
- // write test cases of reasonable size.
- test_exhausted!(sfx_exhausted1, suffixes, "[a-z]");
- test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*");
- test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z"));
- test_exhausted!(
- sfx_exhausted4,
- suffixes,
- "(?i-u)foobar",
- C("AR"),
- C("Ar"),
- C("aR"),
- C("ar")
- );
- test_exhausted!(
- sfx_exhausted5,
- suffixes,
- "(?:ab){100}",
- C("abababababababababab")
- );
- test_exhausted!(
- sfx_exhausted6,
- suffixes,
- "cd(?:(?:ab){100})*",
- C("ababababab"),
- M("cd")
- );
- test_exhausted!(
- sfx_exhausted7,
- suffixes,
- "cd(?:(?:ab){100})*z",
- C("abababababz"),
- M("cdz")
- );
- test_exhausted!(
- sfx_exhausted8,
- suffixes,
- "zaaaaaaaaaaaaaaaaaaaa",
- C("aaaaaaaaaaaaaaaaaaaa")
- );
-
- // ************************************************************************
- // Tests for generating unambiguous literal sets.
- // ************************************************************************
-
- macro_rules! test_unamb {
- ($name:ident, $given:expr, $expected:expr) => {
- #[test]
- fn $name() {
- let given: Vec<Literal> = $given
- .into_iter()
- .map(|ul| {
- let cut = ul.is_cut();
- Literal { v: ul.v.into_bytes(), cut: cut }
- })
- .collect();
- let lits = create_lits(given);
- let got = lits.unambiguous_prefixes();
- assert_eq!($expected, escape_lits(got.literals()));
- }
- };
- }
-
- test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]);
- test_unamb!(
- unambiguous2,
- vec![M("zaaaaaa"), M("aa")],
- vec![C("aa"), C("z")]
- );
- test_unamb!(
- unambiguous3,
- vec![M("Sherlock"), M("Watson")],
- vec![M("Sherlock"), M("Watson")]
- );
- test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]);
- test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]);
- test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]);
- test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]);
- test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]);
- test_unamb!(
- unambiguous9,
- vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
- vec![C("a"), C("b"), C("c")]
- );
- test_unamb!(
- unambiguous10,
- vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
- vec![C("Mo"), C("Mu")]
- );
- test_unamb!(
- unambiguous11,
- vec![M("zazb"), M("azb")],
- vec![C("a"), C("z")]
- );
- test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]);
- test_unamb!(
- unambiguous13,
- vec![M("ABCX"), M("CDAX"), M("BCX")],
- vec![C("A"), C("BCX"), C("CD")]
- );
- test_unamb!(
- unambiguous14,
- vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
- vec![M("DSX"), C("I"), C("MGX"), C("MV")]
- );
- test_unamb!(
- unambiguous15,
- vec![M("IMG_"), M("MG_"), M("CIMG")],
- vec![C("C"), C("I"), C("MG_")]
- );
-
- // ************************************************************************
- // Tests for suffix trimming.
- // ************************************************************************
- macro_rules! test_trim {
- ($name:ident, $trim:expr, $given:expr, $expected:expr) => {
- #[test]
- fn $name() {
- let given: Vec<Literal> = $given
- .into_iter()
- .map(|ul| {
- let cut = ul.is_cut();
- Literal { v: ul.v.into_bytes(), cut: cut }
- })
- .collect();
- let lits = create_lits(given);
- let got = lits.trim_suffix($trim).unwrap();
- assert_eq!($expected, escape_lits(got.literals()));
- }
- };
- }
-
- test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]);
- test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]);
- test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]);
- test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]);
-
- // ************************************************************************
- // Tests for longest common prefix.
- // ************************************************************************
-
- macro_rules! test_lcp {
- ($name:ident, $given:expr, $expected:expr) => {
- #[test]
- fn $name() {
- let given: Vec<Literal> = $given
- .into_iter()
- .map(|s: &str| Literal {
- v: s.to_owned().into_bytes(),
- cut: false,
- })
- .collect();
- let lits = create_lits(given);
- let got = lits.longest_common_prefix();
- assert_eq!($expected, escape_bytes(got));
- }
- };
- }
-
- test_lcp!(lcp1, vec!["a"], "a");
- test_lcp!(lcp2, vec![], "");
- test_lcp!(lcp3, vec!["a", "b"], "");
- test_lcp!(lcp4, vec!["ab", "ab"], "ab");
- test_lcp!(lcp5, vec!["ab", "a"], "a");
- test_lcp!(lcp6, vec!["a", "ab"], "a");
- test_lcp!(lcp7, vec!["ab", "b"], "");
- test_lcp!(lcp8, vec!["b", "ab"], "");
- test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba");
- test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], "");
- test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], "");
- test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f");
-
- // ************************************************************************
- // Tests for longest common suffix.
- // ************************************************************************
-
- macro_rules! test_lcs {
- ($name:ident, $given:expr, $expected:expr) => {
- #[test]
- fn $name() {
- let given: Vec<Literal> = $given
- .into_iter()
- .map(|s: &str| Literal {
- v: s.to_owned().into_bytes(),
- cut: false,
- })
- .collect();
- let lits = create_lits(given);
- let got = lits.longest_common_suffix();
- assert_eq!($expected, escape_bytes(got));
- }
- };
- }
-
- test_lcs!(lcs1, vec!["a"], "a");
- test_lcs!(lcs2, vec![], "");
- test_lcs!(lcs3, vec!["a", "b"], "");
- test_lcs!(lcs4, vec!["ab", "ab"], "ab");
- test_lcs!(lcs5, vec!["ab", "a"], "");
- test_lcs!(lcs6, vec!["a", "ab"], "");
- test_lcs!(lcs7, vec!["ab", "b"], "b");
- test_lcs!(lcs8, vec!["b", "ab"], "b");
- test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo");
- test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], "");
- test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], "");
- test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b");
-}
diff --git a/vendor/regex-syntax/src/hir/mod.rs b/vendor/regex-syntax/src/hir/mod.rs
index 1096e9f05..e5ea3701b 100644
--- a/vendor/regex-syntax/src/hir/mod.rs
+++ b/vendor/regex-syntax/src/hir/mod.rs
@@ -1,19 +1,42 @@
/*!
-Defines a high-level intermediate representation for regular expressions.
+Defines a high-level intermediate representation (HIR) for regular expressions.
+
+The HIR is represented by the [`Hir`] type, and is principally constructed via
+[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users
+may use the smart constructors defined on `Hir` to build their own by hand. The
+smart constructors simultaneously simplify and "optimize" the HIR, and are also
+the same routines used by translation.
+
+Most regex engines only have an HIR like this, and usually construct it
+directly from the concrete syntax. This crate however first parses the
+concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`,
+as mentioned above. It's done this way to facilitate better error reporting,
+and to have a structured representation of a regex that faithfully represents
+its concrete syntax. Namely, while an `Hir` value can be converted back to an
+equivalent regex pattern string, it is unlikely to look like the original due
+to its simplified structure.
*/
-use std::char;
-use std::cmp;
-use std::error;
-use std::fmt;
-use std::result;
-use std::u8;
-use crate::ast::Span;
-use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter};
-use crate::unicode;
+use core::{char, cmp};
-pub use crate::hir::visitor::{visit, Visitor};
-pub use crate::unicode::CaseFoldError;
+use alloc::{
+ boxed::Box,
+ format,
+ string::{String, ToString},
+ vec,
+ vec::Vec,
+};
+
+use crate::{
+ ast::Span,
+ hir::interval::{Interval, IntervalSet, IntervalSetIter},
+ unicode,
+};
+
+pub use crate::{
+ hir::visitor::{visit, Visitor},
+ unicode::CaseFoldError,
+};
mod interval;
pub mod literal;
@@ -53,13 +76,17 @@ impl Error {
}
/// The type of an error that occurred while building an `Hir`.
+///
+/// This error type is marked as `non_exhaustive`. This means that adding a
+/// new variant is not considered a breaking change.
+#[non_exhaustive]
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ErrorKind {
/// This error occurs when a Unicode feature is used when Unicode
/// support is disabled. For example `(?-u:\pL)` would trigger this error.
UnicodeNotAllowed,
/// This error occurs when translating a pattern that could match a byte
- /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled.
+ /// sequence that isn't UTF-8 and `utf8` was enabled.
InvalidUtf8,
/// This occurs when an unrecognized Unicode property name could not
/// be found.
@@ -75,27 +102,22 @@ pub enum ErrorKind {
/// available, and the regular expression required Unicode aware case
/// insensitivity.
UnicodeCaseUnavailable,
- /// This occurs when the translator attempts to construct a character class
- /// that is empty.
- ///
- /// Note that this restriction in the translator may be removed in the
- /// future.
- EmptyClassNotAllowed,
- /// Hints that destructuring should not be exhaustive.
- ///
- /// This enum may grow additional variants, so this makes sure clients
- /// don't count on exhaustive matching. (Otherwise, adding a new variant
- /// could break existing code.)
- #[doc(hidden)]
- __Nonexhaustive,
}
-impl ErrorKind {
- // TODO: Remove this method entirely on the next breaking semver release.
- #[allow(deprecated)]
- fn description(&self) -> &str {
+#[cfg(feature = "std")]
+impl std::error::Error for Error {}
+
+impl core::fmt::Display for Error {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ crate::error::Formatter::from(self).fmt(f)
+ }
+}
+
+impl core::fmt::Display for ErrorKind {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
use self::ErrorKind::*;
- match *self {
+
+ let msg = match *self {
UnicodeNotAllowed => "Unicode not allowed here",
InvalidUtf8 => "pattern can match invalid UTF-8",
UnicodePropertyNotFound => "Unicode property not found",
@@ -108,112 +130,82 @@ impl ErrorKind {
"Unicode-aware case insensitivity matching is not available \
(make sure the unicode-case feature is enabled)"
}
- EmptyClassNotAllowed => "empty character classes are not allowed",
- __Nonexhaustive => unreachable!(),
- }
- }
-}
-
-impl error::Error for Error {
- // TODO: Remove this method entirely on the next breaking semver release.
- #[allow(deprecated)]
- fn description(&self) -> &str {
- self.kind.description()
- }
-}
-
-impl fmt::Display for Error {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- crate::error::Formatter::from(self).fmt(f)
- }
-}
-
-impl fmt::Display for ErrorKind {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- // TODO: Remove this on the next breaking semver release.
- #[allow(deprecated)]
- f.write_str(self.description())
+ };
+ f.write_str(msg)
}
}
/// A high-level intermediate representation (HIR) for a regular expression.
///
-/// The HIR of a regular expression represents an intermediate step between its
-/// abstract syntax (a structured description of the concrete syntax) and
-/// compiled byte codes. The purpose of HIR is to make regular expressions
+/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`].
+/// An `HirKind` indicates what kind of regular expression it is (a literal,
+/// a repetition, a look-around assertion, etc.), whereas a `Properties`
+/// describes various facts about the regular expression. For example, whether
+/// it matches UTF-8 or if it matches the empty string.
+///
+/// The HIR of a regular expression represents an intermediate step between
+/// its abstract syntax (a structured description of the concrete syntax) and
+/// an actual regex matcher. The purpose of HIR is to make regular expressions
/// easier to analyze. In particular, the AST is much more complex than the
/// HIR. For example, while an AST supports arbitrarily nested character
/// classes, the HIR will flatten all nested classes into a single set. The HIR
/// will also "compile away" every flag present in the concrete syntax. For
/// example, users of HIR expressions never need to worry about case folding;
-/// it is handled automatically by the translator (e.g., by translating `(?i)A`
-/// to `[aA]`).
-///
-/// If the HIR was produced by a translator that disallows invalid UTF-8, then
-/// the HIR is guaranteed to match UTF-8 exclusively.
-///
-/// This type defines its own destructor that uses constant stack space and
-/// heap space proportional to the size of the HIR.
+/// it is handled automatically by the translator (e.g., by translating
+/// `(?i:A)` to `[aA]`).
///
/// The specific type of an HIR expression can be accessed via its `kind`
/// or `into_kind` methods. This extra level of indirection exists for two
/// reasons:
///
-/// 1. Construction of an HIR expression *must* use the constructor methods
-/// on this `Hir` type instead of building the `HirKind` values directly.
-/// This permits construction to enforce invariants like "concatenations
-/// always consist of two or more sub-expressions."
+/// 1. Construction of an HIR expression *must* use the constructor methods on
+/// this `Hir` type instead of building the `HirKind` values directly. This
+/// permits construction to enforce invariants like "concatenations always
+/// consist of two or more sub-expressions."
/// 2. Every HIR expression contains attributes that are defined inductively,
-/// and can be computed cheaply during the construction process. For
-/// example, one such attribute is whether the expression must match at the
-/// beginning of the text.
+/// and can be computed cheaply during the construction process. For example,
+/// one such attribute is whether the expression must match at the beginning of
+/// the haystack.
+///
+/// In particular, if you have an `HirKind` value, then there is intentionally
+/// no way to build an `Hir` value from it. You instead need to do case
+/// analysis on the `HirKind` value and build the `Hir` value using its smart
+/// constructors.
+///
+/// # UTF-8
+///
+/// If the HIR was produced by a translator with
+/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled,
+/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty
+/// matches.
+///
+/// For empty matches, those can occur at any position. It is the
+/// responsibility of the regex engine to determine whether empty matches are
+/// permitted between the code units of a single codepoint.
+///
+/// # Stack space
+///
+/// This type defines its own destructor that uses constant stack space and
+/// heap space proportional to the size of the HIR.
///
/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
/// expression pattern string, and uses constant stack space and heap space
-/// proportional to the size of the `Hir`.
-#[derive(Clone, Debug, Eq, PartialEq)]
+/// proportional to the size of the `Hir`. The regex it prints is guaranteed to
+/// be _semantically_ equivalent to the original concrete syntax, but it may
+/// look very different. (And potentially not practically readable by a human.)
+///
+/// An `Hir`'s `fmt::Debug` implementation currently does not use constant
+/// stack space. The implementation will also suppress some details (such as
+/// the `Properties` inlined into every `Hir` value to make it less noisy).
+#[derive(Clone, Eq, PartialEq)]
pub struct Hir {
/// The underlying HIR kind.
kind: HirKind,
/// Analysis info about this HIR, computed during construction.
- info: HirInfo,
-}
-
-/// The kind of an arbitrary `Hir` expression.
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub enum HirKind {
- /// The empty regular expression, which matches everything, including the
- /// empty string.
- Empty,
- /// A single literal character that matches exactly this character.
- Literal(Literal),
- /// A single character class that matches any of the characters in the
- /// class. A class can either consist of Unicode scalar values as
- /// characters, or it can use bytes.
- Class(Class),
- /// An anchor assertion. An anchor assertion match always has zero length.
- Anchor(Anchor),
- /// A word boundary assertion, which may or may not be Unicode aware. A
- /// word boundary assertion match always has zero length.
- WordBoundary(WordBoundary),
- /// A repetition operation applied to a child expression.
- Repetition(Repetition),
- /// A possibly capturing group, which contains a child expression.
- Group(Group),
- /// A concatenation of expressions. A concatenation always has at least two
- /// child expressions.
- ///
- /// A concatenation matches only if each of its child expression matches
- /// one after the other.
- Concat(Vec<Hir>),
- /// An alternation of expressions. An alternation always has at least two
- /// child expressions.
- ///
- /// An alternation matches only if at least one of its child expression
- /// matches. If multiple expressions match, then the leftmost is preferred.
- Alternation(Vec<Hir>),
+ props: Properties,
}
+/// Methods for accessing the underlying `HirKind` and `Properties`.
impl Hir {
/// Returns a reference to the underlying HIR kind.
pub fn kind(&self) -> &HirKind {
@@ -223,543 +215,560 @@ impl Hir {
/// Consumes ownership of this HIR expression and returns its underlying
/// `HirKind`.
pub fn into_kind(mut self) -> HirKind {
- use std::mem;
- mem::replace(&mut self.kind, HirKind::Empty)
+ core::mem::replace(&mut self.kind, HirKind::Empty)
+ }
+
+ /// Returns the properties computed for this `Hir`.
+ pub fn properties(&self) -> &Properties {
+ &self.props
+ }
+
+ /// Splits this HIR into its constituent parts.
+ ///
+ /// This is useful because `let Hir { kind, props } = hir;` does not work
+ /// because of `Hir`'s custom `Drop` implementation.
+ fn into_parts(mut self) -> (HirKind, Properties) {
+ (
+ core::mem::replace(&mut self.kind, HirKind::Empty),
+ core::mem::replace(&mut self.props, Properties::empty()),
+ )
}
+}
+/// Smart constructors for HIR values.
+///
+/// These constructors are called "smart" because they do inductive work or
+/// simplifications. For example, calling `Hir::repetition` with a repetition
+/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind
+/// since it is equivalent to an empty regex. Another example is calling
+/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll
+/// just get back the original `expr` since it's precisely equivalent.
+///
+/// Smart constructors enable maintaining invariants about the HIR data type
+/// while also simultaneously keeping the representation as simple as possible.
+impl Hir {
/// Returns an empty HIR expression.
///
/// An empty HIR expression always matches, including the empty string.
+ #[inline]
pub fn empty() -> Hir {
- let mut info = HirInfo::new();
- info.set_always_utf8(true);
- info.set_all_assertions(true);
- info.set_anchored_start(false);
- info.set_anchored_end(false);
- info.set_line_anchored_start(false);
- info.set_line_anchored_end(false);
- info.set_any_anchored_start(false);
- info.set_any_anchored_end(false);
- info.set_match_empty(true);
- info.set_literal(false);
- info.set_alternation_literal(false);
- Hir { kind: HirKind::Empty, info }
+ let props = Properties::empty();
+ Hir { kind: HirKind::Empty, props }
+ }
+
+ /// Returns an HIR expression that can never match anything. That is,
+ /// the size of the set of strings in the language described by the HIR
+ /// returned is `0`.
+ ///
+ /// This is distinct from [`Hir::empty`] in that the empty string matches
+ /// the HIR returned by `Hir::empty`. That is, the set of strings in the
+ /// language described by `Hir::empty` is non-empty.
+ ///
+ /// Note that currently, the HIR returned uses an empty character class to
+ /// indicate that nothing can match. An equivalent expression that cannot
+ /// match is an empty alternation, but all such "fail" expressions are
+ /// normalized (via smart constructors) to empty character classes. This is
+ /// because empty character classes can be spelled in the concrete syntax
+ /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but
+ /// empty alternations cannot.
+ #[inline]
+ pub fn fail() -> Hir {
+ let class = Class::Bytes(ClassBytes::empty());
+ let props = Properties::class(&class);
+ // We can't just call Hir::class here because it defers to Hir::fail
+ // in order to canonicalize the Hir value used to represent "cannot
+ // match."
+ Hir { kind: HirKind::Class(class), props }
}
/// Creates a literal HIR expression.
///
- /// If the given literal has a `Byte` variant with an ASCII byte, then this
- /// method panics. This enforces the invariant that `Byte` variants are
- /// only used to express matching of invalid UTF-8.
- pub fn literal(lit: Literal) -> Hir {
- if let Literal::Byte(b) = lit {
- assert!(b > 0x7F);
+ /// This accepts anything that can be converted into a `Box<[u8]>`.
+ ///
+ /// Note that there is no mechanism for storing a `char` or a `Box<str>`
+ /// in an HIR. Everything is "just bytes." Whether a `Literal` (or
+ /// any HIR node) matches valid UTF-8 exclusively can be queried via
+ /// [`Properties::is_utf8`].
+ ///
+ /// # Example
+ ///
+ /// This example shows that concatenations of `Literal` HIR values will
+ /// automatically get flattened and combined together. So for example, even
+ /// if you concat multiple `Literal` values that are themselves not valid
+ /// UTF-8, they might add up to valid UTF-8. This also demonstrates just
+ /// how "smart" Hir's smart constructors are.
+ ///
+ /// ```
+ /// use regex_syntax::hir::{Hir, HirKind, Literal};
+ ///
+ /// let literals = vec![
+ /// Hir::literal([0xE2]),
+ /// Hir::literal([0x98]),
+ /// Hir::literal([0x83]),
+ /// ];
+ /// // Each literal, on its own, is invalid UTF-8.
+ /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8()));
+ ///
+ /// let concat = Hir::concat(literals);
+ /// // But the concatenation is valid UTF-8!
+ /// assert!(concat.properties().is_utf8());
+ ///
+ /// // And also notice that the literals have been concatenated into a
+ /// // single `Literal`, to the point where there is no explicit `Concat`!
+ /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
+ /// assert_eq!(&expected, concat.kind());
+ /// ```
+ #[inline]
+ pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
+ let bytes = lit.into();
+ if bytes.is_empty() {
+ return Hir::empty();
}
- let mut info = HirInfo::new();
- info.set_always_utf8(lit.is_unicode());
- info.set_all_assertions(false);
- info.set_anchored_start(false);
- info.set_anchored_end(false);
- info.set_line_anchored_start(false);
- info.set_line_anchored_end(false);
- info.set_any_anchored_start(false);
- info.set_any_anchored_end(false);
- info.set_match_empty(false);
- info.set_literal(true);
- info.set_alternation_literal(true);
- Hir { kind: HirKind::Literal(lit), info }
- }
-
- /// Creates a class HIR expression.
+ let lit = Literal(bytes);
+ let props = Properties::literal(&lit);
+ Hir { kind: HirKind::Literal(lit), props }
+ }
+
+ /// Creates a class HIR expression. The class may either be defined over
+ /// ranges of Unicode codepoints or ranges of raw byte values.
+ ///
+ /// Note that an empty class is permitted. An empty class is equivalent to
+ /// `Hir::fail()`.
+ #[inline]
pub fn class(class: Class) -> Hir {
- let mut info = HirInfo::new();
- info.set_always_utf8(class.is_always_utf8());
- info.set_all_assertions(false);
- info.set_anchored_start(false);
- info.set_anchored_end(false);
- info.set_line_anchored_start(false);
- info.set_line_anchored_end(false);
- info.set_any_anchored_start(false);
- info.set_any_anchored_end(false);
- info.set_match_empty(false);
- info.set_literal(false);
- info.set_alternation_literal(false);
- Hir { kind: HirKind::Class(class), info }
- }
-
- /// Creates an anchor assertion HIR expression.
- pub fn anchor(anchor: Anchor) -> Hir {
- let mut info = HirInfo::new();
- info.set_always_utf8(true);
- info.set_all_assertions(true);
- info.set_anchored_start(false);
- info.set_anchored_end(false);
- info.set_line_anchored_start(false);
- info.set_line_anchored_end(false);
- info.set_any_anchored_start(false);
- info.set_any_anchored_end(false);
- info.set_match_empty(true);
- info.set_literal(false);
- info.set_alternation_literal(false);
- if let Anchor::StartText = anchor {
- info.set_anchored_start(true);
- info.set_line_anchored_start(true);
- info.set_any_anchored_start(true);
- }
- if let Anchor::EndText = anchor {
- info.set_anchored_end(true);
- info.set_line_anchored_end(true);
- info.set_any_anchored_end(true);
+ if class.is_empty() {
+ return Hir::fail();
+ } else if let Some(bytes) = class.literal() {
+ return Hir::literal(bytes);
}
- if let Anchor::StartLine = anchor {
- info.set_line_anchored_start(true);
- }
- if let Anchor::EndLine = anchor {
- info.set_line_anchored_end(true);
- }
- Hir { kind: HirKind::Anchor(anchor), info }
- }
-
- /// Creates a word boundary assertion HIR expression.
- pub fn word_boundary(word_boundary: WordBoundary) -> Hir {
- let mut info = HirInfo::new();
- info.set_always_utf8(true);
- info.set_all_assertions(true);
- info.set_anchored_start(false);
- info.set_anchored_end(false);
- info.set_line_anchored_start(false);
- info.set_line_anchored_end(false);
- info.set_any_anchored_start(false);
- info.set_any_anchored_end(false);
- info.set_literal(false);
- info.set_alternation_literal(false);
- // A negated word boundary matches '', so that's fine. But \b does not
- // match \b, so why do we say it can match the empty string? Well,
- // because, if you search for \b against 'a', it will report [0, 0) and
- // [1, 1) as matches, and both of those matches correspond to the empty
- // string. Thus, only *certain* empty strings match \b, which similarly
- // applies to \B.
- info.set_match_empty(true);
- // Negated ASCII word boundaries can match invalid UTF-8.
- if let WordBoundary::AsciiNegate = word_boundary {
- info.set_always_utf8(false);
- }
- Hir { kind: HirKind::WordBoundary(word_boundary), info }
+ let props = Properties::class(&class);
+ Hir { kind: HirKind::Class(class), props }
+ }
+
+ /// Creates a look-around assertion HIR expression.
+ #[inline]
+ pub fn look(look: Look) -> Hir {
+ let props = Properties::look(look);
+ Hir { kind: HirKind::Look(look), props }
}
/// Creates a repetition HIR expression.
+ #[inline]
pub fn repetition(rep: Repetition) -> Hir {
- let mut info = HirInfo::new();
- info.set_always_utf8(rep.hir.is_always_utf8());
- info.set_all_assertions(rep.hir.is_all_assertions());
- // If this operator can match the empty string, then it can never
- // be anchored.
- info.set_anchored_start(
- !rep.is_match_empty() && rep.hir.is_anchored_start(),
- );
- info.set_anchored_end(
- !rep.is_match_empty() && rep.hir.is_anchored_end(),
- );
- info.set_line_anchored_start(
- !rep.is_match_empty() && rep.hir.is_anchored_start(),
- );
- info.set_line_anchored_end(
- !rep.is_match_empty() && rep.hir.is_anchored_end(),
- );
- info.set_any_anchored_start(rep.hir.is_any_anchored_start());
- info.set_any_anchored_end(rep.hir.is_any_anchored_end());
- info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty());
- info.set_literal(false);
- info.set_alternation_literal(false);
- Hir { kind: HirKind::Repetition(rep), info }
- }
-
- /// Creates a group HIR expression.
- pub fn group(group: Group) -> Hir {
- let mut info = HirInfo::new();
- info.set_always_utf8(group.hir.is_always_utf8());
- info.set_all_assertions(group.hir.is_all_assertions());
- info.set_anchored_start(group.hir.is_anchored_start());
- info.set_anchored_end(group.hir.is_anchored_end());
- info.set_line_anchored_start(group.hir.is_line_anchored_start());
- info.set_line_anchored_end(group.hir.is_line_anchored_end());
- info.set_any_anchored_start(group.hir.is_any_anchored_start());
- info.set_any_anchored_end(group.hir.is_any_anchored_end());
- info.set_match_empty(group.hir.is_match_empty());
- info.set_literal(false);
- info.set_alternation_literal(false);
- Hir { kind: HirKind::Group(group), info }
+ // The regex 'a{0}' is always equivalent to the empty regex. This is
+ // true even when 'a' is an expression that never matches anything
+ // (like '\P{any}').
+ //
+ // Additionally, the regex 'a{1}' is always equivalent to 'a'.
+ if rep.min == 0 && rep.max == Some(0) {
+ return Hir::empty();
+ } else if rep.min == 1 && rep.max == Some(1) {
+ return *rep.sub;
+ }
+ let props = Properties::repetition(&rep);
+ Hir { kind: HirKind::Repetition(rep), props }
+ }
+
+ /// Creates a capture HIR expression.
+ ///
+ /// Note that there is no explicit HIR value for a non-capturing group.
+ /// Since a non-capturing group only exists to override precedence in the
+ /// concrete syntax and since an HIR already does its own grouping based on
+ /// what is parsed, there is no need to explicitly represent non-capturing
+ /// groups in the HIR.
+ #[inline]
+ pub fn capture(capture: Capture) -> Hir {
+ let props = Properties::capture(&capture);
+ Hir { kind: HirKind::Capture(capture), props }
}
/// Returns the concatenation of the given expressions.
///
- /// This flattens the concatenation as appropriate.
- pub fn concat(mut exprs: Vec<Hir>) -> Hir {
- match exprs.len() {
- 0 => Hir::empty(),
- 1 => exprs.pop().unwrap(),
- _ => {
- let mut info = HirInfo::new();
- info.set_always_utf8(true);
- info.set_all_assertions(true);
- info.set_any_anchored_start(false);
- info.set_any_anchored_end(false);
- info.set_match_empty(true);
- info.set_literal(true);
- info.set_alternation_literal(true);
-
- // Some attributes require analyzing all sub-expressions.
- for e in &exprs {
- let x = info.is_always_utf8() && e.is_always_utf8();
- info.set_always_utf8(x);
-
- let x = info.is_all_assertions() && e.is_all_assertions();
- info.set_all_assertions(x);
-
- let x = info.is_any_anchored_start()
- || e.is_any_anchored_start();
- info.set_any_anchored_start(x);
-
- let x =
- info.is_any_anchored_end() || e.is_any_anchored_end();
- info.set_any_anchored_end(x);
-
- let x = info.is_match_empty() && e.is_match_empty();
- info.set_match_empty(x);
-
- let x = info.is_literal() && e.is_literal();
- info.set_literal(x);
-
- let x = info.is_alternation_literal()
- && e.is_alternation_literal();
- info.set_alternation_literal(x);
+ /// This attempts to flatten and simplify the concatenation as appropriate.
+ ///
+ /// # Example
+ ///
+ /// This shows a simple example of basic flattening of both concatenations
+ /// and literals.
+ ///
+ /// ```
+ /// use regex_syntax::hir::Hir;
+ ///
+ /// let hir = Hir::concat(vec![
+ /// Hir::concat(vec![
+ /// Hir::literal([b'a']),
+ /// Hir::literal([b'b']),
+ /// Hir::literal([b'c']),
+ /// ]),
+ /// Hir::concat(vec![
+ /// Hir::literal([b'x']),
+ /// Hir::literal([b'y']),
+ /// Hir::literal([b'z']),
+ /// ]),
+ /// ]);
+ /// let expected = Hir::literal("abcxyz".as_bytes());
+ /// assert_eq!(expected, hir);
+ /// ```
+ pub fn concat(subs: Vec<Hir>) -> Hir {
+ // We rebuild the concatenation by simplifying it. Would be nice to do
+ // it in place, but that seems a little tricky?
+ let mut new = vec![];
+ // This gobbles up any adjacent literals in a concatenation and smushes
+ // them together. Basically, when we see a literal, we add its bytes
+ // to 'prior_lit', and whenever we see anything else, we first take
+ // any bytes in 'prior_lit' and add it to the 'new' concatenation.
+ let mut prior_lit: Option<Vec<u8>> = None;
+ for sub in subs {
+ let (kind, props) = sub.into_parts();
+ match kind {
+ HirKind::Literal(Literal(bytes)) => {
+ if let Some(ref mut prior_bytes) = prior_lit {
+ prior_bytes.extend_from_slice(&bytes);
+ } else {
+ prior_lit = Some(bytes.to_vec());
+ }
+ }
+ // We also flatten concats that are direct children of another
+ // concat. We only need to do this one level deep since
+ // Hir::concat is the only way to build concatenations, and so
+ // flattening happens inductively.
+ HirKind::Concat(subs2) => {
+ for sub2 in subs2 {
+ let (kind2, props2) = sub2.into_parts();
+ match kind2 {
+ HirKind::Literal(Literal(bytes)) => {
+ if let Some(ref mut prior_bytes) = prior_lit {
+ prior_bytes.extend_from_slice(&bytes);
+ } else {
+ prior_lit = Some(bytes.to_vec());
+ }
+ }
+ kind2 => {
+ if let Some(prior_bytes) = prior_lit.take() {
+ new.push(Hir::literal(prior_bytes));
+ }
+ new.push(Hir { kind: kind2, props: props2 });
+ }
+ }
+ }
+ }
+ // We can just skip empty HIRs.
+ HirKind::Empty => {}
+ kind => {
+ if let Some(prior_bytes) = prior_lit.take() {
+ new.push(Hir::literal(prior_bytes));
+ }
+ new.push(Hir { kind, props });
}
- // Anchored attributes require something slightly more
- // sophisticated. Normally, WLOG, to determine whether an
- // expression is anchored to the start, we'd only need to check
- // the first expression of a concatenation. However,
- // expressions like `$\b^` are still anchored to the start,
- // but the first expression in the concatenation *isn't*
- // anchored to the start. So the "first" expression to look at
- // is actually one that is either not an assertion or is
- // specifically the StartText assertion.
- info.set_anchored_start(
- exprs
- .iter()
- .take_while(|e| {
- e.is_anchored_start() || e.is_all_assertions()
- })
- .any(|e| e.is_anchored_start()),
- );
- // Similarly for the end anchor, but in reverse.
- info.set_anchored_end(
- exprs
- .iter()
- .rev()
- .take_while(|e| {
- e.is_anchored_end() || e.is_all_assertions()
- })
- .any(|e| e.is_anchored_end()),
- );
- // Repeat the process for line anchors.
- info.set_line_anchored_start(
- exprs
- .iter()
- .take_while(|e| {
- e.is_line_anchored_start() || e.is_all_assertions()
- })
- .any(|e| e.is_line_anchored_start()),
- );
- info.set_line_anchored_end(
- exprs
- .iter()
- .rev()
- .take_while(|e| {
- e.is_line_anchored_end() || e.is_all_assertions()
- })
- .any(|e| e.is_line_anchored_end()),
- );
- Hir { kind: HirKind::Concat(exprs), info }
}
}
+ if let Some(prior_bytes) = prior_lit.take() {
+ new.push(Hir::literal(prior_bytes));
+ }
+ if new.is_empty() {
+ return Hir::empty();
+ } else if new.len() == 1 {
+ return new.pop().unwrap();
+ }
+ let props = Properties::concat(&new);
+ Hir { kind: HirKind::Concat(new), props }
}
/// Returns the alternation of the given expressions.
///
- /// This flattens the alternation as appropriate.
- pub fn alternation(mut exprs: Vec<Hir>) -> Hir {
- match exprs.len() {
- 0 => Hir::empty(),
- 1 => exprs.pop().unwrap(),
- _ => {
- let mut info = HirInfo::new();
- info.set_always_utf8(true);
- info.set_all_assertions(true);
- info.set_anchored_start(true);
- info.set_anchored_end(true);
- info.set_line_anchored_start(true);
- info.set_line_anchored_end(true);
- info.set_any_anchored_start(false);
- info.set_any_anchored_end(false);
- info.set_match_empty(false);
- info.set_literal(false);
- info.set_alternation_literal(true);
-
- // Some attributes require analyzing all sub-expressions.
- for e in &exprs {
- let x = info.is_always_utf8() && e.is_always_utf8();
- info.set_always_utf8(x);
-
- let x = info.is_all_assertions() && e.is_all_assertions();
- info.set_all_assertions(x);
-
- let x = info.is_anchored_start() && e.is_anchored_start();
- info.set_anchored_start(x);
-
- let x = info.is_anchored_end() && e.is_anchored_end();
- info.set_anchored_end(x);
-
- let x = info.is_line_anchored_start()
- && e.is_line_anchored_start();
- info.set_line_anchored_start(x);
-
- let x = info.is_line_anchored_end()
- && e.is_line_anchored_end();
- info.set_line_anchored_end(x);
-
- let x = info.is_any_anchored_start()
- || e.is_any_anchored_start();
- info.set_any_anchored_start(x);
-
- let x =
- info.is_any_anchored_end() || e.is_any_anchored_end();
- info.set_any_anchored_end(x);
-
- let x = info.is_match_empty() || e.is_match_empty();
- info.set_match_empty(x);
-
- let x = info.is_alternation_literal() && e.is_literal();
- info.set_alternation_literal(x);
+ /// This flattens and simplifies the alternation as appropriate. This may
+ /// include factoring out common prefixes or even rewriting the alternation
+ /// as a character class.
+ ///
+ /// Note that an empty alternation is equivalent to `Hir::fail()`. (It
+ /// is not possible for one to write an empty alternation, or even an
+ /// alternation with a single sub-expression, in the concrete syntax of a
+ /// regex.)
+ ///
+ /// # Example
+ ///
+ /// This is a simple example showing how an alternation might get
+ /// simplified.
+ ///
+ /// ```
+ /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
+ ///
+ /// let hir = Hir::alternation(vec![
+ /// Hir::literal([b'a']),
+ /// Hir::literal([b'b']),
+ /// Hir::literal([b'c']),
+ /// Hir::literal([b'd']),
+ /// Hir::literal([b'e']),
+ /// Hir::literal([b'f']),
+ /// ]);
+ /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([
+ /// ClassUnicodeRange::new('a', 'f'),
+ /// ])));
+ /// assert_eq!(expected, hir);
+ /// ```
+ ///
+ /// And another example showing how common prefixes might get factored
+ /// out.
+ ///
+ /// ```
+ /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
+ ///
+ /// let hir = Hir::alternation(vec![
+ /// Hir::concat(vec![
+ /// Hir::literal("abc".as_bytes()),
+ /// Hir::class(Class::Unicode(ClassUnicode::new([
+ /// ClassUnicodeRange::new('A', 'Z'),
+ /// ]))),
+ /// ]),
+ /// Hir::concat(vec![
+ /// Hir::literal("abc".as_bytes()),
+ /// Hir::class(Class::Unicode(ClassUnicode::new([
+ /// ClassUnicodeRange::new('a', 'z'),
+ /// ]))),
+ /// ]),
+ /// ]);
+ /// let expected = Hir::concat(vec![
+ /// Hir::literal("abc".as_bytes()),
+ /// Hir::alternation(vec![
+ /// Hir::class(Class::Unicode(ClassUnicode::new([
+ /// ClassUnicodeRange::new('A', 'Z'),
+ /// ]))),
+ /// Hir::class(Class::Unicode(ClassUnicode::new([
+ /// ClassUnicodeRange::new('a', 'z'),
+ /// ]))),
+ /// ]),
+ /// ]);
+ /// assert_eq!(expected, hir);
+ /// ```
+ ///
+ /// Note that these sorts of simplifications are not guaranteed.
+ pub fn alternation(subs: Vec<Hir>) -> Hir {
+ // We rebuild the alternation by simplifying it. We proceed similarly
+ // as the concatenation case. But in this case, there's no literal
+ // simplification happening. We're just flattening alternations.
+ let mut new = vec![];
+ for sub in subs {
+ let (kind, props) = sub.into_parts();
+ match kind {
+ HirKind::Alternation(subs2) => {
+ new.extend(subs2);
+ }
+ kind => {
+ new.push(Hir { kind, props });
}
- Hir { kind: HirKind::Alternation(exprs), info }
}
}
- }
-
- /// Build an HIR expression for `.`.
- ///
- /// A `.` expression matches any character except for `\n`. To build an
- /// expression that matches any character, including `\n`, use the `any`
- /// method.
- ///
- /// If `bytes` is `true`, then this assumes characters are limited to a
- /// single byte.
- pub fn dot(bytes: bool) -> Hir {
- if bytes {
- let mut cls = ClassBytes::empty();
- cls.push(ClassBytesRange::new(b'\0', b'\x09'));
- cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
- Hir::class(Class::Bytes(cls))
- } else {
- let mut cls = ClassUnicode::empty();
- cls.push(ClassUnicodeRange::new('\0', '\x09'));
- cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
- Hir::class(Class::Unicode(cls))
+ if new.is_empty() {
+ return Hir::fail();
+ } else if new.len() == 1 {
+ return new.pop().unwrap();
+ }
+ // Now that it's completely flattened, look for the special case of
+ // 'char1|char2|...|charN' and collapse that into a class. Note that
+ // we look for 'char' first and then bytes. The issue here is that if
+ // we find both non-ASCII codepoints and non-ASCII singleton bytes,
+ // then it isn't actually possible to smush them into a single class.
+ // (Because classes are either "all codepoints" or "all bytes." You
+ // can't have a class that both matches non-ASCII but valid UTF-8 and
+ // invalid UTF-8.) So we look for all chars and then all bytes, and
+ // don't handle anything else.
+ if let Some(singletons) = singleton_chars(&new) {
+ let it = singletons
+ .into_iter()
+ .map(|ch| ClassUnicodeRange { start: ch, end: ch });
+ return Hir::class(Class::Unicode(ClassUnicode::new(it)));
}
+ if let Some(singletons) = singleton_bytes(&new) {
+ let it = singletons
+ .into_iter()
+ .map(|b| ClassBytesRange { start: b, end: b });
+ return Hir::class(Class::Bytes(ClassBytes::new(it)));
+ }
+ // Similar to singleton chars, we can also look for alternations of
+ // classes. Those can be smushed into a single class.
+ if let Some(cls) = class_chars(&new) {
+ return Hir::class(cls);
+ }
+ if let Some(cls) = class_bytes(&new) {
+ return Hir::class(cls);
+ }
+ // Factor out a common prefix if we can, which might potentially
+ // simplify the expression and unlock other optimizations downstream.
+ // It also might generally make NFA matching and DFA construction
+ // faster by reducing the scope of branching in the regex.
+ new = match lift_common_prefix(new) {
+ Ok(hir) => return hir,
+ Err(unchanged) => unchanged,
+ };
+ let props = Properties::alternation(&new);
+ Hir { kind: HirKind::Alternation(new), props }
}
- /// Build an HIR expression for `(?s).`.
+ /// Returns an HIR expression for `.`.
///
- /// A `(?s).` expression matches any character, including `\n`. To build an
- /// expression that matches any character except for `\n`, then use the
- /// `dot` method.
+ /// * [`Dot::AnyChar`] maps to `(?su-R:.)`.
+ /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`.
+ /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`.
+ /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`.
+ /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
+ /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
///
- /// If `bytes` is `true`, then this assumes characters are limited to a
- /// single byte.
- pub fn any(bytes: bool) -> Hir {
- if bytes {
- let mut cls = ClassBytes::empty();
- cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
- Hir::class(Class::Bytes(cls))
- } else {
- let mut cls = ClassUnicode::empty();
- cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
- Hir::class(Class::Unicode(cls))
- }
- }
-
- /// Return true if and only if this HIR will always match valid UTF-8.
+ /// # Example
///
- /// When this returns false, then it is possible for this HIR expression
- /// to match invalid UTF-8.
- pub fn is_always_utf8(&self) -> bool {
- self.info.is_always_utf8()
- }
-
- /// Returns true if and only if this entire HIR expression is made up of
- /// zero-width assertions.
+ /// Note that this is a convenience routine for constructing the correct
+ /// character class based on the value of `Dot`. There is no explicit "dot"
+ /// HIR value. It is just an abbreviation for a common character class.
///
- /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but
- /// not `^a`.
- pub fn is_all_assertions(&self) -> bool {
- self.info.is_all_assertions()
- }
-
- /// Return true if and only if this HIR is required to match from the
- /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`,
- /// `^foo|^bar` but not `^foo|bar`.
- pub fn is_anchored_start(&self) -> bool {
- self.info.is_anchored_start()
- }
-
- /// Return true if and only if this HIR is required to match at the end
- /// of text. This includes expressions like `foo$`, `(foo|bar)$`,
- /// `foo$|bar$` but not `foo$|bar`.
- pub fn is_anchored_end(&self) -> bool {
- self.info.is_anchored_end()
- }
-
- /// Return true if and only if this HIR is required to match from the
- /// beginning of text or the beginning of a line. This includes expressions
- /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `^(foo|bar)`, `(?m)^foo|^bar`
- /// but not `^foo|bar` or `(?m)^foo|bar`.
+ /// ```
+ /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange};
///
- /// Note that if `is_anchored_start` is `true`, then
- /// `is_line_anchored_start` will also be `true`. The reverse implication
- /// is not true. For example, `(?m)^foo` is line anchored, but not
- /// `is_anchored_start`.
- pub fn is_line_anchored_start(&self) -> bool {
- self.info.is_line_anchored_start()
+ /// let hir = Hir::dot(Dot::AnyByte);
+ /// let expected = Hir::class(Class::Bytes(ClassBytes::new([
+ /// ClassBytesRange::new(0x00, 0xFF),
+ /// ])));
+ /// assert_eq!(expected, hir);
+ /// ```
+ #[inline]
+ pub fn dot(dot: Dot) -> Hir {
+ match dot {
+ Dot::AnyChar => {
+ let mut cls = ClassUnicode::empty();
+ cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
+ Hir::class(Class::Unicode(cls))
+ }
+ Dot::AnyByte => {
+ let mut cls = ClassBytes::empty();
+ cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
+ Hir::class(Class::Bytes(cls))
+ }
+ Dot::AnyCharExceptLF => {
+ let mut cls = ClassUnicode::empty();
+ cls.push(ClassUnicodeRange::new('\0', '\x09'));
+ cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
+ Hir::class(Class::Unicode(cls))
+ }
+ Dot::AnyCharExceptCRLF => {
+ let mut cls = ClassUnicode::empty();
+ cls.push(ClassUnicodeRange::new('\0', '\x09'));
+ cls.push(ClassUnicodeRange::new('\x0B', '\x0C'));
+ cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}'));
+ Hir::class(Class::Unicode(cls))
+ }
+ Dot::AnyByteExceptLF => {
+ let mut cls = ClassBytes::empty();
+ cls.push(ClassBytesRange::new(b'\0', b'\x09'));
+ cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
+ Hir::class(Class::Bytes(cls))
+ }
+ Dot::AnyByteExceptCRLF => {
+ let mut cls = ClassBytes::empty();
+ cls.push(ClassBytesRange::new(b'\0', b'\x09'));
+ cls.push(ClassBytesRange::new(b'\x0B', b'\x0C'));
+ cls.push(ClassBytesRange::new(b'\x0E', b'\xFF'));
+ Hir::class(Class::Bytes(cls))
+ }
+ }
}
+}
- /// Return true if and only if this HIR is required to match at the
- /// end of text or the end of a line. This includes expressions like
- /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`,
- /// `(?m)(foo|bar)$`, but not `foo$|bar` or `(?m)foo$|bar`.
+/// The underlying kind of an arbitrary [`Hir`] expression.
+///
+/// An `HirKind` is principally useful for doing case analysis on the type
+/// of a regular expression. If you're looking to build new `Hir` values,
+/// then you _must_ use the smart constructors defined on `Hir`, like
+/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does
+/// not expose any way of building an `Hir` directly from an `HirKind`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum HirKind {
+ /// The empty regular expression, which matches everything, including the
+ /// empty string.
+ Empty,
+ /// A literal string that matches exactly these bytes.
+ Literal(Literal),
+ /// A single character class that matches any of the characters in the
+ /// class. A class can either consist of Unicode scalar values as
+ /// characters, or it can use bytes.
///
- /// Note that if `is_anchored_end` is `true`, then
- /// `is_line_anchored_end` will also be `true`. The reverse implication
- /// is not true. For example, `(?m)foo$` is line anchored, but not
- /// `is_anchored_end`.
- pub fn is_line_anchored_end(&self) -> bool {
- self.info.is_line_anchored_end()
- }
-
- /// Return true if and only if this HIR contains any sub-expression that
- /// is required to match at the beginning of text. Specifically, this
- /// returns true if the `^` symbol (when multiline mode is disabled) or the
- /// `\A` escape appear anywhere in the regex.
- pub fn is_any_anchored_start(&self) -> bool {
- self.info.is_any_anchored_start()
- }
-
- /// Return true if and only if this HIR contains any sub-expression that is
- /// required to match at the end of text. Specifically, this returns true
- /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape
- /// appear anywhere in the regex.
- pub fn is_any_anchored_end(&self) -> bool {
- self.info.is_any_anchored_end()
- }
-
- /// Return true if and only if the empty string is part of the language
- /// matched by this regular expression.
+ /// A class may be empty. In which case, it matches nothing.
+ Class(Class),
+ /// A look-around assertion. A look-around match always has zero length.
+ Look(Look),
+ /// A repetition operation applied to a sub-expression.
+ Repetition(Repetition),
+ /// A capturing group, which contains a sub-expression.
+ Capture(Capture),
+ /// A concatenation of expressions.
///
- /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b`
- /// and `\B`, but not `a` or `a+`.
- pub fn is_match_empty(&self) -> bool {
- self.info.is_match_empty()
- }
-
- /// Return true if and only if this HIR is a simple literal. This is only
- /// true when this HIR expression is either itself a `Literal` or a
- /// concatenation of only `Literal`s.
+ /// A concatenation matches only if each of its sub-expressions match one
+ /// after the other.
///
- /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`,
- /// `` are not (even though that contain sub-expressions that are literals).
- pub fn is_literal(&self) -> bool {
- self.info.is_literal()
- }
-
- /// Return true if and only if this HIR is either a simple literal or an
- /// alternation of simple literals. This is only
- /// true when this HIR expression is either itself a `Literal` or a
- /// concatenation of only `Literal`s or an alternation of only `Literal`s.
+ /// Concatenations are guaranteed by `Hir`'s smart constructors to always
+ /// have at least two sub-expressions.
+ Concat(Vec<Hir>),
+ /// An alternation of expressions.
///
- /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
- /// literals, but `f+`, `(foo)`, `foo()`, ``
- /// are not (even though that contain sub-expressions that are literals).
- pub fn is_alternation_literal(&self) -> bool {
- self.info.is_alternation_literal()
- }
+ /// An alternation matches only if at least one of its sub-expressions
+ /// match. If multiple sub-expressions match, then the leftmost is
+ /// preferred.
+ ///
+ /// Alternations are guaranteed by `Hir`'s smart constructors to always
+ /// have at least two sub-expressions.
+ Alternation(Vec<Hir>),
}
impl HirKind {
- /// Return true if and only if this HIR is the empty regular expression.
- ///
- /// Note that this is not defined inductively. That is, it only tests if
- /// this kind is the `Empty` variant. To get the inductive definition,
- /// use the `is_match_empty` method on [`Hir`](struct.Hir.html).
- pub fn is_empty(&self) -> bool {
- match *self {
- HirKind::Empty => true,
- _ => false,
- }
- }
+ /// Returns a slice of this kind's sub-expressions, if any.
+ pub fn subs(&self) -> &[Hir] {
+ use core::slice::from_ref;
- /// Returns true if and only if this kind has any (including possibly
- /// empty) subexpressions.
- pub fn has_subexprs(&self) -> bool {
match *self {
HirKind::Empty
| HirKind::Literal(_)
| HirKind::Class(_)
- | HirKind::Anchor(_)
- | HirKind::WordBoundary(_) => false,
- HirKind::Group(_)
- | HirKind::Repetition(_)
- | HirKind::Concat(_)
- | HirKind::Alternation(_) => true,
+ | HirKind::Look(_) => &[],
+ HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub),
+ HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub),
+ HirKind::Concat(ref subs) => subs,
+ HirKind::Alternation(ref subs) => subs,
}
}
}
+impl core::fmt::Debug for Hir {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ self.kind.fmt(f)
+ }
+}
+
/// Print a display representation of this Hir.
///
/// The result of this is a valid regular expression pattern string.
///
/// This implementation uses constant stack space and heap space proportional
/// to the size of the `Hir`.
-impl fmt::Display for Hir {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- use crate::hir::print::Printer;
- Printer::new().print(self, f)
+impl core::fmt::Display for Hir {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ crate::hir::print::Printer::new().print(self, f)
}
}
/// The high-level intermediate representation of a literal.
///
-/// A literal corresponds to a single character, where a character is either
-/// defined by a Unicode scalar value or an arbitrary byte. Unicode characters
-/// are preferred whenever possible. In particular, a `Byte` variant is only
-/// ever produced when it could match invalid UTF-8.
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub enum Literal {
- /// A single character represented by a Unicode scalar value.
- Unicode(char),
- /// A single character represented by an arbitrary byte.
- Byte(u8),
-}
+/// A literal corresponds to `0` or more bytes that should be matched
+/// literally. The smart constructors defined on `Hir` will automatically
+/// concatenate adjacent literals into one literal, and will even automatically
+/// replace empty literals with `Hir::empty()`.
+///
+/// Note that despite a literal being represented by a sequence of bytes, its
+/// `Debug` implementation will attempt to print it as a normal string. (That
+/// is, not a sequence of decimal numbers.)
+#[derive(Clone, Eq, PartialEq)]
+pub struct Literal(pub Box<[u8]>);
-impl Literal {
- /// Returns true if and only if this literal corresponds to a Unicode
- /// scalar value.
- pub fn is_unicode(&self) -> bool {
- match *self {
- Literal::Unicode(_) => true,
- Literal::Byte(b) if b <= 0x7F => true,
- Literal::Byte(_) => false,
- }
+impl core::fmt::Debug for Literal {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ crate::debug::Bytes(&self.0).fmt(f)
}
}
@@ -773,13 +782,12 @@ impl Literal {
/// A character class, regardless of its character type, is represented by a
/// sequence of non-overlapping non-adjacent ranges of characters.
///
-/// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may
-/// be produced even when it exclusively matches valid UTF-8. This is because
-/// a `Bytes` variant represents an intention by the author of the regular
-/// expression to disable Unicode mode, which in turn impacts the semantics of
-/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
-/// match the same set of strings.
-#[derive(Clone, Debug, Eq, PartialEq)]
+/// Note that a `Bytes` variant may be produced even when it exclusively matches
+/// valid UTF-8. This is because a `Bytes` variant represents an intention by
+/// the author of the regular expression to disable Unicode mode, which in turn
+/// impacts the semantics of case insensitive matching. For example, `(?i)k`
+/// and `(?i-u)k` will not match the same set of strings.
+#[derive(Clone, Eq, PartialEq)]
pub enum Class {
/// A set of characters represented by Unicode scalar values.
Unicode(ClassUnicode),
@@ -795,6 +803,15 @@ impl Class {
///
/// If this is a byte oriented character class, then this will be limited
/// to the ASCII ranges `A-Z` and `a-z`.
+ ///
+ /// # Panics
+ ///
+ /// This routine panics when the case mapping data necessary for this
+ /// routine to complete is unavailable. This occurs when the `unicode-case`
+ /// feature is not enabled and the underlying class is Unicode oriented.
+ ///
+ /// Callers should prefer using `try_case_fold_simple` instead, which will
+ /// return an error instead of panicking.
pub fn case_fold_simple(&mut self) {
match *self {
Class::Unicode(ref mut x) => x.case_fold_simple(),
@@ -802,6 +819,29 @@ impl Class {
}
}
+ /// Apply Unicode simple case folding to this character class, in place.
+ /// The character class will be expanded to include all simple case folded
+ /// character variants.
+ ///
+ /// If this is a byte oriented character class, then this will be limited
+ /// to the ASCII ranges `A-Z` and `a-z`.
+ ///
+ /// # Errors
+ ///
+ /// This routine returns an error when the case mapping data necessary
+ /// for this routine to complete is unavailable. This occurs when the
+ /// `unicode-case` feature is not enabled and the underlying class is
+ /// Unicode oriented.
+ pub fn try_case_fold_simple(
+ &mut self,
+ ) -> core::result::Result<(), CaseFoldError> {
+ match *self {
+ Class::Unicode(ref mut x) => x.try_case_fold_simple()?,
+ Class::Bytes(ref mut x) => x.case_fold_simple(),
+ }
+ Ok(())
+ }
+
/// Negate this character class in place.
///
/// After completion, this character class will contain precisely the
@@ -824,14 +864,149 @@ impl Class {
/// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
/// syntax or in the parser builder. By default, Unicode mode is
/// enabled.
- pub fn is_always_utf8(&self) -> bool {
+ pub fn is_utf8(&self) -> bool {
match *self {
Class::Unicode(_) => true,
- Class::Bytes(ref x) => x.is_all_ascii(),
+ Class::Bytes(ref x) => x.is_ascii(),
+ }
+ }
+
+ /// Returns the length, in bytes, of the smallest string matched by this
+ /// character class.
+ ///
+ /// For non-empty byte oriented classes, this always returns `1`. For
+ /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
+ /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
+ /// be returned.
+ ///
+ /// # Example
+ ///
+ /// This example shows some examples of regexes and their corresponding
+ /// minimum length, if any.
+ ///
+ /// ```
+ /// use regex_syntax::{hir::Properties, parse};
+ ///
+ /// // The empty string has a min length of 0.
+ /// let hir = parse(r"")?;
+ /// assert_eq!(Some(0), hir.properties().minimum_len());
+ /// // As do other types of regexes that only match the empty string.
+ /// let hir = parse(r"^$\b\B")?;
+ /// assert_eq!(Some(0), hir.properties().minimum_len());
+ /// // A regex that can match the empty string but match more is still 0.
+ /// let hir = parse(r"a*")?;
+ /// assert_eq!(Some(0), hir.properties().minimum_len());
+ /// // A regex that matches nothing has no minimum defined.
+ /// let hir = parse(r"[a&&b]")?;
+ /// assert_eq!(None, hir.properties().minimum_len());
+ /// // Character classes usually have a minimum length of 1.
+ /// let hir = parse(r"\w")?;
+ /// assert_eq!(Some(1), hir.properties().minimum_len());
+ /// // But sometimes Unicode classes might be bigger!
+ /// let hir = parse(r"\p{Cyrillic}")?;
+ /// assert_eq!(Some(2), hir.properties().minimum_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn minimum_len(&self) -> Option<usize> {
+ match *self {
+ Class::Unicode(ref x) => x.minimum_len(),
+ Class::Bytes(ref x) => x.minimum_len(),
+ }
+ }
+
+ /// Returns the length, in bytes, of the longest string matched by this
+ /// character class.
+ ///
+ /// For non-empty byte oriented classes, this always returns `1`. For
+ /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
+ /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
+ /// be returned.
+ ///
+ /// # Example
+ ///
+ /// This example shows some examples of regexes and their corresponding
+ /// maximum length, if any.
+ ///
+ /// ```
+ /// use regex_syntax::{hir::Properties, parse};
+ ///
+ /// // The empty string has a max length of 0.
+ /// let hir = parse(r"")?;
+ /// assert_eq!(Some(0), hir.properties().maximum_len());
+ /// // As do other types of regexes that only match the empty string.
+ /// let hir = parse(r"^$\b\B")?;
+ /// assert_eq!(Some(0), hir.properties().maximum_len());
+ /// // A regex that matches nothing has no maximum defined.
+ /// let hir = parse(r"[a&&b]")?;
+ /// assert_eq!(None, hir.properties().maximum_len());
+ /// // Bounded repeats work as you expect.
+ /// let hir = parse(r"x{2,10}")?;
+ /// assert_eq!(Some(10), hir.properties().maximum_len());
+ /// // An unbounded repeat means there is no maximum.
+ /// let hir = parse(r"x{2,}")?;
+ /// assert_eq!(None, hir.properties().maximum_len());
+ /// // With Unicode enabled, \w can match up to 4 bytes!
+ /// let hir = parse(r"\w")?;
+ /// assert_eq!(Some(4), hir.properties().maximum_len());
+ /// // Without Unicode enabled, \w matches at most 1 byte.
+ /// let hir = parse(r"(?-u)\w")?;
+ /// assert_eq!(Some(1), hir.properties().maximum_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn maximum_len(&self) -> Option<usize> {
+ match *self {
+ Class::Unicode(ref x) => x.maximum_len(),
+ Class::Bytes(ref x) => x.maximum_len(),
+ }
+ }
+
+ /// Returns true if and only if this character class is empty. That is,
+ /// it has no elements.
+ ///
+ /// An empty character class can never match anything, including an empty string.
+ pub fn is_empty(&self) -> bool {
+ match *self {
+ Class::Unicode(ref x) => x.ranges().is_empty(),
+ Class::Bytes(ref x) => x.ranges().is_empty(),
+ }
+ }
+
+ /// If this class consists of exactly one element (whether a codepoint or a
+ /// byte), then return it as a literal byte string.
+ ///
+ /// If this class is empty or contains more than one element, then `None`
+ /// is returned.
+ pub fn literal(&self) -> Option<Vec<u8>> {
+ match *self {
+ Class::Unicode(ref x) => x.literal(),
+ Class::Bytes(ref x) => x.literal(),
}
}
}
+impl core::fmt::Debug for Class {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ use crate::debug::Byte;
+
+ let mut fmter = f.debug_set();
+ match *self {
+ Class::Unicode(ref cls) => {
+ for r in cls.ranges().iter() {
+ fmter.entry(&(r.start..=r.end));
+ }
+ }
+ Class::Bytes(ref cls) => {
+ for r in cls.ranges().iter() {
+ fmter.entry(&(Byte(r.start)..=Byte(r.end)));
+ }
+ }
+ }
+ fmter.finish()
+ }
+}
+
/// A set of characters represented by Unicode scalar values.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ClassUnicode {
@@ -842,7 +1017,8 @@ impl ClassUnicode {
/// Create a new class from a sequence of ranges.
///
/// The given ranges do not need to be in any specific order, and ranges
- /// may overlap.
+ /// may overlap. Ranges will automatically be sorted into a canonical
+ /// non-overlapping order.
pub fn new<I>(ranges: I) -> ClassUnicode
where
I: IntoIterator<Item = ClassUnicodeRange>,
@@ -851,6 +1027,9 @@ impl ClassUnicode {
}
/// Create a new class with no ranges.
+ ///
+ /// An empty class matches nothing. That is, it is equivalent to
+ /// [`Hir::fail`].
pub fn empty() -> ClassUnicode {
ClassUnicode::new(vec![])
}
@@ -903,7 +1082,7 @@ impl ClassUnicode {
/// `unicode-case` feature is not enabled.
pub fn try_case_fold_simple(
&mut self,
- ) -> result::Result<(), CaseFoldError> {
+ ) -> core::result::Result<(), CaseFoldError> {
self.set.case_fold_simple()
}
@@ -946,9 +1125,59 @@ impl ClassUnicode {
/// Returns true if and only if this character class will either match
/// nothing or only ASCII bytes. Stated differently, this returns false
/// if and only if this class contains a non-ASCII codepoint.
- pub fn is_all_ascii(&self) -> bool {
+ pub fn is_ascii(&self) -> bool {
self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
}
+
+ /// Returns the length, in bytes, of the smallest string matched by this
+ /// character class.
+ ///
+ /// Returns `None` when the class is empty.
+ pub fn minimum_len(&self) -> Option<usize> {
+ let first = self.ranges().get(0)?;
+ // Correct because c1 < c2 implies c1.len_utf8() <= c2.len_utf8().
+ Some(first.start.len_utf8())
+ }
+
+ /// Returns the length, in bytes, of the longest string matched by this
+ /// character class.
+ ///
+ /// Returns `None` when the class is empty.
+ pub fn maximum_len(&self) -> Option<usize> {
+ let last = self.ranges().last()?;
+ // Correct because c1 < c2 implies c1.len_utf8() <= c2.len_utf8().
+ Some(last.end.len_utf8())
+ }
+
+ /// If this class consists of exactly one codepoint, then return it as
+ /// a literal byte string.
+ ///
+ /// If this class is empty or contains more than one codepoint, then `None`
+ /// is returned.
+ pub fn literal(&self) -> Option<Vec<u8>> {
+ let rs = self.ranges();
+ if rs.len() == 1 && rs[0].start == rs[0].end {
+ Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes())
+ } else {
+ None
+ }
+ }
+
+ /// If this class consists of only ASCII ranges, then return its
+ /// corresponding and equivalent byte class.
+ pub fn to_byte_class(&self) -> Option<ClassBytes> {
+ if !self.is_ascii() {
+ return None;
+ }
+ Some(ClassBytes::new(self.ranges().iter().map(|r| {
+ // Since we are guaranteed that our codepoint range is ASCII, the
+ // 'u8::try_from' calls below are guaranteed to be correct.
+ ClassBytesRange {
+ start: u8::try_from(r.start).unwrap(),
+ end: u8::try_from(r.end).unwrap(),
+ }
+ })))
+ }
}
/// An iterator over all ranges in a Unicode character class.
@@ -975,18 +1204,18 @@ pub struct ClassUnicodeRange {
end: char,
}
-impl fmt::Debug for ClassUnicodeRange {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+impl core::fmt::Debug for ClassUnicodeRange {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let start = if !self.start.is_whitespace() && !self.start.is_control()
{
self.start.to_string()
} else {
- format!("0x{:X}", self.start as u32)
+ format!("0x{:X}", u32::from(self.start))
};
let end = if !self.end.is_whitespace() && !self.end.is_control() {
self.end.to_string()
} else {
- format!("0x{:X}", self.end as u32)
+ format!("0x{:X}", u32::from(self.end))
};
f.debug_struct("ClassUnicodeRange")
.field("start", &start)
@@ -1023,24 +1252,13 @@ impl Interval for ClassUnicodeRange {
&self,
ranges: &mut Vec<ClassUnicodeRange>,
) -> Result<(), unicode::CaseFoldError> {
- if !unicode::contains_simple_case_mapping(self.start, self.end)? {
+ let mut folder = unicode::SimpleCaseFolder::new()?;
+ if !folder.overlaps(self.start, self.end) {
return Ok(());
}
- let start = self.start as u32;
- let end = (self.end as u32).saturating_add(1);
- let mut next_simple_cp = None;
- for cp in (start..end).filter_map(char::from_u32) {
- if next_simple_cp.map_or(false, |next| cp < next) {
- continue;
- }
- let it = match unicode::simple_fold(cp)? {
- Ok(it) => it,
- Err(next) => {
- next_simple_cp = next;
- continue;
- }
- };
- for cp_folded in it {
+ let (start, end) = (u32::from(self.start), u32::from(self.end));
+ for cp in (start..=end).filter_map(char::from_u32) {
+ for &cp_folded in folder.mapping(cp) {
ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
}
}
@@ -1072,6 +1290,18 @@ impl ClassUnicodeRange {
pub fn end(&self) -> char {
self.end
}
+
+ /// Returns the number of codepoints in this range.
+ pub fn len(&self) -> usize {
+ let diff = 1 + u32::from(self.end) - u32::from(self.start);
+ // This is likely to panic in 16-bit targets since a usize can only fit
+ // 2^16. It's not clear what to do here, other than to return an error
+ // when building a Unicode class that contains a range whose length
+ // overflows usize. (Which, to be honest, is probably quite common on
+ // 16-bit targets. For example, this would imply that '.' and '\p{any}'
+ // would be impossible to build.)
+ usize::try_from(diff).expect("char class len fits in usize")
+ }
}
/// A set of characters represented by arbitrary bytes (where one byte
@@ -1085,7 +1315,8 @@ impl ClassBytes {
/// Create a new class from a sequence of ranges.
///
/// The given ranges do not need to be in any specific order, and ranges
- /// may overlap.
+ /// may overlap. Ranges will automatically be sorted into a canonical
+ /// non-overlapping order.
pub fn new<I>(ranges: I) -> ClassBytes
where
I: IntoIterator<Item = ClassBytesRange>,
@@ -1094,6 +1325,9 @@ impl ClassBytes {
}
/// Create a new class with no ranges.
+ ///
+ /// An empty class matches nothing. That is, it is equivalent to
+ /// [`Hir::fail`].
pub fn empty() -> ClassBytes {
ClassBytes::new(vec![])
}
@@ -1163,9 +1397,64 @@ impl ClassBytes {
/// Returns true if and only if this character class will either match
/// nothing or only ASCII bytes. Stated differently, this returns false
/// if and only if this class contains a non-ASCII byte.
- pub fn is_all_ascii(&self) -> bool {
+ pub fn is_ascii(&self) -> bool {
self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
}
+
+ /// Returns the length, in bytes, of the smallest string matched by this
+ /// character class.
+ ///
+ /// Returns `None` when the class is empty.
+ pub fn minimum_len(&self) -> Option<usize> {
+ if self.ranges().is_empty() {
+ None
+ } else {
+ Some(1)
+ }
+ }
+
+ /// Returns the length, in bytes, of the longest string matched by this
+ /// character class.
+ ///
+ /// Returns `None` when the class is empty.
+ pub fn maximum_len(&self) -> Option<usize> {
+ if self.ranges().is_empty() {
+ None
+ } else {
+ Some(1)
+ }
+ }
+
+ /// If this class consists of exactly one byte, then return it as
+ /// a literal byte string.
+ ///
+ /// If this class is empty or contains more than one byte, then `None`
+ /// is returned.
+ pub fn literal(&self) -> Option<Vec<u8>> {
+ let rs = self.ranges();
+ if rs.len() == 1 && rs[0].start == rs[0].end {
+ Some(vec![rs[0].start])
+ } else {
+ None
+ }
+ }
+
+ /// If this class consists of only ASCII ranges, then return its
+ /// corresponding and equivalent Unicode class.
+ pub fn to_unicode_class(&self) -> Option<ClassUnicode> {
+ if !self.is_ascii() {
+ return None;
+ }
+ Some(ClassUnicode::new(self.ranges().iter().map(|r| {
+ // Since we are guaranteed that our byte range is ASCII, the
+ // 'char::from' calls below are correct and will not erroneously
+ // convert a raw byte value into its corresponding codepoint.
+ ClassUnicodeRange {
+ start: char::from(r.start),
+ end: char::from(r.end),
+ }
+ })))
+ }
}
/// An iterator over all ranges in a byte character class.
@@ -1259,108 +1548,161 @@ impl ClassBytesRange {
pub fn end(&self) -> u8 {
self.end
}
+
+ /// Returns the number of bytes in this range.
+ pub fn len(&self) -> usize {
+ usize::from(self.end.checked_sub(self.start).unwrap())
+ .checked_add(1)
+ .unwrap()
+ }
}
-impl fmt::Debug for ClassBytesRange {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- let mut debug = f.debug_struct("ClassBytesRange");
- if self.start <= 0x7F {
- debug.field("start", &(self.start as char));
- } else {
- debug.field("start", &self.start);
- }
- if self.end <= 0x7F {
- debug.field("end", &(self.end as char));
- } else {
- debug.field("end", &self.end);
- }
- debug.finish()
+impl core::fmt::Debug for ClassBytesRange {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ f.debug_struct("ClassBytesRange")
+ .field("start", &crate::debug::Byte(self.start))
+ .field("end", &crate::debug::Byte(self.end))
+ .finish()
}
}
-/// The high-level intermediate representation for an anchor assertion.
+/// The high-level intermediate representation for a look-around assertion.
///
-/// A matching anchor assertion is always zero-length.
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub enum Anchor {
- /// Match the beginning of a line or the beginning of text. Specifically,
- /// this matches at the starting position of the input, or at the position
- /// immediately following a `\n` character.
- StartLine,
- /// Match the end of a line or the end of text. Specifically,
- /// this matches at the end position of the input, or at the position
- /// immediately preceding a `\n` character.
- EndLine,
+/// An assertion match is always zero-length. Also called an "empty match."
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum Look {
/// Match the beginning of text. Specifically, this matches at the starting
/// position of the input.
- StartText,
+ Start = 1 << 0,
/// Match the end of text. Specifically, this matches at the ending
/// position of the input.
- EndText,
-}
-
-/// The high-level intermediate representation for a word-boundary assertion.
-///
-/// A matching word boundary assertion is always zero-length.
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub enum WordBoundary {
- /// Match a Unicode-aware word boundary. That is, this matches a position
- /// where the left adjacent character and right adjacent character
- /// correspond to a word and non-word or a non-word and word character.
- Unicode,
- /// Match a Unicode-aware negation of a word boundary.
- UnicodeNegate,
+ End = 1 << 1,
+ /// Match the beginning of a line or the beginning of text. Specifically,
+ /// this matches at the starting position of the input, or at the position
+ /// immediately following a `\n` character.
+ StartLF = 1 << 2,
+ /// Match the end of a line or the end of text. Specifically, this matches
+ /// at the end position of the input, or at the position immediately
+ /// preceding a `\n` character.
+ EndLF = 1 << 3,
+ /// Match the beginning of a line or the beginning of text. Specifically,
+ /// this matches at the starting position of the input, or at the position
+ /// immediately following either a `\r` or `\n` character, but never after
+ /// a `\r` when a `\n` follows.
+ StartCRLF = 1 << 4,
+ /// Match the end of a line or the end of text. Specifically, this matches
+ /// at the end position of the input, or at the position immediately
+ /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
+ /// precedes it.
+ EndCRLF = 1 << 5,
/// Match an ASCII-only word boundary. That is, this matches a position
/// where the left adjacent character and right adjacent character
/// correspond to a word and non-word or a non-word and word character.
- Ascii,
+ WordAscii = 1 << 6,
/// Match an ASCII-only negation of a word boundary.
- AsciiNegate,
+ WordAsciiNegate = 1 << 7,
+ /// Match a Unicode-aware word boundary. That is, this matches a position
+ /// where the left adjacent character and right adjacent character
+ /// correspond to a word and non-word or a non-word and word character.
+ WordUnicode = 1 << 8,
+ /// Match a Unicode-aware negation of a word boundary.
+ WordUnicodeNegate = 1 << 9,
}
-impl WordBoundary {
- /// Returns true if and only if this word boundary assertion is negated.
- pub fn is_negated(&self) -> bool {
- match *self {
- WordBoundary::Unicode | WordBoundary::Ascii => false,
- WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true,
+impl Look {
+ /// Flip the look-around assertion to its equivalent for reverse searches.
+ /// For example, `StartLF` gets translated to `EndLF`.
+ ///
+ /// Some assertions, such as `WordUnicode`, remain the same since they
+ /// match the same positions regardless of the direction of the search.
+ #[inline]
+ pub const fn reversed(self) -> Look {
+ match self {
+ Look::Start => Look::End,
+ Look::End => Look::Start,
+ Look::StartLF => Look::EndLF,
+ Look::EndLF => Look::StartLF,
+ Look::StartCRLF => Look::EndCRLF,
+ Look::EndCRLF => Look::StartCRLF,
+ Look::WordAscii => Look::WordAscii,
+ Look::WordAsciiNegate => Look::WordAsciiNegate,
+ Look::WordUnicode => Look::WordUnicode,
+ Look::WordUnicodeNegate => Look::WordUnicodeNegate,
+ }
+ }
+
+ /// Return the underlying representation of this look-around enumeration
+ /// as an integer. Giving the return value to the [`Look::from_repr`]
+ /// constructor is guaranteed to return the same look-around variant that
+ /// one started with within a semver compatible release of this crate.
+ #[inline]
+ pub const fn as_repr(self) -> u16 {
+ // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
+ // actual int.
+ self as u16
+ }
+
+ /// Given the underlying representation of a `Look` value, return the
+ /// corresponding `Look` value if the representation is valid. Otherwise
+ /// `None` is returned.
+ #[inline]
+ pub const fn from_repr(repr: u16) -> Option<Look> {
+ match repr {
+ 0b00_0000_0001 => Some(Look::Start),
+ 0b00_0000_0010 => Some(Look::End),
+ 0b00_0000_0100 => Some(Look::StartLF),
+ 0b00_0000_1000 => Some(Look::EndLF),
+ 0b00_0001_0000 => Some(Look::StartCRLF),
+ 0b00_0010_0000 => Some(Look::EndCRLF),
+ 0b00_0100_0000 => Some(Look::WordAscii),
+ 0b00_1000_0000 => Some(Look::WordAsciiNegate),
+ 0b01_0000_0000 => Some(Look::WordUnicode),
+ 0b10_0000_0000 => Some(Look::WordUnicodeNegate),
+ _ => None,
+ }
+ }
+
+ /// Returns a convenient single codepoint representation of this
+ /// look-around assertion. Each assertion is guaranteed to be represented
+ /// by a distinct character.
+ ///
+ /// This is useful for succinctly representing a look-around assertion in
+ /// human friendly but succinct output intended for a programmer working on
+ /// regex internals.
+ #[inline]
+ pub const fn as_char(self) -> char {
+ match self {
+ Look::Start => 'A',
+ Look::End => 'z',
+ Look::StartLF => '^',
+ Look::EndLF => '$',
+ Look::StartCRLF => 'r',
+ Look::EndCRLF => 'R',
+ Look::WordAscii => 'b',
+ Look::WordAsciiNegate => 'B',
+ Look::WordUnicode => '𝛃',
+ Look::WordUnicodeNegate => '𝚩',
}
}
}
-/// The high-level intermediate representation for a group.
+/// The high-level intermediate representation for a capturing group.
///
-/// This represents one of three possible group types:
+/// A capturing group always has an index and a child expression. It may
+/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
+/// necessary.
///
-/// 1. A non-capturing group (e.g., `(?:expr)`).
-/// 2. A capturing group (e.g., `(expr)`).
-/// 3. A named capturing group (e.g., `(?P<name>expr)`).
+/// Note that there is no explicit representation of a non-capturing group
+/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
+/// the recursive structure of the `Hir` itself.
#[derive(Clone, Debug, Eq, PartialEq)]
-pub struct Group {
- /// The kind of this group. If it is a capturing group, then the kind
- /// contains the capture group index (and the name, if it is a named
- /// group).
- pub kind: GroupKind,
+pub struct Capture {
+ /// The capture index of the capture.
+ pub index: u32,
+ /// The name of the capture, if it exists.
+ pub name: Option<Box<str>>,
/// The expression inside the capturing group, which may be empty.
- pub hir: Box<Hir>,
-}
-
-/// The kind of group.
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub enum GroupKind {
- /// A normal unnamed capturing group.
- ///
- /// The value is the capture index of the group.
- CaptureIndex(u32),
- /// A named capturing group.
- CaptureName {
- /// The name of the group.
- name: String,
- /// The capture index of the group.
- index: u32,
- },
- /// A non-capturing group.
- NonCapturing,
+ pub sub: Box<Hir>,
}
/// The high-level intermediate representation of a repetition operator.
@@ -1369,8 +1711,21 @@ pub enum GroupKind {
/// sub-expression.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Repetition {
- /// The kind of this repetition operator.
- pub kind: RepetitionKind,
+ /// The minimum range of the repetition.
+ ///
+ /// Note that special cases like `?`, `+` and `*` all get translated into
+ /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
+ ///
+ /// When `min` is zero, this expression can match the empty string
+ /// regardless of what its sub-expression is.
+ pub min: u32,
+ /// The maximum range of the repetition.
+ ///
+ /// Note that when `max` is `None`, `min` acts as a lower bound but where
+ /// there is no upper bound. For something like `x{5}` where the min and
+ /// max are equivalent, `min` will be set to `5` and `max` will be set to
+ /// `Some(5)`.
+ pub max: Option<u32>,
/// Whether this repetition operator is greedy or not. A greedy operator
/// will match as much as it can. A non-greedy operator will match as
/// little as it can.
@@ -1380,69 +1735,71 @@ pub struct Repetition {
/// not. However, this can be inverted via the `U` "ungreedy" flag.
pub greedy: bool,
/// The expression being repeated.
- pub hir: Box<Hir>,
+ pub sub: Box<Hir>,
}
impl Repetition {
- /// Returns true if and only if this repetition operator makes it possible
- /// to match the empty string.
- ///
- /// Note that this is not defined inductively. For example, while `a*`
- /// will report `true`, `()+` will not, even though `()` matches the empty
- /// string and one or more occurrences of something that matches the empty
- /// string will always match the empty string. In order to get the
- /// inductive definition, see the corresponding method on
- /// [`Hir`](struct.Hir.html).
- pub fn is_match_empty(&self) -> bool {
- match self.kind {
- RepetitionKind::ZeroOrOne => true,
- RepetitionKind::ZeroOrMore => true,
- RepetitionKind::OneOrMore => false,
- RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0,
- RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0,
- RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0,
+ /// Returns a new repetition with the same `min`, `max` and `greedy`
+ /// values, but with its sub-expression replaced with the one given.
+ pub fn with(&self, sub: Hir) -> Repetition {
+ Repetition {
+ min: self.min,
+ max: self.max,
+ greedy: self.greedy,
+ sub: Box::new(sub),
}
}
}
-/// The kind of a repetition operator.
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub enum RepetitionKind {
- /// Matches a sub-expression zero or one times.
- ZeroOrOne,
- /// Matches a sub-expression zero or more times.
- ZeroOrMore,
- /// Matches a sub-expression one or more times.
- OneOrMore,
- /// Matches a sub-expression within a bounded range of times.
- Range(RepetitionRange),
-}
-
-/// The kind of a counted repetition operator.
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub enum RepetitionRange {
- /// Matches a sub-expression exactly this many times.
- Exactly(u32),
- /// Matches a sub-expression at least this many times.
- AtLeast(u32),
- /// Matches a sub-expression at least `m` times and at most `n` times.
- Bounded(u32, u32),
+/// A type describing the different flavors of `.`.
+///
+/// This type is meant to be used with [`Hir::dot`], which is a convenience
+/// routine for building HIR values derived from the `.` regex.
+#[non_exhaustive]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum Dot {
+ /// Matches the UTF-8 encoding of any Unicode scalar value.
+ ///
+ /// This is equivalent to `(?su:.)` and also `\p{any}`.
+ AnyChar,
+ /// Matches any byte value.
+ ///
+ /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
+ AnyByte,
+ /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
+ ///
+ /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
+ AnyCharExceptLF,
+ /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r`
+ /// and `\n`.
+ ///
+ /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
+ AnyCharExceptCRLF,
+ /// Matches any byte value except for `\n`.
+ ///
+ /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
+ AnyByteExceptLF,
+ /// Matches any byte value except for `\r` and `\n`.
+ ///
+ /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`.
+ AnyByteExceptCRLF,
}
/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
/// space but heap space proportional to the depth of the total `Hir`.
impl Drop for Hir {
fn drop(&mut self) {
- use std::mem;
+ use core::mem;
match *self.kind() {
HirKind::Empty
| HirKind::Literal(_)
| HirKind::Class(_)
- | HirKind::Anchor(_)
- | HirKind::WordBoundary(_) => return,
- HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return,
- HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return,
+ | HirKind::Look(_) => return,
+ HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
+ HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
+ return
+ }
HirKind::Concat(ref x) if x.is_empty() => return,
HirKind::Alternation(ref x) if x.is_empty() => return,
_ => {}
@@ -1454,13 +1811,12 @@ impl Drop for Hir {
HirKind::Empty
| HirKind::Literal(_)
| HirKind::Class(_)
- | HirKind::Anchor(_)
- | HirKind::WordBoundary(_) => {}
- HirKind::Group(ref mut x) => {
- stack.push(mem::replace(&mut x.hir, Hir::empty()));
+ | HirKind::Look(_) => {}
+ HirKind::Capture(ref mut x) => {
+ stack.push(mem::replace(&mut x.sub, Hir::empty()));
}
HirKind::Repetition(ref mut x) => {
- stack.push(mem::replace(&mut x.hir, Hir::empty()));
+ stack.push(mem::replace(&mut x.sub, Hir::empty()));
}
HirKind::Concat(ref mut x) => {
stack.extend(x.drain(..));
@@ -1473,52 +1829,1105 @@ impl Drop for Hir {
}
}
-/// A type that documents various attributes of an HIR expression.
+/// A type that collects various properties of an HIR value.
+///
+/// Properties are always scalar values and represent meta data that is
+/// computed inductively on an HIR value. Properties are defined for all
+/// HIR values.
+///
+/// All methods on a `Properties` value take constant time and are meant to
+/// be cheap to call.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Properties(Box<PropertiesI>);
+
+/// The property definition. It is split out so that we can box it, and
+/// thereby make `Properties` use less stack size. This is kind-of important
+/// because every HIR value has a `Properties` attached to it.
///
-/// These attributes are typically defined inductively on the HIR.
+/// This does have the unfortunate consequence that creating any HIR value
+/// always leads to at least one alloc for properties, but this is generally
+/// true anyway (for pretty much all HirKinds except for look-arounds).
#[derive(Clone, Debug, Eq, PartialEq)]
-struct HirInfo {
- /// Represent yes/no questions by a bitfield to conserve space, since
- /// this is included in every HIR expression.
- ///
- /// If more attributes need to be added, it is OK to increase the size of
- /// this as appropriate.
- bools: u16,
+struct PropertiesI {
+ minimum_len: Option<usize>,
+ maximum_len: Option<usize>,
+ look_set: LookSet,
+ look_set_prefix: LookSet,
+ look_set_suffix: LookSet,
+ look_set_prefix_any: LookSet,
+ look_set_suffix_any: LookSet,
+ utf8: bool,
+ explicit_captures_len: usize,
+ static_explicit_captures_len: Option<usize>,
+ literal: bool,
+ alternation_literal: bool,
}
-// A simple macro for defining bitfield accessors/mutators.
-macro_rules! define_bool {
- ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
- fn $is_fn_name(&self) -> bool {
- self.bools & (0b1 << $bit) > 0
+impl Properties {
+ /// Returns the length (in bytes) of the smallest string matched by this
+ /// HIR.
+ ///
+ /// A return value of `0` is possible and occurs when the HIR can match an
+ /// empty string.
+ ///
+ /// `None` is returned when there is no minimum length. This occurs in
+ /// precisely the cases where the HIR matches nothing, i.e., the language
+ /// the regex matches is empty. An example of such a regex is `\P{any}`.
+ #[inline]
+ pub fn minimum_len(&self) -> Option<usize> {
+ self.0.minimum_len
+ }
+
+ /// Returns the length (in bytes) of the longest string matched by this
+ /// HIR.
+ ///
+ /// A return value of `0` is possible and occurs when nothing longer than
+ /// the empty string is in the language described by this HIR.
+ ///
+ /// `None` is returned when there is no longest matching string. This
+ /// occurs when the HIR matches nothing or when there is no upper bound on
+ /// the length of matching strings. Examples of such regexes are `\P{any}`
+ /// (matches nothing) and `a+` (has no upper bound).
+ #[inline]
+ pub fn maximum_len(&self) -> Option<usize> {
+ self.0.maximum_len
+ }
+
+ /// Returns a set of all look-around assertions that appear at least once
+ /// in this HIR value.
+ #[inline]
+ pub fn look_set(&self) -> LookSet {
+ self.0.look_set
+ }
+
+ /// Returns a set of all look-around assertions that appear as a prefix for
+ /// this HIR value. That is, the set returned corresponds to the set of
+ /// assertions that must be passed before matching any bytes in a haystack.
+ ///
+ /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true
+ /// if and only if the HIR is fully anchored at the start.
+ #[inline]
+ pub fn look_set_prefix(&self) -> LookSet {
+ self.0.look_set_prefix
+ }
+
+ /// Returns a set of all look-around assertions that appear as a _possible_
+ /// prefix for this HIR value. That is, the set returned corresponds to the
+ /// set of assertions that _may_ be passed before matching any bytes in a
+ /// haystack.
+ ///
+ /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
+ /// true if and only if it's possible for the regex to match through an
+ /// anchored assertion before consuming any input.
+ #[inline]
+ pub fn look_set_prefix_any(&self) -> LookSet {
+ self.0.look_set_prefix_any
+ }
+
+ /// Returns a set of all look-around assertions that appear as a suffix for
+ /// this HIR value. That is, the set returned corresponds to the set of
+ /// assertions that must be passed in order to be considered a match after
+ /// all other consuming HIR expressions.
+ ///
+ /// For example, `hir.look_set_suffix().contains(Look::End)` returns true
+ /// if and only if the HIR is fully anchored at the end.
+ #[inline]
+ pub fn look_set_suffix(&self) -> LookSet {
+ self.0.look_set_suffix
+ }
+
+ /// Returns a set of all look-around assertions that appear as a _possible_
+ /// suffix for this HIR value. That is, the set returned corresponds to the
+ /// set of assertions that _may_ be passed at the end of a match, after all
+ /// other consuming HIR expressions.
+ ///
+ /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
+ /// true if and only if it's possible for the regex to match through an
+ /// anchored assertion at the end of a match without consuming any input.
+ #[inline]
+ pub fn look_set_suffix_any(&self) -> LookSet {
+ self.0.look_set_suffix_any
+ }
+
+ /// Return true if and only if the corresponding HIR will always match
+ /// valid UTF-8.
+ ///
+ /// When this returns false, then it is possible for this HIR expression to
+ /// match invalid UTF-8, including by matching between the code units of
+ /// a single UTF-8 encoded codepoint.
+ ///
+ /// Note that this returns true even when the corresponding HIR can match
+ /// the empty string. Since an empty string can technically appear between
+ /// UTF-8 code units, it is possible for a match to be reported that splits
+ /// a codepoint which could in turn be considered matching invalid UTF-8.
+ /// However, it is generally assumed that such empty matches are handled
+ /// specially by the search routine if it is absolutely required that
+ /// matches not split a codepoint.
+ ///
+ /// # Example
+ ///
+ /// This code example shows the UTF-8 property of a variety of patterns.
+ ///
+ /// ```
+ /// use regex_syntax::{ParserBuilder, parse};
+ ///
+ /// // Examples of 'is_utf8() == true'.
+ /// assert!(parse(r"a")?.properties().is_utf8());
+ /// assert!(parse(r"[^a]")?.properties().is_utf8());
+ /// assert!(parse(r".")?.properties().is_utf8());
+ /// assert!(parse(r"\W")?.properties().is_utf8());
+ /// assert!(parse(r"\b")?.properties().is_utf8());
+ /// assert!(parse(r"\B")?.properties().is_utf8());
+ /// assert!(parse(r"(?-u)\b")?.properties().is_utf8());
+ /// assert!(parse(r"(?-u)\B")?.properties().is_utf8());
+ /// // Unicode mode is enabled by default, and in
+ /// // that mode, all \x hex escapes are treated as
+ /// // codepoints. So this actually matches the UTF-8
+ /// // encoding of U+00FF.
+ /// assert!(parse(r"\xFF")?.properties().is_utf8());
+ ///
+ /// // Now we show examples of 'is_utf8() == false'.
+ /// // The only way to do this is to force the parser
+ /// // to permit invalid UTF-8, otherwise all of these
+ /// // would fail to parse!
+ /// let parse = |pattern| {
+ /// ParserBuilder::new().utf8(false).build().parse(pattern)
+ /// };
+ /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8());
+ /// assert!(!parse(r"(?-u).")?.properties().is_utf8());
+ /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8());
+ /// // Conversely to the equivalent example above,
+ /// // when Unicode mode is disabled, \x hex escapes
+ /// // are treated as their raw byte values.
+ /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8());
+ /// // Note that just because we disabled UTF-8 in the
+ /// // parser doesn't mean we still can't use Unicode.
+ /// // It is enabled by default, so \xFF is still
+ /// // equivalent to matching the UTF-8 encoding of
+ /// // U+00FF by default.
+ /// assert!(parse(r"\xFF")?.properties().is_utf8());
+ /// // Even though we use raw bytes that individually
+ /// // are not valid UTF-8, when combined together, the
+ /// // overall expression *does* match valid UTF-8!
+ /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn is_utf8(&self) -> bool {
+ self.0.utf8
+ }
+
+ /// Returns the total number of explicit capturing groups in the
+ /// corresponding HIR.
+ ///
+ /// Note that this does not include the implicit capturing group
+ /// corresponding to the entire match that is typically included by regex
+ /// engines.
+ ///
+ /// # Example
+ ///
+ /// This method will return `0` for `a` and `1` for `(a)`:
+ ///
+ /// ```
+ /// use regex_syntax::parse;
+ ///
+ /// assert_eq!(0, parse("a")?.properties().explicit_captures_len());
+ /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn explicit_captures_len(&self) -> usize {
+ self.0.explicit_captures_len
+ }
+
+ /// Returns the total number of explicit capturing groups that appear in
+ /// every possible match.
+ ///
+ /// If the number of capture groups can vary depending on the match, then
+ /// this returns `None`. That is, a value is only returned when the number
+ /// of matching groups is invariant or "static."
+ ///
+ /// Note that this does not include the implicit capturing group
+ /// corresponding to the entire match.
+ ///
+ /// # Example
+ ///
+ /// This shows a few cases where a static number of capture groups is
+ /// available and a few cases where it is not.
+ ///
+ /// ```
+ /// use regex_syntax::parse;
+ ///
+ /// let len = |pattern| {
+ /// parse(pattern).map(|h| {
+ /// h.properties().static_explicit_captures_len()
+ /// })
+ /// };
+ ///
+ /// assert_eq!(Some(0), len("a")?);
+ /// assert_eq!(Some(1), len("(a)")?);
+ /// assert_eq!(Some(1), len("(a)|(b)")?);
+ /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?);
+ /// assert_eq!(None, len("(a)|b")?);
+ /// assert_eq!(None, len("a|(b)")?);
+ /// assert_eq!(None, len("(b)*")?);
+ /// assert_eq!(Some(1), len("(b)+")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn static_explicit_captures_len(&self) -> Option<usize> {
+ self.0.static_explicit_captures_len
+ }
+
+ /// Return true if and only if this HIR is a simple literal. This is
+ /// only true when this HIR expression is either itself a `Literal` or a
+ /// concatenation of only `Literal`s.
+ ///
+ /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and
+ /// the empty string are not (even though they contain sub-expressions that
+ /// are literals).
+ #[inline]
+ pub fn is_literal(&self) -> bool {
+ self.0.literal
+ }
+
+ /// Return true if and only if this HIR is either a simple literal or an
+ /// alternation of simple literals. This is only
+ /// true when this HIR expression is either itself a `Literal` or a
+ /// concatenation of only `Literal`s or an alternation of only `Literal`s.
+ ///
+ /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
+ /// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not
+ /// (even though they contain sub-expressions that are literals).
+ #[inline]
+ pub fn is_alternation_literal(&self) -> bool {
+ self.0.alternation_literal
+ }
+
+ /// Returns the total amount of heap memory usage, in bytes, used by this
+ /// `Properties` value.
+ #[inline]
+ pub fn memory_usage(&self) -> usize {
+ core::mem::size_of::<PropertiesI>()
+ }
+
+ /// Returns a new set of properties that corresponds to the union of the
+ /// iterator of properties given.
+ ///
+ /// This is useful when one has multiple `Hir` expressions and wants
+ /// to combine them into a single alternation without constructing the
+ /// corresponding `Hir`. This routine provides a way of combining the
+ /// properties of each `Hir` expression into one set of properties
+ /// representing the union of those expressions.
+ ///
+ /// # Example: union with HIRs that never match
+ ///
+ /// This example shows that unioning properties together with one that
+ /// represents a regex that never matches will "poison" certain attributes,
+ /// like the minimum and maximum lengths.
+ ///
+ /// ```
+ /// use regex_syntax::{hir::Properties, parse};
+ ///
+ /// let hir1 = parse("ab?c?")?;
+ /// assert_eq!(Some(1), hir1.properties().minimum_len());
+ /// assert_eq!(Some(3), hir1.properties().maximum_len());
+ ///
+ /// let hir2 = parse(r"[a&&b]")?;
+ /// assert_eq!(None, hir2.properties().minimum_len());
+ /// assert_eq!(None, hir2.properties().maximum_len());
+ ///
+ /// let hir3 = parse(r"wxy?z?")?;
+ /// assert_eq!(Some(2), hir3.properties().minimum_len());
+ /// assert_eq!(Some(4), hir3.properties().maximum_len());
+ ///
+ /// let unioned = Properties::union([
+ /// hir1.properties(),
+ /// hir2.properties(),
+ /// hir3.properties(),
+ /// ]);
+ /// assert_eq!(None, unioned.minimum_len());
+ /// assert_eq!(None, unioned.maximum_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// The maximum length can also be "poisoned" by a pattern that has no
+ /// upper bound on the length of a match. The minimum length remains
+ /// unaffected:
+ ///
+ /// ```
+ /// use regex_syntax::{hir::Properties, parse};
+ ///
+ /// let hir1 = parse("ab?c?")?;
+ /// assert_eq!(Some(1), hir1.properties().minimum_len());
+ /// assert_eq!(Some(3), hir1.properties().maximum_len());
+ ///
+ /// let hir2 = parse(r"a+")?;
+ /// assert_eq!(Some(1), hir2.properties().minimum_len());
+ /// assert_eq!(None, hir2.properties().maximum_len());
+ ///
+ /// let hir3 = parse(r"wxy?z?")?;
+ /// assert_eq!(Some(2), hir3.properties().minimum_len());
+ /// assert_eq!(Some(4), hir3.properties().maximum_len());
+ ///
+ /// let unioned = Properties::union([
+ /// hir1.properties(),
+ /// hir2.properties(),
+ /// hir3.properties(),
+ /// ]);
+ /// assert_eq!(Some(1), unioned.minimum_len());
+ /// assert_eq!(None, unioned.maximum_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn union<I, P>(props: I) -> Properties
+ where
+ I: IntoIterator<Item = P>,
+ P: core::borrow::Borrow<Properties>,
+ {
+ let mut it = props.into_iter().peekable();
+ // While empty alternations aren't possible, we still behave as if they
+ // are. When we have an empty alternate, then clearly the look-around
+ // prefix and suffix is empty. Otherwise, it is the intersection of all
+ // prefixes and suffixes (respectively) of the branches.
+ let fix = if it.peek().is_none() {
+ LookSet::empty()
+ } else {
+ LookSet::full()
+ };
+ // And also, an empty alternate means we have 0 static capture groups,
+ // but we otherwise start with the number corresponding to the first
+ // alternate. If any subsequent alternate has a different number of
+ // static capture groups, then we overall have a variation and not a
+ // static number of groups.
+ let static_explicit_captures_len =
+ it.peek().and_then(|p| p.borrow().static_explicit_captures_len());
+ // The base case is an empty alternation, which matches nothing.
+ // Note though that empty alternations aren't possible, because the
+ // Hir::alternation smart constructor rewrites those as empty character
+ // classes.
+ let mut props = PropertiesI {
+ minimum_len: None,
+ maximum_len: None,
+ look_set: LookSet::empty(),
+ look_set_prefix: fix,
+ look_set_suffix: fix,
+ look_set_prefix_any: LookSet::empty(),
+ look_set_suffix_any: LookSet::empty(),
+ utf8: true,
+ explicit_captures_len: 0,
+ static_explicit_captures_len,
+ literal: false,
+ alternation_literal: true,
+ };
+ let (mut min_poisoned, mut max_poisoned) = (false, false);
+ // Handle properties that need to visit every child hir.
+ for prop in it {
+ let p = prop.borrow();
+ props.look_set.set_union(p.look_set());
+ props.look_set_prefix.set_intersect(p.look_set_prefix());
+ props.look_set_suffix.set_intersect(p.look_set_suffix());
+ props.look_set_prefix_any.set_union(p.look_set_prefix_any());
+ props.look_set_suffix_any.set_union(p.look_set_suffix_any());
+ props.utf8 = props.utf8 && p.is_utf8();
+ props.explicit_captures_len = props
+ .explicit_captures_len
+ .saturating_add(p.explicit_captures_len());
+ if props.static_explicit_captures_len
+ != p.static_explicit_captures_len()
+ {
+ props.static_explicit_captures_len = None;
+ }
+ props.alternation_literal =
+ props.alternation_literal && p.is_literal();
+ if !min_poisoned {
+ if let Some(xmin) = p.minimum_len() {
+ if props.minimum_len.map_or(true, |pmin| xmin < pmin) {
+ props.minimum_len = Some(xmin);
+ }
+ } else {
+ props.minimum_len = None;
+ min_poisoned = true;
+ }
+ }
+ if !max_poisoned {
+ if let Some(xmax) = p.maximum_len() {
+ if props.maximum_len.map_or(true, |pmax| xmax > pmax) {
+ props.maximum_len = Some(xmax);
+ }
+ } else {
+ props.maximum_len = None;
+ max_poisoned = true;
+ }
+ }
}
+ Properties(Box::new(props))
+ }
+}
- fn $set_fn_name(&mut self, yes: bool) {
- if yes {
- self.bools |= 1 << $bit;
+impl Properties {
+ /// Create a new set of HIR properties for an empty regex.
+ fn empty() -> Properties {
+ let inner = PropertiesI {
+ minimum_len: Some(0),
+ maximum_len: Some(0),
+ look_set: LookSet::empty(),
+ look_set_prefix: LookSet::empty(),
+ look_set_suffix: LookSet::empty(),
+ look_set_prefix_any: LookSet::empty(),
+ look_set_suffix_any: LookSet::empty(),
+ // It is debatable whether an empty regex always matches at valid
+ // UTF-8 boundaries. Strictly speaking, at a byte oriented view,
+ // it is clearly false. There are, for example, many empty strings
+ // between the bytes encoding a '☃'.
+ //
+ // However, when Unicode mode is enabled, the fundamental atom
+ // of matching is really a codepoint. And in that scenario, an
+ // empty regex is defined to only match at valid UTF-8 boundaries
+ // and to never split a codepoint. It just so happens that this
+ // enforcement is somewhat tricky to do for regexes that match
+ // the empty string inside regex engines themselves. It usually
+ // requires some layer above the regex engine to filter out such
+ // matches.
+ //
+ // In any case, 'true' is really the only coherent option. If it
+ // were false, for example, then 'a*' would also need to be false
+ // since it too can match the empty string.
+ utf8: true,
+ explicit_captures_len: 0,
+ static_explicit_captures_len: Some(0),
+ literal: false,
+ alternation_literal: false,
+ };
+ Properties(Box::new(inner))
+ }
+
+ /// Create a new set of HIR properties for a literal regex.
+ fn literal(lit: &Literal) -> Properties {
+ let inner = PropertiesI {
+ minimum_len: Some(lit.0.len()),
+ maximum_len: Some(lit.0.len()),
+ look_set: LookSet::empty(),
+ look_set_prefix: LookSet::empty(),
+ look_set_suffix: LookSet::empty(),
+ look_set_prefix_any: LookSet::empty(),
+ look_set_suffix_any: LookSet::empty(),
+ utf8: core::str::from_utf8(&lit.0).is_ok(),
+ explicit_captures_len: 0,
+ static_explicit_captures_len: Some(0),
+ literal: true,
+ alternation_literal: true,
+ };
+ Properties(Box::new(inner))
+ }
+
+ /// Create a new set of HIR properties for a character class.
+ fn class(class: &Class) -> Properties {
+ let inner = PropertiesI {
+ minimum_len: class.minimum_len(),
+ maximum_len: class.maximum_len(),
+ look_set: LookSet::empty(),
+ look_set_prefix: LookSet::empty(),
+ look_set_suffix: LookSet::empty(),
+ look_set_prefix_any: LookSet::empty(),
+ look_set_suffix_any: LookSet::empty(),
+ utf8: class.is_utf8(),
+ explicit_captures_len: 0,
+ static_explicit_captures_len: Some(0),
+ literal: false,
+ alternation_literal: false,
+ };
+ Properties(Box::new(inner))
+ }
+
+ /// Create a new set of HIR properties for a look-around assertion.
+ fn look(look: Look) -> Properties {
+ let inner = PropertiesI {
+ minimum_len: Some(0),
+ maximum_len: Some(0),
+ look_set: LookSet::singleton(look),
+ look_set_prefix: LookSet::singleton(look),
+ look_set_suffix: LookSet::singleton(look),
+ look_set_prefix_any: LookSet::singleton(look),
+ look_set_suffix_any: LookSet::singleton(look),
+ // This requires a little explanation. Basically, we don't consider
+ // matching an empty string to be equivalent to matching invalid
+ // UTF-8, even though technically matching every empty string will
+ // split the UTF-8 encoding of a single codepoint when treating a
+ // UTF-8 encoded string as a sequence of bytes. Our defense here is
+ // that in such a case, a codepoint should logically be treated as
+ // the fundamental atom for matching, and thus the only valid match
+ // points are between codepoints and not bytes.
+ //
+ // More practically, this is true here because it's also true
+ // for 'Hir::empty()', otherwise something like 'a*' would be
+ // considered to match invalid UTF-8. That in turn makes this
+ // property borderline useless.
+ utf8: true,
+ explicit_captures_len: 0,
+ static_explicit_captures_len: Some(0),
+ literal: false,
+ alternation_literal: false,
+ };
+ Properties(Box::new(inner))
+ }
+
+ /// Create a new set of HIR properties for a repetition.
+ fn repetition(rep: &Repetition) -> Properties {
+ let p = rep.sub.properties();
+ let minimum_len = p.minimum_len().map(|child_min| {
+ let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX);
+ child_min.saturating_mul(rep_min)
+ });
+ let maximum_len = rep.max.and_then(|rep_max| {
+ let rep_max = usize::try_from(rep_max).ok()?;
+ let child_max = p.maximum_len()?;
+ child_max.checked_mul(rep_max)
+ });
+
+ let mut inner = PropertiesI {
+ minimum_len,
+ maximum_len,
+ look_set: p.look_set(),
+ look_set_prefix: LookSet::empty(),
+ look_set_suffix: LookSet::empty(),
+ look_set_prefix_any: p.look_set_prefix_any(),
+ look_set_suffix_any: p.look_set_suffix_any(),
+ utf8: p.is_utf8(),
+ explicit_captures_len: p.explicit_captures_len(),
+ static_explicit_captures_len: p.static_explicit_captures_len(),
+ literal: false,
+ alternation_literal: false,
+ };
+ // If the repetition operator can match the empty string, then its
+ // lookset prefix and suffixes themselves remain empty since they are
+ // no longer required to match.
+ if rep.min > 0 {
+ inner.look_set_prefix = p.look_set_prefix();
+ inner.look_set_suffix = p.look_set_suffix();
+ }
+ // If the static captures len of the sub-expression is not known or is
+ // zero, then it automatically propagates to the repetition, regardless
+ // of the repetition. Otherwise, it might change, but only when the
+ // repetition can match 0 times.
+ if rep.min == 0
+ && inner.static_explicit_captures_len.map_or(false, |len| len > 0)
+ {
+ // If we require a match 0 times, then our captures len is
+ // guaranteed to be zero. Otherwise, if we *can* match the empty
+ // string, then it's impossible to know how many captures will be
+ // in the resulting match.
+ if rep.max == Some(0) {
+ inner.static_explicit_captures_len = Some(0);
} else {
- self.bools &= !(1 << $bit);
+ inner.static_explicit_captures_len = None;
}
}
- };
+ Properties(Box::new(inner))
+ }
+
+ /// Create a new set of HIR properties for a capture.
+ fn capture(capture: &Capture) -> Properties {
+ let p = capture.sub.properties();
+ Properties(Box::new(PropertiesI {
+ explicit_captures_len: p.explicit_captures_len().saturating_add(1),
+ static_explicit_captures_len: p
+ .static_explicit_captures_len()
+ .map(|len| len.saturating_add(1)),
+ literal: false,
+ alternation_literal: false,
+ ..*p.0.clone()
+ }))
+ }
+
+ /// Create a new set of HIR properties for a concatenation.
+ fn concat(concat: &[Hir]) -> Properties {
+ // The base case is an empty concatenation, which matches the empty
+ // string. Note though that empty concatenations aren't possible,
+ // because the Hir::concat smart constructor rewrites those as
+ // Hir::empty.
+ let mut props = PropertiesI {
+ minimum_len: Some(0),
+ maximum_len: Some(0),
+ look_set: LookSet::empty(),
+ look_set_prefix: LookSet::empty(),
+ look_set_suffix: LookSet::empty(),
+ look_set_prefix_any: LookSet::empty(),
+ look_set_suffix_any: LookSet::empty(),
+ utf8: true,
+ explicit_captures_len: 0,
+ static_explicit_captures_len: Some(0),
+ literal: true,
+ alternation_literal: true,
+ };
+ // Handle properties that need to visit every child hir.
+ for x in concat.iter() {
+ let p = x.properties();
+ props.look_set.set_union(p.look_set());
+ props.utf8 = props.utf8 && p.is_utf8();
+ props.explicit_captures_len = props
+ .explicit_captures_len
+ .saturating_add(p.explicit_captures_len());
+ props.static_explicit_captures_len = p
+ .static_explicit_captures_len()
+ .and_then(|len1| {
+ Some((len1, props.static_explicit_captures_len?))
+ })
+ .and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
+ props.literal = props.literal && p.is_literal();
+ props.alternation_literal =
+ props.alternation_literal && p.is_alternation_literal();
+ if let Some(ref mut minimum_len) = props.minimum_len {
+ match p.minimum_len() {
+ None => props.minimum_len = None,
+ Some(len) => *minimum_len += len,
+ }
+ }
+ if let Some(ref mut maximum_len) = props.maximum_len {
+ match p.maximum_len() {
+ None => props.maximum_len = None,
+ Some(len) => *maximum_len += len,
+ }
+ }
+ }
+ // Handle the prefix properties, which only requires visiting
+ // child exprs until one matches more than the empty string.
+ let mut it = concat.iter();
+ while let Some(x) = it.next() {
+ props.look_set_prefix.set_union(x.properties().look_set_prefix());
+ props
+ .look_set_prefix_any
+ .set_union(x.properties().look_set_prefix_any());
+ if x.properties().maximum_len().map_or(true, |x| x > 0) {
+ break;
+ }
+ }
+ // Same thing for the suffix properties, but in reverse.
+ let mut it = concat.iter().rev();
+ while let Some(x) = it.next() {
+ props.look_set_suffix.set_union(x.properties().look_set_suffix());
+ props
+ .look_set_suffix_any
+ .set_union(x.properties().look_set_suffix_any());
+ if x.properties().maximum_len().map_or(true, |x| x > 0) {
+ break;
+ }
+ }
+ Properties(Box::new(props))
+ }
+
+ /// Create a new set of HIR properties for an alternation.
+ fn alternation(alts: &[Hir]) -> Properties {
+ Properties::union(alts.iter().map(|hir| hir.properties()))
+ }
}
-impl HirInfo {
- fn new() -> HirInfo {
- HirInfo { bools: 0 }
- }
-
- define_bool!(0, is_always_utf8, set_always_utf8);
- define_bool!(1, is_all_assertions, set_all_assertions);
- define_bool!(2, is_anchored_start, set_anchored_start);
- define_bool!(3, is_anchored_end, set_anchored_end);
- define_bool!(4, is_line_anchored_start, set_line_anchored_start);
- define_bool!(5, is_line_anchored_end, set_line_anchored_end);
- define_bool!(6, is_any_anchored_start, set_any_anchored_start);
- define_bool!(7, is_any_anchored_end, set_any_anchored_end);
- define_bool!(8, is_match_empty, set_match_empty);
- define_bool!(9, is_literal, set_literal);
- define_bool!(10, is_alternation_literal, set_alternation_literal);
+/// A set of look-around assertions.
+///
+/// This is useful for efficiently tracking look-around assertions. For
+/// example, an [`Hir`] provides properties that return `LookSet`s.
+#[derive(Clone, Copy, Default, Eq, PartialEq)]
+pub struct LookSet {
+ /// The underlying representation of this set is exposed to make it
+ /// possible to store it somewhere efficiently. The representation is that
+ /// of a bitset, where each assertion occupies bit `i` where `i =
+ /// Look::as_repr()`.
+ ///
+ /// Note that users of this internal representation must permit the full
+ /// range of `u16` values to be represented. For example, even if the
+ /// current implementation only makes use of the 10 least significant bits,
+ /// it may use more bits in a future semver compatible release.
+ pub bits: u16,
+}
+
+impl LookSet {
+ /// Create an empty set of look-around assertions.
+ #[inline]
+ pub fn empty() -> LookSet {
+ LookSet { bits: 0 }
+ }
+
+ /// Create a full set of look-around assertions.
+ ///
+ /// This set contains all possible look-around assertions.
+ #[inline]
+ pub fn full() -> LookSet {
+ LookSet { bits: !0 }
+ }
+
+ /// Create a look-around set containing the look-around assertion given.
+ ///
+ /// This is a convenience routine for creating an empty set and inserting
+ /// one look-around assertion.
+ #[inline]
+ pub fn singleton(look: Look) -> LookSet {
+ LookSet::empty().insert(look)
+ }
+
+ /// Returns the total number of look-around assertions in this set.
+ #[inline]
+ pub fn len(self) -> usize {
+ // OK because max value always fits in a u8, which in turn always
+ // fits in a usize, regardless of target.
+ usize::try_from(self.bits.count_ones()).unwrap()
+ }
+
+ /// Returns true if and only if this set is empty.
+ #[inline]
+ pub fn is_empty(self) -> bool {
+ self.len() == 0
+ }
+
+ /// Returns true if and only if the given look-around assertion is in this
+ /// set.
+ #[inline]
+ pub fn contains(self, look: Look) -> bool {
+ self.bits & look.as_repr() != 0
+ }
+
+ /// Returns true if and only if this set contains any anchor assertions.
+ /// This includes both "start/end of haystack" and "start/end of line."
+ #[inline]
+ pub fn contains_anchor(&self) -> bool {
+ self.contains_anchor_haystack() || self.contains_anchor_line()
+ }
+
+ /// Returns true if and only if this set contains any "start/end of
+ /// haystack" anchors. This doesn't include "start/end of line" anchors.
+ #[inline]
+ pub fn contains_anchor_haystack(&self) -> bool {
+ self.contains(Look::Start) || self.contains(Look::End)
+ }
+
+ /// Returns true if and only if this set contains any "start/end of line"
+ /// anchors. This doesn't include "start/end of haystack" anchors. This
+ /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
+ #[inline]
+ pub fn contains_anchor_line(&self) -> bool {
+ self.contains(Look::StartLF)
+ || self.contains(Look::EndLF)
+ || self.contains(Look::StartCRLF)
+ || self.contains(Look::EndCRLF)
+ }
+
+ /// Returns true if and only if this set contains any "start/end of line"
+ /// anchors that only treat `\n` as line terminators. This does not include
+ /// haystack anchors or CRLF aware line anchors.
+ #[inline]
+ pub fn contains_anchor_lf(&self) -> bool {
+ self.contains(Look::StartLF) || self.contains(Look::EndLF)
+ }
+
+ /// Returns true if and only if this set contains any "start/end of line"
+ /// anchors that are CRLF-aware. This doesn't include "start/end of
+ /// haystack" or "start/end of line-feed" anchors.
+ #[inline]
+ pub fn contains_anchor_crlf(&self) -> bool {
+ self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
+ }
+
+ /// Returns true if and only if this set contains any word boundary or
+ /// negated word boundary assertions. This includes both Unicode and ASCII
+ /// word boundaries.
+ #[inline]
+ pub fn contains_word(self) -> bool {
+ self.contains_word_unicode() || self.contains_word_ascii()
+ }
+
+ /// Returns true if and only if this set contains any Unicode word boundary
+ /// or negated Unicode word boundary assertions.
+ #[inline]
+ pub fn contains_word_unicode(self) -> bool {
+ self.contains(Look::WordUnicode)
+ || self.contains(Look::WordUnicodeNegate)
+ }
+
+ /// Returns true if and only if this set contains any ASCII word boundary
+ /// or negated ASCII word boundary assertions.
+ #[inline]
+ pub fn contains_word_ascii(self) -> bool {
+ self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate)
+ }
+
+ /// Returns an iterator over all of the look-around assertions in this set.
+ #[inline]
+ pub fn iter(self) -> LookSetIter {
+ LookSetIter { set: self }
+ }
+
+ /// Return a new set that is equivalent to the original, but with the given
+ /// assertion added to it. If the assertion is already in the set, then the
+ /// returned set is equivalent to the original.
+ #[inline]
+ pub fn insert(self, look: Look) -> LookSet {
+ LookSet { bits: self.bits | look.as_repr() }
+ }
+
+ /// Updates this set in place with the result of inserting the given
+ /// assertion into this set.
+ #[inline]
+ pub fn set_insert(&mut self, look: Look) {
+ *self = self.insert(look);
+ }
+
+ /// Return a new set that is equivalent to the original, but with the given
+ /// assertion removed from it. If the assertion is not in the set, then the
+ /// returned set is equivalent to the original.
+ #[inline]
+ pub fn remove(self, look: Look) -> LookSet {
+ LookSet { bits: self.bits & !look.as_repr() }
+ }
+
+ /// Updates this set in place with the result of removing the given
+ /// assertion from this set.
+ #[inline]
+ pub fn set_remove(&mut self, look: Look) {
+ *self = self.remove(look);
+ }
+
+ /// Returns a new set that is the result of subtracting the given set from
+ /// this set.
+ #[inline]
+ pub fn subtract(self, other: LookSet) -> LookSet {
+ LookSet { bits: self.bits & !other.bits }
+ }
+
+ /// Updates this set in place with the result of subtracting the given set
+ /// from this set.
+ #[inline]
+ pub fn set_subtract(&mut self, other: LookSet) {
+ *self = self.subtract(other);
+ }
+
+ /// Returns a new set that is the union of this and the one given.
+ #[inline]
+ pub fn union(self, other: LookSet) -> LookSet {
+ LookSet { bits: self.bits | other.bits }
+ }
+
+ /// Updates this set in place with the result of unioning it with the one
+ /// given.
+ #[inline]
+ pub fn set_union(&mut self, other: LookSet) {
+ *self = self.union(other);
+ }
+
+ /// Returns a new set that is the intersection of this and the one given.
+ #[inline]
+ pub fn intersect(self, other: LookSet) -> LookSet {
+ LookSet { bits: self.bits & other.bits }
+ }
+
+ /// Updates this set in place with the result of intersecting it with the
+ /// one given.
+ #[inline]
+ pub fn set_intersect(&mut self, other: LookSet) {
+ *self = self.intersect(other);
+ }
+
+ /// Return a `LookSet` from the slice given as a native endian 16-bit
+ /// integer.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `slice.len() < 2`.
+ #[inline]
+ pub fn read_repr(slice: &[u8]) -> LookSet {
+ let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap());
+ LookSet { bits }
+ }
+
+ /// Write a `LookSet` as a native endian 16-bit integer to the beginning
+ /// of the slice given.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `slice.len() < 2`.
+ #[inline]
+ pub fn write_repr(self, slice: &mut [u8]) {
+ let raw = self.bits.to_ne_bytes();
+ slice[0] = raw[0];
+ slice[1] = raw[1];
+ }
+}
+
+impl core::fmt::Debug for LookSet {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ if self.is_empty() {
+ return write!(f, "∅");
+ }
+ for look in self.iter() {
+ write!(f, "{}", look.as_char())?;
+ }
+ Ok(())
+ }
+}
+
+/// An iterator over all look-around assertions in a [`LookSet`].
+///
+/// This iterator is created by [`LookSet::iter`].
+#[derive(Clone, Debug)]
+pub struct LookSetIter {
+ set: LookSet,
+}
+
+impl Iterator for LookSetIter {
+ type Item = Look;
+
+ #[inline]
+ fn next(&mut self) -> Option<Look> {
+ if self.set.is_empty() {
+ return None;
+ }
+ // We'll never have more than u8::MAX distinct look-around assertions,
+ // so 'repr' will always fit into a u16.
+ let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
+ let look = Look::from_repr(1 << repr)?;
+ self.set = self.set.remove(look);
+ Some(look)
+ }
+}
+
+/// Given a sequence of HIR values where each value corresponds to a Unicode
+/// class (or an all-ASCII byte class), return a single Unicode class
+/// corresponding to the union of the classes found.
+fn class_chars(hirs: &[Hir]) -> Option<Class> {
+ let mut cls = ClassUnicode::new(vec![]);
+ for hir in hirs.iter() {
+ match *hir.kind() {
+ HirKind::Class(Class::Unicode(ref cls2)) => {
+ cls.union(cls2);
+ }
+ HirKind::Class(Class::Bytes(ref cls2)) => {
+ cls.union(&cls2.to_unicode_class()?);
+ }
+ _ => return None,
+ };
+ }
+ Some(Class::Unicode(cls))
+}
+
+/// Given a sequence of HIR values where each value corresponds to a byte class
+/// (or an all-ASCII Unicode class), return a single byte class corresponding
+/// to the union of the classes found.
+fn class_bytes(hirs: &[Hir]) -> Option<Class> {
+ let mut cls = ClassBytes::new(vec![]);
+ for hir in hirs.iter() {
+ match *hir.kind() {
+ HirKind::Class(Class::Unicode(ref cls2)) => {
+ cls.union(&cls2.to_byte_class()?);
+ }
+ HirKind::Class(Class::Bytes(ref cls2)) => {
+ cls.union(cls2);
+ }
+ _ => return None,
+ };
+ }
+ Some(Class::Bytes(cls))
+}
+
+/// Given a sequence of HIR values where each value corresponds to a literal
+/// that is a single `char`, return that sequence of `char`s. Otherwise return
+/// None. No deduplication is done.
+fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> {
+ let mut singletons = vec![];
+ for hir in hirs.iter() {
+ let literal = match *hir.kind() {
+ HirKind::Literal(Literal(ref bytes)) => bytes,
+ _ => return None,
+ };
+ let ch = match crate::debug::utf8_decode(literal) {
+ None => return None,
+ Some(Err(_)) => return None,
+ Some(Ok(ch)) => ch,
+ };
+ if literal.len() != ch.len_utf8() {
+ return None;
+ }
+ singletons.push(ch);
+ }
+ Some(singletons)
+}
+
+/// Given a sequence of HIR values where each value corresponds to a literal
+/// that is a single byte, return that sequence of bytes. Otherwise return
+/// None. No deduplication is done.
+fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> {
+ let mut singletons = vec![];
+ for hir in hirs.iter() {
+ let literal = match *hir.kind() {
+ HirKind::Literal(Literal(ref bytes)) => bytes,
+ _ => return None,
+ };
+ if literal.len() != 1 {
+ return None;
+ }
+ singletons.push(literal[0]);
+ }
+ Some(singletons)
+}
+
+/// Looks for a common prefix in the list of alternation branches given. If one
+/// is found, then an equivalent but (hopefully) simplified Hir is returned.
+/// Otherwise, the original given list of branches is returned unmodified.
+///
+/// This is not quite as good as it could be. Right now, it requires that
+/// all branches are 'Concat' expressions. It also doesn't do well with
+/// literals. For example, given 'foofoo|foobar', it will not refactor it to
+/// 'foo(?:foo|bar)' because literals are flattened into their own special
+/// concatenation. (One wonders if perhaps 'Literal' should be a single atom
+/// instead of a string of bytes because of this. Otherwise, handling the
+/// current representation in this routine will be pretty gnarly. Sigh.)
+fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
+ if hirs.len() <= 1 {
+ return Err(hirs);
+ }
+ let mut prefix = match hirs[0].kind() {
+ HirKind::Concat(ref xs) => &**xs,
+ _ => return Err(hirs),
+ };
+ if prefix.is_empty() {
+ return Err(hirs);
+ }
+ for h in hirs.iter().skip(1) {
+ let concat = match h.kind() {
+ HirKind::Concat(ref xs) => xs,
+ _ => return Err(hirs),
+ };
+ let common_len = prefix
+ .iter()
+ .zip(concat.iter())
+ .take_while(|(x, y)| x == y)
+ .count();
+ prefix = &prefix[..common_len];
+ if prefix.is_empty() {
+ return Err(hirs);
+ }
+ }
+ let len = prefix.len();
+ assert_ne!(0, len);
+ let mut prefix_concat = vec![];
+ let mut suffix_alts = vec![];
+ for h in hirs {
+ let mut concat = match h.into_kind() {
+ HirKind::Concat(xs) => xs,
+ // We required all sub-expressions to be
+ // concats above, so we're only here if we
+ // have a concat.
+ _ => unreachable!(),
+ };
+ suffix_alts.push(Hir::concat(concat.split_off(len)));
+ if prefix_concat.is_empty() {
+ prefix_concat = concat;
+ }
+ }
+ let mut concat = prefix_concat;
+ concat.push(Hir::alternation(suffix_alts));
+ Ok(Hir::concat(concat))
}
#[cfg(test)]
@@ -2244,12 +3653,6 @@ mod tests {
assert_eq!(expected, bsymdifference(&cls1, &cls2));
}
- #[test]
- #[should_panic]
- fn hir_byte_literal_non_ascii() {
- Hir::literal(Literal::Byte(b'a'));
- }
-
// We use a thread with an explicit stack size to test that our destructor
// for Hir can handle arbitrarily sized expressions in constant stack
// space. In case we run on a platform without threads (WASM?), we limit
@@ -2262,35 +3665,67 @@ mod tests {
let run = || {
let mut expr = Hir::empty();
for _ in 0..100 {
- expr = Hir::group(Group {
- kind: GroupKind::NonCapturing,
- hir: Box::new(expr),
+ expr = Hir::capture(Capture {
+ index: 1,
+ name: None,
+ sub: Box::new(expr),
});
expr = Hir::repetition(Repetition {
- kind: RepetitionKind::ZeroOrOne,
+ min: 0,
+ max: Some(1),
greedy: true,
- hir: Box::new(expr),
+ sub: Box::new(expr),
});
expr = Hir {
kind: HirKind::Concat(vec![expr]),
- info: HirInfo::new(),
+ props: Properties::empty(),
};
expr = Hir {
kind: HirKind::Alternation(vec![expr]),
- info: HirInfo::new(),
+ props: Properties::empty(),
};
}
- assert!(!expr.kind.is_empty());
+ assert!(!matches!(*expr.kind(), HirKind::Empty));
};
// We run our test on a thread with a small stack size so we can
// force the issue more easily.
+ //
+ // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
+ // for context on the specific stack size chosen here.
thread::Builder::new()
- .stack_size(1 << 10)
+ .stack_size(16 << 10)
.spawn(run)
.unwrap()
.join()
.unwrap();
}
+
+ #[test]
+ fn look_set_iter() {
+ let set = LookSet::empty();
+ assert_eq!(0, set.iter().count());
+
+ let set = LookSet::full();
+ assert_eq!(10, set.iter().count());
+
+ let set =
+ LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
+ assert_eq!(2, set.iter().count());
+
+ let set = LookSet::empty().insert(Look::StartLF);
+ assert_eq!(1, set.iter().count());
+
+ let set = LookSet::empty().insert(Look::WordAsciiNegate);
+ assert_eq!(1, set.iter().count());
+ }
+
+ #[test]
+ fn look_set_debug() {
+ let res = format!("{:?}", LookSet::empty());
+ assert_eq!("∅", res);
+ let res = format!("{:?}", LookSet::full());
+ assert_eq!("Az^$rRbB𝛃𝚩", res);
+ }
}
diff --git a/vendor/regex-syntax/src/hir/print.rs b/vendor/regex-syntax/src/hir/print.rs
index b71f3897c..fcb7cd252 100644
--- a/vendor/regex-syntax/src/hir/print.rs
+++ b/vendor/regex-syntax/src/hir/print.rs
@@ -2,11 +2,16 @@
This module provides a regular expression printer for `Hir`.
*/
-use std::fmt;
+use core::fmt;
-use crate::hir::visitor::{self, Visitor};
-use crate::hir::{self, Hir, HirKind};
-use crate::is_meta_character;
+use crate::{
+ hir::{
+ self,
+ visitor::{self, Visitor},
+ Hir, HirKind,
+ },
+ is_meta_character,
+};
/// A builder for constructing a printer.
///
@@ -84,21 +89,54 @@ impl<W: fmt::Write> Visitor for Writer<W> {
fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
match *hir.kind() {
- HirKind::Empty
- | HirKind::Repetition(_)
- | HirKind::Concat(_)
- | HirKind::Alternation(_) => {}
- HirKind::Literal(hir::Literal::Unicode(c)) => {
- self.write_literal_char(c)?;
- }
- HirKind::Literal(hir::Literal::Byte(b)) => {
- self.write_literal_byte(b)?;
+ // Empty is represented by nothing in the concrete syntax, and
+ // repetition operators are strictly suffix oriented.
+ HirKind::Empty | HirKind::Repetition(_) => {}
+ HirKind::Literal(hir::Literal(ref bytes)) => {
+ // See the comment on the 'Concat' and 'Alternation' case below
+ // for why we put parens here. Literals are, conceptually,
+ // a special case of concatenation where each element is a
+ // character. The HIR flattens this into a Box<[u8]>, but we
+ // still need to treat it like a concatenation for correct
+ // printing. As a special case, we don't write parens if there
+ // is only one character. One character means there is no
+ // concat so we don't need parens. Adding parens would still be
+ // correct, but we drop them here because it tends to create
+ // rather noisy regexes even in simple cases.
+ let result = core::str::from_utf8(bytes);
+ let len = result.map_or(bytes.len(), |s| s.chars().count());
+ if len > 1 {
+ self.wtr.write_str(r"(?:")?;
+ }
+ match result {
+ Ok(string) => {
+ for c in string.chars() {
+ self.write_literal_char(c)?;
+ }
+ }
+ Err(_) => {
+ for &b in bytes.iter() {
+ self.write_literal_byte(b)?;
+ }
+ }
+ }
+ if len > 1 {
+ self.wtr.write_str(r")")?;
+ }
}
HirKind::Class(hir::Class::Unicode(ref cls)) => {
+ if cls.ranges().is_empty() {
+ return self.wtr.write_str("[a&&b]");
+ }
self.wtr.write_str("[")?;
for range in cls.iter() {
if range.start() == range.end() {
self.write_literal_char(range.start())?;
+ } else if u32::from(range.start()) + 1
+ == u32::from(range.end())
+ {
+ self.write_literal_char(range.start())?;
+ self.write_literal_char(range.end())?;
} else {
self.write_literal_char(range.start())?;
self.wtr.write_str("-")?;
@@ -108,10 +146,16 @@ impl<W: fmt::Write> Visitor for Writer<W> {
self.wtr.write_str("]")?;
}
HirKind::Class(hir::Class::Bytes(ref cls)) => {
+ if cls.ranges().is_empty() {
+ return self.wtr.write_str("[a&&b]");
+ }
self.wtr.write_str("(?-u:[")?;
for range in cls.iter() {
if range.start() == range.end() {
self.write_literal_class_byte(range.start())?;
+ } else if range.start() + 1 == range.end() {
+ self.write_literal_class_byte(range.start())?;
+ self.write_literal_class_byte(range.end())?;
} else {
self.write_literal_class_byte(range.start())?;
self.wtr.write_str("-")?;
@@ -120,41 +164,60 @@ impl<W: fmt::Write> Visitor for Writer<W> {
}
self.wtr.write_str("])")?;
}
- HirKind::Anchor(hir::Anchor::StartLine) => {
- self.wtr.write_str("(?m:^)")?;
- }
- HirKind::Anchor(hir::Anchor::EndLine) => {
- self.wtr.write_str("(?m:$)")?;
- }
- HirKind::Anchor(hir::Anchor::StartText) => {
- self.wtr.write_str(r"\A")?;
- }
- HirKind::Anchor(hir::Anchor::EndText) => {
- self.wtr.write_str(r"\z")?;
- }
- HirKind::WordBoundary(hir::WordBoundary::Unicode) => {
- self.wtr.write_str(r"\b")?;
- }
- HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => {
- self.wtr.write_str(r"\B")?;
- }
- HirKind::WordBoundary(hir::WordBoundary::Ascii) => {
- self.wtr.write_str(r"(?-u:\b)")?;
- }
- HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => {
- self.wtr.write_str(r"(?-u:\B)")?;
- }
- HirKind::Group(ref x) => match x.kind {
- hir::GroupKind::CaptureIndex(_) => {
- self.wtr.write_str("(")?;
+ HirKind::Look(ref look) => match *look {
+ hir::Look::Start => {
+ self.wtr.write_str(r"\A")?;
+ }
+ hir::Look::End => {
+ self.wtr.write_str(r"\z")?;
+ }
+ hir::Look::StartLF => {
+ self.wtr.write_str("(?m:^)")?;
+ }
+ hir::Look::EndLF => {
+ self.wtr.write_str("(?m:$)")?;
+ }
+ hir::Look::StartCRLF => {
+ self.wtr.write_str("(?mR:^)")?;
}
- hir::GroupKind::CaptureName { ref name, .. } => {
- write!(self.wtr, "(?P<{}>", name)?;
+ hir::Look::EndCRLF => {
+ self.wtr.write_str("(?mR:$)")?;
}
- hir::GroupKind::NonCapturing => {
- self.wtr.write_str("(?:")?;
+ hir::Look::WordAscii => {
+ self.wtr.write_str(r"(?-u:\b)")?;
+ }
+ hir::Look::WordAsciiNegate => {
+ self.wtr.write_str(r"(?-u:\B)")?;
+ }
+ hir::Look::WordUnicode => {
+ self.wtr.write_str(r"\b")?;
+ }
+ hir::Look::WordUnicodeNegate => {
+ self.wtr.write_str(r"\B")?;
}
},
+ HirKind::Capture(hir::Capture { ref name, .. }) => {
+ self.wtr.write_str("(")?;
+ if let Some(ref name) = *name {
+ write!(self.wtr, "?P<{}>", name)?;
+ }
+ }
+ // Why do this? Wrapping concats and alts in non-capturing groups
+ // is not *always* necessary, but is sometimes necessary. For
+ // example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)'
+ // and not 'ab|c'. The former is clearly the intended meaning, but
+ // the latter is actually 'alt(concat(a, b), c)'.
+ //
+ // It would be possible to only group these things in cases where
+ // it's strictly necessary, but it requires knowing the parent
+ // expression. And since this technique is simpler and always
+ // correct, we take this route. More to the point, it is a non-goal
+ // of an HIR printer to show a nice easy-to-read regex. Indeed,
+ // its construction forbids it from doing so. Therefore, inserting
+ // extra groups where they aren't necessary is perfectly okay.
+ HirKind::Concat(_) | HirKind::Alternation(_) => {
+ self.wtr.write_str(r"(?:")?;
+ }
}
Ok(())
}
@@ -165,39 +228,42 @@ impl<W: fmt::Write> Visitor for Writer<W> {
HirKind::Empty
| HirKind::Literal(_)
| HirKind::Class(_)
- | HirKind::Anchor(_)
- | HirKind::WordBoundary(_)
- | HirKind::Concat(_)
- | HirKind::Alternation(_) => {}
+ | HirKind::Look(_) => {}
HirKind::Repetition(ref x) => {
- match x.kind {
- hir::RepetitionKind::ZeroOrOne => {
+ match (x.min, x.max) {
+ (0, Some(1)) => {
self.wtr.write_str("?")?;
}
- hir::RepetitionKind::ZeroOrMore => {
+ (0, None) => {
self.wtr.write_str("*")?;
}
- hir::RepetitionKind::OneOrMore => {
+ (1, None) => {
self.wtr.write_str("+")?;
}
- hir::RepetitionKind::Range(ref x) => match *x {
- hir::RepetitionRange::Exactly(m) => {
- write!(self.wtr, "{{{}}}", m)?;
- }
- hir::RepetitionRange::AtLeast(m) => {
- write!(self.wtr, "{{{},}}", m)?;
- }
- hir::RepetitionRange::Bounded(m, n) => {
- write!(self.wtr, "{{{},{}}}", m, n)?;
- }
- },
+ (1, Some(1)) => {
+ // 'a{1}' and 'a{1}?' are exactly equivalent to 'a'.
+ return Ok(());
+ }
+ (m, None) => {
+ write!(self.wtr, "{{{},}}", m)?;
+ }
+ (m, Some(n)) if m == n => {
+ write!(self.wtr, "{{{}}}", m)?;
+ // a{m} and a{m}? are always exactly equivalent.
+ return Ok(());
+ }
+ (m, Some(n)) => {
+ write!(self.wtr, "{{{},{}}}", m, n)?;
+ }
}
if !x.greedy {
self.wtr.write_str("?")?;
}
}
- HirKind::Group(_) => {
- self.wtr.write_str(")")?;
+ HirKind::Capture(_)
+ | HirKind::Concat(_)
+ | HirKind::Alternation(_) => {
+ self.wtr.write_str(r")")?;
}
}
Ok(())
@@ -217,18 +283,16 @@ impl<W: fmt::Write> Writer<W> {
}
fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
- let c = b as char;
- if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
- self.write_literal_char(c)
+ if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() {
+ self.write_literal_char(char::try_from(b).unwrap())
} else {
write!(self.wtr, "(?-u:\\x{:02X})", b)
}
}
fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
- let c = b as char;
- if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
- self.write_literal_char(c)
+ if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() {
+ self.write_literal_char(char::try_from(b).unwrap())
} else {
write!(self.wtr, "\\x{:02X}", b)
}
@@ -237,15 +301,21 @@ impl<W: fmt::Write> Writer<W> {
#[cfg(test)]
mod tests {
- use super::Printer;
+ use alloc::{
+ boxed::Box,
+ string::{String, ToString},
+ };
+
use crate::ParserBuilder;
+ use super::*;
+
fn roundtrip(given: &str, expected: &str) {
roundtrip_with(|b| b, given, expected);
}
fn roundtrip_bytes(given: &str, expected: &str) {
- roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected);
+ roundtrip_with(|b| b.utf8(false), given, expected);
}
fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
@@ -277,28 +347,35 @@ mod tests {
#[test]
fn print_class() {
- roundtrip(r"[a]", r"[a]");
+ roundtrip(r"[a]", r"a");
+ roundtrip(r"[ab]", r"[ab]");
roundtrip(r"[a-z]", r"[a-z]");
roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]");
- roundtrip(r"[^\x01-\u{10FFFF}]", "[\u{0}]");
- roundtrip(r"[-]", r"[\-]");
+ roundtrip(r"[^\x01-\u{10FFFF}]", "\u{0}");
+ roundtrip(r"[-]", r"\-");
roundtrip(r"[☃-⛄]", r"[☃-⛄]");
- roundtrip(r"(?-u)[a]", r"(?-u:[a])");
+ roundtrip(r"(?-u)[a]", r"a");
+ roundtrip(r"(?-u)[ab]", r"(?-u:[ab])");
roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");
// The following test that the printer escapes meta characters
// in character classes.
- roundtrip(r"[\[]", r"[\[]");
+ roundtrip(r"[\[]", r"\[");
roundtrip(r"[Z-_]", r"[Z-_]");
roundtrip(r"[Z-_--Z]", r"[\[-_]");
// The following test that the printer escapes meta characters
// in byte oriented character classes.
- roundtrip_bytes(r"(?-u)[\[]", r"(?-u:[\[])");
+ roundtrip_bytes(r"(?-u)[\[]", r"\[");
roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])");
roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])");
+
+ // This tests that an empty character class is correctly roundtripped.
+ #[cfg(feature = "unicode-gencat")]
+ roundtrip(r"\P{any}", r"[a&&b]");
+ roundtrip_bytes(r"(?-u)[^\x00-\xFF]", r"[a&&b]");
}
#[test]
@@ -331,37 +408,170 @@ mod tests {
roundtrip("a+?", "a+?");
roundtrip("(?U)a+", "a+?");
- roundtrip("a{1}", "a{1}");
- roundtrip("a{1,}", "a{1,}");
+ roundtrip("a{1}", "a");
+ roundtrip("a{2}", "a{2}");
+ roundtrip("a{1,}", "a+");
roundtrip("a{1,5}", "a{1,5}");
- roundtrip("a{1}?", "a{1}?");
- roundtrip("a{1,}?", "a{1,}?");
+ roundtrip("a{1}?", "a");
+ roundtrip("a{2}?", "a{2}");
+ roundtrip("a{1,}?", "a+?");
roundtrip("a{1,5}?", "a{1,5}?");
- roundtrip("(?U)a{1}", "a{1}?");
- roundtrip("(?U)a{1,}", "a{1,}?");
+ roundtrip("(?U)a{1}", "a");
+ roundtrip("(?U)a{2}", "a{2}");
+ roundtrip("(?U)a{1,}", "a+?");
roundtrip("(?U)a{1,5}", "a{1,5}?");
+
+ // Test that various zero-length repetitions always translate to an
+ // empty regex. This is more a property of HIR's smart constructors
+ // than the printer though.
+ roundtrip("a{0}", "");
+ roundtrip("(?:ab){0}", "");
+ #[cfg(feature = "unicode-gencat")]
+ {
+ roundtrip(r"\p{any}{0}", "");
+ roundtrip(r"\P{any}{0}", "");
+ }
}
#[test]
fn print_group() {
roundtrip("()", "()");
roundtrip("(?P<foo>)", "(?P<foo>)");
- roundtrip("(?:)", "(?:)");
+ roundtrip("(?:)", "");
roundtrip("(a)", "(a)");
roundtrip("(?P<foo>a)", "(?P<foo>a)");
- roundtrip("(?:a)", "(?:a)");
+ roundtrip("(?:a)", "a");
roundtrip("((((a))))", "((((a))))");
}
#[test]
fn print_alternation() {
- roundtrip("|", "|");
- roundtrip("||", "||");
+ roundtrip("|", "(?:|)");
+ roundtrip("||", "(?:||)");
+
+ roundtrip("a|b", "[ab]");
+ roundtrip("ab|cd", "(?:(?:ab)|(?:cd))");
+ roundtrip("a|b|c", "[a-c]");
+ roundtrip("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))");
+ roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))");
+ }
- roundtrip("a|b", "a|b");
- roundtrip("a|b|c", "a|b|c");
- roundtrip("foo|bar|quux", "foo|bar|quux");
+ // This is a regression test that stresses a peculiarity of how the HIR
+ // is both constructed and printed. Namely, it is legal for a repetition
+ // to directly contain a concatenation. This particular construct isn't
+ // really possible to build from the concrete syntax directly, since you'd
+ // be forced to put the concatenation into (at least) a non-capturing
+ // group. Concurrently, the printer doesn't consider this case and just
+ // kind of naively prints the child expression and tacks on the repetition
+ // operator.
+ //
+ // As a result, if you attached '+' to a 'concat(a, b)', the printer gives
+ // you 'ab+', but clearly it really should be '(?:ab)+'.
+ //
+ // This bug isn't easy to surface because most ways of building an HIR
+ // come directly from the concrete syntax, and as mentioned above, it just
+ // isn't possible to build this kind of HIR from the concrete syntax.
+ // Nevertheless, this is definitely a bug.
+ //
+ // See: https://github.com/rust-lang/regex/issues/731
+ #[test]
+ fn regression_repetition_concat() {
+ let expr = Hir::concat(alloc::vec![
+ Hir::literal("x".as_bytes()),
+ Hir::repetition(hir::Repetition {
+ min: 1,
+ max: None,
+ greedy: true,
+ sub: Box::new(Hir::literal("ab".as_bytes())),
+ }),
+ Hir::literal("y".as_bytes()),
+ ]);
+ assert_eq!(r"(?:x(?:ab)+y)", expr.to_string());
+
+ let expr = Hir::concat(alloc::vec![
+ Hir::look(hir::Look::Start),
+ Hir::repetition(hir::Repetition {
+ min: 1,
+ max: None,
+ greedy: true,
+ sub: Box::new(Hir::concat(alloc::vec![
+ Hir::look(hir::Look::Start),
+ Hir::look(hir::Look::End),
+ ])),
+ }),
+ Hir::look(hir::Look::End),
+ ]);
+ assert_eq!(r"(?:\A(?:\A\z)+\z)", expr.to_string());
+ }
+
+ // Just like regression_repetition_concat, but with the repetition using
+ // an alternation as a child expression instead.
+ //
+ // See: https://github.com/rust-lang/regex/issues/731
+ #[test]
+ fn regression_repetition_alternation() {
+ let expr = Hir::concat(alloc::vec![
+ Hir::literal("ab".as_bytes()),
+ Hir::repetition(hir::Repetition {
+ min: 1,
+ max: None,
+ greedy: true,
+ sub: Box::new(Hir::alternation(alloc::vec![
+ Hir::literal("cd".as_bytes()),
+ Hir::literal("ef".as_bytes()),
+ ])),
+ }),
+ Hir::literal("gh".as_bytes()),
+ ]);
+ assert_eq!(r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))", expr.to_string());
+
+ let expr = Hir::concat(alloc::vec![
+ Hir::look(hir::Look::Start),
+ Hir::repetition(hir::Repetition {
+ min: 1,
+ max: None,
+ greedy: true,
+ sub: Box::new(Hir::alternation(alloc::vec![
+ Hir::look(hir::Look::Start),
+ Hir::look(hir::Look::End),
+ ])),
+ }),
+ Hir::look(hir::Look::End),
+ ]);
+ assert_eq!(r"(?:\A(?:\A|\z)+\z)", expr.to_string());
+ }
+
+ // This regression test is very similar in flavor to
+ // regression_repetition_concat in that the root of the issue lies in a
+ // peculiarity of how the HIR is represented and how the printer writes it
+ // out. Like the other regression, this one is also rooted in the fact that
+ // you can't produce the peculiar HIR from the concrete syntax. Namely, you
+ // just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally
+ // be in (at least) a non-capturing group. Why? Because the '|' has very
+ // low precedence (lower than concatenation), and so something like 'ab|c'
+ // is actually 'alt(ab, c)'.
+ //
+ // See: https://github.com/rust-lang/regex/issues/516
+ #[test]
+ fn regression_alternation_concat() {
+ let expr = Hir::concat(alloc::vec![
+ Hir::literal("ab".as_bytes()),
+ Hir::alternation(alloc::vec![
+ Hir::literal("mn".as_bytes()),
+ Hir::literal("xy".as_bytes()),
+ ]),
+ ]);
+ assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string());
+
+ let expr = Hir::concat(alloc::vec![
+ Hir::look(hir::Look::Start),
+ Hir::alternation(alloc::vec![
+ Hir::look(hir::Look::Start),
+ Hir::look(hir::Look::End),
+ ]),
+ ]);
+ assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string());
}
}
diff --git a/vendor/regex-syntax/src/hir/translate.rs b/vendor/regex-syntax/src/hir/translate.rs
index 890e1608b..ff9c5ee91 100644
--- a/vendor/regex-syntax/src/hir/translate.rs
+++ b/vendor/regex-syntax/src/hir/translate.rs
@@ -2,19 +2,23 @@
Defines a translator that converts an `Ast` to an `Hir`.
*/
-use std::cell::{Cell, RefCell};
-use std::result;
+use core::cell::{Cell, RefCell};
-use crate::ast::{self, Ast, Span, Visitor};
-use crate::hir::{self, Error, ErrorKind, Hir};
-use crate::unicode::{self, ClassQuery};
+use alloc::{boxed::Box, string::ToString, vec, vec::Vec};
-type Result<T> = result::Result<T, Error>;
+use crate::{
+ ast::{self, Ast, Span, Visitor},
+ either::Either,
+ hir::{self, Error, ErrorKind, Hir, HirKind},
+ unicode::{self, ClassQuery},
+};
+
+type Result<T> = core::result::Result<T, Error>;
/// A builder for constructing an AST->HIR translator.
#[derive(Clone, Debug)]
pub struct TranslatorBuilder {
- allow_invalid_utf8: bool,
+ utf8: bool,
flags: Flags,
}
@@ -27,10 +31,7 @@ impl Default for TranslatorBuilder {
impl TranslatorBuilder {
 /// Create a new translator builder with a default configuration.
pub fn new() -> TranslatorBuilder {
- TranslatorBuilder {
- allow_invalid_utf8: false,
- flags: Flags::default(),
- }
+ TranslatorBuilder { utf8: true, flags: Flags::default() }
}
/// Build a translator using the current configuration.
@@ -38,23 +39,27 @@ impl TranslatorBuilder {
Translator {
stack: RefCell::new(vec![]),
flags: Cell::new(self.flags),
- allow_invalid_utf8: self.allow_invalid_utf8,
+ utf8: self.utf8,
}
}
- /// When enabled, translation will permit the construction of a regular
+ /// When disabled, translation will permit the construction of a regular
/// expression that may match invalid UTF-8.
///
- /// When disabled (the default), the translator is guaranteed to produce
- /// an expression that will only ever match valid UTF-8 (otherwise, the
- /// translator will return an error).
+ /// When enabled (the default), the translator is guaranteed to produce an
+ /// expression that, for non-empty matches, will only ever produce spans
+ /// that are entirely valid UTF-8 (otherwise, the translator will return an
+ /// error).
///
- /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII
- /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause
- /// the parser to return an error. Namely, a negated ASCII word boundary
- /// can result in matching positions that aren't valid UTF-8 boundaries.
- pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
- self.allow_invalid_utf8 = yes;
+ /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
+ /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
+ /// syntax) will be allowed even though they can produce matches that split
+ /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
+ /// matches, and it is expected that the regex engine itself must handle
+ /// these cases if necessary (perhaps by suppressing any zero-width matches
+ /// that split a codepoint).
+ pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
+ self.utf8 = yes;
self
}
@@ -80,6 +85,12 @@ impl TranslatorBuilder {
self
}
+ /// Enable or disable the CRLF mode flag (`R`) by default.
+ pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder {
+ self.flags.crlf = if yes { Some(true) } else { None };
+ self
+ }
+
/// Enable or disable the "swap greed" flag (`U`) by default.
pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder {
self.flags.swap_greed = if yes { Some(true) } else { None };
@@ -100,7 +111,7 @@ impl TranslatorBuilder {
/// many abstract syntax trees.
///
/// A `Translator` can be configured in more detail via a
-/// [`TranslatorBuilder`](struct.TranslatorBuilder.html).
+/// [`TranslatorBuilder`].
#[derive(Clone, Debug)]
pub struct Translator {
/// Our call stack, but on the heap.
@@ -108,7 +119,7 @@ pub struct Translator {
/// The current flag settings.
flags: Cell<Flags>,
/// Whether the HIR we produce is required to match only valid UTF-8. When this is false, HIR that can match arbitrary bytes is permitted.
- allow_invalid_utf8: bool,
+ utf8: bool,
}
impl Translator {
@@ -143,6 +154,12 @@ enum HirFrame {
/// case in the Ast. They get popped after an inductive (i.e., recursive)
/// step is complete.
Expr(Hir),
+ /// A literal that is being constructed, character by character, from the
+ /// AST. We need this because the AST gives each individual character its
+ /// own node. So as we see characters, we peek at the top-most HirFrame.
+ /// If it's a literal, then we add to it. Otherwise, we push a new literal.
+ /// When it comes time to pop it, we convert it to an Hir via Hir::literal.
+ Literal(Vec<u8>),
/// A Unicode character class. This frame is mutated as we descend into
/// the Ast of a character class (which is itself its own mini recursive
/// structure).
@@ -152,10 +169,17 @@ enum HirFrame {
/// recursive structure).
///
/// Byte character classes are created when Unicode mode (`u`) is disabled.
- /// If `allow_invalid_utf8` is disabled (the default), then a byte
- /// character is only permitted to match ASCII text.
+ /// If `utf8` is enabled (the default), then a byte character is only
+ /// permitted to match ASCII text.
ClassBytes(hir::ClassBytes),
- /// This is pushed on to the stack upon first seeing any kind of group,
+ /// This is pushed whenever a repetition is observed. After visiting every
+ /// sub-expression in the repetition, the translator's stack is expected to
+ /// have this sentinel at the top.
+ ///
+ /// This sentinel only exists to stop other things (like flattening
+ /// literals) from reaching across repetition operators.
+ Repetition,
+ /// This is pushed on to the stack upon first seeing any kind of capture,
/// indicated by parentheses (including non-capturing groups). It is popped
/// upon leaving a group.
Group {
@@ -181,6 +205,14 @@ enum HirFrame {
/// every sub-expression in the alternation, the translator's stack is
/// popped until it sees an Alternation frame.
Alternation,
+ /// This is pushed immediately before each sub-expression in an
+ /// alternation. This separates the branches of an alternation on the
+ /// stack and prevents literal flattening from reaching across alternation
+ /// branches.
+ ///
+ /// It is popped after each expression in a branch until an 'Alternation'
+ /// frame is observed when doing a post visit on an alternation.
+ AlternationBranch,
}
impl HirFrame {
@@ -188,6 +220,7 @@ impl HirFrame {
fn unwrap_expr(self) -> Hir {
match self {
HirFrame::Expr(expr) => expr,
+ HirFrame::Literal(lit) => Hir::literal(lit),
_ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self),
}
}
@@ -218,6 +251,20 @@ impl HirFrame {
}
}
+ /// Assert that the current stack frame is a repetition sentinel. If it
+ /// isn't, then panic.
+ fn unwrap_repetition(self) {
+ match self {
+ HirFrame::Repetition => {}
+ _ => {
+ panic!(
+ "tried to unwrap repetition from HirFrame, got: {:?}",
+ self
+ )
+ }
+ }
+ }
+
/// Assert that the current stack frame is a group indicator and return
/// its corresponding flags (the flags that were active at the time the
/// group was entered).
@@ -229,6 +276,20 @@ impl HirFrame {
}
}
}
+
+ /// Assert that the current stack frame is an alternation pipe sentinel. If
+ /// it isn't, then panic.
+ fn unwrap_alternation_pipe(self) {
+ match self {
+ HirFrame::AlternationBranch => {}
+ _ => {
+ panic!(
+ "tried to unwrap alt pipe from HirFrame, got: {:?}",
+ self
+ )
+ }
+ }
+ }
}
impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
@@ -252,6 +313,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
self.push(HirFrame::ClassBytes(cls));
}
}
+ Ast::Repetition(_) => self.push(HirFrame::Repetition),
Ast::Group(ref x) => {
let old_flags = x
.flags()
@@ -266,6 +328,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
Ast::Alternation(ref x) if x.asts.is_empty() => {}
Ast::Alternation(_) => {
self.push(HirFrame::Alternation);
+ self.push(HirFrame::AlternationBranch);
}
_ => {}
}
@@ -291,7 +354,20 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
self.push(HirFrame::Expr(Hir::empty()));
}
Ast::Literal(ref x) => {
- self.push(HirFrame::Expr(self.hir_literal(x)?));
+ match self.ast_literal_to_scalar(x)? {
+ Either::Right(byte) => self.push_byte(byte),
+ Either::Left(ch) => {
+ if !self.flags().unicode() && ch.len_utf8() > 1 {
+ return Err(self
+ .error(x.span, ErrorKind::UnicodeNotAllowed));
+ }
+ match self.case_fold_char(x.span, ch)? {
+ None => self.push_char(ch),
+ Some(expr) => self.push(HirFrame::Expr(expr)),
+ }
+ }
+ }
+ // self.push(HirFrame::Expr(self.hir_literal(x)?));
}
Ast::Dot(span) => {
self.push(HirFrame::Expr(self.hir_dot(span)?));
@@ -305,7 +381,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
let hcls = hir::Class::Unicode(cls);
self.push(HirFrame::Expr(Hir::class(hcls)));
} else {
- let cls = self.hir_perl_byte_class(x);
+ let cls = self.hir_perl_byte_class(x)?;
let hcls = hir::Class::Bytes(cls);
self.push(HirFrame::Expr(Hir::class(hcls)));
}
@@ -322,12 +398,6 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
ast.negated,
&mut cls,
)?;
- if cls.ranges().is_empty() {
- return Err(self.error(
- ast.span,
- ErrorKind::EmptyClassNotAllowed,
- ));
- }
let expr = Hir::class(hir::Class::Unicode(cls));
self.push(HirFrame::Expr(expr));
} else {
@@ -337,31 +407,25 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
ast.negated,
&mut cls,
)?;
- if cls.ranges().is_empty() {
- return Err(self.error(
- ast.span,
- ErrorKind::EmptyClassNotAllowed,
- ));
- }
-
let expr = Hir::class(hir::Class::Bytes(cls));
self.push(HirFrame::Expr(expr));
}
}
Ast::Repetition(ref x) => {
let expr = self.pop().unwrap().unwrap_expr();
+ self.pop().unwrap().unwrap_repetition();
self.push(HirFrame::Expr(self.hir_repetition(x, expr)));
}
Ast::Group(ref x) => {
let expr = self.pop().unwrap().unwrap_expr();
let old_flags = self.pop().unwrap().unwrap_group();
self.trans().flags.set(old_flags);
- self.push(HirFrame::Expr(self.hir_group(x, expr)));
+ self.push(HirFrame::Expr(self.hir_capture(x, expr)));
}
Ast::Concat(_) => {
let mut exprs = vec![];
- while let Some(HirFrame::Expr(expr)) = self.pop() {
- if !expr.kind().is_empty() {
+ while let Some(expr) = self.pop_concat_expr() {
+ if !matches!(*expr.kind(), HirKind::Empty) {
exprs.push(expr);
}
}
@@ -370,7 +434,8 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
}
Ast::Alternation(_) => {
let mut exprs = vec![];
- while let Some(HirFrame::Expr(expr)) = self.pop() {
+ while let Some(expr) = self.pop_alt_expr() {
+ self.pop().unwrap().unwrap_alternation_pipe();
exprs.push(expr);
}
exprs.reverse();
@@ -380,6 +445,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
Ok(())
}
+ fn visit_alternation_in(&mut self) -> Result<()> {
+ self.push(HirFrame::AlternationBranch);
+ Ok(())
+ }
+
fn visit_class_set_item_pre(
&mut self,
ast: &ast::ClassSetItem,
@@ -458,7 +528,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
cls.union(&xcls);
self.push(HirFrame::ClassUnicode(cls));
} else {
- let xcls = self.hir_perl_byte_class(x);
+ let xcls = self.hir_perl_byte_class(x)?;
let mut cls = self.pop().unwrap().unwrap_class_bytes();
cls.union(&xcls);
self.push(HirFrame::ClassBytes(cls));
@@ -602,11 +672,103 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
self.trans().stack.borrow_mut().push(frame);
}
+ /// Push the given literal char on to the call stack.
+ ///
+ /// If the top-most element of the stack is a literal, then the char
+ /// is appended to the end of that literal. Otherwise, a new literal
+ /// containing just the given char is pushed to the top of the stack.
+ fn push_char(&self, ch: char) {
+ let mut buf = [0; 4];
+ let bytes = ch.encode_utf8(&mut buf).as_bytes();
+ let mut stack = self.trans().stack.borrow_mut();
+ if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() {
+ literal.extend_from_slice(bytes);
+ } else {
+ stack.push(HirFrame::Literal(bytes.to_vec()));
+ }
+ }
+
+ /// Push the given literal byte on to the call stack.
+ ///
+ /// If the top-most element of the stack is a literal, then the byte
+ /// is appended to the end of that literal. Otherwise, a new literal
+ /// containing just the given byte is pushed to the top of the stack.
+ fn push_byte(&self, byte: u8) {
+ let mut stack = self.trans().stack.borrow_mut();
+ if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() {
+ literal.push(byte);
+ } else {
+ stack.push(HirFrame::Literal(vec![byte]));
+ }
+ }
+
/// Pop the top of the call stack. If the call stack is empty, return None.
fn pop(&self) -> Option<HirFrame> {
self.trans().stack.borrow_mut().pop()
}
+ /// Pop an HIR expression from the top of the stack for a concatenation.
+ ///
+ /// This returns None if the stack is empty or when a concat frame is seen.
+ /// Otherwise, it panics if it could not find an HIR expression.
+ fn pop_concat_expr(&self) -> Option<Hir> {
+ let frame = self.pop()?;
+ match frame {
+ HirFrame::Concat => None,
+ HirFrame::Expr(expr) => Some(expr),
+ HirFrame::Literal(lit) => Some(Hir::literal(lit)),
+ HirFrame::ClassUnicode(_) => {
+ unreachable!("expected expr or concat, got Unicode class")
+ }
+ HirFrame::ClassBytes(_) => {
+ unreachable!("expected expr or concat, got byte class")
+ }
+ HirFrame::Repetition => {
+ unreachable!("expected expr or concat, got repetition")
+ }
+ HirFrame::Group { .. } => {
+ unreachable!("expected expr or concat, got group")
+ }
+ HirFrame::Alternation => {
+ unreachable!("expected expr or concat, got alt marker")
+ }
+ HirFrame::AlternationBranch => {
+ unreachable!("expected expr or concat, got alt branch marker")
+ }
+ }
+ }
+
+ /// Pop an HIR expression from the top of the stack for an alternation.
+ ///
+ /// This returns None if the stack is empty or when an alternation frame is
+ /// seen. Otherwise, it panics if it could not find an HIR expression.
+ fn pop_alt_expr(&self) -> Option<Hir> {
+ let frame = self.pop()?;
+ match frame {
+ HirFrame::Alternation => None,
+ HirFrame::Expr(expr) => Some(expr),
+ HirFrame::Literal(lit) => Some(Hir::literal(lit)),
+ HirFrame::ClassUnicode(_) => {
+ unreachable!("expected expr or alt, got Unicode class")
+ }
+ HirFrame::ClassBytes(_) => {
+ unreachable!("expected expr or alt, got byte class")
+ }
+ HirFrame::Repetition => {
+ unreachable!("expected expr or alt, got repetition")
+ }
+ HirFrame::Group { .. } => {
+ unreachable!("expected expr or alt, got group")
+ }
+ HirFrame::Concat => {
+ unreachable!("expected expr or alt, got concat marker")
+ }
+ HirFrame::AlternationBranch => {
+ unreachable!("expected expr or alt, got alt branch marker")
+ }
+ }
+ }
+
/// Create a new error with the given span and error type.
fn error(&self, span: Span, kind: ErrorKind) -> Error {
Error { kind, pattern: self.pattern.to_string(), span }
@@ -627,63 +789,48 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
old_flags
}
- fn hir_literal(&self, lit: &ast::Literal) -> Result<Hir> {
- let ch = match self.literal_to_char(lit)? {
- byte @ hir::Literal::Byte(_) => return Ok(Hir::literal(byte)),
- hir::Literal::Unicode(ch) => ch,
- };
- if self.flags().case_insensitive() {
- self.hir_from_char_case_insensitive(lit.span, ch)
- } else {
- self.hir_from_char(lit.span, ch)
- }
- }
-
/// Convert an Ast literal to its scalar representation.
///
/// When Unicode mode is enabled, then this always succeeds and returns a
/// `char` (Unicode scalar value).
///
- /// When Unicode mode is disabled, then a raw byte is returned. If that
- /// byte is not ASCII and invalid UTF-8 is not allowed, then this returns
- /// an error.
- fn literal_to_char(&self, lit: &ast::Literal) -> Result<hir::Literal> {
+ /// When Unicode mode is disabled, then a `char` will still be returned
+ /// whenever possible. A byte is returned only when invalid UTF-8 is
+ /// allowed and when the byte is not ASCII. Otherwise, a non-ASCII byte
+ /// will result in an error when invalid UTF-8 is not allowed.
+ fn ast_literal_to_scalar(
+ &self,
+ lit: &ast::Literal,
+ ) -> Result<Either<char, u8>> {
if self.flags().unicode() {
- return Ok(hir::Literal::Unicode(lit.c));
+ return Ok(Either::Left(lit.c));
}
let byte = match lit.byte() {
- None => return Ok(hir::Literal::Unicode(lit.c)),
+ None => return Ok(Either::Left(lit.c)),
Some(byte) => byte,
};
if byte <= 0x7F {
- return Ok(hir::Literal::Unicode(byte as char));
+ return Ok(Either::Left(char::try_from(byte).unwrap()));
}
- if !self.trans().allow_invalid_utf8 {
+ if self.trans().utf8 {
return Err(self.error(lit.span, ErrorKind::InvalidUtf8));
}
- Ok(hir::Literal::Byte(byte))
+ Ok(Either::Right(byte))
}
- fn hir_from_char(&self, span: Span, c: char) -> Result<Hir> {
- if !self.flags().unicode() && c.len_utf8() > 1 {
- return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
+ fn case_fold_char(&self, span: Span, c: char) -> Result<Option<Hir>> {
+ if !self.flags().case_insensitive() {
+ return Ok(None);
}
- Ok(Hir::literal(hir::Literal::Unicode(c)))
- }
-
- fn hir_from_char_case_insensitive(
- &self,
- span: Span,
- c: char,
- ) -> Result<Hir> {
if self.flags().unicode() {
// If case folding won't do anything, then don't bother trying.
- let map =
- unicode::contains_simple_case_mapping(c, c).map_err(|_| {
+ let map = unicode::SimpleCaseFolder::new()
+ .map(|f| f.overlaps(c, c))
+ .map_err(|_| {
self.error(span, ErrorKind::UnicodeCaseUnavailable)
})?;
if !map {
- return self.hir_from_char(span, c);
+ return Ok(None);
}
let mut cls =
hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new(
@@ -692,7 +839,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
cls.try_case_fold_simple().map_err(|_| {
self.error(span, ErrorKind::UnicodeCaseUnavailable)
})?;
- Ok(Hir::class(hir::Class::Unicode(cls)))
+ Ok(Some(Hir::class(hir::Class::Unicode(cls))))
} else {
if c.len_utf8() > 1 {
return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
@@ -700,109 +847,102 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
// If case folding won't do anything, then don't bother trying.
match c {
'A'..='Z' | 'a'..='z' => {}
- _ => return self.hir_from_char(span, c),
+ _ => return Ok(None),
}
let mut cls =
hir::ClassBytes::new(vec![hir::ClassBytesRange::new(
- c as u8, c as u8,
+ // OK because 'c.len_utf8() == 1' which in turn implies
+ // that 'c' is ASCII.
+ u8::try_from(c).unwrap(),
+ u8::try_from(c).unwrap(),
)]);
cls.case_fold_simple();
- Ok(Hir::class(hir::Class::Bytes(cls)))
+ Ok(Some(Hir::class(hir::Class::Bytes(cls))))
}
}
fn hir_dot(&self, span: Span) -> Result<Hir> {
- let unicode = self.flags().unicode();
- if !unicode && !self.trans().allow_invalid_utf8 {
+ if !self.flags().unicode() && self.trans().utf8 {
return Err(self.error(span, ErrorKind::InvalidUtf8));
}
- Ok(if self.flags().dot_matches_new_line() {
- Hir::any(!unicode)
- } else {
- Hir::dot(!unicode)
- })
+ Ok(Hir::dot(self.flags().dot()))
}
fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
let unicode = self.flags().unicode();
let multi_line = self.flags().multi_line();
+ let crlf = self.flags().crlf();
Ok(match asst.kind {
- ast::AssertionKind::StartLine => Hir::anchor(if multi_line {
- hir::Anchor::StartLine
+ ast::AssertionKind::StartLine => Hir::look(if multi_line {
+ if crlf {
+ hir::Look::StartCRLF
+ } else {
+ hir::Look::StartLF
+ }
} else {
- hir::Anchor::StartText
+ hir::Look::Start
}),
- ast::AssertionKind::EndLine => Hir::anchor(if multi_line {
- hir::Anchor::EndLine
+ ast::AssertionKind::EndLine => Hir::look(if multi_line {
+ if crlf {
+ hir::Look::EndCRLF
+ } else {
+ hir::Look::EndLF
+ }
} else {
- hir::Anchor::EndText
+ hir::Look::End
+ }),
+ ast::AssertionKind::StartText => Hir::look(hir::Look::Start),
+ ast::AssertionKind::EndText => Hir::look(hir::Look::End),
+ ast::AssertionKind::WordBoundary => Hir::look(if unicode {
+ hir::Look::WordUnicode
+ } else {
+ hir::Look::WordAscii
+ }),
+ ast::AssertionKind::NotWordBoundary => Hir::look(if unicode {
+ hir::Look::WordUnicodeNegate
+ } else {
+ hir::Look::WordAsciiNegate
}),
- ast::AssertionKind::StartText => {
- Hir::anchor(hir::Anchor::StartText)
- }
- ast::AssertionKind::EndText => Hir::anchor(hir::Anchor::EndText),
- ast::AssertionKind::WordBoundary => {
- Hir::word_boundary(if unicode {
- hir::WordBoundary::Unicode
- } else {
- hir::WordBoundary::Ascii
- })
- }
- ast::AssertionKind::NotWordBoundary => {
- Hir::word_boundary(if unicode {
- hir::WordBoundary::UnicodeNegate
- } else {
- // It is possible for negated ASCII word boundaries to
- // match at invalid UTF-8 boundaries, even when searching
- // valid UTF-8.
- if !self.trans().allow_invalid_utf8 {
- return Err(
- self.error(asst.span, ErrorKind::InvalidUtf8)
- );
- }
- hir::WordBoundary::AsciiNegate
- })
- }
})
}
- fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir {
- let kind = match group.kind {
- ast::GroupKind::CaptureIndex(idx) => {
- hir::GroupKind::CaptureIndex(idx)
+ fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir {
+ let (index, name) = match group.kind {
+ ast::GroupKind::CaptureIndex(index) => (index, None),
+ ast::GroupKind::CaptureName { ref name, .. } => {
+ (name.index, Some(name.name.clone().into_boxed_str()))
}
- ast::GroupKind::CaptureName(ref capname) => {
- hir::GroupKind::CaptureName {
- name: capname.name.clone(),
- index: capname.index,
- }
- }
- ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing,
+ // The HIR doesn't need to use non-capturing groups, since the way
+ // in which the data type is defined handles this automatically.
+ ast::GroupKind::NonCapturing(_) => return expr,
};
- Hir::group(hir::Group { kind, hir: Box::new(expr) })
+ Hir::capture(hir::Capture { index, name, sub: Box::new(expr) })
}
fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
- let kind = match rep.op.kind {
- ast::RepetitionKind::ZeroOrOne => hir::RepetitionKind::ZeroOrOne,
- ast::RepetitionKind::ZeroOrMore => hir::RepetitionKind::ZeroOrMore,
- ast::RepetitionKind::OneOrMore => hir::RepetitionKind::OneOrMore,
+ let (min, max) = match rep.op.kind {
+ ast::RepetitionKind::ZeroOrOne => (0, Some(1)),
+ ast::RepetitionKind::ZeroOrMore => (0, None),
+ ast::RepetitionKind::OneOrMore => (1, None),
ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => {
- hir::RepetitionKind::Range(hir::RepetitionRange::Exactly(m))
+ (m, Some(m))
}
ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => {
- hir::RepetitionKind::Range(hir::RepetitionRange::AtLeast(m))
+ (m, None)
}
ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(
m,
n,
- )) => {
- hir::RepetitionKind::Range(hir::RepetitionRange::Bounded(m, n))
- }
+ )) => (m, Some(n)),
};
let greedy =
if self.flags().swap_greed() { !rep.greedy } else { rep.greedy };
- Hir::repetition(hir::Repetition { kind, greedy, hir: Box::new(expr) })
+ Hir::repetition(hir::Repetition {
+ min,
+ max,
+ greedy,
+ sub: Box::new(expr),
+ })
}
fn hir_unicode_class(
@@ -834,11 +974,6 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
ast_class.negated,
class,
)?;
- if class.ranges().is_empty() {
- let err = self
- .error(ast_class.span, ErrorKind::EmptyClassNotAllowed);
- return Err(err);
- }
}
result
}
@@ -848,9 +983,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
ast: &ast::ClassAscii,
) -> Result<hir::ClassUnicode> {
let mut cls = hir::ClassUnicode::new(
- ascii_class(&ast.kind)
- .iter()
- .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)),
+ ascii_class_as_chars(&ast.kind)
+ .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)),
);
self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
Ok(cls)
@@ -862,8 +996,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
) -> Result<hir::ClassBytes> {
let mut cls = hir::ClassBytes::new(
ascii_class(&ast.kind)
- .iter()
- .map(|&(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)),
+ .map(|(s, e)| hir::ClassBytesRange::new(s, e)),
);
self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
Ok(cls)
@@ -894,7 +1027,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
fn hir_perl_byte_class(
&self,
ast_class: &ast::ClassPerl,
- ) -> hir::ClassBytes {
+ ) -> Result<hir::ClassBytes> {
use crate::ast::ClassPerlKind::*;
assert!(!self.flags().unicode());
@@ -908,7 +1041,13 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
if ast_class.negated {
class.negate();
}
- class
+ // Negating a Perl byte class is likely to cause it to match invalid
+ // UTF-8. That's only OK if the translator is configured to allow such
+ // things.
+ if self.trans().utf8 && !class.is_ascii() {
+ return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8));
+ }
+ Ok(class)
}
/// Converts the given Unicode specific error to an HIR translation error.
@@ -918,7 +1057,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
fn convert_unicode_class_error(
&self,
span: &Span,
- result: unicode::Result<hir::ClassUnicode>,
+ result: core::result::Result<hir::ClassUnicode, unicode::Error>,
) -> Result<hir::ClassUnicode> {
result.map_err(|err| {
let sp = span.clone();
@@ -943,7 +1082,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
class: &mut hir::ClassUnicode,
) -> Result<()> {
// Note that we must apply case folding before negation!
- // Consider `(?i)[^x]`. If we applied negation field, then
+ // Consider `(?i)[^x]`. If we applied negation first, then
// the result would be the character class that matched any
// Unicode scalar value.
if self.flags().case_insensitive() {
@@ -973,7 +1112,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
if negated {
class.negate();
}
- if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() {
+ if self.trans().utf8 && !class.is_ascii() {
return Err(self.error(span.clone(), ErrorKind::InvalidUtf8));
}
Ok(())
@@ -982,11 +1121,12 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
/// Return a scalar byte value suitable for use as a literal in a byte
/// character class.
fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> {
- match self.literal_to_char(ast)? {
- hir::Literal::Byte(byte) => Ok(byte),
- hir::Literal::Unicode(ch) => {
- if ch <= 0x7F as char {
- Ok(ch as u8)
+ match self.ast_literal_to_scalar(ast)? {
+ Either::Right(byte) => Ok(byte),
+ Either::Left(ch) => {
+ let cp = u32::from(ch);
+ if cp <= 0x7F {
+ Ok(u8::try_from(cp).unwrap())
} else {
// We can't feasibly support Unicode in
// byte oriented classes. Byte classes don't
@@ -1010,6 +1150,7 @@ struct Flags {
dot_matches_new_line: Option<bool>,
swap_greed: Option<bool>,
unicode: Option<bool>,
+ crlf: Option<bool>,
// Note that `ignore_whitespace` is omitted here because it is handled
// entirely in the parser.
}
@@ -1038,6 +1179,9 @@ impl Flags {
ast::FlagsItemKind::Flag(ast::Flag::Unicode) => {
flags.unicode = Some(enable);
}
+ ast::FlagsItemKind::Flag(ast::Flag::CRLF) => {
+ flags.crlf = Some(enable);
+ }
ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {}
}
}
@@ -1060,6 +1204,33 @@ impl Flags {
if self.unicode.is_none() {
self.unicode = previous.unicode;
}
+ if self.crlf.is_none() {
+ self.crlf = previous.crlf;
+ }
+ }
+
+ fn dot(&self) -> hir::Dot {
+ if self.dot_matches_new_line() {
+ if self.unicode() {
+ hir::Dot::AnyChar
+ } else {
+ hir::Dot::AnyByte
+ }
+ } else {
+ if self.unicode() {
+ if self.crlf() {
+ hir::Dot::AnyCharExceptCRLF
+ } else {
+ hir::Dot::AnyCharExceptLF
+ }
+ } else {
+ if self.crlf() {
+ hir::Dot::AnyByteExceptCRLF
+ } else {
+ hir::Dot::AnyByteExceptLF
+ }
+ }
+ }
}
fn case_insensitive(&self) -> bool {
@@ -1081,52 +1252,63 @@ impl Flags {
fn unicode(&self) -> bool {
self.unicode.unwrap_or(true)
}
+
+ fn crlf(&self) -> bool {
+ self.crlf.unwrap_or(false)
+ }
}
fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
let ranges: Vec<_> = ascii_class(kind)
- .iter()
- .cloned()
- .map(|(s, e)| hir::ClassBytesRange::new(s as u8, e as u8))
+ .map(|(s, e)| hir::ClassBytesRange::new(s, e))
.collect();
hir::ClassBytes::new(ranges)
}
-fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] {
+fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator<Item = (u8, u8)> {
use crate::ast::ClassAsciiKind::*;
- match *kind {
- Alnum => &[('0', '9'), ('A', 'Z'), ('a', 'z')],
- Alpha => &[('A', 'Z'), ('a', 'z')],
- Ascii => &[('\x00', '\x7F')],
- Blank => &[('\t', '\t'), (' ', ' ')],
- Cntrl => &[('\x00', '\x1F'), ('\x7F', '\x7F')],
- Digit => &[('0', '9')],
- Graph => &[('!', '~')],
- Lower => &[('a', 'z')],
- Print => &[(' ', '~')],
- Punct => &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')],
+
+ let slice: &'static [(u8, u8)] = match *kind {
+ Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')],
+ Alpha => &[(b'A', b'Z'), (b'a', b'z')],
+ Ascii => &[(b'\x00', b'\x7F')],
+ Blank => &[(b'\t', b'\t'), (b' ', b' ')],
+ Cntrl => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')],
+ Digit => &[(b'0', b'9')],
+ Graph => &[(b'!', b'~')],
+ Lower => &[(b'a', b'z')],
+ Print => &[(b' ', b'~')],
+ Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')],
Space => &[
- ('\t', '\t'),
- ('\n', '\n'),
- ('\x0B', '\x0B'),
- ('\x0C', '\x0C'),
- ('\r', '\r'),
- (' ', ' '),
+ (b'\t', b'\t'),
+ (b'\n', b'\n'),
+ (b'\x0B', b'\x0B'),
+ (b'\x0C', b'\x0C'),
+ (b'\r', b'\r'),
+ (b' ', b' '),
],
- Upper => &[('A', 'Z')],
- Word => &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')],
- Xdigit => &[('0', '9'), ('A', 'F'), ('a', 'f')],
- }
+ Upper => &[(b'A', b'Z')],
+ Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')],
+ Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')],
+ };
+ slice.iter().copied()
+}
+
+fn ascii_class_as_chars(
+ kind: &ast::ClassAsciiKind,
+) -> impl Iterator<Item = (char, char)> {
+ ascii_class(kind).map(|(s, e)| (char::from(s), char::from(e)))
}
#[cfg(test)]
mod tests {
- use crate::ast::parse::ParserBuilder;
- use crate::ast::{self, Ast, Position, Span};
- use crate::hir::{self, Hir, HirKind};
- use crate::unicode::{self, ClassQuery};
+ use crate::{
+ ast::{self, parse::ParserBuilder, Ast, Position, Span},
+ hir::{self, Hir, HirKind, Look, Properties},
+ unicode::{self, ClassQuery},
+ };
- use super::{ascii_class, TranslatorBuilder};
+ use super::*;
// We create these errors to compare with real hir::Errors in the tests.
// We define equality between TestError and hir::Error to disregard the
@@ -1155,7 +1337,7 @@ mod tests {
fn t(pattern: &str) -> Hir {
TranslatorBuilder::new()
- .allow_invalid_utf8(false)
+ .utf8(true)
.build()
.translate(pattern, &parse(pattern))
.unwrap()
@@ -1163,7 +1345,7 @@ mod tests {
fn t_err(pattern: &str) -> hir::Error {
TranslatorBuilder::new()
- .allow_invalid_utf8(false)
+ .utf8(true)
.build()
.translate(pattern, &parse(pattern))
.unwrap_err()
@@ -1171,95 +1353,73 @@ mod tests {
fn t_bytes(pattern: &str) -> Hir {
TranslatorBuilder::new()
- .allow_invalid_utf8(true)
+ .utf8(false)
.build()
.translate(pattern, &parse(pattern))
.unwrap()
}
- fn hir_lit(s: &str) -> Hir {
- match s.len() {
- 0 => Hir::empty(),
- _ => {
- let lits = s
- .chars()
- .map(hir::Literal::Unicode)
- .map(Hir::literal)
- .collect();
- Hir::concat(lits)
- }
- }
+ fn props(pattern: &str) -> Properties {
+ t(pattern).properties().clone()
}
- fn hir_blit(s: &[u8]) -> Hir {
- match s.len() {
- 0 => Hir::empty(),
- 1 => Hir::literal(hir::Literal::Byte(s[0])),
- _ => {
- let lits = s
- .iter()
- .cloned()
- .map(hir::Literal::Byte)
- .map(Hir::literal)
- .collect();
- Hir::concat(lits)
- }
- }
+ fn props_bytes(pattern: &str) -> Properties {
+ t_bytes(pattern).properties().clone()
}
- fn hir_group(i: u32, expr: Hir) -> Hir {
- Hir::group(hir::Group {
- kind: hir::GroupKind::CaptureIndex(i),
- hir: Box::new(expr),
- })
+ fn hir_lit(s: &str) -> Hir {
+ hir_blit(s.as_bytes())
}
- fn hir_group_name(i: u32, name: &str, expr: Hir) -> Hir {
- Hir::group(hir::Group {
- kind: hir::GroupKind::CaptureName {
- name: name.to_string(),
- index: i,
- },
- hir: Box::new(expr),
- })
+ fn hir_blit(s: &[u8]) -> Hir {
+ Hir::literal(s)
+ }
+
+ fn hir_capture(index: u32, expr: Hir) -> Hir {
+ Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) })
}
- fn hir_group_nocap(expr: Hir) -> Hir {
- Hir::group(hir::Group {
- kind: hir::GroupKind::NonCapturing,
- hir: Box::new(expr),
+ fn hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir {
+ Hir::capture(hir::Capture {
+ index,
+ name: Some(name.into()),
+ sub: Box::new(expr),
})
}
fn hir_quest(greedy: bool, expr: Hir) -> Hir {
Hir::repetition(hir::Repetition {
- kind: hir::RepetitionKind::ZeroOrOne,
+ min: 0,
+ max: Some(1),
greedy,
- hir: Box::new(expr),
+ sub: Box::new(expr),
})
}
fn hir_star(greedy: bool, expr: Hir) -> Hir {
Hir::repetition(hir::Repetition {
- kind: hir::RepetitionKind::ZeroOrMore,
+ min: 0,
+ max: None,
greedy,
- hir: Box::new(expr),
+ sub: Box::new(expr),
})
}
fn hir_plus(greedy: bool, expr: Hir) -> Hir {
Hir::repetition(hir::Repetition {
- kind: hir::RepetitionKind::OneOrMore,
+ min: 1,
+ max: None,
greedy,
- hir: Box::new(expr),
+ sub: Box::new(expr),
})
}
- fn hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir {
+ fn hir_range(greedy: bool, min: u32, max: Option<u32>, expr: Hir) -> Hir {
Hir::repetition(hir::Repetition {
- kind: hir::RepetitionKind::Range(range),
+ min,
+ max,
greedy,
- hir: Box::new(expr),
+ sub: Box::new(expr),
})
}
@@ -1281,32 +1441,25 @@ mod tests {
Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap()))
}
- fn hir_uclass(ranges: &[(char, char)]) -> Hir {
- let ranges: Vec<hir::ClassUnicodeRange> = ranges
- .iter()
- .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
- .collect();
- Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(ranges)))
+ fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir {
+ Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(
+ ascii_class_as_chars(kind)
+ .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)),
+ )))
}
- fn hir_bclass(ranges: &[(u8, u8)]) -> Hir {
- let ranges: Vec<hir::ClassBytesRange> = ranges
- .iter()
- .map(|&(s, e)| hir::ClassBytesRange::new(s, e))
- .collect();
- Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges)))
+ fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir {
+ Hir::class(hir::Class::Bytes(hir::ClassBytes::new(
+ ascii_class(kind).map(|(s, e)| hir::ClassBytesRange::new(s, e)),
+ )))
}
- fn hir_bclass_from_char(ranges: &[(char, char)]) -> Hir {
- let ranges: Vec<hir::ClassBytesRange> = ranges
- .iter()
- .map(|&(s, e)| {
- assert!(s as u32 <= 0x7F);
- assert!(e as u32 <= 0x7F);
- hir::ClassBytesRange::new(s as u8, e as u8)
- })
- .collect();
- Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges)))
+ fn hir_uclass(ranges: &[(char, char)]) -> Hir {
+ Hir::class(uclass(ranges))
+ }
+
+ fn hir_bclass(ranges: &[(u8, u8)]) -> Hir {
+ Hir::class(bclass(ranges))
}
fn hir_case_fold(expr: Hir) -> Hir {
@@ -1329,6 +1482,33 @@ mod tests {
}
}
+ fn uclass(ranges: &[(char, char)]) -> hir::Class {
+ let ranges: Vec<hir::ClassUnicodeRange> = ranges
+ .iter()
+ .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
+ .collect();
+ hir::Class::Unicode(hir::ClassUnicode::new(ranges))
+ }
+
+ fn bclass(ranges: &[(u8, u8)]) -> hir::Class {
+ let ranges: Vec<hir::ClassBytesRange> = ranges
+ .iter()
+ .map(|&(s, e)| hir::ClassBytesRange::new(s, e))
+ .collect();
+ hir::Class::Bytes(hir::ClassBytes::new(ranges))
+ }
+
+ #[cfg(feature = "unicode-case")]
+ fn class_case_fold(mut cls: hir::Class) -> Hir {
+ cls.case_fold_simple();
+ Hir::class(cls)
+ }
+
+ fn class_negate(mut cls: hir::Class) -> Hir {
+ cls.negate();
+ Hir::class(cls)
+ }
+
#[allow(dead_code)]
fn hir_union(expr1: Hir, expr2: Hir) -> Hir {
use crate::hir::Class::{Bytes, Unicode};
@@ -1363,47 +1543,43 @@ mod tests {
}
}
- fn hir_anchor(anchor: hir::Anchor) -> Hir {
- Hir::anchor(anchor)
- }
-
- fn hir_word(wb: hir::WordBoundary) -> Hir {
- Hir::word_boundary(wb)
+ fn hir_look(look: hir::Look) -> Hir {
+ Hir::look(look)
}
#[test]
fn empty() {
assert_eq!(t(""), Hir::empty());
assert_eq!(t("(?i)"), Hir::empty());
- assert_eq!(t("()"), hir_group(1, Hir::empty()));
- assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty()));
- assert_eq!(t("(?P<wat>)"), hir_group_name(1, "wat", Hir::empty()));
+ assert_eq!(t("()"), hir_capture(1, Hir::empty()));
+ assert_eq!(t("(?:)"), Hir::empty());
+ assert_eq!(t("(?P<wat>)"), hir_capture_name(1, "wat", Hir::empty()));
assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()]));
assert_eq!(
t("()|()"),
hir_alt(vec![
- hir_group(1, Hir::empty()),
- hir_group(2, Hir::empty()),
+ hir_capture(1, Hir::empty()),
+ hir_capture(2, Hir::empty()),
])
);
assert_eq!(
t("(|b)"),
- hir_group(1, hir_alt(vec![Hir::empty(), hir_lit("b"),]))
+ hir_capture(1, hir_alt(vec![Hir::empty(), hir_lit("b"),]))
);
assert_eq!(
t("(a|)"),
- hir_group(1, hir_alt(vec![hir_lit("a"), Hir::empty(),]))
+ hir_capture(1, hir_alt(vec![hir_lit("a"), Hir::empty(),]))
);
assert_eq!(
t("(a||c)"),
- hir_group(
+ hir_capture(
1,
hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),])
)
);
assert_eq!(
t("(||)"),
- hir_group(
+ hir_capture(
1,
hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),])
)
@@ -1449,10 +1625,7 @@ mod tests {
#[cfg(feature = "unicode-case")]
assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),]));
#[cfg(feature = "unicode-case")]
- assert_eq!(
- t("(?i:a)"),
- hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')],))
- );
+ assert_eq!(t("(?i:a)"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
#[cfg(feature = "unicode-case")]
assert_eq!(
t("a(?i)a(?-i)a"),
@@ -1528,14 +1701,32 @@ mod tests {
fn dot() {
assert_eq!(
t("."),
- hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}'),])
+ hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')])
+ );
+ assert_eq!(
+ t("(?R)."),
+ hir_uclass(&[
+ ('\0', '\t'),
+ ('\x0B', '\x0C'),
+ ('\x0E', '\u{10FFFF}'),
+ ])
);
- assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}'),]));
+ assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}')]));
+ assert_eq!(t("(?Rs)."), hir_uclass(&[('\0', '\u{10FFFF}')]));
assert_eq!(
t_bytes("(?-u)."),
- hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF'),])
+ hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')])
+ );
+ assert_eq!(
+ t_bytes("(?R-u)."),
+ hir_bclass(&[
+ (b'\0', b'\t'),
+ (b'\x0B', b'\x0C'),
+ (b'\x0E', b'\xFF'),
+ ])
);
assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),]));
+ assert_eq!(t_bytes("(?Rs-u)."), hir_bclass(&[(b'\0', b'\xFF'),]));
// If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
assert_eq!(
@@ -1549,7 +1740,7 @@ mod tests {
}
);
assert_eq!(
- t_err("(?s-u)."),
+ t_err("(?R-u)."),
TestError {
kind: hir::ErrorKind::InvalidUtf8,
span: Span::new(
@@ -1558,94 +1749,123 @@ mod tests {
),
}
);
- }
-
- #[test]
- fn assertions() {
- assert_eq!(t("^"), hir_anchor(hir::Anchor::StartText));
- assert_eq!(t("$"), hir_anchor(hir::Anchor::EndText));
- assert_eq!(t(r"\A"), hir_anchor(hir::Anchor::StartText));
- assert_eq!(t(r"\z"), hir_anchor(hir::Anchor::EndText));
- assert_eq!(t("(?m)^"), hir_anchor(hir::Anchor::StartLine));
- assert_eq!(t("(?m)$"), hir_anchor(hir::Anchor::EndLine));
- assert_eq!(t(r"(?m)\A"), hir_anchor(hir::Anchor::StartText));
- assert_eq!(t(r"(?m)\z"), hir_anchor(hir::Anchor::EndText));
-
- assert_eq!(t(r"\b"), hir_word(hir::WordBoundary::Unicode));
- assert_eq!(t(r"\B"), hir_word(hir::WordBoundary::UnicodeNegate));
- assert_eq!(t(r"(?-u)\b"), hir_word(hir::WordBoundary::Ascii));
assert_eq!(
- t_bytes(r"(?-u)\B"),
- hir_word(hir::WordBoundary::AsciiNegate)
+ t_err("(?s-u)."),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(6, 1, 7),
+ Position::new(7, 1, 8)
+ ),
+ }
);
-
assert_eq!(
- t_err(r"(?-u)\B"),
+ t_err("(?Rs-u)."),
TestError {
kind: hir::ErrorKind::InvalidUtf8,
span: Span::new(
- Position::new(5, 1, 6),
- Position::new(7, 1, 8)
+ Position::new(7, 1, 8),
+ Position::new(8, 1, 9)
),
}
);
}
#[test]
+ fn assertions() {
+ assert_eq!(t("^"), hir_look(hir::Look::Start));
+ assert_eq!(t("$"), hir_look(hir::Look::End));
+ assert_eq!(t(r"\A"), hir_look(hir::Look::Start));
+ assert_eq!(t(r"\z"), hir_look(hir::Look::End));
+ assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF));
+ assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF));
+ assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start));
+ assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End));
+
+ assert_eq!(t(r"\b"), hir_look(hir::Look::WordUnicode));
+ assert_eq!(t(r"\B"), hir_look(hir::Look::WordUnicodeNegate));
+ assert_eq!(t(r"(?-u)\b"), hir_look(hir::Look::WordAscii));
+ assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate));
+ }
+
+ #[test]
fn group() {
- assert_eq!(t("(a)"), hir_group(1, hir_lit("a")));
+ assert_eq!(t("(a)"), hir_capture(1, hir_lit("a")));
assert_eq!(
t("(a)(b)"),
hir_cat(vec![
- hir_group(1, hir_lit("a")),
- hir_group(2, hir_lit("b")),
+ hir_capture(1, hir_lit("a")),
+ hir_capture(2, hir_lit("b")),
])
);
assert_eq!(
t("(a)|(b)"),
hir_alt(vec![
- hir_group(1, hir_lit("a")),
- hir_group(2, hir_lit("b")),
+ hir_capture(1, hir_lit("a")),
+ hir_capture(2, hir_lit("b")),
])
);
- assert_eq!(t("(?P<foo>)"), hir_group_name(1, "foo", Hir::empty()));
- assert_eq!(t("(?P<foo>a)"), hir_group_name(1, "foo", hir_lit("a")));
+ assert_eq!(t("(?P<foo>)"), hir_capture_name(1, "foo", Hir::empty()));
+ assert_eq!(t("(?P<foo>a)"), hir_capture_name(1, "foo", hir_lit("a")));
assert_eq!(
t("(?P<foo>a)(?P<bar>b)"),
hir_cat(vec![
- hir_group_name(1, "foo", hir_lit("a")),
- hir_group_name(2, "bar", hir_lit("b")),
+ hir_capture_name(1, "foo", hir_lit("a")),
+ hir_capture_name(2, "bar", hir_lit("b")),
])
);
- assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty()));
- assert_eq!(t("(?:a)"), hir_group_nocap(hir_lit("a")));
+ assert_eq!(t("(?:)"), Hir::empty());
+ assert_eq!(t("(?:a)"), hir_lit("a"));
assert_eq!(
t("(?:a)(b)"),
- hir_cat(vec![
- hir_group_nocap(hir_lit("a")),
- hir_group(1, hir_lit("b")),
- ])
+ hir_cat(vec![hir_lit("a"), hir_capture(1, hir_lit("b")),])
);
assert_eq!(
t("(a)(?:b)(c)"),
hir_cat(vec![
- hir_group(1, hir_lit("a")),
- hir_group_nocap(hir_lit("b")),
- hir_group(2, hir_lit("c")),
+ hir_capture(1, hir_lit("a")),
+ hir_lit("b"),
+ hir_capture(2, hir_lit("c")),
])
);
assert_eq!(
t("(a)(?P<foo>b)(c)"),
hir_cat(vec![
- hir_group(1, hir_lit("a")),
- hir_group_name(2, "foo", hir_lit("b")),
- hir_group(3, hir_lit("c")),
+ hir_capture(1, hir_lit("a")),
+ hir_capture_name(2, "foo", hir_lit("b")),
+ hir_capture(3, hir_lit("c")),
])
);
- assert_eq!(t("()"), hir_group(1, Hir::empty()));
- assert_eq!(t("((?i))"), hir_group(1, Hir::empty()));
- assert_eq!(t("((?x))"), hir_group(1, Hir::empty()));
- assert_eq!(t("(((?x)))"), hir_group(1, hir_group(2, Hir::empty())));
+ assert_eq!(t("()"), hir_capture(1, Hir::empty()));
+ assert_eq!(t("((?i))"), hir_capture(1, Hir::empty()));
+ assert_eq!(t("((?x))"), hir_capture(1, Hir::empty()));
+ assert_eq!(
+ t("(((?x)))"),
+ hir_capture(1, hir_capture(2, Hir::empty()))
+ );
+ }
+
+ #[test]
+ fn line_anchors() {
+ assert_eq!(t("^"), hir_look(hir::Look::Start));
+ assert_eq!(t("$"), hir_look(hir::Look::End));
+ assert_eq!(t(r"\A"), hir_look(hir::Look::Start));
+ assert_eq!(t(r"\z"), hir_look(hir::Look::End));
+
+ assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start));
+ assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End));
+ assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF));
+ assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF));
+
+ assert_eq!(t(r"(?R)\A"), hir_look(hir::Look::Start));
+ assert_eq!(t(r"(?R)\z"), hir_look(hir::Look::End));
+ assert_eq!(t("(?R)^"), hir_look(hir::Look::Start));
+ assert_eq!(t("(?R)$"), hir_look(hir::Look::End));
+
+ assert_eq!(t(r"(?Rm)\A"), hir_look(hir::Look::Start));
+ assert_eq!(t(r"(?Rm)\z"), hir_look(hir::Look::End));
+ assert_eq!(t("(?Rm)^"), hir_look(hir::Look::StartCRLF));
+ assert_eq!(t("(?Rm)$"), hir_look(hir::Look::EndCRLF));
}
#[test]
@@ -1653,46 +1873,44 @@ mod tests {
#[cfg(feature = "unicode-case")]
assert_eq!(
t("(?i:a)a"),
- hir_cat(vec![
- hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')])),
- hir_lit("a"),
- ])
+ hir_cat(
+ vec![hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"),]
+ )
);
assert_eq!(
t("(?i-u:a)β"),
hir_cat(vec![
- hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
hir_lit("β"),
])
);
assert_eq!(
t("(?:(?i-u)a)b"),
hir_cat(vec![
- hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
hir_lit("b"),
])
);
assert_eq!(
t("((?i-u)a)b"),
hir_cat(vec![
- hir_group(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
+ hir_capture(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
hir_lit("b"),
])
);
#[cfg(feature = "unicode-case")]
assert_eq!(
t("(?i)(?-i:a)a"),
- hir_cat(vec![
- hir_group_nocap(hir_lit("a")),
- hir_uclass(&[('A', 'A'), ('a', 'a')]),
- ])
+ hir_cat(
+ vec![hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]),]
+ )
);
#[cfg(feature = "unicode-case")]
assert_eq!(
t("(?im)a^"),
hir_cat(vec![
hir_uclass(&[('A', 'A'), ('a', 'a')]),
- hir_anchor(hir::Anchor::StartLine),
+ hir_look(hir::Look::StartLF),
])
);
#[cfg(feature = "unicode-case")]
@@ -1700,9 +1918,9 @@ mod tests {
t("(?im)a^(?i-m)a^"),
hir_cat(vec![
hir_uclass(&[('A', 'A'), ('a', 'a')]),
- hir_anchor(hir::Anchor::StartLine),
+ hir_look(hir::Look::StartLF),
hir_uclass(&[('A', 'A'), ('a', 'a')]),
- hir_anchor(hir::Anchor::StartText),
+ hir_look(hir::Look::Start),
])
);
assert_eq!(
@@ -1718,10 +1936,10 @@ mod tests {
assert_eq!(
t("(?:a(?i)a)a"),
hir_cat(vec![
- hir_group_nocap(hir_cat(vec![
+ hir_cat(vec![
hir_lit("a"),
hir_uclass(&[('A', 'A'), ('a', 'a')]),
- ])),
+ ]),
hir_lit("a"),
])
);
@@ -1729,10 +1947,10 @@ mod tests {
assert_eq!(
t("(?i)(?:a(?-i)a)a"),
hir_cat(vec![
- hir_group_nocap(hir_cat(vec![
+ hir_cat(vec![
hir_uclass(&[('A', 'A'), ('a', 'a')]),
hir_lit("a"),
- ])),
+ ]),
hir_uclass(&[('A', 'A'), ('a', 'a')]),
])
);
@@ -1755,46 +1973,18 @@ mod tests {
assert_eq!(t("a*?"), hir_star(false, hir_lit("a")));
assert_eq!(t("a+?"), hir_plus(false, hir_lit("a")));
- assert_eq!(
- t("a{1}"),
- hir_range(true, hir::RepetitionRange::Exactly(1), hir_lit("a"),)
- );
- assert_eq!(
- t("a{1,}"),
- hir_range(true, hir::RepetitionRange::AtLeast(1), hir_lit("a"),)
- );
- assert_eq!(
- t("a{1,2}"),
- hir_range(true, hir::RepetitionRange::Bounded(1, 2), hir_lit("a"),)
- );
- assert_eq!(
- t("a{1}?"),
- hir_range(false, hir::RepetitionRange::Exactly(1), hir_lit("a"),)
- );
- assert_eq!(
- t("a{1,}?"),
- hir_range(false, hir::RepetitionRange::AtLeast(1), hir_lit("a"),)
- );
- assert_eq!(
- t("a{1,2}?"),
- hir_range(
- false,
- hir::RepetitionRange::Bounded(1, 2),
- hir_lit("a"),
- )
- );
+ assert_eq!(t("a{1}"), hir_range(true, 1, Some(1), hir_lit("a"),));
+ assert_eq!(t("a{1,}"), hir_range(true, 1, None, hir_lit("a"),));
+ assert_eq!(t("a{1,2}"), hir_range(true, 1, Some(2), hir_lit("a"),));
+ assert_eq!(t("a{1}?"), hir_range(false, 1, Some(1), hir_lit("a"),));
+ assert_eq!(t("a{1,}?"), hir_range(false, 1, None, hir_lit("a"),));
+ assert_eq!(t("a{1,2}?"), hir_range(false, 1, Some(2), hir_lit("a"),));
assert_eq!(
t("ab?"),
hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
);
- assert_eq!(
- t("(ab)?"),
- hir_quest(
- true,
- hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),]))
- )
- );
+ assert_eq!(t("(ab)?"), hir_quest(true, hir_capture(1, hir_lit("ab"))));
assert_eq!(
t("a|b?"),
hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
@@ -1803,48 +1993,49 @@ mod tests {
#[test]
fn cat_alt() {
+ let a = || hir_look(hir::Look::Start);
+ let b = || hir_look(hir::Look::End);
+ let c = || hir_look(hir::Look::WordUnicode);
+ let d = || hir_look(hir::Look::WordUnicodeNegate);
+
+ assert_eq!(t("(^$)"), hir_capture(1, hir_cat(vec![a(), b()])));
+ assert_eq!(t("^|$"), hir_alt(vec![a(), b()]));
+ assert_eq!(t(r"^|$|\b"), hir_alt(vec![a(), b(), c()]));
assert_eq!(
- t("(ab)"),
- hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),]))
- );
- assert_eq!(t("a|b"), hir_alt(vec![hir_lit("a"), hir_lit("b"),]));
- assert_eq!(
- t("a|b|c"),
- hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),])
- );
- assert_eq!(
- t("ab|bc|cd"),
- hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),])
- );
- assert_eq!(
- t("(a|b)"),
- hir_group(1, hir_alt(vec![hir_lit("a"), hir_lit("b"),]))
+ t(r"^$|$\b|\b\B"),
+ hir_alt(vec![
+ hir_cat(vec![a(), b()]),
+ hir_cat(vec![b(), c()]),
+ hir_cat(vec![c(), d()]),
+ ])
);
+ assert_eq!(t("(^|$)"), hir_capture(1, hir_alt(vec![a(), b()])));
assert_eq!(
- t("(a|b|c)"),
- hir_group(
- 1,
- hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),])
- )
+ t(r"(^|$|\b)"),
+ hir_capture(1, hir_alt(vec![a(), b(), c()]))
);
assert_eq!(
- t("(ab|bc|cd)"),
- hir_group(
+ t(r"(^$|$\b|\b\B)"),
+ hir_capture(
1,
- hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),])
+ hir_alt(vec![
+ hir_cat(vec![a(), b()]),
+ hir_cat(vec![b(), c()]),
+ hir_cat(vec![c(), d()]),
+ ])
)
);
assert_eq!(
- t("(ab|(bc|(cd)))"),
- hir_group(
+ t(r"(^$|($\b|(\b\B)))"),
+ hir_capture(
1,
hir_alt(vec![
- hir_lit("ab"),
- hir_group(
+ hir_cat(vec![a(), b()]),
+ hir_capture(
2,
hir_alt(vec![
- hir_lit("bc"),
- hir_group(3, hir_lit("cd")),
+ hir_cat(vec![b(), c()]),
+ hir_capture(3, hir_cat(vec![c(), d()])),
])
),
])
@@ -1852,68 +2043,107 @@ mod tests {
);
}
+ // Tests the HIR transformation of things like '[a-z]|[A-Z]' into
+ // '[A-Za-z]'. In other words, an alternation of just classes is always
+ // equivalent to a single class corresponding to the union of the branches
+ // in that class. (Unless some branches match invalid UTF-8 and others
+ // match non-ASCII Unicode.)
+ #[test]
+ fn cat_class_flattened() {
+ assert_eq!(t(r"[a-z]|[A-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
+ // Combining all of the letter properties should give us the one giant
+ // letter property.
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"(?x)
+ \p{Lowercase_Letter}
+ |\p{Uppercase_Letter}
+ |\p{Titlecase_Letter}
+ |\p{Modifier_Letter}
+ |\p{Other_Letter}
+ "),
+ hir_uclass_query(ClassQuery::Binary("letter"))
+ );
+ // Byte classes that can truly match invalid UTF-8 cannot be combined
+ // with Unicode classes.
+ assert_eq!(
+ t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]"),
+ hir_alt(vec![
+ hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]),
+ hir_bclass(&[(b'\x90', b'\xFF')]),
+ hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]),
+ ])
+ );
+ // Byte classes on their own can be combined, even if some are ASCII
+ // and others are invalid UTF-8.
+ assert_eq!(
+ t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]"),
+ hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]),
+ );
+ }
+
#[test]
fn class_ascii() {
assert_eq!(
t("[[:alnum:]]"),
- hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum))
+ hir_ascii_uclass(&ast::ClassAsciiKind::Alnum)
);
assert_eq!(
t("[[:alpha:]]"),
- hir_uclass(ascii_class(&ast::ClassAsciiKind::Alpha))
+ hir_ascii_uclass(&ast::ClassAsciiKind::Alpha)
);
assert_eq!(
t("[[:ascii:]]"),
- hir_uclass(ascii_class(&ast::ClassAsciiKind::Ascii))
+ hir_ascii_uclass(&ast::ClassAsciiKind::Ascii)
);
assert_eq!(
t("[[:blank:]]"),
- hir_uclass(ascii_class(&ast::ClassAsciiKind::Blank))
+ hir_ascii_uclass(&ast::ClassAsciiKind::Blank)
);
assert_eq!(
t("[[:cntrl:]]"),
- hir_uclass(ascii_class(&ast::ClassAsciiKind::Cntrl))
+ hir_ascii_uclass(&ast::ClassAsciiKind::Cntrl)
);
assert_eq!(
t("[[:digit:]]"),
- hir_uclass(ascii_class(&ast::ClassAsciiKind::Digit))
+ hir_ascii_uclass(&ast::ClassAsciiKind::Digit)
);
assert_eq!(
t("[[:graph:]]"),
- hir_uclass(ascii_class(&ast::ClassAsciiKind::Graph))
+ hir_ascii_uclass(&ast::ClassAsciiKind::Graph)
);
assert_eq!(
t("[[:lower:]]"),
- hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower))
+ hir_ascii_uclass(&ast::ClassAsciiKind::Lower)
);
assert_eq!(
t("[[:print:]]"),
- hir_uclass(ascii_class(&ast::ClassAsciiKind::Print))
+ hir_ascii_uclass(&ast::ClassAsciiKind::Print)
);
assert_eq!(
t("[[:punct:]]"),
- hir_uclass(ascii_class(&ast::ClassAsciiKind::Punct))
+ hir_ascii_uclass(&ast::ClassAsciiKind::Punct)
);
assert_eq!(
t("[[:space:]]"),
- hir_uclass(ascii_class(&ast::ClassAsciiKind::Space))
+ hir_ascii_uclass(&ast::ClassAsciiKind::Space)
);
assert_eq!(
t("[[:upper:]]"),
- hir_uclass(ascii_class(&ast::ClassAsciiKind::Upper))
+ hir_ascii_uclass(&ast::ClassAsciiKind::Upper)
);
assert_eq!(
t("[[:word:]]"),
- hir_uclass(ascii_class(&ast::ClassAsciiKind::Word))
+ hir_ascii_uclass(&ast::ClassAsciiKind::Word)
);
assert_eq!(
t("[[:xdigit:]]"),
- hir_uclass(ascii_class(&ast::ClassAsciiKind::Xdigit))
+ hir_ascii_uclass(&ast::ClassAsciiKind::Xdigit)
);
assert_eq!(
t("[[:^lower:]]"),
- hir_negate(hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower)))
+ hir_negate(hir_ascii_uclass(&ast::ClassAsciiKind::Lower))
);
#[cfg(feature = "unicode-case")]
assert_eq!(
@@ -1928,13 +2158,11 @@ mod tests {
assert_eq!(
t("(?-u)[[:lower:]]"),
- hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Lower))
+ hir_ascii_bclass(&ast::ClassAsciiKind::Lower)
);
assert_eq!(
t("(?i-u)[[:lower:]]"),
- hir_case_fold(hir_bclass_from_char(ascii_class(
- &ast::ClassAsciiKind::Lower
- )))
+ hir_case_fold(hir_ascii_bclass(&ast::ClassAsciiKind::Lower))
);
assert_eq!(
@@ -1965,14 +2193,14 @@ mod tests {
assert_eq!(
t("[[:alnum:][:^ascii:]]"),
hir_union(
- hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)),
+ hir_ascii_uclass(&ast::ClassAsciiKind::Alnum),
hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
),
);
assert_eq!(
t_bytes("(?-u)[[:alnum:][:^ascii:]]"),
hir_union(
- hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Alnum)),
+ hir_ascii_bclass(&ast::ClassAsciiKind::Alnum),
hir_bclass(&[(0x80, 0xFF)]),
),
);
@@ -1980,7 +2208,7 @@ mod tests {
#[test]
#[cfg(feature = "unicode-perl")]
- fn class_perl() {
+ fn class_perl_unicode() {
// Unicode
assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit")));
assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space")));
@@ -2020,69 +2248,124 @@ mod tests {
);
#[cfg(feature = "unicode-case")]
assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
+ }
+ #[test]
+ fn class_perl_ascii() {
// ASCII only
assert_eq!(
t(r"(?-u)\d"),
- hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))
+ hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
);
assert_eq!(
t(r"(?-u)\s"),
- hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space))
+ hir_ascii_bclass(&ast::ClassAsciiKind::Space)
);
assert_eq!(
t(r"(?-u)\w"),
- hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word))
+ hir_ascii_bclass(&ast::ClassAsciiKind::Word)
);
assert_eq!(
t(r"(?i-u)\d"),
- hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))
+ hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
);
assert_eq!(
t(r"(?i-u)\s"),
- hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space))
+ hir_ascii_bclass(&ast::ClassAsciiKind::Space)
);
assert_eq!(
t(r"(?i-u)\w"),
- hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word))
+ hir_ascii_bclass(&ast::ClassAsciiKind::Word)
);
// ASCII only, negated
assert_eq!(
- t(r"(?-u)\D"),
- hir_negate(hir_bclass_from_char(ascii_class(
- &ast::ClassAsciiKind::Digit
- )))
+ t_bytes(r"(?-u)\D"),
+ hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
);
assert_eq!(
- t(r"(?-u)\S"),
- hir_negate(hir_bclass_from_char(ascii_class(
- &ast::ClassAsciiKind::Space
- )))
+ t_bytes(r"(?-u)\S"),
+ hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
);
assert_eq!(
- t(r"(?-u)\W"),
- hir_negate(hir_bclass_from_char(ascii_class(
- &ast::ClassAsciiKind::Word
- )))
+ t_bytes(r"(?-u)\W"),
+ hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
);
assert_eq!(
- t(r"(?i-u)\D"),
- hir_negate(hir_bclass_from_char(ascii_class(
- &ast::ClassAsciiKind::Digit
- )))
+ t_bytes(r"(?i-u)\D"),
+ hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
);
assert_eq!(
- t(r"(?i-u)\S"),
- hir_negate(hir_bclass_from_char(ascii_class(
- &ast::ClassAsciiKind::Space
- )))
+ t_bytes(r"(?i-u)\S"),
+ hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
);
assert_eq!(
- t(r"(?i-u)\W"),
- hir_negate(hir_bclass_from_char(ascii_class(
- &ast::ClassAsciiKind::Word
- )))
+ t_bytes(r"(?i-u)\W"),
+ hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
+ );
+
+ // ASCII only, negated, with UTF-8 mode enabled.
+ // In this case, negating any Perl class results in an error because
+ // all such classes can match invalid UTF-8.
+ assert_eq!(
+ t_err(r"(?-u)\D"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(7, 1, 8),
+ ),
+ },
+ );
+ assert_eq!(
+ t_err(r"(?-u)\S"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(7, 1, 8),
+ ),
+ },
+ );
+ assert_eq!(
+ t_err(r"(?-u)\W"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(7, 1, 8),
+ ),
+ },
+ );
+ assert_eq!(
+ t_err(r"(?i-u)\D"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(6, 1, 7),
+ Position::new(8, 1, 9),
+ ),
+ },
+ );
+ assert_eq!(
+ t_err(r"(?i-u)\S"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(6, 1, 7),
+ Position::new(8, 1, 9),
+ ),
+ },
+ );
+ assert_eq!(
+ t_err(r"(?i-u)\W"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(6, 1, 7),
+ Position::new(8, 1, 9),
+ ),
+ },
);
}
@@ -2360,16 +2643,7 @@ mod tests {
#[test]
#[cfg(feature = "unicode-gencat")]
fn class_unicode_any_empty() {
- assert_eq!(
- t_err(r"\P{any}"),
- TestError {
- kind: hir::ErrorKind::EmptyClassNotAllowed,
- span: Span::new(
- Position::new(0, 1, 1),
- Position::new(7, 1, 8)
- ),
- }
- );
+ assert_eq!(t(r"\P{any}"), hir_uclass(&[]),);
}
#[test]
@@ -2389,8 +2663,9 @@ mod tests {
#[test]
fn class_bracketed() {
- assert_eq!(t("[a]"), hir_uclass(&[('a', 'a')]));
- assert_eq!(t("[^[a]]"), hir_negate(hir_uclass(&[('a', 'a')])));
+ assert_eq!(t("[a]"), hir_lit("a"));
+ assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')]));
+ assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')])));
assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')]));
assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')]));
assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')]));
@@ -2453,11 +2728,11 @@ mod tests {
);
assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),]));
- assert_eq!(t("[^a]"), hir_negate(hir_uclass(&[('a', 'a')])));
- assert_eq!(t(r"[^\x00]"), hir_negate(hir_uclass(&[('\0', '\0')])));
+ assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')])));
+ assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')])));
assert_eq!(
t_bytes("(?-u)[^a]"),
- hir_negate(hir_bclass(&[(b'a', b'a')]))
+ class_negate(bclass(&[(b'a', b'a')]))
);
#[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
assert_eq!(
@@ -2521,27 +2796,9 @@ mod tests {
}
);
#[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
- assert_eq!(
- t_err(r"[^\s\S]"),
- TestError {
- kind: hir::ErrorKind::EmptyClassNotAllowed,
- span: Span::new(
- Position::new(0, 1, 1),
- Position::new(7, 1, 8)
- ),
- }
- );
+ assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),);
#[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
- assert_eq!(
- t_err(r"(?-u)[^\s\S]"),
- TestError {
- kind: hir::ErrorKind::EmptyClassNotAllowed,
- span: Span::new(
- Position::new(5, 1, 6),
- Position::new(12, 1, 13)
- ),
- }
- );
+ assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),);
}
#[test]
@@ -2663,9 +2920,9 @@ mod tests {
#[test]
fn class_bracketed_nested() {
- assert_eq!(t(r"[a[^c]]"), hir_negate(hir_uclass(&[('c', 'c')])));
- assert_eq!(t(r"[a-b[^c]]"), hir_negate(hir_uclass(&[('c', 'c')])));
- assert_eq!(t(r"[a-c[^c]]"), hir_negate(hir_uclass(&[])));
+ assert_eq!(t(r"[a[^c]]"), class_negate(uclass(&[('c', 'c')])));
+ assert_eq!(t(r"[a-b[^c]]"), class_negate(uclass(&[('c', 'c')])));
+ assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[])));
assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')]));
assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')]));
@@ -2673,12 +2930,12 @@ mod tests {
#[cfg(feature = "unicode-case")]
assert_eq!(
t(r"(?i)[a[^c]]"),
- hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')])))
+ hir_negate(class_case_fold(uclass(&[('c', 'c')])))
);
#[cfg(feature = "unicode-case")]
assert_eq!(
t(r"(?i)[a-b[^c]]"),
- hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')])))
+ hir_negate(class_case_fold(uclass(&[('c', 'c')])))
);
#[cfg(feature = "unicode-case")]
@@ -2689,27 +2946,9 @@ mod tests {
hir_uclass(&[('C', 'C'), ('c', 'c')])
);
- assert_eq!(
- t_err(r"[^a-c[^c]]"),
- TestError {
- kind: hir::ErrorKind::EmptyClassNotAllowed,
- span: Span::new(
- Position::new(0, 1, 1),
- Position::new(10, 1, 11)
- ),
- }
- );
+ assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),);
#[cfg(feature = "unicode-case")]
- assert_eq!(
- t_err(r"(?i)[^a-c[^c]]"),
- TestError {
- kind: hir::ErrorKind::EmptyClassNotAllowed,
- span: Span::new(
- Position::new(4, 1, 5),
- Position::new(14, 1, 15)
- ),
- }
- );
+ assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),);
}
#[test]
@@ -2826,9 +3065,7 @@ mod tests {
#[cfg(feature = "unicode-perl")]
assert_eq!(
t_bytes(r"(?-u)[^\w&&\d]"),
- hir_negate(hir_bclass_from_char(ascii_class(
- &ast::ClassAsciiKind::Digit
- )))
+ hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
);
assert_eq!(
t_bytes(r"(?-u)[^[a-z&&a-c]]"),
@@ -2836,19 +3073,15 @@ mod tests {
);
assert_eq!(
t_bytes(r"(?-u)[^[\w&&\d]]"),
- hir_negate(hir_bclass_from_char(ascii_class(
- &ast::ClassAsciiKind::Digit
- )))
+ hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
);
assert_eq!(
t_bytes(r"(?-u)[^[^\w&&\d]]"),
- hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))
+ hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
);
assert_eq!(
t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"),
- hir_negate(hir_bclass_from_char(ascii_class(
- &ast::ClassAsciiKind::Word
- )))
+ hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
);
}
@@ -2924,284 +3157,427 @@ mod tests {
, # comment
10 # comment
} # comment"),
- hir_range(
- true,
- hir::RepetitionRange::Bounded(5, 10),
- hir_lit("a")
- )
+ hir_range(true, 5, Some(10), hir_lit("a"))
);
assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a "));
}
#[test]
- fn analysis_is_always_utf8() {
+ fn analysis_is_utf8() {
// Positive examples.
- assert!(t_bytes(r"a").is_always_utf8());
- assert!(t_bytes(r"ab").is_always_utf8());
- assert!(t_bytes(r"(?-u)a").is_always_utf8());
- assert!(t_bytes(r"(?-u)ab").is_always_utf8());
- assert!(t_bytes(r"\xFF").is_always_utf8());
- assert!(t_bytes(r"\xFF\xFF").is_always_utf8());
- assert!(t_bytes(r"[^a]").is_always_utf8());
- assert!(t_bytes(r"[^a][^a]").is_always_utf8());
- assert!(t_bytes(r"\b").is_always_utf8());
- assert!(t_bytes(r"\B").is_always_utf8());
- assert!(t_bytes(r"(?-u)\b").is_always_utf8());
+ assert!(props_bytes(r"a").is_utf8());
+ assert!(props_bytes(r"ab").is_utf8());
+ assert!(props_bytes(r"(?-u)a").is_utf8());
+ assert!(props_bytes(r"(?-u)ab").is_utf8());
+ assert!(props_bytes(r"\xFF").is_utf8());
+ assert!(props_bytes(r"\xFF\xFF").is_utf8());
+ assert!(props_bytes(r"[^a]").is_utf8());
+ assert!(props_bytes(r"[^a][^a]").is_utf8());
+ assert!(props_bytes(r"\b").is_utf8());
+ assert!(props_bytes(r"\B").is_utf8());
+ assert!(props_bytes(r"(?-u)\b").is_utf8());
+ assert!(props_bytes(r"(?-u)\B").is_utf8());
// Negative examples.
- assert!(!t_bytes(r"(?-u)\xFF").is_always_utf8());
- assert!(!t_bytes(r"(?-u)\xFF\xFF").is_always_utf8());
- assert!(!t_bytes(r"(?-u)[^a]").is_always_utf8());
- assert!(!t_bytes(r"(?-u)[^a][^a]").is_always_utf8());
- assert!(!t_bytes(r"(?-u)\B").is_always_utf8());
+ assert!(!props_bytes(r"(?-u)\xFF").is_utf8());
+ assert!(!props_bytes(r"(?-u)\xFF\xFF").is_utf8());
+ assert!(!props_bytes(r"(?-u)[^a]").is_utf8());
+ assert!(!props_bytes(r"(?-u)[^a][^a]").is_utf8());
+ }
+
+ #[test]
+ fn analysis_captures_len() {
+ assert_eq!(0, props(r"a").explicit_captures_len());
+ assert_eq!(0, props(r"(?:a)").explicit_captures_len());
+ assert_eq!(0, props(r"(?i-u:a)").explicit_captures_len());
+ assert_eq!(0, props(r"(?i-u)a").explicit_captures_len());
+ assert_eq!(1, props(r"(a)").explicit_captures_len());
+ assert_eq!(1, props(r"(?P<foo>a)").explicit_captures_len());
+ assert_eq!(1, props(r"()").explicit_captures_len());
+ assert_eq!(1, props(r"()a").explicit_captures_len());
+ assert_eq!(1, props(r"(a)+").explicit_captures_len());
+ assert_eq!(2, props(r"(a)(b)").explicit_captures_len());
+ assert_eq!(2, props(r"(a)|(b)").explicit_captures_len());
+ assert_eq!(2, props(r"((a))").explicit_captures_len());
+ assert_eq!(1, props(r"([a&&b])").explicit_captures_len());
+ }
+
+ #[test]
+ fn analysis_static_captures_len() {
+ let len = |pattern| props(pattern).static_explicit_captures_len();
+ assert_eq!(Some(0), len(r""));
+ assert_eq!(Some(0), len(r"foo|bar"));
+ assert_eq!(None, len(r"(foo)|bar"));
+ assert_eq!(None, len(r"foo|(bar)"));
+ assert_eq!(Some(1), len(r"(foo|bar)"));
+ assert_eq!(Some(1), len(r"(a|b|c|d|e|f)"));
+ assert_eq!(Some(1), len(r"(a)|(b)|(c)|(d)|(e)|(f)"));
+ assert_eq!(Some(2), len(r"(a)(b)|(c)(d)|(e)(f)"));
+ assert_eq!(Some(6), len(r"(a)(b)(c)(d)(e)(f)"));
+ assert_eq!(Some(3), len(r"(a)(b)(extra)|(a)(b)()"));
+ assert_eq!(Some(3), len(r"(a)(b)((?:extra)?)"));
+ assert_eq!(None, len(r"(a)(b)(extra)?"));
+ assert_eq!(Some(1), len(r"(foo)|(bar)"));
+ assert_eq!(Some(2), len(r"(foo)(bar)"));
+ assert_eq!(Some(2), len(r"(foo)+(bar)"));
+ assert_eq!(None, len(r"(foo)*(bar)"));
+ assert_eq!(Some(0), len(r"(foo)?{0}"));
+ assert_eq!(None, len(r"(foo)?{1}"));
+ assert_eq!(Some(1), len(r"(foo){1}"));
+ assert_eq!(Some(1), len(r"(foo){1,}"));
+ assert_eq!(Some(1), len(r"(foo){1,}?"));
+ assert_eq!(None, len(r"(foo){1,}??"));
+ assert_eq!(None, len(r"(foo){0,}"));
+ assert_eq!(Some(1), len(r"(foo)(?:bar)"));
+ assert_eq!(Some(2), len(r"(foo(?:bar)+)(?:baz(boo))"));
+ assert_eq!(Some(2), len(r"(?P<bar>foo)(?:bar)(bal|loon)"));
+ assert_eq!(
+ Some(2),
+ len(r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#)
+ );
}
#[test]
fn analysis_is_all_assertions() {
// Positive examples.
- assert!(t(r"\b").is_all_assertions());
- assert!(t(r"\B").is_all_assertions());
- assert!(t(r"^").is_all_assertions());
- assert!(t(r"$").is_all_assertions());
- assert!(t(r"\A").is_all_assertions());
- assert!(t(r"\z").is_all_assertions());
- assert!(t(r"$^\z\A\b\B").is_all_assertions());
- assert!(t(r"$|^|\z|\A|\b|\B").is_all_assertions());
- assert!(t(r"^$|$^").is_all_assertions());
- assert!(t(r"((\b)+())*^").is_all_assertions());
+ let p = props(r"\b");
+ assert!(!p.look_set().is_empty());
+ assert_eq!(p.minimum_len(), Some(0));
+
+ let p = props(r"\B");
+ assert!(!p.look_set().is_empty());
+ assert_eq!(p.minimum_len(), Some(0));
+
+ let p = props(r"^");
+ assert!(!p.look_set().is_empty());
+ assert_eq!(p.minimum_len(), Some(0));
+
+ let p = props(r"$");
+ assert!(!p.look_set().is_empty());
+ assert_eq!(p.minimum_len(), Some(0));
+
+ let p = props(r"\A");
+ assert!(!p.look_set().is_empty());
+ assert_eq!(p.minimum_len(), Some(0));
+
+ let p = props(r"\z");
+ assert!(!p.look_set().is_empty());
+ assert_eq!(p.minimum_len(), Some(0));
+
+ let p = props(r"$^\z\A\b\B");
+ assert!(!p.look_set().is_empty());
+ assert_eq!(p.minimum_len(), Some(0));
+
+ let p = props(r"$|^|\z|\A|\b|\B");
+ assert!(!p.look_set().is_empty());
+ assert_eq!(p.minimum_len(), Some(0));
+
+ let p = props(r"^$|$^");
+ assert!(!p.look_set().is_empty());
+ assert_eq!(p.minimum_len(), Some(0));
+
+ let p = props(r"((\b)+())*^");
+ assert!(!p.look_set().is_empty());
+ assert_eq!(p.minimum_len(), Some(0));
// Negative examples.
- assert!(!t(r"^a").is_all_assertions());
+ let p = props(r"^a");
+ assert!(!p.look_set().is_empty());
+ assert_eq!(p.minimum_len(), Some(1));
+ }
+
+ #[test]
+ fn analysis_look_set_prefix_any() {
+ let p = props(r"(?-u)(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))");
+ assert!(p.look_set_prefix_any().contains(Look::WordAscii));
}
#[test]
fn analysis_is_anchored() {
+ let is_start = |p| props(p).look_set_prefix().contains(Look::Start);
+ let is_end = |p| props(p).look_set_suffix().contains(Look::End);
+
// Positive examples.
- assert!(t(r"^").is_anchored_start());
- assert!(t(r"$").is_anchored_end());
- assert!(t(r"^").is_line_anchored_start());
- assert!(t(r"$").is_line_anchored_end());
-
- assert!(t(r"^^").is_anchored_start());
- assert!(t(r"$$").is_anchored_end());
- assert!(t(r"^^").is_line_anchored_start());
- assert!(t(r"$$").is_line_anchored_end());
-
- assert!(t(r"^$").is_anchored_start());
- assert!(t(r"^$").is_anchored_end());
- assert!(t(r"^$").is_line_anchored_start());
- assert!(t(r"^$").is_line_anchored_end());
-
- assert!(t(r"^foo").is_anchored_start());
- assert!(t(r"foo$").is_anchored_end());
- assert!(t(r"^foo").is_line_anchored_start());
- assert!(t(r"foo$").is_line_anchored_end());
-
- assert!(t(r"^foo|^bar").is_anchored_start());
- assert!(t(r"foo$|bar$").is_anchored_end());
- assert!(t(r"^foo|^bar").is_line_anchored_start());
- assert!(t(r"foo$|bar$").is_line_anchored_end());
-
- assert!(t(r"^(foo|bar)").is_anchored_start());
- assert!(t(r"(foo|bar)$").is_anchored_end());
- assert!(t(r"^(foo|bar)").is_line_anchored_start());
- assert!(t(r"(foo|bar)$").is_line_anchored_end());
-
- assert!(t(r"^+").is_anchored_start());
- assert!(t(r"$+").is_anchored_end());
- assert!(t(r"^+").is_line_anchored_start());
- assert!(t(r"$+").is_line_anchored_end());
- assert!(t(r"^++").is_anchored_start());
- assert!(t(r"$++").is_anchored_end());
- assert!(t(r"^++").is_line_anchored_start());
- assert!(t(r"$++").is_line_anchored_end());
- assert!(t(r"(^)+").is_anchored_start());
- assert!(t(r"($)+").is_anchored_end());
- assert!(t(r"(^)+").is_line_anchored_start());
- assert!(t(r"($)+").is_line_anchored_end());
-
- assert!(t(r"$^").is_anchored_start());
- assert!(t(r"$^").is_anchored_start());
- assert!(t(r"$^").is_line_anchored_end());
- assert!(t(r"$^").is_line_anchored_end());
- assert!(t(r"$^|^$").is_anchored_start());
- assert!(t(r"$^|^$").is_anchored_end());
- assert!(t(r"$^|^$").is_line_anchored_start());
- assert!(t(r"$^|^$").is_line_anchored_end());
-
- assert!(t(r"\b^").is_anchored_start());
- assert!(t(r"$\b").is_anchored_end());
- assert!(t(r"\b^").is_line_anchored_start());
- assert!(t(r"$\b").is_line_anchored_end());
- assert!(t(r"^(?m:^)").is_anchored_start());
- assert!(t(r"(?m:$)$").is_anchored_end());
- assert!(t(r"^(?m:^)").is_line_anchored_start());
- assert!(t(r"(?m:$)$").is_line_anchored_end());
- assert!(t(r"(?m:^)^").is_anchored_start());
- assert!(t(r"$(?m:$)").is_anchored_end());
- assert!(t(r"(?m:^)^").is_line_anchored_start());
- assert!(t(r"$(?m:$)").is_line_anchored_end());
+ assert!(is_start(r"^"));
+ assert!(is_end(r"$"));
- // Negative examples.
- assert!(!t(r"(?m)^").is_anchored_start());
- assert!(!t(r"(?m)$").is_anchored_end());
- assert!(!t(r"(?m:^$)|$^").is_anchored_start());
- assert!(!t(r"(?m:^$)|$^").is_anchored_end());
- assert!(!t(r"$^|(?m:^$)").is_anchored_start());
- assert!(!t(r"$^|(?m:^$)").is_anchored_end());
-
- assert!(!t(r"a^").is_anchored_start());
- assert!(!t(r"$a").is_anchored_start());
- assert!(!t(r"a^").is_line_anchored_start());
- assert!(!t(r"$a").is_line_anchored_start());
-
- assert!(!t(r"a^").is_anchored_end());
- assert!(!t(r"$a").is_anchored_end());
- assert!(!t(r"a^").is_line_anchored_end());
- assert!(!t(r"$a").is_line_anchored_end());
-
- assert!(!t(r"^foo|bar").is_anchored_start());
- assert!(!t(r"foo|bar$").is_anchored_end());
- assert!(!t(r"^foo|bar").is_line_anchored_start());
- assert!(!t(r"foo|bar$").is_line_anchored_end());
-
- assert!(!t(r"^*").is_anchored_start());
- assert!(!t(r"$*").is_anchored_end());
- assert!(!t(r"^*").is_line_anchored_start());
- assert!(!t(r"$*").is_line_anchored_end());
- assert!(!t(r"^*+").is_anchored_start());
- assert!(!t(r"$*+").is_anchored_end());
- assert!(!t(r"^*+").is_line_anchored_start());
- assert!(!t(r"$*+").is_line_anchored_end());
- assert!(!t(r"^+*").is_anchored_start());
- assert!(!t(r"$+*").is_anchored_end());
- assert!(!t(r"^+*").is_line_anchored_start());
- assert!(!t(r"$+*").is_line_anchored_end());
- assert!(!t(r"(^)*").is_anchored_start());
- assert!(!t(r"($)*").is_anchored_end());
- assert!(!t(r"(^)*").is_line_anchored_start());
- assert!(!t(r"($)*").is_line_anchored_end());
- }
+ assert!(is_start(r"^^"));
+ assert!(props(r"$$").look_set_suffix().contains(Look::End));
- #[test]
- fn analysis_is_line_anchored() {
- assert!(t(r"(?m)^(foo|bar)").is_line_anchored_start());
- assert!(t(r"(?m)(foo|bar)$").is_line_anchored_end());
+ assert!(is_start(r"^$"));
+ assert!(is_end(r"^$"));
- assert!(t(r"(?m)^foo|^bar").is_line_anchored_start());
- assert!(t(r"(?m)foo$|bar$").is_line_anchored_end());
+ assert!(is_start(r"^foo"));
+ assert!(is_end(r"foo$"));
+
+ assert!(is_start(r"^foo|^bar"));
+ assert!(is_end(r"foo$|bar$"));
+
+ assert!(is_start(r"^(foo|bar)"));
+ assert!(is_end(r"(foo|bar)$"));
+
+ assert!(is_start(r"^+"));
+ assert!(is_end(r"$+"));
+ assert!(is_start(r"^++"));
+ assert!(is_end(r"$++"));
+ assert!(is_start(r"(^)+"));
+ assert!(is_end(r"($)+"));
+
+ assert!(is_start(r"$^"));
+ assert!(is_start(r"$^"));
+ assert!(is_start(r"$^|^$"));
+ assert!(is_end(r"$^|^$"));
+
+ assert!(is_start(r"\b^"));
+ assert!(is_end(r"$\b"));
+ assert!(is_start(r"^(?m:^)"));
+ assert!(is_end(r"(?m:$)$"));
+ assert!(is_start(r"(?m:^)^"));
+ assert!(is_end(r"$(?m:$)"));
+
+ // Negative examples.
+ assert!(!is_start(r"(?m)^"));
+ assert!(!is_end(r"(?m)$"));
+ assert!(!is_start(r"(?m:^$)|$^"));
+ assert!(!is_end(r"(?m:^$)|$^"));
+ assert!(!is_start(r"$^|(?m:^$)"));
+ assert!(!is_end(r"$^|(?m:^$)"));
- assert!(t(r"(?m)^").is_line_anchored_start());
- assert!(t(r"(?m)$").is_line_anchored_end());
+ assert!(!is_start(r"a^"));
+ assert!(!is_start(r"$a"));
- assert!(t(r"(?m:^$)|$^").is_line_anchored_start());
- assert!(t(r"(?m:^$)|$^").is_line_anchored_end());
+ assert!(!is_end(r"a^"));
+ assert!(!is_end(r"$a"));
- assert!(t(r"$^|(?m:^$)").is_line_anchored_start());
- assert!(t(r"$^|(?m:^$)").is_line_anchored_end());
+ assert!(!is_start(r"^foo|bar"));
+ assert!(!is_end(r"foo|bar$"));
+
+ assert!(!is_start(r"^*"));
+ assert!(!is_end(r"$*"));
+ assert!(!is_start(r"^*+"));
+ assert!(!is_end(r"$*+"));
+ assert!(!is_start(r"^+*"));
+ assert!(!is_end(r"$+*"));
+ assert!(!is_start(r"(^)*"));
+ assert!(!is_end(r"($)*"));
}
#[test]
fn analysis_is_any_anchored() {
+ let is_start = |p| props(p).look_set().contains(Look::Start);
+ let is_end = |p| props(p).look_set().contains(Look::End);
+
// Positive examples.
- assert!(t(r"^").is_any_anchored_start());
- assert!(t(r"$").is_any_anchored_end());
- assert!(t(r"\A").is_any_anchored_start());
- assert!(t(r"\z").is_any_anchored_end());
+ assert!(is_start(r"^"));
+ assert!(is_end(r"$"));
+ assert!(is_start(r"\A"));
+ assert!(is_end(r"\z"));
// Negative examples.
- assert!(!t(r"(?m)^").is_any_anchored_start());
- assert!(!t(r"(?m)$").is_any_anchored_end());
- assert!(!t(r"$").is_any_anchored_start());
- assert!(!t(r"^").is_any_anchored_end());
+ assert!(!is_start(r"(?m)^"));
+ assert!(!is_end(r"(?m)$"));
+ assert!(!is_start(r"$"));
+ assert!(!is_end(r"^"));
}
#[test]
- fn analysis_is_match_empty() {
+ fn analysis_can_empty() {
// Positive examples.
- assert!(t(r"").is_match_empty());
- assert!(t(r"()").is_match_empty());
- assert!(t(r"()*").is_match_empty());
- assert!(t(r"()+").is_match_empty());
- assert!(t(r"()?").is_match_empty());
- assert!(t(r"a*").is_match_empty());
- assert!(t(r"a?").is_match_empty());
- assert!(t(r"a{0}").is_match_empty());
- assert!(t(r"a{0,}").is_match_empty());
- assert!(t(r"a{0,1}").is_match_empty());
- assert!(t(r"a{0,10}").is_match_empty());
+ let assert_empty =
+ |p| assert_eq!(Some(0), props_bytes(p).minimum_len());
+ assert_empty(r"");
+ assert_empty(r"()");
+ assert_empty(r"()*");
+ assert_empty(r"()+");
+ assert_empty(r"()?");
+ assert_empty(r"a*");
+ assert_empty(r"a?");
+ assert_empty(r"a{0}");
+ assert_empty(r"a{0,}");
+ assert_empty(r"a{0,1}");
+ assert_empty(r"a{0,10}");
#[cfg(feature = "unicode-gencat")]
- assert!(t(r"\pL*").is_match_empty());
- assert!(t(r"a*|b").is_match_empty());
- assert!(t(r"b|a*").is_match_empty());
- assert!(t(r"a|").is_match_empty());
- assert!(t(r"|a").is_match_empty());
- assert!(t(r"a||b").is_match_empty());
- assert!(t(r"a*a?(abcd)*").is_match_empty());
- assert!(t(r"^").is_match_empty());
- assert!(t(r"$").is_match_empty());
- assert!(t(r"(?m)^").is_match_empty());
- assert!(t(r"(?m)$").is_match_empty());
- assert!(t(r"\A").is_match_empty());
- assert!(t(r"\z").is_match_empty());
- assert!(t(r"\B").is_match_empty());
- assert!(t_bytes(r"(?-u)\B").is_match_empty());
- assert!(t(r"\b").is_match_empty());
- assert!(t(r"(?-u)\b").is_match_empty());
+ assert_empty(r"\pL*");
+ assert_empty(r"a*|b");
+ assert_empty(r"b|a*");
+ assert_empty(r"a|");
+ assert_empty(r"|a");
+ assert_empty(r"a||b");
+ assert_empty(r"a*a?(abcd)*");
+ assert_empty(r"^");
+ assert_empty(r"$");
+ assert_empty(r"(?m)^");
+ assert_empty(r"(?m)$");
+ assert_empty(r"\A");
+ assert_empty(r"\z");
+ assert_empty(r"\B");
+ assert_empty(r"(?-u)\B");
+ assert_empty(r"\b");
+ assert_empty(r"(?-u)\b");
// Negative examples.
- assert!(!t(r"a+").is_match_empty());
- assert!(!t(r"a{1}").is_match_empty());
- assert!(!t(r"a{1,}").is_match_empty());
- assert!(!t(r"a{1,2}").is_match_empty());
- assert!(!t(r"a{1,10}").is_match_empty());
- assert!(!t(r"b|a").is_match_empty());
- assert!(!t(r"a*a+(abcd)*").is_match_empty());
+ let assert_non_empty =
+ |p| assert_ne!(Some(0), props_bytes(p).minimum_len());
+ assert_non_empty(r"a+");
+ assert_non_empty(r"a{1}");
+ assert_non_empty(r"a{1,}");
+ assert_non_empty(r"a{1,2}");
+ assert_non_empty(r"a{1,10}");
+ assert_non_empty(r"b|a");
+ assert_non_empty(r"a*a+(abcd)*");
+ #[cfg(feature = "unicode-gencat")]
+ assert_non_empty(r"\P{any}");
+ assert_non_empty(r"[a--a]");
+ assert_non_empty(r"[a&&b]");
}
#[test]
fn analysis_is_literal() {
// Positive examples.
- assert!(t(r"a").is_literal());
- assert!(t(r"ab").is_literal());
- assert!(t(r"abc").is_literal());
- assert!(t(r"(?m)abc").is_literal());
+ assert!(props(r"a").is_literal());
+ assert!(props(r"ab").is_literal());
+ assert!(props(r"abc").is_literal());
+ assert!(props(r"(?m)abc").is_literal());
+ assert!(props(r"(?:a)").is_literal());
+ assert!(props(r"foo(?:a)").is_literal());
+ assert!(props(r"(?:a)foo").is_literal());
+ assert!(props(r"[a]").is_literal());
// Negative examples.
- assert!(!t(r"").is_literal());
- assert!(!t(r"^").is_literal());
- assert!(!t(r"a|b").is_literal());
- assert!(!t(r"(a)").is_literal());
- assert!(!t(r"a+").is_literal());
- assert!(!t(r"foo(a)").is_literal());
- assert!(!t(r"(a)foo").is_literal());
- assert!(!t(r"[a]").is_literal());
+ assert!(!props(r"").is_literal());
+ assert!(!props(r"^").is_literal());
+ assert!(!props(r"a|b").is_literal());
+ assert!(!props(r"(a)").is_literal());
+ assert!(!props(r"a+").is_literal());
+ assert!(!props(r"foo(a)").is_literal());
+ assert!(!props(r"(a)foo").is_literal());
+ assert!(!props(r"[ab]").is_literal());
}
#[test]
fn analysis_is_alternation_literal() {
// Positive examples.
- assert!(t(r"a").is_alternation_literal());
- assert!(t(r"ab").is_alternation_literal());
- assert!(t(r"abc").is_alternation_literal());
- assert!(t(r"(?m)abc").is_alternation_literal());
- assert!(t(r"a|b").is_alternation_literal());
- assert!(t(r"a|b|c").is_alternation_literal());
- assert!(t(r"foo|bar").is_alternation_literal());
- assert!(t(r"foo|bar|baz").is_alternation_literal());
+ assert!(props(r"a").is_alternation_literal());
+ assert!(props(r"ab").is_alternation_literal());
+ assert!(props(r"abc").is_alternation_literal());
+ assert!(props(r"(?m)abc").is_alternation_literal());
+ assert!(props(r"foo|bar").is_alternation_literal());
+ assert!(props(r"foo|bar|baz").is_alternation_literal());
+ assert!(props(r"[a]").is_alternation_literal());
+ assert!(props(r"(?:ab)|cd").is_alternation_literal());
+ assert!(props(r"ab|(?:cd)").is_alternation_literal());
// Negative examples.
- assert!(!t(r"").is_alternation_literal());
- assert!(!t(r"^").is_alternation_literal());
- assert!(!t(r"(a)").is_alternation_literal());
- assert!(!t(r"a+").is_alternation_literal());
- assert!(!t(r"foo(a)").is_alternation_literal());
- assert!(!t(r"(a)foo").is_alternation_literal());
- assert!(!t(r"[a]").is_alternation_literal());
- assert!(!t(r"[a]|b").is_alternation_literal());
- assert!(!t(r"a|[b]").is_alternation_literal());
- assert!(!t(r"(a)|b").is_alternation_literal());
- assert!(!t(r"a|(b)").is_alternation_literal());
+ assert!(!props(r"").is_alternation_literal());
+ assert!(!props(r"^").is_alternation_literal());
+ assert!(!props(r"(a)").is_alternation_literal());
+ assert!(!props(r"a+").is_alternation_literal());
+ assert!(!props(r"foo(a)").is_alternation_literal());
+ assert!(!props(r"(a)foo").is_alternation_literal());
+ assert!(!props(r"[ab]").is_alternation_literal());
+ assert!(!props(r"[ab]|b").is_alternation_literal());
+ assert!(!props(r"a|[ab]").is_alternation_literal());
+ assert!(!props(r"(a)|b").is_alternation_literal());
+ assert!(!props(r"a|(b)").is_alternation_literal());
+ assert!(!props(r"a|b").is_alternation_literal());
+ assert!(!props(r"a|b|c").is_alternation_literal());
+ assert!(!props(r"[a]|b").is_alternation_literal());
+ assert!(!props(r"a|[b]").is_alternation_literal());
+ assert!(!props(r"(?:a)|b").is_alternation_literal());
+ assert!(!props(r"a|(?:b)").is_alternation_literal());
+ assert!(!props(r"(?:z|xx)@|xx").is_alternation_literal());
+ }
+
+ // This tests that the smart Hir::concat constructor simplifies the given
+ // exprs in a way we expect.
+ #[test]
+ fn smart_concat() {
+ assert_eq!(t(""), Hir::empty());
+ assert_eq!(t("(?:)"), Hir::empty());
+ assert_eq!(t("abc"), hir_lit("abc"));
+ assert_eq!(t("(?:foo)(?:bar)"), hir_lit("foobar"));
+ assert_eq!(t("quux(?:foo)(?:bar)baz"), hir_lit("quuxfoobarbaz"));
+ assert_eq!(
+ t("foo(?:bar^baz)quux"),
+ hir_cat(vec![
+ hir_lit("foobar"),
+ hir_look(hir::Look::Start),
+ hir_lit("bazquux"),
+ ])
+ );
+ assert_eq!(
+ t("foo(?:ba(?:r^b)az)quux"),
+ hir_cat(vec![
+ hir_lit("foobar"),
+ hir_look(hir::Look::Start),
+ hir_lit("bazquux"),
+ ])
+ );
+ }
+
+ // This tests that the smart Hir::alternation constructor simplifies the
+ // given exprs in a way we expect.
+ #[test]
+ fn smart_alternation() {
+ assert_eq!(
+ t("(?:foo)|(?:bar)"),
+ hir_alt(vec![hir_lit("foo"), hir_lit("bar")])
+ );
+ assert_eq!(
+ t("quux|(?:abc|def|xyz)|baz"),
+ hir_alt(vec![
+ hir_lit("quux"),
+ hir_lit("abc"),
+ hir_lit("def"),
+ hir_lit("xyz"),
+ hir_lit("baz"),
+ ])
+ );
+ assert_eq!(
+ t("quux|(?:abc|(?:def|mno)|xyz)|baz"),
+ hir_alt(vec![
+ hir_lit("quux"),
+ hir_lit("abc"),
+ hir_lit("def"),
+ hir_lit("mno"),
+ hir_lit("xyz"),
+ hir_lit("baz"),
+ ])
+ );
+ assert_eq!(
+ t("a|b|c|d|e|f|x|y|z"),
+ hir_uclass(&[('a', 'f'), ('x', 'z')]),
+ );
+ // Tests that we lift common prefixes out of an alternation.
+ assert_eq!(
+ t("[A-Z]foo|[A-Z]quux"),
+ hir_cat(vec![
+ hir_uclass(&[('A', 'Z')]),
+ hir_alt(vec![hir_lit("foo"), hir_lit("quux")]),
+ ]),
+ );
+ assert_eq!(
+ t("[A-Z][A-Z]|[A-Z]quux"),
+ hir_cat(vec![
+ hir_uclass(&[('A', 'Z')]),
+ hir_alt(vec![hir_uclass(&[('A', 'Z')]), hir_lit("quux")]),
+ ]),
+ );
+ assert_eq!(
+ t("[A-Z][A-Z]|[A-Z][A-Z]quux"),
+ hir_cat(vec![
+ hir_uclass(&[('A', 'Z')]),
+ hir_uclass(&[('A', 'Z')]),
+ hir_alt(vec![Hir::empty(), hir_lit("quux")]),
+ ]),
+ );
+ assert_eq!(
+ t("[A-Z]foo|[A-Z]foobar"),
+ hir_cat(vec![
+ hir_uclass(&[('A', 'Z')]),
+ hir_alt(vec![hir_lit("foo"), hir_lit("foobar")]),
+ ]),
+ );
}
}
diff --git a/vendor/regex-syntax/src/hir/visitor.rs b/vendor/regex-syntax/src/hir/visitor.rs
index 4f5a70909..e5f15cf1c 100644
--- a/vendor/regex-syntax/src/hir/visitor.rs
+++ b/vendor/regex-syntax/src/hir/visitor.rs
@@ -1,3 +1,5 @@
+use alloc::{vec, vec::Vec};
+
use crate::hir::{self, Hir, HirKind};
/// A trait for visiting the high-level IR (HIR) in depth first order.
@@ -9,7 +11,7 @@ use crate::hir::{self, Hir, HirKind};
/// important since the size of an HIR may be proportional to end user input.
///
/// Typical usage of this trait involves providing an implementation and then
-/// running it using the [`visit`](fn.visit.html) function.
+/// running it using the [`visit`] function.
pub trait Visitor {
/// The result of visiting an HIR.
type Output;
@@ -44,8 +46,7 @@ pub trait Visitor {
/// Executes an implementation of `Visitor` in constant stack space.
///
/// This function will visit every node in the given `Hir` while calling
-/// appropriate methods provided by the
-/// [`Visitor`](trait.Visitor.html) trait.
+/// appropriate methods provided by the [`Visitor`] trait.
///
/// The primary use case for this method is when one wants to perform case
/// analysis over an `Hir` without using a stack size proportional to the depth
@@ -74,9 +75,9 @@ enum Frame<'a> {
/// A stack frame allocated just before descending into a repetition
/// operator's child node.
Repetition(&'a hir::Repetition),
- /// A stack frame allocated just before descending into a group's child
+ /// A stack frame allocated just before descending into a capture's child
/// node.
- Group(&'a hir::Group),
+ Capture(&'a hir::Capture),
/// The stack frame used while visiting every child node of a concatenation
/// of expressions.
Concat {
@@ -149,7 +150,7 @@ impl<'a> HeapVisitor<'a> {
fn induct(&mut self, hir: &'a Hir) -> Option<Frame<'a>> {
match *hir.kind() {
HirKind::Repetition(ref x) => Some(Frame::Repetition(x)),
- HirKind::Group(ref x) => Some(Frame::Group(x)),
+ HirKind::Capture(ref x) => Some(Frame::Capture(x)),
HirKind::Concat(ref x) if x.is_empty() => None,
HirKind::Concat(ref x) => {
Some(Frame::Concat { head: &x[0], tail: &x[1..] })
@@ -167,7 +168,7 @@ impl<'a> HeapVisitor<'a> {
fn pop(&self, induct: Frame<'a>) -> Option<Frame<'a>> {
match induct {
Frame::Repetition(_) => None,
- Frame::Group(_) => None,
+ Frame::Capture(_) => None,
Frame::Concat { tail, .. } => {
if tail.is_empty() {
None
@@ -194,8 +195,8 @@ impl<'a> Frame<'a> {
/// child HIR node to visit.
fn child(&self) -> &'a Hir {
match *self {
- Frame::Repetition(rep) => &rep.hir,
- Frame::Group(group) => &group.hir,
+ Frame::Repetition(rep) => &rep.sub,
+ Frame::Capture(capture) => &capture.sub,
Frame::Concat { head, .. } => head,
Frame::Alternation { head, .. } => head,
}