summaryrefslogtreecommitdiffstats
path: root/vendor/regex-syntax/src/hir
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-06-19 09:26:03 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-06-19 09:26:03 +0000
commit9918693037dce8aa4bb6f08741b6812923486c18 (patch)
tree21d2b40bec7e6a7ea664acee056eb3d08e15a1cf /vendor/regex-syntax/src/hir
parentReleasing progress-linux version 1.75.0+dfsg1-5~progress7.99u1. (diff)
downloadrustc-9918693037dce8aa4bb6f08741b6812923486c18.tar.xz
rustc-9918693037dce8aa4bb6f08741b6812923486c18.zip
Merging upstream version 1.76.0+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-syntax/src/hir')
-rw-r--r--vendor/regex-syntax/src/hir/literal.rs167
-rw-r--r--vendor/regex-syntax/src/hir/mod.rs196
-rw-r--r--vendor/regex-syntax/src/hir/print.rs59
-rw-r--r--vendor/regex-syntax/src/hir/translate.rs297
-rw-r--r--vendor/regex-syntax/src/hir/visitor.rs15
5 files changed, 544 insertions, 190 deletions
diff --git a/vendor/regex-syntax/src/hir/literal.rs b/vendor/regex-syntax/src/hir/literal.rs
index bd3a2d143..a5a3737f6 100644
--- a/vendor/regex-syntax/src/hir/literal.rs
+++ b/vendor/regex-syntax/src/hir/literal.rs
@@ -23,7 +23,7 @@ effective literal optimizations:
to lead to substring search that is only a little faster than a regex search,
and thus the overhead of using literal optimizations in the first place might
make things slower overall.
-* The literals in your [`Seq`] shoudn't be too short. In general, longer is
+* The literals in your [`Seq`] shouldn't be too short. In general, longer is
better. A sequence corresponding to single bytes that occur frequently in the
haystack, for example, is probably a bad literal optimization because it's
likely to produce many false positive candidates. Longer literals are less
@@ -51,7 +51,7 @@ the "trickier" parts are how to combine literal sequences, and that is all
implemented on [`Seq`].
*/
-use core::{cmp, mem};
+use core::{cmp, mem, num::NonZeroUsize};
use alloc::{vec, vec::Vec};
@@ -477,7 +477,7 @@ impl Extractor {
}
seq
}
- hir::Repetition { min, max: Some(max), .. } if min < max => {
+ hir::Repetition { min, .. } => {
assert!(min > 0); // handled above
let limit =
u32::try_from(self.limit_repeat).unwrap_or(u32::MAX);
@@ -491,10 +491,6 @@ impl Extractor {
seq.make_inexact();
seq
}
- hir::Repetition { .. } => {
- subseq.make_inexact();
- subseq
- }
}
}
@@ -692,7 +688,7 @@ impl Default for ExtractKind {
/// from making assumptions about what literals are required in order to match
/// a particular [`Hir`] expression. Generally speaking, when a set is in this
/// state, literal optimizations are inhibited. A good example of a regex that
-/// will cause this sort of set to apppear is `[A-Za-z]`. The character class
+/// will cause this sort of set to appear is `[A-Za-z]`. The character class
/// is just too big (and also too narrow) to be usefully expanded into 52
/// different literals. (Note that the decision for when a seq should become
/// infinite is determined by the caller. A seq itself has no hard-coded
@@ -1571,7 +1567,7 @@ impl Seq {
/// unioning `self` with `other`. If either set is infinite, then this
/// returns `None`.
#[inline]
- fn max_union_len(&self, other: &Seq) -> Option<usize> {
+ pub fn max_union_len(&self, other: &Seq) -> Option<usize> {
let len1 = self.len()?;
let len2 = other.len()?;
Some(len1.saturating_add(len2))
@@ -1581,7 +1577,7 @@ impl Seq {
/// cross product of `self` with `other`. If either set is infinite, then
/// this returns `None`.
#[inline]
- fn max_cross_len(&self, other: &Seq) -> Option<usize> {
+ pub fn max_cross_len(&self, other: &Seq) -> Option<usize> {
let len1 = self.len()?;
let len2 = other.len()?;
Some(len1.saturating_mul(len2))
@@ -1841,6 +1837,14 @@ impl Seq {
None => return,
Some(len) => len,
};
+ // Just give up now if our sequence contains an empty string.
+ if self.min_literal_len().map_or(false, |len| len == 0) {
+ // We squash the sequence so that nobody else gets any bright
+ // ideas to try and use it. An empty string implies a match at
+ // every position. A prefilter cannot help you here.
+ self.make_infinite();
+ return;
+ }
// Make sure we start with the smallest sequence possible. We use a
// special version of preference minimization that retains exactness.
// This is legal because optimization is only expected to occur once
@@ -1910,34 +1914,41 @@ impl Seq {
// longest common prefix to be subject to the poison check.
}
}
- // Everything below this check is more-or-less about trying to
- // heuristically reduce the false positive rate of a prefilter. But
- // if our sequence is completely exact, then it's possible the regex
- // engine can be skipped entirely. In this case, the false positive
- // rate is zero because every literal match corresponds to a regex
- // match.
+ // If we have an exact sequence, we *probably* just want to keep it
+ // as-is. But there are some cases where we don't. So we save a copy of
+ // the exact sequence now, and then try to do some more optimizations
+ // below. If those don't work out, we go back to this exact sequence.
//
- // This is OK even if the sequence contains a poison literal. Remember,
- // a literal is only poisononous because of what we assume about its
- // impact on the false positive rate. However, we do still check for
- // an empty string. Empty strings are weird and it's best to let the
- // regex engine handle those.
+ // The specific motivation for this is that we sometimes wind up with
+ // an exact sequence with a hefty number of literals. Say, 100. If we
+ // stuck with that, it would be too big for Teddy and would result in
+ // using Aho-Corasick. Which is fine... but the lazy DFA is plenty
+ // suitable in such cases. The real issue is that we will wind up not
+ // using a fast prefilter at all. So in cases like this, even though
+ // we have an exact sequence, it would be better to try and shrink the
+ // sequence (which we do below) and use it as a prefilter that can
+ // produce false positive matches.
//
- // We do currently do this check after the longest common prefix (or
- // suffix) check, under the theory that single-substring search is so
- // fast that we want that even if we'd end up turning an exact sequence
- // into an inexact one. But this might be wrong...
- if self.is_exact()
- && self.min_literal_len().map_or(false, |len| len > 0)
- {
- return;
- }
+ // But if the shrinking below results in a sequence that "sucks," then
+ // we don't want to use that because we already have an exact sequence
+ // in hand.
+ let exact: Option<Seq> =
+ if self.is_exact() { Some(self.clone()) } else { None };
// Now we attempt to shorten the sequence. The idea here is that we
// don't want to look for too many literals, but we want to shorten
// our sequence enough to improve our odds of using better algorithms
// downstream (such as Teddy).
+ //
+ // The pair of numbers in this list corresponds to the maximal prefix
+ // (in bytes) to keep for all literals and the length of the sequence
+ // at which to do it.
+ //
+ // So for example, the pair (3, 500) would mean, "if we have more than
+ // 500 literals in our sequence, then truncate all of our literals
+ // such that they are at most 3 bytes in length and then minimize the
+ // sequence."
const ATTEMPTS: [(usize, usize); 5] =
- [(5, 64), (4, 64), (3, 64), (2, 64), (1, 10)];
+ [(5, 10), (4, 10), (3, 64), (2, 64), (1, 10)];
for (keep, limit) in ATTEMPTS {
let len = match self.len() {
None => break,
@@ -1951,7 +1962,11 @@ impl Seq {
} else {
self.keep_last_bytes(keep);
}
- self.minimize_by_preference();
+ if prefix {
+ if let Some(ref mut lits) = self.literals {
+ PreferenceTrie::minimize(lits, true);
+ }
+ }
}
// Check for a poison literal. A poison literal is one that is short
// and is believed to have a very high match count. These poisons
@@ -1968,6 +1983,30 @@ impl Seq {
self.make_infinite();
}
}
+ // OK, if we had an exact sequence before attempting more optimizations
+ // above and our post-optimized sequence sucks for some reason or
+ // another, then we go back to the exact sequence.
+ if let Some(exact) = exact {
+ // If optimizing resulted in dropping our literals, then certainly
+ // backup and use the exact sequence that we had.
+ if !self.is_finite() {
+ *self = exact;
+ return;
+ }
+ // If our optimized sequence contains a short literal, then it's
+ // *probably* not so great. So throw it away and revert to the
+ // exact sequence.
+ if self.min_literal_len().map_or(true, |len| len <= 2) {
+ *self = exact;
+ return;
+ }
+ // Finally, if our optimized sequence is "big" (i.e., can't use
+ // Teddy), then also don't use it and rely on the exact sequence.
+ if self.len().map_or(true, |len| len > 64) {
+ *self = exact;
+ return;
+ }
+ }
}
}
@@ -1977,7 +2016,7 @@ impl core::fmt::Debug for Seq {
if let Some(lits) = self.literals() {
f.debug_list().entries(lits.iter()).finish()
} else {
- write!(f, "[∅]")
+ write!(f, "[∞]")
}
}
}
@@ -2160,12 +2199,19 @@ impl core::fmt::Debug for Literal {
/// never seen this show up on a profile. Because of the heuristic limits
/// imposed on literal extractions, the size of the inputs here is usually
/// very small.)
-#[derive(Debug, Default)]
+#[derive(Debug)]
struct PreferenceTrie {
/// The states in this trie. The index of a state in this vector is its ID.
states: Vec<State>,
+ /// This vec indicates which states are match states. It always has
+ /// the same length as `states` and is indexed by the same state ID.
+ /// A state with identifier `sid` is a match state if and only if
+ /// `matches[sid].is_some()`. The option contains the index of the literal
+ /// corresponding to the match. The index is offset by 1 so that it fits in
+ /// a NonZeroUsize.
+ matches: Vec<Option<NonZeroUsize>>,
/// The index to allocate to the next literal added to this trie. Starts at
- /// 0 and increments by 1 for every literal successfully added to the trie.
+ /// 1 and increments by 1 for every literal successfully added to the trie.
next_literal_index: usize,
}
@@ -2176,9 +2222,6 @@ struct State {
/// are sorted by byte. There is at most one such transition for any
/// particular byte.
trans: Vec<(u8, usize)>,
- /// Whether this is a matching state or not. If it is, then it contains the
- /// index to the matching literal.
- literal_index: Option<usize>,
}
impl PreferenceTrie {
@@ -2192,20 +2235,19 @@ impl PreferenceTrie {
/// after them and because any removed literals are guaranteed to never
/// match.
fn minimize(literals: &mut Vec<Literal>, keep_exact: bool) {
- use core::cell::RefCell;
-
- // MSRV(1.61): Use retain_mut here to avoid interior mutability.
- let trie = RefCell::new(PreferenceTrie::default());
+ let mut trie = PreferenceTrie {
+ states: vec![],
+ matches: vec![],
+ next_literal_index: 1,
+ };
let mut make_inexact = vec![];
- literals.retain(|lit| {
- match trie.borrow_mut().insert(lit.as_bytes()) {
- Ok(_) => true,
- Err(i) => {
- if !keep_exact {
- make_inexact.push(i);
- }
- false
+ literals.retain_mut(|lit| match trie.insert(lit.as_bytes()) {
+ Ok(_) => true,
+ Err(i) => {
+ if !keep_exact {
+ make_inexact.push(i.checked_sub(1).unwrap());
}
+ false
}
});
for i in make_inexact {
@@ -2225,15 +2267,15 @@ impl PreferenceTrie {
/// search.
fn insert(&mut self, bytes: &[u8]) -> Result<usize, usize> {
let mut prev = self.root();
- if let Some(idx) = self.states[prev].literal_index {
- return Err(idx);
+ if let Some(idx) = self.matches[prev] {
+ return Err(idx.get());
}
for &b in bytes.iter() {
match self.states[prev].trans.binary_search_by_key(&b, |t| t.0) {
Ok(i) => {
prev = self.states[prev].trans[i].1;
- if let Some(idx) = self.states[prev].literal_index {
- return Err(idx);
+ if let Some(idx) = self.matches[prev] {
+ return Err(idx.get());
}
}
Err(i) => {
@@ -2245,7 +2287,7 @@ impl PreferenceTrie {
}
let idx = self.next_literal_index;
self.next_literal_index += 1;
- self.states[prev].literal_index = Some(idx);
+ self.matches[prev] = NonZeroUsize::new(idx);
Ok(idx)
}
@@ -2262,6 +2304,7 @@ impl PreferenceTrie {
fn create_state(&mut self) -> usize {
let id = self.states.len();
self.states.push(State::default());
+ self.matches.push(None);
id
}
}
@@ -2603,6 +2646,12 @@ mod tests {
]),
e(r"(ab|cd)(ef|gh)(ij|kl)")
);
+
+ assert_eq!(inexact([E("abab")], [E("abab")]), e(r"(ab){2}"));
+
+ assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,3}"));
+
+ assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,}"));
}
#[test]
@@ -2815,13 +2864,13 @@ mod tests {
// repeats.
#[test]
fn crazy_repeats() {
- assert_eq!(inexact([I("")], [I("")]), e(r"(?:){4294967295}"));
+ assert_eq!(inexact([E("")], [E("")]), e(r"(?:){4294967295}"));
assert_eq!(
- inexact([I("")], [I("")]),
+ inexact([E("")], [E("")]),
e(r"(?:){64}{64}{64}{64}{64}{64}")
);
- assert_eq!(inexact([I("")], [I("")]), e(r"x{0}{4294967295}"));
- assert_eq!(inexact([I("")], [I("")]), e(r"(?:|){4294967295}"));
+ assert_eq!(inexact([E("")], [E("")]), e(r"x{0}{4294967295}"));
+ assert_eq!(inexact([E("")], [E("")]), e(r"(?:|){4294967295}"));
assert_eq!(
inexact([E("")], [E("")]),
diff --git a/vendor/regex-syntax/src/hir/mod.rs b/vendor/regex-syntax/src/hir/mod.rs
index 062d4dcab..ce38ead7b 100644
--- a/vendor/regex-syntax/src/hir/mod.rs
+++ b/vendor/regex-syntax/src/hir/mod.rs
@@ -88,6 +88,9 @@ pub enum ErrorKind {
/// This error occurs when translating a pattern that could match a byte
/// sequence that isn't UTF-8 and `utf8` was enabled.
InvalidUtf8,
+ /// This error occurs when one uses a non-ASCII byte for a line terminator,
+ /// but where Unicode mode is enabled and UTF-8 mode is disabled.
+ InvalidLineTerminator,
/// This occurs when an unrecognized Unicode property name could not
/// be found.
UnicodePropertyNotFound,
@@ -120,6 +123,7 @@ impl core::fmt::Display for ErrorKind {
let msg = match *self {
UnicodeNotAllowed => "Unicode not allowed here",
InvalidUtf8 => "pattern can match invalid UTF-8",
+ InvalidLineTerminator => "invalid line terminator, must be ASCII",
UnicodePropertyNotFound => "Unicode property not found",
UnicodePropertyValueNotFound => "Unicode property value not found",
UnicodePerlClassNotFound => {
@@ -180,7 +184,7 @@ impl core::fmt::Display for ErrorKind {
/// matches.
///
/// For empty matches, those can occur at any position. It is the
-/// repsonsibility of the regex engine to determine whether empty matches are
+/// responsibility of the regex engine to determine whether empty matches are
/// permitted between the code units of a single codepoint.
///
/// # Stack space
@@ -355,7 +359,13 @@ impl Hir {
/// Creates a repetition HIR expression.
#[inline]
- pub fn repetition(rep: Repetition) -> Hir {
+ pub fn repetition(mut rep: Repetition) -> Hir {
+ // If the sub-expression of a repetition can only match the empty
+ // string, then we force its maximum to be at most 1.
+ if rep.sub.properties().maximum_len() == Some(0) {
+ rep.min = cmp::min(rep.min, 1);
+ rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1));
+ }
// The regex 'a{0}' is always equivalent to the empty regex. This is
// true even when 'a' is an expression that never matches anything
// (like '\P{any}').
@@ -547,7 +557,7 @@ impl Hir {
// We rebuild the alternation by simplifying it. We proceed similarly
// as the concatenation case. But in this case, there's no literal
// simplification happening. We're just flattening alternations.
- let mut new = vec![];
+ let mut new = Vec::with_capacity(subs.len());
for sub in subs {
let (kind, props) = sub.into_parts();
match kind {
@@ -642,6 +652,12 @@ impl Hir {
cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
Hir::class(Class::Bytes(cls))
}
+ Dot::AnyCharExcept(ch) => {
+ let mut cls =
+ ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
+ cls.negate();
+ Hir::class(Class::Unicode(cls))
+ }
Dot::AnyCharExceptLF => {
let mut cls = ClassUnicode::empty();
cls.push(ClassUnicodeRange::new('\0', '\x09'));
@@ -655,6 +671,12 @@ impl Hir {
cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}'));
Hir::class(Class::Unicode(cls))
}
+ Dot::AnyByteExcept(byte) => {
+ let mut cls =
+ ClassBytes::new([ClassBytesRange::new(byte, byte)]);
+ cls.negate();
+ Hir::class(Class::Bytes(cls))
+ }
Dot::AnyByteExceptLF => {
let mut cls = ClassBytes::empty();
cls.push(ClassBytesRange::new(b'\0', b'\x09'));
@@ -775,13 +797,18 @@ impl core::fmt::Debug for Literal {
/// The high-level intermediate representation of a character class.
///
/// A character class corresponds to a set of characters. A character is either
-/// defined by a Unicode scalar value or a byte. Unicode characters are used
-/// by default, while bytes are used when Unicode mode (via the `u` flag) is
-/// disabled.
+/// defined by a Unicode scalar value or a byte.
///
/// A character class, regardless of its character type, is represented by a
/// sequence of non-overlapping non-adjacent ranges of characters.
///
+/// There are no guarantees about which class variant is used. Generally
+/// speaking, the Unicode variat is used whenever a class needs to contain
+/// non-ASCII Unicode scalar values. But the Unicode variant can be used even
+/// when Unicode mode is disabled. For example, at the time of writing, the
+/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class
+/// `[a\u00A0]` due to optimizations.
+///
/// Note that `Bytes` variant may be produced even when it exclusively matches
/// valid UTF-8. This is because a `Bytes` variant represents an intention by
/// the author of the regular expression to disable Unicode mode, which in turn
@@ -1304,8 +1331,9 @@ impl ClassUnicodeRange {
}
}
-/// A set of characters represented by arbitrary bytes (where one byte
-/// corresponds to one character).
+/// A set of characters represented by arbitrary bytes.
+///
+/// Each byte corresponds to one character.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ClassBytes {
set: IntervalSet<ClassBytesRange>,
@@ -1607,6 +1635,42 @@ pub enum Look {
WordUnicode = 1 << 8,
/// Match a Unicode-aware negation of a word boundary.
WordUnicodeNegate = 1 << 9,
+ /// Match the start of an ASCII-only word boundary. That is, this matches a
+ /// position at either the beginning of the haystack or where the previous
+ /// character is not a word character and the following character is a word
+ /// character.
+ WordStartAscii = 1 << 10,
+ /// Match the end of an ASCII-only word boundary. That is, this matches
+ /// a position at either the end of the haystack or where the previous
+ /// character is a word character and the following character is not a word
+ /// character.
+ WordEndAscii = 1 << 11,
+ /// Match the start of a Unicode word boundary. That is, this matches a
+ /// position at either the beginning of the haystack or where the previous
+ /// character is not a word character and the following character is a word
+ /// character.
+ WordStartUnicode = 1 << 12,
+ /// Match the end of a Unicode word boundary. That is, this matches a
+ /// position at either the end of the haystack or where the previous
+ /// character is a word character and the following character is not a word
+ /// character.
+ WordEndUnicode = 1 << 13,
+ /// Match the start half of an ASCII-only word boundary. That is, this
+ /// matches a position at either the beginning of the haystack or where the
+ /// previous character is not a word character.
+ WordStartHalfAscii = 1 << 14,
+ /// Match the end half of an ASCII-only word boundary. That is, this
+ /// matches a position at either the end of the haystack or where the
+ /// following character is not a word character.
+ WordEndHalfAscii = 1 << 15,
+ /// Match the start half of a Unicode word boundary. That is, this matches
+ /// a position at either the beginning of the haystack or where the
+ /// previous character is not a word character.
+ WordStartHalfUnicode = 1 << 16,
+ /// Match the end half of a Unicode word boundary. That is, this matches
+ /// a position at either the end of the haystack or where the following
+ /// character is not a word character.
+ WordEndHalfUnicode = 1 << 17,
}
impl Look {
@@ -1628,6 +1692,14 @@ impl Look {
Look::WordAsciiNegate => Look::WordAsciiNegate,
Look::WordUnicode => Look::WordUnicode,
Look::WordUnicodeNegate => Look::WordUnicodeNegate,
+ Look::WordStartAscii => Look::WordEndAscii,
+ Look::WordEndAscii => Look::WordStartAscii,
+ Look::WordStartUnicode => Look::WordEndUnicode,
+ Look::WordEndUnicode => Look::WordStartUnicode,
+ Look::WordStartHalfAscii => Look::WordEndHalfAscii,
+ Look::WordEndHalfAscii => Look::WordStartHalfAscii,
+ Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
+ Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
}
}
@@ -1636,28 +1708,36 @@ impl Look {
/// constructor is guaranteed to return the same look-around variant that
/// one started with within a semver compatible release of this crate.
#[inline]
- pub const fn as_repr(self) -> u16 {
+ pub const fn as_repr(self) -> u32 {
// AFAIK, 'as' is the only way to zero-cost convert an int enum to an
// actual int.
- self as u16
+ self as u32
}
/// Given the underlying representation of a `Look` value, return the
/// corresponding `Look` value if the representation is valid. Otherwise
/// `None` is returned.
#[inline]
- pub const fn from_repr(repr: u16) -> Option<Look> {
+ pub const fn from_repr(repr: u32) -> Option<Look> {
match repr {
- 0b00_0000_0001 => Some(Look::Start),
- 0b00_0000_0010 => Some(Look::End),
- 0b00_0000_0100 => Some(Look::StartLF),
- 0b00_0000_1000 => Some(Look::EndLF),
- 0b00_0001_0000 => Some(Look::StartCRLF),
- 0b00_0010_0000 => Some(Look::EndCRLF),
- 0b00_0100_0000 => Some(Look::WordAscii),
- 0b00_1000_0000 => Some(Look::WordAsciiNegate),
- 0b01_0000_0000 => Some(Look::WordUnicode),
- 0b10_0000_0000 => Some(Look::WordUnicodeNegate),
+ 0b00_0000_0000_0000_0001 => Some(Look::Start),
+ 0b00_0000_0000_0000_0010 => Some(Look::End),
+ 0b00_0000_0000_0000_0100 => Some(Look::StartLF),
+ 0b00_0000_0000_0000_1000 => Some(Look::EndLF),
+ 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
+ 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
+ 0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
+ 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
+ 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
+ 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
+ 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
+ 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
+ 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
+ 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
+ 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
+ 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
+ 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
+ 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
_ => None,
}
}
@@ -1682,6 +1762,14 @@ impl Look {
Look::WordAsciiNegate => 'B',
Look::WordUnicode => '𝛃',
Look::WordUnicodeNegate => '𝚩',
+ Look::WordStartAscii => '<',
+ Look::WordEndAscii => '>',
+ Look::WordStartUnicode => '〈',
+ Look::WordEndUnicode => '〉',
+ Look::WordStartHalfAscii => '◁',
+ Look::WordEndHalfAscii => '▷',
+ Look::WordStartHalfUnicode => '◀',
+ Look::WordEndHalfUnicode => '▶',
}
}
}
@@ -1766,6 +1854,18 @@ pub enum Dot {
///
/// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
AnyByte,
+ /// Matches the UTF-8 encoding of any Unicode scalar value except for the
+ /// `char` given.
+ ///
+ /// This is equivalent to using `(?u-s:.)` with the line terminator set
+ /// to a particular ASCII byte. (Because of peculiarities in the regex
+ /// engines, a line terminator must be a single byte. It follows that when
+ /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
+ /// value. That is, it must be ASCII.)
+ ///
+ /// (This and `AnyCharExceptLF` both exist because of legacy reasons.
+ /// `AnyCharExceptLF` will be dropped in the next breaking change release.)
+ AnyCharExcept(char),
/// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
///
/// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
@@ -1775,6 +1875,17 @@ pub enum Dot {
///
/// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
AnyCharExceptCRLF,
+ /// Matches any byte value except for the `u8` given.
+ ///
+ /// This is equivalent to using `(?-us:.)` with the line terminator set
+ /// to a particular ASCII byte. (Because of peculiarities in the regex
+ /// engines, a line terminator must be a single byte. It follows that when
+ /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
+ /// value. That is, it must be ASCII.)
+ ///
+ /// (This and `AnyByteExceptLF` both exist because of legacy reasons.
+ /// `AnyByteExceptLF` will be dropped in the next breaking change release.)
+ AnyByteExcept(u8),
/// Matches any byte value except for `\n`.
///
/// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
@@ -2410,10 +2521,10 @@ impl Properties {
inner.look_set_prefix = p.look_set_prefix();
inner.look_set_suffix = p.look_set_suffix();
}
- // If the static captures len of the sub-expression is not known or is
- // zero, then it automatically propagates to the repetition, regardless
- // of the repetition. Otherwise, it might change, but only when the
- // repetition can match 0 times.
+ // If the static captures len of the sub-expression is not known or
+ // is greater than zero, then it automatically propagates to the
+ // repetition, regardless of the repetition. Otherwise, it might
+ // change, but only when the repetition can match 0 times.
if rep.min == 0
&& inner.static_explicit_captures_len.map_or(false, |len| len > 0)
{
@@ -2549,7 +2660,7 @@ pub struct LookSet {
/// range of `u16` values to be represented. For example, even if the
/// current implementation only makes use of the 10 least significant bits,
/// it may use more bits in a future semver compatible release.
- pub bits: u16,
+ pub bits: u32,
}
impl LookSet {
@@ -2652,13 +2763,22 @@ impl LookSet {
pub fn contains_word_unicode(self) -> bool {
self.contains(Look::WordUnicode)
|| self.contains(Look::WordUnicodeNegate)
+ || self.contains(Look::WordStartUnicode)
+ || self.contains(Look::WordEndUnicode)
+ || self.contains(Look::WordStartHalfUnicode)
+ || self.contains(Look::WordEndHalfUnicode)
}
/// Returns true if and only if this set contains any ASCII word boundary
/// or negated ASCII word boundary assertions.
#[inline]
pub fn contains_word_ascii(self) -> bool {
- self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate)
+ self.contains(Look::WordAscii)
+ || self.contains(Look::WordAsciiNegate)
+ || self.contains(Look::WordStartAscii)
+ || self.contains(Look::WordEndAscii)
+ || self.contains(Look::WordStartHalfAscii)
+ || self.contains(Look::WordEndHalfAscii)
}
/// Returns an iterator over all of the look-around assertions in this set.
@@ -2737,29 +2857,31 @@ impl LookSet {
*self = self.intersect(other);
}
- /// Return a `LookSet` from the slice given as a native endian 16-bit
+ /// Return a `LookSet` from the slice given as a native endian 32-bit
/// integer.
///
/// # Panics
///
- /// This panics if `slice.len() < 2`.
+ /// This panics if `slice.len() < 4`.
#[inline]
pub fn read_repr(slice: &[u8]) -> LookSet {
- let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap());
+ let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
LookSet { bits }
}
- /// Write a `LookSet` as a native endian 16-bit integer to the beginning
+ /// Write a `LookSet` as a native endian 32-bit integer to the beginning
/// of the slice given.
///
/// # Panics
///
- /// This panics if `slice.len() < 2`.
+ /// This panics if `slice.len() < 4`.
#[inline]
pub fn write_repr(self, slice: &mut [u8]) {
let raw = self.bits.to_ne_bytes();
slice[0] = raw[0];
slice[1] = raw[1];
+ slice[2] = raw[2];
+ slice[3] = raw[3];
}
}
@@ -2792,9 +2914,9 @@ impl Iterator for LookSetIter {
return None;
}
// We'll never have more than u8::MAX distinct look-around assertions,
- // so 'repr' will always fit into a u16.
- let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
- let look = Look::from_repr(1 << repr)?;
+ // so 'bit' will always fit into a u16.
+ let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
+ let look = Look::from_repr(1 << bit)?;
self.set = self.set.remove(look);
Some(look)
}
@@ -3716,7 +3838,7 @@ mod tests {
assert_eq!(0, set.iter().count());
let set = LookSet::full();
- assert_eq!(10, set.iter().count());
+ assert_eq!(18, set.iter().count());
let set =
LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
@@ -3734,6 +3856,6 @@ mod tests {
let res = format!("{:?}", LookSet::empty());
assert_eq!("∅", res);
let res = format!("{:?}", LookSet::full());
- assert_eq!("Az^$rRbB𝛃𝚩", res);
+ assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
}
}
diff --git a/vendor/regex-syntax/src/hir/print.rs b/vendor/regex-syntax/src/hir/print.rs
index fcb7cd252..dfa6d4032 100644
--- a/vendor/regex-syntax/src/hir/print.rs
+++ b/vendor/regex-syntax/src/hir/print.rs
@@ -89,9 +89,16 @@ impl<W: fmt::Write> Visitor for Writer<W> {
fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
match *hir.kind() {
- // Empty is represented by nothing in the concrete syntax, and
- // repetition operators are strictly suffix oriented.
- HirKind::Empty | HirKind::Repetition(_) => {}
+ HirKind::Empty => {
+ // Technically an empty sub-expression could be "printed" by
+ // just ignoring it, but in practice, you could have a
+ // repetition operator attached to an empty expression, and you
+ // really need something in the concrete syntax to make that
+ // work as you'd expect.
+ self.wtr.write_str(r"(?:)")?;
+ }
+ // Repetition operators are strictly suffix oriented.
+ HirKind::Repetition(_) => {}
HirKind::Literal(hir::Literal(ref bytes)) => {
// See the comment on the 'Concat' and 'Alternation' case below
// for why we put parens here. Literals are, conceptually,
@@ -195,6 +202,30 @@ impl<W: fmt::Write> Visitor for Writer<W> {
hir::Look::WordUnicodeNegate => {
self.wtr.write_str(r"\B")?;
}
+ hir::Look::WordStartAscii => {
+ self.wtr.write_str(r"(?-u:\b{start})")?;
+ }
+ hir::Look::WordEndAscii => {
+ self.wtr.write_str(r"(?-u:\b{end})")?;
+ }
+ hir::Look::WordStartUnicode => {
+ self.wtr.write_str(r"\b{start}")?;
+ }
+ hir::Look::WordEndUnicode => {
+ self.wtr.write_str(r"\b{end}")?;
+ }
+ hir::Look::WordStartHalfAscii => {
+ self.wtr.write_str(r"(?-u:\b{start-half})")?;
+ }
+ hir::Look::WordEndHalfAscii => {
+ self.wtr.write_str(r"(?-u:\b{end-half})")?;
+ }
+ hir::Look::WordStartHalfUnicode => {
+ self.wtr.write_str(r"\b{start-half}")?;
+ }
+ hir::Look::WordEndHalfUnicode => {
+ self.wtr.write_str(r"\b{end-half}")?;
+ }
},
HirKind::Capture(hir::Capture { ref name, .. }) => {
self.wtr.write_str("(")?;
@@ -424,20 +455,20 @@ mod tests {
// Test that various zero-length repetitions always translate to an
// empty regex. This is more a property of HIR's smart constructors
// than the printer though.
- roundtrip("a{0}", "");
- roundtrip("(?:ab){0}", "");
+ roundtrip("a{0}", "(?:)");
+ roundtrip("(?:ab){0}", "(?:)");
#[cfg(feature = "unicode-gencat")]
{
- roundtrip(r"\p{any}{0}", "");
- roundtrip(r"\P{any}{0}", "");
+ roundtrip(r"\p{any}{0}", "(?:)");
+ roundtrip(r"\P{any}{0}", "(?:)");
}
}
#[test]
fn print_group() {
- roundtrip("()", "()");
- roundtrip("(?P<foo>)", "(?P<foo>)");
- roundtrip("(?:)", "");
+ roundtrip("()", "((?:))");
+ roundtrip("(?P<foo>)", "(?P<foo>(?:))");
+ roundtrip("(?:)", "(?:)");
roundtrip("(a)", "(a)");
roundtrip("(?P<foo>a)", "(?P<foo>a)");
@@ -448,8 +479,8 @@ mod tests {
#[test]
fn print_alternation() {
- roundtrip("|", "(?:|)");
- roundtrip("||", "(?:||)");
+ roundtrip("|", "(?:(?:)|(?:))");
+ roundtrip("||", "(?:(?:)|(?:)|(?:))");
roundtrip("a|b", "[ab]");
roundtrip("ab|cd", "(?:(?:ab)|(?:cd))");
@@ -503,7 +534,7 @@ mod tests {
}),
Hir::look(hir::Look::End),
]);
- assert_eq!(r"(?:\A(?:\A\z)+\z)", expr.to_string());
+ assert_eq!(r"(?:\A\A\z\z)", expr.to_string());
}
// Just like regression_repetition_concat, but with the repetition using
@@ -540,7 +571,7 @@ mod tests {
}),
Hir::look(hir::Look::End),
]);
- assert_eq!(r"(?:\A(?:\A|\z)+\z)", expr.to_string());
+ assert_eq!(r"(?:\A(?:\A|\z)\z)", expr.to_string());
}
// This regression test is very similar in flavor to
diff --git a/vendor/regex-syntax/src/hir/translate.rs b/vendor/regex-syntax/src/hir/translate.rs
index ff9c5ee91..313a1e9e8 100644
--- a/vendor/regex-syntax/src/hir/translate.rs
+++ b/vendor/regex-syntax/src/hir/translate.rs
@@ -19,6 +19,7 @@ type Result<T> = core::result::Result<T, Error>;
#[derive(Clone, Debug)]
pub struct TranslatorBuilder {
utf8: bool,
+ line_terminator: u8,
flags: Flags,
}
@@ -31,7 +32,11 @@ impl Default for TranslatorBuilder {
impl TranslatorBuilder {
    /// Create a new translator builder with a default configuration.
pub fn new() -> TranslatorBuilder {
- TranslatorBuilder { utf8: true, flags: Flags::default() }
+ TranslatorBuilder {
+ utf8: true,
+ line_terminator: b'\n',
+ flags: Flags::default(),
+ }
}
/// Build a translator using the current configuration.
@@ -40,6 +45,7 @@ impl TranslatorBuilder {
stack: RefCell::new(vec![]),
flags: Cell::new(self.flags),
utf8: self.utf8,
+ line_terminator: self.line_terminator,
}
}
@@ -63,6 +69,31 @@ impl TranslatorBuilder {
self
}
+ /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
+ ///
+ /// Namely, instead of `.` (by default) matching everything except for `\n`,
+ /// this will cause `.` to match everything except for the byte given.
+ ///
+ /// If `.` is used in a context where Unicode mode is enabled and this byte
+ /// isn't ASCII, then an error will be returned. When Unicode mode is
+ /// disabled, then any byte is permitted, but will return an error if UTF-8
+ /// mode is enabled and it is a non-ASCII byte.
+ ///
+ /// In short, any ASCII value for a line terminator is always okay. But a
+ /// non-ASCII byte might result in an error depending on whether Unicode
+ /// mode or UTF-8 mode are enabled.
+ ///
+ /// Note that if `R` mode is enabled then it always takes precedence and
+ /// the line terminator will be treated as `\r` and `\n` simultaneously.
+ ///
+ /// Note also that this *doesn't* impact the look-around assertions
+ /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
+ /// configuration in the regex engine itself.
+ pub fn line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder {
+ self.line_terminator = byte;
+ self
+ }
+
/// Enable or disable the case insensitive flag (`i`) by default.
pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
self.flags.case_insensitive = if yes { Some(true) } else { None };
@@ -120,6 +151,8 @@ pub struct Translator {
flags: Cell<Flags>,
/// Whether we're allowed to produce HIR that can match arbitrary bytes.
utf8: bool,
+ /// The line terminator to use for `.`.
+ line_terminator: u8,
}
impl Translator {
@@ -304,7 +337,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
match *ast {
- Ast::Class(ast::Class::Bracketed(_)) => {
+ Ast::ClassBracketed(_) => {
if self.flags().unicode() {
let cls = hir::ClassUnicode::empty();
self.push(HirFrame::ClassUnicode(cls));
@@ -321,14 +354,14 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
.unwrap_or_else(|| self.flags());
self.push(HirFrame::Group { old_flags });
}
- Ast::Concat(ref x) if x.asts.is_empty() => {}
Ast::Concat(_) => {
self.push(HirFrame::Concat);
}
- Ast::Alternation(ref x) if x.asts.is_empty() => {}
- Ast::Alternation(_) => {
+ Ast::Alternation(ref x) => {
self.push(HirFrame::Alternation);
- self.push(HirFrame::AlternationBranch);
+ if !x.asts.is_empty() {
+ self.push(HirFrame::AlternationBranch);
+ }
}
_ => {}
}
@@ -353,29 +386,20 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
// consistency sake.
self.push(HirFrame::Expr(Hir::empty()));
}
- Ast::Literal(ref x) => {
- match self.ast_literal_to_scalar(x)? {
- Either::Right(byte) => self.push_byte(byte),
- Either::Left(ch) => {
- if !self.flags().unicode() && ch.len_utf8() > 1 {
- return Err(self
- .error(x.span, ErrorKind::UnicodeNotAllowed));
- }
- match self.case_fold_char(x.span, ch)? {
- None => self.push_char(ch),
- Some(expr) => self.push(HirFrame::Expr(expr)),
- }
- }
- }
- // self.push(HirFrame::Expr(self.hir_literal(x)?));
- }
- Ast::Dot(span) => {
- self.push(HirFrame::Expr(self.hir_dot(span)?));
+ Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? {
+ Either::Right(byte) => self.push_byte(byte),
+ Either::Left(ch) => match self.case_fold_char(x.span, ch)? {
+ None => self.push_char(ch),
+ Some(expr) => self.push(HirFrame::Expr(expr)),
+ },
+ },
+ Ast::Dot(ref span) => {
+ self.push(HirFrame::Expr(self.hir_dot(**span)?));
}
Ast::Assertion(ref x) => {
self.push(HirFrame::Expr(self.hir_assertion(x)?));
}
- Ast::Class(ast::Class::Perl(ref x)) => {
+ Ast::ClassPerl(ref x) => {
if self.flags().unicode() {
let cls = self.hir_perl_unicode_class(x)?;
let hcls = hir::Class::Unicode(cls);
@@ -386,11 +410,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
self.push(HirFrame::Expr(Hir::class(hcls)));
}
}
- Ast::Class(ast::Class::Unicode(ref x)) => {
+ Ast::ClassUnicode(ref x) => {
let cls = hir::Class::Unicode(self.hir_unicode_class(x)?);
self.push(HirFrame::Expr(Hir::class(cls)));
}
- Ast::Class(ast::Class::Bracketed(ref ast)) => {
+ Ast::ClassBracketed(ref ast) => {
if self.flags().unicode() {
let mut cls = self.pop().unwrap().unwrap_class_unicode();
self.unicode_fold_and_negate(
@@ -841,8 +865,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
})?;
Ok(Some(Hir::class(hir::Class::Unicode(cls))))
} else {
- if c.len_utf8() > 1 {
- return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
+ if !c.is_ascii() {
+ return Ok(None);
}
// If case folding won't do anything, then don't bother trying.
match c {
@@ -862,10 +886,38 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
}
fn hir_dot(&self, span: Span) -> Result<Hir> {
- if !self.flags().unicode() && self.trans().utf8 {
+ let (utf8, lineterm, flags) =
+ (self.trans().utf8, self.trans().line_terminator, self.flags());
+ if utf8 && (!flags.unicode() || !lineterm.is_ascii()) {
return Err(self.error(span, ErrorKind::InvalidUtf8));
}
- Ok(Hir::dot(self.flags().dot()))
+ let dot = if flags.dot_matches_new_line() {
+ if flags.unicode() {
+ hir::Dot::AnyChar
+ } else {
+ hir::Dot::AnyByte
+ }
+ } else {
+ if flags.unicode() {
+ if flags.crlf() {
+ hir::Dot::AnyCharExceptCRLF
+ } else {
+ if !lineterm.is_ascii() {
+ return Err(
+ self.error(span, ErrorKind::InvalidLineTerminator)
+ );
+ }
+ hir::Dot::AnyCharExcept(char::from(lineterm))
+ }
+ } else {
+ if flags.crlf() {
+ hir::Dot::AnyByteExceptCRLF
+ } else {
+ hir::Dot::AnyByteExcept(lineterm)
+ }
+ }
+ };
+ Ok(Hir::dot(dot))
}
fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
@@ -903,6 +955,34 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
} else {
hir::Look::WordAsciiNegate
}),
+ ast::AssertionKind::WordBoundaryStart
+ | ast::AssertionKind::WordBoundaryStartAngle => {
+ Hir::look(if unicode {
+ hir::Look::WordStartUnicode
+ } else {
+ hir::Look::WordStartAscii
+ })
+ }
+ ast::AssertionKind::WordBoundaryEnd
+ | ast::AssertionKind::WordBoundaryEndAngle => {
+ Hir::look(if unicode {
+ hir::Look::WordEndUnicode
+ } else {
+ hir::Look::WordEndAscii
+ })
+ }
+ ast::AssertionKind::WordBoundaryStartHalf => {
+ Hir::look(if unicode {
+ hir::Look::WordStartHalfUnicode
+ } else {
+ hir::Look::WordStartHalfAscii
+ })
+ }
+ ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode {
+ hir::Look::WordEndHalfUnicode
+ } else {
+ hir::Look::WordEndHalfAscii
+ }),
})
}
@@ -1124,9 +1204,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
match self.ast_literal_to_scalar(ast)? {
Either::Right(byte) => Ok(byte),
Either::Left(ch) => {
- let cp = u32::from(ch);
- if cp <= 0x7F {
- Ok(u8::try_from(cp).unwrap())
+ if ch.is_ascii() {
+ Ok(u8::try_from(ch).unwrap())
} else {
// We can't feasibly support Unicode in
// byte oriented classes. Byte classes don't
@@ -1209,30 +1288,6 @@ impl Flags {
}
}
- fn dot(&self) -> hir::Dot {
- if self.dot_matches_new_line() {
- if self.unicode() {
- hir::Dot::AnyChar
- } else {
- hir::Dot::AnyByte
- }
- } else {
- if self.unicode() {
- if self.crlf() {
- hir::Dot::AnyCharExceptCRLF
- } else {
- hir::Dot::AnyCharExceptLF
- }
- } else {
- if self.crlf() {
- hir::Dot::AnyByteExceptCRLF
- } else {
- hir::Dot::AnyByteExceptLF
- }
- }
- }
- }
-
fn case_insensitive(&self) -> bool {
self.case_insensitive.unwrap_or(false)
}
@@ -1598,16 +1653,7 @@ mod tests {
assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a"));
assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));
- assert_eq!(
- t_err("(?-u)☃"),
- TestError {
- kind: hir::ErrorKind::UnicodeNotAllowed,
- span: Span::new(
- Position::new(5, 1, 6),
- Position::new(8, 1, 7)
- ),
- }
- );
+ assert_eq!(t("(?-u)☃"), hir_lit("☃"));
assert_eq!(
t_err(r"(?-u)\xFF"),
TestError {
@@ -1685,16 +1731,7 @@ mod tests {
);
assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));
- assert_eq!(
- t_err("(?i-u)β"),
- TestError {
- kind: hir::ErrorKind::UnicodeNotAllowed,
- span: Span::new(
- Position::new(6, 1, 7),
- Position::new(8, 1, 8),
- ),
- }
- );
+ assert_eq!(t("(?i-u)β"), hir_lit("β"),);
}
#[test]
@@ -3489,6 +3526,15 @@ mod tests {
assert!(!props(r"(?:z|xx)@|xx").is_alternation_literal());
}
+ // This tests that the smart Hir::repetition constructors does some basic
+ // simplifications.
+ #[test]
+ fn smart_repetition() {
+ assert_eq!(t(r"a{0}"), Hir::empty());
+ assert_eq!(t(r"a{1}"), hir_lit("a"));
+ assert_eq!(t(r"\B{32111}"), hir_look(hir::Look::WordUnicodeNegate));
+ }
+
// This tests that the smart Hir::concat constructor simplifies the given
// exprs in a way we expect.
#[test]
@@ -3580,4 +3626,99 @@ mod tests {
]),
);
}
+
+ #[test]
+ fn regression_alt_empty_concat() {
+ use crate::ast::{self, Ast};
+
+ let span = Span::splat(Position::new(0, 0, 0));
+ let ast = Ast::alternation(ast::Alternation {
+ span,
+ asts: vec![Ast::concat(ast::Concat { span, asts: vec![] })],
+ });
+
+ let mut t = Translator::new();
+ assert_eq!(Ok(Hir::empty()), t.translate("", &ast));
+ }
+
+ #[test]
+ fn regression_empty_alt() {
+ use crate::ast::{self, Ast};
+
+ let span = Span::splat(Position::new(0, 0, 0));
+ let ast = Ast::concat(ast::Concat {
+ span,
+ asts: vec![Ast::alternation(ast::Alternation {
+ span,
+ asts: vec![],
+ })],
+ });
+
+ let mut t = Translator::new();
+ assert_eq!(Ok(Hir::fail()), t.translate("", &ast));
+ }
+
+ #[test]
+ fn regression_singleton_alt() {
+ use crate::{
+ ast::{self, Ast},
+ hir::Dot,
+ };
+
+ let span = Span::splat(Position::new(0, 0, 0));
+ let ast = Ast::concat(ast::Concat {
+ span,
+ asts: vec![Ast::alternation(ast::Alternation {
+ span,
+ asts: vec![Ast::dot(span)],
+ })],
+ });
+
+ let mut t = Translator::new();
+ assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast));
+ }
+
+ // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168
+ #[test]
+ fn regression_fuzz_match() {
+ let pat = "[(\u{6} \0-\u{afdf5}] \0 ";
+ let ast = ParserBuilder::new()
+ .octal(false)
+ .ignore_whitespace(true)
+ .build()
+ .parse(pat)
+ .unwrap();
+ let hir = TranslatorBuilder::new()
+ .utf8(true)
+ .case_insensitive(false)
+ .multi_line(false)
+ .dot_matches_new_line(false)
+ .swap_greed(true)
+ .unicode(true)
+ .build()
+ .translate(pat, &ast)
+ .unwrap();
+ assert_eq!(
+ hir,
+ Hir::concat(vec![
+ hir_uclass(&[('\0', '\u{afdf5}')]),
+ hir_lit("\0"),
+ ])
+ );
+ }
+
+ // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155
+ #[cfg(feature = "unicode")]
+ #[test]
+ fn regression_fuzz_difference1() {
+ let pat = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*";
+ let _ = t(pat); // shouldn't panic
+ }
+
+ // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153
+ #[test]
+ fn regression_fuzz_char_decrement1() {
+ let pat = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0]<D\0\0\0\0\0\0\u{1}]\0\0\0\0]\0\0-*\0]\0\0 ";
+ let _ = t(pat); // shouldn't panic
+ }
}
diff --git a/vendor/regex-syntax/src/hir/visitor.rs b/vendor/regex-syntax/src/hir/visitor.rs
index e5f15cf1c..f30f0a163 100644
--- a/vendor/regex-syntax/src/hir/visitor.rs
+++ b/vendor/regex-syntax/src/hir/visitor.rs
@@ -41,6 +41,11 @@ pub trait Visitor {
fn visit_alternation_in(&mut self) -> Result<(), Self::Err> {
Ok(())
}
+
+ /// This method is called between child nodes of a concatenation.
+ fn visit_concat_in(&mut self) -> Result<(), Self::Err> {
+ Ok(())
+ }
}
/// Executes an implementation of `Visitor` in constant stack space.
@@ -131,8 +136,14 @@ impl<'a> HeapVisitor<'a> {
// If this is a concat/alternate, then we might have additional
// inductive steps to process.
if let Some(x) = self.pop(frame) {
- if let Frame::Alternation { .. } = x {
- visitor.visit_alternation_in()?;
+ match x {
+ Frame::Alternation { .. } => {
+ visitor.visit_alternation_in()?;
+ }
+ Frame::Concat { .. } => {
+ visitor.visit_concat_in()?;
+ }
+ _ => {}
}
hir = x.child();
self.stack.push((post_hir, x));