Merging upstream version 1.76.0+dfsg1.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-06-19 09:26:03 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-06-19 09:26:03 +0000
commit: 9918693037dce8aa4bb6f08741b6812923486c18 (patch)
tree: 21d2b40bec7e6a7ea664acee056eb3d08e15a1cf /vendor/regex-syntax/src/hir
parent: Releasing progress-linux version 1.75.0+dfsg1-5~progress7.99u1. (diff)
download: rustc-9918693037dce8aa4bb6f08741b6812923486c18.tar.xz
rustc-9918693037dce8aa4bb6f08741b6812923486c18.zip
5 files changed, 544 insertions, 190 deletions
diff --git a/vendor/regex-syntax/src/hir/literal.rs b/vendor/regex-syntax/src/hir/literal.rs
index bd3a2d143..a5a3737f6 100644
--- a/vendor/regex-syntax/src/hir/literal.rs
+++ b/vendor/regex-syntax/src/hir/literal.rs
@@ -23,7 +23,7 @@ effective literal optimizations:
 to lead to substring search that is only a little faster than a regex search,
 and thus the overhead of using literal optimizations in the first place might
 make things slower overall.
-* The literals in your [`Seq`] shoudn't be too short. In general, longer is
+* The literals in your [`Seq`] shouldn't be too short. In general, longer is
 better. A sequence corresponding to single bytes that occur frequently in the
 haystack, for example, is probably a bad literal optimization because it's
 likely to produce many false positive candidates. Longer literals are less
@@ -51,7 +51,7 @@ the "trickier" parts are how to combine literal sequences, and that is all
 implemented on [`Seq`].
 */
 
-use core::{cmp, mem};
+use core::{cmp, mem, num::NonZeroUsize};
 
 use alloc::{vec, vec::Vec};
 
@@ -477,7 +477,7 @@ impl Extractor {
                 }
                 seq
             }
-            hir::Repetition { min, max: Some(max), .. } if min < max => {
+            hir::Repetition { min, .. } => {
                 assert!(min > 0); // handled above
                 let limit =
                     u32::try_from(self.limit_repeat).unwrap_or(u32::MAX);
@@ -491,10 +491,6 @@ impl Extractor {
                 seq.make_inexact();
                 seq
             }
-            hir::Repetition { .. } => {
-                subseq.make_inexact();
-                subseq
-            }
         }
     }
 
@@ -692,7 +688,7 @@ impl Default for ExtractKind {
 /// from making assumptions about what literals are required in order to match
 /// a particular [`Hir`] expression. Generally speaking, when a set is in this
 /// state, literal optimizations are inhibited. A good example of a regex that
-/// will cause this sort of set to apppear is `[A-Za-z]`. The character class
+/// will cause this sort of set to appear is `[A-Za-z]`. The character class
 /// is just too big (and also too narrow) to be usefully expanded into 52
 /// different literals. (Note that the decision for when a seq should become
 /// infinite is determined by the caller. A seq itself has no hard-coded
@@ -1571,7 +1567,7 @@ impl Seq {
     /// unioning `self` with `other`. If either set is infinite, then this
     /// returns `None`.
     #[inline]
-    fn max_union_len(&self, other: &Seq) -> Option<usize> {
+    pub fn max_union_len(&self, other: &Seq) -> Option<usize> {
         let len1 = self.len()?;
         let len2 = other.len()?;
         Some(len1.saturating_add(len2))
@@ -1581,7 +1577,7 @@ impl Seq {
     /// cross product of `self` with `other`. If either set is infinite, then
     /// this returns `None`.
     #[inline]
-    fn max_cross_len(&self, other: &Seq) -> Option<usize> {
+    pub fn max_cross_len(&self, other: &Seq) -> Option<usize> {
         let len1 = self.len()?;
         let len2 = other.len()?;
         Some(len1.saturating_mul(len2))
@@ -1841,6 +1837,14 @@ impl Seq {
             None => return,
             Some(len) => len,
         };
+        // Just give up now if our sequence contains an empty string.
+        if self.min_literal_len().map_or(false, |len| len == 0) {
+            // We squash the sequence so that nobody else gets any bright
+            // ideas to try and use it. An empty string implies a match at
+            // every position. A prefilter cannot help you here.
+            self.make_infinite();
+            return;
+        }
         // Make sure we start with the smallest sequence possible. We use a
         // special version of preference minimization that retains exactness.
         // This is legal because optimization is only expected to occur once
@@ -1910,34 +1914,41 @@ impl Seq {
                 // longest common prefix to be subject to the poison check.
             }
         }
-        // Everything below this check is more-or-less about trying to
-        // heuristically reduce the false positive rate of a prefilter. But
-        // if our sequence is completely exact, then it's possible the regex
-        // engine can be skipped entirely. In this case, the false positive
-        // rate is zero because every literal match corresponds to a regex
-        // match.
+        // If we have an exact sequence, we *probably* just want to keep it
+        // as-is. But there are some cases where we don't. So we save a copy of
+        // the exact sequence now, and then try to do some more optimizations
+        // below. If those don't work out, we go back to this exact sequence.
         //
-        // This is OK even if the sequence contains a poison literal. Remember,
-        // a literal is only poisononous because of what we assume about its
-        // impact on the false positive rate. However, we do still check for
-        // an empty string. Empty strings are weird and it's best to let the
-        // regex engine handle those.
+        // The specific motivation for this is that we sometimes wind up with
+        // an exact sequence with a hefty number of literals. Say, 100. If we
+        // stuck with that, it would be too big for Teddy and would result in
+        // using Aho-Corasick. Which is fine... but the lazy DFA is plenty
+        // suitable in such cases. The real issue is that we will wind up not
+        // using a fast prefilter at all. So in cases like this, even though
+        // we have an exact sequence, it would be better to try and shrink the
+        // sequence (which we do below) and use it as a prefilter that can
+        // produce false positive matches.
         //
-        // We do currently do this check after the longest common prefix (or
-        // suffix) check, under the theory that single-substring search is so
-        // fast that we want that even if we'd end up turning an exact sequence
-        // into an inexact one. But this might be wrong...
-        if self.is_exact()
-            && self.min_literal_len().map_or(false, |len| len > 0)
-        {
-            return;
-        }
+        // But if the shrinking below results in a sequence that "sucks," then
+        // we don't want to use that because we already have an exact sequence
+        // in hand.
+        let exact: Option<Seq> =
+            if self.is_exact() { Some(self.clone()) } else { None };
         // Now we attempt to shorten the sequence. The idea here is that we
         // don't want to look for too many literals, but we want to shorten
         // our sequence enough to improve our odds of using better algorithms
         // downstream (such as Teddy).
+        //
+        // The pair of numbers in this list corresponds to the maximal prefix
+        // (in bytes) to keep for all literals and the length of the sequence
+        // at which to do it.
+        //
+        // So for example, the pair (3, 500) would mean, "if we have more than
+        // 500 literals in our sequence, then truncate all of our literals
+        // such that they are at most 3 bytes in length and the minimize the
+        // sequence."
         const ATTEMPTS: [(usize, usize); 5] =
-            [(5, 64), (4, 64), (3, 64), (2, 64), (1, 10)];
+            [(5, 10), (4, 10), (3, 64), (2, 64), (1, 10)];
         for (keep, limit) in ATTEMPTS {
             let len = match self.len() {
                 None => break,
@@ -1951,7 +1962,11 @@ impl Seq {
             } else {
                 self.keep_last_bytes(keep);
             }
-            self.minimize_by_preference();
+            if prefix {
+                if let Some(ref mut lits) = self.literals {
+                    PreferenceTrie::minimize(lits, true);
+                }
+            }
         }
         // Check for a poison literal. A poison literal is one that is short
         // and is believed to have a very high match count. These poisons
@@ -1968,6 +1983,30 @@ impl Seq {
                 self.make_infinite();
             }
         }
+        // OK, if we had an exact sequence before attempting more optimizations
+        // above and our post-optimized sequence sucks for some reason or
+        // another, then we go back to the exact sequence.
+        if let Some(exact) = exact {
+            // If optimizing resulted in dropping our literals, then certainly
+            // backup and use the exact sequence that we had.
+            if !self.is_finite() {
+                *self = exact;
+                return;
+            }
+            // If our optimized sequence contains a short literal, then it's
+            // *probably* not so great. So throw it away and revert to the
+            // exact sequence.
+            if self.min_literal_len().map_or(true, |len| len <= 2) {
+                *self = exact;
+                return;
+            }
+            // Finally, if our optimized sequence is "big" (i.e., can't use
+            // Teddy), then also don't use it and rely on the exact sequence.
+            if self.len().map_or(true, |len| len > 64) {
+                *self = exact;
+                return;
+            }
+        }
     }
 }
 
@@ -1977,7 +2016,7 @@ impl core::fmt::Debug for Seq {
         if let Some(lits) = self.literals() {
             f.debug_list().entries(lits.iter()).finish()
         } else {
-            write!(f, "[∅]")
+            write!(f, "[∞]")
         }
     }
 }
@@ -2160,12 +2199,19 @@ impl core::fmt::Debug for Literal {
 /// never seen this show up on a profile. Because of the heuristic limits
 /// imposed on literal extractions, the size of the inputs here is usually
 /// very small.)
-#[derive(Debug, Default)]
+#[derive(Debug)]
 struct PreferenceTrie {
     /// The states in this trie. The index of a state in this vector is its ID.
     states: Vec<State>,
+    /// This vec indicates which states are match states. It always has
+    /// the same length as `states` and is indexed by the same state ID.
+    /// A state with identifier `sid` is a match state if and only if
+    /// `matches[sid].is_some()`. The option contains the index of the literal
+    /// corresponding to the match. The index is offset by 1 so that it fits in
+    /// a NonZeroUsize.
+    matches: Vec<Option<NonZeroUsize>>,
     /// The index to allocate to the next literal added to this trie. Starts at
-    /// 0 and increments by 1 for every literal successfully added to the trie.
+    /// 1 and increments by 1 for every literal successfully added to the trie.
     next_literal_index: usize,
 }
 
@@ -2176,9 +2222,6 @@ struct State {
     /// are sorted by byte. There is at most one such transition for any
     /// particular byte.
     trans: Vec<(u8, usize)>,
-    /// Whether this is a matching state or not. If it is, then it contains the
-    /// index to the matching literal.
-    literal_index: Option<usize>,
 }
 
 impl PreferenceTrie {
@@ -2192,20 +2235,19 @@ impl PreferenceTrie {
     /// after them and because any removed literals are guaranteed to never
     /// match.
     fn minimize(literals: &mut Vec<Literal>, keep_exact: bool) {
-        use core::cell::RefCell;
-
-        // MSRV(1.61): Use retain_mut here to avoid interior mutability.
-        let trie = RefCell::new(PreferenceTrie::default());
+        let mut trie = PreferenceTrie {
+            states: vec![],
+            matches: vec![],
+            next_literal_index: 1,
+        };
         let mut make_inexact = vec![];
-        literals.retain(|lit| {
-            match trie.borrow_mut().insert(lit.as_bytes()) {
-                Ok(_) => true,
-                Err(i) => {
-                    if !keep_exact {
-                        make_inexact.push(i);
-                    }
-                    false
+        literals.retain_mut(|lit| match trie.insert(lit.as_bytes()) {
+            Ok(_) => true,
+            Err(i) => {
+                if !keep_exact {
+                    make_inexact.push(i.checked_sub(1).unwrap());
                 }
+                false
             }
         });
         for i in make_inexact {
@@ -2225,15 +2267,15 @@ impl PreferenceTrie {
     /// search.
     fn insert(&mut self, bytes: &[u8]) -> Result<usize, usize> {
         let mut prev = self.root();
-        if let Some(idx) = self.states[prev].literal_index {
-            return Err(idx);
+        if let Some(idx) = self.matches[prev] {
+            return Err(idx.get());
         }
         for &b in bytes.iter() {
             match self.states[prev].trans.binary_search_by_key(&b, |t| t.0) {
                 Ok(i) => {
                     prev = self.states[prev].trans[i].1;
-                    if let Some(idx) = self.states[prev].literal_index {
-                        return Err(idx);
+                    if let Some(idx) = self.matches[prev] {
+                        return Err(idx.get());
                     }
                 }
                 Err(i) => {
@@ -2245,7 +2287,7 @@ impl PreferenceTrie {
         }
         let idx = self.next_literal_index;
         self.next_literal_index += 1;
-        self.states[prev].literal_index = Some(idx);
+        self.matches[prev] = NonZeroUsize::new(idx);
         Ok(idx)
     }
 
@@ -2262,6 +2304,7 @@ impl PreferenceTrie {
     fn create_state(&mut self) -> usize {
         let id = self.states.len();
         self.states.push(State::default());
+        self.matches.push(None);
         id
     }
 }
@@ -2603,6 +2646,12 @@ mod tests {
             ]),
             e(r"(ab|cd)(ef|gh)(ij|kl)")
         );
+
+        assert_eq!(inexact([E("abab")], [E("abab")]), e(r"(ab){2}"));
+
+        assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,3}"));
+
+        assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,}"));
     }
 
     #[test]
@@ -2815,13 +2864,13 @@ mod tests {
     // repeats.
     #[test]
     fn crazy_repeats() {
-        assert_eq!(inexact([I("")], [I("")]), e(r"(?:){4294967295}"));
+        assert_eq!(inexact([E("")], [E("")]), e(r"(?:){4294967295}"));
         assert_eq!(
-            inexact([I("")], [I("")]),
+            inexact([E("")], [E("")]),
             e(r"(?:){64}{64}{64}{64}{64}{64}")
         );
-        assert_eq!(inexact([I("")], [I("")]), e(r"x{0}{4294967295}"));
-        assert_eq!(inexact([I("")], [I("")]), e(r"(?:|){4294967295}"));
+        assert_eq!(inexact([E("")], [E("")]), e(r"x{0}{4294967295}"));
+        assert_eq!(inexact([E("")], [E("")]), e(r"(?:|){4294967295}"));
 
         assert_eq!(
             inexact([E("")], [E("")]),
diff --git a/vendor/regex-syntax/src/hir/mod.rs b/vendor/regex-syntax/src/hir/mod.rs
index 062d4dcab..ce38ead7b 100644
--- a/vendor/regex-syntax/src/hir/mod.rs
+++ b/vendor/regex-syntax/src/hir/mod.rs
@@ -88,6 +88,9 @@ pub enum ErrorKind {
     /// This error occurs when translating a pattern that could match a byte
     /// sequence that isn't UTF-8 and `utf8` was enabled.
     InvalidUtf8,
+    /// This error occurs when one uses a non-ASCII byte for a line terminator,
+    /// but where Unicode mode is enabled and UTF-8 mode is disabled.
+    InvalidLineTerminator,
     /// This occurs when an unrecognized Unicode property name could not
     /// be found.
     UnicodePropertyNotFound,
@@ -120,6 +123,7 @@ impl core::fmt::Display for ErrorKind {
         let msg = match *self {
             UnicodeNotAllowed => "Unicode not allowed here",
             InvalidUtf8 => "pattern can match invalid UTF-8",
+            InvalidLineTerminator => "invalid line terminator, must be ASCII",
             UnicodePropertyNotFound => "Unicode property not found",
             UnicodePropertyValueNotFound => "Unicode property value not found",
             UnicodePerlClassNotFound => {
@@ -180,7 +184,7 @@ impl core::fmt::Display for ErrorKind {
 /// matches.
 ///
 /// For empty matches, those can occur at any position. It is the
-/// repsonsibility of the regex engine to determine whether empty matches are
+/// responsibility of the regex engine to determine whether empty matches are
 /// permitted between the code units of a single codepoint.
 ///
 /// # Stack space
@@ -355,7 +359,13 @@ impl Hir {
 
     /// Creates a repetition HIR expression.
     #[inline]
-    pub fn repetition(rep: Repetition) -> Hir {
+    pub fn repetition(mut rep: Repetition) -> Hir {
+        // If the sub-expression of a repetition can only match the empty
+        // string, then we force its maximum to be at most 1.
+        if rep.sub.properties().maximum_len() == Some(0) {
+            rep.min = cmp::min(rep.min, 1);
+            rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1));
+        }
         // The regex 'a{0}' is always equivalent to the empty regex. This is
         // true even when 'a' is an expression that never matches anything
         // (like '\P{any}').
@@ -547,7 +557,7 @@ impl Hir {
         // We rebuild the alternation by simplifying it. We proceed similarly
         // as the concatenation case. But in this case, there's no literal
         // simplification happening. We're just flattening alternations.
-        let mut new = vec![];
+        let mut new = Vec::with_capacity(subs.len());
         for sub in subs {
             let (kind, props) = sub.into_parts();
             match kind {
@@ -642,6 +652,12 @@ impl Hir {
                 cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
                 Hir::class(Class::Bytes(cls))
             }
+            Dot::AnyCharExcept(ch) => {
+                let mut cls =
+                    ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
+                cls.negate();
+                Hir::class(Class::Unicode(cls))
+            }
             Dot::AnyCharExceptLF => {
                 let mut cls = ClassUnicode::empty();
                 cls.push(ClassUnicodeRange::new('\0', '\x09'));
@@ -655,6 +671,12 @@ impl Hir {
                 cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}'));
                 Hir::class(Class::Unicode(cls))
             }
+            Dot::AnyByteExcept(byte) => {
+                let mut cls =
+                    ClassBytes::new([ClassBytesRange::new(byte, byte)]);
+                cls.negate();
+                Hir::class(Class::Bytes(cls))
+            }
             Dot::AnyByteExceptLF => {
                 let mut cls = ClassBytes::empty();
                 cls.push(ClassBytesRange::new(b'\0', b'\x09'));
@@ -775,13 +797,18 @@ impl core::fmt::Debug for Literal {
 /// The high-level intermediate representation of a character class.
 ///
 /// A character class corresponds to a set of characters. A character is either
-/// defined by a Unicode scalar value or a byte. Unicode characters are used
-/// by default, while bytes are used when Unicode mode (via the `u` flag) is
-/// disabled.
+/// defined by a Unicode scalar value or a byte.
 ///
 /// A character class, regardless of its character type, is represented by a
 /// sequence of non-overlapping non-adjacent ranges of characters.
 ///
+/// There are no guarantees about which class variant is used. Generally
+/// speaking, the Unicode variat is used whenever a class needs to contain
+/// non-ASCII Unicode scalar values. But the Unicode variant can be used even
+/// when Unicode mode is disabled. For example, at the time of writing, the
+/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class
+/// `[a\u00A0]` due to optimizations.
+///
 /// Note that `Bytes` variant may be produced even when it exclusively matches
 /// valid UTF-8. This is because a `Bytes` variant represents an intention by
 /// the author of the regular expression to disable Unicode mode, which in turn
@@ -1304,8 +1331,9 @@ impl ClassUnicodeRange {
     }
 }
 
-/// A set of characters represented by arbitrary bytes (where one byte
-/// corresponds to one character).
+/// A set of characters represented by arbitrary bytes.
+///
+/// Each byte corresponds to one character.
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub struct ClassBytes {
     set: IntervalSet<ClassBytesRange>,
@@ -1607,6 +1635,42 @@ pub enum Look {
     WordUnicode = 1 << 8,
     /// Match a Unicode-aware negation of a word boundary.
     WordUnicodeNegate = 1 << 9,
+    /// Match the start of an ASCII-only word boundary. That is, this matches a
+    /// position at either the beginning of the haystack or where the previous
+    /// character is not a word character and the following character is a word
+    /// character.
+    WordStartAscii = 1 << 10,
+    /// Match the end of an ASCII-only word boundary. That is, this matches
+    /// a position at either the end of the haystack or where the previous
+    /// character is a word character and the following character is not a word
+    /// character.
+    WordEndAscii = 1 << 11,
+    /// Match the start of a Unicode word boundary. That is, this matches a
+    /// position at either the beginning of the haystack or where the previous
+    /// character is not a word character and the following character is a word
+    /// character.
+    WordStartUnicode = 1 << 12,
+    /// Match the end of a Unicode word boundary. That is, this matches a
+    /// position at either the end of the haystack or where the previous
+    /// character is a word character and the following character is not a word
+    /// character.
+    WordEndUnicode = 1 << 13,
+    /// Match the start half of an ASCII-only word boundary. That is, this
+    /// matches a position at either the beginning of the haystack or where the
+    /// previous character is not a word character.
+    WordStartHalfAscii = 1 << 14,
+    /// Match the end half of an ASCII-only word boundary. That is, this
+    /// matches a position at either the end of the haystack or where the
+    /// following character is not a word character.
+    WordEndHalfAscii = 1 << 15,
+    /// Match the start half of a Unicode word boundary. That is, this matches
+    /// a position at either the beginning of the haystack or where the
+    /// previous character is not a word character.
+    WordStartHalfUnicode = 1 << 16,
+    /// Match the end half of a Unicode word boundary. That is, this matches
+    /// a position at either the end of the haystack or where the following
+    /// character is not a word character.
+    WordEndHalfUnicode = 1 << 17,
 }
 
 impl Look {
@@ -1628,6 +1692,14 @@ impl Look {
             Look::WordAsciiNegate => Look::WordAsciiNegate,
             Look::WordUnicode => Look::WordUnicode,
             Look::WordUnicodeNegate => Look::WordUnicodeNegate,
+            Look::WordStartAscii => Look::WordEndAscii,
+            Look::WordEndAscii => Look::WordStartAscii,
+            Look::WordStartUnicode => Look::WordEndUnicode,
+            Look::WordEndUnicode => Look::WordStartUnicode,
+            Look::WordStartHalfAscii => Look::WordEndHalfAscii,
+            Look::WordEndHalfAscii => Look::WordStartHalfAscii,
+            Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
+            Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
         }
     }
 
@@ -1636,28 +1708,36 @@ impl Look {
     /// constructor is guaranteed to return the same look-around variant that
     /// one started with within a semver compatible release of this crate.
     #[inline]
-    pub const fn as_repr(self) -> u16 {
+    pub const fn as_repr(self) -> u32 {
         // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
         // actual int.
-        self as u16
+        self as u32
     }
 
     /// Given the underlying representation of a `Look` value, return the
     /// corresponding `Look` value if the representation is valid. Otherwise
     /// `None` is returned.
     #[inline]
-    pub const fn from_repr(repr: u16) -> Option<Look> {
+    pub const fn from_repr(repr: u32) -> Option<Look> {
         match repr {
-            0b00_0000_0001 => Some(Look::Start),
-            0b00_0000_0010 => Some(Look::End),
-            0b00_0000_0100 => Some(Look::StartLF),
-            0b00_0000_1000 => Some(Look::EndLF),
-            0b00_0001_0000 => Some(Look::StartCRLF),
-            0b00_0010_0000 => Some(Look::EndCRLF),
-            0b00_0100_0000 => Some(Look::WordAscii),
-            0b00_1000_0000 => Some(Look::WordAsciiNegate),
-            0b01_0000_0000 => Some(Look::WordUnicode),
-            0b10_0000_0000 => Some(Look::WordUnicodeNegate),
+            0b00_0000_0000_0000_0001 => Some(Look::Start),
+            0b00_0000_0000_0000_0010 => Some(Look::End),
+            0b00_0000_0000_0000_0100 => Some(Look::StartLF),
+            0b00_0000_0000_0000_1000 => Some(Look::EndLF),
+            0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
+            0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
+            0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
+            0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
+            0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
+            0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
+            0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
+            0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
+            0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
+            0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
+            0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
+            0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
+            0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
+            0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
             _ => None,
         }
     }
@@ -1682,6 +1762,14 @@ impl Look {
             Look::WordAsciiNegate => 'B',
             Look::WordUnicode => '𝛃',
             Look::WordUnicodeNegate => '𝚩',
+            Look::WordStartAscii => '<',
+            Look::WordEndAscii => '>',
+            Look::WordStartUnicode => '〈',
+            Look::WordEndUnicode => '〉',
+            Look::WordStartHalfAscii => '◁',
+            Look::WordEndHalfAscii => '▷',
+            Look::WordStartHalfUnicode => '◀',
+            Look::WordEndHalfUnicode => '▶',
         }
     }
 }
@@ -1766,6 +1854,18 @@ pub enum Dot {
     ///
     /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
     AnyByte,
+    /// Matches the UTF-8 encoding of any Unicode scalar value except for the
+    /// `char` given.
+    ///
+    /// This is equivalent to using `(?u-s:.)` with the line terminator set
+    /// to a particular ASCII byte. (Because of peculiarities in the regex
+    /// engines, a line terminator must be a single byte. It follows that when
+    /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
+    /// value. That is, ti must be ASCII.)
+    ///
+    /// (This and `AnyCharExceptLF` both exist because of legacy reasons.
+    /// `AnyCharExceptLF` will be dropped in the next breaking change release.)
+    AnyCharExcept(char),
     /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
     ///
     /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
@@ -1775,6 +1875,17 @@ pub enum Dot {
     ///
     /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
     AnyCharExceptCRLF,
+    /// Matches any byte value except for the `u8` given.
+    ///
+    /// This is equivalent to using `(?-us:.)` with the line terminator set
+    /// to a particular ASCII byte. (Because of peculiarities in the regex
+    /// engines, a line terminator must be a single byte. It follows that when
+    /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
+    /// value. That is, ti must be ASCII.)
+    ///
+    /// (This and `AnyByteExceptLF` both exist because of legacy reasons.
+    /// `AnyByteExceptLF` will be dropped in the next breaking change release.)
+    AnyByteExcept(u8),
     /// Matches any byte value except for `\n`.
     ///
     /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
@@ -2410,10 +2521,10 @@ impl Properties {
             inner.look_set_prefix = p.look_set_prefix();
             inner.look_set_suffix = p.look_set_suffix();
         }
-        // If the static captures len of the sub-expression is not known or is
-        // zero, then it automatically propagates to the repetition, regardless
-        // of the repetition. Otherwise, it might change, but only when the
-        // repetition can match 0 times.
+        // If the static captures len of the sub-expression is not known or
+        // is greater than zero, then it automatically propagates to the
+        // repetition, regardless of the repetition. Otherwise, it might
+        // change, but only when the repetition can match 0 times.
         if rep.min == 0
             && inner.static_explicit_captures_len.map_or(false, |len| len > 0)
         {
@@ -2549,7 +2660,7 @@ pub struct LookSet {
     /// range of `u16` values to be represented. For example, even if the
     /// current implementation only makes use of the 10 least significant bits,
     /// it may use more bits in a future semver compatible release.
-    pub bits: u16,
+    pub bits: u32,
 }
 
 impl LookSet {
@@ -2652,13 +2763,22 @@ impl LookSet {
     pub fn contains_word_unicode(self) -> bool {
         self.contains(Look::WordUnicode)
             || self.contains(Look::WordUnicodeNegate)
+            || self.contains(Look::WordStartUnicode)
+            || self.contains(Look::WordEndUnicode)
+            || self.contains(Look::WordStartHalfUnicode)
+            || self.contains(Look::WordEndHalfUnicode)
     }
 
     /// Returns true if and only if this set contains any ASCII word boundary
     /// or negated ASCII word boundary assertions.
     #[inline]
     pub fn contains_word_ascii(self) -> bool {
-        self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate)
+        self.contains(Look::WordAscii)
+            || self.contains(Look::WordAsciiNegate)
+            || self.contains(Look::WordStartAscii)
+            || self.contains(Look::WordEndAscii)
+            || self.contains(Look::WordStartHalfAscii)
+            || self.contains(Look::WordEndHalfAscii)
     }
 
     /// Returns an iterator over all of the look-around assertions in this set.
@@ -2737,29 +2857,31 @@ impl LookSet {
         *self = self.intersect(other);
     }
 
-    /// Return a `LookSet` from the slice given as a native endian 16-bit
+    /// Return a `LookSet` from the slice given as a native endian 32-bit
     /// integer.
     ///
     /// # Panics
     ///
-    /// This panics if `slice.len() < 2`.
+    /// This panics if `slice.len() < 4`.
     #[inline]
     pub fn read_repr(slice: &[u8]) -> LookSet {
-        let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap());
+        let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
         LookSet { bits }
     }
 
-    /// Write a `LookSet` as a native endian 16-bit integer to the beginning
+    /// Write a `LookSet` as a native endian 32-bit integer to the beginning
     /// of the slice given.
     ///
     /// # Panics
     ///
-    /// This panics if `slice.len() < 2`.
+    /// This panics if `slice.len() < 4`.
     #[inline]
     pub fn write_repr(self, slice: &mut [u8]) {
         let raw = self.bits.to_ne_bytes();
         slice[0] = raw[0];
         slice[1] = raw[1];
+        slice[2] = raw[2];
+        slice[3] = raw[3];
     }
 }
 
@@ -2792,9 +2914,9 @@ impl Iterator for LookSetIter {
             return None;
         }
         // We'll never have more than u8::MAX distinct look-around assertions,
-        // so 'repr' will always fit into a u16.
-        let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
-        let look = Look::from_repr(1 << repr)?;
+        // so 'bit' will always fit into a u16.
+        let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
+        let look = Look::from_repr(1 << bit)?;
         self.set = self.set.remove(look);
         Some(look)
     }
@@ -3716,7 +3838,7 @@ mod tests {
         assert_eq!(0, set.iter().count());
 
         let set = LookSet::full();
-        assert_eq!(10, set.iter().count());
+        assert_eq!(18, set.iter().count());
 
         let set =
             LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
@@ -3734,6 +3856,6 @@ mod tests {
         let res = format!("{:?}", LookSet::empty());
         assert_eq!("∅", res);
         let res = format!("{:?}", LookSet::full());
-        assert_eq!("Az^$rRbB𝛃𝚩", res);
+        assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
     }
 }
diff --git a/vendor/regex-syntax/src/hir/print.rs b/vendor/regex-syntax/src/hir/print.rs
index fcb7cd252..dfa6d4032 100644
--- a/vendor/regex-syntax/src/hir/print.rs
+++ b/vendor/regex-syntax/src/hir/print.rs
@@ -89,9 +89,16 @@ impl<W: fmt::Write> Visitor for Writer<W> {
 
     fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
         match *hir.kind() {
-            // Empty is represented by nothing in the concrete syntax, and
-            // repetition operators are strictly suffix oriented.
-            HirKind::Empty | HirKind::Repetition(_) => {}
+            HirKind::Empty => {
+                // Technically an empty sub-expression could be "printed" by
+                // just ignoring it, but in practice, you could have a
+                // repetition operator attached to an empty expression, and you
+                // really need something in the concrete syntax to make that
+                // work as you'd expect.
+                self.wtr.write_str(r"(?:)")?;
+            }
+            // Repetition operators are strictly suffix oriented.
+            HirKind::Repetition(_) => {}
             HirKind::Literal(hir::Literal(ref bytes)) => {
                 // See the comment on the 'Concat' and 'Alternation' case below
                 // for why we put parens here. Literals are, conceptually,
@@ -195,6 +202,30 @@ impl<W: fmt::Write> Visitor for Writer<W> {
                 hir::Look::WordUnicodeNegate => {
                     self.wtr.write_str(r"\B")?;
                 }
+                hir::Look::WordStartAscii => {
+                    self.wtr.write_str(r"(?-u:\b{start})")?;
+                }
+                hir::Look::WordEndAscii => {
+                    self.wtr.write_str(r"(?-u:\b{end})")?;
+                }
+                hir::Look::WordStartUnicode => {
+                    self.wtr.write_str(r"\b{start}")?;
+                }
+                hir::Look::WordEndUnicode => {
+                    self.wtr.write_str(r"\b{end}")?;
+                }
+                hir::Look::WordStartHalfAscii => {
+                    self.wtr.write_str(r"(?-u:\b{start-half})")?;
+                }
+                hir::Look::WordEndHalfAscii => {
+                    self.wtr.write_str(r"(?-u:\b{end-half})")?;
+                }
+                hir::Look::WordStartHalfUnicode => {
+                    self.wtr.write_str(r"\b{start-half}")?;
+                }
+                hir::Look::WordEndHalfUnicode => {
+                    self.wtr.write_str(r"\b{end-half}")?;
+                }
             },
             HirKind::Capture(hir::Capture { ref name, .. }) => {
                 self.wtr.write_str("(")?;
@@ -424,20 +455,20 @@ mod tests {
         // Test that various zero-length repetitions always translate to an
         // empty regex. This is more a property of HIR's smart constructors
         // than the printer though.
-        roundtrip("a{0}", "");
-        roundtrip("(?:ab){0}", "");
+        roundtrip("a{0}", "(?:)");
+        roundtrip("(?:ab){0}", "(?:)");
         #[cfg(feature = "unicode-gencat")]
         {
-            roundtrip(r"\p{any}{0}", "");
-            roundtrip(r"\P{any}{0}", "");
+            roundtrip(r"\p{any}{0}", "(?:)");
+            roundtrip(r"\P{any}{0}", "(?:)");
         }
     }
 
     #[test]
     fn print_group() {
-        roundtrip("()", "()");
-        roundtrip("(?P<foo>)", "(?P<foo>)");
-        roundtrip("(?:)", "");
+        roundtrip("()", "((?:))");
+        roundtrip("(?P<foo>)", "(?P<foo>(?:))");
+        roundtrip("(?:)", "(?:)");
 
         roundtrip("(a)", "(a)");
         roundtrip("(?P<foo>a)", "(?P<foo>a)");
@@ -448,8 +479,8 @@ mod tests {
 
     #[test]
     fn print_alternation() {
-        roundtrip("|", "(?:|)");
-        roundtrip("||", "(?:||)");
+        roundtrip("|", "(?:(?:)|(?:))");
+        roundtrip("||", "(?:(?:)|(?:)|(?:))");
 
         roundtrip("a|b", "[ab]");
         roundtrip("ab|cd", "(?:(?:ab)|(?:cd))");
@@ -503,7 +534,7 @@ mod tests {
             }),
             Hir::look(hir::Look::End),
         ]);
-        assert_eq!(r"(?:\A(?:\A\z)+\z)", expr.to_string());
+        assert_eq!(r"(?:\A\A\z\z)", expr.to_string());
     }
 
     // Just like regression_repetition_concat, but with the repetition using
@@ -540,7 +571,7 @@ mod tests {
             }),
             Hir::look(hir::Look::End),
         ]);
-        assert_eq!(r"(?:\A(?:\A|\z)+\z)", expr.to_string());
+        assert_eq!(r"(?:\A(?:\A|\z)\z)", expr.to_string());
     }
 
     // This regression test is very similar in flavor to
diff --git a/vendor/regex-syntax/src/hir/translate.rs b/vendor/regex-syntax/src/hir/translate.rs
index ff9c5ee91..313a1e9e8 100644
--- a/vendor/regex-syntax/src/hir/translate.rs
+++ b/vendor/regex-syntax/src/hir/translate.rs
@@ -19,6 +19,7 @@ type Result<T> = core::result::Result<T, Error>;
 #[derive(Clone, Debug)]
 pub struct TranslatorBuilder {
     utf8: bool,
+    line_terminator: u8,
     flags: Flags,
 }
 
@@ -31,7 +32,11 @@ impl Default for TranslatorBuilder {
 impl TranslatorBuilder {
     /// Create a new translator builder with a default c onfiguration.
     pub fn new() -> TranslatorBuilder {
-        TranslatorBuilder { utf8: true, flags: Flags::default() }
+        TranslatorBuilder {
+            utf8: true,
+            line_terminator: b'\n',
+            flags: Flags::default(),
+        }
     }
 
     /// Build a translator using the current configuration.
@@ -40,6 +45,7 @@ impl TranslatorBuilder {
             stack: RefCell::new(vec![]),
             flags: Cell::new(self.flags),
             utf8: self.utf8,
+            line_terminator: self.line_terminator,
         }
     }
 
@@ -63,6 +69,31 @@ impl TranslatorBuilder {
         self
     }
 
+    /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
+    ///
+    /// Namely, instead of `.` (by default) matching everything except for `\n`,
+    /// this will cause `.` to match everything except for the byte given.
+    ///
+    /// If `.` is used in a context where Unicode mode is enabled and this byte
+    /// isn't ASCII, then an error will be returned. When Unicode mode is
+    /// disabled, then any byte is permitted, but will return an error if UTF-8
+    /// mode is enabled and it is a non-ASCII byte.
+    ///
+    /// In short, any ASCII value for a line terminator is always okay. But a
+    /// non-ASCII byte might result in an error depending on whether Unicode
+    /// mode or UTF-8 mode are enabled.
+    ///
+    /// Note that if `R` mode is enabled then it always takes precedence and
+    /// the line terminator will be treated as `\r` and `\n` simultaneously.
+    ///
+    /// Note also that this *doesn't* impact the look-around assertions
+    /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
+    /// configuration in the regex engine itself.
+    pub fn line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder {
+        self.line_terminator = byte;
+        self
+    }
+
     /// Enable or disable the case insensitive flag (`i`) by default.
     pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
         self.flags.case_insensitive = if yes { Some(true) } else { None };
@@ -120,6 +151,8 @@ pub struct Translator {
     flags: Cell<Flags>,
     /// Whether we're allowed to produce HIR that can match arbitrary bytes.
     utf8: bool,
+    /// The line terminator to use for `.`.
+    line_terminator: u8,
 }
 
 impl Translator {
@@ -304,7 +337,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
 
     fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
         match *ast {
-            Ast::Class(ast::Class::Bracketed(_)) => {
+            Ast::ClassBracketed(_) => {
                 if self.flags().unicode() {
                     let cls = hir::ClassUnicode::empty();
                     self.push(HirFrame::ClassUnicode(cls));
@@ -321,14 +354,14 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
                     .unwrap_or_else(|| self.flags());
                 self.push(HirFrame::Group { old_flags });
             }
-            Ast::Concat(ref x) if x.asts.is_empty() => {}
             Ast::Concat(_) => {
                 self.push(HirFrame::Concat);
             }
-            Ast::Alternation(ref x) if x.asts.is_empty() => {}
-            Ast::Alternation(_) => {
+            Ast::Alternation(ref x) => {
                 self.push(HirFrame::Alternation);
-                self.push(HirFrame::AlternationBranch);
+                if !x.asts.is_empty() {
+                    self.push(HirFrame::AlternationBranch);
+                }
             }
             _ => {}
         }
@@ -353,29 +386,20 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
                 // consistency sake.
                 self.push(HirFrame::Expr(Hir::empty()));
             }
-            Ast::Literal(ref x) => {
-                match self.ast_literal_to_scalar(x)? {
-                    Either::Right(byte) => self.push_byte(byte),
-                    Either::Left(ch) => {
-                        if !self.flags().unicode() && ch.len_utf8() > 1 {
-                            return Err(self
-                                .error(x.span, ErrorKind::UnicodeNotAllowed));
-                        }
-                        match self.case_fold_char(x.span, ch)? {
-                            None => self.push_char(ch),
-                            Some(expr) => self.push(HirFrame::Expr(expr)),
-                        }
-                    }
-                }
-                // self.push(HirFrame::Expr(self.hir_literal(x)?));
-            }
-            Ast::Dot(span) => {
-                self.push(HirFrame::Expr(self.hir_dot(span)?));
+            Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? {
+                Either::Right(byte) => self.push_byte(byte),
+                Either::Left(ch) => match self.case_fold_char(x.span, ch)? {
+                    None => self.push_char(ch),
+                    Some(expr) => self.push(HirFrame::Expr(expr)),
+                },
+            },
+            Ast::Dot(ref span) => {
+                self.push(HirFrame::Expr(self.hir_dot(**span)?));
             }
             Ast::Assertion(ref x) => {
                 self.push(HirFrame::Expr(self.hir_assertion(x)?));
             }
-            Ast::Class(ast::Class::Perl(ref x)) => {
+            Ast::ClassPerl(ref x) => {
                 if self.flags().unicode() {
                     let cls = self.hir_perl_unicode_class(x)?;
                     let hcls = hir::Class::Unicode(cls);
@@ -386,11 +410,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
                     self.push(HirFrame::Expr(Hir::class(hcls)));
                 }
             }
-            Ast::Class(ast::Class::Unicode(ref x)) => {
+            Ast::ClassUnicode(ref x) => {
                 let cls = hir::Class::Unicode(self.hir_unicode_class(x)?);
                 self.push(HirFrame::Expr(Hir::class(cls)));
             }
-            Ast::Class(ast::Class::Bracketed(ref ast)) => {
+            Ast::ClassBracketed(ref ast) => {
                 if self.flags().unicode() {
                     let mut cls = self.pop().unwrap().unwrap_class_unicode();
                     self.unicode_fold_and_negate(
@@ -841,8 +865,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
             })?;
             Ok(Some(Hir::class(hir::Class::Unicode(cls))))
         } else {
-            if c.len_utf8() > 1 {
-                return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
+            if !c.is_ascii() {
+                return Ok(None);
             }
             // If case folding won't do anything, then don't bother trying.
             match c {
@@ -862,10 +886,38 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
     }
 
     fn hir_dot(&self, span: Span) -> Result<Hir> {
-        if !self.flags().unicode() && self.trans().utf8 {
+        let (utf8, lineterm, flags) =
+            (self.trans().utf8, self.trans().line_terminator, self.flags());
+        if utf8 && (!flags.unicode() || !lineterm.is_ascii()) {
             return Err(self.error(span, ErrorKind::InvalidUtf8));
         }
-        Ok(Hir::dot(self.flags().dot()))
+        let dot = if flags.dot_matches_new_line() {
+            if flags.unicode() {
+                hir::Dot::AnyChar
+            } else {
+                hir::Dot::AnyByte
+            }
+        } else {
+            if flags.unicode() {
+                if flags.crlf() {
+                    hir::Dot::AnyCharExceptCRLF
+                } else {
+                    if !lineterm.is_ascii() {
+                        return Err(
+                            self.error(span, ErrorKind::InvalidLineTerminator)
+                        );
+                    }
+                    hir::Dot::AnyCharExcept(char::from(lineterm))
+                }
+            } else {
+                if flags.crlf() {
+                    hir::Dot::AnyByteExceptCRLF
+                } else {
+                    hir::Dot::AnyByteExcept(lineterm)
+                }
+            }
+        };
+        Ok(Hir::dot(dot))
     }
 
     fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
@@ -903,6 +955,34 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
             } else {
                 hir::Look::WordAsciiNegate
             }),
+            ast::AssertionKind::WordBoundaryStart
+            | ast::AssertionKind::WordBoundaryStartAngle => {
+                Hir::look(if unicode {
+                    hir::Look::WordStartUnicode
+                } else {
+                    hir::Look::WordStartAscii
+                })
+            }
+            ast::AssertionKind::WordBoundaryEnd
+            | ast::AssertionKind::WordBoundaryEndAngle => {
+                Hir::look(if unicode {
+                    hir::Look::WordEndUnicode
+                } else {
+                    hir::Look::WordEndAscii
+                })
+            }
+            ast::AssertionKind::WordBoundaryStartHalf => {
+                Hir::look(if unicode {
+                    hir::Look::WordStartHalfUnicode
+                } else {
+                    hir::Look::WordStartHalfAscii
+                })
+            }
+            ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode {
+                hir::Look::WordEndHalfUnicode
+            } else {
+                hir::Look::WordEndHalfAscii
+            }),
         })
     }
 
@@ -1124,9 +1204,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         match self.ast_literal_to_scalar(ast)? {
             Either::Right(byte) => Ok(byte),
             Either::Left(ch) => {
-                let cp = u32::from(ch);
-                if cp <= 0x7F {
-                    Ok(u8::try_from(cp).unwrap())
+                if ch.is_ascii() {
+                    Ok(u8::try_from(ch).unwrap())
                 } else {
                     // We can't feasibly support Unicode in
                     // byte oriented classes. Byte classes don't
@@ -1209,30 +1288,6 @@ impl Flags {
         }
     }
 
-    fn dot(&self) -> hir::Dot {
-        if self.dot_matches_new_line() {
-            if self.unicode() {
-                hir::Dot::AnyChar
-            } else {
-                hir::Dot::AnyByte
-            }
-        } else {
-            if self.unicode() {
-                if self.crlf() {
-                    hir::Dot::AnyCharExceptCRLF
-                } else {
-                    hir::Dot::AnyCharExceptLF
-                }
-            } else {
-                if self.crlf() {
-                    hir::Dot::AnyByteExceptCRLF
-                } else {
-                    hir::Dot::AnyByteExceptLF
-                }
-            }
-        }
-    }
-
     fn case_insensitive(&self) -> bool {
         self.case_insensitive.unwrap_or(false)
     }
@@ -1598,16 +1653,7 @@ mod tests {
         assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a"));
         assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));
 
-        assert_eq!(
-            t_err("(?-u)☃"),
-            TestError {
-                kind: hir::ErrorKind::UnicodeNotAllowed,
-                span: Span::new(
-                    Position::new(5, 1, 6),
-                    Position::new(8, 1, 7)
-                ),
-            }
-        );
+        assert_eq!(t("(?-u)☃"), hir_lit("☃"));
         assert_eq!(
             t_err(r"(?-u)\xFF"),
             TestError {
@@ -1685,16 +1731,7 @@ mod tests {
         );
         assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));
 
-        assert_eq!(
-            t_err("(?i-u)β"),
-            TestError {
-                kind: hir::ErrorKind::UnicodeNotAllowed,
-                span: Span::new(
-                    Position::new(6, 1, 7),
-                    Position::new(8, 1, 8),
-                ),
-            }
-        );
+        assert_eq!(t("(?i-u)β"), hir_lit("β"),);
     }
 
     #[test]
@@ -3489,6 +3526,15 @@ mod tests {
         assert!(!props(r"(?:z|xx)@|xx").is_alternation_literal());
     }
 
+    // This tests that the smart Hir::repetition constructors does some basic
+    // simplifications.
+    #[test]
+    fn smart_repetition() {
+        assert_eq!(t(r"a{0}"), Hir::empty());
+        assert_eq!(t(r"a{1}"), hir_lit("a"));
+        assert_eq!(t(r"\B{32111}"), hir_look(hir::Look::WordUnicodeNegate));
+    }
+
     // This tests that the smart Hir::concat constructor simplifies the given
     // exprs in a way we expect.
     #[test]
@@ -3580,4 +3626,99 @@ mod tests {
             ]),
         );
     }
+
+    #[test]
+    fn regression_alt_empty_concat() {
+        use crate::ast::{self, Ast};
+
+        let span = Span::splat(Position::new(0, 0, 0));
+        let ast = Ast::alternation(ast::Alternation {
+            span,
+            asts: vec![Ast::concat(ast::Concat { span, asts: vec![] })],
+        });
+
+        let mut t = Translator::new();
+        assert_eq!(Ok(Hir::empty()), t.translate("", &ast));
+    }
+
+    #[test]
+    fn regression_empty_alt() {
+        use crate::ast::{self, Ast};
+
+        let span = Span::splat(Position::new(0, 0, 0));
+        let ast = Ast::concat(ast::Concat {
+            span,
+            asts: vec![Ast::alternation(ast::Alternation {
+                span,
+                asts: vec![],
+            })],
+        });
+
+        let mut t = Translator::new();
+        assert_eq!(Ok(Hir::fail()), t.translate("", &ast));
+    }
+
+    #[test]
+    fn regression_singleton_alt() {
+        use crate::{
+            ast::{self, Ast},
+            hir::Dot,
+        };
+
+        let span = Span::splat(Position::new(0, 0, 0));
+        let ast = Ast::concat(ast::Concat {
+            span,
+            asts: vec![Ast::alternation(ast::Alternation {
+                span,
+                asts: vec![Ast::dot(span)],
+            })],
+        });
+
+        let mut t = Translator::new();
+        assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast));
+    }
+
+    // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168
+    #[test]
+    fn regression_fuzz_match() {
+        let pat = "[(\u{6} \0-\u{afdf5}]  \0 ";
+        let ast = ParserBuilder::new()
+            .octal(false)
+            .ignore_whitespace(true)
+            .build()
+            .parse(pat)
+            .unwrap();
+        let hir = TranslatorBuilder::new()
+            .utf8(true)
+            .case_insensitive(false)
+            .multi_line(false)
+            .dot_matches_new_line(false)
+            .swap_greed(true)
+            .unicode(true)
+            .build()
+            .translate(pat, &ast)
+            .unwrap();
+        assert_eq!(
+            hir,
+            Hir::concat(vec![
+                hir_uclass(&[('\0', '\u{afdf5}')]),
+                hir_lit("\0"),
+            ])
+        );
+    }
+
+    // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155
+    #[cfg(feature = "unicode")]
+    #[test]
+    fn regression_fuzz_difference1() {
+        let pat = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*";
+        let _ = t(pat); // shouldn't panic
+    }
+
+    // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153
+    #[test]
+    fn regression_fuzz_char_decrement1() {
+        let pat = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0]<D\0\0\0\0\0\0\u{1}]\0\0\0\0]\0\0-*\0]\0\0 ";
+        let _ = t(pat); // shouldn't panic
+    }
 }
diff --git a/vendor/regex-syntax/src/hir/visitor.rs b/vendor/regex-syntax/src/hir/visitor.rs
index e5f15cf1c..f30f0a163 100644
--- a/vendor/regex-syntax/src/hir/visitor.rs
+++ b/vendor/regex-syntax/src/hir/visitor.rs
@@ -41,6 +41,11 @@ pub trait Visitor {
     fn visit_alternation_in(&mut self) -> Result<(), Self::Err> {
         Ok(())
     }
+
+    /// This method is called between child nodes of a concatenation.
+    fn visit_concat_in(&mut self) -> Result<(), Self::Err> {
+        Ok(())
+    }
 }
 
 /// Executes an implementation of `Visitor` in constant stack space.
@@ -131,8 +136,14 @@ impl<'a> HeapVisitor<'a> {
                 // If this is a concat/alternate, then we might have additional
                 // inductive steps to process.
                 if let Some(x) = self.pop(frame) {
-                    if let Frame::Alternation { .. } = x {
-                        visitor.visit_alternation_in()?;
+                    match x {
+                        Frame::Alternation { .. } => {
+                            visitor.visit_alternation_in()?;
+                        }
+                        Frame::Concat { .. } => {
+                            visitor.visit_concat_in()?;
+                        }
+                        _ => {}
                     }
                     hir = x.child();
                     self.stack.push((post_hir, x));
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-06-19 09:26:03 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-06-19 09:26:03 +0000
commit	9918693037dce8aa4bb6f08741b6812923486c18 (patch)
tree	21d2b40bec7e6a7ea664acee056eb3d08e15a1cf /vendor/regex-syntax/src/hir
parent	Releasing progress-linux version 1.75.0+dfsg1-5~progress7.99u1. (diff)
download	rustc-9918693037dce8aa4bb6f08741b6812923486c18.tar.xz rustc-9918693037dce8aa4bb6f08741b6812923486c18.zip