diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 03:57:31 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 03:57:31 +0000 |
commit | dc0db358abe19481e475e10c32149b53370f1a1c (patch) | |
tree | ab8ce99c4b255ce46f99ef402c27916055b899ee /vendor/regex/src/compile.rs | |
parent | Releasing progress-linux version 1.71.1+dfsg1-2~progress7.99u1. (diff) | |
download | rustc-dc0db358abe19481e475e10c32149b53370f1a1c.tar.xz rustc-dc0db358abe19481e475e10c32149b53370f1a1c.zip |
Merging upstream version 1.72.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex/src/compile.rs')
-rw-r--r-- | vendor/regex/src/compile.rs | 297 |
1 files changed, 183 insertions, 114 deletions
diff --git a/vendor/regex/src/compile.rs b/vendor/regex/src/compile.rs index 90ca25015..23e63ec89 100644 --- a/vendor/regex/src/compile.rs +++ b/vendor/regex/src/compile.rs @@ -4,7 +4,7 @@ use std::iter; use std::result; use std::sync::Arc; -use regex_syntax::hir::{self, Hir}; +use regex_syntax::hir::{self, Hir, Look}; use regex_syntax::is_word_byte; use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; @@ -137,13 +137,24 @@ impl Compiler { } fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> { + if self.compiled.only_utf8 + && expr.properties().look_set().contains(Look::WordAsciiNegate) + { + return Err(Error::Syntax( + "ASCII-only \\B is not allowed in Unicode regexes \ + because it may result in invalid UTF-8 matches" + .to_string(), + )); + } // If we're compiling a forward DFA and we aren't anchored, then // add a `.*?` before the first capture group. // Other matching engines handle this by baking the logic into the // matching engine itself. let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; - self.compiled.is_anchored_start = expr.is_anchored_start(); - self.compiled.is_anchored_end = expr.is_anchored_end(); + self.compiled.is_anchored_start = + expr.properties().look_set_prefix().contains(Look::Start); + self.compiled.is_anchored_end = + expr.properties().look_set_suffix().contains(Look::End); if self.compiled.needs_dotstar() { dotstar_patch = self.c_dotstar()?; self.compiled.start = dotstar_patch.entry; @@ -159,6 +170,8 @@ impl Compiler { self.fill_to_next(patch.hole); self.compiled.matches = vec![self.insts.len()]; self.push_compiled(Inst::Match(0)); + self.compiled.static_captures_len = + expr.properties().static_explicit_captures_len(); self.compile_finish() } @@ -168,10 +181,12 @@ impl Compiler { ) -> result::Result<Program, Error> { debug_assert!(exprs.len() > 1); - self.compiled.is_anchored_start = - exprs.iter().all(|e| e.is_anchored_start()); - self.compiled.is_anchored_end = - exprs.iter().all(|e| e.is_anchored_end()); + self.compiled.is_anchored_start = exprs + .iter() + .all(|e| e.properties().look_set_prefix().contains(Look::Start)); + self.compiled.is_anchored_end = exprs + .iter() + .all(|e| e.properties().look_set_suffix().contains(Look::End)); let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; if self.compiled.needs_dotstar() { dotstar_patch = self.c_dotstar()?; @@ -272,17 +287,21 @@ impl Compiler { self.check_size()?; match *expr.kind() { Empty => self.c_empty(), - Literal(hir::Literal::Unicode(c)) => self.c_char(c), - Literal(hir::Literal::Byte(b)) => { - assert!(self.compiled.uses_bytes()); - self.c_byte(b) + Literal(hir::Literal(ref bytes)) => { + if self.compiled.is_reverse { + let mut bytes = bytes.to_vec(); + bytes.reverse(); + self.c_literal(&bytes) + } else { + self.c_literal(bytes) + } } Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()), Class(hir::Class::Bytes(ref cls)) => { if self.compiled.uses_bytes() { self.c_class_bytes(cls.ranges()) } else { - assert!(cls.is_all_ascii()); + assert!(cls.is_ascii()); let mut char_ranges = vec![]; for r in cls.iter() { let (s, e) = (r.start() as char, r.end() as char); @@ -291,92 +310,94 @@ impl Compiler { self.c_class(&char_ranges) } } - Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::EndLine) - } - Anchor(hir::Anchor::StartLine) => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::StartLine) - } - Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::StartLine) - } - Anchor(hir::Anchor::EndLine) => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::EndLine) - } - Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => { - self.c_empty_look(prog::EmptyLook::EndText) - } - Anchor(hir::Anchor::StartText) => { - self.c_empty_look(prog::EmptyLook::StartText) - } - Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => { - self.c_empty_look(prog::EmptyLook::StartText) - } - Anchor(hir::Anchor::EndText) => { - self.c_empty_look(prog::EmptyLook::EndText) - } - WordBoundary(hir::WordBoundary::Unicode) => { - if !cfg!(feature = "unicode-perl") { - return Err(Error::Syntax( - "Unicode word boundaries are unavailable when \ - the unicode-perl feature is disabled" - .to_string(), - )); + Look(ref look) => match *look { + hir::Look::Start if self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::EndText) } - self.compiled.has_unicode_word_boundary = true; - self.byte_classes.set_word_boundary(); - // We also make sure that all ASCII bytes are in a different - // class from non-ASCII bytes. Otherwise, it's possible for - // ASCII bytes to get lumped into the same class as non-ASCII - // bytes. This in turn may cause the lazy DFA to falsely start - // when it sees an ASCII byte that maps to a byte class with - // non-ASCII bytes. This ensures that never happens. - self.byte_classes.set_range(0, 0x7F); - self.c_empty_look(prog::EmptyLook::WordBoundary) - } - WordBoundary(hir::WordBoundary::UnicodeNegate) => { - if !cfg!(feature = "unicode-perl") { + hir::Look::Start => { + self.c_empty_look(prog::EmptyLook::StartText) + } + hir::Look::End if self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::StartText) + } + hir::Look::End => self.c_empty_look(prog::EmptyLook::EndText), + hir::Look::StartLF if self.compiled.is_reverse => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::EndLine) + } + hir::Look::StartLF => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::StartLine) + } + hir::Look::EndLF if self.compiled.is_reverse => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::StartLine) + } + hir::Look::EndLF => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::EndLine) + } + hir::Look::StartCRLF | hir::Look::EndCRLF => { return Err(Error::Syntax( - "Unicode word boundaries are unavailable when \ - the unicode-perl feature is disabled" + "CRLF-aware line anchors are not supported yet" .to_string(), )); } - self.compiled.has_unicode_word_boundary = true; - self.byte_classes.set_word_boundary(); - // See comments above for why we set the ASCII range here. - self.byte_classes.set_range(0, 0x7F); - self.c_empty_look(prog::EmptyLook::NotWordBoundary) - } - WordBoundary(hir::WordBoundary::Ascii) => { - self.byte_classes.set_word_boundary(); - self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) - } - WordBoundary(hir::WordBoundary::AsciiNegate) => { - self.byte_classes.set_word_boundary(); - self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) - } - Group(ref g) => match g.kind { - hir::GroupKind::NonCapturing => self.c(&g.hir), - hir::GroupKind::CaptureIndex(index) => { - if index as usize >= self.compiled.captures.len() { - self.compiled.captures.push(None); + hir::Look::WordAscii => { + self.byte_classes.set_word_boundary(); + self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) + } + hir::Look::WordAsciiNegate => { + self.byte_classes.set_word_boundary(); + self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) + } + hir::Look::WordUnicode => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ + the unicode-perl feature is disabled" + .to_string(), + )); } - self.c_capture(2 * index as usize, &g.hir) + self.compiled.has_unicode_word_boundary = true; + self.byte_classes.set_word_boundary(); + // We also make sure that all ASCII bytes are in a different + // class from non-ASCII bytes. Otherwise, it's possible for + // ASCII bytes to get lumped into the same class as non-ASCII + // bytes. This in turn may cause the lazy DFA to falsely start + // when it sees an ASCII byte that maps to a byte class with + // non-ASCII bytes. This ensures that never happens. + self.byte_classes.set_range(0, 0x7F); + self.c_empty_look(prog::EmptyLook::WordBoundary) } - hir::GroupKind::CaptureName { index, ref name } => { - if index as usize >= self.compiled.captures.len() { - let n = name.to_string(); - self.compiled.captures.push(Some(n.clone())); - self.capture_name_idx.insert(n, index as usize); + hir::Look::WordUnicodeNegate => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ + the unicode-perl feature is disabled" + .to_string(), + )); } - self.c_capture(2 * index as usize, &g.hir) + self.compiled.has_unicode_word_boundary = true; + self.byte_classes.set_word_boundary(); + // See comments above for why we set the ASCII range here. + self.byte_classes.set_range(0, 0x7F); + self.c_empty_look(prog::EmptyLook::NotWordBoundary) } }, + Capture(hir::Capture { index, ref name, ref sub }) => { + if index as usize >= self.compiled.captures.len() { + let name = match *name { + None => None, + Some(ref boxed_str) => Some(boxed_str.to_string()), + }; + self.compiled.captures.push(name.clone()); + if let Some(name) = name { + self.capture_name_idx.insert(name, index as usize); + } + } + self.c_capture(2 * index as usize, sub) + } Concat(ref es) => { if self.compiled.is_reverse { self.c_concat(es.iter().rev()) @@ -420,21 +441,19 @@ impl Compiler { } fn c_dotstar(&mut self) -> Result { - Ok(if !self.compiled.only_utf8() { - self.c(&Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, - greedy: false, - hir: Box::new(Hir::any(true)), - }))? - .unwrap() + let hir = if self.compiled.only_utf8() { + Hir::dot(hir::Dot::AnyChar) } else { - self.c(&Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, + Hir::dot(hir::Dot::AnyByte) + }; + Ok(self + .c(&Hir::repetition(hir::Repetition { + min: 0, + max: None, greedy: false, - hir: Box::new(Hir::any(false)), + sub: Box::new(hir), }))? - .unwrap() - }) + .unwrap()) } fn c_char(&mut self, c: char) -> ResultOrEmpty { @@ -457,7 +476,11 @@ impl Compiler { fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { use std::mem::size_of; - assert!(!ranges.is_empty()); + if ranges.is_empty() { + return Err(Error::Syntax( + "empty character classes are not allowed".to_string(), + )); + } if self.compiled.uses_bytes() { Ok(Some(CompileClass { c: self, ranges }.compile()?)) } else { @@ -482,7 +505,11 @@ impl Compiler { &mut self, ranges: &[hir::ClassBytesRange], ) -> ResultOrEmpty { - debug_assert!(!ranges.is_empty()); + if ranges.is_empty() { + return Err(Error::Syntax( + "empty character classes are not allowed".to_string(), + )); + } let first_split_entry = self.insts.len(); let mut holes = vec![]; @@ -513,6 +540,52 @@ impl Compiler { Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) } + fn c_literal(&mut self, bytes: &[u8]) -> ResultOrEmpty { + match core::str::from_utf8(bytes) { + Ok(string) => { + let mut it = string.chars(); + let Patch { mut hole, entry } = loop { + match it.next() { + None => return self.c_empty(), + Some(ch) => { + if let Some(p) = self.c_char(ch)? { + break p; + } + } + } + }; + for ch in it { + if let Some(p) = self.c_char(ch)? { + self.fill(hole, p.entry); + hole = p.hole; + } + } + Ok(Some(Patch { hole, entry })) + } + Err(_) => { + assert!(self.compiled.uses_bytes()); + let mut it = bytes.iter().copied(); + let Patch { mut hole, entry } = loop { + match it.next() { + None => return self.c_empty(), + Some(byte) => { + if let Some(p) = self.c_byte(byte)? { + break p; + } + } + } + }; + for byte in it { + if let Some(p) = self.c_byte(byte)? { + self.fill(hole, p.entry); + hole = p.hole; + } + } + Ok(Some(Patch { hole, entry })) + } + } + } + fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty where I: IntoIterator<Item = &'a Hir>, @@ -587,19 +660,15 @@ impl Compiler { } fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { - use regex_syntax::hir::RepetitionKind::*; - match rep.kind { - ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), - ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy), - OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy), - Range(hir::RepetitionRange::Exactly(min_max)) => { - self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max) - } - Range(hir::RepetitionRange::AtLeast(min)) => { - self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min) + match (rep.min, rep.max) { + (0, Some(1)) => self.c_repeat_zero_or_one(&rep.sub, rep.greedy), + (0, None) => self.c_repeat_zero_or_more(&rep.sub, rep.greedy), + (1, None) => self.c_repeat_one_or_more(&rep.sub, rep.greedy), + (min, None) => { + self.c_repeat_range_min_or_more(&rep.sub, rep.greedy, min) } - Range(hir::RepetitionRange::Bounded(min, max)) => { - self.c_repeat_range(&rep.hir, rep.greedy, min, max) + (min, Some(max)) => { + self.c_repeat_range(&rep.sub, rep.greedy, min, max) } } } |