summary | refs | log | tree | commit | diff | stats
path: root/vendor/regex/src/compile.rs
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-30 03:57:31 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-30 03:57:31 +0000
commit: dc0db358abe19481e475e10c32149b53370f1a1c (patch)
tree: ab8ce99c4b255ce46f99ef402c27916055b899ee /vendor/regex/src/compile.rs
parent: Releasing progress-linux version 1.71.1+dfsg1-2~progress7.99u1. (diff)
download: rustc-dc0db358abe19481e475e10c32149b53370f1a1c.tar.xz
rustc-dc0db358abe19481e475e10c32149b53370f1a1c.zip
Merging upstream version 1.72.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex/src/compile.rs')
-rw-r--r-- vendor/regex/src/compile.rs 297
1 files changed, 183 insertions, 114 deletions
diff --git a/vendor/regex/src/compile.rs b/vendor/regex/src/compile.rs
index 90ca25015..23e63ec89 100644
--- a/vendor/regex/src/compile.rs
+++ b/vendor/regex/src/compile.rs
@@ -4,7 +4,7 @@ use std::iter;
use std::result;
use std::sync::Arc;
-use regex_syntax::hir::{self, Hir};
+use regex_syntax::hir::{self, Hir, Look};
use regex_syntax::is_word_byte;
use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
@@ -137,13 +137,24 @@ impl Compiler {
}
fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
+ if self.compiled.only_utf8
+ && expr.properties().look_set().contains(Look::WordAsciiNegate)
+ {
+ return Err(Error::Syntax(
+ "ASCII-only \\B is not allowed in Unicode regexes \
+ because it may result in invalid UTF-8 matches"
+ .to_string(),
+ ));
+ }
// If we're compiling a forward DFA and we aren't anchored, then
// add a `.*?` before the first capture group.
// Other matching engines handle this by baking the logic into the
// matching engine itself.
let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
- self.compiled.is_anchored_start = expr.is_anchored_start();
- self.compiled.is_anchored_end = expr.is_anchored_end();
+ self.compiled.is_anchored_start =
+ expr.properties().look_set_prefix().contains(Look::Start);
+ self.compiled.is_anchored_end =
+ expr.properties().look_set_suffix().contains(Look::End);
if self.compiled.needs_dotstar() {
dotstar_patch = self.c_dotstar()?;
self.compiled.start = dotstar_patch.entry;
@@ -159,6 +170,8 @@ impl Compiler {
self.fill_to_next(patch.hole);
self.compiled.matches = vec![self.insts.len()];
self.push_compiled(Inst::Match(0));
+ self.compiled.static_captures_len =
+ expr.properties().static_explicit_captures_len();
self.compile_finish()
}
@@ -168,10 +181,12 @@ impl Compiler {
) -> result::Result<Program, Error> {
debug_assert!(exprs.len() > 1);
- self.compiled.is_anchored_start =
- exprs.iter().all(|e| e.is_anchored_start());
- self.compiled.is_anchored_end =
- exprs.iter().all(|e| e.is_anchored_end());
+ self.compiled.is_anchored_start = exprs
+ .iter()
+ .all(|e| e.properties().look_set_prefix().contains(Look::Start));
+ self.compiled.is_anchored_end = exprs
+ .iter()
+ .all(|e| e.properties().look_set_suffix().contains(Look::End));
let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
if self.compiled.needs_dotstar() {
dotstar_patch = self.c_dotstar()?;
@@ -272,17 +287,21 @@ impl Compiler {
self.check_size()?;
match *expr.kind() {
Empty => self.c_empty(),
- Literal(hir::Literal::Unicode(c)) => self.c_char(c),
- Literal(hir::Literal::Byte(b)) => {
- assert!(self.compiled.uses_bytes());
- self.c_byte(b)
+ Literal(hir::Literal(ref bytes)) => {
+ if self.compiled.is_reverse {
+ let mut bytes = bytes.to_vec();
+ bytes.reverse();
+ self.c_literal(&bytes)
+ } else {
+ self.c_literal(bytes)
+ }
}
Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
Class(hir::Class::Bytes(ref cls)) => {
if self.compiled.uses_bytes() {
self.c_class_bytes(cls.ranges())
} else {
- assert!(cls.is_all_ascii());
+ assert!(cls.is_ascii());
let mut char_ranges = vec![];
for r in cls.iter() {
let (s, e) = (r.start() as char, r.end() as char);
@@ -291,92 +310,94 @@ impl Compiler {
self.c_class(&char_ranges)
}
}
- Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
- self.byte_classes.set_range(b'\n', b'\n');
- self.c_empty_look(prog::EmptyLook::EndLine)
- }
- Anchor(hir::Anchor::StartLine) => {
- self.byte_classes.set_range(b'\n', b'\n');
- self.c_empty_look(prog::EmptyLook::StartLine)
- }
- Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
- self.byte_classes.set_range(b'\n', b'\n');
- self.c_empty_look(prog::EmptyLook::StartLine)
- }
- Anchor(hir::Anchor::EndLine) => {
- self.byte_classes.set_range(b'\n', b'\n');
- self.c_empty_look(prog::EmptyLook::EndLine)
- }
- Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
- self.c_empty_look(prog::EmptyLook::EndText)
- }
- Anchor(hir::Anchor::StartText) => {
- self.c_empty_look(prog::EmptyLook::StartText)
- }
- Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
- self.c_empty_look(prog::EmptyLook::StartText)
- }
- Anchor(hir::Anchor::EndText) => {
- self.c_empty_look(prog::EmptyLook::EndText)
- }
- WordBoundary(hir::WordBoundary::Unicode) => {
- if !cfg!(feature = "unicode-perl") {
- return Err(Error::Syntax(
- "Unicode word boundaries are unavailable when \
- the unicode-perl feature is disabled"
- .to_string(),
- ));
+ Look(ref look) => match *look {
+ hir::Look::Start if self.compiled.is_reverse => {
+ self.c_empty_look(prog::EmptyLook::EndText)
}
- self.compiled.has_unicode_word_boundary = true;
- self.byte_classes.set_word_boundary();
- // We also make sure that all ASCII bytes are in a different
- // class from non-ASCII bytes. Otherwise, it's possible for
- // ASCII bytes to get lumped into the same class as non-ASCII
- // bytes. This in turn may cause the lazy DFA to falsely start
- // when it sees an ASCII byte that maps to a byte class with
- // non-ASCII bytes. This ensures that never happens.
- self.byte_classes.set_range(0, 0x7F);
- self.c_empty_look(prog::EmptyLook::WordBoundary)
- }
- WordBoundary(hir::WordBoundary::UnicodeNegate) => {
- if !cfg!(feature = "unicode-perl") {
+ hir::Look::Start => {
+ self.c_empty_look(prog::EmptyLook::StartText)
+ }
+ hir::Look::End if self.compiled.is_reverse => {
+ self.c_empty_look(prog::EmptyLook::StartText)
+ }
+ hir::Look::End => self.c_empty_look(prog::EmptyLook::EndText),
+ hir::Look::StartLF if self.compiled.is_reverse => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::EndLine)
+ }
+ hir::Look::StartLF => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::StartLine)
+ }
+ hir::Look::EndLF if self.compiled.is_reverse => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::StartLine)
+ }
+ hir::Look::EndLF => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::EndLine)
+ }
+ hir::Look::StartCRLF | hir::Look::EndCRLF => {
return Err(Error::Syntax(
- "Unicode word boundaries are unavailable when \
- the unicode-perl feature is disabled"
+ "CRLF-aware line anchors are not supported yet"
.to_string(),
));
}
- self.compiled.has_unicode_word_boundary = true;
- self.byte_classes.set_word_boundary();
- // See comments above for why we set the ASCII range here.
- self.byte_classes.set_range(0, 0x7F);
- self.c_empty_look(prog::EmptyLook::NotWordBoundary)
- }
- WordBoundary(hir::WordBoundary::Ascii) => {
- self.byte_classes.set_word_boundary();
- self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
- }
- WordBoundary(hir::WordBoundary::AsciiNegate) => {
- self.byte_classes.set_word_boundary();
- self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
- }
- Group(ref g) => match g.kind {
- hir::GroupKind::NonCapturing => self.c(&g.hir),
- hir::GroupKind::CaptureIndex(index) => {
- if index as usize >= self.compiled.captures.len() {
- self.compiled.captures.push(None);
+ hir::Look::WordAscii => {
+ self.byte_classes.set_word_boundary();
+ self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
+ }
+ hir::Look::WordAsciiNegate => {
+ self.byte_classes.set_word_boundary();
+ self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
+ }
+ hir::Look::WordUnicode => {
+ if !cfg!(feature = "unicode-perl") {
+ return Err(Error::Syntax(
+ "Unicode word boundaries are unavailable when \
+ the unicode-perl feature is disabled"
+ .to_string(),
+ ));
}
- self.c_capture(2 * index as usize, &g.hir)
+ self.compiled.has_unicode_word_boundary = true;
+ self.byte_classes.set_word_boundary();
+ // We also make sure that all ASCII bytes are in a different
+ // class from non-ASCII bytes. Otherwise, it's possible for
+ // ASCII bytes to get lumped into the same class as non-ASCII
+ // bytes. This in turn may cause the lazy DFA to falsely start
+ // when it sees an ASCII byte that maps to a byte class with
+ // non-ASCII bytes. This ensures that never happens.
+ self.byte_classes.set_range(0, 0x7F);
+ self.c_empty_look(prog::EmptyLook::WordBoundary)
}
- hir::GroupKind::CaptureName { index, ref name } => {
- if index as usize >= self.compiled.captures.len() {
- let n = name.to_string();
- self.compiled.captures.push(Some(n.clone()));
- self.capture_name_idx.insert(n, index as usize);
+ hir::Look::WordUnicodeNegate => {
+ if !cfg!(feature = "unicode-perl") {
+ return Err(Error::Syntax(
+ "Unicode word boundaries are unavailable when \
+ the unicode-perl feature is disabled"
+ .to_string(),
+ ));
}
- self.c_capture(2 * index as usize, &g.hir)
+ self.compiled.has_unicode_word_boundary = true;
+ self.byte_classes.set_word_boundary();
+ // See comments above for why we set the ASCII range here.
+ self.byte_classes.set_range(0, 0x7F);
+ self.c_empty_look(prog::EmptyLook::NotWordBoundary)
}
},
+ Capture(hir::Capture { index, ref name, ref sub }) => {
+ if index as usize >= self.compiled.captures.len() {
+ let name = match *name {
+ None => None,
+ Some(ref boxed_str) => Some(boxed_str.to_string()),
+ };
+ self.compiled.captures.push(name.clone());
+ if let Some(name) = name {
+ self.capture_name_idx.insert(name, index as usize);
+ }
+ }
+ self.c_capture(2 * index as usize, sub)
+ }
Concat(ref es) => {
if self.compiled.is_reverse {
self.c_concat(es.iter().rev())
@@ -420,21 +441,19 @@ impl Compiler {
}
fn c_dotstar(&mut self) -> Result {
- Ok(if !self.compiled.only_utf8() {
- self.c(&Hir::repetition(hir::Repetition {
- kind: hir::RepetitionKind::ZeroOrMore,
- greedy: false,
- hir: Box::new(Hir::any(true)),
- }))?
- .unwrap()
+ let hir = if self.compiled.only_utf8() {
+ Hir::dot(hir::Dot::AnyChar)
} else {
- self.c(&Hir::repetition(hir::Repetition {
- kind: hir::RepetitionKind::ZeroOrMore,
+ Hir::dot(hir::Dot::AnyByte)
+ };
+ Ok(self
+ .c(&Hir::repetition(hir::Repetition {
+ min: 0,
+ max: None,
greedy: false,
- hir: Box::new(Hir::any(false)),
+ sub: Box::new(hir),
}))?
- .unwrap()
- })
+ .unwrap())
}
fn c_char(&mut self, c: char) -> ResultOrEmpty {
@@ -457,7 +476,11 @@ impl Compiler {
fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
use std::mem::size_of;
- assert!(!ranges.is_empty());
+ if ranges.is_empty() {
+ return Err(Error::Syntax(
+ "empty character classes are not allowed".to_string(),
+ ));
+ }
if self.compiled.uses_bytes() {
Ok(Some(CompileClass { c: self, ranges }.compile()?))
} else {
@@ -482,7 +505,11 @@ impl Compiler {
&mut self,
ranges: &[hir::ClassBytesRange],
) -> ResultOrEmpty {
- debug_assert!(!ranges.is_empty());
+ if ranges.is_empty() {
+ return Err(Error::Syntax(
+ "empty character classes are not allowed".to_string(),
+ ));
+ }
let first_split_entry = self.insts.len();
let mut holes = vec![];
@@ -513,6 +540,52 @@ impl Compiler {
Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
}
+ fn c_literal(&mut self, bytes: &[u8]) -> ResultOrEmpty {
+ match core::str::from_utf8(bytes) {
+ Ok(string) => {
+ let mut it = string.chars();
+ let Patch { mut hole, entry } = loop {
+ match it.next() {
+ None => return self.c_empty(),
+ Some(ch) => {
+ if let Some(p) = self.c_char(ch)? {
+ break p;
+ }
+ }
+ }
+ };
+ for ch in it {
+ if let Some(p) = self.c_char(ch)? {
+ self.fill(hole, p.entry);
+ hole = p.hole;
+ }
+ }
+ Ok(Some(Patch { hole, entry }))
+ }
+ Err(_) => {
+ assert!(self.compiled.uses_bytes());
+ let mut it = bytes.iter().copied();
+ let Patch { mut hole, entry } = loop {
+ match it.next() {
+ None => return self.c_empty(),
+ Some(byte) => {
+ if let Some(p) = self.c_byte(byte)? {
+ break p;
+ }
+ }
+ }
+ };
+ for byte in it {
+ if let Some(p) = self.c_byte(byte)? {
+ self.fill(hole, p.entry);
+ hole = p.hole;
+ }
+ }
+ Ok(Some(Patch { hole, entry }))
+ }
+ }
+ }
+
fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
where
I: IntoIterator<Item = &'a Hir>,
@@ -587,19 +660,15 @@ impl Compiler {
}
fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
- use regex_syntax::hir::RepetitionKind::*;
- match rep.kind {
- ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
- ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
- OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
- Range(hir::RepetitionRange::Exactly(min_max)) => {
- self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
- }
- Range(hir::RepetitionRange::AtLeast(min)) => {
- self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
+ match (rep.min, rep.max) {
+ (0, Some(1)) => self.c_repeat_zero_or_one(&rep.sub, rep.greedy),
+ (0, None) => self.c_repeat_zero_or_more(&rep.sub, rep.greedy),
+ (1, None) => self.c_repeat_one_or_more(&rep.sub, rep.greedy),
+ (min, None) => {
+ self.c_repeat_range_min_or_more(&rep.sub, rep.greedy, min)
}
- Range(hir::RepetitionRange::Bounded(min, max)) => {
- self.c_repeat_range(&rep.hir, rep.greedy, min, max)
+ (min, Some(max)) => {
+ self.c_repeat_range(&rep.sub, rep.greedy, min, max)
}
}
}