summaryrefslogtreecommitdiffstats
path: root/vendor/regex-syntax/src/hir/translate.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/regex-syntax/src/hir/translate.rs')
-rw-r--r--vendor/regex-syntax/src/hir/translate.rs297
1 files changed, 219 insertions, 78 deletions
diff --git a/vendor/regex-syntax/src/hir/translate.rs b/vendor/regex-syntax/src/hir/translate.rs
index ff9c5ee91..313a1e9e8 100644
--- a/vendor/regex-syntax/src/hir/translate.rs
+++ b/vendor/regex-syntax/src/hir/translate.rs
@@ -19,6 +19,7 @@ type Result<T> = core::result::Result<T, Error>;
#[derive(Clone, Debug)]
pub struct TranslatorBuilder {
utf8: bool,
+ line_terminator: u8,
flags: Flags,
}
@@ -31,7 +32,11 @@ impl Default for TranslatorBuilder {
impl TranslatorBuilder {
/// Create a new translator builder with a default c onfiguration.
pub fn new() -> TranslatorBuilder {
- TranslatorBuilder { utf8: true, flags: Flags::default() }
+ TranslatorBuilder {
+ utf8: true,
+ line_terminator: b'\n',
+ flags: Flags::default(),
+ }
}
/// Build a translator using the current configuration.
@@ -40,6 +45,7 @@ impl TranslatorBuilder {
stack: RefCell::new(vec![]),
flags: Cell::new(self.flags),
utf8: self.utf8,
+ line_terminator: self.line_terminator,
}
}
@@ -63,6 +69,31 @@ impl TranslatorBuilder {
self
}
+ /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
+ ///
+ /// Namely, instead of `.` (by default) matching everything except for `\n`,
+ /// this will cause `.` to match everything except for the byte given.
+ ///
+ /// If `.` is used in a context where Unicode mode is enabled and this byte
+ /// isn't ASCII, then an error will be returned. When Unicode mode is
+ /// disabled, then any byte is permitted, but will return an error if UTF-8
+ /// mode is enabled and it is a non-ASCII byte.
+ ///
+ /// In short, any ASCII value for a line terminator is always okay. But a
+ /// non-ASCII byte might result in an error depending on whether Unicode
+ /// mode or UTF-8 mode are enabled.
+ ///
+ /// Note that if `R` mode is enabled then it always takes precedence and
+ /// the line terminator will be treated as `\r` and `\n` simultaneously.
+ ///
+ /// Note also that this *doesn't* impact the look-around assertions
+ /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
+ /// configuration in the regex engine itself.
+ pub fn line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder {
+ self.line_terminator = byte;
+ self
+ }
+
/// Enable or disable the case insensitive flag (`i`) by default.
pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
self.flags.case_insensitive = if yes { Some(true) } else { None };
@@ -120,6 +151,8 @@ pub struct Translator {
flags: Cell<Flags>,
/// Whether we're allowed to produce HIR that can match arbitrary bytes.
utf8: bool,
+ /// The line terminator to use for `.`.
+ line_terminator: u8,
}
impl Translator {
@@ -304,7 +337,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
match *ast {
- Ast::Class(ast::Class::Bracketed(_)) => {
+ Ast::ClassBracketed(_) => {
if self.flags().unicode() {
let cls = hir::ClassUnicode::empty();
self.push(HirFrame::ClassUnicode(cls));
@@ -321,14 +354,14 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
.unwrap_or_else(|| self.flags());
self.push(HirFrame::Group { old_flags });
}
- Ast::Concat(ref x) if x.asts.is_empty() => {}
Ast::Concat(_) => {
self.push(HirFrame::Concat);
}
- Ast::Alternation(ref x) if x.asts.is_empty() => {}
- Ast::Alternation(_) => {
+ Ast::Alternation(ref x) => {
self.push(HirFrame::Alternation);
- self.push(HirFrame::AlternationBranch);
+ if !x.asts.is_empty() {
+ self.push(HirFrame::AlternationBranch);
+ }
}
_ => {}
}
@@ -353,29 +386,20 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
// consistency sake.
self.push(HirFrame::Expr(Hir::empty()));
}
- Ast::Literal(ref x) => {
- match self.ast_literal_to_scalar(x)? {
- Either::Right(byte) => self.push_byte(byte),
- Either::Left(ch) => {
- if !self.flags().unicode() && ch.len_utf8() > 1 {
- return Err(self
- .error(x.span, ErrorKind::UnicodeNotAllowed));
- }
- match self.case_fold_char(x.span, ch)? {
- None => self.push_char(ch),
- Some(expr) => self.push(HirFrame::Expr(expr)),
- }
- }
- }
- // self.push(HirFrame::Expr(self.hir_literal(x)?));
- }
- Ast::Dot(span) => {
- self.push(HirFrame::Expr(self.hir_dot(span)?));
+ Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? {
+ Either::Right(byte) => self.push_byte(byte),
+ Either::Left(ch) => match self.case_fold_char(x.span, ch)? {
+ None => self.push_char(ch),
+ Some(expr) => self.push(HirFrame::Expr(expr)),
+ },
+ },
+ Ast::Dot(ref span) => {
+ self.push(HirFrame::Expr(self.hir_dot(**span)?));
}
Ast::Assertion(ref x) => {
self.push(HirFrame::Expr(self.hir_assertion(x)?));
}
- Ast::Class(ast::Class::Perl(ref x)) => {
+ Ast::ClassPerl(ref x) => {
if self.flags().unicode() {
let cls = self.hir_perl_unicode_class(x)?;
let hcls = hir::Class::Unicode(cls);
@@ -386,11 +410,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
self.push(HirFrame::Expr(Hir::class(hcls)));
}
}
- Ast::Class(ast::Class::Unicode(ref x)) => {
+ Ast::ClassUnicode(ref x) => {
let cls = hir::Class::Unicode(self.hir_unicode_class(x)?);
self.push(HirFrame::Expr(Hir::class(cls)));
}
- Ast::Class(ast::Class::Bracketed(ref ast)) => {
+ Ast::ClassBracketed(ref ast) => {
if self.flags().unicode() {
let mut cls = self.pop().unwrap().unwrap_class_unicode();
self.unicode_fold_and_negate(
@@ -841,8 +865,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
})?;
Ok(Some(Hir::class(hir::Class::Unicode(cls))))
} else {
- if c.len_utf8() > 1 {
- return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
+ if !c.is_ascii() {
+ return Ok(None);
}
// If case folding won't do anything, then don't bother trying.
match c {
@@ -862,10 +886,38 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
}
fn hir_dot(&self, span: Span) -> Result<Hir> {
- if !self.flags().unicode() && self.trans().utf8 {
+ let (utf8, lineterm, flags) =
+ (self.trans().utf8, self.trans().line_terminator, self.flags());
+ if utf8 && (!flags.unicode() || !lineterm.is_ascii()) {
return Err(self.error(span, ErrorKind::InvalidUtf8));
}
- Ok(Hir::dot(self.flags().dot()))
+ let dot = if flags.dot_matches_new_line() {
+ if flags.unicode() {
+ hir::Dot::AnyChar
+ } else {
+ hir::Dot::AnyByte
+ }
+ } else {
+ if flags.unicode() {
+ if flags.crlf() {
+ hir::Dot::AnyCharExceptCRLF
+ } else {
+ if !lineterm.is_ascii() {
+ return Err(
+ self.error(span, ErrorKind::InvalidLineTerminator)
+ );
+ }
+ hir::Dot::AnyCharExcept(char::from(lineterm))
+ }
+ } else {
+ if flags.crlf() {
+ hir::Dot::AnyByteExceptCRLF
+ } else {
+ hir::Dot::AnyByteExcept(lineterm)
+ }
+ }
+ };
+ Ok(Hir::dot(dot))
}
fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
@@ -903,6 +955,34 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
} else {
hir::Look::WordAsciiNegate
}),
+ ast::AssertionKind::WordBoundaryStart
+ | ast::AssertionKind::WordBoundaryStartAngle => {
+ Hir::look(if unicode {
+ hir::Look::WordStartUnicode
+ } else {
+ hir::Look::WordStartAscii
+ })
+ }
+ ast::AssertionKind::WordBoundaryEnd
+ | ast::AssertionKind::WordBoundaryEndAngle => {
+ Hir::look(if unicode {
+ hir::Look::WordEndUnicode
+ } else {
+ hir::Look::WordEndAscii
+ })
+ }
+ ast::AssertionKind::WordBoundaryStartHalf => {
+ Hir::look(if unicode {
+ hir::Look::WordStartHalfUnicode
+ } else {
+ hir::Look::WordStartHalfAscii
+ })
+ }
+ ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode {
+ hir::Look::WordEndHalfUnicode
+ } else {
+ hir::Look::WordEndHalfAscii
+ }),
})
}
@@ -1124,9 +1204,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
match self.ast_literal_to_scalar(ast)? {
Either::Right(byte) => Ok(byte),
Either::Left(ch) => {
- let cp = u32::from(ch);
- if cp <= 0x7F {
- Ok(u8::try_from(cp).unwrap())
+ if ch.is_ascii() {
+ Ok(u8::try_from(ch).unwrap())
} else {
// We can't feasibly support Unicode in
// byte oriented classes. Byte classes don't
@@ -1209,30 +1288,6 @@ impl Flags {
}
}
- fn dot(&self) -> hir::Dot {
- if self.dot_matches_new_line() {
- if self.unicode() {
- hir::Dot::AnyChar
- } else {
- hir::Dot::AnyByte
- }
- } else {
- if self.unicode() {
- if self.crlf() {
- hir::Dot::AnyCharExceptCRLF
- } else {
- hir::Dot::AnyCharExceptLF
- }
- } else {
- if self.crlf() {
- hir::Dot::AnyByteExceptCRLF
- } else {
- hir::Dot::AnyByteExceptLF
- }
- }
- }
- }
-
fn case_insensitive(&self) -> bool {
self.case_insensitive.unwrap_or(false)
}
@@ -1598,16 +1653,7 @@ mod tests {
assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a"));
assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));
- assert_eq!(
- t_err("(?-u)☃"),
- TestError {
- kind: hir::ErrorKind::UnicodeNotAllowed,
- span: Span::new(
- Position::new(5, 1, 6),
- Position::new(8, 1, 7)
- ),
- }
- );
+ assert_eq!(t("(?-u)☃"), hir_lit("☃"));
assert_eq!(
t_err(r"(?-u)\xFF"),
TestError {
@@ -1685,16 +1731,7 @@ mod tests {
);
assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));
- assert_eq!(
- t_err("(?i-u)β"),
- TestError {
- kind: hir::ErrorKind::UnicodeNotAllowed,
- span: Span::new(
- Position::new(6, 1, 7),
- Position::new(8, 1, 8),
- ),
- }
- );
+ assert_eq!(t("(?i-u)β"), hir_lit("β"),);
}
#[test]
@@ -3489,6 +3526,15 @@ mod tests {
assert!(!props(r"(?:z|xx)@|xx").is_alternation_literal());
}
+ // This tests that the smart Hir::repetition constructors does some basic
+ // simplifications.
+ #[test]
+ fn smart_repetition() {
+ assert_eq!(t(r"a{0}"), Hir::empty());
+ assert_eq!(t(r"a{1}"), hir_lit("a"));
+ assert_eq!(t(r"\B{32111}"), hir_look(hir::Look::WordUnicodeNegate));
+ }
+
// This tests that the smart Hir::concat constructor simplifies the given
// exprs in a way we expect.
#[test]
@@ -3580,4 +3626,99 @@ mod tests {
]),
);
}
+
+ #[test]
+ fn regression_alt_empty_concat() {
+ use crate::ast::{self, Ast};
+
+ let span = Span::splat(Position::new(0, 0, 0));
+ let ast = Ast::alternation(ast::Alternation {
+ span,
+ asts: vec![Ast::concat(ast::Concat { span, asts: vec![] })],
+ });
+
+ let mut t = Translator::new();
+ assert_eq!(Ok(Hir::empty()), t.translate("", &ast));
+ }
+
+ #[test]
+ fn regression_empty_alt() {
+ use crate::ast::{self, Ast};
+
+ let span = Span::splat(Position::new(0, 0, 0));
+ let ast = Ast::concat(ast::Concat {
+ span,
+ asts: vec![Ast::alternation(ast::Alternation {
+ span,
+ asts: vec![],
+ })],
+ });
+
+ let mut t = Translator::new();
+ assert_eq!(Ok(Hir::fail()), t.translate("", &ast));
+ }
+
+ #[test]
+ fn regression_singleton_alt() {
+ use crate::{
+ ast::{self, Ast},
+ hir::Dot,
+ };
+
+ let span = Span::splat(Position::new(0, 0, 0));
+ let ast = Ast::concat(ast::Concat {
+ span,
+ asts: vec![Ast::alternation(ast::Alternation {
+ span,
+ asts: vec![Ast::dot(span)],
+ })],
+ });
+
+ let mut t = Translator::new();
+ assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast));
+ }
+
+ // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168
+ #[test]
+ fn regression_fuzz_match() {
+ let pat = "[(\u{6} \0-\u{afdf5}] \0 ";
+ let ast = ParserBuilder::new()
+ .octal(false)
+ .ignore_whitespace(true)
+ .build()
+ .parse(pat)
+ .unwrap();
+ let hir = TranslatorBuilder::new()
+ .utf8(true)
+ .case_insensitive(false)
+ .multi_line(false)
+ .dot_matches_new_line(false)
+ .swap_greed(true)
+ .unicode(true)
+ .build()
+ .translate(pat, &ast)
+ .unwrap();
+ assert_eq!(
+ hir,
+ Hir::concat(vec![
+ hir_uclass(&[('\0', '\u{afdf5}')]),
+ hir_lit("\0"),
+ ])
+ );
+ }
+
+ // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155
+ #[cfg(feature = "unicode")]
+ #[test]
+ fn regression_fuzz_difference1() {
+ let pat = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*";
+ let _ = t(pat); // shouldn't panic
+ }
+
+ // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153
+ #[test]
+ fn regression_fuzz_char_decrement1() {
+ let pat = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0]<D\0\0\0\0\0\0\u{1}]\0\0\0\0]\0\0-*\0]\0\0 ";
+ let _ = t(pat); // shouldn't panic
+ }
}