diff options
Diffstat (limited to 'vendor/regex-syntax/src/ast/parse.rs')
-rw-r--r-- | vendor/regex-syntax/src/ast/parse.rs | 442 |
1 files changed, 321 insertions, 121 deletions
diff --git a/vendor/regex-syntax/src/ast/parse.rs b/vendor/regex-syntax/src/ast/parse.rs index 6e9c9aca0..9cf64e9ec 100644 --- a/vendor/regex-syntax/src/ast/parse.rs +++ b/vendor/regex-syntax/src/ast/parse.rs @@ -2,17 +2,26 @@ This module provides a regular expression parser. */ -use std::borrow::Borrow; -use std::cell::{Cell, RefCell}; -use std::mem; -use std::result; - -use crate::ast::{self, Ast, Position, Span}; -use crate::either::Either; - -use crate::is_meta_character; - -type Result<T> = result::Result<T, ast::Error>; +use core::{ + borrow::Borrow, + cell::{Cell, RefCell}, + mem, +}; + +use alloc::{ + boxed::Box, + string::{String, ToString}, + vec, + vec::Vec, +}; + +use crate::{ + ast::{self, Ast, Position, Span}, + either::Either, + is_escapeable_character, is_meta_character, +}; + +type Result<T> = core::result::Result<T, ast::Error>; /// A primitive is an expression with no sub-expressions. This includes /// literals, assertions and non-set character classes. This representation @@ -100,11 +109,11 @@ fn is_hex(c: char) -> bool { /// If `first` is true, then `c` is treated as the first character in the /// group name (which must be alphabetic or underscore). fn is_capture_char(c: char, first: bool) -> bool { - c == '_' - || (!first - && (('0' <= c && c <= '9') || c == '.' || c == '[' || c == ']')) - || ('A' <= c && c <= 'Z') - || ('a' <= c && c <= 'z') + if first { + c == '_' || c.is_alphabetic() + } else { + c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric() + } } /// A builder for a regular expression parser. @@ -162,7 +171,7 @@ impl ParserBuilder { /// constant stack space and moving the call stack to the heap), other /// crates may. /// - /// This limit is not checked until the entire Ast is parsed. Therefore, + /// This limit is not checked until the entire AST is parsed. Therefore, /// if callers want to put a limit on the amount of heap space used, then /// they should impose a limit on the length, in bytes, of the concrete /// pattern string. In particular, this is viable since this parser @@ -220,8 +229,7 @@ impl ParserBuilder { /// abstract syntax tree. The size of the tree is proportional to the length /// of the regular expression pattern. /// -/// A `Parser` can be configured in more detail via a -/// [`ParserBuilder`](struct.ParserBuilder.html). +/// A `Parser` can be configured in more detail via a [`ParserBuilder`]. #[derive(Clone, Debug)] pub struct Parser { /// The current position of the parser. @@ -327,8 +335,7 @@ impl Parser { /// The parser can be run with either the `parse` or `parse_with_comments` /// methods. The parse methods return an abstract syntax tree. /// - /// To set configuration options on the parser, use - /// [`ParserBuilder`](struct.ParserBuilder.html). + /// To set configuration options on the parser, use [`ParserBuilder`]. pub fn new() -> Parser { ParserBuilder::new().build() } @@ -1195,12 +1202,16 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { )); } let inner_span = self.span(); - if self.bump_if("?P<") { + let mut starts_with_p = true; + if self.bump_if("?P<") || { + starts_with_p = false; + self.bump_if("?<") + } { let capture_index = self.next_capture_index(open_span)?; - let cap = self.parse_capture_name(capture_index)?; + let name = self.parse_capture_name(capture_index)?; Ok(Either::Right(ast::Group { span: open_span, - kind: ast::GroupKind::CaptureName(cap), + kind: ast::GroupKind::CaptureName { starts_with_p, name }, ast: Box::new(Ast::Empty(self.span())), })) } else if self.bump_if("?") { @@ -1370,6 +1381,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { 's' => Ok(ast::Flag::DotMatchesNewLine), 'U' => Ok(ast::Flag::SwapGreed), 'u' => Ok(ast::Flag::Unicode), + 'R' => Ok(ast::Flag::CRLF), 'x' => Ok(ast::Flag::IgnoreWhitespace), _ => { Err(self @@ -1483,7 +1495,14 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { if is_meta_character(c) { return Ok(Primitive::Literal(ast::Literal { span, - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, + c, + })); + } + if is_escapeable_character(c) { + return Ok(Primitive::Literal(ast::Literal { + span, + kind: ast::LiteralKind::Superfluous, c, })); } @@ -1501,9 +1520,6 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'), 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'), 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'), - ' ' if self.ignore_whitespace() => { - special(ast::SpecialLiteralKind::Space, ' ') - } 'A' => Ok(Primitive::Assertion(ast::Assertion { span, kind: ast::AssertionKind::StartText, @@ -1533,9 +1549,6 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { /// Assuming the preconditions are met, this routine can never fail. #[inline(never)] fn parse_octal(&self) -> ast::Literal { - use std::char; - use std::u32; - assert!(self.parser().octal); assert!('0' <= self.char() && self.char() <= '7'); let start = self.pos(); @@ -1600,9 +1613,6 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { &self, kind: ast::HexLiteralKind, ) -> Result<ast::Literal> { - use std::char; - use std::u32; - let mut scratch = self.parser().scratch.borrow_mut(); scratch.clear(); @@ -1646,9 +1656,6 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { &self, kind: ast::HexLiteralKind, ) -> Result<ast::Literal> { - use std::char; - use std::u32; - let mut scratch = self.parser().scratch.borrow_mut(); scratch.clear(); @@ -2146,7 +2153,7 @@ impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> { let new = self.depth.checked_add(1).ok_or_else(|| { self.p.error( span.clone(), - ast::ErrorKind::NestLimitExceeded(::std::u32::MAX), + ast::ErrorKind::NestLimitExceeded(u32::MAX), ) })?; let limit = self.p.parser().nest_limit; @@ -2297,11 +2304,14 @@ fn specialize_err<T>( #[cfg(test)] mod tests { - use std::ops::Range; + use core::ops::Range; + + use alloc::format; - use super::{Parser, ParserBuilder, ParserI, Primitive}; use crate::ast::{self, Ast, Position, Span}; + use super::*; + // Our own assert_eq, which has slightly better formatting (but honestly // still kind of crappy). macro_rules! assert_eq { @@ -2414,13 +2424,9 @@ mod tests { lit_with(c, span(start..start + c.len_utf8())) } - /// Create a punctuation literal starting at the given position. - fn punct_lit(c: char, span: Span) -> Ast { - Ast::Literal(ast::Literal { - span, - kind: ast::LiteralKind::Punctuation, - c, - }) + /// Create a meta literal starting at the given position. + fn meta_lit(c: char, span: Span) -> Ast { + Ast::Literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) } /// Create a verbatim literal with the given span. @@ -2704,24 +2710,24 @@ bar Ok(concat( 0..36, vec![ - punct_lit('\\', span(0..2)), - punct_lit('.', span(2..4)), - punct_lit('+', span(4..6)), - punct_lit('*', span(6..8)), - punct_lit('?', span(8..10)), - punct_lit('(', span(10..12)), - punct_lit(')', span(12..14)), - punct_lit('|', span(14..16)), - punct_lit('[', span(16..18)), - punct_lit(']', span(18..20)), - punct_lit('{', span(20..22)), - punct_lit('}', span(22..24)), - punct_lit('^', span(24..26)), - punct_lit('$', span(26..28)), - punct_lit('#', span(28..30)), - punct_lit('&', span(30..32)), - punct_lit('-', span(32..34)), - punct_lit('~', span(34..36)), + meta_lit('\\', span(0..2)), + meta_lit('.', span(2..4)), + meta_lit('+', span(4..6)), + meta_lit('*', span(6..8)), + meta_lit('?', span(8..10)), + meta_lit('(', span(10..12)), + meta_lit(')', span(12..14)), + meta_lit('|', span(14..16)), + meta_lit('[', span(16..18)), + meta_lit(']', span(18..20)), + meta_lit('{', span(20..22)), + meta_lit('}', span(22..24)), + meta_lit('^', span(24..26)), + meta_lit('$', span(26..28)), + meta_lit('#', span(28..30)), + meta_lit('&', span(30..32)), + meta_lit('-', span(32..34)), + meta_lit('~', span(34..36)), ] )) ); @@ -2799,11 +2805,14 @@ bar flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), Ast::Group(ast::Group { span: span_range(pat, 4..pat.len()), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span_range(pat, 9..12), - name: s("foo"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span_range(pat, 9..12), + name: s("foo"), + index: 1, + } + }, ast: Box::new(lit_with('a', span_range(pat, 14..15))), }), ] @@ -2870,23 +2879,12 @@ bar flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), Ast::Literal(ast::Literal { span: span_range(pat, 4..6), - kind: ast::LiteralKind::Special( - ast::SpecialLiteralKind::Space - ), + kind: ast::LiteralKind::Superfluous, c: ' ', }), ] )) ); - // ... but only when `x` mode is enabled. - let pat = r"\ "; - assert_eq!( - parser(pat).parse().unwrap_err(), - TestError { - span: span_range(pat, 0..2), - kind: ast::ErrorKind::EscapeUnrecognized, - } - ); } #[test] @@ -3819,14 +3817,32 @@ bar #[test] fn parse_capture_name() { assert_eq!( + parser("(?<a>z)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..7), + kind: ast::GroupKind::CaptureName { + starts_with_p: false, + name: ast::CaptureName { + span: span(3..4), + name: s("a"), + index: 1, + } + }, + ast: Box::new(lit('z', 5)), + })) + ); + assert_eq!( parser("(?P<a>z)").parse(), Ok(Ast::Group(ast::Group { span: span(0..8), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..5), - name: s("a"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..5), + name: s("a"), + index: 1, + } + }, ast: Box::new(lit('z', 6)), })) ); @@ -3834,11 +3850,14 @@ bar parser("(?P<abc>z)").parse(), Ok(Ast::Group(ast::Group { span: span(0..10), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..7), - name: s("abc"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..7), + name: s("abc"), + index: 1, + } + }, ast: Box::new(lit('z', 8)), })) ); @@ -3847,11 +3866,14 @@ bar parser("(?P<a_1>z)").parse(), Ok(Ast::Group(ast::Group { span: span(0..10), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..7), - name: s("a_1"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..7), + name: s("a_1"), + index: 1, + } + }, ast: Box::new(lit('z', 8)), })) ); @@ -3860,11 +3882,14 @@ bar parser("(?P<a.1>z)").parse(), Ok(Ast::Group(ast::Group { span: span(0..10), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..7), - name: s("a.1"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..7), + name: s("a.1"), + index: 1, + } + }, ast: Box::new(lit('z', 8)), })) ); @@ -3873,16 +3898,68 @@ bar parser("(?P<a[1]>z)").parse(), Ok(Ast::Group(ast::Group { span: span(0..11), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..8), - name: s("a[1]"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..8), + name: s("a[1]"), + index: 1, + } + }, ast: Box::new(lit('z', 9)), })) ); assert_eq!( + parser("(?P<a¾>)").parse(), + Ok(Ast::Group(ast::Group { + span: Span::new( + Position::new(0, 1, 1), + Position::new(9, 1, 9), + ), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: Span::new( + Position::new(4, 1, 5), + Position::new(7, 1, 7), + ), + name: s("a¾"), + index: 1, + } + }, + ast: Box::new(Ast::Empty(Span::new( + Position::new(8, 1, 8), + Position::new(8, 1, 8), + ))), + })) + ); + assert_eq!( + parser("(?P<名字>)").parse(), + Ok(Ast::Group(ast::Group { + span: Span::new( + Position::new(0, 1, 1), + Position::new(12, 1, 9), + ), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: Span::new( + Position::new(4, 1, 5), + Position::new(10, 1, 7), + ), + name: s("名字"), + index: 1, + } + }, + ast: Box::new(Ast::Empty(Span::new( + Position::new(11, 1, 8), + Position::new(11, 1, 8), + ))), + })) + ); + + assert_eq!( parser("(?P<").parse().unwrap_err(), TestError { span: span(4..4), @@ -3940,6 +4017,60 @@ bar }, } ); + assert_eq!( + parser("(?P<5>)").parse().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<5a>)").parse().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<¾>)").parse().unwrap_err(), + TestError { + span: Span::new( + Position::new(4, 1, 5), + Position::new(6, 1, 6), + ), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<¾a>)").parse().unwrap_err(), + TestError { + span: Span::new( + Position::new(4, 1, 5), + Position::new(6, 1, 6), + ), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<☃>)").parse().unwrap_err(), + TestError { + span: Span::new( + Position::new(4, 1, 5), + Position::new(7, 1, 6), + ), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<a☃>)").parse().unwrap_err(), + TestError { + span: Span::new( + Position::new(5, 1, 6), + Position::new(8, 1, 7), + ), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); } #[test] @@ -4046,6 +4177,34 @@ bar ], }) ); + assert_eq!( + parser("i-sR:").parse_flags(), + Ok(ast::Flags { + span: span(0..4), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Negation, + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine + ), + }, + ast::FlagsItem { + span: span(3..4), + kind: ast::FlagsItemKind::Flag(ast::Flag::CRLF), + }, + ], + }) + ); assert_eq!( parser("isU").parse_flags().unwrap_err(), @@ -4107,6 +4266,7 @@ bar assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine)); assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed)); assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode)); + assert_eq!(parser("R").parse_flag(), Ok(ast::Flag::CRLF)); assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace)); assert_eq!( @@ -4178,7 +4338,7 @@ bar parser(r"\|").parse_primitive(), Ok(Primitive::Literal(ast::Literal { span: span(0..2), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: '|', })) ); @@ -4229,11 +4389,26 @@ bar })) ); + // We also support superfluous escapes in most cases now too. + for c in ['!', '@', '%', '"', '\'', '/', ' '] { + let pat = format!(r"\{}", c); + assert_eq!( + parser(&pat).parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..2), + kind: ast::LiteralKind::Superfluous, + c, + })) + ); + } + + // Some superfluous escapes, namely [0-9A-Za-z], are still banned. This + // gives flexibility for future evolution. assert_eq!( - parser(r"\").parse_escape().unwrap_err(), + parser(r"\e").parse_escape().unwrap_err(), TestError { - span: span(0..1), - kind: ast::ErrorKind::EscapeUnexpectedEof, + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, } ); assert_eq!( @@ -4243,6 +4418,31 @@ bar kind: ast::ErrorKind::EscapeUnrecognized, } ); + // But also, < and > are banned, so that we may evolve them into + // start/end word boundary assertions. (Not sure if we will...) + assert_eq!( + parser(r"\<").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + } + ); + assert_eq!( + parser(r"\>").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + } + ); + + // An unfinished escape is illegal. + assert_eq!( + parser(r"\").parse_escape().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); } #[test] @@ -4272,7 +4472,7 @@ bar Ok(Primitive::Literal(ast::Literal { span: span(0..pat.len()), kind: ast::LiteralKind::Octal, - c: ::std::char::from_u32(i).unwrap(), + c: char::from_u32(i).unwrap(), })) ); } @@ -4347,7 +4547,7 @@ bar Ok(Primitive::Literal(ast::Literal { span: span(0..pat.len()), kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X), - c: ::std::char::from_u32(i).unwrap(), + c: char::from_u32(i).unwrap(), })) ); } @@ -4378,7 +4578,7 @@ bar #[test] fn parse_hex_four() { for i in 0..65536 { - let c = match ::std::char::from_u32(i) { + let c = match char::from_u32(i) { None => continue, Some(c) => c, }; @@ -4442,7 +4642,7 @@ bar #[test] fn parse_hex_eight() { for i in 0..65536 { - let c = match ::std::char::from_u32(i) { + let c = match char::from_u32(i) { None => continue, Some(c) => c, }; @@ -4839,7 +5039,7 @@ bar lit(span(1..2), 'a'), ast::ClassSetItem::Literal(ast::Literal { span: span(2..4), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: ']', }), ] @@ -4857,7 +5057,7 @@ bar lit(span(1..2), 'a'), ast::ClassSetItem::Literal(ast::Literal { span: span(2..4), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: '-', }), lit(span(4..5), 'z'), @@ -5049,7 +5249,7 @@ bar span(1..6), itemset(ast::ClassSetItem::Literal(ast::Literal { span: span(1..3), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: '^', })), itemset(lit(span(5..6), '^')), @@ -5065,7 +5265,7 @@ bar span(1..6), itemset(ast::ClassSetItem::Literal(ast::Literal { span: span(1..3), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: '&', })), itemset(lit(span(5..6), '&')), @@ -5130,7 +5330,7 @@ bar lit(span(1..2), ']'), ast::ClassSetItem::Literal(ast::Literal { span: span(2..4), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: '[', }), ] @@ -5148,7 +5348,7 @@ bar kind: itemset(ast::ClassSetItem::Literal( ast::Literal { span: span(1..3), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: '[', } )), |