/*! Defines an abstract syntax for regular expressions. */ use std::cmp::Ordering; use std::error; use std::fmt; pub use crate::ast::visitor::{visit, Visitor}; pub mod parse; pub mod print; mod visitor; /// An error that occurred while parsing a regular expression into an abstract /// syntax tree. /// /// Note that not all ASTs represents a valid regular expression. For example, /// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a /// valid Unicode property name. That particular error is reported when /// translating an AST to the high-level intermediate representation (`HIR`). #[derive(Clone, Debug, Eq, PartialEq)] pub struct Error { /// The kind of error. kind: ErrorKind, /// The original pattern that the parser generated the error from. Every /// span in an error is a valid range into this string. pattern: String, /// The span of this error. span: Span, } impl Error { /// Return the type of this error. pub fn kind(&self) -> &ErrorKind { &self.kind } /// The original pattern string in which this error occurred. /// /// Every span reported by this error is reported in terms of this string. pub fn pattern(&self) -> &str { &self.pattern } /// Return the span at which this error occurred. pub fn span(&self) -> &Span { &self.span } /// Return an auxiliary span. This span exists only for some errors that /// benefit from being able to point to two locations in the original /// regular expression. For example, "duplicate" errors will have the /// main error position set to the duplicate occurrence while its /// auxiliary span will be set to the initial occurrence. pub fn auxiliary_span(&self) -> Option<&Span> { use self::ErrorKind::*; match self.kind { FlagDuplicate { ref original } => Some(original), FlagRepeatedNegation { ref original, .. } => Some(original), GroupNameDuplicate { ref original, .. } => Some(original), _ => None, } } } /// The type of an error that occurred while building an AST. #[derive(Clone, Debug, Eq, PartialEq)] pub enum ErrorKind { /// The capturing group limit was exceeded. /// /// Note that this represents a limit on the total number of capturing /// groups in a regex and not necessarily the number of nested capturing /// groups. That is, the nest limit can be low and it is still possible for /// this error to occur. CaptureLimitExceeded, /// An invalid escape sequence was found in a character class set. ClassEscapeInvalid, /// An invalid character class range was found. An invalid range is any /// range where the start is greater than the end. ClassRangeInvalid, /// An invalid range boundary was found in a character class. Range /// boundaries must be a single literal codepoint, but this error indicates /// that something else was found, such as a nested class. ClassRangeLiteral, /// An opening `[` was found with no corresponding closing `]`. ClassUnclosed, /// Note that this error variant is no longer used. Namely, a decimal /// number can only appear as a repetition quantifier. When the number /// in a repetition quantifier is empty, then it gets its own specialized /// error, `RepetitionCountDecimalEmpty`. DecimalEmpty, /// An invalid decimal number was given where one was expected. DecimalInvalid, /// A bracketed hex literal was empty. EscapeHexEmpty, /// A bracketed hex literal did not correspond to a Unicode scalar value. EscapeHexInvalid, /// An invalid hexadecimal digit was found. EscapeHexInvalidDigit, /// EOF was found before an escape sequence was completed. EscapeUnexpectedEof, /// An unrecognized escape sequence. EscapeUnrecognized, /// A dangling negation was used when setting flags, e.g., `i-`. FlagDanglingNegation, /// A flag was used twice, e.g., `i-i`. FlagDuplicate { /// The position of the original flag. The error position /// points to the duplicate flag. original: Span, }, /// The negation operator was used twice, e.g., `-i-s`. FlagRepeatedNegation { /// The position of the original negation operator. The error position /// points to the duplicate negation operator. original: Span, }, /// Expected a flag but got EOF, e.g., `(?`. FlagUnexpectedEof, /// Unrecognized flag, e.g., `a`. FlagUnrecognized, /// A duplicate capture name was found. GroupNameDuplicate { /// The position of the initial occurrence of the capture name. The /// error position itself points to the duplicate occurrence. original: Span, }, /// A capture group name is empty, e.g., `(?P<>abc)`. GroupNameEmpty, /// An invalid character was seen for a capture group name. This includes /// errors where the first character is a digit (even though subsequent /// characters are allowed to be digits). GroupNameInvalid, /// A closing `>` could not be found for a capture group name. GroupNameUnexpectedEof, /// An unclosed group, e.g., `(ab`. /// /// The span of this error corresponds to the unclosed parenthesis. GroupUnclosed, /// An unopened group, e.g., `ab)`. GroupUnopened, /// The nest limit was exceeded. The limit stored here is the limit /// configured in the parser. NestLimitExceeded(u32), /// The range provided in a counted repetition operator is invalid. The /// range is invalid if the start is greater than the end. RepetitionCountInvalid, /// An opening `{` was not followed by a valid decimal value. /// For example, `x{}` or `x{]}` would fail. RepetitionCountDecimalEmpty, /// An opening `{` was found with no corresponding closing `}`. RepetitionCountUnclosed, /// A repetition operator was applied to a missing sub-expression. This /// occurs, for example, in the regex consisting of just a `*` or even /// `(?i)*`. It is, however, possible to create a repetition operating on /// an empty sub-expression. For example, `()*` is still considered valid. RepetitionMissing, /// The Unicode class is not valid. This typically occurs when a `\p` is /// followed by something other than a `{`. UnicodeClassInvalid, /// When octal support is disabled, this error is produced when an octal /// escape is used. The octal escape is assumed to be an invocation of /// a backreference, which is the common case. UnsupportedBackreference, /// When syntax similar to PCRE's look-around is used, this error is /// returned. Some example syntaxes that are rejected include, but are /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and /// `(? &str { use self::ErrorKind::*; match self.kind { CaptureLimitExceeded => "capture group limit exceeded", ClassEscapeInvalid => "invalid escape sequence in character class", ClassRangeInvalid => "invalid character class range", ClassRangeLiteral => "invalid range boundary, must be a literal", ClassUnclosed => "unclosed character class", DecimalEmpty => "empty decimal literal", DecimalInvalid => "invalid decimal literal", EscapeHexEmpty => "empty hexadecimal literal", EscapeHexInvalid => "invalid hexadecimal literal", EscapeHexInvalidDigit => "invalid hexadecimal digit", EscapeUnexpectedEof => "unexpected eof (escape sequence)", EscapeUnrecognized => "unrecognized escape sequence", FlagDanglingNegation => "dangling flag negation operator", FlagDuplicate { .. } => "duplicate flag", FlagRepeatedNegation { .. } => "repeated negation", FlagUnexpectedEof => "unexpected eof (flag)", FlagUnrecognized => "unrecognized flag", GroupNameDuplicate { .. } => "duplicate capture group name", GroupNameEmpty => "empty capture group name", GroupNameInvalid => "invalid capture group name", GroupNameUnexpectedEof => "unclosed capture group name", GroupUnclosed => "unclosed group", GroupUnopened => "unopened group", NestLimitExceeded(_) => "nest limit exceeded", RepetitionCountInvalid => "invalid repetition count range", RepetitionCountUnclosed => "unclosed counted repetition", RepetitionMissing => "repetition operator missing expression", UnicodeClassInvalid => "invalid Unicode character class", UnsupportedBackreference => "backreferences are not supported", UnsupportedLookAround => "look-around is not supported", _ => unreachable!(), } } } impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { crate::error::Formatter::from(self).fmt(f) } } impl fmt::Display for ErrorKind { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use self::ErrorKind::*; match *self { CaptureLimitExceeded => write!( f, "exceeded the maximum number of \ capturing groups ({})", ::std::u32::MAX ), ClassEscapeInvalid => { write!(f, "invalid escape sequence found in character class") } ClassRangeInvalid => write!( f, "invalid character class range, \ the start must be <= the end" ), ClassRangeLiteral => { write!(f, "invalid range boundary, must be a literal") } ClassUnclosed => write!(f, "unclosed character class"), DecimalEmpty => write!(f, "decimal literal empty"), DecimalInvalid => write!(f, "decimal literal invalid"), EscapeHexEmpty => write!(f, "hexadecimal literal empty"), EscapeHexInvalid => { write!(f, "hexadecimal literal is not a Unicode scalar value") } EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"), EscapeUnexpectedEof => write!( f, "incomplete escape sequence, \ reached end of pattern prematurely" ), EscapeUnrecognized => write!(f, "unrecognized escape sequence"), FlagDanglingNegation => { write!(f, "dangling flag negation operator") } FlagDuplicate { .. } => write!(f, "duplicate flag"), FlagRepeatedNegation { .. } => { write!(f, "flag negation operator repeated") } FlagUnexpectedEof => { write!(f, "expected flag but got end of regex") } FlagUnrecognized => write!(f, "unrecognized flag"), GroupNameDuplicate { .. } => { write!(f, "duplicate capture group name") } GroupNameEmpty => write!(f, "empty capture group name"), GroupNameInvalid => write!(f, "invalid capture group character"), GroupNameUnexpectedEof => write!(f, "unclosed capture group name"), GroupUnclosed => write!(f, "unclosed group"), GroupUnopened => write!(f, "unopened group"), NestLimitExceeded(limit) => write!( f, "exceed the maximum number of \ nested parentheses/brackets ({})", limit ), RepetitionCountInvalid => write!( f, "invalid repetition count range, \ the start must be <= the end" ), RepetitionCountDecimalEmpty => { write!(f, "repetition quantifier expects a valid decimal") } RepetitionCountUnclosed => { write!(f, "unclosed counted repetition") } RepetitionMissing => { write!(f, "repetition operator missing expression") } UnicodeClassInvalid => { write!(f, "invalid Unicode character class") } UnsupportedBackreference => { write!(f, "backreferences are not supported") } UnsupportedLookAround => write!( f, "look-around, including look-ahead and look-behind, \ is not supported" ), _ => unreachable!(), } } } /// Span represents the position information of a single AST item. /// /// All span positions are absolute byte offsets that can be used on the /// original regular expression that was parsed. #[derive(Clone, Copy, Eq, PartialEq)] pub struct Span { /// The start byte offset. pub start: Position, /// The end byte offset. pub end: Position, } impl fmt::Debug for Span { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "Span({:?}, {:?})", self.start, self.end) } } impl Ord for Span { fn cmp(&self, other: &Span) -> Ordering { (&self.start, &self.end).cmp(&(&other.start, &other.end)) } } impl PartialOrd for Span { fn partial_cmp(&self, other: &Span) -> Option { Some(self.cmp(other)) } } /// A single position in a regular expression. /// /// A position encodes one half of a span, and include the byte offset, line /// number and column number. #[derive(Clone, Copy, Eq, PartialEq)] pub struct Position { /// The absolute offset of this position, starting at `0` from the /// beginning of the regular expression pattern string. pub offset: usize, /// The line number, starting at `1`. pub line: usize, /// The approximate column number, starting at `1`. pub column: usize, } impl fmt::Debug for Position { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "Position(o: {:?}, l: {:?}, c: {:?})", self.offset, self.line, self.column ) } } impl Ord for Position { fn cmp(&self, other: &Position) -> Ordering { self.offset.cmp(&other.offset) } } impl PartialOrd for Position { fn partial_cmp(&self, other: &Position) -> Option { Some(self.cmp(other)) } } impl Span { /// Create a new span with the given positions. pub fn new(start: Position, end: Position) -> Span { Span { start, end } } /// Create a new span using the given position as the start and end. pub fn splat(pos: Position) -> Span { Span::new(pos, pos) } /// Create a new span by replacing the starting the position with the one /// given. pub fn with_start(self, pos: Position) -> Span { Span { start: pos, ..self } } /// Create a new span by replacing the ending the position with the one /// given. pub fn with_end(self, pos: Position) -> Span { Span { end: pos, ..self } } /// Returns true if and only if this span occurs on a single line. pub fn is_one_line(&self) -> bool { self.start.line == self.end.line } /// Returns true if and only if this span is empty. That is, it points to /// a single position in the concrete syntax of a regular expression. pub fn is_empty(&self) -> bool { self.start.offset == self.end.offset } } impl Position { /// Create a new position with the given information. /// /// `offset` is the absolute offset of the position, starting at `0` from /// the beginning of the regular expression pattern string. /// /// `line` is the line number, starting at `1`. /// /// `column` is the approximate column number, starting at `1`. pub fn new(offset: usize, line: usize, column: usize) -> Position { Position { offset, line, column } } } /// An abstract syntax tree for a singular expression along with comments /// found. /// /// Comments are not stored in the tree itself to avoid complexity. Each /// comment contains a span of precisely where it occurred in the original /// regular expression. #[derive(Clone, Debug, Eq, PartialEq)] pub struct WithComments { /// The actual ast. pub ast: Ast, /// All comments found in the original regular expression. pub comments: Vec, } /// A comment from a regular expression with an associated span. /// /// A regular expression can only contain comments when the `x` flag is /// enabled. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Comment { /// The span of this comment, including the beginning `#` and ending `\n`. pub span: Span, /// The comment text, starting with the first character following the `#` /// and ending with the last character preceding the `\n`. pub comment: String, } /// An abstract syntax tree for a single regular expression. /// /// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap /// space proportional to the size of the `Ast`. /// /// This type defines its own destructor that uses constant stack space and /// heap space proportional to the size of the `Ast`. #[derive(Clone, Debug, Eq, PartialEq)] pub enum Ast { /// An empty regex that matches everything. Empty(Span), /// A set of flags, e.g., `(?is)`. Flags(SetFlags), /// A single character literal, which includes escape sequences. Literal(Literal), /// The "any character" class. Dot(Span), /// A single zero-width assertion. Assertion(Assertion), /// A single character class. This includes all forms of character classes /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`. Class(Class), /// A repetition operator applied to an arbitrary regular expression. Repetition(Repetition), /// A grouped regular expression. Group(Group), /// An alternation of regular expressions. Alternation(Alternation), /// A concatenation of regular expressions. Concat(Concat), } impl Ast { /// Return the span of this abstract syntax tree. pub fn span(&self) -> &Span { match *self { Ast::Empty(ref span) => span, Ast::Flags(ref x) => &x.span, Ast::Literal(ref x) => &x.span, Ast::Dot(ref span) => span, Ast::Assertion(ref x) => &x.span, Ast::Class(ref x) => x.span(), Ast::Repetition(ref x) => &x.span, Ast::Group(ref x) => &x.span, Ast::Alternation(ref x) => &x.span, Ast::Concat(ref x) => &x.span, } } /// Return true if and only if this Ast is empty. pub fn is_empty(&self) -> bool { match *self { Ast::Empty(_) => true, _ => false, } } /// Returns true if and only if this AST has any (including possibly empty) /// subexpressions. fn has_subexprs(&self) -> bool { match *self { Ast::Empty(_) | Ast::Flags(_) | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) => false, Ast::Class(_) | Ast::Repetition(_) | Ast::Group(_) | Ast::Alternation(_) | Ast::Concat(_) => true, } } } /// Print a display representation of this Ast. /// /// This does not preserve any of the original whitespace formatting that may /// have originally been present in the concrete syntax from which this Ast /// was generated. /// /// This implementation uses constant stack space and heap space proportional /// to the size of the `Ast`. impl fmt::Display for Ast { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use crate::ast::print::Printer; Printer::new().print(self, f) } } /// An alternation of regular expressions. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Alternation { /// The span of this alternation. pub span: Span, /// The alternate regular expressions. pub asts: Vec, } impl Alternation { /// Return this alternation as an AST. /// /// If this alternation contains zero ASTs, then Ast::Empty is /// returned. If this alternation contains exactly 1 AST, then the /// corresponding AST is returned. Otherwise, Ast::Alternation is returned. pub fn into_ast(mut self) -> Ast { match self.asts.len() { 0 => Ast::Empty(self.span), 1 => self.asts.pop().unwrap(), _ => Ast::Alternation(self), } } } /// A concatenation of regular expressions. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Concat { /// The span of this concatenation. pub span: Span, /// The concatenation regular expressions. pub asts: Vec, } impl Concat { /// Return this concatenation as an AST. /// /// If this concatenation contains zero ASTs, then Ast::Empty is /// returned. If this concatenation contains exactly 1 AST, then the /// corresponding AST is returned. Otherwise, Ast::Concat is returned. pub fn into_ast(mut self) -> Ast { match self.asts.len() { 0 => Ast::Empty(self.span), 1 => self.asts.pop().unwrap(), _ => Ast::Concat(self), } } } /// A single literal expression. /// /// A literal corresponds to a single Unicode scalar value. Literals may be /// represented in their literal form, e.g., `a` or in their escaped form, /// e.g., `\x61`. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Literal { /// The span of this literal. pub span: Span, /// The kind of this literal. pub kind: LiteralKind, /// The Unicode scalar value corresponding to this literal. pub c: char, } impl Literal { /// If this literal was written as a `\x` hex escape, then this returns /// the corresponding byte value. Otherwise, this returns `None`. pub fn byte(&self) -> Option { let short_hex = LiteralKind::HexFixed(HexLiteralKind::X); if self.c as u32 <= 255 && self.kind == short_hex { Some(self.c as u8) } else { None } } } /// The kind of a single literal expression. #[derive(Clone, Debug, Eq, PartialEq)] pub enum LiteralKind { /// The literal is written verbatim, e.g., `a` or `☃`. Verbatim, /// The literal is written as an escape because it is punctuation, e.g., /// `\*` or `\[`. Punctuation, /// The literal is written as an octal escape, e.g., `\141`. Octal, /// The literal is written as a hex code with a fixed number of digits /// depending on the type of the escape, e.g., `\x61` or or `\u0061` or /// `\U00000061`. HexFixed(HexLiteralKind), /// The literal is written as a hex code with a bracketed number of /// digits. The only restriction is that the bracketed hex code must refer /// to a valid Unicode scalar value. HexBrace(HexLiteralKind), /// The literal is written as a specially recognized escape, e.g., `\f` /// or `\n`. Special(SpecialLiteralKind), } /// The type of a special literal. /// /// A special literal is a special escape sequence recognized by the regex /// parser, e.g., `\f` or `\n`. #[derive(Clone, Debug, Eq, PartialEq)] pub enum SpecialLiteralKind { /// Bell, spelled `\a` (`\x07`). Bell, /// Form feed, spelled `\f` (`\x0C`). FormFeed, /// Tab, spelled `\t` (`\x09`). Tab, /// Line feed, spelled `\n` (`\x0A`). LineFeed, /// Carriage return, spelled `\r` (`\x0D`). CarriageReturn, /// Vertical tab, spelled `\v` (`\x0B`). VerticalTab, /// Space, spelled `\ ` (`\x20`). Note that this can only appear when /// parsing in verbose mode. Space, } /// The type of a Unicode hex literal. /// /// Note that all variants behave the same when used with brackets. They only /// differ when used without brackets in the number of hex digits that must /// follow. #[derive(Clone, Debug, Eq, PartialEq)] pub enum HexLiteralKind { /// A `\x` prefix. When used without brackets, this form is limited to /// two digits. X, /// A `\u` prefix. When used without brackets, this form is limited to /// four digits. UnicodeShort, /// A `\U` prefix. When used without brackets, this form is limited to /// eight digits. UnicodeLong, } impl HexLiteralKind { /// The number of digits that must be used with this literal form when /// used without brackets. When used with brackets, there is no /// restriction on the number of digits. pub fn digits(&self) -> u32 { match *self { HexLiteralKind::X => 2, HexLiteralKind::UnicodeShort => 4, HexLiteralKind::UnicodeLong => 8, } } } /// A single character class expression. #[derive(Clone, Debug, Eq, PartialEq)] pub enum Class { /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. Unicode(ClassUnicode), /// A perl character class, e.g., `\d` or `\W`. Perl(ClassPerl), /// A bracketed character class set, which may contain zero or more /// character ranges and/or zero or more nested classes. e.g., /// `[a-zA-Z\pL]`. Bracketed(ClassBracketed), } impl Class { /// Return the span of this character class. pub fn span(&self) -> &Span { match *self { Class::Perl(ref x) => &x.span, Class::Unicode(ref x) => &x.span, Class::Bracketed(ref x) => &x.span, } } } /// A Perl character class. #[derive(Clone, Debug, Eq, PartialEq)] pub struct ClassPerl { /// The span of this class. pub span: Span, /// The kind of Perl class. pub kind: ClassPerlKind, /// Whether the class is negated or not. e.g., `\d` is not negated but /// `\D` is. pub negated: bool, } /// The available Perl character classes. #[derive(Clone, Debug, Eq, PartialEq)] pub enum ClassPerlKind { /// Decimal numbers. Digit, /// Whitespace. Space, /// Word characters. Word, } /// An ASCII character class. #[derive(Clone, Debug, Eq, PartialEq)] pub struct ClassAscii { /// The span of this class. pub span: Span, /// The kind of ASCII class. pub kind: ClassAsciiKind, /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated /// but `[[:^alpha:]]` is. pub negated: bool, } /// The available ASCII character classes. #[derive(Clone, Debug, Eq, PartialEq)] pub enum ClassAsciiKind { /// `[0-9A-Za-z]` Alnum, /// `[A-Za-z]` Alpha, /// `[\x00-\x7F]` Ascii, /// `[ \t]` Blank, /// `[\x00-\x1F\x7F]` Cntrl, /// `[0-9]` Digit, /// `[!-~]` Graph, /// `[a-z]` Lower, /// `[ -~]` Print, /// `[!-/:-@\[-`{-~]` Punct, /// `[\t\n\v\f\r ]` Space, /// `[A-Z]` Upper, /// `[0-9A-Za-z_]` Word, /// `[0-9A-Fa-f]` Xdigit, } impl ClassAsciiKind { /// Return the corresponding ClassAsciiKind variant for the given name. /// /// The name given should correspond to the lowercase version of the /// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`. /// /// If no variant with the corresponding name exists, then `None` is /// returned. pub fn from_name(name: &str) -> Option { use self::ClassAsciiKind::*; match name { "alnum" => Some(Alnum), "alpha" => Some(Alpha), "ascii" => Some(Ascii), "blank" => Some(Blank), "cntrl" => Some(Cntrl), "digit" => Some(Digit), "graph" => Some(Graph), "lower" => Some(Lower), "print" => Some(Print), "punct" => Some(Punct), "space" => Some(Space), "upper" => Some(Upper), "word" => Some(Word), "xdigit" => Some(Xdigit), _ => None, } } } /// A Unicode character class. #[derive(Clone, Debug, Eq, PartialEq)] pub struct ClassUnicode { /// The span of this class. pub span: Span, /// Whether this class is negated or not. /// /// Note: be careful when using this attribute. This specifically refers /// to whether the class is written as `\p` or `\P`, where the latter /// is `negated = true`. However, it also possible to write something like /// `\P{scx!=Katakana}` which is actually equivalent to /// `\p{scx=Katakana}` and is therefore not actually negated even though /// `negated = true` here. To test whether this class is truly negated /// or not, use the `is_negated` method. pub negated: bool, /// The kind of Unicode class. pub kind: ClassUnicodeKind, } impl ClassUnicode { /// Returns true if this class has been negated. /// /// Note that this takes the Unicode op into account, if it's present. /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`. pub fn is_negated(&self) -> bool { match self.kind { ClassUnicodeKind::NamedValue { op: ClassUnicodeOpKind::NotEqual, .. } => !self.negated, _ => self.negated, } } } /// The available forms of Unicode character classes. #[derive(Clone, Debug, Eq, PartialEq)] pub enum ClassUnicodeKind { /// A one letter abbreviated class, e.g., `\pN`. OneLetter(char), /// A binary property, general category or script. The string may be /// empty. Named(String), /// A property name and an associated value. NamedValue { /// The type of Unicode op used to associate `name` with `value`. op: ClassUnicodeOpKind, /// The property name (which may be empty). name: String, /// The property value (which may be empty). value: String, }, } /// The type of op used in a Unicode character class. #[derive(Clone, Debug, Eq, PartialEq)] pub enum ClassUnicodeOpKind { /// A property set to a specific value, e.g., `\p{scx=Katakana}`. Equal, /// A property set to a specific value using a colon, e.g., /// `\p{scx:Katakana}`. Colon, /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`. NotEqual, } impl ClassUnicodeOpKind { /// Whether the op is an equality op or not. pub fn is_equal(&self) -> bool { match *self { ClassUnicodeOpKind::Equal | ClassUnicodeOpKind::Colon => true, _ => false, } } } /// A bracketed character class, e.g., `[a-z0-9]`. #[derive(Clone, Debug, Eq, PartialEq)] pub struct ClassBracketed { /// The span of this class. pub span: Span, /// Whether this class is negated or not. e.g., `[a]` is not negated but /// `[^a]` is. pub negated: bool, /// The type of this set. A set is either a normal union of things, e.g., /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`. pub kind: ClassSet, } /// A character class set. /// /// This type corresponds to the internal structure of a bracketed character /// class. That is, every bracketed character is one of two types: a union of /// items (literals, ranges, other bracketed classes) or a tree of binary set /// operations. #[derive(Clone, Debug, Eq, PartialEq)] pub enum ClassSet { /// An item, which can be a single literal, range, nested character class /// or a union of items. Item(ClassSetItem), /// A single binary operation (i.e., &&, -- or ~~). BinaryOp(ClassSetBinaryOp), } impl ClassSet { /// Build a set from a union. pub fn union(ast: ClassSetUnion) -> ClassSet { ClassSet::Item(ClassSetItem::Union(ast)) } /// Return the span of this character class set. pub fn span(&self) -> &Span { match *self { ClassSet::Item(ref x) => x.span(), ClassSet::BinaryOp(ref x) => &x.span, } } /// Return true if and only if this class set is empty. fn is_empty(&self) -> bool { match *self { ClassSet::Item(ClassSetItem::Empty(_)) => true, _ => false, } } } /// A single component of a character class set. #[derive(Clone, Debug, Eq, PartialEq)] pub enum ClassSetItem { /// An empty item. /// /// Note that a bracketed character class cannot contain a single empty /// item. Empty items can appear when using one of the binary operators. /// For example, `[&&]` is the intersection of two empty classes. Empty(Span), /// A single literal. Literal(Literal), /// A range between two literals. Range(ClassSetRange), /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`. Ascii(ClassAscii), /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. Unicode(ClassUnicode), /// A perl character class, e.g., `\d` or `\W`. Perl(ClassPerl), /// A bracketed character class set, which may contain zero or more /// character ranges and/or zero or more nested classes. e.g., /// `[a-zA-Z\pL]`. Bracketed(Box), /// A union of items. Union(ClassSetUnion), } impl ClassSetItem { /// Return the span of this character class set item. pub fn span(&self) -> &Span { match *self { ClassSetItem::Empty(ref span) => span, ClassSetItem::Literal(ref x) => &x.span, ClassSetItem::Range(ref x) => &x.span, ClassSetItem::Ascii(ref x) => &x.span, ClassSetItem::Perl(ref x) => &x.span, ClassSetItem::Unicode(ref x) => &x.span, ClassSetItem::Bracketed(ref x) => &x.span, ClassSetItem::Union(ref x) => &x.span, } } } /// A single character class range in a set. #[derive(Clone, Debug, Eq, PartialEq)] pub struct ClassSetRange { /// The span of this range. pub span: Span, /// The start of this range. pub start: Literal, /// The end of this range. pub end: Literal, } impl ClassSetRange { /// Returns true if and only if this character class range is valid. /// /// The only case where a range is invalid is if its start is greater than /// its end. pub fn is_valid(&self) -> bool { self.start.c <= self.end.c } } /// A union of items inside a character class set. #[derive(Clone, Debug, Eq, PartialEq)] pub struct ClassSetUnion { /// The span of the items in this operation. e.g., the `a-z0-9` in /// `[^a-z0-9]` pub span: Span, /// The sequence of items that make up this union. pub items: Vec, } impl ClassSetUnion { /// Push a new item in this union. /// /// The ending position of this union's span is updated to the ending /// position of the span of the item given. If the union is empty, then /// the starting position of this union is set to the starting position /// of this item. /// /// In other words, if you only use this method to add items to a union /// and you set the spans on each item correctly, then you should never /// need to adjust the span of the union directly. pub fn push(&mut self, item: ClassSetItem) { if self.items.is_empty() { self.span.start = item.span().start; } self.span.end = item.span().end; self.items.push(item); } /// Return this union as a character class set item. /// /// If this union contains zero items, then an empty union is /// returned. If this concatenation contains exactly 1 item, then the /// corresponding item is returned. Otherwise, ClassSetItem::Union is /// returned. pub fn into_item(mut self) -> ClassSetItem { match self.items.len() { 0 => ClassSetItem::Empty(self.span), 1 => self.items.pop().unwrap(), _ => ClassSetItem::Union(self), } } } /// A Unicode character class set operation. #[derive(Clone, Debug, Eq, PartialEq)] pub struct ClassSetBinaryOp { /// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`. pub span: Span, /// The type of this set operation. pub kind: ClassSetBinaryOpKind, /// The left hand side of the operation. pub lhs: Box, /// The right hand side of the operation. pub rhs: Box, } /// The type of a Unicode character class set operation. /// /// Note that this doesn't explicitly represent union since there is no /// explicit union operator. Concatenation inside a character class corresponds /// to the union operation. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum ClassSetBinaryOpKind { /// The intersection of two sets, e.g., `\pN&&[a-z]`. Intersection, /// The difference of two sets, e.g., `\pN--[0-9]`. Difference, /// The symmetric difference of two sets. The symmetric difference is the /// set of elements belonging to one but not both sets. /// e.g., `[\pL~~[:ascii:]]`. SymmetricDifference, } /// A single zero-width assertion. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Assertion { /// The span of this assertion. pub span: Span, /// The assertion kind, e.g., `\b` or `^`. pub kind: AssertionKind, } /// An assertion kind. #[derive(Clone, Debug, Eq, PartialEq)] pub enum AssertionKind { /// `^` StartLine, /// `$` EndLine, /// `\A` StartText, /// `\z` EndText, /// `\b` WordBoundary, /// `\B` NotWordBoundary, } /// A repetition operation applied to a regular expression. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Repetition { /// The span of this operation. pub span: Span, /// The actual operation. pub op: RepetitionOp, /// Whether this operation was applied greedily or not. pub greedy: bool, /// The regular expression under repetition. pub ast: Box, } /// The repetition operator itself. #[derive(Clone, Debug, Eq, PartialEq)] pub struct RepetitionOp { /// The span of this operator. This includes things like `+`, `*?` and /// `{m,n}`. pub span: Span, /// The type of operation. pub kind: RepetitionKind, } /// The kind of a repetition operator. #[derive(Clone, Debug, Eq, PartialEq)] pub enum RepetitionKind { /// `?` ZeroOrOne, /// `*` ZeroOrMore, /// `+` OneOrMore, /// `{m,n}` Range(RepetitionRange), } /// A range repetition operator. #[derive(Clone, Debug, Eq, PartialEq)] pub enum RepetitionRange { /// `{m}` Exactly(u32), /// `{m,}` AtLeast(u32), /// `{m,n}` Bounded(u32, u32), } impl RepetitionRange { /// Returns true if and only if this repetition range is valid. /// /// The only case where a repetition range is invalid is if it is bounded /// and its start is greater than its end. pub fn is_valid(&self) -> bool { match *self { RepetitionRange::Bounded(s, e) if s > e => false, _ => true, } } } /// A grouped regular expression. /// /// This includes both capturing and non-capturing groups. This does **not** /// include flag-only groups like `(?is)`, but does contain any group that /// contains a sub-expression, e.g., `(a)`, `(?Pa)`, `(?:a)` and /// `(?is:a)`. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Group { /// The span of this group. pub span: Span, /// The kind of this group. pub kind: GroupKind, /// The regular expression in this group. pub ast: Box, } impl Group { /// If this group is non-capturing, then this returns the (possibly empty) /// set of flags. Otherwise, `None` is returned. pub fn flags(&self) -> Option<&Flags> { match self.kind { GroupKind::NonCapturing(ref flags) => Some(flags), _ => None, } } /// Returns true if and only if this group is capturing. pub fn is_capturing(&self) -> bool { match self.kind { GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true, GroupKind::NonCapturing(_) => false, } } /// Returns the capture index of this group, if this is a capturing group. /// /// This returns a capture index precisely when `is_capturing` is `true`. pub fn capture_index(&self) -> Option { match self.kind { GroupKind::CaptureIndex(i) => Some(i), GroupKind::CaptureName(ref x) => Some(x.index), GroupKind::NonCapturing(_) => None, } } } /// The kind of a group. #[derive(Clone, Debug, Eq, PartialEq)] pub enum GroupKind { /// `(a)` CaptureIndex(u32), /// `(?Pa)` CaptureName(CaptureName), /// `(?:a)` and `(?i:a)` NonCapturing(Flags), } /// A capture name. /// /// This corresponds to the name itself between the angle brackets in, e.g., /// `(?Pexpr)`. #[derive(Clone, Debug, Eq, PartialEq)] pub struct CaptureName { /// The span of this capture name. pub span: Span, /// The capture name. pub name: String, /// The capture index. pub index: u32, } /// A group of flags that is not applied to a particular regular expression. #[derive(Clone, Debug, Eq, PartialEq)] pub struct SetFlags { /// The span of these flags, including the grouping parentheses. pub span: Span, /// The actual sequence of flags. pub flags: Flags, } /// A group of flags. /// /// This corresponds only to the sequence of flags themselves, e.g., `is-u`. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Flags { /// The span of this group of flags. pub span: Span, /// A sequence of flag items. Each item is either a flag or a negation /// operator. pub items: Vec, } impl Flags { /// Add the given item to this sequence of flags. /// /// If the item was added successfully, then `None` is returned. If the /// given item is a duplicate, then `Some(i)` is returned, where /// `items[i].kind == item.kind`. pub fn add_item(&mut self, item: FlagsItem) -> Option { for (i, x) in self.items.iter().enumerate() { if x.kind == item.kind { return Some(i); } } self.items.push(item); None } /// Returns the state of the given flag in this set. /// /// If the given flag is in the set but is negated, then `Some(false)` is /// returned. /// /// If the given flag is in the set and is not negated, then `Some(true)` /// is returned. /// /// Otherwise, `None` is returned. pub fn flag_state(&self, flag: Flag) -> Option { let mut negated = false; for x in &self.items { match x.kind { FlagsItemKind::Negation => { negated = true; } FlagsItemKind::Flag(ref xflag) if xflag == &flag => { return Some(!negated); } _ => {} } } None } } /// A single item in a group of flags. #[derive(Clone, Debug, Eq, PartialEq)] pub struct FlagsItem { /// The span of this item. pub span: Span, /// The kind of this item. pub kind: FlagsItemKind, } /// The kind of an item in a group of flags. #[derive(Clone, Debug, Eq, PartialEq)] pub enum FlagsItemKind { /// A negation operator applied to all subsequent flags in the enclosing /// group. Negation, /// A single flag in a group. Flag(Flag), } impl FlagsItemKind { /// Returns true if and only if this item is a negation operator. pub fn is_negation(&self) -> bool { match *self { FlagsItemKind::Negation => true, _ => false, } } } /// A single flag. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum Flag { /// `i` CaseInsensitive, /// `m` MultiLine, /// `s` DotMatchesNewLine, /// `U` SwapGreed, /// `u` Unicode, /// `x` IgnoreWhitespace, } /// A custom `Drop` impl is used for `Ast` such that it uses constant stack /// space but heap space proportional to the depth of the `Ast`. impl Drop for Ast { fn drop(&mut self) { use std::mem; match *self { Ast::Empty(_) | Ast::Flags(_) | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) // Classes are recursive, so they get their own Drop impl. | Ast::Class(_) => return, Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, Ast::Group(ref x) if !x.ast.has_subexprs() => return, Ast::Alternation(ref x) if x.asts.is_empty() => return, Ast::Concat(ref x) if x.asts.is_empty() => return, _ => {} } let empty_span = || Span::splat(Position::new(0, 0, 0)); let empty_ast = || Ast::Empty(empty_span()); let mut stack = vec![mem::replace(self, empty_ast())]; while let Some(mut ast) = stack.pop() { match ast { Ast::Empty(_) | Ast::Flags(_) | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) // Classes are recursive, so they get their own Drop impl. | Ast::Class(_) => {} Ast::Repetition(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } Ast::Group(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } Ast::Alternation(ref mut x) => { stack.extend(x.asts.drain(..)); } Ast::Concat(ref mut x) => { stack.extend(x.asts.drain(..)); } } } } } /// A custom `Drop` impl is used for `ClassSet` such that it uses constant /// stack space but heap space proportional to the depth of the `ClassSet`. impl Drop for ClassSet { fn drop(&mut self) { use std::mem; match *self { ClassSet::Item(ref item) => match *item { ClassSetItem::Empty(_) | ClassSetItem::Literal(_) | ClassSetItem::Range(_) | ClassSetItem::Ascii(_) | ClassSetItem::Unicode(_) | ClassSetItem::Perl(_) => return, ClassSetItem::Bracketed(ref x) => { if x.kind.is_empty() { return; } } ClassSetItem::Union(ref x) => { if x.items.is_empty() { return; } } }, ClassSet::BinaryOp(ref op) => { if op.lhs.is_empty() && op.rhs.is_empty() { return; } } } let empty_span = || Span::splat(Position::new(0, 0, 0)); let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span())); let mut stack = vec![mem::replace(self, empty_set())]; while let Some(mut set) = stack.pop() { match set { ClassSet::Item(ref mut item) => match *item { ClassSetItem::Empty(_) | ClassSetItem::Literal(_) | ClassSetItem::Range(_) | ClassSetItem::Ascii(_) | ClassSetItem::Unicode(_) | ClassSetItem::Perl(_) => {} ClassSetItem::Bracketed(ref mut x) => { stack.push(mem::replace(&mut x.kind, empty_set())); } ClassSetItem::Union(ref mut x) => { stack.extend(x.items.drain(..).map(ClassSet::Item)); } }, ClassSet::BinaryOp(ref mut op) => { stack.push(mem::replace(&mut op.lhs, empty_set())); stack.push(mem::replace(&mut op.rhs, empty_set())); } } } } } #[cfg(test)] mod tests { use super::*; // We use a thread with an explicit stack size to test that our destructor // for Ast can handle arbitrarily sized expressions in constant stack // space. In case we run on a platform without threads (WASM?), we limit // this test to Windows/Unix. #[test] #[cfg(any(unix, windows))] fn no_stack_overflow_on_drop() { use std::thread; let run = || { let span = || Span::splat(Position::new(0, 0, 0)); let mut ast = Ast::Empty(span()); for i in 0..200 { ast = Ast::Group(Group { span: span(), kind: GroupKind::CaptureIndex(i), ast: Box::new(ast), }); } assert!(!ast.is_empty()); }; // We run our test on a thread with a small stack size so we can // force the issue more easily. thread::Builder::new() .stack_size(1 << 10) .spawn(run) .unwrap() .join() .unwrap(); } }