use crate::ast; use crate::hir; use crate::Result; /// A builder for a regular expression parser. /// /// This builder permits modifying configuration options for the parser. /// /// This type combines the builder options for both the /// [AST `ParserBuilder`](ast/parse/struct.ParserBuilder.html) /// and the /// [HIR `TranslatorBuilder`](hir/translate/struct.TranslatorBuilder.html). #[derive(Clone, Debug, Default)] pub struct ParserBuilder { ast: ast::parse::ParserBuilder, hir: hir::translate::TranslatorBuilder, } impl ParserBuilder { /// Create a new parser builder with a default configuration. pub fn new() -> ParserBuilder { ParserBuilder::default() } /// Build a parser from this configuration with the given pattern. pub fn build(&self) -> Parser { Parser { ast: self.ast.build(), hir: self.hir.build() } } /// Set the nesting limit for this parser. /// /// The nesting limit controls how deep the abstract syntax tree is allowed /// to be. If the AST exceeds the given limit (e.g., with too many nested /// groups), then an error is returned by the parser. /// /// The purpose of this limit is to act as a heuristic to prevent stack /// overflow for consumers that do structural induction on an `Ast` using /// explicit recursion. While this crate never does this (instead using /// constant stack space and moving the call stack to the heap), other /// crates may. /// /// This limit is not checked until the entire Ast is parsed. Therefore, /// if callers want to put a limit on the amount of heap space used, then /// they should impose a limit on the length, in bytes, of the concrete /// pattern string. In particular, this is viable since this parser /// implementation will limit itself to heap space proportional to the /// length of the pattern string. /// /// Note that a nest limit of `0` will return a nest limit error for most /// patterns but not all. For example, a nest limit of `0` permits `a` but /// not `ab`, since `ab` requires a concatenation, which results in a nest /// depth of `1`. In general, a nest limit is not something that manifests /// in an obvious way in the concrete syntax, therefore, it should not be /// used in a granular way. pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { self.ast.nest_limit(limit); self } /// Whether to support octal syntax or not. /// /// Octal syntax is a little-known way of uttering Unicode codepoints in /// a regular expression. For example, `a`, `\x61`, `\u0061` and /// `\141` are all equivalent regular expressions, where the last example /// shows octal syntax. /// /// While supporting octal syntax isn't in and of itself a problem, it does /// make good error messages harder. That is, in PCRE based regex engines, /// syntax like `\0` invokes a backreference, which is explicitly /// unsupported in Rust's regex engine. However, many users expect it to /// be supported. Therefore, when octal support is disabled, the error /// message will explicitly mention that backreferences aren't supported. /// /// Octal syntax is disabled by default. pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { self.ast.octal(yes); self } /// When enabled, the parser will permit the construction of a regular /// expression that may match invalid UTF-8. /// /// When disabled (the default), the parser is guaranteed to produce /// an expression that will only ever match valid UTF-8 (otherwise, the /// parser will return an error). /// /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause /// the parser to return an error. Namely, a negated ASCII word boundary /// can result in matching positions that aren't valid UTF-8 boundaries. pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder { self.hir.allow_invalid_utf8(yes); self } /// Enable verbose mode in the regular expression. /// /// When enabled, verbose mode permits insignificant whitespace in many /// places in the regular expression, as well as comments. Comments are /// started using `#` and continue until the end of the line. /// /// By default, this is disabled. It may be selectively enabled in the /// regular expression by using the `x` flag regardless of this setting. pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { self.ast.ignore_whitespace(yes); self } /// Enable or disable the case insensitive flag by default. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `i` flag. pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder { self.hir.case_insensitive(yes); self } /// Enable or disable the multi-line matching flag by default. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `m` flag. pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder { self.hir.multi_line(yes); self } /// Enable or disable the "dot matches any character" flag by default. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `s` flag. pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder { self.hir.dot_matches_new_line(yes); self } /// Enable or disable the "swap greed" flag by default. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `U` flag. pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder { self.hir.swap_greed(yes); self } /// Enable or disable the Unicode flag (`u`) by default. /// /// By default this is **enabled**. It may alternatively be selectively /// disabled in the regular expression itself via the `u` flag. /// /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by /// default), a regular expression will fail to parse if Unicode mode is /// disabled and a sub-expression could possibly match invalid UTF-8. pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder { self.hir.unicode(yes); self } } /// A convenience parser for regular expressions. /// /// This parser takes as input a regular expression pattern string (the /// "concrete syntax") and returns a high-level intermediate representation /// (the HIR) suitable for most types of analysis. In particular, this parser /// hides the intermediate state of producing an AST (the "abstract syntax"). /// The AST is itself far more complex than the HIR, so this parser serves as a /// convenience for never having to deal with it at all. /// /// If callers have more fine grained use cases that need an AST, then please /// see the [`ast::parse`](ast/parse/index.html) module. /// /// A `Parser` can be configured in more detail via a /// [`ParserBuilder`](struct.ParserBuilder.html). #[derive(Clone, Debug)] pub struct Parser { ast: ast::parse::Parser, hir: hir::translate::Translator, } impl Parser { /// Create a new parser with a default configuration. /// /// The parser can be run with `parse` method. The parse method returns /// a high level intermediate representation of the given regular /// expression. /// /// To set configuration options on the parser, use /// [`ParserBuilder`](struct.ParserBuilder.html). pub fn new() -> Parser { ParserBuilder::new().build() } /// Parse the regular expression into a high level intermediate /// representation. pub fn parse(&mut self, pattern: &str) -> Result { let ast = self.ast.parse(pattern)?; let hir = self.hir.translate(pattern, &ast)?; Ok(hir) } }