diff options
Diffstat (limited to 'vendor/regex-automata/src/util/syntax.rs')
-rw-r--r-- | vendor/regex-automata/src/util/syntax.rs | 252 |
1 files changed, 231 insertions, 21 deletions
diff --git a/vendor/regex-automata/src/util/syntax.rs b/vendor/regex-automata/src/util/syntax.rs index 88beeee75..78e3cf9a1 100644 --- a/vendor/regex-automata/src/util/syntax.rs +++ b/vendor/regex-automata/src/util/syntax.rs @@ -1,4 +1,132 @@ -use regex_syntax::ParserBuilder; +/*! +Utilities for dealing with the syntax of a regular expression. + +This module currently only exposes a [`Config`] type that +itself represents a wrapper around the configuration for a +[`regex-syntax::ParserBuilder`](regex_syntax::ParserBuilder). The purpose of +this wrapper is to make configuring syntax options very similar to how other +configuration is done throughout this crate. Namely, instead of duplicating +syntax options across every builder (of which there are many), we instead +create small config objects like this one that can be passed around and +composed. +*/ + +use alloc::{vec, vec::Vec}; + +use regex_syntax::{ + ast, + hir::{self, Hir}, + Error, ParserBuilder, +}; + +/// A convenience routine for parsing a pattern into an HIR value with the +/// default configuration. +/// +/// # Example +/// +/// This shows how to parse a pattern into an HIR value: +/// +/// ``` +/// use regex_automata::util::syntax; +/// +/// let hir = syntax::parse(r"([a-z]+)|([0-9]+)")?; +/// assert_eq!(Some(1), hir.properties().static_explicit_captures_len()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +pub fn parse(pattern: &str) -> Result<Hir, Error> { + parse_with(pattern, &Config::default()) +} + +/// A convenience routine for parsing many patterns into HIR value with the +/// default configuration. +/// +/// # Example +/// +/// This shows how to parse many patterns into an corresponding HIR values: +/// +/// ``` +/// use { +/// regex_automata::util::syntax, +/// regex_syntax::hir::Properties, +/// }; +/// +/// let hirs = syntax::parse_many(&[ +/// r"([a-z]+)|([0-9]+)", +/// r"foo(A-Z]+)bar", +/// ])?; +/// let props = Properties::union(hirs.iter().map(|h| h.properties())); +/// assert_eq!(Some(1), props.static_explicit_captures_len()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +pub fn parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error> { + parse_many_with(patterns, &Config::default()) +} + +/// A convenience routine for parsing a pattern into an HIR value using a +/// `Config`. +/// +/// # Example +/// +/// This shows how to parse a pattern into an HIR value with a non-default +/// configuration: +/// +/// ``` +/// use regex_automata::util::syntax; +/// +/// let hir = syntax::parse_with( +/// r"^[a-z]+$", +/// &syntax::Config::new().multi_line(true).crlf(true), +/// )?; +/// assert!(hir.properties().look_set().contains_anchor_crlf()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +pub fn parse_with(pattern: &str, config: &Config) -> Result<Hir, Error> { + let mut builder = ParserBuilder::new(); + config.apply(&mut builder); + builder.build().parse(pattern) +} + +/// A convenience routine for parsing many patterns into HIR values using a +/// `Config`. +/// +/// # Example +/// +/// This shows how to parse many patterns into an corresponding HIR values +/// with a non-default configuration: +/// +/// ``` +/// use { +/// regex_automata::util::syntax, +/// regex_syntax::hir::Properties, +/// }; +/// +/// let patterns = &[ +/// r"([a-z]+)|([0-9]+)", +/// r"\W", +/// r"foo(A-Z]+)bar", +/// ]; +/// let config = syntax::Config::new().unicode(false).utf8(false); +/// let hirs = syntax::parse_many_with(patterns, &config)?; +/// let props = Properties::union(hirs.iter().map(|h| h.properties())); +/// assert!(!props.is_utf8()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +pub fn parse_many_with<P: AsRef<str>>( + patterns: &[P], + config: &Config, +) -> Result<Vec<Hir>, Error> { + let mut builder = ParserBuilder::new(); + config.apply(&mut builder); + let mut hirs = vec![]; + for p in patterns.iter() { + hirs.push(builder.build().parse(p.as_ref())?); + } + Ok(hirs) +} /// A common set of configuration options that apply to the syntax of a regex. /// @@ -14,10 +142,12 @@ use regex_syntax::ParserBuilder; /// in this crate. Instead of re-defining them on every engine's builder, they /// are instead provided here as one cohesive unit. #[derive(Clone, Copy, Debug)] -pub struct SyntaxConfig { +pub struct Config { case_insensitive: bool, multi_line: bool, dot_matches_new_line: bool, + crlf: bool, + line_terminator: u8, swap_greed: bool, ignore_whitespace: bool, unicode: bool, @@ -26,14 +156,16 @@ pub struct SyntaxConfig { octal: bool, } -impl SyntaxConfig { +impl Config { /// Return a new default syntax configuration. - pub fn new() -> SyntaxConfig { + pub fn new() -> Config { // These defaults match the ones used in regex-syntax. - SyntaxConfig { + Config { case_insensitive: false, multi_line: false, dot_matches_new_line: false, + crlf: false, + line_terminator: b'\n', swap_greed: false, ignore_whitespace: false, unicode: true, @@ -51,7 +183,7 @@ impl SyntaxConfig { /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `i` flag. - pub fn case_insensitive(mut self, yes: bool) -> SyntaxConfig { + pub fn case_insensitive(mut self, yes: bool) -> Config { self.case_insensitive = yes; self } @@ -66,7 +198,7 @@ impl SyntaxConfig { /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `m` flag. - pub fn multi_line(mut self, yes: bool) -> SyntaxConfig { + pub fn multi_line(mut self, yes: bool) -> Config { self.multi_line = yes; self } @@ -77,7 +209,7 @@ impl SyntaxConfig { /// then `.` will match any character except for a new line character. /// /// Note that `.` is impacted by whether the "unicode" setting is enabled - /// or not. When Unicode is enabled (the defualt), `.` will match any UTF-8 + /// or not. When Unicode is enabled (the default), `.` will match any UTF-8 /// encoding of any Unicode scalar value (sans a new line, depending on /// whether this "dot matches new line" option is enabled). When Unicode /// mode is disabled, `.` will match any byte instead. Because of this, @@ -87,11 +219,53 @@ impl SyntaxConfig { /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `s` flag. - pub fn dot_matches_new_line(mut self, yes: bool) -> SyntaxConfig { + pub fn dot_matches_new_line(mut self, yes: bool) -> Config { self.dot_matches_new_line = yes; self } + /// Enable or disable the "CRLF mode" flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `R` flag. + /// + /// When CRLF mode is enabled, the following happens: + /// + /// * Unless `dot_matches_new_line` is enabled, `.` will match any character + /// except for `\r` and `\n`. + /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`, + /// `\r` and `\n` as line terminators. And in particular, neither will + /// match between a `\r` and a `\n`. + pub fn crlf(mut self, yes: bool) -> Config { + self.crlf = yes; + self + } + + /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. + /// + /// Namely, instead of `.` (by default) matching everything except for `\n`, + /// this will cause `.` to match everything except for the byte given. + /// + /// If `.` is used in a context where Unicode mode is enabled and this byte + /// isn't ASCII, then an error will be returned. When Unicode mode is + /// disabled, then any byte is permitted, but will return an error if UTF-8 + /// mode is enabled and it is a non-ASCII byte. + /// + /// In short, any ASCII value for a line terminator is always okay. But a + /// non-ASCII byte might result in an error depending on whether Unicode + /// mode or UTF-8 mode are enabled. + /// + /// Note that if `R` mode is enabled then it always takes precedence and + /// the line terminator will be treated as `\r` and `\n` simultaneously. + /// + /// Note also that this *doesn't* impact the look-around assertions + /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional + /// configuration in the regex engine itself. + pub fn line_terminator(mut self, byte: u8) -> Config { + self.line_terminator = byte; + self + } + /// Enable or disable the "swap greed" flag by default. /// /// When this is enabled, `.*` (for example) will become ungreedy and `.*?` @@ -99,7 +273,7 @@ impl SyntaxConfig { /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `U` flag. - pub fn swap_greed(mut self, yes: bool) -> SyntaxConfig { + pub fn swap_greed(mut self, yes: bool) -> Config { self.swap_greed = yes; self } @@ -112,7 +286,7 @@ impl SyntaxConfig { /// /// By default, this is disabled. It may be selectively enabled in the /// regular expression by using the `x` flag regardless of this setting. - pub fn ignore_whitespace(mut self, yes: bool) -> SyntaxConfig { + pub fn ignore_whitespace(mut self, yes: bool) -> Config { self.ignore_whitespace = yes; self } @@ -131,7 +305,7 @@ impl SyntaxConfig { /// time. This is especially noticeable if your regex contains character /// classes like `\w` that are impacted by whether Unicode is enabled or /// not. If Unicode is not necessary, you are encouraged to disable it. - pub fn unicode(mut self, yes: bool) -> SyntaxConfig { + pub fn unicode(mut self, yes: bool) -> Config { self.unicode = yes; self } @@ -139,7 +313,7 @@ impl SyntaxConfig { /// When disabled, the builder will permit the construction of a regular /// expression that may match invalid UTF-8. /// - /// For example, when [`SyntaxConfig::unicode`] is disabled, then + /// For example, when [`Config::unicode`] is disabled, then /// expressions like `[^a]` may match invalid UTF-8 since they can match /// any single byte that is not `a`. By default, these sub-expressions /// are disallowed to avoid returning offsets that split a UTF-8 @@ -150,7 +324,7 @@ impl SyntaxConfig { /// When enabled (the default), the builder is guaranteed to produce a /// regex that will only ever match valid UTF-8 (otherwise, the builder /// will return an error). - pub fn utf8(mut self, yes: bool) -> SyntaxConfig { + pub fn utf8(mut self, yes: bool) -> Config { self.utf8 = yes; self } @@ -171,7 +345,7 @@ impl SyntaxConfig { /// if callers want to put a limit on the amount of heap space used, then /// they should impose a limit on the length, in bytes, of the concrete /// pattern string. In particular, this is viable since the parser will - /// limit itself to heap space proportional to the lenth of the pattern + /// limit itself to heap space proportional to the length of the pattern /// string. /// /// Note that a nest limit of `0` will return a nest limit error for most @@ -180,7 +354,7 @@ impl SyntaxConfig { /// in a nest depth of `1`. In general, a nest limit is not something that /// manifests in an obvious way in the concrete syntax, therefore, it /// should not be used in a granular way. - pub fn nest_limit(mut self, limit: u32) -> SyntaxConfig { + pub fn nest_limit(mut self, limit: u32) -> Config { self.nest_limit = limit; self } @@ -200,7 +374,7 @@ impl SyntaxConfig { /// message will explicitly mention that backreferences aren't supported. /// /// Octal syntax is disabled by default. - pub fn octal(mut self, yes: bool) -> SyntaxConfig { + pub fn octal(mut self, yes: bool) -> Config { self.octal = yes; self } @@ -225,6 +399,16 @@ impl SyntaxConfig { self.dot_matches_new_line } + /// Returns whether "CRLF" mode is enabled. + pub fn get_crlf(&self) -> bool { + self.crlf + } + + /// Returns the line terminator in this syntax configuration. + pub fn get_line_terminator(&self) -> u8 { + self.line_terminator + } + /// Returns whether "swap greed" mode is enabled. pub fn get_swap_greed(&self) -> bool { self.swap_greed @@ -257,16 +441,42 @@ impl SyntaxConfig { .case_insensitive(self.case_insensitive) .multi_line(self.multi_line) .dot_matches_new_line(self.dot_matches_new_line) + .crlf(self.crlf) + .line_terminator(self.line_terminator) .swap_greed(self.swap_greed) .ignore_whitespace(self.ignore_whitespace) - .allow_invalid_utf8(!self.utf8) + .utf8(self.utf8) + .nest_limit(self.nest_limit) + .octal(self.octal); + } + + /// Applies this configuration to the given AST parser. + pub(crate) fn apply_ast(&self, builder: &mut ast::parse::ParserBuilder) { + builder + .ignore_whitespace(self.ignore_whitespace) .nest_limit(self.nest_limit) .octal(self.octal); } + + /// Applies this configuration to the given AST-to-HIR translator. + pub(crate) fn apply_hir( + &self, + builder: &mut hir::translate::TranslatorBuilder, + ) { + builder + .unicode(self.unicode) + .case_insensitive(self.case_insensitive) + .multi_line(self.multi_line) + .crlf(self.crlf) + .dot_matches_new_line(self.dot_matches_new_line) + .line_terminator(self.line_terminator) + .swap_greed(self.swap_greed) + .utf8(self.utf8); + } } -impl Default for SyntaxConfig { - fn default() -> SyntaxConfig { - SyntaxConfig::new() +impl Default for Config { + fn default() -> Config { + Config::new() } } |