summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex-automata/src/util/syntax.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/regex-automata/src/util/syntax.rs')
-rw-r--r--third_party/rust/regex-automata/src/util/syntax.rs482
1 files changed, 482 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/util/syntax.rs b/third_party/rust/regex-automata/src/util/syntax.rs
new file mode 100644
index 0000000000..78e3cf9a11
--- /dev/null
+++ b/third_party/rust/regex-automata/src/util/syntax.rs
@@ -0,0 +1,482 @@
+/*!
+Utilities for dealing with the syntax of a regular expression.
+
+This module currently only exposes a [`Config`] type that
+itself represents a wrapper around the configuration for a
+[`regex-syntax::ParserBuilder`](regex_syntax::ParserBuilder). The purpose of
+this wrapper is to make configuring syntax options very similar to how other
+configuration is done throughout this crate. Namely, instead of duplicating
+syntax options across every builder (of which there are many), we instead
+create small config objects like this one that can be passed around and
+composed.
+*/
+
+use alloc::{vec, vec::Vec};
+
+use regex_syntax::{
+ ast,
+ hir::{self, Hir},
+ Error, ParserBuilder,
+};
+
+/// A convenience routine for parsing a pattern into an HIR value with the
+/// default configuration.
+///
+/// # Example
+///
+/// This shows how to parse a pattern into an HIR value:
+///
+/// ```
+/// use regex_automata::util::syntax;
+///
+/// let hir = syntax::parse(r"([a-z]+)|([0-9]+)")?;
+/// assert_eq!(Some(1), hir.properties().static_explicit_captures_len());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+pub fn parse(pattern: &str) -> Result<Hir, Error> {
+ parse_with(pattern, &Config::default())
+}
+
+/// A convenience routine for parsing many patterns into HIR value with the
+/// default configuration.
+///
+/// # Example
+///
+/// This shows how to parse many patterns into an corresponding HIR values:
+///
+/// ```
+/// use {
+/// regex_automata::util::syntax,
+/// regex_syntax::hir::Properties,
+/// };
+///
+/// let hirs = syntax::parse_many(&[
+/// r"([a-z]+)|([0-9]+)",
+/// r"foo(A-Z]+)bar",
+/// ])?;
+/// let props = Properties::union(hirs.iter().map(|h| h.properties()));
+/// assert_eq!(Some(1), props.static_explicit_captures_len());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+pub fn parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error> {
+ parse_many_with(patterns, &Config::default())
+}
+
+/// A convenience routine for parsing a pattern into an HIR value using a
+/// `Config`.
+///
+/// # Example
+///
+/// This shows how to parse a pattern into an HIR value with a non-default
+/// configuration:
+///
+/// ```
+/// use regex_automata::util::syntax;
+///
+/// let hir = syntax::parse_with(
+/// r"^[a-z]+$",
+/// &syntax::Config::new().multi_line(true).crlf(true),
+/// )?;
+/// assert!(hir.properties().look_set().contains_anchor_crlf());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+pub fn parse_with(pattern: &str, config: &Config) -> Result<Hir, Error> {
+ let mut builder = ParserBuilder::new();
+ config.apply(&mut builder);
+ builder.build().parse(pattern)
+}
+
+/// A convenience routine for parsing many patterns into HIR values using a
+/// `Config`.
+///
+/// # Example
+///
+/// This shows how to parse many patterns into an corresponding HIR values
+/// with a non-default configuration:
+///
+/// ```
+/// use {
+/// regex_automata::util::syntax,
+/// regex_syntax::hir::Properties,
+/// };
+///
+/// let patterns = &[
+/// r"([a-z]+)|([0-9]+)",
+/// r"\W",
+/// r"foo(A-Z]+)bar",
+/// ];
+/// let config = syntax::Config::new().unicode(false).utf8(false);
+/// let hirs = syntax::parse_many_with(patterns, &config)?;
+/// let props = Properties::union(hirs.iter().map(|h| h.properties()));
+/// assert!(!props.is_utf8());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+pub fn parse_many_with<P: AsRef<str>>(
+ patterns: &[P],
+ config: &Config,
+) -> Result<Vec<Hir>, Error> {
+ let mut builder = ParserBuilder::new();
+ config.apply(&mut builder);
+ let mut hirs = vec![];
+ for p in patterns.iter() {
+ hirs.push(builder.build().parse(p.as_ref())?);
+ }
+ Ok(hirs)
+}
+
+/// A common set of configuration options that apply to the syntax of a regex.
+///
+/// This represents a group of configuration options that specifically apply
+/// to how the concrete syntax of a regular expression is interpreted. In
+/// particular, they are generally forwarded to the
+/// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html)
+/// in the
+/// [`regex-syntax`](https://docs.rs/regex-syntax)
+/// crate when building a regex from its concrete syntax directly.
+///
+/// These options are defined as a group since they apply to every regex engine
+/// in this crate. Instead of re-defining them on every engine's builder, they
+/// are instead provided here as one cohesive unit.
+#[derive(Clone, Copy, Debug)]
+pub struct Config {
+ case_insensitive: bool,
+ multi_line: bool,
+ dot_matches_new_line: bool,
+ crlf: bool,
+ line_terminator: u8,
+ swap_greed: bool,
+ ignore_whitespace: bool,
+ unicode: bool,
+ utf8: bool,
+ nest_limit: u32,
+ octal: bool,
+}
+
+impl Config {
+ /// Return a new default syntax configuration.
+ pub fn new() -> Config {
+ // These defaults match the ones used in regex-syntax.
+ Config {
+ case_insensitive: false,
+ multi_line: false,
+ dot_matches_new_line: false,
+ crlf: false,
+ line_terminator: b'\n',
+ swap_greed: false,
+ ignore_whitespace: false,
+ unicode: true,
+ utf8: true,
+ nest_limit: 250,
+ octal: false,
+ }
+ }
+
+ /// Enable or disable the case insensitive flag by default.
+ ///
+ /// When Unicode mode is enabled, case insensitivity is Unicode-aware.
+ /// Specifically, it will apply the "simple" case folding rules as
+ /// specified by Unicode.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `i` flag.
+ pub fn case_insensitive(mut self, yes: bool) -> Config {
+ self.case_insensitive = yes;
+ self
+ }
+
+ /// Enable or disable the multi-line matching flag by default.
+ ///
+ /// When this is enabled, the `^` and `$` look-around assertions will
+ /// match immediately after and immediately before a new line character,
+ /// respectively. Note that the `\A` and `\z` look-around assertions are
+ /// unaffected by this setting and always correspond to matching at the
+ /// beginning and end of the input.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `m` flag.
+ pub fn multi_line(mut self, yes: bool) -> Config {
+ self.multi_line = yes;
+ self
+ }
+
+ /// Enable or disable the "dot matches any character" flag by default.
+ ///
+ /// When this is enabled, `.` will match any character. When it's disabled,
+ /// then `.` will match any character except for a new line character.
+ ///
+ /// Note that `.` is impacted by whether the "unicode" setting is enabled
+ /// or not. When Unicode is enabled (the default), `.` will match any UTF-8
+ /// encoding of any Unicode scalar value (sans a new line, depending on
+ /// whether this "dot matches new line" option is enabled). When Unicode
+ /// mode is disabled, `.` will match any byte instead. Because of this,
+ /// when Unicode mode is disabled, `.` can only be used when the "allow
+ /// invalid UTF-8" option is enabled, since `.` could otherwise match
+ /// invalid UTF-8.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `s` flag.
+ pub fn dot_matches_new_line(mut self, yes: bool) -> Config {
+ self.dot_matches_new_line = yes;
+ self
+ }
+
+ /// Enable or disable the "CRLF mode" flag by default.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `R` flag.
+ ///
+ /// When CRLF mode is enabled, the following happens:
+ ///
+ /// * Unless `dot_matches_new_line` is enabled, `.` will match any character
+ /// except for `\r` and `\n`.
+ /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`,
+ /// `\r` and `\n` as line terminators. And in particular, neither will
+ /// match between a `\r` and a `\n`.
+ pub fn crlf(mut self, yes: bool) -> Config {
+ self.crlf = yes;
+ self
+ }
+
+ /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
+ ///
+ /// Namely, instead of `.` (by default) matching everything except for `\n`,
+ /// this will cause `.` to match everything except for the byte given.
+ ///
+ /// If `.` is used in a context where Unicode mode is enabled and this byte
+ /// isn't ASCII, then an error will be returned. When Unicode mode is
+ /// disabled, then any byte is permitted, but will return an error if UTF-8
+ /// mode is enabled and it is a non-ASCII byte.
+ ///
+ /// In short, any ASCII value for a line terminator is always okay. But a
+ /// non-ASCII byte might result in an error depending on whether Unicode
+ /// mode or UTF-8 mode are enabled.
+ ///
+ /// Note that if `R` mode is enabled then it always takes precedence and
+ /// the line terminator will be treated as `\r` and `\n` simultaneously.
+ ///
+ /// Note also that this *doesn't* impact the look-around assertions
+ /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
+ /// configuration in the regex engine itself.
+ pub fn line_terminator(mut self, byte: u8) -> Config {
+ self.line_terminator = byte;
+ self
+ }
+
+ /// Enable or disable the "swap greed" flag by default.
+ ///
+ /// When this is enabled, `.*` (for example) will become ungreedy and `.*?`
+ /// will become greedy.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `U` flag.
+ pub fn swap_greed(mut self, yes: bool) -> Config {
+ self.swap_greed = yes;
+ self
+ }
+
+ /// Enable verbose mode in the regular expression.
+ ///
+ /// When enabled, verbose mode permits insigificant whitespace in many
+ /// places in the regular expression, as well as comments. Comments are
+ /// started using `#` and continue until the end of the line.
+ ///
+ /// By default, this is disabled. It may be selectively enabled in the
+ /// regular expression by using the `x` flag regardless of this setting.
+ pub fn ignore_whitespace(mut self, yes: bool) -> Config {
+ self.ignore_whitespace = yes;
+ self
+ }
+
+ /// Enable or disable the Unicode flag (`u`) by default.
+ ///
+ /// By default this is **enabled**. It may alternatively be selectively
+ /// disabled in the regular expression itself via the `u` flag.
+ ///
+ /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by
+ /// default), a regular expression will fail to parse if Unicode mode is
+ /// disabled and a sub-expression could possibly match invalid UTF-8.
+ ///
+ /// **WARNING**: Unicode mode can greatly increase the size of the compiled
+ /// DFA, which can noticeably impact both memory usage and compilation
+ /// time. This is especially noticeable if your regex contains character
+ /// classes like `\w` that are impacted by whether Unicode is enabled or
+ /// not. If Unicode is not necessary, you are encouraged to disable it.
+ pub fn unicode(mut self, yes: bool) -> Config {
+ self.unicode = yes;
+ self
+ }
+
+ /// When disabled, the builder will permit the construction of a regular
+ /// expression that may match invalid UTF-8.
+ ///
+ /// For example, when [`Config::unicode`] is disabled, then
+ /// expressions like `[^a]` may match invalid UTF-8 since they can match
+ /// any single byte that is not `a`. By default, these sub-expressions
+ /// are disallowed to avoid returning offsets that split a UTF-8
+ /// encoded codepoint. However, in cases where matching at arbitrary
+ /// locations is desired, this option can be disabled to permit all such
+ /// sub-expressions.
+ ///
+ /// When enabled (the default), the builder is guaranteed to produce a
+ /// regex that will only ever match valid UTF-8 (otherwise, the builder
+ /// will return an error).
+ pub fn utf8(mut self, yes: bool) -> Config {
+ self.utf8 = yes;
+ self
+ }
+
+ /// Set the nesting limit used for the regular expression parser.
+ ///
+ /// The nesting limit controls how deep the abstract syntax tree is allowed
+ /// to be. If the AST exceeds the given limit (e.g., with too many nested
+ /// groups), then an error is returned by the parser.
+ ///
+ /// The purpose of this limit is to act as a heuristic to prevent stack
+ /// overflow when building a finite automaton from a regular expression's
+ /// abstract syntax tree. In particular, construction currently uses
+ /// recursion. In the future, the implementation may stop using recursion
+ /// and this option will no longer be necessary.
+ ///
+ /// This limit is not checked until the entire AST is parsed. Therefore,
+ /// if callers want to put a limit on the amount of heap space used, then
+ /// they should impose a limit on the length, in bytes, of the concrete
+ /// pattern string. In particular, this is viable since the parser will
+ /// limit itself to heap space proportional to the length of the pattern
+ /// string.
+ ///
+ /// Note that a nest limit of `0` will return a nest limit error for most
+ /// patterns but not all. For example, a nest limit of `0` permits `a` but
+ /// not `ab`, since `ab` requires a concatenation AST item, which results
+ /// in a nest depth of `1`. In general, a nest limit is not something that
+ /// manifests in an obvious way in the concrete syntax, therefore, it
+ /// should not be used in a granular way.
+ pub fn nest_limit(mut self, limit: u32) -> Config {
+ self.nest_limit = limit;
+ self
+ }
+
+ /// Whether to support octal syntax or not.
+ ///
+ /// Octal syntax is a little-known way of uttering Unicode codepoints in
+ /// a regular expression. For example, `a`, `\x61`, `\u0061` and
+ /// `\141` are all equivalent regular expressions, where the last example
+ /// shows octal syntax.
+ ///
+ /// While supporting octal syntax isn't in and of itself a problem, it does
+ /// make good error messages harder. That is, in PCRE based regex engines,
+ /// syntax like `\1` invokes a backreference, which is explicitly
+ /// unsupported in Rust's regex engine. However, many users expect it to
+ /// be supported. Therefore, when octal support is disabled, the error
+ /// message will explicitly mention that backreferences aren't supported.
+ ///
+ /// Octal syntax is disabled by default.
+ pub fn octal(mut self, yes: bool) -> Config {
+ self.octal = yes;
+ self
+ }
+
+ /// Returns whether "unicode" mode is enabled.
+ pub fn get_unicode(&self) -> bool {
+ self.unicode
+ }
+
+ /// Returns whether "case insensitive" mode is enabled.
+ pub fn get_case_insensitive(&self) -> bool {
+ self.case_insensitive
+ }
+
+ /// Returns whether "multi line" mode is enabled.
+ pub fn get_multi_line(&self) -> bool {
+ self.multi_line
+ }
+
+ /// Returns whether "dot matches new line" mode is enabled.
+ pub fn get_dot_matches_new_line(&self) -> bool {
+ self.dot_matches_new_line
+ }
+
+ /// Returns whether "CRLF" mode is enabled.
+ pub fn get_crlf(&self) -> bool {
+ self.crlf
+ }
+
+ /// Returns the line terminator in this syntax configuration.
+ pub fn get_line_terminator(&self) -> u8 {
+ self.line_terminator
+ }
+
+ /// Returns whether "swap greed" mode is enabled.
+ pub fn get_swap_greed(&self) -> bool {
+ self.swap_greed
+ }
+
+ /// Returns whether "ignore whitespace" mode is enabled.
+ pub fn get_ignore_whitespace(&self) -> bool {
+ self.ignore_whitespace
+ }
+
+ /// Returns whether UTF-8 mode is enabled.
+ pub fn get_utf8(&self) -> bool {
+ self.utf8
+ }
+
+ /// Returns the "nest limit" setting.
+ pub fn get_nest_limit(&self) -> u32 {
+ self.nest_limit
+ }
+
+ /// Returns whether "octal" mode is enabled.
+ pub fn get_octal(&self) -> bool {
+ self.octal
+ }
+
+ /// Applies this configuration to the given parser.
+ pub(crate) fn apply(&self, builder: &mut ParserBuilder) {
+ builder
+ .unicode(self.unicode)
+ .case_insensitive(self.case_insensitive)
+ .multi_line(self.multi_line)
+ .dot_matches_new_line(self.dot_matches_new_line)
+ .crlf(self.crlf)
+ .line_terminator(self.line_terminator)
+ .swap_greed(self.swap_greed)
+ .ignore_whitespace(self.ignore_whitespace)
+ .utf8(self.utf8)
+ .nest_limit(self.nest_limit)
+ .octal(self.octal);
+ }
+
+ /// Applies this configuration to the given AST parser.
+ pub(crate) fn apply_ast(&self, builder: &mut ast::parse::ParserBuilder) {
+ builder
+ .ignore_whitespace(self.ignore_whitespace)
+ .nest_limit(self.nest_limit)
+ .octal(self.octal);
+ }
+
+ /// Applies this configuration to the given AST-to-HIR translator.
+ pub(crate) fn apply_hir(
+ &self,
+ builder: &mut hir::translate::TranslatorBuilder,
+ ) {
+ builder
+ .unicode(self.unicode)
+ .case_insensitive(self.case_insensitive)
+ .multi_line(self.multi_line)
+ .crlf(self.crlf)
+ .dot_matches_new_line(self.dot_matches_new_line)
+ .line_terminator(self.line_terminator)
+ .swap_greed(self.swap_greed)
+ .utf8(self.utf8);
+ }
+}
+
+impl Default for Config {
+ fn default() -> Config {
+ Config::new()
+ }
+}