diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 18:31:36 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 18:31:36 +0000 |
commit | e02c5b5930c2c9ba3e5423fe12e2ef0155017297 (patch) | |
tree | fd60ebbbb5299e16e5fca8c773ddb74f764760db /vendor/regex-automata-0.2.0/src/util/syntax.rs | |
parent | Adding debian version 1.73.0+dfsg1-1. (diff) | |
download | rustc-e02c5b5930c2c9ba3e5423fe12e2ef0155017297.tar.xz rustc-e02c5b5930c2c9ba3e5423fe12e2ef0155017297.zip |
Merging upstream version 1.74.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata-0.2.0/src/util/syntax.rs')
-rw-r--r-- | vendor/regex-automata-0.2.0/src/util/syntax.rs | 272 |
1 files changed, 272 insertions, 0 deletions
diff --git a/vendor/regex-automata-0.2.0/src/util/syntax.rs b/vendor/regex-automata-0.2.0/src/util/syntax.rs new file mode 100644 index 000000000..88beeee75 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/util/syntax.rs @@ -0,0 +1,272 @@ +use regex_syntax::ParserBuilder; + +/// A common set of configuration options that apply to the syntax of a regex. +/// +/// This represents a group of configuration options that specifically apply +/// to how the concrete syntax of a regular expression is interpreted. In +/// particular, they are generally forwarded to the +/// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html) +/// in the +/// [`regex-syntax`](https://docs.rs/regex-syntax) +/// crate when building a regex from its concrete syntax directly. +/// +/// These options are defined as a group since they apply to every regex engine +/// in this crate. Instead of re-defining them on every engine's builder, they +/// are instead provided here as one cohesive unit. +#[derive(Clone, Copy, Debug)] +pub struct SyntaxConfig { + case_insensitive: bool, + multi_line: bool, + dot_matches_new_line: bool, + swap_greed: bool, + ignore_whitespace: bool, + unicode: bool, + utf8: bool, + nest_limit: u32, + octal: bool, +} + +impl SyntaxConfig { + /// Return a new default syntax configuration. + pub fn new() -> SyntaxConfig { + // These defaults match the ones used in regex-syntax. + SyntaxConfig { + case_insensitive: false, + multi_line: false, + dot_matches_new_line: false, + swap_greed: false, + ignore_whitespace: false, + unicode: true, + utf8: true, + nest_limit: 250, + octal: false, + } + } + + /// Enable or disable the case insensitive flag by default. + /// + /// When Unicode mode is enabled, case insensitivity is Unicode-aware. + /// Specifically, it will apply the "simple" case folding rules as + /// specified by Unicode. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `i` flag. + pub fn case_insensitive(mut self, yes: bool) -> SyntaxConfig { + self.case_insensitive = yes; + self + } + + /// Enable or disable the multi-line matching flag by default. + /// + /// When this is enabled, the `^` and `$` look-around assertions will + /// match immediately after and immediately before a new line character, + /// respectively. Note that the `\A` and `\z` look-around assertions are + /// unaffected by this setting and always correspond to matching at the + /// beginning and end of the input. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `m` flag. + pub fn multi_line(mut self, yes: bool) -> SyntaxConfig { + self.multi_line = yes; + self + } + + /// Enable or disable the "dot matches any character" flag by default. + /// + /// When this is enabled, `.` will match any character. When it's disabled, + /// then `.` will match any character except for a new line character. + /// + /// Note that `.` is impacted by whether the "unicode" setting is enabled + /// or not. When Unicode is enabled (the defualt), `.` will match any UTF-8 + /// encoding of any Unicode scalar value (sans a new line, depending on + /// whether this "dot matches new line" option is enabled). When Unicode + /// mode is disabled, `.` will match any byte instead. Because of this, + /// when Unicode mode is disabled, `.` can only be used when the "allow + /// invalid UTF-8" option is enabled, since `.` could otherwise match + /// invalid UTF-8. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `s` flag. + pub fn dot_matches_new_line(mut self, yes: bool) -> SyntaxConfig { + self.dot_matches_new_line = yes; + self + } + + /// Enable or disable the "swap greed" flag by default. + /// + /// When this is enabled, `.*` (for example) will become ungreedy and `.*?` + /// will become greedy. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `U` flag. + pub fn swap_greed(mut self, yes: bool) -> SyntaxConfig { + self.swap_greed = yes; + self + } + + /// Enable verbose mode in the regular expression. + /// + /// When enabled, verbose mode permits insigificant whitespace in many + /// places in the regular expression, as well as comments. Comments are + /// started using `#` and continue until the end of the line. + /// + /// By default, this is disabled. It may be selectively enabled in the + /// regular expression by using the `x` flag regardless of this setting. + pub fn ignore_whitespace(mut self, yes: bool) -> SyntaxConfig { + self.ignore_whitespace = yes; + self + } + + /// Enable or disable the Unicode flag (`u`) by default. + /// + /// By default this is **enabled**. It may alternatively be selectively + /// disabled in the regular expression itself via the `u` flag. + /// + /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by + /// default), a regular expression will fail to parse if Unicode mode is + /// disabled and a sub-expression could possibly match invalid UTF-8. + /// + /// **WARNING**: Unicode mode can greatly increase the size of the compiled + /// DFA, which can noticeably impact both memory usage and compilation + /// time. This is especially noticeable if your regex contains character + /// classes like `\w` that are impacted by whether Unicode is enabled or + /// not. If Unicode is not necessary, you are encouraged to disable it. + pub fn unicode(mut self, yes: bool) -> SyntaxConfig { + self.unicode = yes; + self + } + + /// When disabled, the builder will permit the construction of a regular + /// expression that may match invalid UTF-8. + /// + /// For example, when [`SyntaxConfig::unicode`] is disabled, then + /// expressions like `[^a]` may match invalid UTF-8 since they can match + /// any single byte that is not `a`. By default, these sub-expressions + /// are disallowed to avoid returning offsets that split a UTF-8 + /// encoded codepoint. However, in cases where matching at arbitrary + /// locations is desired, this option can be disabled to permit all such + /// sub-expressions. + /// + /// When enabled (the default), the builder is guaranteed to produce a + /// regex that will only ever match valid UTF-8 (otherwise, the builder + /// will return an error). + pub fn utf8(mut self, yes: bool) -> SyntaxConfig { + self.utf8 = yes; + self + } + + /// Set the nesting limit used for the regular expression parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow when building a finite automaton from a regular expression's + /// abstract syntax tree. In particular, construction currently uses + /// recursion. In the future, the implementation may stop using recursion + /// and this option will no longer be necessary. + /// + /// This limit is not checked until the entire AST is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since the parser will + /// limit itself to heap space proportional to the lenth of the pattern + /// string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation AST item, which results + /// in a nest depth of `1`. In general, a nest limit is not something that + /// manifests in an obvious way in the concrete syntax, therefore, it + /// should not be used in a granular way. + pub fn nest_limit(mut self, limit: u32) -> SyntaxConfig { + self.nest_limit = limit; + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\1` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(mut self, yes: bool) -> SyntaxConfig { + self.octal = yes; + self + } + + /// Returns whether "unicode" mode is enabled. + pub fn get_unicode(&self) -> bool { + self.unicode + } + + /// Returns whether "case insensitive" mode is enabled. + pub fn get_case_insensitive(&self) -> bool { + self.case_insensitive + } + + /// Returns whether "multi line" mode is enabled. + pub fn get_multi_line(&self) -> bool { + self.multi_line + } + + /// Returns whether "dot matches new line" mode is enabled. + pub fn get_dot_matches_new_line(&self) -> bool { + self.dot_matches_new_line + } + + /// Returns whether "swap greed" mode is enabled. + pub fn get_swap_greed(&self) -> bool { + self.swap_greed + } + + /// Returns whether "ignore whitespace" mode is enabled. + pub fn get_ignore_whitespace(&self) -> bool { + self.ignore_whitespace + } + + /// Returns whether UTF-8 mode is enabled. + pub fn get_utf8(&self) -> bool { + self.utf8 + } + + /// Returns the "nest limit" setting. + pub fn get_nest_limit(&self) -> u32 { + self.nest_limit + } + + /// Returns whether "octal" mode is enabled. + pub fn get_octal(&self) -> bool { + self.octal + } + + /// Applies this configuration to the given parser. + pub(crate) fn apply(&self, builder: &mut ParserBuilder) { + builder + .unicode(self.unicode) + .case_insensitive(self.case_insensitive) + .multi_line(self.multi_line) + .dot_matches_new_line(self.dot_matches_new_line) + .swap_greed(self.swap_greed) + .ignore_whitespace(self.ignore_whitespace) + .allow_invalid_utf8(!self.utf8) + .nest_limit(self.nest_limit) + .octal(self.octal); + } +} + +impl Default for SyntaxConfig { + fn default() -> SyntaxConfig { + SyntaxConfig::new() + } +} |