"""
Parser for parsing a regular expression.
Take a string representing a regular expression and return the root node of its
parse tree.

usage::

    root_node = parse_regex('(hello|world)')

Remarks:
- The regex parser processes multiline, it ignores all whitespace and supports
  multiple named groups with the same name and #-style comments.

Limitations:
- Lookahead is not supported.
"""
from __future__ import annotations

import re

__all__ = [
    "Repeat",
    "Variable",
    "Regex",
    "Lookahead",
    "tokenize_regex",
    "parse_regex",
]


class Node:
    """
    Base class for all the grammar nodes.
    (You don't initialize this one.)
    """

    def __add__(self, other_node: Node) -> NodeSequence:
        # `a + b` builds a concatenation node.
        return NodeSequence([self, other_node])

    def __or__(self, other_node: Node) -> AnyNode:
        # `a | b` builds a union node.
        return AnyNode([self, other_node])


class AnyNode(Node):
    """
    Union operation (OR operation) between several grammars. You don't
    initialize this yourself, but it's a result of a "Grammar1 | Grammar2"
    operation.
    """

    def __init__(self, children: list[Node]) -> None:
        self.children = children

    def __or__(self, other_node: Node) -> AnyNode:
        # Flatten chained unions: (a | b) | c keeps one AnyNode.
        return AnyNode(self.children + [other_node])

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.children!r})"


class NodeSequence(Node):
    """
    Concatenation operation of several grammars. You don't initialize this
    yourself, but it's a result of a "Grammar1 + Grammar2" operation.
    """

    def __init__(self, children: list[Node]) -> None:
        self.children = children

    def __add__(self, other_node: Node) -> NodeSequence:
        # Flatten chained concatenations: (a + b) + c keeps one NodeSequence.
        return NodeSequence(self.children + [other_node])

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.children!r})"


class Regex(Node):
    """
    Regular expression leaf node.

    :param regex: String, a valid regular expression fragment.
    :raises re.error: When the given string is not a valid regex.
    """

    def __init__(self, regex: str) -> None:
        re.compile(regex)  # Validate early; raises re.error on bad input.

        self.regex = regex

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(/{self.regex}/)"


class Lookahead(Node):
    """
    Lookahead expression.

    :param childnode: The grammar inside the lookahead assertion.
    :param negative: Bool, True for a negative lookahead ``(?!...)``.
    """

    def __init__(self, childnode: Node, negative: bool = False) -> None:
        self.childnode = childnode
        self.negative = negative

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.childnode!r})"


class Variable(Node):
    """
    Mark a variable in the regular grammar. This will be translated into a
    named group. Each variable can have his own completer, validator, etc..

    :param childnode: The grammar which is wrapped inside this variable.
    :param varname: String.
    """

    def __init__(self, childnode: Node, varname: str = "") -> None:
        self.childnode = childnode
        self.varname = varname

    def __repr__(self) -> str:
        return "{}(childnode={!r}, varname={!r})".format(
            self.__class__.__name__,
            self.childnode,
            self.varname,
        )


class Repeat(Node):
    """
    Repetition of a grammar (``*``, ``+``, ``?`` and their non-greedy forms).

    :param childnode: The grammar being repeated.
    :param min_repeat: Int, minimum number of repetitions.
    :param max_repeat: Int or None; None means unbounded.
    :param greedy: Bool, False for the ``*?`` / ``+?`` / ``??`` variants.
    """

    def __init__(
        self,
        childnode: Node,
        min_repeat: int = 0,
        max_repeat: int | None = None,
        greedy: bool = True,
    ) -> None:
        self.childnode = childnode
        self.min_repeat = min_repeat
        self.max_repeat = max_repeat
        self.greedy = greedy

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(childnode={self.childnode!r})"


def tokenize_regex(input: str) -> list[str]:
    """
    Takes a string, representing a regular expression as input, and tokenizes
    it.

    :param input: string, representing a regular expression.
    :returns: List of tokens.
    :raises Exception: When the input cannot be tokenized.
    """
    # Regular expression for tokenizing other regular expressions.
    # NOTE: alternation is first-match, so the multi-character "(?..." forms
    # must all come before the bare "\(" alternative.
    p = re.compile(
        r"""^(
        \(\?P\<[a-zA-Z0-9_-]+\> | # Start of named group.
        \(\?#[^)]*\)            | # Comment
        \(\?=                   | # Start of lookahead assertion
        \(\?!                   | # Start of negative lookahead assertion
        \(\?<=                  | # If preceded by.
        \(\?<                   | # If not preceded by.
        \(\?:                   | # Start of group. (non capturing.)
        \(\?[iLmsux]            | # Flags.
        \(\?P=[a-zA-Z]+\)       | # Back reference to named group
        \(                      | # Start of group.
        \)                      | # End of group.
        \{[^{}]*\}              | # Repetition
        \*\? | \+\? | \?\?      | # Non greedy repetition.
        \* | \+ | \?            | # Repetition
        \#.*\n                  | # Comment
        \\.                     |

        # Character group.
        \[
            ( [^\]\\] | \\.)*
        \]                      |

        [^(){}]                 |
        .
        )""",
        re.VERBOSE,
    )

    tokens = []

    while input:
        m = p.match(input)
        if m:
            token, input = input[: m.end()], input[m.end() :]
            # Whitespace is insignificant in the grammar language; drop it.
            if not token.isspace():
                tokens.append(token)
        else:
            raise Exception("Could not tokenize input regex.")

    return tokens


def parse_regex(regex_tokens: list[str]) -> Node:
    """
    Takes a list of tokens from the tokenizer, and returns a parse tree.

    :param regex_tokens: List of tokens, as produced by :func:`tokenize_regex`.
    :returns: The root :class:`Node` of the parse tree.
    :raises Exception: On unbalanced parentheses, dangling repetition
        operators, or unsupported constructs.
    """
    # We add a closing brace because that represents the final pop of the stack.
    # Tokens are reversed so `pop()` consumes them left-to-right.
    tokens: list[str] = [")"] + regex_tokens[::-1]

    def wrap(lst: list[Node]) -> Node:
        """Turn list into sequence when it contains several items."""
        if len(lst) == 1:
            return lst[0]
        else:
            return NodeSequence(lst)

    def _parse() -> Node:
        # Parses tokens until the matching ")"; recursion handles nesting.
        or_list: list[list[Node]] = []
        result: list[Node] = []

        def wrapped_result() -> Node:
            if or_list == []:
                return wrap(result)
            else:
                or_list.append(result)
                return AnyNode([wrap(i) for i in or_list])

        while tokens:
            t = tokens.pop()

            if t.startswith("(?P<"):
                # Named group becomes a grammar variable.
                variable = Variable(_parse(), varname=t[4:-1])
                result.append(variable)

            elif t in ("*", "*?"):
                greedy = t == "*"
                result[-1] = Repeat(result[-1], greedy=greedy)

            elif t in ("+", "+?"):
                greedy = t == "+"
                result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy)

            elif t in ("?", "??"):
                if result == []:
                    raise Exception("Nothing to repeat." + repr(tokens))
                else:
                    greedy = t == "?"
                    result[-1] = Repeat(
                        result[-1], min_repeat=0, max_repeat=1, greedy=greedy
                    )

            elif t == "|":
                # Start a new alternative; the current one is saved.
                or_list.append(result)
                result = []

            elif t in ("(", "(?:"):
                result.append(_parse())

            elif t == "(?!":
                result.append(Lookahead(_parse(), negative=True))

            elif t == "(?=":
                result.append(Lookahead(_parse(), negative=False))

            elif t == ")":
                return wrapped_result()

            elif t.startswith("#"):
                # #-style comment: ignored.
                pass

            elif t.startswith("{"):
                # TODO: implement!
                raise Exception(f"{t}-style repetition not yet supported")

            elif t.startswith("(?"):
                # Flags, back references, lookbehind: not supported.
                raise Exception("%r not supported" % t)

            elif t.isspace():
                pass
            else:
                result.append(Regex(t))

        raise Exception("Expecting ')' token")

    result = _parse()

    if len(tokens) != 0:
        raise Exception("Unmatched parentheses.")
    else:
        return result