summaryrefslogtreecommitdiffstats
path: root/src/prompt_toolkit/contrib/regular_languages/regex_parser.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/prompt_toolkit/contrib/regular_languages/regex_parser.py')
-rw-r--r--src/prompt_toolkit/contrib/regular_languages/regex_parser.py282
1 files changed, 282 insertions, 0 deletions
diff --git a/src/prompt_toolkit/contrib/regular_languages/regex_parser.py b/src/prompt_toolkit/contrib/regular_languages/regex_parser.py
new file mode 100644
index 0000000..a365ba8
--- /dev/null
+++ b/src/prompt_toolkit/contrib/regular_languages/regex_parser.py
@@ -0,0 +1,282 @@
+"""
+Parser for parsing a regular expression.
+Take a string representing a regular expression and return the root node of its
+parse tree.
+
+usage::
+
+ root_node = parse_regex('(hello|world)')
+
+Remarks:
+- The regex parser is multiline-aware: it ignores all whitespace and supports
+  multiple named groups with the same name and #-style comments.
+
+Limitations:
+- Lookahead is not supported.
+"""
+from __future__ import annotations
+
+import re
+
+__all__ = [
+ "Repeat",
+ "Variable",
+ "Regex",
+ "Lookahead",
+ "tokenize_regex",
+ "parse_regex",
+]
+
+
class Node:
    """
    Abstract base for every node in the grammar tree.
    (Never instantiated directly; subclasses represent the grammar operations.)
    """

    def __add__(self, other_node: Node) -> NodeSequence:
        # "a + b" concatenates two grammars.
        return NodeSequence([self, other_node])

    def __or__(self, other_node: Node) -> AnyNode:
        # "a | b" takes the union (OR) of two grammars.
        return AnyNode([self, other_node])
+
+
class AnyNode(Node):
    """
    Union (OR operation) of several grammars.

    Not created directly; it is the result of a "Grammar1 | Grammar2"
    operation.
    """

    def __init__(self, children: list[Node]) -> None:
        # Alternatives of this union, in order.
        self.children = children

    def __or__(self, other_node: Node) -> AnyNode:
        # Flatten: extend this union instead of nesting a new AnyNode.
        return AnyNode([*self.children, other_node])

    def __repr__(self) -> str:
        return "{}({!r})".format(type(self).__name__, self.children)
+
+
class NodeSequence(Node):
    """
    Concatenation of several grammars.

    Not created directly; it is the result of a "Grammar1 + Grammar2"
    operation.
    """

    def __init__(self, children: list[Node]) -> None:
        # Concatenated sub-grammars, in order.
        self.children = children

    def __add__(self, other_node: Node) -> NodeSequence:
        # Flatten: extend this sequence instead of nesting a new NodeSequence.
        return NodeSequence([*self.children, other_node])

    def __repr__(self) -> str:
        return "{}({!r})".format(type(self).__name__, self.children)
+
+
class Regex(Node):
    """
    Leaf node wrapping a raw regular-expression snippet.
    """

    def __init__(self, regex: str) -> None:
        # Fail early on a malformed pattern (re.compile raises re.error).
        re.compile(regex)
        self.regex = regex

    def __repr__(self) -> str:
        return "{}(/{}/)".format(type(self).__name__, self.regex)
+
+
class Lookahead(Node):
    """
    Lookahead assertion around a child grammar.

    :param childnode: The grammar that must (or must not) follow.
    :param negative: When True, this is a negative lookahead.
    """

    def __init__(self, childnode: Node, negative: bool = False) -> None:
        self.childnode = childnode
        self.negative = negative

    def __repr__(self) -> str:
        return "{}({!r})".format(type(self).__name__, self.childnode)
+
+
class Variable(Node):
    """
    Mark a variable in the regular grammar. This will be translated into a
    named group. Each variable can have its own completer, validator, etc.

    :param childnode: The grammar which is wrapped inside this variable.
    :param varname: String name of the variable.
    """

    def __init__(self, childnode: Node, varname: str = "") -> None:
        self.childnode = childnode
        self.varname = varname

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}"
            f"(childnode={self.childnode!r}, varname={self.varname!r})"
        )
+
+
class Repeat(Node):
    """
    Repetition of a child grammar (the *, +, ? and {m,n} operators).

    :param childnode: The grammar being repeated.
    :param min_repeat: Minimum number of occurrences.
    :param max_repeat: Maximum number of occurrences, or `None` for unbounded.
    :param greedy: False for a non-greedy repetition.
    """

    def __init__(
        self,
        childnode: Node,
        min_repeat: int = 0,
        max_repeat: int | None = None,
        greedy: bool = True,
    ) -> None:
        self.childnode = childnode
        self.min_repeat = min_repeat
        self.max_repeat = max_repeat
        self.greedy = greedy

    def __repr__(self) -> str:
        return "{}(childnode={!r})".format(type(self).__name__, self.childnode)
+
+
+def tokenize_regex(input: str) -> list[str]:
+ """
+ Takes a string, representing a regular expression as input, and tokenizes
+ it.
+
+ :param input: string, representing a regular expression.
+ :returns: List of tokens.
+ """
+ # Regular expression for tokenizing other regular expressions.
+ p = re.compile(
+ r"""^(
+ \(\?P\<[a-zA-Z0-9_-]+\> | # Start of named group.
+ \(\?#[^)]*\) | # Comment
+ \(\?= | # Start of lookahead assertion
+ \(\?! | # Start of negative lookahead assertion
+ \(\?<= | # If preceded by.
+ \(\?< | # If not preceded by.
+ \(?: | # Start of group. (non capturing.)
+ \( | # Start of group.
+ \(?[iLmsux] | # Flags.
+ \(?P=[a-zA-Z]+\) | # Back reference to named group
+ \) | # End of group.
+ \{[^{}]*\} | # Repetition
+ \*\? | \+\? | \?\?\ | # Non greedy repetition.
+ \* | \+ | \? | # Repetition
+ \#.*\n | # Comment
+ \\. |
+
+ # Character group.
+ \[
+ ( [^\]\\] | \\.)*
+ \] |
+
+ [^(){}] |
+ .
+ )""",
+ re.VERBOSE,
+ )
+
+ tokens = []
+
+ while input:
+ m = p.match(input)
+ if m:
+ token, input = input[: m.end()], input[m.end() :]
+ if not token.isspace():
+ tokens.append(token)
+ else:
+ raise Exception("Could not tokenize input regex.")
+
+ return tokens
+
+
def parse_regex(regex_tokens: list[str]) -> Node:
    """
    Takes a list of tokens from the tokenizer, and returns a parse tree.

    :param regex_tokens: Token list, as produced by `tokenize_regex`.
    :returns: The root `Node` of the parse tree.
    :raises Exception: On unbalanced parentheses, a repetition operator with
        nothing before it, or an unsupported construct.
    """
    # We add a closing brace because that represents the final pop of the stack.
    # Tokens are reversed so that list.pop() yields them in input order.
    tokens: list[str] = [")"] + regex_tokens[::-1]

    def wrap(lst: list[Node]) -> Node:
        """Turn list into sequence when it contains several items."""
        if len(lst) == 1:
            return lst[0]
        else:
            return NodeSequence(lst)

    def _parse() -> Node:
        # Parse one group: consumes tokens up to (and including) the ')'.
        or_list: list[list[Node]] = []
        result: list[Node] = []

        def wrapped_result() -> Node:
            if or_list == []:
                return wrap(result)
            else:
                or_list.append(result)
                return AnyNode([wrap(i) for i in or_list])

        while tokens:
            t = tokens.pop()

            if t.startswith("(?P<"):
                # Named group: wrap the group's contents in a Variable.
                variable = Variable(_parse(), varname=t[4:-1])
                result.append(variable)

            elif t in ("*", "*?"):
                if result == []:
                    raise Exception("Nothing to repeat." + repr(tokens))
                greedy = t == "*"
                result[-1] = Repeat(result[-1], greedy=greedy)

            elif t in ("+", "+?"):
                if result == []:
                    raise Exception("Nothing to repeat." + repr(tokens))
                greedy = t == "+"
                result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy)

            elif t in ("?", "??"):
                if result == []:
                    raise Exception("Nothing to repeat." + repr(tokens))
                else:
                    greedy = t == "?"
                    result[-1] = Repeat(
                        result[-1], min_repeat=0, max_repeat=1, greedy=greedy
                    )

            elif t == "|":
                # Remember the alternative parsed so far; start a fresh one.
                or_list.append(result)
                result = []

            elif t in ("(", "(?:"):
                result.append(_parse())

            elif t == "(?!":
                result.append(Lookahead(_parse(), negative=True))

            elif t == "(?=":
                result.append(Lookahead(_parse(), negative=False))

            elif t == ")":
                return wrapped_result()

            elif t.startswith("#"):
                # '#'-style comment: ignore.
                pass

            elif t.startswith("(?#"):
                # '(?#...)' inline comment: ignore.
                pass

            elif t.startswith("{"):
                # TODO: implement!
                raise Exception(f"{t}-style repetition not yet supported")

            elif t.startswith("(?"):
                # Anything else that looks like an extension (flags,
                # lookbehind, back references) is not supported.
                raise Exception("%r not supported" % t)

            elif t.isspace():
                pass
            else:
                result.append(Regex(t))

        raise Exception("Expecting ')' token")

    result = _parse()

    if len(tokens) != 0:
        # Leftover tokens mean an extra ')' closed the implicit outer group.
        raise Exception("Unmatched parentheses.")
    else:
        return result