summaryrefslogtreecommitdiffstats
path: root/src/prompt_toolkit/contrib/regular_languages/regex_parser.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/prompt_toolkit/contrib/regular_languages/regex_parser.py')
-rw-r--r--src/prompt_toolkit/contrib/regular_languages/regex_parser.py282
1 files changed, 282 insertions, 0 deletions
diff --git a/src/prompt_toolkit/contrib/regular_languages/regex_parser.py b/src/prompt_toolkit/contrib/regular_languages/regex_parser.py
new file mode 100644
index 0000000..a365ba8
--- /dev/null
+++ b/src/prompt_toolkit/contrib/regular_languages/regex_parser.py
@@ -0,0 +1,282 @@
+"""
+Parser for parsing a regular expression.
+Take a string representing a regular expression and return the root node of its
+parse tree.
+
+usage::
+
+ root_node = parse_regex('(hello|world)')
+
+Remarks:
+- The regex parser is multiline-aware: it ignores all whitespace and supports
+  multiple named groups with the same name and #-style comments.
+
+Limitations:
+- Lookahead is not supported.
+"""
+from __future__ import annotations
+
+import re
+
+__all__ = [
+ "Repeat",
+ "Variable",
+ "Regex",
+ "Lookahead",
+ "tokenize_regex",
+ "parse_regex",
+]
+
+
class Node:
    """
    Abstract base for every node in the grammar tree.
    (Never instantiated directly; subclasses represent the grammar operations.)
    """

    def __add__(self, other_node: Node) -> NodeSequence:
        # "a + b" concatenates two grammars.
        return NodeSequence([self, other_node])

    def __or__(self, other_node: Node) -> AnyNode:
        # "a | b" takes the union (OR) of two grammars.
        return AnyNode([self, other_node])
+
+
class AnyNode(Node):
    """
    Union (OR operation) of several grammars.

    Not created directly; it is the result of a "Grammar1 | Grammar2"
    operation.
    """

    def __init__(self, children: list[Node]) -> None:
        # Alternatives of this union, in order.
        self.children = children

    def __or__(self, other_node: Node) -> AnyNode:
        # Flatten: extend this union instead of nesting a new AnyNode.
        return AnyNode([*self.children, other_node])

    def __repr__(self) -> str:
        return "{}({!r})".format(type(self).__name__, self.children)
+
+
class NodeSequence(Node):
    """
    Concatenation of several grammars.

    Not created directly; it is the result of a "Grammar1 + Grammar2"
    operation.
    """

    def __init__(self, children: list[Node]) -> None:
        # Concatenated sub-grammars, in order.
        self.children = children

    def __add__(self, other_node: Node) -> NodeSequence:
        # Flatten: extend this sequence instead of nesting a new NodeSequence.
        return NodeSequence([*self.children, other_node])

    def __repr__(self) -> str:
        return "{}({!r})".format(type(self).__name__, self.children)
+
+
class Regex(Node):
    """
    Leaf node wrapping a raw regular-expression snippet.
    """

    def __init__(self, regex: str) -> None:
        # Fail early on a malformed pattern (re.compile raises re.error).
        re.compile(regex)
        self.regex = regex

    def __repr__(self) -> str:
        return "{}(/{}/)".format(type(self).__name__, self.regex)
+
+
class Lookahead(Node):
    """
    Lookahead assertion around a child grammar.

    :param childnode: The grammar that must (or must not) follow.
    :param negative: When True, this is a negative lookahead.
    """

    def __init__(self, childnode: Node, negative: bool = False) -> None:
        self.childnode = childnode
        self.negative = negative

    def __repr__(self) -> str:
        return "{}({!r})".format(type(self).__name__, self.childnode)
+
+
class Variable(Node):
    """
    Mark a variable in the regular grammar. This will be translated into a
    named group. Each variable can have its own completer, validator, etc.

    :param childnode: The grammar which is wrapped inside this variable.
    :param varname: String name of the variable.
    """

    def __init__(self, childnode: Node, varname: str = "") -> None:
        self.childnode = childnode
        self.varname = varname

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}"
            f"(childnode={self.childnode!r}, varname={self.varname!r})"
        )
+
+
class Repeat(Node):
    """
    Repetition of a child grammar (the *, +, ? and {m,n} operators).

    :param childnode: The grammar being repeated.
    :param min_repeat: Minimum number of occurrences.
    :param max_repeat: Maximum number of occurrences, or `None` for unbounded.
    :param greedy: False for a non-greedy repetition.
    """

    def __init__(
        self,
        childnode: Node,
        min_repeat: int = 0,
        max_repeat: int | None = None,
        greedy: bool = True,
    ) -> None:
        self.childnode = childnode
        self.min_repeat = min_repeat
        self.max_repeat = max_repeat
        self.greedy = greedy

    def __repr__(self) -> str:
        return "{}(childnode={!r})".format(type(self).__name__, self.childnode)
+
+
+def tokenize_regex(input: str) -> list[str]:
+ """
+ Takes a string, representing a regular expression as input, and tokenizes
+ it.
+
+ :param input: string, representing a regular expression.
+ :returns: List of tokens.
+ """
+ # Regular expression for tokenizing other regular expressions.
+ p = re.compile(
+ r"""^(
+ \(\?P\<[a-zA-Z0-9_-]+\> | # Start of named group.
+ \(\?#[^)]*\) | # Comment
+ \(\?= | # Start of lookahead assertion
+ \(\?! | # Start of negative lookahead assertion
+ \(\?<= | # If preceded by.
+ \(\?< | # If not preceded by.
+ \(?: | # Start of group. (non capturing.)
+ \( | # Start of group.
+ \(?[iLmsux] | # Flags.
+ \(?P=[a-zA-Z]+\) | # Back reference to named group
+ \) | # End of group.
+ \{[^{}]*\} | # Repetition
+ \*\? | \+\? | \?\?\ | # Non greedy repetition.
+ \* | \+ | \? | # Repetition
+ \#.*\n | # Comment
+ \\. |
+
+ # Character group.
+ \[
+ ( [^\]\\] | \\.)*
+ \] |
+
+ [^(){}] |
+ .
+ )""",
+ re.VERBOSE,
+ )
+
+ tokens = []
+
+ while input:
+ m = p.match(input)
+ if m:
+ token, input = input[: m.end()], input[m.end() :]
+ if not token.isspace():
+ tokens.append(token)
+ else:
+ raise Exception("Could not tokenize input regex.")
+
+ return tokens
+
+
def parse_regex(regex_tokens: list[str]) -> Node:
    """
    Takes a list of tokens from the tokenizer, and returns a parse tree.

    :param regex_tokens: Token list, as produced by `tokenize_regex`.
    :returns: The root `Node` of the parse tree.
    :raises Exception: On unbalanced parentheses, a repetition operator with
        nothing before it, or an unsupported construct.
    """
    # We add a closing brace because that represents the final pop of the stack.
    # Tokens are reversed so that list.pop() yields them in input order.
    tokens: list[str] = [")"] + regex_tokens[::-1]

    def wrap(lst: list[Node]) -> Node:
        """Turn list into sequence when it contains several items."""
        if len(lst) == 1:
            return lst[0]
        else:
            return NodeSequence(lst)

    def _parse() -> Node:
        # Parse one group: consumes tokens up to (and including) the ')'.
        or_list: list[list[Node]] = []
        result: list[Node] = []

        def wrapped_result() -> Node:
            if or_list == []:
                return wrap(result)
            else:
                or_list.append(result)
                return AnyNode([wrap(i) for i in or_list])

        while tokens:
            t = tokens.pop()

            if t.startswith("(?P<"):
                # Named group: wrap the group's contents in a Variable.
                variable = Variable(_parse(), varname=t[4:-1])
                result.append(variable)

            elif t in ("*", "*?"):
                if result == []:
                    raise Exception("Nothing to repeat." + repr(tokens))
                greedy = t == "*"
                result[-1] = Repeat(result[-1], greedy=greedy)

            elif t in ("+", "+?"):
                if result == []:
                    raise Exception("Nothing to repeat." + repr(tokens))
                greedy = t == "+"
                result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy)

            elif t in ("?", "??"):
                if result == []:
                    raise Exception("Nothing to repeat." + repr(tokens))
                else:
                    greedy = t == "?"
                    result[-1] = Repeat(
                        result[-1], min_repeat=0, max_repeat=1, greedy=greedy
                    )

            elif t == "|":
                # Remember the alternative parsed so far; start a fresh one.
                or_list.append(result)
                result = []

            elif t in ("(", "(?:"):
                result.append(_parse())

            elif t == "(?!":
                result.append(Lookahead(_parse(), negative=True))

            elif t == "(?=":
                result.append(Lookahead(_parse(), negative=False))

            elif t == ")":
                return wrapped_result()

            elif t.startswith("#"):
                # '#'-style comment: ignore.
                pass

            elif t.startswith("(?#"):
                # '(?#...)' inline comment: ignore.
                pass

            elif t.startswith("{"):
                # TODO: implement!
                raise Exception(f"{t}-style repetition not yet supported")

            elif t.startswith("(?"):
                # Anything else that looks like an extension (flags,
                # lookbehind, back references) is not supported.
                raise Exception("%r not supported" % t)

            elif t.isspace():
                pass
            else:
                result.append(Regex(t))

        raise Exception("Expecting ')' token")

    result = _parse()

    if len(tokens) != 0:
        # Leftover tokens mean an extra ')' closed the implicit outer group.
        raise Exception("Unmatched parentheses.")
    else:
        return result