Diffstat (limited to 'src/prompt_toolkit/contrib/regular_languages')
-rw-r--r--  src/prompt_toolkit/contrib/regular_languages/__init__.py      79
-rw-r--r--  src/prompt_toolkit/contrib/regular_languages/compiler.py     571
-rw-r--r--  src/prompt_toolkit/contrib/regular_languages/completion.py    94
-rw-r--r--  src/prompt_toolkit/contrib/regular_languages/lexer.py         93
-rw-r--r--  src/prompt_toolkit/contrib/regular_languages/regex_parser.py 282
-rw-r--r--  src/prompt_toolkit/contrib/regular_languages/validation.py    59
6 files changed, 1178 insertions, 0 deletions
diff --git a/src/prompt_toolkit/contrib/regular_languages/__init__.py b/src/prompt_toolkit/contrib/regular_languages/__init__.py
new file mode 100644
index 0000000..c947fd5
--- /dev/null
+++ b/src/prompt_toolkit/contrib/regular_languages/__init__.py
@@ -0,0 +1,79 @@
+r"""
+Tool for expressing the grammar of an input as a regular language.
+==================================================================
+
+The grammar for the input of many simple command line interfaces can be
+expressed by a regular language. Examples are PDB (the Python debugger); a
+simple (bash-like) shell with "pwd", "cd", "cat" and "ls" commands; arguments
+that you can pass to an executable; etc. It is possible to use regular
+expressions for validation and parsing of such a grammar. (More about regular
+languages: http://en.wikipedia.org/wiki/Regular_language)
+
+Example
+-------
+
+Let's take the pwd/cd/cat/ls example. We want to have a shell that accepts
+these four commands. "cd" is followed by a quoted directory name and "cat" is
+followed by a quoted file name. (We allow quotes inside the filename when
+they're escaped with a backslash.) We could define the grammar using the
+following regular expression::
+
+ grammar = \s* (
+ pwd |
+ ls |
+ (cd \s+ " ([^"]|\\.)+ ") |
+ (cat \s+ " ([^"]|\\.)+ ")
+ ) \s*
+
+
+What can we do with this grammar?
+---------------------------------
+
+- Syntax highlighting: We could use this for instance to give file names
+ a different color.
+- Parse the result: .. We can extract the file names and commands by using a
+ regular expression with named groups.
+- Input validation: .. Don't accept anything that does not match this grammar.
+ When combined with a parser, we can also recursively do
+ filename validation (and accept only existing files.)
+- Autocompletion: .... Each part of the grammar can have its own autocompleter.
+ "cat" has to be completed using file names, while "cd"
+ has to be completed using directory names.
+
+How does it work?
+-----------------
+
+As a user of this library, you have to define the grammar of the input as a
+regular expression. The parts of this grammar where autocompletion, validation
+or any other processing is required need to be marked using a regex named
+group. Like ``(?P<varname>...)`` for instance.
+
+When the input is processed for validation (for instance), the regex will
+execute, the named group is captured, and the validator associated with this
+named group will test the captured string.
+
+There is one tricky bit:
+
+ Often we operate on incomplete input (this is by definition the case for
+ autocompletion) and we have to decide, for the given cursor position, in
+ which possible state of the grammar the input could be and in which way
+ variables could be matched up to that point.
+
+To solve this problem, the compiler takes the original regular expression and
+translates it into a set of other regular expressions which each match certain
+prefixes of the original regular expression. We generate one prefix regular
+expression for every named variable (with this variable being the end of that
+expression).
+
+
+TODO: some examples of:
+ - How to create a highlighter from this grammar.
+ - How to create a validator from this grammar.
+ - How to create an autocompleter from this grammar.
+ - How to create a parser from this grammar.
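+
+Until those examples exist, here is a minimal end-to-end sketch of how the
+pieces in this package can be wired together. (Illustrative only: the grammar,
+the completion words and the style class names below are made up.)::
+
+ from prompt_toolkit import PromptSession
+ from prompt_toolkit.completion import WordCompleter
+ from prompt_toolkit.lexers import SimpleLexer
+ from prompt_toolkit.validation import Validator
+
+ from prompt_toolkit.contrib.regular_languages.compiler import compile
+ from prompt_toolkit.contrib.regular_languages.completion import GrammarCompleter
+ from prompt_toolkit.contrib.regular_languages.lexer import GrammarLexer
+ from prompt_toolkit.contrib.regular_languages.validation import GrammarValidator
+
+ # A toy shell grammar with a "command" variable and an optional quoted "filename".
+ g = compile(r'\s* (?P<command>pwd|ls|cd|cat) (\s+ "(?P<filename>[^"]*)")? \s*')
+
+ completer = GrammarCompleter(g, {"command": WordCompleter(["pwd", "ls", "cd", "cat"])})
+ lexer = GrammarLexer(g, lexers={"command": SimpleLexer(style="class:command"),
+ "filename": SimpleLexer(style="class:filename")})
+ validator = GrammarValidator(g, {"filename": Validator.from_callable(bool, error_message="Filename required.")})
+
+ session = PromptSession(completer=completer, lexer=lexer, validator=validator)
+ session.prompt("> ")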
+"""
+from __future__ import annotations
+
+from .compiler import compile
+
+__all__ = ["compile"]
diff --git a/src/prompt_toolkit/contrib/regular_languages/compiler.py b/src/prompt_toolkit/contrib/regular_languages/compiler.py
new file mode 100644
index 0000000..474f6cf
--- /dev/null
+++ b/src/prompt_toolkit/contrib/regular_languages/compiler.py
@@ -0,0 +1,571 @@
+r"""
+Compiler for a regular grammar.
+
+Example usage::
+
+ # Create and compile grammar.
+ p = compile('add \s+ (?P<var1>[^\s]+) \s+ (?P<var2>[^\s]+)')
+
+ # Match input string.
+ m = p.match('add 23 432')
+
+ # Get variables.
+ m.variables().get('var1') # Returns "23"
+ m.variables().get('var2') # Returns "432"
+
+
+Partial matches are possible::
+
+ # Create and compile grammar.
+ p = compile('''
+ # Operators with two arguments.
+ ((?P<operator1>[^\s]+) \s+ (?P<var1>[^\s]+) \s+ (?P<var2>[^\s]+)) |
+
+ # Operators with only one argument.
+ ((?P<operator2>[^\s]+) \s+ (?P<var1>[^\s]+))
+ ''')
+
+ # Match partial input string.
+ m = p.match_prefix('add 23')
+
+ # Get variables. (Notice that both operator1 and operator2 contain the
+ # value "add". This is because our input is incomplete, and we don't know
+ # yet in which rule of the regex we'll end up. It could also be that
+ # `operator1` and `operator2` have a different autocompleter and we want to
+ # call all possible autocompleters that would result in valid input.)
+ m.variables().get('var1') # Returns "23"
+ m.variables().get('operator1') # Returns "add"
+ m.variables().get('operator2') # Returns "add"
+
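+Escaping is also supported. `compile` accepts `escape_funcs` and
+`unescape_funcs` dictionaries that map variable names to callables; these are
+applied through the `escape`/`unescape` methods of the compiled grammar and by
+the grammar completer when it inserts completions. A small sketch (the quoting
+scheme below is made up for illustration)::
+
+ # Filenames appear between double quotes in this grammar, so escape embedded
+ # quotes when inserting a value and strip the escapes when reading it back.
+ p = compile(
+ r'cat \s+ "(?P<filename>([^"]|\\.)*)"',
+ escape_funcs={"filename": lambda s: s.replace('"', '\\"')},
+ unescape_funcs={"filename": lambda s: s.replace('\\"', '"')},
+ )
+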
+"""
+from __future__ import annotations
+
+import re
+from typing import Callable, Dict, Iterable, Iterator, Pattern
+from typing import Match as RegexMatch
+
+from .regex_parser import (
+ AnyNode,
+ Lookahead,
+ Node,
+ NodeSequence,
+ Regex,
+ Repeat,
+ Variable,
+ parse_regex,
+ tokenize_regex,
+)
+
+__all__ = [
+ "compile",
+]
+
+
+# Name of the named group in the regex, matching trailing input.
+# (Trailing input consists of characters that remain after the grammar has
+# already been matched.)
+_INVALID_TRAILING_INPUT = "invalid_trailing"
+
+EscapeFuncDict = Dict[str, Callable[[str], str]]
+
+
+class _CompiledGrammar:
+ """
+ Compiles a grammar. This will take the parse tree of a regular expression
+ and compile the grammar.
+
+ :param root_node: :class:`~.regex_parser.Node` instance.
+ :param escape_funcs: `dict` mapping variable names to escape callables.
+ :param unescape_funcs: `dict` mapping variable names to unescape callables.
+ """
+
+ def __init__(
+ self,
+ root_node: Node,
+ escape_funcs: EscapeFuncDict | None = None,
+ unescape_funcs: EscapeFuncDict | None = None,
+ ) -> None:
+ self.root_node = root_node
+ self.escape_funcs = escape_funcs or {}
+ self.unescape_funcs = unescape_funcs or {}
+
+ #: Dictionary that will map the regex group names to the variable names.
+ self._group_names_to_nodes: dict[
+ str, str
+ ] = {} # Maps regex group names to varnames.
+ counter = [0]
+
+ def create_group_func(node: Variable) -> str:
+ name = "n%s" % counter[0]
+ self._group_names_to_nodes[name] = node.varname
+ counter[0] += 1
+ return name
+
+ # Compile regex strings.
+ self._re_pattern = "^%s$" % self._transform(root_node, create_group_func)
+ self._re_prefix_patterns = list(
+ self._transform_prefix(root_node, create_group_func)
+ )
+
+ # Compile the regex itself.
+ flags = re.DOTALL # Note that we don't need re.MULTILINE! (^ and $
+ # still represent the start and end of input text.)
+ self._re = re.compile(self._re_pattern, flags)
+ self._re_prefix = [re.compile(t, flags) for t in self._re_prefix_patterns]
+
+ # We compile one more set of regexes, similar to `_re_prefix`, but accepting any trailing
+ # input. This ensures that we can still highlight the input correctly, even when the
+ # input contains some additional characters at the end that don't match the grammar.
+ self._re_prefix_with_trailing_input = [
+ re.compile(
+ r"(?:{})(?P<{}>.*?)$".format(t.rstrip("$"), _INVALID_TRAILING_INPUT),
+ flags,
+ )
+ for t in self._re_prefix_patterns
+ ]
+
+ def escape(self, varname: str, value: str) -> str:
+ """
+ Escape `value` so that it fits in the place of this variable in the grammar.
+ """
+ f = self.escape_funcs.get(varname)
+ return f(value) if f else value
+
+ def unescape(self, varname: str, value: str) -> str:
+ """
+ Unescape `value`.
+ """
+ f = self.unescape_funcs.get(varname)
+ return f(value) if f else value
+
+ @classmethod
+ def _transform(
+ cls, root_node: Node, create_group_func: Callable[[Variable], str]
+ ) -> str:
+ """
+ Turn a :class:`Node` object into a regular expression.
+
+ :param root_node: The :class:`Node` instance for which we generate the grammar.
+ :param create_group_func: A callable which takes a `Node` and returns the next
+ free name for this node.
+ """
+
+ def transform(node: Node) -> str:
+ # Turn `AnyNode` into an OR.
+ if isinstance(node, AnyNode):
+ return "(?:%s)" % "|".join(transform(c) for c in node.children)
+
+ # Concatenate a `NodeSequence`
+ elif isinstance(node, NodeSequence):
+ return "".join(transform(c) for c in node.children)
+
+ # For Regex and Lookahead nodes, just insert them literally.
+ elif isinstance(node, Regex):
+ return node.regex
+
+ elif isinstance(node, Lookahead):
+ before = "(?!" if node.negative else "(?="
+ return before + transform(node.childnode) + ")"
+
+ # A `Variable` wraps the children into a named group.
+ elif isinstance(node, Variable):
+ return f"(?P<{create_group_func(node)}>{transform(node.childnode)})"
+
+ # `Repeat`.
+ elif isinstance(node, Repeat):
+ if node.max_repeat is None:
+ if node.min_repeat == 0:
+ repeat_sign = "*"
+ elif node.min_repeat == 1:
+ repeat_sign = "+"
+ else:
+ repeat_sign = "{%i,%s}" % (
+ node.min_repeat,
+ ("" if node.max_repeat is None else str(node.max_repeat)),
+ )
+
+ return "(?:{}){}{}".format(
+ transform(node.childnode),
+ repeat_sign,
+ ("" if node.greedy else "?"),
+ )
+ else:
+ raise TypeError(f"Got {node!r}")
+
+ return transform(root_node)
+
+ @classmethod
+ def _transform_prefix(
+ cls, root_node: Node, create_group_func: Callable[[Variable], str]
+ ) -> Iterable[str]:
+ """
+ Yield all the regular expressions matching a prefix of the grammar
+ defined by the `Node` instance.
+
+ For each `Variable`, one regex pattern will be generated, with this
+ named group at the end. This is required because a regex engine will
+ terminate once a match is found. For autocompletion however, we need
+ the matches for all possible paths, so that we can provide completions
+ for each `Variable`.
+
+ - So, in the case of an `Any` (`A|B|C`), we generate a pattern for each
+ clause: one for `A`, one for `B` and one for `C`. Clauses that don't
+ contain a `Variable` can be merged together into one pattern.
+ - In the case of a `NodeSequence` (`ABC`), we generate a pattern for
+ each prefix that ends with a variable, and one pattern for the whole
+ sequence. So, that's one for `A`, one for `AB` and one for `ABC`.
+
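+ For example, for a grammar like `ab(?P<var>c)`, this yields (roughly) one
+ pattern in which `var` is the last named group and may match just a prefix
+ of `c` (so it can match "ab" or "abc" before the cursor), plus one merged
+ pattern in which the trailing components may be missing altogether.
+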
+ :param root_node: The :class:`Node` instance for which we generate the grammar.
+ :param create_group_func: A callable which takes a `Node` and returns the next
+ free name for this node.
+ """
+
+ def contains_variable(node: Node) -> bool:
+ if isinstance(node, Regex):
+ return False
+ elif isinstance(node, Variable):
+ return True
+ elif isinstance(node, (Lookahead, Repeat)):
+ return contains_variable(node.childnode)
+ elif isinstance(node, (NodeSequence, AnyNode)):
+ return any(contains_variable(child) for child in node.children)
+
+ return False
+
+ def transform(node: Node) -> Iterable[str]:
+ # Generate separate pattern for all terms that contain variables
+ # within this OR. Terms that don't contain a variable can be merged
+ # together in one pattern.
+ if isinstance(node, AnyNode):
+ # If we have a definition like:
+ # (?P<name> .*) | (?P<city> .*)
+ # Then we want to be able to generate completions for both the
+ # name as well as the city. We do this by yielding two
+ # different regular expressions, because the engine won't
+ # follow multiple paths, if multiple are possible.
+ children_with_variable = []
+ children_without_variable = []
+ for c in node.children:
+ if contains_variable(c):
+ children_with_variable.append(c)
+ else:
+ children_without_variable.append(c)
+
+ for c in children_with_variable:
+ yield from transform(c)
+
+ # Merge options without variable together.
+ if children_without_variable:
+ yield "|".join(
+ r for c in children_without_variable for r in transform(c)
+ )
+
+ # For a sequence, generate a pattern for each prefix that ends with
+ # a variable + one pattern of the complete sequence.
+ # (This is because, for autocompletion, we match the text before
+ # the cursor, and completions are given for the variable that we
+ # match right before the cursor.)
+ elif isinstance(node, NodeSequence):
+ # For all components in the sequence, compute prefix patterns,
+ # as well as full patterns.
+ complete = [cls._transform(c, create_group_func) for c in node.children]
+ prefixes = [list(transform(c)) for c in node.children]
+ variable_nodes = [contains_variable(c) for c in node.children]
+
+ # If any child contains a variable, we should yield a
+ # pattern up to that point, so that we are sure this will be
+ # matched.
+ for i in range(len(node.children)):
+ if variable_nodes[i]:
+ for c_str in prefixes[i]:
+ yield "".join(complete[:i]) + c_str
+
+ # If there are non-variable nodes, merge all the prefixes into
+ # one pattern. If the input is: "[part1] [part2] [part3]", then
+ # this gets compiled into:
+ # (complete1 + (complete2 + (complete3 | partial3) | partial2) | partial1 )
+ # For nodes that contain a variable, we skip the "|partial"
+ # part here, because these are matched with the previous
+ # patterns.
+ if not all(variable_nodes):
+ result = []
+
+ # Start with complete patterns.
+ for i in range(len(node.children)):
+ result.append("(?:")
+ result.append(complete[i])
+
+ # Add prefix patterns.
+ for i in range(len(node.children) - 1, -1, -1):
+ if variable_nodes[i]:
+ # No need to yield a prefix for this one, we did
+ # the variable prefixes earlier.
+ result.append(")")
+ else:
+ result.append("|(?:")
+ # If this yields multiple, we should yield all combinations.
+ assert len(prefixes[i]) == 1
+ result.append(prefixes[i][0])
+ result.append("))")
+
+ yield "".join(result)
+
+ elif isinstance(node, Regex):
+ yield "(?:%s)?" % node.regex
+
+ elif isinstance(node, Lookahead):
+ if node.negative:
+ yield "(?!%s)" % cls._transform(node.childnode, create_group_func)
+ else:
+ # Not sure what the correct semantics are in this case.
+ # (Probably it's not worth implementing this.)
+ raise Exception("Positive lookahead not yet supported.")
+
+ elif isinstance(node, Variable):
+ # (Note that we should not append a '?' here. The 'transform'
+ # method will already recursively do that.)
+ for c_str in transform(node.childnode):
+ yield f"(?P<{create_group_func(node)}>{c_str})"
+
+ elif isinstance(node, Repeat):
+ # If we have a repetition of, say, 8 times, the current input
+ # could for instance contain 7 complete matches, followed by a
+ # partial match.
+ prefix = cls._transform(node.childnode, create_group_func)
+
+ if node.max_repeat == 1:
+ yield from transform(node.childnode)
+ else:
+ for c_str in transform(node.childnode):
+ if node.max_repeat:
+ repeat_sign = "{,%i}" % (node.max_repeat - 1)
+ else:
+ repeat_sign = "*"
+ yield "(?:{}){}{}{}".format(
+ prefix,
+ repeat_sign,
+ ("" if node.greedy else "?"),
+ c_str,
+ )
+
+ else:
+ raise TypeError("Got %r" % node)
+
+ for r in transform(root_node):
+ yield "^(?:%s)$" % r
+
+ def match(self, string: str) -> Match | None:
+ """
+ Match the string with the grammar.
+ Returns a :class:`Match` instance or `None` when the input doesn't match the grammar.
+
+ :param string: The input string.
+ """
+ m = self._re.match(string)
+
+ if m:
+ return Match(
+ string, [(self._re, m)], self._group_names_to_nodes, self.unescape_funcs
+ )
+ return None
+
+ def match_prefix(self, string: str) -> Match | None:
+ """
+ Do a partial match of the string with the grammar. The returned
+ :class:`Match` instance can contain multiple representations of the
+ match. If the input doesn't match the grammar at all, the "trailing input"
+ part captures all of the input, so in practice this should rarely return `None`.
+
+ :param string: The input string.
+ """
+ # First try to match using `_re_prefix`. If nothing is found, use the patterns that
+ # also accept trailing characters.
+ for patterns in [self._re_prefix, self._re_prefix_with_trailing_input]:
+ matches = [(r, r.match(string)) for r in patterns]
+ matches2 = [(r, m) for r, m in matches if m]
+
+ if matches2 != []:
+ return Match(
+ string, matches2, self._group_names_to_nodes, self.unescape_funcs
+ )
+
+ return None
+
+
+class Match:
+ """
+ :param string: The input string.
+ :param re_matches: List of (compiled_re_pattern, re_match) tuples.
+ :param group_names_to_nodes: Dictionary mapping all the re group names to the
+ corresponding variable names.
+ :param unescape_funcs: `dict` mapping variable names to unescape callables.
+ """
+
+ def __init__(
+ self,
+ string: str,
+ re_matches: list[tuple[Pattern[str], RegexMatch[str]]],
+ group_names_to_nodes: dict[str, str],
+ unescape_funcs: dict[str, Callable[[str], str]],
+ ):
+ self.string = string
+ self._re_matches = re_matches
+ self._group_names_to_nodes = group_names_to_nodes
+ self._unescape_funcs = unescape_funcs
+
+ def _nodes_to_regs(self) -> list[tuple[str, tuple[int, int]]]:
+ """
+ Return a list of (varname, reg) tuples.
+ """
+
+ def get_tuples() -> Iterable[tuple[str, tuple[int, int]]]:
+ for r, re_match in self._re_matches:
+ for group_name, group_index in r.groupindex.items():
+ if group_name != _INVALID_TRAILING_INPUT:
+ regs = re_match.regs
+ reg = regs[group_index]
+ node = self._group_names_to_nodes[group_name]
+ yield (node, reg)
+
+ return list(get_tuples())
+
+ def _nodes_to_values(self) -> list[tuple[str, str, tuple[int, int]]]:
+ """
+ Return a list of (varname, string_value, slice) tuples.
+ """
+
+ def is_none(sl: tuple[int, int]) -> bool:
+ return sl[0] == -1 and sl[1] == -1
+
+ def get(sl: tuple[int, int]) -> str:
+ return self.string[sl[0] : sl[1]]
+
+ return [
+ (varname, get(slice), slice)
+ for varname, slice in self._nodes_to_regs()
+ if not is_none(slice)
+ ]
+
+ def _unescape(self, varname: str, value: str) -> str:
+ unwrapper = self._unescape_funcs.get(varname)
+ return unwrapper(value) if unwrapper else value
+
+ def variables(self) -> Variables:
+ """
+ Returns :class:`Variables` instance.
+ """
+ return Variables(
+ [(k, self._unescape(k, v), sl) for k, v, sl in self._nodes_to_values()]
+ )
+
+ def trailing_input(self) -> MatchVariable | None:
+ """
+ Get the `MatchVariable` instance, representing trailing input, if there is any.
+ "Trailing input" is input at the end that does not match the grammar anymore, but
+ when this is removed from the end of the input, the input would be a valid string.
+ """
+ slices: list[tuple[int, int]] = []
+
+ # Find all regex groups with the name _INVALID_TRAILING_INPUT.
+ for r, re_match in self._re_matches:
+ for group_name, group_index in r.groupindex.items():
+ if group_name == _INVALID_TRAILING_INPUT:
+ slices.append(re_match.regs[group_index])
+
+ # Take the smallest part. (Smaller trailing text means that a larger input has
+ # been matched, so that is better.)
+ if slices:
+ slice = (max(i[0] for i in slices), max(i[1] for i in slices))
+ value = self.string[slice[0] : slice[1]]
+ return MatchVariable("<trailing_input>", value, slice)
+ return None
+
+ def end_nodes(self) -> Iterable[MatchVariable]:
+ """
+ Yields `MatchVariable` instances for all the nodes having their end
+ position at the end of the input string.
+ """
+ for varname, reg in self._nodes_to_regs():
+ # If this part goes until the end of the input string.
+ if reg[1] == len(self.string):
+ value = self._unescape(varname, self.string[reg[0] : reg[1]])
+ yield MatchVariable(varname, value, (reg[0], reg[1]))
+
+
+class Variables:
+ def __init__(self, tuples: list[tuple[str, str, tuple[int, int]]]) -> None:
+ #: List of (varname, value, slice) tuples.
+ self._tuples = tuples
+
+ def __repr__(self) -> str:
+ return "{}({})".format(
+ self.__class__.__name__,
+ ", ".join(f"{k}={v!r}" for k, v, _ in self._tuples),
+ )
+
+ def get(self, key: str, default: str | None = None) -> str | None:
+ items = self.getall(key)
+ return items[0] if items else default
+
+ def getall(self, key: str) -> list[str]:
+ return [v for k, v, _ in self._tuples if k == key]
+
+ def __getitem__(self, key: str) -> str | None:
+ return self.get(key)
+
+ def __iter__(self) -> Iterator[MatchVariable]:
+ """
+ Yield `MatchVariable` instances.
+ """
+ for varname, value, slice in self._tuples:
+ yield MatchVariable(varname, value, slice)
+
+
+class MatchVariable:
+ """
+ Represents a match of a variable in the grammar.
+
+ :param varname: (string) Name of the variable.
+ :param value: (string) Value of this variable.
+ :param slice: (start, stop) tuple, indicating the position of this variable
+ in the input string.
+ """
+
+ def __init__(self, varname: str, value: str, slice: tuple[int, int]) -> None:
+ self.varname = varname
+ self.value = value
+ self.slice = slice
+
+ self.start = self.slice[0]
+ self.stop = self.slice[1]
+
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}({self.varname!r}, {self.value!r})"
+
+
+def compile(
+ expression: str,
+ escape_funcs: EscapeFuncDict | None = None,
+ unescape_funcs: EscapeFuncDict | None = None,
+) -> _CompiledGrammar:
+ """
+ Compile grammar (given as regex string), returning a `CompiledGrammar`
+ instance.
+ """
+ return _compile_from_parse_tree(
+ parse_regex(tokenize_regex(expression)),
+ escape_funcs=escape_funcs,
+ unescape_funcs=unescape_funcs,
+ )
+
+
+def _compile_from_parse_tree(
+ root_node: Node,
+ escape_funcs: EscapeFuncDict | None = None,
+ unescape_funcs: EscapeFuncDict | None = None,
+) -> _CompiledGrammar:
+ """
+ Compile grammar (given as parse tree), returning a `CompiledGrammar`
+ instance.
+ """
+ return _CompiledGrammar(
+ root_node, escape_funcs=escape_funcs, unescape_funcs=unescape_funcs
+ )
diff --git a/src/prompt_toolkit/contrib/regular_languages/completion.py b/src/prompt_toolkit/contrib/regular_languages/completion.py
new file mode 100644
index 0000000..2e353e8
--- /dev/null
+++ b/src/prompt_toolkit/contrib/regular_languages/completion.py
@@ -0,0 +1,94 @@
+"""
+Completer for a regular grammar.
+"""
+from __future__ import annotations
+
+from typing import Iterable
+
+from prompt_toolkit.completion import CompleteEvent, Completer, Completion
+from prompt_toolkit.document import Document
+
+from .compiler import Match, _CompiledGrammar
+
+__all__ = [
+ "GrammarCompleter",
+]
+
+
+class GrammarCompleter(Completer):
+ """
+ Completer which can be used for autocompletion according to variables in
+ the grammar. Each variable can have a different autocompleter.
+
+ :param compiled_grammar: Grammar as returned by the `compile()` function.
+ :param completers: `dict` mapping variable names of the grammar to the
+ `Completer` instances to be used for each variable.
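+
+ Example (a sketch; it assumes `g` is a grammar compiled with `compile()` that
+ contains a `command` variable; the word list is made up)::
+
+ from prompt_toolkit.completion import WordCompleter
+
+ completer = GrammarCompleter(g, {
+ "command": WordCompleter(["pwd", "ls", "cd", "cat"]),
+ })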
+ """
+
+ def __init__(
+ self, compiled_grammar: _CompiledGrammar, completers: dict[str, Completer]
+ ) -> None:
+ self.compiled_grammar = compiled_grammar
+ self.completers = completers
+
+ def get_completions(
+ self, document: Document, complete_event: CompleteEvent
+ ) -> Iterable[Completion]:
+ m = self.compiled_grammar.match_prefix(document.text_before_cursor)
+
+ if m:
+ completions = self._remove_duplicates(
+ self._get_completions_for_match(m, complete_event)
+ )
+
+ yield from completions
+
+ def _get_completions_for_match(
+ self, match: Match, complete_event: CompleteEvent
+ ) -> Iterable[Completion]:
+ """
+ Yield all the possible completions for this input string.
+ (The completer assumes that the cursor position was at the end of the
+ input string.)
+ """
+ for match_variable in match.end_nodes():
+ varname = match_variable.varname
+ start = match_variable.start
+
+ completer = self.completers.get(varname)
+
+ if completer:
+ text = match_variable.value
+
+ # Unwrap text.
+ unwrapped_text = self.compiled_grammar.unescape(varname, text)
+
+ # Create a document, for the completions API (text/cursor_position)
+ document = Document(unwrapped_text, len(unwrapped_text))
+
+ # Call completer
+ for completion in completer.get_completions(document, complete_event):
+ new_text = (
+ unwrapped_text[: len(text) + completion.start_position]
+ + completion.text
+ )
+
+ # Wrap again.
+ yield Completion(
+ text=self.compiled_grammar.escape(varname, new_text),
+ start_position=start - len(match.string),
+ display=completion.display,
+ display_meta=completion.display_meta,
+ )
+
+ def _remove_duplicates(self, items: Iterable[Completion]) -> list[Completion]:
+ """
+ Remove duplicates, while keeping the order.
+ (Sometimes we have duplicates, because there are several matches of the
+ same grammar, each yielding similar completions.)
+ """
+ result: list[Completion] = []
+ for i in items:
+ if i not in result:
+ result.append(i)
+ return result
diff --git a/src/prompt_toolkit/contrib/regular_languages/lexer.py b/src/prompt_toolkit/contrib/regular_languages/lexer.py
new file mode 100644
index 0000000..b0a4deb
--- /dev/null
+++ b/src/prompt_toolkit/contrib/regular_languages/lexer.py
@@ -0,0 +1,93 @@
+"""
+`GrammarLexer` is compatible with other lexers and can be used to highlight
+the input using a regular grammar with annotations.
+"""
+from __future__ import annotations
+
+from typing import Callable
+
+from prompt_toolkit.document import Document
+from prompt_toolkit.formatted_text.base import StyleAndTextTuples
+from prompt_toolkit.formatted_text.utils import split_lines
+from prompt_toolkit.lexers import Lexer
+
+from .compiler import _CompiledGrammar
+
+__all__ = [
+ "GrammarLexer",
+]
+
+
+class GrammarLexer(Lexer):
+ """
+ Lexer which can be used for highlighting of fragments according to variables in the grammar.
+
+ (It does not do actual lexing of the string, but it exposes an API compatible
+ with the Pygments lexer class.)
+
+ :param compiled_grammar: Grammar as returned by the `compile()` function.
+ :param lexers: Dictionary mapping variable names of the regular grammar to
+ the lexers that should be used for this part. (This can
+ call other lexers recursively.) If you wish a part of the
+ grammar to just get one fragment, use a
+ `prompt_toolkit.lexers.SimpleLexer`.
+ """
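+
+ Example (a sketch; it assumes `g` is a compiled grammar with `command` and
+ `filename` variables; the style class names are made up)::
+
+ from prompt_toolkit.lexers import SimpleLexer
+
+ lexer = GrammarLexer(g, lexers={
+ "command": SimpleLexer(style="class:command"),
+ "filename": SimpleLexer(style="class:filename"),
+ })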
+
+ def __init__(
+ self,
+ compiled_grammar: _CompiledGrammar,
+ default_style: str = "",
+ lexers: dict[str, Lexer] | None = None,
+ ) -> None:
+ self.compiled_grammar = compiled_grammar
+ self.default_style = default_style
+ self.lexers = lexers or {}
+
+ def _get_text_fragments(self, text: str) -> StyleAndTextTuples:
+ m = self.compiled_grammar.match_prefix(text)
+
+ if m:
+ characters: StyleAndTextTuples = [(self.default_style, c) for c in text]
+
+ for v in m.variables():
+ # If we have a `Lexer` instance for this part of the input.
+ # Tokenize recursively and apply tokens.
+ lexer = self.lexers.get(v.varname)
+
+ if lexer:
+ document = Document(text[v.start : v.stop])
+ lexer_tokens_for_line = lexer.lex_document(document)
+ text_fragments: StyleAndTextTuples = []
+ for i in range(len(document.lines)):
+ text_fragments.extend(lexer_tokens_for_line(i))
+ text_fragments.append(("", "\n"))
+ if text_fragments:
+ text_fragments.pop()
+
+ i = v.start
+ for t, s, *_ in text_fragments:
+ for c in s:
+ if characters[i][0] == self.default_style:
+ characters[i] = (t, characters[i][1])
+ i += 1
+
+ # Highlight trailing input.
+ trailing_input = m.trailing_input()
+ if trailing_input:
+ for i in range(trailing_input.start, trailing_input.stop):
+ characters[i] = ("class:trailing-input", characters[i][1])
+
+ return characters
+ else:
+ return [("", text)]
+
+ def lex_document(self, document: Document) -> Callable[[int], StyleAndTextTuples]:
+ lines = list(split_lines(self._get_text_fragments(document.text)))
+
+ def get_line(lineno: int) -> StyleAndTextTuples:
+ try:
+ return lines[lineno]
+ except IndexError:
+ return []
+
+ return get_line
diff --git a/src/prompt_toolkit/contrib/regular_languages/regex_parser.py b/src/prompt_toolkit/contrib/regular_languages/regex_parser.py
new file mode 100644
index 0000000..a365ba8
--- /dev/null
+++ b/src/prompt_toolkit/contrib/regular_languages/regex_parser.py
@@ -0,0 +1,282 @@
+"""
+Parser for parsing a regular expression.
+Take a string representing a regular expression and return the root node of its
+parse tree.
+
+usage::
+
+ root_node = parse_regex(tokenize_regex('(hello|world)'))
+
+Remarks:
+- The regex parser handles multiline input: it ignores all whitespace and supports
+ multiple named groups with the same name as well as #-style comments.
+
+Limitations:
+- Lookahead is not supported.
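+
+The returned tree is built from the `Node` subclasses defined below. For
+instance (a sketch of the shape the parser produces)::
+
+ root_node = parse_regex(tokenize_regex('(?P<cmd>pwd|ls)'))
+ # -> Variable(varname='cmd') wrapping an AnyNode whose two branches are
+ # NodeSequence objects of single-character Regex nodes for "pwd" and "ls".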
+"""
+from __future__ import annotations
+
+import re
+
+__all__ = [
+ "Repeat",
+ "Variable",
+ "Regex",
+ "Lookahead",
+ "tokenize_regex",
+ "parse_regex",
+]
+
+
+class Node:
+ """
+ Base class for all the grammar nodes.
+ (You don't initialize this one.)
+ """
+
+ def __add__(self, other_node: Node) -> NodeSequence:
+ return NodeSequence([self, other_node])
+
+ def __or__(self, other_node: Node) -> AnyNode:
+ return AnyNode([self, other_node])
+
+
+class AnyNode(Node):
+ """
+ Union operation (OR operation) between several grammars. You don't
+ initialize this yourself, but it's a result of a "Grammar1 | Grammar2"
+ operation.
+ """
+
+ def __init__(self, children: list[Node]) -> None:
+ self.children = children
+
+ def __or__(self, other_node: Node) -> AnyNode:
+ return AnyNode(self.children + [other_node])
+
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}({self.children!r})"
+
+
+class NodeSequence(Node):
+ """
+ Concatenation operation of several grammars. You don't initialize this
+ yourself, but it's a result of a "Grammar1 + Grammar2" operation.
+ """
+
+ def __init__(self, children: list[Node]) -> None:
+ self.children = children
+
+ def __add__(self, other_node: Node) -> NodeSequence:
+ return NodeSequence(self.children + [other_node])
+
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}({self.children!r})"
+
+
+class Regex(Node):
+ """
+ Regular expression.
+ """
+
+ def __init__(self, regex: str) -> None:
+ re.compile(regex) # Validate
+
+ self.regex = regex
+
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}(/{self.regex}/)"
+
+
+class Lookahead(Node):
+ """
+ Lookahead expression.
+ """
+
+ def __init__(self, childnode: Node, negative: bool = False) -> None:
+ self.childnode = childnode
+ self.negative = negative
+
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}({self.childnode!r})"
+
+
+class Variable(Node):
+ """
+ Mark a variable in the regular grammar. This will be translated into a
+ named group. Each variable can have its own completer, validator, etc.
+
+ :param childnode: The grammar which is wrapped inside this variable.
+ :param varname: String.
+ """
+
+ def __init__(self, childnode: Node, varname: str = "") -> None:
+ self.childnode = childnode
+ self.varname = varname
+
+ def __repr__(self) -> str:
+ return "{}(childnode={!r}, varname={!r})".format(
+ self.__class__.__name__,
+ self.childnode,
+ self.varname,
+ )
+
+
+class Repeat(Node):
+ def __init__(
+ self,
+ childnode: Node,
+ min_repeat: int = 0,
+ max_repeat: int | None = None,
+ greedy: bool = True,
+ ) -> None:
+ self.childnode = childnode
+ self.min_repeat = min_repeat
+ self.max_repeat = max_repeat
+ self.greedy = greedy
+
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}(childnode={self.childnode!r})"
+
+
+def tokenize_regex(input: str) -> list[str]:
+ """
+ Takes a string, representing a regular expression as input, and tokenizes
+ it.
+
+ :param input: string, representing a regular expression.
+ :returns: List of tokens.
+ """
+ # Regular expression for tokenizing other regular expressions.
+ p = re.compile(
+ r"""^(
+ \(\?P\<[a-zA-Z0-9_-]+\> | # Start of named group.
+ \(\?#[^)]*\) | # Comment
+ \(\?= | # Start of lookahead assertion
+ \(\?! | # Start of negative lookahead assertion
+ \(\?<= | # If preceded by.
+ \(\?< | # If not preceded by.
+ \(\?: | # Start of group. (non capturing.)
+ \( | # Start of group.
+ \(\?[iLmsux] | # Flags.
+ \(\?P=[a-zA-Z]+\) | # Back reference to named group
+ \) | # End of group.
+ \{[^{}]*\} | # Repetition
+ \*\? | \+\? | \?\? | # Non greedy repetition.
+ \* | \+ | \? | # Repetition
+ \#.*\n | # Comment
+ \\. |
+
+ # Character group.
+ \[
+ ( [^\]\\] | \\.)*
+ \] |
+
+ [^(){}] |
+ .
+ )""",
+ re.VERBOSE,
+ )
+
+ tokens = []
+
+ while input:
+ m = p.match(input)
+ if m:
+ token, input = input[: m.end()], input[m.end() :]
+ if not token.isspace():
+ tokens.append(token)
+ else:
+ raise Exception("Could not tokenize input regex.")
+
+ return tokens
+
+
+def parse_regex(regex_tokens: list[str]) -> Node:
+ """
+ Takes a list of tokens from the tokenizer, and returns a parse tree.
+ """
+ # We add a closing parenthesis because that represents the final pop of the stack.
+ tokens: list[str] = [")"] + regex_tokens[::-1]
+
+ def wrap(lst: list[Node]) -> Node:
+ """Turn list into sequence when it contains several items."""
+ if len(lst) == 1:
+ return lst[0]
+ else:
+ return NodeSequence(lst)
+
+ def _parse() -> Node:
+ or_list: list[list[Node]] = []
+ result: list[Node] = []
+
+ def wrapped_result() -> Node:
+ if or_list == []:
+ return wrap(result)
+ else:
+ or_list.append(result)
+ return AnyNode([wrap(i) for i in or_list])
+
+ while tokens:
+ t = tokens.pop()
+
+ if t.startswith("(?P<"):
+ variable = Variable(_parse(), varname=t[4:-1])
+ result.append(variable)
+
+ elif t in ("*", "*?"):
+ greedy = t == "*"
+ result[-1] = Repeat(result[-1], greedy=greedy)
+
+ elif t in ("+", "+?"):
+ greedy = t == "+"
+ result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy)
+
+ elif t in ("?", "??"):
+ if result == []:
+ raise Exception("Nothing to repeat." + repr(tokens))
+ else:
+ greedy = t == "?"
+ result[-1] = Repeat(
+ result[-1], min_repeat=0, max_repeat=1, greedy=greedy
+ )
+
+ elif t == "|":
+ or_list.append(result)
+ result = []
+
+ elif t in ("(", "(?:"):
+ result.append(_parse())
+
+ elif t == "(?!":
+ result.append(Lookahead(_parse(), negative=True))
+
+ elif t == "(?=":
+ result.append(Lookahead(_parse(), negative=False))
+
+ elif t == ")":
+ return wrapped_result()
+
+ elif t.startswith("#"):
+ pass
+
+ elif t.startswith("{"):
+ # TODO: implement!
+ raise Exception(f"{t}-style repetition not yet supported")
+
+ elif t.startswith("(?"):
+ raise Exception("%r not supported" % t)
+
+ elif t.isspace():
+ pass
+ else:
+ result.append(Regex(t))
+
+ raise Exception("Expecting ')' token")
+
+ result = _parse()
+
+ if len(tokens) != 0:
+ raise Exception("Unmatched parentheses.")
+ else:
+ return result
diff --git a/src/prompt_toolkit/contrib/regular_languages/validation.py b/src/prompt_toolkit/contrib/regular_languages/validation.py
new file mode 100644
index 0000000..8e56e05
--- /dev/null
+++ b/src/prompt_toolkit/contrib/regular_languages/validation.py
@@ -0,0 +1,59 @@
+"""
+Validator for a regular language.
+"""
+from __future__ import annotations
+
+from prompt_toolkit.document import Document
+from prompt_toolkit.validation import ValidationError, Validator
+
+from .compiler import _CompiledGrammar
+
+__all__ = [
+ "GrammarValidator",
+]
+
+
+class GrammarValidator(Validator):
+ """
+ Validator which can be used for validation according to variables in
+ the grammar. Each variable can have its own validator.
+
+ :param compiled_grammar: Grammar as returned by the `compile()` function.
+ :param validators: `dict` mapping variable names of the grammar to the
+ `Validator` instances to be used for each variable.
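+
+ Example (a sketch; it assumes `g` is a compiled grammar with a `filename`
+ variable; the validation rule is made up)::
+
+ import os.path
+
+ from prompt_toolkit.validation import Validator
+
+ validator = GrammarValidator(g, {
+ "filename": Validator.from_callable(os.path.isfile, error_message="Not a file."),
+ })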
+ """
+
+ def __init__(
+ self, compiled_grammar: _CompiledGrammar, validators: dict[str, Validator]
+ ) -> None:
+ self.compiled_grammar = compiled_grammar
+ self.validators = validators
+
+ def validate(self, document: Document) -> None:
+ # Parse input document.
+ # We use `match`, not `match_prefix`, because for validation, we want
+ # the actual, unambiguous interpretation of the input.
+ m = self.compiled_grammar.match(document.text)
+
+ if m:
+ for v in m.variables():
+ validator = self.validators.get(v.varname)
+
+ if validator:
+ # Unescape text.
+ unwrapped_text = self.compiled_grammar.unescape(v.varname, v.value)
+
+ # Create a document for the inner validator (text/cursor_position).
+ inner_document = Document(unwrapped_text, len(unwrapped_text))
+
+ try:
+ validator.validate(inner_document)
+ except ValidationError as e:
+ raise ValidationError(
+ cursor_position=v.start + e.cursor_position,
+ message=e.message,
+ ) from e
+ else:
+ raise ValidationError(
+ cursor_position=len(document.text), message="Invalid command"
+ )