"""
Parser for parsing a regular expression.
Take a string representing a regular expression and return the root node of its
parse tree.

usage::

    root_node = parse_regex('(hello|world)')

Remarks:
- The regex parser processes multiline, it ignores all whitespace and supports
  multiple named groups with the same name and #-style comments.

Limitations:
- Lookahead is not supported.
"""
import re
from typing import List, Optional

__all__ = [
    "Repeat",
    "Variable",
    "Regex",
    "Lookahead",
    "tokenize_regex",
    "parse_regex",
]


class Node:
    """
    Base class for all the grammar nodes.
    (You don't initialize this one.)
    """

    def __add__(self, other_node: "Node") -> "NodeSequence":
        return NodeSequence([self, other_node])

    def __or__(self, other_node: "Node") -> "AnyNode":
        return AnyNode([self, other_node])


class AnyNode(Node):
    """
    Union operation (OR operation) between several grammars. You don't
    initialize this yourself, but it's a result of a "Grammar1 | Grammar2"
    operation.
    """

    def __init__(self, children: List[Node]) -> None:
        self.children = children

    def __or__(self, other_node: Node) -> "AnyNode":
        return AnyNode(self.children + [other_node])

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.children!r})"


class NodeSequence(Node):
    """
    Concatenation operation of several grammars. You don't initialize this
    yourself, but it's a result of a "Grammar1 + Grammar2" operation.
    """

    def __init__(self, children: List[Node]) -> None:
        self.children = children

    def __add__(self, other_node: Node) -> "NodeSequence":
        return NodeSequence(self.children + [other_node])

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.children!r})"


class Regex(Node):
    """
    Regular expression.
    """

    def __init__(self, regex: str) -> None:
        re.compile(regex)  # Validate

        self.regex = regex

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(/{self.regex}/)"


class Lookahead(Node):
    """
    Lookahead expression.
    """

    def __init__(self, childnode: Node, negative: bool = False) -> None:
        self.childnode = childnode
        self.negative = negative

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.childnode!r})"


class Variable(Node):
    """
    Mark a variable in the regular grammar. This will be translated into a
    named group. Each variable can have his own completer, validator, etc..

    :param childnode: The grammar which is wrapped inside this variable.
    :param varname: String.
    """

    def __init__(self, childnode: Node, varname: str = "") -> None:
        self.childnode = childnode
        self.varname = varname

    def __repr__(self) -> str:
        return "{}(childnode={!r}, varname={!r})".format(
            self.__class__.__name__,
            self.childnode,
            self.varname,
        )


class Repeat(Node):
    def __init__(
        self,
        childnode: Node,
        min_repeat: int = 0,
        max_repeat: Optional[int] = None,
        greedy: bool = True,
    ) -> None:
        self.childnode = childnode
        self.min_repeat = min_repeat
        self.max_repeat = max_repeat
        self.greedy = greedy

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(childnode={self.childnode!r})"


def tokenize_regex(input: str) -> List[str]:
    """
    Takes a string, representing a regular expression as input, and tokenizes
    it.

    :param input: string, representing a regular expression.
    :returns: List of tokens.
    """
    # Regular expression for tokenizing other regular expressions.
    p = re.compile(
        r"""^(
        \(\?P\<[a-zA-Z0-9_-]+\>  | # Start of named group.
        \(\?#[^)]*\)             | # Comment
        \(\?=                    | # Start of lookahead assertion
        \(\?!                    | # Start of negative lookahead assertion
        \(\?<=                   | # If preceded by.
        \(\?<                    | # If not preceded by.
        \(?:                     | # Start of group. (non capturing.)
        \(                       | # Start of group.
        \(?[iLmsux]              | # Flags.
        \(?P=[a-zA-Z]+\)         | # Back reference to named group
        \)                       | # End of group.
        \{[^{}]*\}               | # Repetition
        \*\? | \+\? | \?\?\      | # Non greedy repetition.
        \* | \+ | \?             | # Repetition
        \#.*\n                   | # Comment
        \\. |

        # Character group.
        \[
            ( [^\]\\]  |  \\.)*
        \]                  |

        [^(){}]             |
        .
    )""",
        re.VERBOSE,
    )

    tokens = []

    while input:
        m = p.match(input)
        if m:
            token, input = input[: m.end()], input[m.end() :]
            if not token.isspace():
                tokens.append(token)
        else:
            raise Exception("Could not tokenize input regex.")

    return tokens


def parse_regex(regex_tokens: List[str]) -> Node:
    """
    Takes a list of tokens from the tokenizer, and returns a parse tree.
    """
    # We add a closing brace because that represents the final pop of the stack.
    tokens: List[str] = [")"] + regex_tokens[::-1]

    def wrap(lst: List[Node]) -> Node:
        """Turn list into sequence when it contains several items."""
        if len(lst) == 1:
            return lst[0]
        else:
            return NodeSequence(lst)

    def _parse() -> Node:
        or_list: List[List[Node]] = []
        result: List[Node] = []

        def wrapped_result() -> Node:
            if or_list == []:
                return wrap(result)
            else:
                or_list.append(result)
                return AnyNode([wrap(i) for i in or_list])

        while tokens:
            t = tokens.pop()

            if t.startswith("(?P<"):
                variable = Variable(_parse(), varname=t[4:-1])
                result.append(variable)

            elif t in ("*", "*?"):
                greedy = t == "*"
                result[-1] = Repeat(result[-1], greedy=greedy)

            elif t in ("+", "+?"):
                greedy = t == "+"
                result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy)

            elif t in ("?", "??"):
                if result == []:
                    raise Exception("Nothing to repeat." + repr(tokens))
                else:
                    greedy = t == "?"
                    result[-1] = Repeat(
                        result[-1], min_repeat=0, max_repeat=1, greedy=greedy
                    )

            elif t == "|":
                or_list.append(result)
                result = []

            elif t in ("(", "(?:"):
                result.append(_parse())

            elif t == "(?!":
                result.append(Lookahead(_parse(), negative=True))

            elif t == "(?=":
                result.append(Lookahead(_parse(), negative=False))

            elif t == ")":
                return wrapped_result()

            elif t.startswith("#"):
                pass

            elif t.startswith("{"):
                # TODO: implement!
                raise Exception(f"{t}-style repetition not yet supported")

            elif t.startswith("(?"):
                raise Exception("%r not supported" % t)

            elif t.isspace():
                pass
            else:
                result.append(Regex(t))

        raise Exception("Expecting ')' token")

    result = _parse()

    if len(tokens) != 0:
        raise Exception("Unmatched parentheses.")
    else:
        return result