diff options
Diffstat (limited to '')
-rw-r--r-- | mdit_py_plugins/attrs/parse.py | 265 |
1 files changed, 265 insertions, 0 deletions
diff --git a/mdit_py_plugins/attrs/parse.py b/mdit_py_plugins/attrs/parse.py new file mode 100644 index 0000000..4a30353 --- /dev/null +++ b/mdit_py_plugins/attrs/parse.py @@ -0,0 +1,265 @@ +"""Parser for attributes:: + + attributes { id = "foo", class = "bar baz", + key1 = "val1", key2 = "val2" } + +Adapted from: +https://github.com/jgm/djot/blob/fae7364b86bfce69bc6d5b5eede1f5196d845fd6/djot/attributes.lua#L1 + +syntax: + +attributes <- '{' whitespace* attribute (whitespace attribute)* whitespace* '}' +attribute <- identifier | class | keyval +identifier <- '#' name +class <- '.' name +name <- (nonspace, nonpunctuation other than ':', '_', '-')+ +keyval <- key '=' val +key <- (ASCII_ALPHANUM | ':' | '_' | '-')+ +val <- bareval | quotedval +bareval <- (ASCII_ALPHANUM | ':' | '_' | '-')+ +quotedval <- '"' ([^"] | '\"') '"' +""" +from __future__ import annotations + +from enum import Enum +import re +from typing import Callable + + +class State(Enum): + START = 0 + SCANNING = 1 + SCANNING_ID = 2 + SCANNING_CLASS = 3 + SCANNING_KEY = 4 + SCANNING_VALUE = 5 + SCANNING_BARE_VALUE = 6 + SCANNING_QUOTED_VALUE = 7 + SCANNING_COMMENT = 8 + SCANNING_ESCAPED = 9 + DONE = 10 + + +REGEX_SPACE = re.compile(r"\s") +REGEX_SPACE_PUNCTUATION = re.compile(r"[\s!\"#$%&'()*+,./;<=>?@[\]^`{|}~]") +REGEX_KEY_CHARACTERS = re.compile(r"[a-zA-Z\d_:-]") + + +class TokenState: + def __init__(self): + self._tokens = [] + self.start: int = 0 + + def set_start(self, start: int) -> None: + self.start = start + + def append(self, start: int, end: int, ttype: str): + self._tokens.append((start, end, ttype)) + + def compile(self, string: str) -> dict[str, str]: + """compile the tokens into a dictionary""" + attributes = {} + classes = [] + idx = 0 + while idx < len(self._tokens): + start, end, ttype = self._tokens[idx] + if ttype == "id": + attributes["id"] = string[start:end] + elif ttype == "class": + classes.append(string[start:end]) + elif ttype == "key": + key = string[start:end] + if idx + 1 < len(self._tokens): + start, end, ttype = self._tokens[idx + 1] + if ttype == "value": + if key == "class": + classes.append(string[start:end]) + else: + attributes[key] = string[start:end] + idx += 1 + idx += 1 + if classes: + attributes["class"] = " ".join(classes) + return attributes + + def __str__(self) -> str: + return str(self._tokens) + + def __repr__(self) -> str: + return repr(self._tokens) + + +class ParseError(Exception): + def __init__(self, msg: str, pos: int) -> None: + self.pos = pos + super().__init__(msg + f" at position {pos}") + + +def parse(string: str) -> tuple[int, dict[str, str]]: + """Parse attributes from start of string. + + :returns: (length of parsed string, dict of attributes) + """ + pos = 0 + state: State = State.START + tokens = TokenState() + while pos < len(string): + state = HANDLERS[state](string[pos], pos, tokens) + if state == State.DONE: + return pos, tokens.compile(string) + pos = pos + 1 + + return pos, tokens.compile(string) + + +def handle_start(char: str, pos: int, tokens: TokenState) -> State: + + if char == "{": + return State.SCANNING + raise ParseError("Attributes must start with '{'", pos) + + +def handle_scanning(char: str, pos: int, tokens: TokenState) -> State: + + if char == " " or char == "\t" or char == "\n" or char == "\r": + return State.SCANNING + if char == "}": + return State.DONE + if char == "#": + tokens.set_start(pos) + return State.SCANNING_ID + if char == "%": + tokens.set_start(pos) + return State.SCANNING_COMMENT + if char == ".": + tokens.set_start(pos) + return State.SCANNING_CLASS + if REGEX_KEY_CHARACTERS.fullmatch(char): + tokens.set_start(pos) + return State.SCANNING_KEY + + raise ParseError(f"Unexpected character whilst scanning: {char}", pos) + + +def handle_scanning_comment(char: str, pos: int, tokens: TokenState) -> State: + + if char == "%": + return State.SCANNING + + return State.SCANNING_COMMENT + + +def handle_scanning_id(char: str, pos: int, tokens: TokenState) -> State: + + if not REGEX_SPACE_PUNCTUATION.fullmatch(char): + return State.SCANNING_ID + + if char == "}": + if (pos - 1) > tokens.start: + tokens.append(tokens.start + 1, pos, "id") + return State.DONE + + if REGEX_SPACE.fullmatch(char): + if (pos - 1) > tokens.start: + tokens.append(tokens.start + 1, pos, "id") + return State.SCANNING + + raise ParseError(f"Unexpected character whilst scanning id: {char}", pos) + + +def handle_scanning_class(char: str, pos: int, tokens: TokenState) -> State: + + if not REGEX_SPACE_PUNCTUATION.fullmatch(char): + return State.SCANNING_CLASS + + if char == "}": + if (pos - 1) > tokens.start: + tokens.append(tokens.start + 1, pos, "class") + return State.DONE + + if REGEX_SPACE.fullmatch(char): + if (pos - 1) > tokens.start: + tokens.append(tokens.start + 1, pos, "class") + return State.SCANNING + + raise ParseError(f"Unexpected character whilst scanning class: {char}", pos) + + +def handle_scanning_key(char: str, pos: int, tokens: TokenState) -> State: + + if char == "=": + tokens.append(tokens.start, pos, "key") + return State.SCANNING_VALUE + + if REGEX_KEY_CHARACTERS.fullmatch(char): + return State.SCANNING_KEY + + raise ParseError(f"Unexpected character whilst scanning key: {char}", pos) + + +def handle_scanning_value(char: str, pos: int, tokens: TokenState) -> State: + + if char == '"': + tokens.set_start(pos) + return State.SCANNING_QUOTED_VALUE + + if REGEX_KEY_CHARACTERS.fullmatch(char): + tokens.set_start(pos) + return State.SCANNING_BARE_VALUE + + raise ParseError(f"Unexpected character whilst scanning value: {char}", pos) + + +def handle_scanning_bare_value(char: str, pos: int, tokens: TokenState) -> State: + + if REGEX_KEY_CHARACTERS.fullmatch(char): + return State.SCANNING_BARE_VALUE + + if char == "}": + tokens.append(tokens.start, pos, "value") + return State.DONE + + if REGEX_SPACE.fullmatch(char): + tokens.append(tokens.start, pos, "value") + return State.SCANNING + + raise ParseError(f"Unexpected character whilst scanning bare value: {char}", pos) + + +def handle_scanning_escaped(char: str, pos: int, tokens: TokenState) -> State: + return State.SCANNING_QUOTED_VALUE + + +def handle_scanning_quoted_value(char: str, pos: int, tokens: TokenState) -> State: + + if char == '"': + tokens.append(tokens.start + 1, pos, "value") + return State.SCANNING + + if char == "\\": + return State.SCANNING_ESCAPED + + if char == "{" or char == "}": + raise ParseError( + f"Unexpected character whilst scanning quoted value: {char}", pos + ) + + if char == "\n": + tokens.append(tokens.start + 1, pos, "value") + return State.SCANNING_QUOTED_VALUE + + return State.SCANNING_QUOTED_VALUE + + +HANDLERS: dict[State, Callable[[str, int, TokenState], State]] = { + State.START: handle_start, + State.SCANNING: handle_scanning, + State.SCANNING_COMMENT: handle_scanning_comment, + State.SCANNING_ID: handle_scanning_id, + State.SCANNING_CLASS: handle_scanning_class, + State.SCANNING_KEY: handle_scanning_key, + State.SCANNING_VALUE: handle_scanning_value, + State.SCANNING_BARE_VALUE: handle_scanning_bare_value, + State.SCANNING_QUOTED_VALUE: handle_scanning_quoted_value, + State.SCANNING_ESCAPED: handle_scanning_escaped, +} |