diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-29 04:23:02 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-29 04:23:02 +0000 |
commit | 943e3dc057eca53e68ddec51529bd6a1279ebd8e (patch) | |
tree | 61fb7bac619a56dfbcdcbdb7b0d4d6535fc36fe9 /myst_parser/parsers/parse_html.py | |
parent | Initial commit. (diff) | |
download | myst-parser-upstream/0.18.1.tar.xz myst-parser-upstream/0.18.1.zip |
Adding upstream version 0.18.1.upstream/0.18.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | myst_parser/parsers/parse_html.py | 440 |
1 files changed, 440 insertions, 0 deletions
diff --git a/myst_parser/parsers/parse_html.py b/myst_parser/parsers/parse_html.py new file mode 100644 index 0000000..7539e42 --- /dev/null +++ b/myst_parser/parsers/parse_html.py @@ -0,0 +1,440 @@ +"""A simple but complete HTML to Abstract Syntax Tree (AST) parser. + +The AST can also reproduce the HTML text. + +Example:: + + >> text = '<div class="note"><p>text</p></div>' + >> ast = tokenize_html(text) + >> list(ast.walk(include_self=True)) + [Root(''), Tag('div', {'class': 'note'}), Tag('p'), Data('text')] + >> str(ast) + '<div class="note"><p>text</p></div>' + >> str(ast[0][0]) + '<p>text</p>' + +Note: optional tags are not accounted for +(see https://html.spec.whatwg.org/multipage/syntax.html#optional-tags) + +""" +from __future__ import annotations + +import inspect +import itertools +from collections import abc, deque +from html.parser import HTMLParser +from typing import Any, Callable, Iterable, Iterator + + +class Attribute(dict): + """This class holds the tags's attributes.""" + + def __getitem__(self, key: str) -> str: + """If self doesn't have the key it returns ''.""" + return self.get(key, "") + + @property + def classes(self) -> list[str]: + """Return 'class' attribute as list.""" + return self["class"].split() + + def __str__(self) -> str: + """Return a htmlized representation for attributes.""" + return " ".join(f'{key}="{value}"' for key, value in self.items()) + + +class Element(abc.MutableSequence): + """An Element of the xml/html document. + + All xml/html entities inherit from this class. + """ + + def __init__(self, name: str = "", attr: dict | None = None) -> None: + """Initialise the element.""" + self.name = name + self.attrs: Attribute = Attribute(attr or {}) + self._parent: Element | None = None + self._children: list[Element] = [] + + @property + def parent(self) -> Element | None: + """Return parent.""" + return self._parent + + @property + def children(self) -> list[Element]: + """Return copy of children.""" + return self._children[:] + + def reset_children(self, children: list[Element], deepcopy: bool = False): + new_children = [] + for i, item in enumerate(children): + assert isinstance(item, Element) + if deepcopy: + item = item.deepcopy() + if item._parent is None: + item._parent = self + elif item._parent != self: + raise AssertionError(f"different parent already set for item {i}") + new_children.append(item) + self._children = new_children + + def __getitem__(self, index: int) -> Element: # type: ignore[override] + return self._children[index] + + def __setitem__(self, index: int, item: Element): # type: ignore[override] + assert isinstance(item, Element) + if item._parent is not None and item._parent != self: + raise AssertionError(f"different parent already set for: {item!r}") + item._parent = self + return self._children.__setitem__(index, item) + + def __delitem__(self, index: int): # type: ignore[override] + return self._children.__delitem__(index) + + def __len__(self) -> int: + return self._children.__len__() + + def __iter__(self) -> Iterator[Element]: + yield from self._children + + def insert(self, index: int, item: Element): + assert isinstance(item, Element) + if item._parent is not None and item._parent != self: + raise AssertionError(f"different parent already set for: {item!r}") + item._parent = self + return self._children.insert(index, item) + + def deepcopy(self) -> Element: + """Recursively copy and remove parent.""" + _copy = self.__class__(self.name, self.attrs) + for child in self: + _copy_child = child.deepcopy() + _copy.append(_copy_child) + return _copy + + def __repr__(self) -> str: + text = f"{self.__class__.__name__}({self.name!r}" + if self.attrs: + text += f", {self.attrs!r}" + text += ")" + return text + + def render( + self, + tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None, + **kwargs, + ) -> str: + """Returns a HTML string representation of the element. + + :param tag_overrides: Provide a dictionary of render function + for specific tag names, to override the normal render format + + """ + raise NotImplementedError + + def __str__(self) -> str: + return self.render() + + def __eq__(self, item: Any) -> bool: + return item is self + + def walk(self, include_self: bool = False) -> Iterator[Element]: + """Walk through the xml/html AST.""" + if include_self: + yield self + for child in self: + yield child + yield from child.walk() + + def strip(self, inplace: bool = False, recurse: bool = False) -> Element: + """Return copy with all `Data` tokens + that only contain whitespace / newlines removed. + """ + element = self + if not inplace: + element = self.deepcopy() + element.reset_children( + [ + e + for e in element.children + if not (isinstance(e, Data) and e.data.strip() == "") + ] + ) + if recurse: + for child in element: + child.strip(inplace=True, recurse=True) + return element + + def find( + self, + identifier: str | type[Element], + attrs: dict | None = None, + classes: Iterable[str] | None = None, + include_self: bool = False, + recurse: bool = True, + ) -> Iterator[Element]: + """Find all elements that match name and specific attributes.""" + iterator = self.walk() if recurse else self + if include_self: + iterator = itertools.chain([self], iterator) + if inspect.isclass(identifier): + test_func = lambda c: isinstance(c, identifier) # noqa: E731 + else: + test_func = lambda c: c.name == identifier # noqa: E731 + classes = set(classes) if classes is not None else classes + for child in iterator: + if test_func(child): + if classes is not None and not classes.issubset(child.attrs.classes): + continue + for key, value in (attrs or {}).items(): + if child.attrs[key] != value: + break + else: + yield child + + +class Root(Element): + """The root of the AST tree.""" + + def render(self, **kwargs) -> str: # type: ignore[override] + """Returns a string HTML representation of the structure.""" + return "".join(child.render(**kwargs) for child in self) + + +class Tag(Element): + """Represent xml/html tags under the form: <name key="value" ...> ... </name>.""" + + def render( + self, + tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None, + **kwargs, + ) -> str: + if tag_overrides and self.name in tag_overrides: + return tag_overrides[self.name](self, tag_overrides) + return ( + f"<{self.name}{' ' if self.attrs else ''}{self.attrs}>" + + "".join( + child.render(tag_overrides=tag_overrides, **kwargs) for child in self + ) + + f"</{self.name}>" + ) + + +class XTag(Element): + """Represent XHTML style tags with no children, like `<img src="t.gif" />`""" + + def render( + self, + tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None, + **kwargs, + ) -> str: + if tag_overrides is not None and self.name in tag_overrides: + return tag_overrides[self.name](self, tag_overrides) + return f"<{self.name}{' ' if self.attrs else ''}{self.attrs}/>" + + +class VoidTag(Element): + """Represent tags with no children, only start tag, like `<img src="t.gif" >`""" + + def render(self, **kwargs) -> str: # type: ignore[override] + return f"<{self.name}{' ' if self.attrs else ''}{self.attrs}>" + + +class TerminalElement(Element): + def __init__(self, data: str): + super().__init__("") + self.data: str = data + + def __repr__(self) -> str: + text = self.data + if len(text) > 20: + text = text[:17] + "..." + return f"{self.__class__.__name__}({text!r})" + + def deepcopy(self) -> TerminalElement: + """Copy and remove parent.""" + _copy = self.__class__(self.data) + return _copy + + +class Data(TerminalElement): + """Represent data inside xml/html documents, like raw text.""" + + def render(self, **kwargs) -> str: # type: ignore[override] + return self.data + + +class Declaration(TerminalElement): + """Represent declarations, like `<!DOCTYPE html>`""" + + def render(self, **kwargs) -> str: # type: ignore[override] + return f"<!{self.data}>" + + +class Comment(TerminalElement): + """Represent HTML comments""" + + def render(self, **kwargs) -> str: # type: ignore[override] + return f"<!--{self.data}-->" + + +class Pi(TerminalElement): + """Represent processing instructions like `<?xml-stylesheet ?>`""" + + def render(self, **kwargs) -> str: # type: ignore[override] + return f"<?{self.data}>" + + +class Char(TerminalElement): + """Represent character codes like: `�`""" + + def render(self, **kwargs) -> str: # type: ignore[override] + return f"&#{self.data};" + + +class Entity(TerminalElement): + """Represent entities like `&`""" + + def render(self, **kwargs) -> str: # type: ignore[override] + return f"&{self.data};" + + +class Tree: + """The engine class to generate the AST tree.""" + + def __init__(self, name: str = ""): + """Initialise Tree""" + self.name = name + self.outmost = Root(name) + self.stack: deque = deque() + self.stack.append(self.outmost) + + def clear(self): + """Clear the outmost and stack for a new parsing.""" + self.outmost = Root(self.name) + self.stack.clear() + self.stack.append(self.outmost) + + def last(self) -> Element: + """Return the last pointer which point to the actual tag scope.""" + return self.stack[-1] + + def nest_tag(self, name: str, attrs: dict): + """Nest a given tag at the bottom of the tree using + the last stack's pointer. + """ + pointer = self.stack.pop() + item = Tag(name, attrs) + pointer.append(item) + self.stack.append(pointer) + self.stack.append(item) + + def nest_xtag(self, name: str, attrs: dict): + """Nest an XTag onto the tree.""" + top = self.last() + item = XTag(name, attrs) + top.append(item) + + def nest_vtag(self, name: str, attrs: dict): + """Nest a VoidTag onto the tree.""" + top = self.last() + item = VoidTag(name, attrs) + top.append(item) + + def nest_terminal(self, klass: type[TerminalElement], data: str): + """Nest the data onto the tree.""" + top = self.last() + item = klass(data) + top.append(item) + + def enclose(self, name: str): + """When a closing tag is found, pop the pointer's scope from the stack, + to then point to the earlier scope's tag. + """ + count = 0 + for ind in reversed(self.stack): + count = count + 1 + if ind.name == name: + break + else: + count = 0 + + # It pops all the items which do not match with the closing tag. + for _ in range(0, count): + self.stack.pop() + + +class HtmlToAst(HTMLParser): + """The tokenizer class.""" + + # see https://html.spec.whatwg.org/multipage/syntax.html#void-elements + void_elements = { + "area", + "base", + "br", + "col", + "embed", + "hr", + "img", + "input", + "link", + "meta", + "param", + "source", + "track", + "wbr", + } + + def __init__(self, name: str = "", convert_charrefs: bool = False): + super().__init__(convert_charrefs=convert_charrefs) + self.struct = Tree(name) + + def feed(self, source: str) -> Root: # type: ignore[override] + """Parse the source string.""" + self.struct.clear() + super().feed(source) + return self.struct.outmost + + def handle_starttag(self, name: str, attr): + """When found an opening tag then nest it onto the tree.""" + if name in self.void_elements: + self.struct.nest_vtag(name, attr) + else: + self.struct.nest_tag(name, attr) + + def handle_startendtag(self, name: str, attr): + """When found a XHTML tag style then nest it up to the tree.""" + self.struct.nest_xtag(name, attr) + + def handle_endtag(self, name: str): + """When found a closing tag then makes it point to the right scope.""" + if name not in self.void_elements: + self.struct.enclose(name) + + def handle_data(self, data: str): + """Nest data onto the tree.""" + self.struct.nest_terminal(Data, data) + + def handle_decl(self, decl: str): + self.struct.nest_terminal(Declaration, decl) + + def unknown_decl(self, decl: str): + self.struct.nest_terminal(Declaration, decl) + + def handle_charref(self, data: str): + self.struct.nest_terminal(Char, data) + + def handle_entityref(self, data: str): + self.struct.nest_terminal(Entity, data) + + def handle_pi(self, data: str): + self.struct.nest_terminal(Pi, data) + + def handle_comment(self, data: str): + self.struct.nest_terminal(Comment, data) + + +def tokenize_html(text: str, name: str = "", convert_charrefs: bool = False) -> Root: + parser = HtmlToAst(name, convert_charrefs=convert_charrefs) + return parser.feed(text) |