myst_parser/mdit_to_docutils/html_to_nodes.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139

"""Convert HTML to docutils nodes."""
from __future__ import annotations

import re
from typing import TYPE_CHECKING

from docutils import nodes

from myst_parser.parsers.parse_html import Data, tokenize_html

if TYPE_CHECKING:
    from .base import DocutilsRenderer


def make_error(
    document: nodes.document, error_msg: str, text: str, line_number: int
) -> nodes.system_message:
    return document.reporter.error(
        error_msg,
        nodes.literal_block(text, text),
        line=line_number,
    )


OPTION_KEYS_IMAGE = {"class", "alt", "height", "width", "align", "name"}
# note: docutils also has scale and target

OPTION_KEYS_ADMONITION = {"class", "name"}

# See https://github.com/micromark/micromark-extension-gfm-tagfilter
RE_FLOW = re.compile(
    r"<(\/?)(iframe|noembed|noframes|plaintext|script|style|title|textarea|xmp)(?=[\t\n\f\r />])",
    re.IGNORECASE,
)


def default_html(text: str, source: str, line_number: int) -> list[nodes.Element]:
    raw_html = nodes.raw("", text, format="html")
    raw_html.source = source
    raw_html.line = line_number
    return [raw_html]


def html_to_nodes(
    text: str, line_number: int, renderer: DocutilsRenderer
) -> list[nodes.Element]:
    """Convert HTML to docutils nodes."""
    if renderer.md_config.gfm_only:
        text, _ = RE_FLOW.subn(lambda s: s.group(0).replace("<", "&lt;"), text)

    enable_html_img = "html_image" in renderer.md_config.enable_extensions
    enable_html_admonition = "html_admonition" in renderer.md_config.enable_extensions
    if not (enable_html_img or enable_html_admonition):
        return default_html(text, renderer.document["source"], line_number)

    # parse the HTML to AST
    try:
        root = tokenize_html(text).strip(inplace=True, recurse=False)
    except Exception:
        msg_node = renderer.create_warning(
            "HTML could not be parsed", line=line_number, subtype="html"
        )
        return ([msg_node] if msg_node else []) + default_html(
            text, renderer.document["source"], line_number
        )

    if len(root) < 1:
        # if empty
        return default_html(text, renderer.document["source"], line_number)

    if not all(
        (enable_html_img and child.name == "img")
        or (
            enable_html_admonition
            and child.name == "div"
            and "admonition" in child.attrs.classes
        )
        for child in root
    ):
        return default_html(text, renderer.document["source"], line_number)

    nodes_list = []
    for child in root:

        if child.name == "img":
            if "src" not in child.attrs:
                return [
                    renderer.reporter.error(
                        "<img> missing 'src' attribute", line=line_number
                    )
                ]
            content = "\n".join(
                f":{k}: {v}"
                for k, v in sorted(child.attrs.items())
                if k in OPTION_KEYS_IMAGE
            )
            nodes_list.extend(
                renderer.run_directive(
                    "image", child.attrs["src"], content, line_number
                )
            )

        else:
            children = child.strip().children
            if (
                children
                and children[0].name in ("div", "p")
                and (
                    "title" in children[0].attrs.classes
                    or "admonition-title" in children[0].attrs.classes
                )
            ):
                title = "".join(child.render() for child in children.pop(0))
            else:
                title = "Note"

            options = "\n".join(
                f":{k}: {v}"
                for k, v in sorted(child.attrs.items())
                if k in OPTION_KEYS_ADMONITION
            ).rstrip()
            new_children = []
            for child in children:
                if child.name == "p":
                    new_children.extend(child.children)
                    new_children.append(Data("\n\n"))
                else:
                    new_children.append(child)
            content = (
                options
                + ("\n\n" if options else "")
                + "".join(child.render() for child in new_children).lstrip()
            )

            nodes_list.extend(
                renderer.run_directive("admonition", title, content, line_number)
            )

    return nodes_list