diff options
Diffstat (limited to '')
-rw-r--r-- | myst_parser/mdit_to_docutils/__init__.py | 1 | ||||
-rw-r--r-- | myst_parser/mdit_to_docutils/base.py | 1483 | ||||
-rw-r--r-- | myst_parser/mdit_to_docutils/html_to_nodes.py | 139 | ||||
-rw-r--r-- | myst_parser/mdit_to_docutils/sphinx_.py | 245 | ||||
-rw-r--r-- | myst_parser/mdit_to_docutils/utils.py | 36 |
5 files changed, 1904 insertions, 0 deletions
diff --git a/myst_parser/mdit_to_docutils/__init__.py b/myst_parser/mdit_to_docutils/__init__.py new file mode 100644 index 0000000..0b9307f --- /dev/null +++ b/myst_parser/mdit_to_docutils/__init__.py @@ -0,0 +1 @@ +"""Conversion of Markdown-it tokens to docutils AST.""" diff --git a/myst_parser/mdit_to_docutils/base.py b/myst_parser/mdit_to_docutils/base.py new file mode 100644 index 0000000..cedd6c3 --- /dev/null +++ b/myst_parser/mdit_to_docutils/base.py @@ -0,0 +1,1483 @@ +"""Convert Markdown-it tokens to docutils nodes.""" +from __future__ import annotations + +import inspect +import json +import os +import re +from collections import OrderedDict +from contextlib import contextmanager +from datetime import date, datetime +from types import ModuleType +from typing import TYPE_CHECKING, Any, Iterator, MutableMapping, Sequence, cast +from urllib.parse import urlparse + +import jinja2 +import yaml +from docutils import nodes +from docutils.frontend import OptionParser +from docutils.languages import get_language +from docutils.parsers.rst import Directive, DirectiveError +from docutils.parsers.rst import Parser as RSTParser +from docutils.parsers.rst import directives, roles +from docutils.parsers.rst.directives.misc import Include +from docutils.parsers.rst.languages import get_language as get_language_rst +from docutils.statemachine import StringList +from docutils.transforms.components import Filter +from docutils.utils import Reporter, new_document +from docutils.utils.code_analyzer import Lexer, LexerError, NumberLines +from markdown_it import MarkdownIt +from markdown_it.common.utils import escapeHtml +from markdown_it.renderer import RendererProtocol +from markdown_it.token import Token +from markdown_it.tree import SyntaxTreeNode + +from myst_parser._compat import findall +from myst_parser.config.main import MdParserConfig +from myst_parser.mocking import ( + MockIncludeDirective, + MockingError, + MockInliner, + MockRSTParser, + MockState, + 
MockStateMachine, +) +from myst_parser.parsers.directives import DirectiveParsingError, parse_directive_text +from .html_to_nodes import html_to_nodes +from .utils import is_external_url + +if TYPE_CHECKING: + from sphinx.environment import BuildEnvironment + + +def make_document(source_path="notset", parser_cls=RSTParser) -> nodes.document: + """Create a new docutils document, with the parser classes' default settings.""" + settings = OptionParser(components=(parser_cls,)).get_default_values() + return new_document(source_path, settings=settings) + + +REGEX_DIRECTIVE_START = re.compile(r"^[\s]{0,3}([`]{3,10}|[~]{3,10}|[:]{3,10})\{") + + +def token_line(token: SyntaxTreeNode, default: int | None = None) -> int: + """Retrieve the initial line of a token.""" + if not getattr(token, "map", None): + if default is not None: + return default + raise ValueError(f"token map not set: {token}") + return token.map[0] # type: ignore[index] + + +def create_warning( + document: nodes.document, + message: str, + *, + line: int | None = None, + append_to: nodes.Element | None = None, + wtype: str = "myst", + subtype: str = "other", +) -> nodes.system_message | None: + """Generate a warning, logging if it is necessary. + + Note this is overridden in the ``SphinxRenderer``, + to handle suppressed warning types. + """ + kwargs = {"line": line} if line is not None else {} + msg_node = document.reporter.warning(f"{message} [{wtype}.{subtype}]", **kwargs) + if append_to is not None: + append_to.append(msg_node) + return msg_node + + +class DocutilsRenderer(RendererProtocol): + """A markdown-it-py renderer to populate (in-place) a `docutils.document` AST. + + Note, this render is not dependent on Sphinx. 
+ """ + + __output__ = "docutils" + + def __init__(self, parser: MarkdownIt) -> None: + """Load the renderer (called by ``MarkdownIt``)""" + self.md = parser + self.rules = { + k: v + for k, v in inspect.getmembers(self, predicate=inspect.ismethod) + if k.startswith("render_") and k != "render_children" + } + + def __getattr__(self, name: str): + """Warn when the renderer has not been setup yet.""" + if name in ( + "md_env", + "md_config", + "md_options", + "document", + "current_node", + "reporter", + "language_module_rst", + "_level_to_elem", + ): + raise AttributeError( + f"'{name}' attribute is not available until setup_render() is called" + ) + raise AttributeError( + f"'{type(self).__name__}' object has no attribute '{name}'" + ) + + def setup_render( + self, options: dict[str, Any], env: MutableMapping[str, Any] + ) -> None: + """Setup the renderer with per render variables.""" + self.md_env = env + self.md_options = options + self.md_config: MdParserConfig = options["myst_config"] + self.document: nodes.document = options.get("document", make_document()) + self.current_node: nodes.Element = options.get("current_node", self.document) + self.reporter: Reporter = self.document.reporter + # note there are actually two possible language modules: + # one from docutils.languages, and one from docutils.parsers.rst.languages + self.language_module_rst: ModuleType = get_language_rst( + self.document.settings.language_code + ) + # a mapping of heading levels to its currently associated node + self._level_to_elem: dict[int, nodes.document | nodes.section] = { + 0: self.document + } + + @property + def sphinx_env(self) -> BuildEnvironment | None: + """Return the sphinx env, if using Sphinx.""" + try: + return self.document.settings.env + except AttributeError: + return None + + def create_warning( + self, + message: str, + *, + line: int | None = None, + append_to: nodes.Element | None = None, + wtype: str = "myst", + subtype: str = "other", + ) -> nodes.system_message 
| None: + """Generate a warning, logging if it is necessary. + + Note this is overridden in the ``SphinxRenderer``, + to handle suppressed warning types. + """ + return create_warning( + self.document, + message, + line=line, + append_to=append_to, + wtype=wtype, + subtype=subtype, + ) + + def _render_tokens(self, tokens: list[Token]) -> None: + """Render the tokens.""" + # propagate line number down to inline elements + for token in tokens: + if not token.map: + continue + # For docutils we want 1 based line numbers (not 0) + token.map = [token.map[0] + 1, token.map[1] + 1] + for token_child in token.children or []: + token_child.map = token.map + + # nest tokens + node_tree = SyntaxTreeNode(tokens) + + # move footnote definitions to env + self.md_env.setdefault("foot_refs", {}) + for node in node_tree.walk(include_self=True): + new_children = [] + for child in node.children: + if child.type == "footnote_reference": + label = child.meta["label"] + self.md_env["foot_refs"].setdefault(label, []).append(child) + else: + new_children.append(child) + + node.children = new_children + + # render + for child in node_tree.children: + # skip hidden? + if f"render_{child.type}" in self.rules: + self.rules[f"render_{child.type}"](child) + else: + self.create_warning( + f"No render method for: {child.type}", + line=token_line(child, default=0), + subtype="render", + append_to=self.current_node, + ) + + def render( + self, tokens: Sequence[Token], options, md_env: MutableMapping[str, Any] + ) -> nodes.document: + """Run the render on a token stream. 
+ + :param tokens: list on block tokens to render + :param options: params of parser instance + :param md_env: the markdown-it environment sandbox associated with the tokens, + containing additional metadata like reference info + """ + self.setup_render(options, md_env) + self._render_initialise() + self._render_tokens(list(tokens)) + self._render_finalise() + return self.document + + def _render_initialise(self) -> None: + """Initialise the render of the document.""" + self.current_node.extend( + html_meta_to_nodes( + self.md_config.html_meta, + document=self.document, + line=0, + reporter=self.reporter, + ) + ) + + def _render_finalise(self) -> None: + """Finalise the render of the document.""" + + # log warnings for duplicate reference definitions + # "duplicate_refs": [{"href": "ijk", "label": "B", "map": [4, 5], "title": ""}], + for dup_ref in self.md_env.get("duplicate_refs", []): + self.create_warning( + f"Duplicate reference definition: {dup_ref['label']}", + line=dup_ref["map"][0] + 1, + subtype="ref", + append_to=self.document, + ) + + # we don't use the foot_references stored in the env + # since references within directives/roles will have been added after + # those from the initial markdown parse + # instead we gather them from a walk of the created document + foot_refs = OrderedDict() + for refnode in findall(self.document)(nodes.footnote_reference): + if refnode["refname"] not in foot_refs: + foot_refs[refnode["refname"]] = True + + if foot_refs and self.md_config.footnote_transition: + self.current_node.append(nodes.transition(classes=["footnotes"])) + for footref in foot_refs: + foot_ref_tokens = self.md_env["foot_refs"].get(footref, []) + if len(foot_ref_tokens) > 1: + self.create_warning( + f"Multiple footnote definitions found for label: '{footref}'", + subtype="footnote", + append_to=self.current_node, + ) + + if len(foot_ref_tokens) < 1: + self.create_warning( + f"No footnote definitions found for label: '{footref}'", + subtype="footnote", + 
append_to=self.current_node, + ) + else: + self.render_footnote_reference(foot_ref_tokens[0]) + + # Add the wordcount, generated by the ``mdit_py_plugins.wordcount_plugin``. + wordcount_metadata = self.md_env.get("wordcount", {}) + if wordcount_metadata: + + # save the wordcount to the sphinx BuildEnvironment metadata + if self.sphinx_env is not None: + meta = self.sphinx_env.metadata.setdefault(self.sphinx_env.docname, {}) + meta["wordcount"] = wordcount_metadata + + # now add the wordcount as substitution definitions, + # so we can reference them in the document + for key in ("words", "minutes"): + value = wordcount_metadata.get(key, None) + if value is None: + continue + substitution_node = nodes.substitution_definition( + str(value), nodes.Text(str(value)) + ) + substitution_node.source = self.document["source"] + substitution_node["names"].append(f"wordcount-{key}") + self.document.note_substitution_def( + substitution_node, f"wordcount-{key}" + ) + + def nested_render_text( + self, text: str, lineno: int, inline: bool = False, allow_headings: bool = True + ) -> None: + """Render unparsed text (appending to the current node). + + :param text: the text to render + :param lineno: the starting line number of the text, within the full source + :param inline: whether the text is inline or block + :param allow_headings: whether to allow headings in the text + """ + if inline: + tokens = self.md.parseInline(text, self.md_env) + else: + tokens = self.md.parse(text + "\n", self.md_env) + + # remove front matter, if present, e.g. 
from included documents + if tokens and tokens[0].type == "front_matter": + tokens.pop(0) + + # update the line numbers + for token in tokens: + if token.map: + token.map = [token.map[0] + lineno, token.map[1] + lineno] + + current_match_titles = self.md_env.get("match_titles", None) + try: + self.md_env["match_titles"] = allow_headings + self._render_tokens(tokens) + finally: + self.md_env["match_titles"] = current_match_titles + + @contextmanager + def current_node_context( + self, node: nodes.Element, append: bool = False + ) -> Iterator: + """Context manager for temporarily setting the current node.""" + if append: + self.current_node.append(node) + current_node = self.current_node + self.current_node = node + yield + self.current_node = current_node + + def render_children(self, token: SyntaxTreeNode) -> None: + """Render the children of a token.""" + for child in token.children or []: + if f"render_{child.type}" in self.rules: + self.rules[f"render_{child.type}"](child) + else: + self.create_warning( + f"No render method for: {child.type}", + line=token_line(child, default=0), + subtype="render", + append_to=self.current_node, + ) + + def add_line_and_source_path(self, node, token: SyntaxTreeNode) -> None: + """Copy the line number and document source path to the docutils node.""" + try: + node.line = token_line(token) + except ValueError: + pass + node.source = self.document["source"] + + def add_line_and_source_path_r( + self, nodes: list[nodes.Element], token: SyntaxTreeNode + ) -> None: + """Copy the line number and document source path to the docutils nodes, + and recursively to all descendants. 
+ """ + for node in nodes: + self.add_line_and_source_path(node, token) + for child in findall(node)(): + self.add_line_and_source_path(child, token) + + def update_section_level_state(self, section: nodes.section, level: int) -> None: + """Update the section level state, with the new current section and level.""" + # find the closest parent section + parent_level = max( + section_level + for section_level in self._level_to_elem + if level > section_level + ) + parent = self._level_to_elem[parent_level] + + # if we are jumping up to a non-consecutive level, + # then warn about this, since this will not be propagated in the docutils AST + if (level > parent_level) and (parent_level + 1 != level): + msg = f"Non-consecutive header level increase; H{parent_level} to H{level}" + if parent_level == 0: + msg = f"Document headings start at H{level}, not H1" + self.create_warning( + msg, + line=section.line, + subtype="header", + append_to=self.current_node, + ) + + # append the new section to the parent + parent.append(section) + # update the state for this section level + self._level_to_elem[level] = section + + # Remove all descendant sections from the section level state + self._level_to_elem = { + section_level: section + for section_level, section in self._level_to_elem.items() + if section_level <= level + } + + def renderInlineAsText(self, tokens: list[SyntaxTreeNode]) -> str: + """Special kludge for image `alt` attributes to conform CommonMark spec. + + Don't try to use it! Spec requires to show `alt` content with stripped markup, + instead of simple escaping. 
+ """ + result = "" + + for token in tokens or []: + if token.type == "text": + result += token.content + # elif token.type == "image": + # result += self.renderInlineAsText(token.children) + else: + result += self.renderInlineAsText(token.children or []) + return result + + # ### render methods for commonmark tokens + + def render_paragraph(self, token: SyntaxTreeNode) -> None: + para = nodes.paragraph(token.children[0].content if token.children else "") + self.add_line_and_source_path(para, token) + with self.current_node_context(para, append=True): + self.render_children(token) + + def render_inline(self, token: SyntaxTreeNode) -> None: + self.render_children(token) + + def render_text(self, token: SyntaxTreeNode) -> None: + self.current_node.append(nodes.Text(token.content)) + + def render_bullet_list(self, token: SyntaxTreeNode) -> None: + list_node = nodes.bullet_list() + if token.markup: + list_node["bullet"] = token.markup + if token.attrs.get("class"): + # this is used e.g. by tasklist + list_node["classes"] = str(token.attrs["class"]).split() + self.add_line_and_source_path(list_node, token) + with self.current_node_context(list_node, append=True): + self.render_children(token) + + def render_ordered_list(self, token: SyntaxTreeNode) -> None: + list_node = nodes.enumerated_list(enumtype="arabic", prefix="") + list_node["suffix"] = token.markup # for CommonMark, this should be "." or ")" + if "start" in token.attrs: # starting number + list_node["start"] = token.attrs["start"] + self.add_line_and_source_path(list_node, token) + with self.current_node_context(list_node, append=True): + self.render_children(token) + + def render_list_item(self, token: SyntaxTreeNode) -> None: + item_node = nodes.list_item() + if token.attrs.get("class"): + # this is used e.g. 
by tasklist + item_node["classes"] = str(token.attrs["class"]).split() + self.add_line_and_source_path(item_node, token) + with self.current_node_context(item_node, append=True): + self.render_children(token) + + def render_em(self, token: SyntaxTreeNode) -> None: + node = nodes.emphasis() + self.add_line_and_source_path(node, token) + with self.current_node_context(node, append=True): + self.render_children(token) + + def render_softbreak(self, token: SyntaxTreeNode) -> None: + self.current_node.append(nodes.Text("\n")) + + def render_hardbreak(self, token: SyntaxTreeNode) -> None: + self.current_node.append(nodes.raw("", "<br />\n", format="html")) + self.current_node.append(nodes.raw("", "\\\\\n", format="latex")) + + def render_strong(self, token: SyntaxTreeNode) -> None: + node = nodes.strong() + self.add_line_and_source_path(node, token) + with self.current_node_context(node, append=True): + self.render_children(token) + + def render_blockquote(self, token: SyntaxTreeNode) -> None: + quote = nodes.block_quote() + self.add_line_and_source_path(quote, token) + with self.current_node_context(quote, append=True): + self.render_children(token) + + def render_hr(self, token: SyntaxTreeNode) -> None: + node = nodes.transition() + self.add_line_and_source_path(node, token) + self.current_node.append(node) + + def render_code_inline(self, token: SyntaxTreeNode) -> None: + node = nodes.literal(token.content, token.content) + self.add_line_and_source_path(node, token) + self.current_node.append(node) + + def create_highlighted_code_block( + self, + text: str, + lexer_name: str | None, + number_lines: bool = False, + lineno_start: int = 1, + source: str | None = None, + line: int | None = None, + node_cls: type[nodes.Element] = nodes.literal_block, + ) -> nodes.Element: + """Create a literal block with syntax highlighting. + + This mimics the behaviour of the `code-block` directive. 
+ + In docutils, this directive directly parses the text with the pygments lexer, + whereas in sphinx, the lexer name is only recorded as the `language` attribute, + and the text is lexed later by pygments within the `visit_literal_block` + method of the output format ``SphinxTranslator``. + + Note, this function does not add the literal block to the document. + """ + if self.sphinx_env is not None: + node = node_cls(text, text, language=lexer_name or "none") + if number_lines: + node["linenos"] = True + if lineno_start != 1: + node["highlight_args"] = {"linenostart": lineno_start} + else: + node = node_cls( + text, classes=["code"] + ([lexer_name] if lexer_name else []) + ) + try: + lex_tokens = Lexer( + text, + lexer_name or "", + "short" if self.md_config.highlight_code_blocks else "none", + ) + except LexerError as err: + self.reporter.warning( + str(err), + **{ + name: value + for name, value in (("source", source), ("line", line)) + if value is not None + }, + ) + lex_tokens = Lexer(text, lexer_name or "", "none") + + if number_lines: + lex_tokens = NumberLines( + lex_tokens, lineno_start, lineno_start + len(text.splitlines()) + ) + + for classes, value in lex_tokens: + if classes: + node += nodes.inline(value, value, classes=classes) + else: + # insert as Text to decrease the verbosity of the output + node += nodes.Text(value) + + if source is not None: + node.source = source + if line is not None: + node.line = line + return node + + def render_code_block(self, token: SyntaxTreeNode) -> None: + lexer = token.info.split()[0] if token.info else None + node = self.create_highlighted_code_block( + token.content, + lexer, + source=self.document["source"], + line=token_line(token, 0) or None, + ) + self.current_node.append(node) + + def render_fence(self, token: SyntaxTreeNode) -> None: + text = token.content + # Ensure that we'll have an empty string if info exists but is only spaces + info = token.info.strip() if token.info else token.info + language = 
info.split()[0] if info else "" + + if (not self.md_config.commonmark_only) and (not self.md_config.gfm_only): + if language == "{eval-rst}": + return self.render_restructuredtext(token) + if language.startswith("{") and language.endswith("}"): + return self.render_directive(token) + + if not language and self.sphinx_env is not None: + # use the current highlight setting, via the ``highlight`` directive, + # or ``highlight_language`` configuration. + language = self.sphinx_env.temp_data.get( + "highlight_language", self.sphinx_env.config.highlight_language + ) + + node = self.create_highlighted_code_block( + text, + language, + number_lines=language in self.md_config.number_code_blocks, + source=self.document["source"], + line=token_line(token, 0) or None, + ) + self.current_node.append(node) + + @property + def blocks_mathjax_processing(self) -> bool: + """Only add mathjax ignore classes if using sphinx, + and using the ``dollarmath`` extension, and ``myst_update_mathjax=True``. + """ + return ( + self.sphinx_env is not None + and "dollarmath" in self.md_config.enable_extensions + and self.md_config.update_mathjax + ) + + def render_heading(self, token: SyntaxTreeNode) -> None: + """Render a heading, e.g. `# Heading`.""" + + if self.md_env.get("match_titles", None) is False: + # this can occur if a nested parse is performed by a directive + # (such as an admonition) which contains a header. 
+ # this would break the document structure + self.create_warning( + "Disallowed nested header found, converting to rubric", + line=token_line(token, default=0), + subtype="nested_header", + append_to=self.current_node, + ) + rubric = nodes.rubric(token.content, "") + self.add_line_and_source_path(rubric, token) + with self.current_node_context(rubric, append=True): + self.render_children(token) + return + + level = int(token.tag[1]) + + # create the section node + new_section = nodes.section() + self.add_line_and_source_path(new_section, token) + # if a top level section, + # then add classes to set default mathjax processing to false + # we then turn it back on, on a per-node basis + if level == 1 and self.blocks_mathjax_processing: + new_section["classes"].extend(["tex2jax_ignore", "mathjax_ignore"]) + + # update the state of the section levels + self.update_section_level_state(new_section, level) + + # create the title for this section + title_node = nodes.title(token.children[0].content if token.children else "") + self.add_line_and_source_path(title_node, token) + new_section.append(title_node) + # render the heading children into the title + with self.current_node_context(title_node): + self.render_children(token) + + # create a target reference for the section, based on the heading text + name = nodes.fully_normalize_name(title_node.astext()) + new_section["names"].append(name) + self.document.note_implicit_target(new_section, new_section) + + # set the section as the current node for subsequent rendering + self.current_node = new_section + + def render_link(self, token: SyntaxTreeNode) -> None: + """Parse `<http://link.com>` or `[text](link "title")` syntax to docutils AST: + + - If `<>` autolink, forward to `render_autolink` + - If `myst_all_links_external` is True, forward to `render_external_url` + - If link is an external URL, forward to `render_external_url` + - External URLs start with a scheme (e.g. 
`http:`) in `myst_url_schemes`, + or any scheme if `myst_url_schemes` is None. + - Otherwise, forward to `render_internal_link` + """ + if token.info == "auto": # handles both autolink and linkify + return self.render_autolink(token) + + if ( + self.md_config.commonmark_only + or self.md_config.gfm_only + or self.md_config.all_links_external + ): + return self.render_external_url(token) + + # Check for external URL + url_scheme = urlparse(cast(str, token.attrGet("href") or "")).scheme + allowed_url_schemes = self.md_config.url_schemes + if (allowed_url_schemes is None and url_scheme) or ( + allowed_url_schemes is not None and url_scheme in allowed_url_schemes + ): + return self.render_external_url(token) + + return self.render_internal_link(token) + + def render_external_url(self, token: SyntaxTreeNode) -> None: + """Render link token `[text](link "title")`, + where the link has been identified as an external URL:: + + <reference refuri="link" title="title"> + text + + `text` can contain nested syntax, e.g. `[**bold**](url "title")`. + """ + ref_node = nodes.reference() + self.add_line_and_source_path(ref_node, token) + ref_node["refuri"] = cast(str, token.attrGet("href") or "") + title = token.attrGet("title") + if title: + ref_node["title"] = title + with self.current_node_context(ref_node, append=True): + self.render_children(token) + + def render_internal_link(self, token: SyntaxTreeNode) -> None: + """Render link token `[text](link "title")`, + where the link has not been identified as an external URL:: + + <reference refname="link" title="title"> + text + + `text` can contain nested syntax, e.g. `[**bold**](link "title")`. + + Note, this is overridden by `SphinxRenderer`, to use `pending_xref` nodes. 
+ """ + ref_node = nodes.reference() + self.add_line_and_source_path(ref_node, token) + ref_node["refname"] = cast(str, token.attrGet("href") or "") + self.document.note_refname(ref_node) + title = token.attrGet("title") + if title: + ref_node["title"] = title + with self.current_node_context(ref_node, append=True): + self.render_children(token) + + def render_autolink(self, token: SyntaxTreeNode) -> None: + refuri = escapeHtml(token.attrGet("href") or "") # type: ignore[arg-type] + ref_node = nodes.reference() + ref_node["refuri"] = refuri + self.add_line_and_source_path(ref_node, token) + with self.current_node_context(ref_node, append=True): + self.render_children(token) + + def render_html_inline(self, token: SyntaxTreeNode) -> None: + self.render_html_block(token) + + def render_html_block(self, token: SyntaxTreeNode) -> None: + node_list = html_to_nodes(token.content, token_line(token), self) + self.current_node.extend(node_list) + + def render_image(self, token: SyntaxTreeNode) -> None: + img_node = nodes.image() + self.add_line_and_source_path(img_node, token) + destination = cast(str, token.attrGet("src") or "") + + if self.md_env.get("relative-images", None) is not None and not is_external_url( + destination, None, True + ): + # make the path relative to an "including" document + # this is set when using the `relative-images` option of the MyST `include` directive + destination = os.path.normpath( + os.path.join( + self.md_env.get("relative-images", ""), + os.path.normpath(destination), + ) + ) + + img_node["uri"] = destination + + img_node["alt"] = self.renderInlineAsText(token.children or []) + title = token.attrGet("title") + if title: + img_node["title"] = token.attrGet("title") + + # apply other attributes that can be set on the image + if "class" in token.attrs: + img_node["classes"].extend(str(token.attrs["class"]).split()) + if "width" in token.attrs: + try: + width = directives.length_or_percentage_or_unitless( + str(token.attrs["width"]) + ) + 
except ValueError: + self.create_warning( + f"Invalid width value for image: {token.attrs['width']!r}", + line=token_line(token, default=0), + subtype="image", + append_to=self.current_node, + ) + else: + img_node["width"] = width + if "height" in token.attrs: + try: + height = directives.length_or_unitless(str(token.attrs["height"])) + except ValueError: + self.create_warning( + f"Invalid height value for image: {token.attrs['height']!r}", + line=token_line(token, default=0), + subtype="image", + append_to=self.current_node, + ) + else: + img_node["height"] = height + if "align" in token.attrs: + if token.attrs["align"] not in ("left", "center", "right"): + self.create_warning( + f"Invalid align value for image: {token.attrs['align']!r}", + line=token_line(token, default=0), + subtype="image", + append_to=self.current_node, + ) + else: + img_node["align"] = token.attrs["align"] + if "id" in token.attrs: + name = nodes.fully_normalize_name(str(token.attrs["id"])) + img_node["names"].append(name) + self.document.note_explicit_target(img_node, img_node) + + self.current_node.append(img_node) + + # ### render methods for plugin tokens + + def render_front_matter(self, token: SyntaxTreeNode) -> None: + """Pass document front matter data.""" + position = token_line(token, default=0) + + if isinstance(token.content, str): + try: + data = yaml.safe_load(token.content) + except (yaml.parser.ParserError, yaml.scanner.ScannerError): + self.create_warning( + "Malformed YAML", + line=position, + append_to=self.current_node, + subtype="topmatter", + ) + return + else: + data = token.content + + if not isinstance(data, dict): + self.create_warning( + f"YAML is not a dict: {type(data)}", + line=position, + append_to=self.current_node, + subtype="topmatter", + ) + return + + fields = { + k: v + for k, v in data.items() + if k not in ("myst", "mystnb", "substitutions", "html_meta") + } + if fields: + field_list = self.dict_to_fm_field_list( + fields, 
language_code=self.document.settings.language_code + ) + self.current_node.append(field_list) + + if data.get("title") and self.md_config.title_to_header: + self.nested_render_text(f"# {data['title']}", 0) + + def dict_to_fm_field_list( + self, data: dict[str, Any], language_code: str, line: int = 0 + ) -> nodes.field_list: + """Render each key/val pair as a docutils ``field_node``. + + Bibliographic keys below will be parsed as Markdown, + all others will be left as literal text. + + The field list should be at the start of the document, + and will then be converted to a `docinfo` node during the + `docutils.docutils.transforms.frontmatter.DocInfo` transform (priority 340), + and bibliographic keys (or their translation) will be converted to nodes:: + + {'author': docutils.nodes.author, + 'authors': docutils.nodes.authors, + 'organization': docutils.nodes.organization, + 'address': docutils.nodes.address, + 'contact': docutils.nodes.contact, + 'version': docutils.nodes.version, + 'revision': docutils.nodes.revision, + 'status': docutils.nodes.status, + 'date': docutils.nodes.date, + 'copyright': docutils.nodes.copyright, + 'dedication': docutils.nodes.topic, + 'abstract': docutils.nodes.topic} + + Also, the 'dedication' and 'abstract' will be placed outside the `docinfo`, + and so will always be shown in the document. + + If using sphinx, this `docinfo` node will later be extracted from the AST, + by the `DoctreeReadEvent` transform (priority 880), + calling `MetadataCollector.process_doc`. + In this case keys and values will be converted to strings and stored in + `app.env.metadata[app.env.docname]` + + See + https://www.sphinx-doc.org/en/master/usage/restructuredtext/field-lists.html + for docinfo fields used by sphinx. 
+ + """ + field_list = nodes.field_list() + field_list.source, field_list.line = self.document["source"], line + + bibliofields = get_language(language_code).bibliographic_fields + + for key, value in data.items(): + if not isinstance(value, (str, int, float, date, datetime)): + value = json.dumps(value) + value = str(value) + body = nodes.paragraph() + body.source, body.line = self.document["source"], line + if key in bibliofields: + with self.current_node_context(body): + self.nested_render_text(value, line, inline=True) + else: + body += nodes.literal(value, value) + + field_node = nodes.field() + field_node.source = value + field_node += nodes.field_name(key, "", nodes.Text(key)) + field_node += nodes.field_body(value, *[body]) + field_list += field_node + + return field_list + + def render_table(self, token: SyntaxTreeNode) -> None: + + # markdown-it table always contains at least a header: + assert token.children + header = token.children[0] + # with one header row + assert header.children + header_row = header.children[0] + assert header_row.children + + # top-level element + table = nodes.table() + table["classes"] += ["colwidths-auto"] + self.add_line_and_source_path(table, token) + self.current_node.append(table) + + # column settings element + maxcols = len(header_row.children) + colwidths = [100 // maxcols] * maxcols + tgroup = nodes.tgroup(cols=len(colwidths)) + table += tgroup + for colwidth in colwidths: + colspec = nodes.colspec(colwidth=colwidth) + tgroup += colspec + + # header + thead = nodes.thead() + tgroup += thead + with self.current_node_context(thead): + self.render_table_row(header_row) + + # body + if len(token.children) > 1: + body = token.children[1] + tbody = nodes.tbody() + tgroup += tbody + with self.current_node_context(tbody): + for body_row in body.children or []: + self.render_table_row(body_row) + + def render_table_row(self, token: SyntaxTreeNode) -> None: + row = nodes.row() + with self.current_node_context(row, append=True): 
+ for child in token.children or []: + entry = nodes.entry() + para = nodes.paragraph( + child.children[0].content if child.children else "" + ) + style = child.attrGet("style") # i.e. the alignment when using e.g. :-- + if style and style in ( + "text-align:left", + "text-align:right", + "text-align:center", + ): + entry["classes"].append(f"text-{cast(str, style).split(':')[1]}") + with self.current_node_context(entry, append=True): + with self.current_node_context(para, append=True): + self.render_children(child) + + def render_s(self, token: SyntaxTreeNode) -> None: + """Render a strikethrough token.""" + # TODO strikethrough not currently directly supported in docutils + self.create_warning( + "Strikethrough is currently only supported in HTML output", + line=token_line(token, 0), + subtype="strikethrough", + append_to=self.current_node, + ) + self.current_node.append(nodes.raw("", "<s>", format="html")) + self.render_children(token) + self.current_node.append(nodes.raw("", "</s>", format="html")) + + def render_math_inline(self, token: SyntaxTreeNode) -> None: + content = token.content + node = nodes.math(content, content) + self.add_line_and_source_path(node, token) + self.current_node.append(node) + + def render_math_inline_double(self, token: SyntaxTreeNode) -> None: + content = token.content + node = nodes.math_block(content, content, nowrap=False, number=None) + self.add_line_and_source_path(node, token) + self.current_node.append(node) + + def render_math_single(self, token: SyntaxTreeNode) -> None: + content = token.content + node = nodes.math(content, content) + self.add_line_and_source_path(node, token) + self.current_node.append(node) + + def render_math_block(self, token: SyntaxTreeNode) -> None: + content = token.content + node = nodes.math_block(content, content, nowrap=False, number=None) + self.add_line_and_source_path(node, token) + self.current_node.append(node) + + def render_amsmath(self, token: SyntaxTreeNode) -> None: + # note docutils 
does not currently support the nowrap attribute + # or equation numbering, so this is overridden in the sphinx renderer + node = nodes.math_block( + token.content, token.content, nowrap=True, classes=["amsmath"] + ) + if token.meta["numbered"] != "*": + node["numbered"] = True + self.add_line_and_source_path(node, token) + self.current_node.append(node) + + def render_footnote_ref(self, token: SyntaxTreeNode) -> None: + """Footnote references are added as auto-numbered, + .i.e. `[^a]` is read as rST `[#a]_` + """ + target = token.meta["label"] + + refnode = nodes.footnote_reference(f"[^{target}]") + self.add_line_and_source_path(refnode, token) + if not target.isdigit(): + refnode["auto"] = 1 + self.document.note_autofootnote_ref(refnode) + else: + refnode += nodes.Text(target) + + refnode["refname"] = target + self.document.note_footnote_ref(refnode) + + self.current_node.append(refnode) + + def render_footnote_reference(self, token: SyntaxTreeNode) -> None: + target = token.meta["label"] + + footnote = nodes.footnote() + self.add_line_and_source_path(footnote, token) + footnote["names"].append(target) + if not target.isdigit(): + footnote["auto"] = 1 + self.document.note_autofootnote(footnote) + else: + footnote += nodes.label("", target) + self.document.note_footnote(footnote) + self.document.note_explicit_target(footnote, footnote) + with self.current_node_context(footnote, append=True): + self.render_children(token) + + def render_myst_block_break(self, token: SyntaxTreeNode) -> None: + block_break = nodes.comment(token.content, token.content) + block_break["classes"] += ["block_break"] + self.add_line_and_source_path(block_break, token) + self.current_node.append(block_break) + + def render_myst_target(self, token: SyntaxTreeNode) -> None: + text = token.content + name = nodes.fully_normalize_name(text) + target = nodes.target(text) + target["names"].append(name) + self.add_line_and_source_path(target, token) + self.document.note_explicit_target(target, 
self.current_node) + self.current_node.append(target) + + def render_myst_line_comment(self, token: SyntaxTreeNode) -> None: + self.current_node.append(nodes.comment(token.content, token.content.strip())) + + def render_myst_role(self, token: SyntaxTreeNode) -> None: + name = token.meta["name"] + text = token.content + rawsource = f":{name}:`{token.content}`" + lineno = token_line(token) if token.map else 0 + role_func, messages = roles.role( + name, self.language_module_rst, lineno, self.reporter + ) + inliner = MockInliner(self) + if role_func: + nodes, messages2 = role_func(name, rawsource, text, lineno, inliner) + # return nodes, messages + messages2 + self.current_node += nodes + else: + message = self.reporter.error( + f'Unknown interpreted text role "{name}".', line=lineno + ) + problematic = inliner.problematic(text, rawsource, message) + self.current_node += problematic + + def render_colon_fence(self, token: SyntaxTreeNode) -> None: + """Render a code fence with ``:`` colon delimiters.""" + + if token.content.startswith(":::"): + # the content starts with a nested fence block, + # but must distinguish between ``:options:``, so we add a new line + assert token.token is not None, '"colon_fence" must have a `token`' + linear_token = token.token.copy() + linear_token.content = "\n" + linear_token.content + token.token = linear_token + + return self.render_fence(token) + + def render_dl(self, token: SyntaxTreeNode) -> None: + """Render a definition list.""" + node = nodes.definition_list(classes=["simple", "myst"]) + self.add_line_and_source_path(node, token) + with self.current_node_context(node, append=True): + item = None + for child in token.children or []: + if child.type == "dt": + item = nodes.definition_list_item() + self.add_line_and_source_path(item, child) + with self.current_node_context(item, append=True): + term = nodes.term( + child.children[0].content if child.children else "" + ) + self.add_line_and_source_path(term, child) + with 
self.current_node_context(term, append=True): + self.render_children(child) + elif child.type == "dd": + if item is None: + error = self.reporter.error( + ( + "Found a definition in a definition list, " + "with no preceding term" + ), + # nodes.literal_block(content, content), + line=token_line(child), + ) + self.current_node += [error] + with self.current_node_context(item): + definition = nodes.definition() + self.add_line_and_source_path(definition, child) + with self.current_node_context(definition, append=True): + self.render_children(child) + else: + error_msg = self.reporter.error( + ( + "Expected a term/definition as a child of a definition list" + f", but found a: {child.type}" + ), + # nodes.literal_block(content, content), + line=token_line(child), + ) + self.current_node += [error_msg] + + def render_field_list(self, token: SyntaxTreeNode) -> None: + """Render a field list.""" + field_list = nodes.field_list(classes=["myst"]) + self.add_line_and_source_path(field_list, token) + with self.current_node_context(field_list, append=True): + # raise ValueError(token.pretty(show_text=True)) + children = (token.children or [])[:] + while children: + child = children.pop(0) + if not child.type == "fieldlist_name": + error_msg = self.reporter.error( + ( + "Expected a fieldlist_name as a child of a field_list" + f", but found a: {child.type}" + ), + # nodes.literal_block(content, content), + line=token_line(child), + ) + self.current_node += [error_msg] + break + field = nodes.field() + self.add_line_and_source_path(field, child) + field_list += field + field_name = nodes.field_name() + self.add_line_and_source_path(field_name, child) + field += field_name + with self.current_node_context(field_name): + self.render_children(child) + field_body = nodes.field_body() + self.add_line_and_source_path(field_name, child) + field += field_body + if children and children[0].type == "fieldlist_body": + child = children.pop(0) + with self.current_node_context(field_body): + 
self.render_children(child) + + def render_restructuredtext(self, token: SyntaxTreeNode) -> None: + """Render the content of the token as restructuredtext.""" + # copy necessary elements (source, line no, env, reporter) + newdoc = make_document() + newdoc["source"] = self.document["source"] + newdoc.settings = self.document.settings + newdoc.reporter = self.reporter + # pad the line numbers artificially so they offset with the fence block + pseudosource = ("\n" * token_line(token)) + token.content + # actually parse the rst into our document + MockRSTParser().parse(pseudosource, newdoc) + for node in newdoc: + if node["names"]: + self.document.note_explicit_target(node, node) + self.current_node.extend(newdoc.children) + + def render_directive(self, token: SyntaxTreeNode) -> None: + """Render special fenced code blocks as directives.""" + first_line = token.info.split(maxsplit=1) + name = first_line[0][1:-1] + arguments = "" if len(first_line) == 1 else first_line[1] + content = token.content + position = token_line(token) + nodes_list = self.run_directive(name, arguments, content, position) + self.current_node += nodes_list + + def run_directive( + self, name: str, first_line: str, content: str, position: int + ) -> list[nodes.Element]: + """Run a directive and return the generated nodes. + + :param name: the name of the directive + :param first_line: The text on the same line as the directive name. + May be an argument or body text, dependent on the directive + :param content: All text after the first line. Can include options. 
+ :param position: The line number of the first line + + """ + # TODO directive name white/black lists + + self.document.current_line = position + + # get directive class + output: tuple[Directive, list] = directives.directive( + name, self.language_module_rst, self.document + ) + directive_class, messages = output + if not directive_class: + error = self.reporter.error( + f'Unknown directive type "{name}".\n', + # nodes.literal_block(content, content), + line=position, + ) + return [error] + messages + + if issubclass(directive_class, Include): + # this is a Markdown only option, + # to allow for altering relative image reference links + directive_class.option_spec["relative-images"] = directives.flag + directive_class.option_spec["relative-docs"] = directives.path + + try: + arguments, options, body_lines, content_offset = parse_directive_text( + directive_class, first_line, content + ) + except DirectiveParsingError as error: + error = self.reporter.error( + f"Directive '{name}': {error}", + nodes.literal_block(content, content), + line=position, + ) + return [error] + + # initialise directive + if issubclass(directive_class, Include): + directive_instance = MockIncludeDirective( + self, + name=name, + klass=directive_class, + arguments=arguments, + options=options, + body=body_lines, + lineno=position, + ) + else: + state_machine = MockStateMachine(self, position) + state = MockState(self, state_machine, position) + directive_instance = directive_class( + name=name, + # the list of positional arguments + arguments=arguments, + # a dictionary mapping option names to values + options=options, + # the directive content line by line + content=StringList(body_lines, self.document["source"]), + # the absolute line number of the first line of the directive + lineno=position, + # the line offset of the first line of the content + content_offset=content_offset, + # a string containing the entire directive + block_text="\n".join(body_lines), + state=state, + 
state_machine=state_machine, + ) + + # run directive + try: + result = directive_instance.run() + except DirectiveError as error: + msg_node = self.reporter.system_message( + error.level, error.msg, line=position + ) + msg_node += nodes.literal_block(content, content) + result = [msg_node] + except MockingError as exc: + error_msg = self.reporter.error( + "Directive '{}' cannot be mocked: {}: {}".format( + name, exc.__class__.__name__, exc + ), + nodes.literal_block(content, content), + line=position, + ) + return [error_msg] + + assert isinstance( + result, list + ), f'Directive "{name}" must return a list of nodes.' + for i in range(len(result)): + assert isinstance( + result[i], nodes.Node + ), 'Directive "{}" returned non-Node object (index {}): {}'.format( + name, i, result[i] + ) + return result + + def render_substitution_inline(self, token: SyntaxTreeNode) -> None: + """Render inline substitution {{key}}.""" + self.render_substitution(token, inline=True) + + def render_substitution_block(self, token: SyntaxTreeNode) -> None: + """Render block substitution {{key}}.""" + self.render_substitution(token, inline=False) + + def render_substitution(self, token: SyntaxTreeNode, inline: bool) -> None: + """Substitutions are rendered by: + + 1. Combining global substitutions with front-matter substitutions + to create a variable context (front-matter takes priority) + 2. Add the sphinx `env` to the variable context (if available) + 3. Create the string content with Jinja2 (passing it the variable context) + 4. If the substitution is inline and not a directive, + parse to nodes ignoring block syntaxes (like lists or block-quotes), + otherwise parse to nodes with all syntax rules. 
+ + """ + position = token_line(token) + + # front-matter substitutions take priority over config ones + variable_context: dict[str, Any] = {**self.md_config.substitutions} + if self.sphinx_env is not None: + variable_context["env"] = self.sphinx_env + + # fail on undefined variables + env = jinja2.Environment(undefined=jinja2.StrictUndefined) + + # try rendering + try: + rendered = env.from_string(f"{{{{{token.content}}}}}").render( + variable_context + ) + except Exception as error: + error_msg = self.reporter.error( + f"Substitution error:{error.__class__.__name__}: {error}", + line=position, + ) + self.current_node += [error_msg] + return + + # handle circular references + ast = env.parse(f"{{{{{token.content}}}}}") + references = { + n.name for n in ast.find_all(jinja2.nodes.Name) if n.name != "env" + } + self.document.sub_references = getattr(self.document, "sub_references", set()) + cyclic = references.intersection(self.document.sub_references) + if cyclic: + error_msg = self.reporter.error( + f"circular substitution reference: {cyclic}", + line=position, + ) + self.current_node += [error_msg] + return + + # TODO improve error reporting; + # at present, for a multi-line substitution, + # an error may point to a line lower than the substitution + # should it point to the source of the substitution? 
+ # or the error message should at least indicate that its a substitution + + # we record used references before nested parsing, then remove them after + self.document.sub_references.update(references) + try: + if inline and not REGEX_DIRECTIVE_START.match(rendered): + self.nested_render_text(rendered, position, inline=True) + else: + self.nested_render_text(rendered, position, allow_headings=False) + finally: + self.document.sub_references.difference_update(references) + + +def html_meta_to_nodes( + data: dict[str, Any], document: nodes.document, line: int, reporter: Reporter +) -> list[nodes.pending | nodes.system_message]: + """Replicate the `meta` directive, + by converting a dictionary to a list of pending meta nodes + + See: + https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#html-metadata + """ + if not data: + return [] + + try: + # if sphinx available + from sphinx.addnodes import meta as meta_cls + except ImportError: + try: + # docutils >= 0.19 + meta_cls = nodes.meta # type: ignore + except AttributeError: + from docutils.parsers.rst.directives.html import MetaBody + + meta_cls = MetaBody.meta # type: ignore + + output = [] + + for key, value in data.items(): + content = str(value or "") + meta_node = meta_cls(content) + meta_node.source = document["source"] + meta_node.line = line + meta_node["content"] = content + try: + if not content: + raise ValueError("No content") + for i, key_part in enumerate(key.split()): + if "=" not in key_part and i == 0: + meta_node["name"] = key_part + continue + if "=" not in key_part: + raise ValueError(f"no '=' in {key_part}") + attr_name, attr_val = key_part.split("=", 1) + if not (attr_name and attr_val): + raise ValueError(f"malformed {key_part}") + meta_node[attr_name.lower()] = attr_val + except ValueError as error: + msg = reporter.error(f'Error parsing meta tag attribute "{key}": {error}.') + output.append(msg) + continue + + pending = nodes.pending( + Filter, + {"component": "writer", 
"format": "html", "nodes": [meta_node]}, + ) + document.note_pending(pending) + output.append(pending) + + return output diff --git a/myst_parser/mdit_to_docutils/html_to_nodes.py b/myst_parser/mdit_to_docutils/html_to_nodes.py new file mode 100644 index 0000000..2cc3066 --- /dev/null +++ b/myst_parser/mdit_to_docutils/html_to_nodes.py @@ -0,0 +1,139 @@ +"""Convert HTML to docutils nodes.""" +from __future__ import annotations + +import re +from typing import TYPE_CHECKING + +from docutils import nodes + +from myst_parser.parsers.parse_html import Data, tokenize_html + +if TYPE_CHECKING: + from .base import DocutilsRenderer + + +def make_error( + document: nodes.document, error_msg: str, text: str, line_number: int +) -> nodes.system_message: + return document.reporter.error( + error_msg, + nodes.literal_block(text, text), + line=line_number, + ) + + +OPTION_KEYS_IMAGE = {"class", "alt", "height", "width", "align", "name"} +# note: docutils also has scale and target + +OPTION_KEYS_ADMONITION = {"class", "name"} + +# See https://github.com/micromark/micromark-extension-gfm-tagfilter +RE_FLOW = re.compile( + r"<(\/?)(iframe|noembed|noframes|plaintext|script|style|title|textarea|xmp)(?=[\t\n\f\r />])", + re.IGNORECASE, +) + + +def default_html(text: str, source: str, line_number: int) -> list[nodes.Element]: + raw_html = nodes.raw("", text, format="html") + raw_html.source = source + raw_html.line = line_number + return [raw_html] + + +def html_to_nodes( + text: str, line_number: int, renderer: DocutilsRenderer +) -> list[nodes.Element]: + """Convert HTML to docutils nodes.""" + if renderer.md_config.gfm_only: + text, _ = RE_FLOW.subn(lambda s: s.group(0).replace("<", "<"), text) + + enable_html_img = "html_image" in renderer.md_config.enable_extensions + enable_html_admonition = "html_admonition" in renderer.md_config.enable_extensions + if not (enable_html_img or enable_html_admonition): + return default_html(text, renderer.document["source"], line_number) + + # 
parse the HTML to AST + try: + root = tokenize_html(text).strip(inplace=True, recurse=False) + except Exception: + msg_node = renderer.create_warning( + "HTML could not be parsed", line=line_number, subtype="html" + ) + return ([msg_node] if msg_node else []) + default_html( + text, renderer.document["source"], line_number + ) + + if len(root) < 1: + # if empty + return default_html(text, renderer.document["source"], line_number) + + if not all( + (enable_html_img and child.name == "img") + or ( + enable_html_admonition + and child.name == "div" + and "admonition" in child.attrs.classes + ) + for child in root + ): + return default_html(text, renderer.document["source"], line_number) + + nodes_list = [] + for child in root: + + if child.name == "img": + if "src" not in child.attrs: + return [ + renderer.reporter.error( + "<img> missing 'src' attribute", line=line_number + ) + ] + content = "\n".join( + f":{k}: {v}" + for k, v in sorted(child.attrs.items()) + if k in OPTION_KEYS_IMAGE + ) + nodes_list.extend( + renderer.run_directive( + "image", child.attrs["src"], content, line_number + ) + ) + + else: + children = child.strip().children + if ( + children + and children[0].name in ("div", "p") + and ( + "title" in children[0].attrs.classes + or "admonition-title" in children[0].attrs.classes + ) + ): + title = "".join(child.render() for child in children.pop(0)) + else: + title = "Note" + + options = "\n".join( + f":{k}: {v}" + for k, v in sorted(child.attrs.items()) + if k in OPTION_KEYS_ADMONITION + ).rstrip() + new_children = [] + for child in children: + if child.name == "p": + new_children.extend(child.children) + new_children.append(Data("\n\n")) + else: + new_children.append(child) + content = ( + options + + ("\n\n" if options else "") + + "".join(child.render() for child in new_children).lstrip() + ) + + nodes_list.extend( + renderer.run_directive("admonition", title, content, line_number) + ) + + return nodes_list diff --git 
# myst_parser/mdit_to_docutils/sphinx_.py
"""Convert Markdown-it tokens to docutils nodes, including sphinx specific elements."""
from __future__ import annotations

import os
from pathlib import Path
from typing import cast
from urllib.parse import unquote
from uuid import uuid4

from docutils import nodes
from markdown_it.tree import SyntaxTreeNode
from sphinx import addnodes
from sphinx.domains.math import MathDomain
from sphinx.domains.std import StandardDomain
from sphinx.environment import BuildEnvironment
from sphinx.util import logging
from sphinx.util.nodes import clean_astext

from myst_parser.mdit_to_docutils.base import DocutilsRenderer

LOGGER = logging.getLogger(__name__)


def create_warning(
    document: nodes.document,
    message: str,
    *,
    line: int | None = None,
    append_to: nodes.Element | None = None,
    wtype: str = "myst",
    subtype: str = "other",
) -> nodes.system_message | None:
    """Generate a warning, logging it if necessary.

    The message is suffixed with ``[<wtype>.<subtype>]`` and emitted through
    ``document.reporter.warning``, unless that type/subtype pair is listed in
    the ``suppress_warnings`` configuration, in which case nothing is emitted.

    :param document: the document whose reporter and sphinx settings are used
    :param message: the warning text (without the type suffix)
    :param line: the source line to attach to the warning, if known
    :param append_to: if given, the created message node is appended to it
    :param wtype: the warning type, first half of the suppression key
    :param subtype: the warning sub-type, second half of the suppression key
    :return: ``None`` on every path of the current implementation

    NOTE(review): the return annotation allows a ``system_message``, but the
    created node is only ever exposed via ``append_to`` -- confirm this is
    intended before relying on the return value.
    """
    message = f"{message} [{wtype}.{subtype}]"
    # only forward `line` when it is known, so the reporter is not passed None
    kwargs = {"line": line} if line is not None else {}

    if logging.is_suppressed_warning(
        wtype, subtype, document.settings.env.app.config.suppress_warnings
    ):
        return None

    msg_node = document.reporter.warning(message, **kwargs)
    if append_to is not None:
        append_to.append(msg_node)

    return None


class SphinxRenderer(DocutilsRenderer):
    """A markdown-it-py renderer to populate (in-place) a `docutils.document` AST.

    This is a sub-class of `DocutilsRenderer` that handles sphinx specific aspects,
    such as cross-referencing.
    """

    @property
    def doc_env(self) -> BuildEnvironment:
        """Return the sphinx build environment stored on the document settings."""
        return self.document.settings.env

    def create_warning(
        self,
        message: str,
        *,
        line: int | None = None,
        append_to: nodes.Element | None = None,
        wtype: str = "myst",
        subtype: str = "other",
    ) -> nodes.system_message | None:
        """Generate a warning, logging it if necessary.

        Delegates to the module-level ``create_warning`` with this renderer's
        document, so the ``suppress_warnings`` configuration is honoured.

        :return: whatever the module-level ``create_warning`` returns
        """
        return create_warning(
            self.document,
            message,
            line=line,
            append_to=append_to,
            wtype=wtype,
            subtype=subtype,
        )

    def render_internal_link(self, token: SyntaxTreeNode) -> None:
        """Render link token `[text](link "title")`,
        where the link has not been identified as an external URL.

        The link becomes a ``download_reference`` when it points at an existing
        file that does not end with one of the configured source suffixes,
        and a ``pending_xref`` (with ``refwarn=True``) otherwise.
        The link's children are rendered inside an ``inline`` node
        nested in that reference node.
        """
        destination = unquote(cast(str, token.attrGet("href") or ""))

        # make the path relative to an "including" document
        # this is set when using the `relative-docs` option of the MyST `include` directive
        relative_include = self.md_env.get("relative-docs", None)
        # element 0 is the prefix that triggers the rewrite,
        # the remaining two elements are unpacked as (source_dir, include_dir)
        if relative_include is not None and destination.startswith(relative_include[0]):
            source_dir, include_dir = relative_include[1:]
            destination = os.path.relpath(
                os.path.join(include_dir, os.path.normpath(destination)), source_dir
            )

        # candidate file path, resolved against the current document's directory
        potential_path = (
            Path(self.doc_env.doc2path(self.doc_env.docname)).parent / destination
            if self.doc_env.srcdir  # not set in some test situations
            else None
        )
        if (
            potential_path
            and potential_path.is_file()
            and not any(
                destination.endswith(suffix)
                for suffix in self.doc_env.config.source_suffix
            )
        ):
            wrap_node = addnodes.download_reference(
                refdoc=self.doc_env.docname,
                reftarget=destination,
                reftype="myst",
                refdomain=None,  # Added to enable cross-linking
                refexplicit=len(token.children or []) > 0,
                refwarn=False,
            )
            classes = ["xref", "download", "myst"]
            # with no explicit link text, fall back to showing the destination
            text = destination if not token.children else ""
        else:
            wrap_node = addnodes.pending_xref(
                refdoc=self.doc_env.docname,
                reftarget=destination,
                reftype="myst",
                refdomain=None,  # Added to enable cross-linking
                refexplicit=len(token.children or []) > 0,
                refwarn=True,
            )
            classes = ["xref", "myst"]
            text = ""

        self.add_line_and_source_path(wrap_node, token)
        title = token.attrGet("title")
        if title:
            wrap_node["title"] = title
        self.current_node.append(wrap_node)

        inner_node = nodes.inline("", text, classes=classes)
        wrap_node.append(inner_node)
        with self.current_node_context(inner_node):
            self.render_children(token)

    def render_heading(self, token: SyntaxTreeNode) -> None:
        """This extends the docutils method, to allow for the addition of heading ids.
        These ids are computed by the ``markdown-it-py`` ``anchors_plugin``
        as "slugs" which are unique to a document.

        The approach is similar to ``sphinx.ext.autosectionlabel``

        The label ``<doc path>#<slug>`` is registered in the ``std`` domain
        (both ``labels`` and ``anonlabels``), and a warning with subtype
        ``anchor`` is created if that label is already registered.
        """
        super().render_heading(token)

        # nothing to label unless the current node is a section
        if not isinstance(self.current_node, nodes.section):
            return

        # create the slug string
        # (`cast` has no runtime effect, so `slug` is None when there is no "id")
        slug = cast(str, token.attrGet("id"))
        if slug is None:
            return

        section = self.current_node
        doc_slug = self.doc_env.doc2path(self.doc_env.docname, base=False) + "#" + slug

        # save the reference in the standard domain, so that it can be handled properly
        domain = cast(StandardDomain, self.doc_env.get_domain("std"))
        if doc_slug in domain.labels:
            other_doc = self.doc_env.doc2path(domain.labels[doc_slug][0])
            self.create_warning(
                f"duplicate label {doc_slug}, other instance in {other_doc}",
                line=section.line,
                subtype="anchor",
            )
        # the label is (re)registered even when a duplicate was reported
        labelid = section["ids"][0]
        domain.anonlabels[doc_slug] = self.doc_env.docname, labelid
        domain.labels[doc_slug] = (
            self.doc_env.docname,
            labelid,
            clean_astext(section[0]),
        )

        self.doc_env.metadata[self.doc_env.docname]["myst_anchors"] = True
        section["myst-anchor"] = doc_slug

    def render_math_block_label(self, token: SyntaxTreeNode) -> None:
        """Render math with referenceable labels, e.g. ``$a=1$ (label)``.

        The label is read from ``token.info``; a target node for it is appended
        to the current node, immediately before the math block itself.
        """
        label = token.info
        content = token.content
        node = nodes.math_block(
            content, content, nowrap=False, number=None, label=label
        )
        target = self.add_math_target(node)
        self.add_line_and_source_path(target, token)
        self.current_node.append(target)
        self.add_line_and_source_path(node, token)
        self.current_node.append(node)

    def _random_label(self) -> str:
        """Return a new random label (the string form of a ``uuid4``)."""
        return str(uuid4())

    def render_amsmath(self, token: SyntaxTreeNode) -> None:
        """Renderer for the amsmath extension.

        When ``token.meta["numbered"]`` is not ``"*"``, the block is given a
        label and a preceding target node; otherwise no label or target is added.
        """
        # environment = token.meta["environment"]
        content = token.content

        if token.meta["numbered"] != "*":
            # TODO how to parse and reference labels within environment?
            # for now we create a unique random label, so the equation will be
            # numbered but there will be no reference clashes
            label = self._random_label()
            node = nodes.math_block(
                content,
                content,
                nowrap=True,
                number=None,
                classes=["amsmath"],
                label=label,
            )
            target = self.add_math_target(node)
            self.add_line_and_source_path(target, token)
            self.current_node.append(target)
        else:
            node = nodes.math_block(
                content, content, nowrap=True, number=None, classes=["amsmath"]
            )
        self.add_line_and_source_path(node, token)
        self.current_node.append(node)

    def add_math_target(self, node: nodes.math_block) -> nodes.target:
        """Register a labelled math block and return a target node for it.

        :param node: a math block with a ``label`` attribute; its ``number``
            and ``docname`` attributes are set in place
        :return: a target node with the id ``equation-<label>`` (normalised by
            ``nodes.make_id``), already noted on the document as an explicit target
        """
        # Code mainly copied from sphinx.directives.patches.MathDirective

        # register label to domain
        domain = cast(MathDomain, self.doc_env.get_domain("math"))
        domain.note_equation(self.doc_env.docname, node["label"], location=node)
        node["number"] = domain.get_equation_number_for(node["label"])
        node["docname"] = self.doc_env.docname

        # create target node
        node_id = nodes.make_id("equation-%s" % node["label"])
        target = nodes.target("", "", ids=[node_id])
        self.document.note_explicit_target(target)
        return target
# myst_parser/mdit_to_docutils/utils.py
import html
from typing import Iterable, Optional
from urllib.parse import quote, urlparse


def escape_url(raw: str) -> str:
    """Return ``raw`` made safe for embedding as a URL in HTML.

    HTML entities are decoded first, the result is percent-encoded
    (leaving the usual URL punctuation untouched), and the outcome is
    HTML-escaped again.
    """
    decoded = html.unescape(raw)
    percent_encoded = quote(decoded, safe="/#:()*?=%@+,&")
    return html.escape(percent_encoded)


def is_external_url(
    reference: str,
    known_url_schemes: Optional[Iterable[str]],
    match_fragment: bool = False,
) -> bool:
    """Return whether a reference should be recognised as an external URL.

    URLs are of the format: scheme://netloc/path;parameters?query#fragment

    A reference is external when it has a URL scheme (e.g. 'https') and,
    if ``known_url_schemes`` is supplied, that scheme is in the list.

    :param reference: the link destination to inspect
    :param known_url_schemes: e.g. ["http", "https", "mailto"]
        If None, any non-empty scheme matches
    :param match_fragment: If True and the reference has a fragment,
        then True is returned, irrespective of a scheme match

    """
    parsed = urlparse(reference)
    if known_url_schemes is None:
        # no allow-list: the mere presence of a scheme is enough
        if parsed.scheme:
            return True
    elif parsed.scheme in known_url_schemes:
        return True
    # no scheme match: optionally fall back to "has a #fragment"
    return match_fragment and parsed.fragment != ""