diff options
Diffstat (limited to 'myst_parser')
27 files changed, 4958 insertions, 0 deletions
def findall(node: Element) -> Callable[..., Iterable[Element]]:
    """Return the tree-traversal method of a docutils node.

    ``findall`` replaces ``traverse`` in docutils v0.18; this helper returns
    whichever of the two the given node provides, preferring ``findall``.

    Note that the two differ in what they return when called:
    ``findall`` yields an iterator, so callers should only iterate the result.

    :param node: the node whose traversal method is wanted
    :return: the bound ``findall`` method if present, else the bound ``traverse``
    :raises AttributeError: if the node has neither method
    """
    method = getattr(node, "findall", None)
    if method is None:
        # Only touch ``traverse`` when it is really needed: looking it up
        # eagerly (as a getattr default) fails on nodes that only have ``findall``.
        method = node.traverse
    return method
"", + "* - Name", + " - Type", + " - Description", + ] + + @staticmethod + def field_default(value): + default = " ".join(f"{value!r}".splitlines()) + return default + + @staticmethod + def field_type(field): + ftypes: Sequence[str] + if get_origin(field.type) is Union: + ftypes = get_args(field.type) + else: + ftypes = [field.type] + ctype = " | ".join( + str("None" if ftype == type(None) else ftype) # type: ignore # noqa: E721 + for ftype in ftypes + ) + ctype = " ".join(ctype.splitlines()) + ctype = ctype.replace("typing.", "") + ctype = ctype.replace("typing_extensions.", "") + for tname in ("str", "int", "float", "bool"): + ctype = ctype.replace(f"<class '{tname}'>", tname) + return ctype + + +class MystConfigDirective(_ConfigBase): + + option_spec = { + "sphinx": directives.flag, + "extensions": directives.flag, + "scope": lambda x: directives.choice(x, ["global", "local"]), + } + + def run(self): + """Run the directive.""" + config = MdParserConfig() + text = self.table_header() + count = 0 + for name, value, field in config.as_triple(): + + # filter by sphinx options + if "sphinx" in self.options and field.metadata.get("sphinx_exclude"): + continue + + if "extensions" in self.options: + if not field.metadata.get("extension"): + continue + else: + if field.metadata.get("extension"): + continue + + if self.options.get("scope") == "local": + if field.metadata.get("global_only"): + continue + + if self.options.get("scope") == "global": + name = f"myst_{name}" + + description = " ".join(field.metadata.get("help", "").splitlines()) + if field.metadata.get("extension"): + description = f"{field.metadata.get('extension')}: {description}" + default = self.field_default(value) + ctype = self.field_type(field) + text.extend( + [ + f"* - `{name}`", + f" - `{ctype}`", + f" - {description} (default: `{default}`)", + ] + ) + + count += 1 + + if not count: + return [] + + text.append("```````") + node = nodes.Element() + self.state.nested_parse(text, 0, node) + return 
def convert_opt(name, func):
    """Describe a directive option converter as a short human-readable string.

    :param name: the directive name (not used by the lookup itself)
    :param func: the option conversion function taken from an ``option_spec``
    :return: a label for the converter, or ``""`` if it is not a known one
    """
    # Ordered (resolver, label) pairs; each resolver is only evaluated when its
    # turn comes, and matching is by identity, first hit wins.
    known_converters = (
        (lambda: directives.flag, "flag"),
        (lambda: directives.unchanged, "text"),
        (lambda: directives.unchanged_required, "text"),
        (lambda: directives.class_option, "space-delimited list"),
        (lambda: directives.uri, "URI"),
        (lambda: directives.path, "path"),
        (lambda: int, "integer"),
        (lambda: directives.positive_int, "integer (positive)"),
        (lambda: directives.nonnegative_int, "integer (non-negative)"),
        (
            lambda: directives.positive_int_list,
            "space/comma-delimited list of integers (positive)",
        ),
        (lambda: directives.percentage, "percentage"),
        (lambda: directives.length_or_unitless, "length or unitless"),
        (
            lambda: directives.length_or_percentage_or_unitless,
            "length, percentage or unitless",
        ),
        (lambda: other.int_or_nothing, "integer"),
    )
    for resolve, label in known_converters:
        if func is resolve():
            return label
    return ""
def validate_field(inst: Any, field: dc.Field, value: Any) -> None:
    """Run the validator(s) attached to a single dataclass field.

    Validators are looked up under the ``"validator"`` key of ``field.metadata``;
    the entry may be one callable or a list of callables.
    Each is called as ``validator(inst, field, value)`` and is expected to
    raise an exception when the value is invalid.

    :param inst: the dataclass instance the field belongs to
    :param field: the dataclass field being validated
    :param value: the candidate value for the field
    """
    metadata = field.metadata
    if "validator" not in metadata:
        # nothing configured for this field
        return
    configured = metadata["validator"]
    # normalise the single-callable form to a one-element list
    validators = configured if isinstance(configured, list) else [configured]
    for check in validators:
        check(inst, field, value)
def instance_of(type: type[Any] | tuple[type[Any], ...]) -> ValidatorType:
    """Build a validator that enforces the type of a value.

    The check uses ``isinstance``, so a tuple of accepted types may be given.

    :param type: The type (or tuple of types) to check for.
    :return: a validator raising `TypeError` for values of any other type
    """

    def _validator(inst, field, value, suffix=""):
        """Raise a `TypeError` unless ``value`` is an instance of the wanted type."""
        if isinstance(value, type):
            return
        message = (
            f"'{field.name}{suffix}' must be of type {type!r} "
            f"(got {value!r} that is a {value.__class__!r})."
        )
        raise TypeError(message)

    return _validator
def deep_mapping(
    key_validator: ValidatorType,
    value_validator: ValidatorType,
    mapping_validator: ValidatorType | None = None,
) -> ValidatorType:
    """
    A validator that performs deep validation of a dictionary.

    :param key_validator: Validator to apply to dictionary keys
    :param value_validator: Validator to apply to dictionary values
    :param mapping_validator: Validator to apply to top-level mapping attribute (optional)
    :return: a validator that first checks the mapping itself (if requested),
        then every key and value, extending the error ``suffix`` with ``[key]``
    """

    def _validator(inst, field: dc.Field, value, suffix=""):
        if mapping_validator is not None:
            # forward the suffix (as deep_iterable does for its container check),
            # so that errors for nested mappings still name the full path
            mapping_validator(inst, field, value, suffix=suffix)

        for key in value:
            item_suffix = f"{suffix}[{key!r}]"
            key_validator(inst, field, key, suffix=item_suffix)
            value_validator(inst, field, value[key], suffix=item_suffix)

    return _validator
the Markdown Parser. + + Note in the sphinx configuration these option names are prepended with ``myst_`` + """ + + # TODO replace commonmark_only, gfm_only with a single option + + commonmark_only: bool = dc.field( + default=False, + metadata={ + "validator": instance_of(bool), + "help": "Use strict CommonMark parser", + }, + ) + gfm_only: bool = dc.field( + default=False, + metadata={ + "validator": instance_of(bool), + "help": "Use strict Github Flavoured Markdown parser", + }, + ) + + enable_extensions: Sequence[str] = dc.field( + default_factory=list, + metadata={"validator": check_extensions, "help": "Enable syntax extensions"}, + ) + + disable_syntax: Iterable[str] = dc.field( + default_factory=list, + metadata={ + "validator": deep_iterable(instance_of(str), instance_of((list, tuple))), + "help": "Disable Commonmark syntax elements", + }, + ) + + all_links_external: bool = dc.field( + default=False, + metadata={ + "validator": instance_of(bool), + "help": "Parse all links as simple hyperlinks", + }, + ) + + # see https://en.wikipedia.org/wiki/List_of_URI_schemes + url_schemes: Optional[Iterable[str]] = dc.field( + default=cast(Optional[Iterable[str]], ("http", "https", "mailto", "ftp")), + metadata={ + "validator": optional( + deep_iterable(instance_of(str), instance_of((list, tuple))) + ), + "help": "URL scheme prefixes identified as external links", + }, + ) + + ref_domains: Optional[Iterable[str]] = dc.field( + default=None, + metadata={ + "validator": optional( + deep_iterable(instance_of(str), instance_of((list, tuple))) + ), + "help": "Sphinx domain names to search in for link references", + }, + ) + + highlight_code_blocks: bool = dc.field( + default=True, + metadata={ + "validator": instance_of(bool), + "help": "Syntax highlight code blocks with pygments", + "docutils_only": True, + }, + ) + + number_code_blocks: Sequence[str] = dc.field( + default_factory=list, + metadata={ + "validator": deep_iterable(instance_of(str), instance_of((list, tuple))), 
+ "help": "Add line numbers to code blocks with these languages", + }, + ) + + title_to_header: bool = dc.field( + default=False, + metadata={ + "validator": instance_of(bool), + "help": "Convert a `title` field in the top-matter to a H1 header", + }, + ) + + heading_anchors: Optional[int] = dc.field( + default=None, + metadata={ + "validator": optional(in_([1, 2, 3, 4, 5, 6, 7])), + "help": "Heading level depth to assign HTML anchors", + }, + ) + + heading_slug_func: Optional[Callable[[str], str]] = dc.field( + default=None, + metadata={ + "validator": optional(is_callable), + "help": "Function for creating heading anchors", + "global_only": True, + }, + ) + + html_meta: Dict[str, str] = dc.field( + default_factory=dict, + repr=False, + metadata={ + "validator": deep_mapping( + instance_of(str), instance_of(str), instance_of(dict) + ), + "merge_topmatter": True, + "help": "HTML meta tags", + }, + ) + + footnote_transition: bool = dc.field( + default=True, + metadata={ + "validator": instance_of(bool), + "help": "Place a transition before any footnotes", + }, + ) + + words_per_minute: int = dc.field( + default=200, + metadata={ + "validator": instance_of(int), + "help": "For reading speed calculations", + }, + ) + + # Extension specific + + substitutions: Dict[str, Union[str, int, float]] = dc.field( + default_factory=dict, + repr=False, + metadata={ + "validator": deep_mapping( + instance_of(str), instance_of((str, int, float)), instance_of(dict) + ), + "merge_topmatter": True, + "help": "Substitutions mapping", + "extension": "substitutions", + }, + ) + + sub_delimiters: Tuple[str, str] = dc.field( + default=("{", "}"), + metadata={ + "validator": check_sub_delimiters, + "help": "Substitution delimiters", + "extension": "substitutions", + }, + ) + + linkify_fuzzy_links: bool = dc.field( + default=True, + metadata={ + "validator": instance_of(bool), + "help": "Recognise URLs without schema prefixes", + "extension": "linkify", + }, + ) + + dmath_allow_labels: bool 
= dc.field( + default=True, + metadata={ + "validator": instance_of(bool), + "help": "Parse `$$...$$ (label)`", + "extension": "dollarmath", + }, + ) + dmath_allow_space: bool = dc.field( + default=True, + metadata={ + "validator": instance_of(bool), + "help": "Allow initial/final spaces in `$ ... $`", + "extension": "dollarmath", + }, + ) + dmath_allow_digits: bool = dc.field( + default=True, + metadata={ + "validator": instance_of(bool), + "help": "Allow initial/final digits `1$ ...$2`", + "extension": "dollarmath", + }, + ) + dmath_double_inline: bool = dc.field( + default=False, + metadata={ + "validator": instance_of(bool), + "help": "Parse inline `$$ ... $$`", + "extension": "dollarmath", + }, + ) + + update_mathjax: bool = dc.field( + default=True, + metadata={ + "validator": instance_of(bool), + "help": "Update sphinx.ext.mathjax configuration to ignore `$` delimiters", + "extension": "dollarmath", + "global_only": True, + }, + ) + + mathjax_classes: str = dc.field( + default="tex2jax_process|mathjax_process|math|output_area", + metadata={ + "validator": instance_of(str), + "help": "MathJax classes to add to math HTML", + "extension": "dollarmath", + "global_only": True, + }, + ) + + def __post_init__(self): + validate_fields(self) + + def copy(self, **kwargs: Any) -> "MdParserConfig": + """Return a new object replacing specified fields with new values. + + Note: initiating the copy will also validate the new fields. 
+ """ + return dc.replace(self, **kwargs) + + @classmethod + def get_fields(cls) -> Tuple[dc.Field, ...]: + """Return all attribute fields in this class.""" + return dc.fields(cls) + + def as_dict(self, dict_factory=dict) -> dict: + """Return a dictionary of field name -> value.""" + return dc.asdict(self, dict_factory=dict_factory) + + def as_triple(self) -> Iterable[Tuple[str, Any, dc.Field]]: + """Yield triples of (name, value, field).""" + fields = {f.name: f for f in dc.fields(self.__class__)} + for name, value in dc.asdict(self).items(): + yield name, value, fields[name] + + +def merge_file_level( + config: MdParserConfig, + topmatter: Dict[str, Any], + warning: Callable[[str, str], None], +) -> MdParserConfig: + """Merge the file-level topmatter with the global config. + + :param config: Global config. + :param topmatter: Topmatter from the file. + :param warning: Function to call with a warning (type, message). + :returns: A new config object + """ + # get updates + updates: Dict[str, Any] = {} + myst = topmatter.get("myst", {}) + if not isinstance(myst, dict): + warning("topmatter", f"'myst' key not a dict: {type(myst)}") + else: + updates = myst + + # allow html_meta and substitutions at top-level for back-compatibility + if "html_meta" in topmatter: + warning( + "topmatter", + "top-level 'html_meta' key is deprecated, " + "place under 'myst' key instead", + ) + updates["html_meta"] = topmatter["html_meta"] + if "substitutions" in topmatter: + warning( + "topmatter", + "top-level 'substitutions' key is deprecated, " + "place under 'myst' key instead", + ) + updates["substitutions"] = topmatter["substitutions"] + + new = config.copy() + + # validate each update + fields = {name: (value, field) for name, value, field in config.as_triple()} + for name, value in updates.items(): + + if name not in fields: + warning("topmatter", f"Unknown field: {name}") + continue + + old_value, field = fields[name] + + try: + validate_field(new, field, value) + except 
def read_topmatter(text: Union[str, Iterator[str]]) -> Optional[Dict[str, Any]]:
    """Read the (optional) YAML topmatter from a source string.

    This is identified by the first line starting with `---`,
    then read up to a terminating line of `---`, or `...`.

    :param text: The source string, or an iterator over its lines, to read from
    :return: The topmatter, or None if the source does not start with `---`
    :raises TopmatterReadError: if the YAML is malformed,
        or does not describe a dictionary
    """
    if isinstance(text, str):
        if not text.startswith("---"):  # skip creating the line list in memory
            return None
        text = iter(text.splitlines())
    try:
        if not next(text).startswith("---"):
            return None
    except StopIteration:
        return None
    top_matter = []
    for line in text:
        if line.startswith(("---", "...")):
            break
        top_matter.append(line.rstrip() + "\n")

    # imported lazily, so sources without topmatter never pay for it
    import yaml

    try:
        metadata = yaml.safe_load("".join(top_matter))
    except (yaml.parser.ParserError, yaml.scanner.ScannerError) as err:
        raise TopmatterReadError("Malformed YAML") from err
    # Report non-dict content (including empty topmatter, which loads as None)
    # with the documented error type, rather than a bare AssertionError.
    if not isinstance(metadata, dict):
        raise TopmatterReadError(f"YAML is not a dict: {type(metadata)}")
    return metadata
include:: path/to/file.md + :parser: myst_parser.docutils_ +""" +from myst_parser.parsers.docutils_ import Parser # noqa: F401 diff --git a/myst_parser/mdit_to_docutils/__init__.py b/myst_parser/mdit_to_docutils/__init__.py new file mode 100644 index 0000000..0b9307f --- /dev/null +++ b/myst_parser/mdit_to_docutils/__init__.py @@ -0,0 +1 @@ +"""Conversion of Markdown-it tokens to docutils AST.""" diff --git a/myst_parser/mdit_to_docutils/base.py b/myst_parser/mdit_to_docutils/base.py new file mode 100644 index 0000000..cedd6c3 --- /dev/null +++ b/myst_parser/mdit_to_docutils/base.py @@ -0,0 +1,1483 @@ +"""Convert Markdown-it tokens to docutils nodes.""" +from __future__ import annotations + +import inspect +import json +import os +import re +from collections import OrderedDict +from contextlib import contextmanager +from datetime import date, datetime +from types import ModuleType +from typing import TYPE_CHECKING, Any, Iterator, MutableMapping, Sequence, cast +from urllib.parse import urlparse + +import jinja2 +import yaml +from docutils import nodes +from docutils.frontend import OptionParser +from docutils.languages import get_language +from docutils.parsers.rst import Directive, DirectiveError +from docutils.parsers.rst import Parser as RSTParser +from docutils.parsers.rst import directives, roles +from docutils.parsers.rst.directives.misc import Include +from docutils.parsers.rst.languages import get_language as get_language_rst +from docutils.statemachine import StringList +from docutils.transforms.components import Filter +from docutils.utils import Reporter, new_document +from docutils.utils.code_analyzer import Lexer, LexerError, NumberLines +from markdown_it import MarkdownIt +from markdown_it.common.utils import escapeHtml +from markdown_it.renderer import RendererProtocol +from markdown_it.token import Token +from markdown_it.tree import SyntaxTreeNode + +from myst_parser._compat import findall +from myst_parser.config.main import MdParserConfig +from 
def token_line(token: SyntaxTreeNode, default: int | None = None) -> int:
    """Return the first line recorded in a token's ``map``.

    :param token: the token to inspect
    :param default: value to return when the token has no (or an empty) ``map``
    :return: ``token.map[0]``, or ``default`` when the map is unavailable
    :raises ValueError: if the map is unavailable and no default was given
    """
    line_map = getattr(token, "map", None)
    if line_map:
        return line_map[0]
    if default is None:
        raise ValueError(f"token map not set: {token}")
    return default
def setup_render(
    self, options: dict[str, Any], env: MutableMapping[str, Any]
) -> None:
    """Setup the renderer with per render variables.

    :param options: the parser options; must contain ``myst_config``,
        and may contain ``document`` and ``current_node``
    :param env: the markdown-it environment for this render
    """
    self.md_env = env
    self.md_options = options
    self.md_config: MdParserConfig = options["myst_config"]
    # Only create a fresh document when none was supplied: make_document()
    # builds an OptionParser on every call, so it should not run needlessly.
    self.document: nodes.document = (
        options["document"] if "document" in options else make_document()
    )
    self.current_node: nodes.Element = options.get("current_node", self.document)
    self.reporter: Reporter = self.document.reporter
    # note there are actually two possible language modules:
    # one from docutils.languages, and one from docutils.parsers.rst.languages
    self.language_module_rst: ModuleType = get_language_rst(
        self.document.settings.language_code
    )
    # a mapping of heading levels to its currently associated node
    self._level_to_elem: dict[int, nodes.document | nodes.section] = {
        0: self.document
    }
| None: + """Generate a warning, logging if it is necessary. + + Note this is overridden in the ``SphinxRenderer``, + to handle suppressed warning types. + """ + return create_warning( + self.document, + message, + line=line, + append_to=append_to, + wtype=wtype, + subtype=subtype, + ) + + def _render_tokens(self, tokens: list[Token]) -> None: + """Render the tokens.""" + # propagate line number down to inline elements + for token in tokens: + if not token.map: + continue + # For docutils we want 1 based line numbers (not 0) + token.map = [token.map[0] + 1, token.map[1] + 1] + for token_child in token.children or []: + token_child.map = token.map + + # nest tokens + node_tree = SyntaxTreeNode(tokens) + + # move footnote definitions to env + self.md_env.setdefault("foot_refs", {}) + for node in node_tree.walk(include_self=True): + new_children = [] + for child in node.children: + if child.type == "footnote_reference": + label = child.meta["label"] + self.md_env["foot_refs"].setdefault(label, []).append(child) + else: + new_children.append(child) + + node.children = new_children + + # render + for child in node_tree.children: + # skip hidden? + if f"render_{child.type}" in self.rules: + self.rules[f"render_{child.type}"](child) + else: + self.create_warning( + f"No render method for: {child.type}", + line=token_line(child, default=0), + subtype="render", + append_to=self.current_node, + ) + + def render( + self, tokens: Sequence[Token], options, md_env: MutableMapping[str, Any] + ) -> nodes.document: + """Run the render on a token stream. 
+ + :param tokens: list on block tokens to render + :param options: params of parser instance + :param md_env: the markdown-it environment sandbox associated with the tokens, + containing additional metadata like reference info + """ + self.setup_render(options, md_env) + self._render_initialise() + self._render_tokens(list(tokens)) + self._render_finalise() + return self.document + + def _render_initialise(self) -> None: + """Initialise the render of the document.""" + self.current_node.extend( + html_meta_to_nodes( + self.md_config.html_meta, + document=self.document, + line=0, + reporter=self.reporter, + ) + ) + + def _render_finalise(self) -> None: + """Finalise the render of the document.""" + + # log warnings for duplicate reference definitions + # "duplicate_refs": [{"href": "ijk", "label": "B", "map": [4, 5], "title": ""}], + for dup_ref in self.md_env.get("duplicate_refs", []): + self.create_warning( + f"Duplicate reference definition: {dup_ref['label']}", + line=dup_ref["map"][0] + 1, + subtype="ref", + append_to=self.document, + ) + + # we don't use the foot_references stored in the env + # since references within directives/roles will have been added after + # those from the initial markdown parse + # instead we gather them from a walk of the created document + foot_refs = OrderedDict() + for refnode in findall(self.document)(nodes.footnote_reference): + if refnode["refname"] not in foot_refs: + foot_refs[refnode["refname"]] = True + + if foot_refs and self.md_config.footnote_transition: + self.current_node.append(nodes.transition(classes=["footnotes"])) + for footref in foot_refs: + foot_ref_tokens = self.md_env["foot_refs"].get(footref, []) + if len(foot_ref_tokens) > 1: + self.create_warning( + f"Multiple footnote definitions found for label: '{footref}'", + subtype="footnote", + append_to=self.current_node, + ) + + if len(foot_ref_tokens) < 1: + self.create_warning( + f"No footnote definitions found for label: '{footref}'", + subtype="footnote", + 
append_to=self.current_node, + ) + else: + self.render_footnote_reference(foot_ref_tokens[0]) + + # Add the wordcount, generated by the ``mdit_py_plugins.wordcount_plugin``. + wordcount_metadata = self.md_env.get("wordcount", {}) + if wordcount_metadata: + + # save the wordcount to the sphinx BuildEnvironment metadata + if self.sphinx_env is not None: + meta = self.sphinx_env.metadata.setdefault(self.sphinx_env.docname, {}) + meta["wordcount"] = wordcount_metadata + + # now add the wordcount as substitution definitions, + # so we can reference them in the document + for key in ("words", "minutes"): + value = wordcount_metadata.get(key, None) + if value is None: + continue + substitution_node = nodes.substitution_definition( + str(value), nodes.Text(str(value)) + ) + substitution_node.source = self.document["source"] + substitution_node["names"].append(f"wordcount-{key}") + self.document.note_substitution_def( + substitution_node, f"wordcount-{key}" + ) + + def nested_render_text( + self, text: str, lineno: int, inline: bool = False, allow_headings: bool = True + ) -> None: + """Render unparsed text (appending to the current node). + + :param text: the text to render + :param lineno: the starting line number of the text, within the full source + :param inline: whether the text is inline or block + :param allow_headings: whether to allow headings in the text + """ + if inline: + tokens = self.md.parseInline(text, self.md_env) + else: + tokens = self.md.parse(text + "\n", self.md_env) + + # remove front matter, if present, e.g. 
@contextmanager
def current_node_context(
    self, node: nodes.Element, append: bool = False
) -> Iterator:
    """Context manager for temporarily setting the current node.

    :param node: the node used as ``self.current_node`` inside the ``with`` body
    :param append: if True, ``node`` is first appended to the node that is
        current on entry

    The node that was current on entry is restored on exit,
    also when the body of the ``with`` statement raises.
    """
    if append:
        self.current_node.append(node)
    previous_node = self.current_node
    self.current_node = node
    try:
        yield
    finally:
        # without the ``finally`` an exception raised in the body would leave
        # the renderer pointing at ``node`` for all subsequent rendering
        self.current_node = previous_node
def update_section_level_state(self, section: nodes.section, level: int) -> None:
    """Update the section level state, with the new current section and level.

    The parent of ``section`` is the element recorded in
    ``self._level_to_elem`` under the greatest level that is lower than
    ``level``. The section is appended to that parent, recorded under
    ``level``, and every entry recorded for a deeper level is dropped.

    :param section: the new section node
    :param level: the heading level of the new section
    :raises ValueError: if no recorded level is lower than ``level``
    """
    recorded = self._level_to_elem

    # find the closest parent section
    parent_level = max(known for known in recorded if known < level)
    parent = recorded[parent_level]

    # warn when one or more levels are skipped between the parent and the new
    # section, since this will not be propagated in the docutils AST
    if level > parent_level and level != parent_level + 1:
        if parent_level == 0:
            message = f"Document headings start at H{level}, not H1"
        else:
            message = (
                f"Non-consecutive header level increase; H{parent_level} to H{level}"
            )
        self.create_warning(
            message,
            line=section.line,
            subtype="header",
            append_to=self.current_node,
        )

    # attach the new section and record it as the current one for its level
    parent.append(section)
    recorded[level] = section

    # forget all sections that are deeper than the new one
    self._level_to_elem = {
        depth: element for depth, element in recorded.items() if depth <= level
    }
def render_bullet_list(self, token: SyntaxTreeNode) -> None:
    """Render a bullet list token as a docutils ``bullet_list`` node.

    A non-empty ``token.markup`` is stored as the ``bullet`` attribute,
    and a non-empty ``class`` token attribute is split on whitespace
    to become the ``classes`` of the node.
    The children of the token are rendered into the new node.
    """
    bullets = nodes.bullet_list()
    marker = token.markup
    if marker:
        bullets["bullet"] = marker
    css_classes = token.attrs.get("class")
    if css_classes:
        # this is used e.g. by tasklist
        bullets["classes"] = str(css_classes).split()
    self.add_line_and_source_path(bullets, token)
    with self.current_node_context(bullets, append=True):
        self.render_children(token)
by tasklist + item_node["classes"] = str(token.attrs["class"]).split() + self.add_line_and_source_path(item_node, token) + with self.current_node_context(item_node, append=True): + self.render_children(token) + + def render_em(self, token: SyntaxTreeNode) -> None: + node = nodes.emphasis() + self.add_line_and_source_path(node, token) + with self.current_node_context(node, append=True): + self.render_children(token) + + def render_softbreak(self, token: SyntaxTreeNode) -> None: + self.current_node.append(nodes.Text("\n")) + + def render_hardbreak(self, token: SyntaxTreeNode) -> None: + self.current_node.append(nodes.raw("", "<br />\n", format="html")) + self.current_node.append(nodes.raw("", "\\\\\n", format="latex")) + + def render_strong(self, token: SyntaxTreeNode) -> None: + node = nodes.strong() + self.add_line_and_source_path(node, token) + with self.current_node_context(node, append=True): + self.render_children(token) + + def render_blockquote(self, token: SyntaxTreeNode) -> None: + quote = nodes.block_quote() + self.add_line_and_source_path(quote, token) + with self.current_node_context(quote, append=True): + self.render_children(token) + + def render_hr(self, token: SyntaxTreeNode) -> None: + node = nodes.transition() + self.add_line_and_source_path(node, token) + self.current_node.append(node) + + def render_code_inline(self, token: SyntaxTreeNode) -> None: + node = nodes.literal(token.content, token.content) + self.add_line_and_source_path(node, token) + self.current_node.append(node) + + def create_highlighted_code_block( + self, + text: str, + lexer_name: str | None, + number_lines: bool = False, + lineno_start: int = 1, + source: str | None = None, + line: int | None = None, + node_cls: type[nodes.Element] = nodes.literal_block, + ) -> nodes.Element: + """Create a literal block with syntax highlighting. + + This mimics the behaviour of the `code-block` directive. 
def render_fence(self, token: SyntaxTreeNode) -> None:
    """Render a fenced code block.

    The first word of the info string is taken as the language.
    Unless the parser is restricted to CommonMark or GFM, a language of
    ``{eval-rst}`` forwards the token to ``render_restructuredtext``,
    and any other language wrapped in braces forwards it to ``render_directive``.

    Otherwise a highlighted code block is appended to the current node.
    If no language is given and a sphinx environment is available,
    the current highlight language of that environment is used.
    Lines are numbered when the language is listed in
    ``md_config.number_code_blocks``.
    """
    # an info string that exists but holds only spaces gives an empty language
    raw_info = token.info
    info_words = raw_info.strip().split() if raw_info else []
    language = info_words[0] if info_words else ""

    config = self.md_config
    if not (config.commonmark_only or config.gfm_only):
        if language == "{eval-rst}":
            return self.render_restructuredtext(token)
        if language.startswith("{") and language.endswith("}"):
            return self.render_directive(token)

    sphinx_env = self.sphinx_env
    if sphinx_env is not None and not language:
        # use the current highlight setting, via the ``highlight`` directive,
        # or ``highlight_language`` configuration.
        language = sphinx_env.temp_data.get(
            "highlight_language", sphinx_env.config.highlight_language
        )

    code_node = self.create_highlighted_code_block(
        token.content,
        language,
        number_lines=language in config.number_code_blocks,
        source=self.document["source"],
        line=token_line(token, 0) or None,
    )
    self.current_node.append(code_node)
def render_link(self, token: SyntaxTreeNode) -> None:
    """Parse `<http://link.com>` or `[text](link "title")` syntax to docutils AST:

    - If `<>` autolink, forward to `render_autolink`
    - If the parser is restricted to CommonMark or GFM,
      or `myst_all_links_external` is True, forward to `render_external_url`
    - If link is an external URL, forward to `render_external_url`
      - External URLs start with a scheme (e.g. `http:`) in `myst_url_schemes`,
        or any scheme if `myst_url_schemes` is None.
    - Otherwise, forward to `render_internal_link`
    """
    if token.info == "auto":  # handles both autolink and linkify
        return self.render_autolink(token)

    config = self.md_config
    if config.commonmark_only or config.gfm_only or config.all_links_external:
        return self.render_external_url(token)

    # Check for external URL
    href = cast(str, token.attrGet("href") or "")
    scheme = urlparse(href).scheme
    allowed_schemes = config.url_schemes
    if allowed_schemes is None:
        # no restriction: any scheme marks the link as external
        is_external = bool(scheme)
    else:
        is_external = scheme in allowed_schemes

    if is_external:
        return self.render_external_url(token)
    return self.render_internal_link(token)
def render_image(self, token: SyntaxTreeNode) -> None:
    """Render an image token as a docutils ``image`` node on the current node.

    - ``uri`` is the ``src`` attribute; if ``relative-images`` is set in the
      markdown-it environment and ``is_external_url`` does not report the
      destination as external, the destination is joined onto that value
      and normalised.
    - ``alt`` is the plain text of the token children.
    - ``title`` is set when the token has a non-empty title.
    - The ``class``, ``width``, ``height``, ``align`` and ``id`` attributes
      are applied when present; an invalid width, height or align value
      creates a warning and is not applied.
    """

    def warn(message: str) -> None:
        # all image warnings share the same location and subtype
        self.create_warning(
            message,
            line=token_line(token, default=0),
            subtype="image",
            append_to=self.current_node,
        )

    image = nodes.image()
    self.add_line_and_source_path(image, token)
    destination = cast(str, token.attrGet("src") or "")

    base_dir = self.md_env.get("relative-images", None)
    if base_dir is not None and not is_external_url(destination, None, True):
        # make the path relative to an "including" document
        # this is set when using the `relative-images` option of the MyST `include` directive
        relative_path = os.path.normpath(destination)
        destination = os.path.normpath(os.path.join(base_dir, relative_path))

    image["uri"] = destination
    image["alt"] = self.renderInlineAsText(token.children or [])

    title = token.attrGet("title")
    if title:
        image["title"] = title

    # apply other attributes that can be set on the image
    attrs = token.attrs
    if "class" in attrs:
        image["classes"].extend(str(attrs["class"]).split())

    if "width" in attrs:
        try:
            width = directives.length_or_percentage_or_unitless(str(attrs["width"]))
        except ValueError:
            warn(f"Invalid width value for image: {attrs['width']!r}")
        else:
            image["width"] = width

    if "height" in attrs:
        try:
            height = directives.length_or_unitless(str(attrs["height"]))
        except ValueError:
            warn(f"Invalid height value for image: {attrs['height']!r}")
        else:
            image["height"] = height

    if "align" in attrs:
        if attrs["align"] in ("left", "center", "right"):
            image["align"] = attrs["align"]
        else:
            warn(f"Invalid align value for image: {attrs['align']!r}")

    if "id" in attrs:
        target_name = nodes.fully_normalize_name(str(attrs["id"]))
        image["names"].append(target_name)
        self.document.note_explicit_target(image, image)

    self.current_node.append(image)
def dict_to_fm_field_list(
    self, data: dict[str, Any], language_code: str, line: int = 0
) -> nodes.field_list:
    """Render each key/val pair as a docutils ``field_node``.

    Bibliographic keys below will be parsed as Markdown,
    all others will be left as literal text.

    The field list should be at the start of the document,
    and will then be converted to a `docinfo` node during the
    `docutils.transforms.frontmatter.DocInfo` transform (priority 340),
    and bibliographic keys (or their translation) will be converted to nodes::

        {'author': docutils.nodes.author,
         'authors': docutils.nodes.authors,
         'organization': docutils.nodes.organization,
         'address': docutils.nodes.address,
         'contact': docutils.nodes.contact,
         'version': docutils.nodes.version,
         'revision': docutils.nodes.revision,
         'status': docutils.nodes.status,
         'date': docutils.nodes.date,
         'copyright': docutils.nodes.copyright,
         'dedication': docutils.nodes.topic,
         'abstract': docutils.nodes.topic}

    Also, the 'dedication' and 'abstract' will be placed outside the `docinfo`,
    and so will always be shown in the document.

    If using sphinx, this `docinfo` node will later be extracted from the AST,
    by the `DoctreeReadEvent` transform (priority 880),
    calling `MetadataCollector.process_doc`.
    In this case keys and values will be converted to strings and stored in
    `app.env.metadata[app.env.docname]`

    See
    https://www.sphinx-doc.org/en/master/usage/restructuredtext/field-lists.html
    for docinfo fields used by sphinx.

    :param data: the key/value pairs to render; values that are not a
        str, int, float, date or datetime are serialised with ``json.dumps``
    :param language_code: the language used to look up the bibliographic fields
    :param line: the line number recorded on the created nodes
    :return: the field list, with one field per key
    """
    source = self.document["source"]
    field_list = nodes.field_list()
    field_list.source, field_list.line = source, line

    bibliofields = get_language(language_code).bibliographic_fields

    for key, value in data.items():
        if not isinstance(value, (str, int, float, date, datetime)):
            value = json.dumps(value)
        value = str(value)
        body = nodes.paragraph()
        body.source, body.line = source, line
        if key in bibliofields:
            with self.current_node_context(body):
                self.nested_render_text(value, line, inline=True)
        else:
            body += nodes.literal(value, value)

        field_node = nodes.field()
        # like the field list and the body, the field records the document
        # it comes from (previously the field text was stored as its source)
        field_node.source, field_node.line = source, line
        field_node += nodes.field_name(key, "", nodes.Text(key))
        field_node += nodes.field_body(value, body)
        field_list += field_node

    return field_list
+ for child in token.children or []: + entry = nodes.entry() + para = nodes.paragraph( + child.children[0].content if child.children else "" + ) + style = child.attrGet("style") # i.e. the alignment when using e.g. :-- + if style and style in ( + "text-align:left", + "text-align:right", + "text-align:center", + ): + entry["classes"].append(f"text-{cast(str, style).split(':')[1]}") + with self.current_node_context(entry, append=True): + with self.current_node_context(para, append=True): + self.render_children(child) + + def render_s(self, token: SyntaxTreeNode) -> None: + """Render a strikethrough token.""" + # TODO strikethrough not currently directly supported in docutils + self.create_warning( + "Strikethrough is currently only supported in HTML output", + line=token_line(token, 0), + subtype="strikethrough", + append_to=self.current_node, + ) + self.current_node.append(nodes.raw("", "<s>", format="html")) + self.render_children(token) + self.current_node.append(nodes.raw("", "</s>", format="html")) + + def render_math_inline(self, token: SyntaxTreeNode) -> None: + content = token.content + node = nodes.math(content, content) + self.add_line_and_source_path(node, token) + self.current_node.append(node) + + def render_math_inline_double(self, token: SyntaxTreeNode) -> None: + content = token.content + node = nodes.math_block(content, content, nowrap=False, number=None) + self.add_line_and_source_path(node, token) + self.current_node.append(node) + + def render_math_single(self, token: SyntaxTreeNode) -> None: + content = token.content + node = nodes.math(content, content) + self.add_line_and_source_path(node, token) + self.current_node.append(node) + + def render_math_block(self, token: SyntaxTreeNode) -> None: + content = token.content + node = nodes.math_block(content, content, nowrap=False, number=None) + self.add_line_and_source_path(node, token) + self.current_node.append(node) + + def render_amsmath(self, token: SyntaxTreeNode) -> None: + # note docutils 
def render_footnote_ref(self, token: SyntaxTreeNode) -> None:
    """Render a footnote reference, e.g. `[^a]`.

    A label that is not made up of digits only is added as an auto-numbered
    reference, i.e. `[^a]` is read as rST `[#a]_`.
    A digits-only label is kept as the text of the reference.
    In both cases the label is the ``refname`` of the reference.
    """
    label = token.meta["label"]

    reference = nodes.footnote_reference(f"[^{label}]")
    self.add_line_and_source_path(reference, token)
    if label.isdigit():
        reference += nodes.Text(label)
    else:
        reference["auto"] = 1
        self.document.note_autofootnote_ref(reference)

    reference["refname"] = label
    self.document.note_footnote_ref(reference)

    self.current_node.append(reference)
def render_dl(self, token: SyntaxTreeNode) -> None:
    """Render a definition list.

    Every ``dt`` child starts a new ``definition_list_item`` holding the term,
    and every ``dd`` child adds a ``definition`` to the most recent item.
    A ``dd`` with no preceding ``dt``, or a child of any other type,
    is reported as an error on the definition list.
    """
    node = nodes.definition_list(classes=["simple", "myst"])
    self.add_line_and_source_path(node, token)
    with self.current_node_context(node, append=True):
        item = None
        for child in token.children or []:
            if child.type == "dt":
                item = nodes.definition_list_item()
                self.add_line_and_source_path(item, child)
                with self.current_node_context(item, append=True):
                    term = nodes.term(
                        child.children[0].content if child.children else ""
                    )
                    self.add_line_and_source_path(term, child)
                    with self.current_node_context(term, append=True):
                        self.render_children(child)
            elif child.type == "dd":
                if item is None:
                    error = self.reporter.error(
                        (
                            "Found a definition in a definition list, "
                            "with no preceding term"
                        ),
                        line=token_line(child),
                    )
                    self.current_node += [error]
                    # there is no item to hold the definition: rendering it
                    # would make ``None`` the current node and fail on append
                    continue
                with self.current_node_context(item):
                    definition = nodes.definition()
                    self.add_line_and_source_path(definition, child)
                    with self.current_node_context(definition, append=True):
                        self.render_children(child)
            else:
                error_msg = self.reporter.error(
                    (
                        "Expected a term/definition as a child of a definition list"
                        f", but found a: {child.type}"
                    ),
                    line=token_line(child),
                )
                self.current_node += [error_msg]


def render_field_list(self, token: SyntaxTreeNode) -> None:
    """Render a field list.

    The children are consumed in order: each ``fieldlist_name`` child creates a
    field with a name and a body, and a directly following ``fieldlist_body``
    child is rendered into that body.
    Any other child where a name is expected is reported as an error,
    and the remaining children are not rendered.
    """
    field_list = nodes.field_list(classes=["myst"])
    self.add_line_and_source_path(field_list, token)
    with self.current_node_context(field_list, append=True):
        children = (token.children or [])[:]
        while children:
            child = children.pop(0)
            if child.type != "fieldlist_name":
                error_msg = self.reporter.error(
                    (
                        "Expected a fieldlist_name as a child of a field_list"
                        f", but found a: {child.type}"
                    ),
                    line=token_line(child),
                )
                self.current_node += [error_msg]
                break
            field = nodes.field()
            self.add_line_and_source_path(field, child)
            field_list += field
            field_name = nodes.field_name()
            self.add_line_and_source_path(field_name, child)
            field += field_name
            with self.current_node_context(field_name):
                self.render_children(child)
            field_body = nodes.field_body()
            # the body gets the position of the name token
            # (previously this was copied onto the field name a second time,
            # leaving the body without line and source)
            self.add_line_and_source_path(field_body, child)
            field += field_body
            if children and children[0].type == "fieldlist_body":
                child = children.pop(0)
                with self.current_node_context(field_body):
                    self.render_children(child)
+ :param position: The line number of the first line + + """ + # TODO directive name white/black lists + + self.document.current_line = position + + # get directive class + output: tuple[Directive, list] = directives.directive( + name, self.language_module_rst, self.document + ) + directive_class, messages = output + if not directive_class: + error = self.reporter.error( + f'Unknown directive type "{name}".\n', + # nodes.literal_block(content, content), + line=position, + ) + return [error] + messages + + if issubclass(directive_class, Include): + # this is a Markdown only option, + # to allow for altering relative image reference links + directive_class.option_spec["relative-images"] = directives.flag + directive_class.option_spec["relative-docs"] = directives.path + + try: + arguments, options, body_lines, content_offset = parse_directive_text( + directive_class, first_line, content + ) + except DirectiveParsingError as error: + error = self.reporter.error( + f"Directive '{name}': {error}", + nodes.literal_block(content, content), + line=position, + ) + return [error] + + # initialise directive + if issubclass(directive_class, Include): + directive_instance = MockIncludeDirective( + self, + name=name, + klass=directive_class, + arguments=arguments, + options=options, + body=body_lines, + lineno=position, + ) + else: + state_machine = MockStateMachine(self, position) + state = MockState(self, state_machine, position) + directive_instance = directive_class( + name=name, + # the list of positional arguments + arguments=arguments, + # a dictionary mapping option names to values + options=options, + # the directive content line by line + content=StringList(body_lines, self.document["source"]), + # the absolute line number of the first line of the directive + lineno=position, + # the line offset of the first line of the content + content_offset=content_offset, + # a string containing the entire directive + block_text="\n".join(body_lines), + state=state, + 
def render_substitution(self, token: SyntaxTreeNode, inline: bool) -> None:
    """Render a ``{{key}}`` substitution.

    1. A variable context is built from the configured substitutions,
       plus the sphinx ``env`` when one is available.
    2. The token content is wrapped in ``{{ }}`` and rendered with Jinja2;
       undefined variables are an error.
    3. The variable names used by the expression are checked against the
       names currently being expanded, to detect circular references.
    4. The rendered text is parsed: as inline-only Markdown when ``inline``
       is true and the text does not start a directive, otherwise with all
       block rules (headings disabled).

    :param token: the substitution token
    :param inline: whether the substitution appeared in inline context
    """
    line = token_line(token)

    # copy, so that adding ``env`` never mutates the configuration mapping
    context: dict[str, Any] = {**self.md_config.substitutions}
    if self.sphinx_env is not None:
        context["env"] = self.sphinx_env

    # StrictUndefined: unknown variables raise instead of rendering as ""
    jinja_env = jinja2.Environment(undefined=jinja2.StrictUndefined)
    template_source = f"{{{{{token.content}}}}}"

    try:
        rendered = jinja_env.from_string(template_source).render(context)
    except Exception as exc:
        failure = self.reporter.error(
            f"Substitution error:{exc.__class__.__name__}: {exc}",
            line=line,
        )
        self.current_node += [failure]
        return

    # names referenced by the expression (``env`` is not a substitution)
    parsed = jinja_env.parse(template_source)
    references = {
        name_node.name
        for name_node in parsed.find_all(jinja2.nodes.Name)
        if name_node.name != "env"
    }

    # the set of names being expanded is kept on the document itself,
    # so it is shared by every nested render of this document
    self.document.sub_references = getattr(self.document, "sub_references", set())
    cyclic = references.intersection(self.document.sub_references)
    if cyclic:
        failure = self.reporter.error(
            f"circular substitution reference: {cyclic}",
            line=line,
        )
        self.current_node += [failure]
        return

    # TODO improve error reporting;
    # at present, for a multi-line substitution,
    # an error may point to a line lower than the substitution
    # should it point to the source of the substitution?
    # or the error message should at least indicate that its a substitution

    # mark the names as "in progress" for the duration of the nested parse
    self.document.sub_references.update(references)
    try:
        inline_only = inline and not REGEX_DIRECTIVE_START.match(rendered)
        if inline_only:
            self.nested_render_text(rendered, line, inline=True)
        else:
            self.nested_render_text(rendered, line, allow_headings=False)
    finally:
        self.document.sub_references.difference_update(references)
def html_to_nodes(
    text: str, line_number: int, renderer: DocutilsRenderer
) -> list[nodes.Element]:
    """Convert HTML to docutils nodes.

    The text is returned as a single raw HTML node unless it consists solely
    of ``<img>`` tags (with the ``html_image`` extension enabled) and/or
    ``<div class="admonition">`` blocks (with ``html_admonition`` enabled),
    in which case those are run through the ``image`` / ``admonition``
    directives.

    :param text: the HTML source
    :param line_number: the line of the HTML in the Markdown source
    :param renderer: the active renderer (configuration, document, directives)
    :returns: the generated nodes
    """
    if renderer.md_config.gfm_only:
        # GFM "tagfilter": neutralise disallowed tags by escaping their
        # opening angle bracket, so they are shown as text, not interpreted
        text = RE_FLOW.sub(lambda m: m.group(0).replace("<", "&lt;"), text)

    enable_html_img = "html_image" in renderer.md_config.enable_extensions
    enable_html_admonition = "html_admonition" in renderer.md_config.enable_extensions
    if not (enable_html_img or enable_html_admonition):
        return default_html(text, renderer.document["source"], line_number)

    # parse the HTML to AST
    try:
        root = tokenize_html(text).strip(inplace=True, recurse=False)
    except Exception:
        # best effort: warn (unless suppressed) and fall back to raw HTML
        msg_node = renderer.create_warning(
            "HTML could not be parsed", line=line_number, subtype="html"
        )
        return ([msg_node] if msg_node else []) + default_html(
            text, renderer.document["source"], line_number
        )

    if len(root) < 1:
        # if empty
        return default_html(text, renderer.document["source"], line_number)

    # only convert when *every* top-level element is one we know how to handle
    if not all(
        (enable_html_img and child.name == "img")
        or (
            enable_html_admonition
            and child.name == "div"
            and "admonition" in child.attrs.classes
        )
        for child in root
    ):
        return default_html(text, renderer.document["source"], line_number)

    nodes_list = []
    for child in root:

        if child.name == "img":
            if "src" not in child.attrs:
                return [
                    renderer.reporter.error(
                        "<img> missing 'src' attribute", line=line_number
                    )
                ]
            content = "\n".join(
                f":{k}: {v}"
                for k, v in sorted(child.attrs.items())
                if k in OPTION_KEYS_IMAGE
            )
            nodes_list.extend(
                renderer.run_directive(
                    "image", child.attrs["src"], content, line_number
                )
            )

        else:
            children = child.strip().children
            # a leading <div>/<p> with a title class supplies the title
            if (
                children
                and children[0].name in ("div", "p")
                and (
                    "title" in children[0].attrs.classes
                    or "admonition-title" in children[0].attrs.classes
                )
            ):
                title = "".join(part.render() for part in children.pop(0))
            else:
                title = "Note"

            options = "\n".join(
                f":{k}: {v}"
                for k, v in sorted(child.attrs.items())
                if k in OPTION_KEYS_ADMONITION
            ).rstrip()

            # unwrap paragraphs, separating them with blank lines
            new_children = []
            for grandchild in children:
                if grandchild.name == "p":
                    new_children.extend(grandchild.children)
                    new_children.append(Data("\n\n"))
                else:
                    new_children.append(grandchild)
            content = (
                options
                + ("\n\n" if options else "")
                + "".join(part.render() for part in new_children).lstrip()
            )

            nodes_list.extend(
                renderer.run_directive("admonition", title, content, line_number)
            )

    return nodes_list
def create_warning(
    document: nodes.document,
    message: str,
    *,
    line: int | None = None,
    append_to: nodes.Element | None = None,
    wtype: str = "myst",
    subtype: str = "other",
) -> nodes.system_message | None:
    """Generate a warning, logging it if necessary.

    If the warning type is listed in the ``suppress_warnings`` configuration,
    then ``None`` will be returned and no warning logged.

    :param document: the document whose reporter emits the warning
    :param message: the warning text; ``[wtype.subtype]`` is appended to it
    :param line: the source line to attach to the warning, if known
    :param append_to: a node to append the warning node to, if given
    :param wtype: the warning type, used for suppression and in the suffix
    :param subtype: the warning sub-type, used for suppression and in the suffix
    :returns: the created system message, or ``None`` if suppressed
    """
    if logging.is_suppressed_warning(
        wtype, subtype, document.settings.env.app.config.suppress_warnings
    ):
        return None

    message = f"{message} [{wtype}.{subtype}]"
    kwargs = {"line": line} if line is not None else {}
    msg_node = document.reporter.warning(message, **kwargs)
    if append_to is not None:
        append_to.append(msg_node)

    # hand the node back so callers that place it themselves can do so
    return msg_node
def create_warning(
    self,
    message: str,
    *,
    line: int | None = None,
    append_to: nodes.Element | None = None,
    wtype: str = "myst",
    subtype: str = "other",
) -> nodes.system_message | None:
    """Generate a warning for this renderer's document.

    Delegates to the module-level ``create_warning`` function, passing
    ``self.document`` and forwarding every argument unchanged.
    """
    forwarded = {
        "line": line,
        "append_to": append_to,
        "wtype": wtype,
        "subtype": subtype,
    }
    return create_warning(self.document, message, **forwarded)
def render_heading(self, token: SyntaxTreeNode) -> None:
    """Render a heading and register its ``id`` slug as a label.

    After the parent class has rendered the heading, the token's ``id``
    attribute (if any) is combined with the document path into
    ``<docpath>#<slug>`` and stored in the ``std`` domain's ``labels`` and
    ``anonlabels``, with a warning when that label already exists.
    """
    super().render_heading(token)

    section = self.current_node
    if not isinstance(section, nodes.section):
        return

    slug = cast(str, token.attrGet("id"))
    if slug is None:
        return

    env = self.doc_env
    docname = env.docname
    doc_slug = env.doc2path(docname, base=False) + "#" + slug

    # save the reference in the standard domain, so that it can be handled properly
    std_domain = cast(StandardDomain, env.get_domain("std"))
    if doc_slug in std_domain.labels:
        existing_docname = std_domain.labels[doc_slug][0]
        other_doc = env.doc2path(existing_docname)
        self.create_warning(
            f"duplicate label {doc_slug}, other instance in {other_doc}",
            line=section.line,
            subtype="anchor",
        )

    label_id = section["ids"][0]
    std_domain.anonlabels[doc_slug] = (docname, label_id)
    std_domain.labels[doc_slug] = (docname, label_id, clean_astext(section[0]))

    # flag the document and the section as carrying a heading anchor
    env.metadata[docname]["myst_anchors"] = True
    section["myst-anchor"] = doc_slug
def render_amsmath(self, token: SyntaxTreeNode) -> None:
    """Render a token produced by the amsmath extension as a math block.

    Unless ``token.meta["numbered"]`` is ``"*"``, the block is given a
    random label and a target node is emitted before it.
    """
    content = token.content
    is_numbered = token.meta["numbered"] != "*"

    attributes = {"nowrap": True, "number": None, "classes": ["amsmath"]}
    if is_numbered:
        # TODO how to parse and reference labels within environment?
        # for now a unique label is generated, so the equation will be numbered
        # but there will be no reference clashes
        attributes["label"] = self._random_label()

    node = nodes.math_block(content, content, **attributes)

    if is_numbered:
        target = self.add_math_target(node)
        self.add_line_and_source_path(target, token)
        self.current_node.append(target)

    self.add_line_and_source_path(node, token)
    self.current_node.append(node)
def escape_url(raw: str) -> str:
    """Return ``raw`` percent-quoted and HTML-escaped.

    HTML entities in the input are first decoded, the result is
    percent-quoted (leaving ``/#:()*?=%@+,&`` untouched) and finally
    HTML-escaped again.
    """
    decoded = html.unescape(raw)
    quoted = quote(decoded, safe="/#:()*?=%@+,&")
    return html.escape(quoted)


def is_external_url(
    reference: str,
    known_url_schemes: Optional[Iterable[str]],
    match_fragment: bool = False,
) -> bool:
    """Return if a reference should be recognised as an external URL.

    URLs are of the format: scheme://netloc/path;parameters?query#fragment

    The reference is external when it has a URL scheme (e.g. 'https') that
    is in ``known_url_schemes`` or, when no schemes are supplied, when it
    has any scheme at all.

    :param reference: the link destination to classify
    :param known_url_schemes: e.g. ["http", "https", "mailto"]
        If None, match all schemes
    :param match_fragment: If True and a fragment found, then True will be returned,
        irrespective of a scheme match
    """
    parsed = urlparse(reference)

    if known_url_schemes is None:
        has_known_scheme = bool(parsed.scheme)
    else:
        has_known_scheme = parsed.scheme in known_url_schemes

    if has_known_scheme:
        return has_known_scheme
    return match_fragment and parsed.fragment != ""
+""" +from __future__ import annotations + +import os +import re +import sys +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from docutils import nodes +from docutils.parsers.rst import Directive, DirectiveError +from docutils.parsers.rst import Parser as RSTParser +from docutils.parsers.rst.directives.misc import Include +from docutils.parsers.rst.states import Body, Inliner, RSTStateMachine +from docutils.statemachine import StringList +from docutils.utils import unescape + +from .parsers.directives import parse_directive_text + +if TYPE_CHECKING: + from .mdit_to_docutils.base import DocutilsRenderer + + +class MockingError(Exception): + """An exception to signal an error during mocking of docutils components.""" + + +class MockInliner: + """A mock version of `docutils.parsers.rst.states.Inliner`. + + This is parsed to role functions. + """ + + def __init__(self, renderer: DocutilsRenderer): + """Initialize the mock inliner.""" + self._renderer = renderer + # here we mock that the `parse` method has already been called + # which is where these attributes are set (via the RST state Memo) + self.document = renderer.document + self.reporter = renderer.document.reporter + self.language = renderer.language_module_rst + self.parent = renderer.current_node + + if not hasattr(self.reporter, "get_source_and_line"): + # In docutils this is set by `RSTState.runtime_init` + self.reporter.get_source_and_line = lambda l: (self.document["source"], l) + + self.rfc_url = "rfc%d.html" + + def problematic( + self, text: str, rawsource: str, message: nodes.system_message + ) -> nodes.problematic: + """Record a system message from parsing.""" + msgid = self.document.set_id(message, self.parent) + problematic = nodes.problematic(rawsource, text, refid=msgid) + prbid = self.document.set_id(problematic) + message.add_backref(prbid) + return problematic + + def parse( + self, text: str, lineno: int, memo: Any, parent: nodes.Node + ) -> tuple[list[nodes.Node], 
def __getattr__(self, name: str):
    """Raise a `MockingError` for any attribute that has not been defined.

    Python only calls this hook when normal attribute lookup fails, so
    attributes that are defined are never affected. The message differs
    depending on whether the real docutils `Inliner` has the attribute.
    """
    # TODO use document.reporter mechanism?
    if hasattr(Inliner, name):
        msg = "{cls} has not yet implemented attribute '{name}'".format(
            cls=type(self).__name__, name=name
        )
    else:
        msg = f"{type(self).__name__} has no attribute {name}"
    raise MockingError(msg).with_traceback(sys.exc_info()[2])
def parse_directive_block(
    self,
    content: StringList,
    line_offset: int,
    directive: type[Directive],
    option_presets: dict,
) -> tuple[list, dict, StringList, int]:
    """Parse the full directive text.

    :param content: the lines of the directive block
    :param line_offset: added to the content offset that is returned
    :param directive: the directive class the text is parsed for
    :param option_presets: not supported; must be empty
    :raises MockingError: if ``option_presets`` is non-empty
    :returns: (arguments, options, content, content_offset)
    """
    if option_presets:
        raise MockingError("parse_directive_block: option_presets not implemented")

    # TODO should argument_str always be ""?
    block_text = "\n".join(content)
    parsed = parse_directive_text(directive, "", block_text)
    arguments, options, body_lines, content_offset = parsed

    body = StringList(body_lines, source=content.source)
    return arguments, options, body, line_offset + content_offset
def block_quote(self, lines: list[str], line_offset: int) -> list[nodes.Element]:
    """Parse a block quote, which is a block of text,
    followed by an (optional) attribution.

    ::

       No matter where you go, there you are.

       -- Buckaroo Banzai

    :param lines: the lines of the quote, including any attribution
    :param line_offset: offset of the first line, added to the state's line
    :returns: the block quote node, followed by any inline parsing messages
    """
    quote_lines = lines
    attribution_lines: list[str] = []
    attribution_index = None

    # an attribution can only start on the first line after a blank line,
    # and that line must match the attribution pattern (leading dashes)
    previous_blank = False
    for index, line in enumerate(lines):
        follows_blank = previous_blank
        previous_blank = not line.strip()
        if previous_blank or not follows_blank:
            continue
        match = self.attribution_pattern.match(line)
        if match is None:
            continue

        marker_width = len(match.group(1))
        attribution_index = index
        attribution_lines = [match.group(2)]
        # continuation lines must be indented exactly to the marker width
        for following in lines[index + 1 :]:
            dedented = following[marker_width:]
            if len(dedented) != len(following.lstrip()):
                break
            attribution_lines.append(dedented)
        quote_lines = lines[:index]
        break

    # parse the quoted text itself
    blockquote = nodes.block_quote()
    self.nested_parse(quote_lines, line_offset, blockquote)
    elements = [blockquote]

    # parse the attribution, with inline rules only
    if attribution_lines:
        attribution_text = "\n".join(attribution_lines)
        lineno = self._lineno + line_offset + (attribution_index or 0)
        textnodes, messages = self.inline_text(attribution_text, lineno)
        attribution = nodes.attribution(attribution_text, "", *textnodes)
        source, line_number = self.state_machine.get_source_and_line(lineno)
        attribution.source = source
        attribution.line = line_number
        blockquote += attribution
        elements += messages

    return elements
class MockStateMachine:
    """A mock version of `docutils.parsers.rst.states.RSTStateMachine`.

    This is passed to the `Directives.run()` method.
    """

    def __init__(self, renderer: DocutilsRenderer, lineno: int):
        """Capture the renderer's document state for a directive at ``lineno``."""
        document = renderer.document
        self._renderer = renderer
        self._lineno = lineno
        self.document = document
        self.language = renderer.language_module_rst
        self.reporter = document.reporter
        self.node: nodes.Element = renderer.current_node
        self.match_titles: bool = True

    def get_source(self, lineno: int | None = None):
        """Return document source path (``lineno`` is ignored)."""
        return self.document["source"]

    def get_source_and_line(self, lineno: int | None = None):
        """Return (source path, line) tuple for current or given line number.

        A falsy ``lineno`` (``None`` or ``0``) selects the line number the
        state machine was created with.
        """
        line = lineno if lineno else self._lineno
        return self.document["source"], line

    def __getattr__(self, name: str):
        """Raise a `MockingError` for any attribute that has not been defined.

        Python only calls this hook when normal attribute lookup fails, so
        attributes that are defined are never affected.
        """
        if hasattr(RSTStateMachine, name):
            msg = "{cls} has not yet implemented attribute '{name}'".format(
                cls=type(self).__name__, name=name
            )
        else:
            msg = f"{type(self).__name__} has no attribute {name}"
        raise MockingError(msg).with_traceback(sys.exc_info()[2])
def run(self) -> list[nodes.Element]:
    """Include the referenced file and return the nodes this generates.

    With the ``literal`` option the (selected) file content is returned as a
    literal block, and with the ``code`` option it is passed to the docutils
    ``CodeBlock`` directive. Otherwise the content is rendered as Markdown
    directly into the renderer's current node and an empty list is returned.

    :raises DirectiveError: if file insertion is disabled, the file cannot be
        read, a ``start-after`` / ``end-before`` text is not found,
        or ``number-lines`` is not an integer
    """
    if not self.document.settings.file_insertion_enabled:
        raise DirectiveError(2, f'Directive "{self.name}" disabled.')

    source_dir = Path(self.document["source"]).absolute().parent
    include_arg = "".join([s.strip() for s in self.arguments[0].splitlines()])

    if include_arg.startswith("<") and include_arg.endswith(">"):
        # # docutils "standard" includes
        path = Path(self.klass.standard_include_path).joinpath(include_arg[1:-1])
    else:
        # if using sphinx interpret absolute paths "correctly",
        # i.e. relative to source directory
        try:
            sphinx_env = self.document.settings.env
        except AttributeError:
            pass
        else:
            _, include_arg = sphinx_env.relfn2path(self.arguments[0])
            sphinx_env.note_included(include_arg)
        path = Path(include_arg)
    path = source_dir.joinpath(path)
    # this ensures that the parent file is rebuilt if the included file changes
    self.document.settings.record_dependencies.add(str(path))

    # read file
    encoding = self.options.get("encoding", self.document.settings.input_encoding)
    error_handler = self.document.settings.input_encoding_error_handler
    # tab_width = self.options.get("tab-width", self.document.settings.tab_width)
    try:
        file_content = path.read_text(encoding=encoding, errors=error_handler)
    except Exception as error:
        raise DirectiveError(
            4,
            'Directive "{}": error reading file: {}\n{}.'.format(
                self.name, path, error
            ),
        ) from error

    # get required section of text
    all_lines = file_content.splitlines()
    # normalise the (possibly missing or negative) bounds, so that
    # ``startline`` is the real 0-based index of the first line kept
    first, last, _ = slice(
        self.options.get("start-line", None), self.options.get("end-line", None)
    ).indices(len(all_lines))
    file_content = "\n".join(all_lines[first:last])
    startline = first
    for split_on_type in ["start-after", "end-before"]:
        split_on = self.options.get(split_on_type, None)
        if not split_on:
            continue
        split_index = file_content.find(split_on)
        if split_index < 0:
            raise DirectiveError(
                4,
                'Directive "{}"; option "{}": text not found "{}".'.format(
                    self.name, split_on_type, split_on
                ),
            )
        if split_on_type == "start-after":
            cut = split_index + len(split_on)
            # ``startline`` counts lines, so advance it by the number of
            # line breaks that are cut away (not by the character offset)
            startline += file_content.count("\n", 0, cut)
            file_content = file_content[cut:]
        else:
            file_content = file_content[:split_index]

    if "literal" in self.options:
        literal_block = nodes.literal_block(
            file_content, source=str(path), classes=self.options.get("class", [])
        )
        literal_block.line = 1  # TODO don;t think this should be 1?
        self.add_name(literal_block)
        if "number-lines" in self.options:
            # imported where it is needed; only this branch uses it
            from docutils.parsers.rst.directives.body import NumberLines

            try:
                first_number = int(self.options["number-lines"] or 1)
            except ValueError as error:
                raise DirectiveError(
                    3, ":number-lines: with non-integer " "start value"
                ) from error
            last_number = first_number + len(file_content.splitlines())
            if file_content.endswith("\n"):
                file_content = file_content[:-1]
            tokens = NumberLines([([], file_content)], first_number, last_number)
            for classes, value in tokens:
                if classes:
                    literal_block += nodes.inline(value, value, classes=classes)
                else:
                    literal_block += nodes.Text(value)
        else:
            literal_block += nodes.Text(file_content)
        return [literal_block]

    if "code" in self.options:
        # imported where it is needed; only this branch uses it
        from docutils.parsers.rst.directives.body import CodeBlock

        self.options["source"] = str(path)
        state_machine = MockStateMachine(self.renderer, self.lineno)
        state = MockState(self.renderer, state_machine, self.lineno)
        codeblock = CodeBlock(
            name=self.name,
            arguments=[self.options.pop("code")],
            options=self.options,
            content=file_content.splitlines(),
            lineno=self.lineno,
            content_offset=0,
            block_text=file_content,
            state=state,
            state_machine=state_machine,
        )
        return codeblock.run()

    # Here we perform a nested render, but temporarily setup the document/reporter
    # with the correct document path and lineno for the included file.
    source = self.renderer.document["source"]
    rsource = self.renderer.reporter.source
    line_func = getattr(self.renderer.reporter, "get_source_and_line", None)
    try:
        self.renderer.document["source"] = str(path)
        self.renderer.reporter.source = str(path)
        self.renderer.reporter.get_source_and_line = lambda l: (str(path), l)
        if "relative-images" in self.options:
            self.renderer.md_env["relative-images"] = os.path.relpath(
                path.parent, source_dir
            )
        if "relative-docs" in self.options:
            self.renderer.md_env["relative-docs"] = (
                self.options["relative-docs"],
                source_dir,
                path.parent,
            )
        self.renderer.nested_render_text(
            file_content, startline + 1, allow_headings=True
        )
    finally:
        # restore everything that was temporarily pointed at the included file
        self.renderer.document["source"] = source
        self.renderer.reporter.source = rsource
        self.renderer.md_env.pop("relative-images", None)
        self.renderer.md_env.pop("relative-docs", None)
        if line_func is not None:
            self.renderer.reporter.get_source_and_line = line_func
        else:
            del self.renderer.reporter.get_source_and_line
    return []
+ """ + if "name" in self.options: + name = nodes.fully_normalize_name(self.options.pop("name")) + if "name" in node: + del node["name"] + node["names"].append(name) + self.renderer.document.note_explicit_target(node, node) + + +class MockRSTParser(RSTParser): + """RSTParser which avoids a negative side effect.""" + + def parse(self, inputstring: str, document: nodes.document): + """Parse the input to populate the document AST.""" + from docutils.parsers.rst import roles + + should_restore = False + if "" in roles._roles: + should_restore = True + blankrole = roles._roles[""] + + super().parse(inputstring, document) + + if should_restore: + roles._roles[""] = blankrole diff --git a/myst_parser/parsers/__init__.py b/myst_parser/parsers/__init__.py new file mode 100644 index 0000000..26fbfca --- /dev/null +++ b/myst_parser/parsers/__init__.py @@ -0,0 +1 @@ +"""Parsers of MyST Markdown source text to docutils AST.""" diff --git a/myst_parser/parsers/directives.py b/myst_parser/parsers/directives.py new file mode 100644 index 0000000..5637254 --- /dev/null +++ b/myst_parser/parsers/directives.py @@ -0,0 +1,190 @@ +"""Fenced code blocks are parsed as directives, +if the block starts with ``{directive_name}``, +followed by arguments on the same line. + +Directive options are read from a YAML block, +if the first content line starts with ``---``, e.g. + +:: + + ```{directive_name} arguments + --- + option1: name + option2: | + Longer text block + --- + content... + ``` + +Or the option block will be parsed if the first content line starts with ``:``, +as a YAML block consisting of every line that starts with a ``:``, e.g. + +:: + + ```{directive_name} arguments + :option1: name + :option2: other + + content... + ``` + +If the first line of a directive's content is blank, this will be stripped +from the content. +This is to allow for separation between the option block and content. 
class DirectiveParsingError(Exception):
    """Raise on parsing/validation error."""


def parse_directive_text(
    directive_class: type[Directive],
    first_line: str,
    content: str,
    validate_options: bool = True,
) -> tuple[list[str], dict, list[str], int]:
    """Parse (and validate) the full directive text.

    :param directive_class: The directive class, read for its ``option_spec``,
        ``required_arguments``, ``optional_arguments`` and ``has_content``
    :param first_line: The text on the same line as the directive name.
        May be an argument or body text, dependent on the directive
    :param content: All text after the first line. Can include options.
    :param validate_options: Whether to validate the values of options

    :returns: (arguments, options, body_lines, content_offset)
    :raises DirectiveParsingError: on invalid options or arguments, or if
        content is given to a directive that does not permit it
    """
    if directive_class.option_spec:
        body, options = parse_directive_options(
            content, directive_class, validate=validate_options
        )
        body_lines = body.splitlines()
        content_offset = len(content.splitlines()) - len(body_lines)
    else:
        # If there are no possible options, we do not look for a YAML block
        options = {}
        body_lines = content.splitlines()
        content_offset = 0

    if not (directive_class.required_arguments or directive_class.optional_arguments):
        # If there are no possible arguments, then the body starts on the argument line
        if first_line:
            body_lines.insert(0, first_line)
        arguments = []
    else:
        arguments = parse_directive_arguments(directive_class, first_line)

    # remove first line of body if blank
    # this is to allow space between the options and the content
    if body_lines and not body_lines[0].strip():
        body_lines = body_lines[1:]
        content_offset += 1

    # check for body content
    if body_lines and not directive_class.has_content:
        raise DirectiveParsingError("No content permitted")

    return arguments, options, body_lines, content_offset


def _load_options_yaml(yaml_block: str) -> dict[str, Any]:
    """Load an option block as YAML and require the result to be a mapping."""
    try:
        options = yaml.safe_load(yaml_block) or {}
    except yaml.YAMLError as error:
        # YAMLError is the common base class, so every load failure is
        # reported as a parsing error instead of escaping un-wrapped
        raise DirectiveParsingError("Invalid options YAML: " + str(error)) from error
    if not isinstance(options, dict):
        raise DirectiveParsingError(f"Invalid options (not dict): {options}")
    return options


def parse_directive_options(
    content: str, directive_class: type[Directive], validate: bool = True
):
    """Parse (and validate) the directive option section.

    Options are read either from a leading ``---`` delimited YAML block,
    or from the leading run of lines that start with ``:``.

    :param content: All text after the directive's first line
    :param directive_class: The directive class, read for its ``option_spec``
    :param validate: Whether to check/convert the options against the spec

    :returns: (remaining content, options)
    :raises DirectiveParsingError: on invalid YAML, a non-mapping option block,
        an unknown option, or an option value the converter rejects
    """
    options: dict[str, Any] = {}
    if content.startswith("---"):
        content = "\n".join(content.splitlines()[1:])
        match = re.search(r"^-{3,}", content, re.MULTILINE)
        if match:
            yaml_block = content[: match.start()]
            content = content[match.end() + 1 :]  # TODO advance line number
        else:
            yaml_block = content
            content = ""
        options = _load_options_yaml(dedent(yaml_block))
    elif content.lstrip().startswith(":"):
        content_lines = content.splitlines()
        # count the leading run of ``:``-prefixed lines
        n_option_lines = 0
        while n_option_lines < len(content_lines) and content_lines[
            n_option_lines
        ].lstrip().startswith(":"):
            n_option_lines += 1
        yaml_block = "\n".join(
            line.lstrip()[1:] for line in content_lines[:n_option_lines]
        )
        content = "\n".join(content_lines[n_option_lines:])
        options = _load_options_yaml(yaml_block)

    if (not validate) or issubclass(directive_class, TestDirective):
        # technically this directive spec only accepts one option ('option')
        # but since its for testing only we accept all options
        return content, options

    # check options against spec
    options_spec: dict[str, Callable] = directive_class.option_spec
    for name, value in list(options.items()):
        try:
            convertor = options_spec[name]
        except KeyError:
            raise DirectiveParsingError(f"Unknown option: {name}") from None
        if not isinstance(value, str):
            if value is True or value is None:
                value = None  # flag converter requires no argument
            elif isinstance(value, (int, float, datetime.date, datetime.datetime)):
                # convertor always requires string input
                value = str(value)
            else:
                raise DirectiveParsingError(
                    f'option "{name}" value not string (enclose with ""): {value}'
                )
        try:
            converted_value = convertor(value)
        except (ValueError, TypeError) as error:
            raise DirectiveParsingError(
                "Invalid option value: (option: '{}'; value: {})\n{}".format(
                    name, value, error
                )
            ) from error
        options[name] = converted_value

    return content, options


def parse_directive_arguments(directive, arg_text: str) -> list[str]:
    """Parse (and validate) the directive argument section.

    :param directive: The directive class, read for ``required_arguments``,
        ``optional_arguments`` and ``final_argument_whitespace``
    :param arg_text: The text on the directive's first line

    :returns: the whitespace-separated arguments; when too many are given and
        ``final_argument_whitespace`` is set, the surplus is kept in the last one
    :raises DirectiveParsingError: if too few or too many arguments are given
    """
    required = directive.required_arguments
    optional = directive.optional_arguments
    arguments = arg_text.split()
    if len(arguments) < required:
        raise DirectiveParsingError(
            f"{required} argument(s) required, {len(arguments)} supplied"
        )
    elif len(arguments) > required + optional:
        if directive.final_argument_whitespace:
            arguments = arg_text.split(None, required + optional - 1)
        else:
            raise DirectiveParsingError(
                "maximum {} argument(s) allowed, {} supplied".format(
                    required + optional, len(arguments)
                )
            )
    return arguments
def _validate_int(
    setting, value, option_parser, config_parser=None, config_section=None
) -> int:
    """Validate an integer setting."""
    return int(value)


def _create_validate_tuple(length: int) -> Callable[..., Tuple[str, ...]]:
    """Create a validator for a tuple of length `length`."""

    def _validate(
        setting, value, option_parser, config_parser=None, config_section=None
    ):
        string_list = frontend.validate_comma_separated_list(
            setting, value, option_parser, config_parser, config_section
        )
        if len(string_list) != length:
            raise ValueError(
                f"Expecting {length} items in {setting}, got {len(string_list)}."
            )
        return tuple(string_list)

    return _validate


class Unset:
    """A sentinel class for unset settings."""

    def __repr__(self):
        return "UNSET"


DOCUTILS_UNSET = Unset()
"""Sentinel for arguments not set through docutils.conf."""


DOCUTILS_EXCLUDED_ARGS = (
    # docutils.conf can't represent callables
    "heading_slug_func",
    # docutils.conf can't represent dicts
    "html_meta",
    "substitutions",
    # we can't add substitutions so not needed
    "sub_delimiters",
    # sphinx only options
    "heading_anchors",
    "ref_domains",
    "update_mathjax",
    "mathjax_classes",
)
"""Names of settings that cannot be set in docutils.conf."""


def _attr_to_optparse_option(at: Field, default: Any) -> Tuple[dict, str]:
    """Convert a field into a Docutils optparse options dict.

    :param at: the dataclass field; its ``type`` selects the metavar/validator
    :param default: the default value, used only in the help suffix
    :returns: (optparse keyword arguments, "(default: ...)" help suffix)
    :raises AssertionError: if the field type has no mapping here
    """
    if at.type is int:
        return {"metavar": "<int>", "validator": _validate_int}, f"(default: {default})"
    if at.type is bool:
        return {
            "metavar": "<boolean>",
            "validator": frontend.validate_boolean,
        }, f"(default: {default})"
    if at.type is str:
        return {
            "metavar": "<str>",
        }, f"(default: '{default}')"
    if get_origin(at.type) is Literal and all(
        isinstance(a, str) for a in get_args(at.type)
    ):
        args = get_args(at.type)
        return {
            "metavar": f"<{'|'.join(repr(a) for a in args)}>",
            "type": "choice",
            "choices": args,
        }, f"(default: {default!r})"
    if at.type in (Iterable[str], Sequence[str]):
        return {
            "metavar": "<comma-delimited>",
            "validator": frontend.validate_comma_separated_list,
        }, f"(default: '{','.join(default)}')"
    if at.type == Tuple[str, str]:
        return {
            "metavar": "<str,str>",
            "validator": _create_validate_tuple(2),
        }, f"(default: '{','.join(default)}')"
    if at.type == Union[int, type(None)]:
        return {
            "metavar": "<null|int>",
            "validator": _validate_int,
        }, f"(default: {default})"
    if at.type == Union[Iterable[str], type(None)]:
        default_str = ",".join(default) if default else ""
        return {
            "metavar": "<null|comma-delimited>",
            "validator": frontend.validate_comma_separated_list,
        }, f"(default: {default_str!r})"
    raise AssertionError(
        f"Configuration option {at.name} not set up for use in docutils.conf."
    )


def attr_to_optparse_option(
    attribute: Field, default: Any, prefix: str = "myst_"
) -> Tuple[str, List[str], Dict[str, Any]]:
    """Convert an ``MdParserConfig`` attribute into a Docutils setting tuple.

    :returns: A tuple of ``(help string, option flags, optparse kwargs)``.
    """
    name = f"{prefix}{attribute.name}"
    flag = "--" + name.replace("_", "-")
    # every option defaults to the sentinel, so unset values can be told apart
    options = {"dest": name, "default": DOCUTILS_UNSET}
    at_options, type_str = _attr_to_optparse_option(attribute, default)
    options.update(at_options)
    help_str = attribute.metadata.get("help", "") if attribute.metadata else ""
    return (f"{help_str} {type_str}", [flag], options)


def create_myst_settings_spec(
    excluded: Sequence[str], config_cls=MdParserConfig, prefix: str = "myst_"
):
    """Return a list of Docutils setting for the docutils MyST section."""
    defaults = config_cls()
    return tuple(
        attr_to_optparse_option(at, getattr(defaults, at.name), prefix)
        for at in config_cls.get_fields()
        if at.name not in excluded
    )


def create_myst_config(
    settings: frontend.Values,
    excluded: Sequence[str],
    config_cls=MdParserConfig,
    prefix: str = "myst_",
):
    """Create a configuration instance from the given settings.

    Only settings whose value is not ``DOCUTILS_UNSET`` are passed on,
    the rest keep the defaults of ``config_cls``.
    """
    values = {}
    for attribute in config_cls.get_fields():
        if attribute.name in excluded:
            continue
        setting = f"{prefix}{attribute.name}"
        val = getattr(settings, setting, DOCUTILS_UNSET)
        if val is not DOCUTILS_UNSET:
            values[attribute.name] = val
    return config_cls(**values)


class Parser(RstParser):
    """Docutils parser for Markedly Structured Text (MyST)."""

    supported: Tuple[str, ...] = ("md", "markdown", "myst")
    """Aliases this parser supports."""

    settings_spec = (
        "MyST options",
        None,
        create_myst_settings_spec(DOCUTILS_EXCLUDED_ARGS),
        *RstParser.settings_spec,
    )
    """Runtime settings specification."""

    config_section = "myst parser"
    config_section_dependencies = ("parsers",)
    translate_section_name = None

    def parse(self, inputstring: str, document: nodes.document) -> None:
        """Parse source text.

        :param inputstring: The source string to parse
        :param document: The root docutils node to add AST elements to
        """

        self.setup_parse(inputstring, document)

        # check for exorbitantly long lines
        if hasattr(document.settings, "line_length_limit"):
            for i, line in enumerate(inputstring.split("\n")):
                if len(line) > document.settings.line_length_limit:
                    error = document.reporter.error(
                        f"Line {i+1} exceeds the line-length-limit:"
                        f" {document.settings.line_length_limit}."
                    )
                    document.append(error)
                    # pair the setup_parse() above before bailing out
                    self.finish_parse()
                    return

        # create parsing configuration from the global config
        try:
            config = create_myst_config(document.settings, DOCUTILS_EXCLUDED_ARGS)
        except Exception as exc:
            error = document.reporter.error(f"Global myst configuration invalid: {exc}")
            document.append(error)
            config = MdParserConfig()

        # update the global config with the file-level config
        try:
            topmatter = read_topmatter(inputstring)
        except TopmatterReadError:
            pass  # this will be reported during the render
        else:
            if topmatter:
                warning = lambda wtype, msg: create_warning(  # noqa: E731
                    document, msg, line=1, append_to=document, subtype=wtype
                )
                config = merge_file_level(config, topmatter, warning)

        # parse content
        parser = create_md_parser(config, DocutilsRenderer)
        parser.options["document"] = document
        parser.render(inputstring)

        # post-processing

        # replace raw nodes if raw is not allowed
        if not getattr(document.settings, "raw_enabled", True):
            # findall replaces traverse in docutils v0.18 and is an iterator;
            # snapshot the matches, since the tree is modified in the loop
            find_nodes = getattr(document, "findall", None) or document.traverse
            for node in list(find_nodes(nodes.raw)):
                warning = document.reporter.warning("Raw content disabled.")
                node.parent.replace(node, warning)

        self.finish_parse()


def _run_cli(
    writer_name: str, writer_description: str, argv: Optional[List[str]]
) -> None:
    """Run the command line interface for a particular writer."""
    publish_cmdline(
        parser=Parser(),
        writer_name=writer_name,
        description=(
            f"Generates {writer_description} from standalone MyST sources.\n{default_description}"
        ),
        argv=argv,
    )
def cli_html(argv: Optional[List[str]] = None) -> None:
    """Cmdline entrypoint for converting MyST to HTML.

    :param argv: command line arguments, forwarded to ``_run_cli``
    """
    _run_cli("html", "(X)HTML documents", argv)


def cli_html5(argv: Optional[List[str]] = None) -> None:
    """Cmdline entrypoint for converting MyST to HTML5.

    :param argv: command line arguments, forwarded to ``_run_cli``
    """
    _run_cli("html5", "HTML5 documents", argv)


def cli_latex(argv: Optional[List[str]] = None) -> None:
    """Cmdline entrypoint for converting MyST to LaTeX.

    :param argv: command line arguments, forwarded to ``_run_cli``
    """
    _run_cli("latex", "LaTeX documents", argv)


def cli_xml(argv: Optional[List[str]] = None) -> None:
    """Cmdline entrypoint for converting MyST to XML.

    :param argv: command line arguments, forwarded to ``_run_cli``
    """
    _run_cli("xml", "Docutils-native XML", argv)


def cli_pseudoxml(argv: Optional[List[str]] = None) -> None:
    """Cmdline entrypoint for converting MyST to pseudo-XML.

    :param argv: command line arguments, forwarded to ``_run_cli``
    """
    _run_cli("pseudoxml", "pseudo-XML", argv)
def create_md_parser(
    config: MdParserConfig, renderer: Callable[[MarkdownIt], RendererProtocol]
) -> MarkdownIt:
    """Build the markdown-it parser that matches ``config``.

    Three set-ups are possible: plain CommonMark (``config.commonmark_only``),
    GitHub-flavoured Markdown (``config.gfm_only``), or the MyST parser with
    whichever ``config.enable_extensions`` are requested. In every case the
    config is stored on the parser under ``options["myst_config"]``.

    :param config: the parsing configuration
    :param renderer: the renderer class handed to ``MarkdownIt``
    """

    # TODO warn if linkify required and linkify-it-py not installed
    # (currently the parse will unceremoniously except)

    if config.commonmark_only:
        # see https://spec.commonmark.org/
        md = MarkdownIt("commonmark", renderer_cls=renderer)
        md.use(wordcount_plugin, per_minute=config.words_per_minute)
        md.options.update({"myst_config": config})
        return md

    if config.gfm_only:
        # see https://github.github.com/gfm/
        md = MarkdownIt("commonmark", renderer_cls=renderer)
        # note, strikethrough currently only supported tentatively for HTML
        md.enable("strikethrough")
        md.enable("table")
        md.use(tasklists_plugin)
        md.enable("linkify")
        md.use(wordcount_plugin, per_minute=config.words_per_minute)
        md.options.update({"linkify": True, "myst_config": config})
        return md

    # the base MyST parser: same registration order as the extensions expect
    md = MarkdownIt("commonmark", renderer_cls=renderer)
    md.enable("table")
    for base_plugin in (
        front_matter_plugin,
        myst_block_plugin,
        myst_role_plugin,
        footnote_plugin,
    ):
        md.use(base_plugin)
    md.use(wordcount_plugin, per_minute=config.words_per_minute)
    md.disable("footnote_inline")
    # disable this for now, because it need a new implementation in the renderer
    md.disable("footnote_tail")

    extensions = config.enable_extensions

    # either of these two rules needs the "typographer" option switched on
    typographer = False
    for rule_name in ("smartquotes", "replacements"):
        if rule_name in extensions:
            md.enable(rule_name)
            typographer = True

    use_linkify = "linkify" in extensions
    if use_linkify:
        md.enable("linkify")
        if md.linkify is not None:
            md.linkify.set({"fuzzy_link": config.linkify_fuzzy_links})
    if "strikethrough" in extensions:
        md.enable("strikethrough")
    if "dollarmath" in extensions:
        md.use(
            dollarmath_plugin,
            allow_labels=config.dmath_allow_labels,
            allow_space=config.dmath_allow_space,
            allow_digits=config.dmath_allow_digits,
            double_inline=config.dmath_double_inline,
        )
    if "colon_fence" in extensions:
        md.use(colon_fence_plugin)
    if "amsmath" in extensions:
        md.use(amsmath_plugin)
    if "deflist" in extensions:
        md.use(deflist_plugin)
    if "fieldlist" in extensions:
        md.use(fieldlist_plugin)
    if "tasklist" in extensions:
        md.use(tasklists_plugin)
    if "substitution" in extensions:
        md.use(substitution_plugin, *config.sub_delimiters)
    if "attrs_image" in extensions:
        md.use(attrs_plugin, after=("image",))
    if config.heading_anchors is not None:
        md.use(
            anchors_plugin,
            max_level=config.heading_anchors,
            slug_func=config.heading_slug_func,
        )
    for syntax_name in config.disable_syntax:
        md.disable(syntax_name, True)

    md.options.update(
        {
            "typographer": typographer,
            "linkify": use_linkify,
            "myst_config": config,
        }
    )

    return md
class Attribute(dict):
    """This class holds the tags's attributes."""

    def __getitem__(self, key: str) -> str:
        """If self doesn't have the key it returns ''."""
        return self.get(key, "")

    @property
    def classes(self) -> list[str]:
        """Return 'class' attribute as list."""
        # a valueless attribute (``<div class>``) is stored with value None
        return (self["class"] or "").split()

    def __str__(self) -> str:
        """Return a htmlized representation for attributes."""
        # valueless attributes are written back bare, not as key="None"
        return " ".join(
            key if value is None else f'{key}="{value}"'
            for key, value in self.items()
        )


class Element(abc.MutableSequence):
    """An Element of the xml/html document.

    All xml/html entities inherit from this class.
    Elements compare (and hash) by identity.
    """

    def __init__(self, name: str = "", attr: dict | None = None) -> None:
        """Initialise the element."""
        self.name = name
        self.attrs: Attribute = Attribute(attr or {})
        self._parent: Element | None = None
        self._children: list[Element] = []

    @property
    def parent(self) -> Element | None:
        """Return parent."""
        return self._parent

    @property
    def children(self) -> list[Element]:
        """Return copy of children."""
        return self._children[:]

    def _detach_if_absent(self, items: Iterable[Element]) -> None:
        """Clear the parent of former children that are no longer held."""
        held = {id(child) for child in self._children}
        for item in items:
            if id(item) not in held and item._parent is self:
                item._parent = None

    def reset_children(self, children: list[Element], deepcopy: bool = False):
        """Replace all children, optionally with deep copies of the given items.

        :raises AssertionError: if an item already belongs to another parent
        """
        new_children = []
        for i, item in enumerate(children):
            assert isinstance(item, Element)
            if deepcopy:
                item = item.deepcopy()
            if item._parent is None:
                item._parent = self
            elif item._parent != self:
                raise AssertionError(f"different parent already set for item {i}")
            new_children.append(item)
        previous = self._children
        self._children = new_children
        self._detach_if_absent(previous)

    def __getitem__(self, index: int) -> Element:  # type: ignore[override]
        return self._children[index]

    def __setitem__(self, index: int, item: Element):  # type: ignore[override]
        assert isinstance(item, Element)
        if item._parent is not None and item._parent != self:
            raise AssertionError(f"different parent already set for: {item!r}")
        replaced = self._children[index]  # raises IndexError before mutating
        item._parent = self
        self._children[index] = item
        # the replaced child may now be attached to another element
        self._detach_if_absent([replaced])

    def __delitem__(self, index: int):  # type: ignore[override]
        removed = self._children[index]
        del self._children[index]
        # removed children may now be attached to another element
        self._detach_if_absent(removed if isinstance(index, slice) else [removed])

    def __len__(self) -> int:
        return self._children.__len__()

    def __iter__(self) -> Iterator[Element]:
        yield from self._children

    def insert(self, index: int, item: Element):
        assert isinstance(item, Element)
        if item._parent is not None and item._parent != self:
            raise AssertionError(f"different parent already set for: {item!r}")
        item._parent = self
        return self._children.insert(index, item)

    def deepcopy(self) -> Element:
        """Recursively copy and remove parent."""
        _copy = self.__class__(self.name, self.attrs)
        for child in self:
            _copy_child = child.deepcopy()
            _copy.append(_copy_child)
        return _copy

    def __repr__(self) -> str:
        text = f"{self.__class__.__name__}({self.name!r}"
        if self.attrs:
            text += f", {self.attrs!r}"
        text += ")"
        return text

    def render(
        self,
        tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
        **kwargs,
    ) -> str:
        """Returns a HTML string representation of the element.

        :param tag_overrides: Provide a dictionary of render function
            for specific tag names, to override the normal render format

        """
        raise NotImplementedError

    def __str__(self) -> str:
        return self.render()

    def __eq__(self, item: Any) -> bool:
        return item is self

    def __hash__(self) -> int:
        # defining __eq__ alone makes a class unhashable; equality is identity
        return id(self)

    def walk(self, include_self: bool = False) -> Iterator[Element]:
        """Walk through the xml/html AST."""
        if include_self:
            yield self
        for child in self:
            yield child
            yield from child.walk()

    def strip(self, inplace: bool = False, recurse: bool = False) -> Element:
        """Return copy with all `Data` tokens
        that only contain whitespace / newlines removed.
        """
        element = self
        if not inplace:
            element = self.deepcopy()
        element.reset_children(
            [
                e
                for e in element.children
                if not (isinstance(e, Data) and e.data.strip() == "")
            ]
        )
        if recurse:
            for child in element:
                child.strip(inplace=True, recurse=True)
        return element

    def find(
        self,
        identifier: str | type[Element],
        attrs: dict | None = None,
        classes: Iterable[str] | None = None,
        include_self: bool = False,
        recurse: bool = True,
    ) -> Iterator[Element]:
        """Find all elements that match name and specific attributes.

        :param identifier: a tag name, or an Element class to match instances of
        :param attrs: attribute values that must all be equal on the element
        :param classes: class names that must all be present on the element
        :param include_self: also test this element itself
        :param recurse: search all descendants rather than direct children only
        """
        iterator = self.walk() if recurse else self
        if include_self:
            iterator = itertools.chain([self], iterator)
        if inspect.isclass(identifier):
            test_func = lambda c: isinstance(c, identifier)  # noqa: E731
        else:
            test_func = lambda c: c.name == identifier  # noqa: E731
        classes = set(classes) if classes is not None else classes
        for child in iterator:
            if test_func(child):
                if classes is not None and not classes.issubset(child.attrs.classes):
                    continue
                for key, value in (attrs or {}).items():
                    if child.attrs[key] != value:
                        break
                else:
                    yield child


class Root(Element):
    """The root of the AST tree."""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        """Returns a string HTML representation of the structure."""
        return "".join(child.render(**kwargs) for child in self)


class Tag(Element):
    """Represent xml/html tags under the form: <name key="value" ...> ... </name>."""

    def render(
        self,
        tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
        **kwargs,
    ) -> str:
        if tag_overrides and self.name in tag_overrides:
            return tag_overrides[self.name](self, tag_overrides)
        return (
            f"<{self.name}{' ' if self.attrs else ''}{self.attrs}>"
            + "".join(
                child.render(tag_overrides=tag_overrides, **kwargs) for child in self
            )
            + f"</{self.name}>"
        )


class XTag(Element):
    """Represent XHTML style tags with no children, like `<img src="t.gif" />`"""

    def render(
        self,
        tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
        **kwargs,
    ) -> str:
        if tag_overrides and self.name in tag_overrides:
            return tag_overrides[self.name](self, tag_overrides)
        return f"<{self.name}{' ' if self.attrs else ''}{self.attrs}/>"


class VoidTag(Element):
    """Represent tags with no children, only start tag, like `<img src="t.gif" >`"""

    def render(
        self,
        tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
        **kwargs,
    ) -> str:
        # honour overrides in the same way as Tag and XTag
        if tag_overrides and self.name in tag_overrides:
            return tag_overrides[self.name](self, tag_overrides)
        return f"<{self.name}{' ' if self.attrs else ''}{self.attrs}>"


class TerminalElement(Element):
    """Base class for elements that hold a text payload in ``data``."""

    def __init__(self, data: str):
        super().__init__("")
        self.data: str = data

    def __repr__(self) -> str:
        text = self.data
        if len(text) > 20:
            text = text[:17] + "..."
        return f"{self.__class__.__name__}({text!r})"

    def deepcopy(self) -> TerminalElement:
        """Copy and remove parent."""
        _copy = self.__class__(self.data)
        return _copy


class Data(TerminalElement):
    """Represent data inside xml/html documents, like raw text."""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        return self.data


class Declaration(TerminalElement):
    """Represent declarations, like `<!DOCTYPE html>`"""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        return f"<!{self.data}>"


class Comment(TerminalElement):
    """Represent HTML comments"""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        return f"<!--{self.data}-->"


class Pi(TerminalElement):
    """Represent processing instructions like `<?xml-stylesheet ?>`"""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        return f"<?{self.data}>"


class Char(TerminalElement):
    """Represent character references; rendered as ``&#`` + data + ``;``."""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        return f"&#{self.data};"


class Entity(TerminalElement):
    """Represent entity references; rendered as ``&`` + data + ``;``."""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        return f"&{self.data};"


class Tree:
    """The engine class to generate the AST tree."""

    def __init__(self, name: str = ""):
        """Initialise Tree"""
        self.name = name
        self.outmost = Root(name)
        self.stack: deque = deque()
        self.stack.append(self.outmost)

    def clear(self):
        """Clear the outmost and stack for a new parsing."""
        self.outmost = Root(self.name)
        self.stack.clear()
        self.stack.append(self.outmost)

    def last(self) -> Element:
        """Return the last pointer which point to the actual tag scope."""
        return self.stack[-1]

    def nest_tag(self, name: str, attrs: dict):
        """Nest a given tag at the bottom of the tree using
        the last stack's pointer.
        """
        pointer = self.stack.pop()
        item = Tag(name, attrs)
        pointer.append(item)
        self.stack.append(pointer)
        self.stack.append(item)

    def nest_xtag(self, name: str, attrs: dict):
        """Nest an XTag onto the tree."""
        top = self.last()
        item = XTag(name, attrs)
        top.append(item)

    def nest_vtag(self, name: str, attrs: dict):
        """Nest a VoidTag onto the tree."""
        top = self.last()
        item = VoidTag(name, attrs)
        top.append(item)

    def nest_terminal(self, klass: type[TerminalElement], data: str):
        """Nest the data onto the tree."""
        top = self.last()
        item = klass(data)
        top.append(item)

    def enclose(self, name: str):
        """When a closing tag is found, pop the pointer's scope from the stack,
        to then point to the earlier scope's tag.

        A closing tag with no matching open tag is ignored.
        """
        # index 0 is the Root: it is never a candidate, so the stack cannot be
        # emptied by a closing tag that happens to share the root's name
        for depth in range(len(self.stack) - 1, 0, -1):
            if self.stack[depth].name == name:
                # pop the match and every unclosed scope nested inside it
                for _ in range(len(self.stack) - depth):
                    self.stack.pop()
                return


class HtmlToAst(HTMLParser):
    """The tokenizer class."""

    # see https://html.spec.whatwg.org/multipage/syntax.html#void-elements
    void_elements = {
        "area",
        "base",
        "br",
        "col",
        "embed",
        "hr",
        "img",
        "input",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
    }

    def __init__(self, name: str = "", convert_charrefs: bool = False):
        super().__init__(convert_charrefs=convert_charrefs)
        self.struct = Tree(name)

    def feed(self, source: str) -> Root:  # type: ignore[override]
        """Parse the source string."""
        self.struct.clear()
        super().feed(source)
        return self.struct.outmost

    def handle_starttag(self, name: str, attr):
        """When found an opening tag then nest it onto the tree."""
        if name in self.void_elements:
            self.struct.nest_vtag(name, attr)
        else:
            self.struct.nest_tag(name, attr)

    def handle_startendtag(self, name: str, attr):
        """When found a XHTML tag style then nest it up to the tree."""
        self.struct.nest_xtag(name, attr)

    def handle_endtag(self, name: str):
        """When found a closing tag then makes it point to the right scope."""
        if name not in self.void_elements:
            self.struct.enclose(name)

    def handle_data(self, data: str):
        """Nest data onto the tree."""
        self.struct.nest_terminal(Data, data)

    def handle_decl(self, decl: str):
        self.struct.nest_terminal(Declaration, decl)

    def unknown_decl(self, decl: str):
        self.struct.nest_terminal(Declaration, decl)

    def handle_charref(self, data: str):
        self.struct.nest_terminal(Char, data)

    def handle_entityref(self, data: str):
        self.struct.nest_terminal(Entity, data)

    def handle_pi(self, data: str):
        self.struct.nest_terminal(Pi, data)

    def handle_comment(self, data: str):
        self.struct.nest_terminal(Comment, data)


def tokenize_html(text: str, name: str = "", convert_charrefs: bool = False) -> Root:
    """Parse HTML text into an AST and return its root.

    :param text: the HTML source
    :param name: the name given to the returned ``Root``
    :param convert_charrefs: passed on to ``html.parser.HTMLParser``
    """
    parser = HtmlToAst(name, convert_charrefs=convert_charrefs)
    return parser.feed(text)
= ("md", "markdown", "myst") + """Aliases this parser supports.""" + + settings_spec = RstParser.settings_spec + """Runtime settings specification. + + Defines runtime settings and associated command-line options, as used by + `docutils.frontend.OptionParser`. This is a concatenation of tuples of: + + - Option group title (string or `None` which implies no group, just a list + of single options). + + - Description (string or `None`). + + - A sequence of option tuples + """ + + config_section = "myst parser" + config_section_dependencies = ("parsers",) + translate_section_name = None + + def parse(self, inputstring: str, document: nodes.document) -> None: + """Parse source text. + + :param inputstring: The source string to parse + :param document: The root docutils node to add AST elements to + + """ + # get the global config + config: MdParserConfig = document.settings.env.myst_config + + # update the global config with the file-level config + try: + topmatter = read_topmatter(inputstring) + except TopmatterReadError: + pass # this will be reported during the render + else: + if topmatter: + warning = lambda wtype, msg: create_warning( # noqa: E731 + document, msg, line=1, append_to=document, subtype=wtype + ) + config = merge_file_level(config, topmatter, warning) + + parser = create_md_parser(config, SphinxRenderer) + parser.options["document"] = document + parser.render(inputstring) diff --git a/myst_parser/py.typed b/myst_parser/py.typed new file mode 100644 index 0000000..7632ecf --- /dev/null +++ b/myst_parser/py.typed @@ -0,0 +1 @@ +# Marker file for PEP 561 diff --git a/myst_parser/sphinx_.py b/myst_parser/sphinx_.py new file mode 100644 index 0000000..b085086 --- /dev/null +++ b/myst_parser/sphinx_.py @@ -0,0 +1,6 @@ +"""A module for compatibility with the docutils>=0.17 `include` directive, in RST documents:: + + .. 
include:: path/to/file.md + :parser: myst_parser.sphinx_ +""" +from myst_parser.parsers.sphinx_ import MystParser as Parser # noqa: F401 diff --git a/myst_parser/sphinx_ext/__init__.py b/myst_parser/sphinx_ext/__init__.py new file mode 100644 index 0000000..1bfeb71 --- /dev/null +++ b/myst_parser/sphinx_ext/__init__.py @@ -0,0 +1 @@ +"""Sphinx extension for myst_parser.""" diff --git a/myst_parser/sphinx_ext/directives.py b/myst_parser/sphinx_ext/directives.py new file mode 100644 index 0000000..39ca2c6 --- /dev/null +++ b/myst_parser/sphinx_ext/directives.py @@ -0,0 +1,136 @@ +"""MyST specific directives""" +from copy import copy +from typing import List, Tuple, cast + +from docutils import nodes +from docutils.parsers.rst import directives +from sphinx.directives import SphinxDirective +from sphinx.util.docutils import SphinxRole + +from myst_parser.mocking import MockState + + +def align(argument): + return directives.choice(argument, ("left", "center", "right")) + + +def figwidth_value(argument): + if argument.lower() == "image": + return "image" + else: + return directives.length_or_percentage_or_unitless(argument, "px") + + +class SubstitutionReferenceRole(SphinxRole): + """Implement substitution references as a role. + + Note, in ``docutils/parsers/rst/roles.py`` this is left unimplemented. + """ + + def run(self) -> Tuple[List[nodes.Node], List[nodes.system_message]]: + subref_node = nodes.substitution_reference(self.rawtext, self.text) + self.set_source_info(subref_node, self.lineno) + subref_node["refname"] = nodes.fully_normalize_name(self.text) + return [subref_node], [] + + +class FigureMarkdown(SphinxDirective): + """Directive for creating a figure with Markdown compatible syntax. 
+ + Example:: + + :::{figure-md} target + <img src="img/fun-fish.png" alt="fishy" class="bg-primary mb-1" width="200px"> + + This is a caption in **Markdown** + ::: + + """ + + required_arguments = 0 + optional_arguments = 1 # image target + final_argument_whitespace = True + has_content = True + + option_spec = { + "width": figwidth_value, + "class": directives.class_option, + "align": align, + "name": directives.unchanged, + } + + def run(self) -> List[nodes.Node]: + figwidth = self.options.pop("width", None) + figclasses = self.options.pop("class", None) + align = self.options.pop("align", None) + + if not isinstance(self.state, MockState): + return [self.figure_error("Directive is only supported in myst parser")] + state = cast(MockState, self.state) + + # ensure html image enabled + myst_extensions = copy(state._renderer.md_config.enable_extensions) + node = nodes.Element() + try: + state._renderer.md_config.enable_extensions = list( + state._renderer.md_config.enable_extensions + ) + ["html_image"] + state.nested_parse(self.content, self.content_offset, node) + finally: + state._renderer.md_config.enable_extensions = myst_extensions + + if not len(node.children) == 2: + return [ + self.figure_error( + "content should be one image, " + "followed by a single paragraph caption" + ) + ] + + image_node, caption_para = node.children + if isinstance(image_node, nodes.paragraph): + image_node = image_node[0] + + if not isinstance(image_node, nodes.image): + return [ + self.figure_error( + "content should be one image (not found), " + "followed by single paragraph caption" + ) + ] + + if not isinstance(caption_para, nodes.paragraph): + return [ + self.figure_error( + "content should be one image, " + "followed by single paragraph caption (not found)" + ) + ] + + caption_node = nodes.caption(caption_para.rawsource, "", *caption_para.children) + caption_node.source = caption_para.source + caption_node.line = caption_para.line + + figure_node = nodes.figure("", 
image_node, caption_node) + self.set_source_info(figure_node) + + if figwidth is not None: + figure_node["width"] = figwidth + if figclasses: + figure_node["classes"] += figclasses + if align: + figure_node["align"] = align + if self.arguments: + self.options["name"] = self.arguments[0] + self.add_name(figure_node) + + return [figure_node] + + def figure_error(self, message): + """A warning for reporting an invalid figure.""" + error = self.state_machine.reporter.error( + message, + nodes.literal_block(self.block_text, self.block_text), + line=self.lineno, + ) + return error diff --git a/myst_parser/sphinx_ext/main.py b/myst_parser/sphinx_ext/main.py new file mode 100644 index 0000000..f5aeffc --- /dev/null +++ b/myst_parser/sphinx_ext/main.py @@ -0,0 +1,60 @@ +"""The setup for the sphinx extension.""" +from typing import Any + +from sphinx.application import Sphinx + + +def setup_sphinx(app: Sphinx, load_parser=False): + """Initialize all settings and transforms in Sphinx.""" + # we do this separately to setup, + # so that it can be called by external packages like myst_nb + from myst_parser.config.main import MdParserConfig + from myst_parser.parsers.sphinx_ import MystParser + from myst_parser.sphinx_ext.directives import ( + FigureMarkdown, + SubstitutionReferenceRole, + ) + from myst_parser.sphinx_ext.mathjax import override_mathjax + from myst_parser.sphinx_ext.myst_refs import MystReferenceResolver + + if load_parser: + app.add_source_suffix(".md", "markdown") + app.add_source_parser(MystParser) + + app.add_role("sub-ref", SubstitutionReferenceRole()) + app.add_directive("figure-md", FigureMarkdown) + + app.add_post_transform(MystReferenceResolver) + + for name, default, field in MdParserConfig().as_triple(): + if not field.metadata.get("docutils_only", False): + # TODO add types? 
+ app.add_config_value(f"myst_{name}", default, "env", types=Any) + + app.connect("builder-inited", create_myst_config) + app.connect("builder-inited", override_mathjax) + + +def create_myst_config(app): + from sphinx.util import logging + + # Ignore type checkers because the attribute is dynamically assigned + from sphinx.util.console import bold # type: ignore[attr-defined] + + from myst_parser import __version__ + from myst_parser.config.main import MdParserConfig + + logger = logging.getLogger(__name__) + + values = { + name: app.config[f"myst_{name}"] + for name, _, field in MdParserConfig().as_triple() + if not field.metadata.get("docutils_only", False) + } + + try: + app.env.myst_config = MdParserConfig(**values) + logger.info(bold("myst v%s:") + " %s", __version__, app.env.myst_config) + except (TypeError, ValueError) as error: + logger.error("myst configuration invalid: %s", error.args[0]) + app.env.myst_config = MdParserConfig() diff --git a/myst_parser/sphinx_ext/mathjax.py b/myst_parser/sphinx_ext/mathjax.py new file mode 100644 index 0000000..260f008 --- /dev/null +++ b/myst_parser/sphinx_ext/mathjax.py @@ -0,0 +1,118 @@ +"""Overrides to ``sphinx.ext.mathjax`` + +This fixes two issues: + +1. Mathjax should not search for ``$`` delimiters, nor LaTeX amsmath environments, + since we already achieve this with the dollarmath and amsmath mrakdown-it-py plugins +2. 
amsmath math blocks should be wrapped in mathjax delimiters (default ``\\[...\\]``), + and assigned an equation number + +""" +from docutils import nodes +from sphinx.application import Sphinx +from sphinx.ext import mathjax +from sphinx.locale import _ +from sphinx.util import logging +from sphinx.util.math import get_node_equation_number +from sphinx.writers.html import HTMLTranslator + +logger = logging.getLogger(__name__) + + +def log_override_warning(app: Sphinx, version: int, current: str, new: str) -> None: + """Log a warning if MathJax configuration being overridden.""" + if logging.is_suppressed_warning("myst", "mathjax", app.config.suppress_warnings): + return + config_name = ( + "mathjax3_config['options']['processHtmlClass']" + if version == 3 + else "mathjax_config['tex2jax']['processClass']" + ) + logger.warning( + f"`{config_name}` is being overridden by myst-parser: '{current}' -> '{new}'. " + "Set `suppress_warnings=['myst.mathjax']` to ignore this warning, or " + "`myst_update_mathjax=False` if this is undesirable." + ) + + +def override_mathjax(app: Sphinx): + """Override aspects of the mathjax extension. + + MyST-Parser parses dollar and latex math, via markdown-it plugins. + Therefore, we tell Mathjax to only render these HTML elements. + This is accompanied by setting the `ignoreClass` on the top-level section of each MyST document. 
+ """ + if ( + "amsmath" in app.config["myst_enable_extensions"] + and "mathjax" in app.registry.html_block_math_renderers + ): + app.registry.html_block_math_renderers["mathjax"] = ( + html_visit_displaymath, # type: ignore[assignment] + None, + ) + + if "dollarmath" not in app.config["myst_enable_extensions"]: + return + if not app.env.myst_config.update_mathjax: # type: ignore + return + + mjax_classes = app.env.myst_config.mathjax_classes # type: ignore + + if "mathjax3_config" in app.config: + # sphinx 4 + mathjax 3 + app.config.mathjax3_config = app.config.mathjax3_config or {} # type: ignore + app.config.mathjax3_config.setdefault("options", {}) + if ( + "processHtmlClass" in app.config.mathjax3_config["options"] + and app.config.mathjax3_config["options"]["processHtmlClass"] + != mjax_classes + ): + log_override_warning( + app, + 3, + app.config.mathjax3_config["options"]["processHtmlClass"], + mjax_classes, + ) + app.config.mathjax3_config["options"]["processHtmlClass"] = mjax_classes + elif "mathjax_config" in app.config: + # sphinx 3 + mathjax 2 + app.config.mathjax_config = app.config.mathjax_config or {} # type: ignore[attr-defined] + app.config.mathjax_config.setdefault("tex2jax", {}) + if ( + "processClass" in app.config.mathjax_config["tex2jax"] + and app.config.mathjax_config["tex2jax"]["processClass"] != mjax_classes + ): + log_override_warning( + app, + 2, + app.config.mathjax_config["tex2jax"]["processClass"], + mjax_classes, + ) + app.config.mathjax_config["tex2jax"]["processClass"] = mjax_classes + + +def html_visit_displaymath(self: HTMLTranslator, node: nodes.math_block) -> None: + """Override for sphinx.ext.mathjax.html_visit_displaymath to handle amsmath. + + By default displaymath, are normally wrapped in a prefix/suffix, + defined by mathjax_display, and labelled nodes are numbered. + However, this is not the case if the math_block is set as 'nowrap', as for amsmath. + Therefore, we need to override this behaviour. 
+ """ + if "amsmath" in node.get("classes", []): + self.body.append( + self.starttag(node, "div", CLASS="math notranslate nohighlight amsmath") + ) + if node["number"]: + number = get_node_equation_number(self, node) + self.body.append('<span class="eqno">(%s)' % number) + self.add_permalink_ref(node, _("Permalink to this equation")) + self.body.append("</span>") + prefix, suffix = self.builder.config.mathjax_display + self.body.append(prefix) + self.body.append(self.encode(node.astext())) + self.body.append(suffix) + self.body.append("</div>\n") + raise nodes.SkipNode + + return mathjax.html_visit_displaymath(self, node) diff --git a/myst_parser/sphinx_ext/myst_refs.py b/myst_parser/sphinx_ext/myst_refs.py new file mode 100644 index 0000000..f364345 --- /dev/null +++ b/myst_parser/sphinx_ext/myst_refs.py @@ -0,0 +1,282 @@ +"""A post-transform for overriding the behaviour of sphinx reference resolution. + +This is applied to MyST type references only, such as ``[text](target)``, +and allows for nested syntax +""" +import os +from typing import Any, List, Optional, Tuple, cast + +from docutils import nodes +from docutils.nodes import Element, document +from sphinx import addnodes, version_info +from sphinx.addnodes import pending_xref +from sphinx.domains.std import StandardDomain +from sphinx.locale import __ +from sphinx.transforms.post_transforms import ReferencesResolver +from sphinx.util import docname_join, logging +from sphinx.util.nodes import clean_astext, make_refnode + +from myst_parser._compat import findall + +try: + from sphinx.errors import NoUri +except ImportError: + # sphinx < 2.1 + from sphinx.environment import NoUri # type: ignore + +logger = logging.getLogger(__name__) + + +class MystReferenceResolver(ReferencesResolver): + """Resolves cross-references on doctrees. 
+ + Overrides default sphinx implementation, to allow for nested syntax + """ + + default_priority = 9 # higher priority than ReferencesResolver (10) + + def run(self, **kwargs: Any) -> None: + self.document: document + for node in findall(self.document)(addnodes.pending_xref): + if node["reftype"] != "myst": + continue + + contnode = cast(nodes.TextElement, node[0].deepcopy()) + newnode = None + + target = node["reftarget"] + refdoc = node.get("refdoc", self.env.docname) + domain = None + + try: + newnode = self.resolve_myst_ref(refdoc, node, contnode) + if newnode is None: + # no new node found? try the missing-reference event + # but first we change the the reftype to 'any' + # this means it is picked up by extensions like intersphinx + node["reftype"] = "any" + try: + newnode = self.app.emit_firstresult( + "missing-reference", + self.env, + node, + contnode, + **( + {"allowed_exceptions": (NoUri,)} + if version_info[0] > 2 + else {} + ), + ) + finally: + node["reftype"] = "myst" + # still not found? warn if node wishes to be warned about or + # we are in nit-picky mode + if newnode is None: + node["refdomain"] = "" + # TODO ideally we would override the warning message here, + # to show the [ref.myst] for suppressing warning + self.warn_missing_reference( + refdoc, node["reftype"], target, node, domain + ) + except NoUri: + newnode = contnode + + node.replace_self(newnode or contnode) + + def resolve_myst_ref( + self, refdoc: str, node: pending_xref, contnode: Element + ) -> Element: + """Resolve reference generated by the "myst" role; ``[text](reference)``. 
+ + This builds on the sphinx ``any`` role to also resolve: + + - Document references with extensions; ``[text](./doc.md)`` + - Document references with anchors with anchors; ``[text](./doc.md#target)`` + - Nested syntax for explicit text with std:doc and std:ref; + ``[**nested**](reference)`` + + """ + target = node["reftarget"] # type: str + results = [] # type: List[Tuple[str, Element]] + + res_anchor = self._resolve_anchor(node, refdoc) + if res_anchor: + results.append(("std:doc", res_anchor)) + else: + # if we've already found an anchored doc, + # don't search in the std:ref/std:doc (leads to duplication) + + # resolve standard references + res = self._resolve_ref_nested(node, refdoc) + if res: + results.append(("std:ref", res)) + + # resolve doc names + res = self._resolve_doc_nested(node, refdoc) + if res: + results.append(("std:doc", res)) + + # get allowed domains for referencing + ref_domains = self.env.config.myst_ref_domains + + assert self.app.builder + + # next resolve for any other standard reference objects + if ref_domains is None or "std" in ref_domains: + stddomain = cast(StandardDomain, self.env.get_domain("std")) + for objtype in stddomain.object_types: + key = (objtype, target) + if objtype == "term": + key = (objtype, target.lower()) + if key in stddomain.objects: + docname, labelid = stddomain.objects[key] + domain_role = "std:" + stddomain.role_for_objtype(objtype) + ref_node = make_refnode( + self.app.builder, refdoc, docname, labelid, contnode + ) + results.append((domain_role, ref_node)) + + # finally resolve for any other type of allowed reference domain + for domain in self.env.domains.values(): + if domain.name == "std": + continue # we did this one already + if ref_domains is not None and domain.name not in ref_domains: + continue + try: + results.extend( + domain.resolve_any_xref( + self.env, refdoc, self.app.builder, target, node, contnode + ) + ) + except NotImplementedError: + # the domain doesn't yet support the new interface + 
# we have to manually collect possible references (SLOW) + if not (getattr(domain, "__module__", "").startswith("sphinx.")): + logger.warning( + f"Domain '{domain.__module__}::{domain.name}' has not " + "implemented a `resolve_any_xref` method [myst.domains]", + type="myst", + subtype="domains", + once=True, + ) + for role in domain.roles: + res = domain.resolve_xref( + self.env, refdoc, self.app.builder, role, target, node, contnode + ) + if res and len(res) and isinstance(res[0], nodes.Element): + results.append((f"{domain.name}:{role}", res)) + + # now, see how many matches we got... + if not results: + return None + if len(results) > 1: + + def stringify(name, node): + reftitle = node.get("reftitle", node.astext()) + return f":{name}:`{reftitle}`" + + candidates = " or ".join(stringify(name, role) for name, role in results) + logger.warning( + __( + f"more than one target found for 'myst' cross-reference {target}: " + f"could be {candidates} [myst.ref]" + ), + location=node, + type="myst", + subtype="ref", + ) + + res_role, newnode = results[0] + # Override "myst" class with the actual role type to get the styling + # approximately correct. 
+ res_domain = res_role.split(":")[0] + if len(newnode) > 0 and isinstance(newnode[0], nodes.Element): + newnode[0]["classes"] = newnode[0].get("classes", []) + [ + res_domain, + res_role.replace(":", "-"), + ] + + return newnode + + def _resolve_anchor( + self, node: pending_xref, fromdocname: str + ) -> Optional[Element]: + """Resolve doc with anchor.""" + if self.env.config.myst_heading_anchors is None: + # no target anchors will have been created, so we don't look for them + return None + target = node["reftarget"] # type: str + if "#" not in target: + return None + # the link may be a heading anchor; we need to first get the relative path + rel_path, anchor = target.rsplit("#", 1) + rel_path = os.path.normpath(rel_path) + if rel_path == ".": + # anchor in the same doc as the node + doc_path = self.env.doc2path(node.get("refdoc", fromdocname), base=False) + else: + # anchor in a different doc from the node + doc_path = os.path.normpath( + os.path.join(node.get("refdoc", fromdocname), "..", rel_path) + ) + return self._resolve_ref_nested(node, fromdocname, doc_path + "#" + anchor) + + def _resolve_ref_nested( + self, node: pending_xref, fromdocname: str, target=None + ) -> Optional[Element]: + """This is the same as ``sphinx.domains.std._resolve_ref_xref``, + but allows for nested syntax, rather than converting the inner node to raw text. 
+ """ + stddomain = cast(StandardDomain, self.env.get_domain("std")) + target = target or node["reftarget"].lower() + + if node["refexplicit"]: + # reference to anonymous label; the reference uses + # the supplied link caption + docname, labelid = stddomain.anonlabels.get(target, ("", "")) + sectname = node.astext() + innernode = nodes.inline(sectname, "") + innernode.extend(node[0].children) + else: + # reference to named label; the final node will + # contain the section name after the label + docname, labelid, sectname = stddomain.labels.get(target, ("", "", "")) + innernode = nodes.inline(sectname, sectname) + + if not docname: + return None + + assert self.app.builder + return make_refnode(self.app.builder, fromdocname, docname, labelid, innernode) + + def _resolve_doc_nested( + self, node: pending_xref, fromdocname: str + ) -> Optional[Element]: + """This is the same as ``sphinx.domains.std._resolve_doc_xref``, + but allows for nested syntax, rather than converting the inner node to raw text. + + It also allows for extensions on document names. + """ + # directly reference to document by source name; can be absolute or relative + refdoc = node.get("refdoc", fromdocname) + docname = docname_join(refdoc, node["reftarget"]) + + if docname not in self.env.all_docs: + # try stripping known extensions from doc name + if os.path.splitext(docname)[1] in self.env.config.source_suffix: + docname = os.path.splitext(docname)[0] + if docname not in self.env.all_docs: + return None + + if node["refexplicit"]: + # reference with explicit title + caption = node.astext() + innernode = nodes.inline(caption, "", classes=["doc"]) + innernode.extend(node[0].children) + else: + # TODO do we want nested syntax for titles? + caption = clean_astext(self.env.titles[docname]) + innernode = nodes.inline(caption, caption, classes=["doc"]) + + assert self.app.builder + return make_refnode(self.app.builder, fromdocname, docname, "", innernode) |