author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-29 04:23:02 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-29 04:23:02 +0000
commit    943e3dc057eca53e68ddec51529bd6a1279ebd8e (patch)
tree      61fb7bac619a56dfbcdcbdb7b0d4d6535fc36fe9 /myst_parser
parent    Initial commit. (diff)

Adding upstream version 0.18.1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'myst_parser')
-rw-r--r--  myst_parser/__init__.py                          10
-rw-r--r--  myst_parser/_compat.py                           11
-rw-r--r--  myst_parser/_docs.py                            198
-rw-r--r--  myst_parser/cli.py                               42
-rw-r--r--  myst_parser/config/__init__.py                    1
-rw-r--r--  myst_parser/config/dc_validators.py             161
-rw-r--r--  myst_parser/config/main.py                      409
-rw-r--r--  myst_parser/docutils_.py                          6
-rw-r--r--  myst_parser/mdit_to_docutils/__init__.py          1
-rw-r--r--  myst_parser/mdit_to_docutils/base.py           1483
-rw-r--r--  myst_parser/mdit_to_docutils/html_to_nodes.py   139
-rw-r--r--  myst_parser/mdit_to_docutils/sphinx_.py         245
-rw-r--r--  myst_parser/mdit_to_docutils/utils.py            36
-rw-r--r--  myst_parser/mocking.py                          514
-rw-r--r--  myst_parser/parsers/__init__.py                   1
-rw-r--r--  myst_parser/parsers/directives.py               190
-rw-r--r--  myst_parser/parsers/docutils_.py                275
-rw-r--r--  myst_parser/parsers/mdit.py                     123
-rw-r--r--  myst_parser/parsers/parse_html.py               440
-rw-r--r--  myst_parser/parsers/sphinx_.py                   69
-rw-r--r--  myst_parser/py.typed                              1
-rw-r--r--  myst_parser/sphinx_.py                            6
-rw-r--r--  myst_parser/sphinx_ext/__init__.py                1
-rw-r--r--  myst_parser/sphinx_ext/directives.py            136
-rw-r--r--  myst_parser/sphinx_ext/main.py                   60
-rw-r--r--  myst_parser/sphinx_ext/mathjax.py               118
-rw-r--r--  myst_parser/sphinx_ext/myst_refs.py             282
27 files changed, 4958 insertions, 0 deletions
diff --git a/myst_parser/__init__.py b/myst_parser/__init__.py
new file mode 100644
index 0000000..56dd460
--- /dev/null
+++ b/myst_parser/__init__.py
@@ -0,0 +1,10 @@
+"""An extended commonmark compliant parser, with bridges to docutils & sphinx."""
+__version__ = "0.18.1"
+
+
+def setup(app):
+ """Initialize the Sphinx extension."""
+ from myst_parser.sphinx_ext.main import setup_sphinx
+
+ setup_sphinx(app, load_parser=True)
+ return {"version": __version__, "parallel_read_safe": True}
diff --git a/myst_parser/_compat.py b/myst_parser/_compat.py
new file mode 100644
index 0000000..d29cf4d
--- /dev/null
+++ b/myst_parser/_compat.py
@@ -0,0 +1,11 @@
+"""Helpers for cross compatibility across dependency versions."""
+from typing import Callable, Iterable
+
+from docutils.nodes import Element
+
+
+def findall(node: Element) -> Callable[..., Iterable[Element]]:
+ """Iterate through"""
+ # findall replaces traverse in docutils v0.18
+ # note a difference is that findall is an iterator
+ return getattr(node, "findall", node.traverse)
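
To illustrate the shim, a hedged sketch of walking a document tree in a way that works on either side of docutils v0.18 (the document construction here is illustrative only):

```python
from docutils import nodes
from myst_parser._compat import findall
from myst_parser.mdit_to_docutils.base import make_document

doc = make_document()
doc.append(
    nodes.paragraph("", "", nodes.reference("", "link", refuri="https://example.com"))
)

# findall(doc) resolves to doc.findall (docutils >=0.18) or doc.traverse (older);
# both accept a node class as a filter condition
for ref in findall(doc)(nodes.reference):
    print(ref.get("refuri"))
```
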
diff --git a/myst_parser/_docs.py b/myst_parser/_docs.py
new file mode 100644
index 0000000..a7c46a3
--- /dev/null
+++ b/myst_parser/_docs.py
@@ -0,0 +1,198 @@
+"""Code to use internally, for documentation."""
+from __future__ import annotations
+
+import io
+from typing import Sequence, Union
+
+from docutils import nodes
+from docutils.frontend import OptionParser
+from docutils.parsers.rst import directives
+from sphinx.directives import other
+from sphinx.util import logging
+from sphinx.util.docutils import SphinxDirective
+from typing_extensions import get_args, get_origin
+
+from .config.main import MdParserConfig
+from .parsers.docutils_ import Parser as DocutilsParser
+
+logger = logging.getLogger(__name__)
+
+
+class _ConfigBase(SphinxDirective):
+ """Directive to automate rendering of the configuration."""
+
+ @staticmethod
+ def table_header():
+ return [
+ "```````{list-table}",
+ ":header-rows: 1",
+ ":widths: 15 10 20",
+ "",
+ "* - Name",
+ " - Type",
+ " - Description",
+ ]
+
+ @staticmethod
+ def field_default(value):
+ default = " ".join(f"{value!r}".splitlines())
+ return default
+
+ @staticmethod
+ def field_type(field):
+ ftypes: Sequence[str]
+ if get_origin(field.type) is Union:
+ ftypes = get_args(field.type)
+ else:
+ ftypes = [field.type]
+ ctype = " | ".join(
+ str("None" if ftype == type(None) else ftype) # type: ignore # noqa: E721
+ for ftype in ftypes
+ )
+ ctype = " ".join(ctype.splitlines())
+ ctype = ctype.replace("typing.", "")
+ ctype = ctype.replace("typing_extensions.", "")
+ for tname in ("str", "int", "float", "bool"):
+ ctype = ctype.replace(f"<class '{tname}'>", tname)
+ return ctype
+
+
+class MystConfigDirective(_ConfigBase):
+
+ option_spec = {
+ "sphinx": directives.flag,
+ "extensions": directives.flag,
+ "scope": lambda x: directives.choice(x, ["global", "local"]),
+ }
+
+ def run(self):
+ """Run the directive."""
+ config = MdParserConfig()
+ text = self.table_header()
+ count = 0
+ for name, value, field in config.as_triple():
+
+ # filter by sphinx options
+ if "sphinx" in self.options and field.metadata.get("sphinx_exclude"):
+ continue
+
+ if "extensions" in self.options:
+ if not field.metadata.get("extension"):
+ continue
+ else:
+ if field.metadata.get("extension"):
+ continue
+
+ if self.options.get("scope") == "local":
+ if field.metadata.get("global_only"):
+ continue
+
+ if self.options.get("scope") == "global":
+ name = f"myst_{name}"
+
+ description = " ".join(field.metadata.get("help", "").splitlines())
+ if field.metadata.get("extension"):
+ description = f"{field.metadata.get('extension')}: {description}"
+ default = self.field_default(value)
+ ctype = self.field_type(field)
+ text.extend(
+ [
+ f"* - `{name}`",
+ f" - `{ctype}`",
+ f" - {description} (default: `{default}`)",
+ ]
+ )
+
+ count += 1
+
+ if not count:
+ return []
+
+ text.append("```````")
+ node = nodes.Element()
+ self.state.nested_parse(text, 0, node)
+ return node.children
+
+
+class DocutilsCliHelpDirective(SphinxDirective):
+ """Directive to print the docutils CLI help."""
+
+ has_content = False
+ required_arguments = 0
+ optional_arguments = 0
+ final_argument_whitespace = False
+
+ def run(self):
+ """Run the directive."""
+ stream = io.StringIO()
+ OptionParser(
+ components=(DocutilsParser,),
+ usage="myst-docutils-<writer> [options] [<source> [<destination>]]",
+ ).print_help(stream)
+ return [nodes.literal_block("", stream.getvalue())]
+
+
+class DirectiveDoc(SphinxDirective):
+ """Load and document a directive."""
+
+ required_arguments = 1 # name of the directive
+ has_content = True
+
+ def run(self):
+ """Run the directive."""
+ name = self.arguments[0]
+ # load the directive class
+ klass, _ = directives.directive(
+ name, self.state.memo.language, self.state.document
+ )
+ if klass is None:
+ logger.warning(f"Directive {name} not found.", line=self.lineno)
+ return []
+ content = " ".join(self.content)
+ text = f"""\
+:Name: `{name}`
+:Description: {content}
+:Arguments: {klass.required_arguments} required, {klass.optional_arguments} optional
+:Content: {'yes' if klass.has_content else 'no'}
+:Options:
+"""
+ if klass.option_spec:
+ text += " name | type\n -----|------\n"
+ for key, func in klass.option_spec.items():
+ text += f" {key} | {convert_opt(name, func)}\n"
+ node = nodes.Element()
+ self.state.nested_parse(text.splitlines(), 0, node)
+ return node.children
+
+
+def convert_opt(name, func):
+ """Convert an option function to a string."""
+ if func is directives.flag:
+ return "flag"
+ if func is directives.unchanged:
+ return "text"
+ if func is directives.unchanged_required:
+ return "text"
+ if func is directives.class_option:
+ return "space-delimited list"
+ if func is directives.uri:
+ return "URI"
+ if func is directives.path:
+ return "path"
+ if func is int:
+ return "integer"
+ if func is directives.positive_int:
+ return "integer (positive)"
+ if func is directives.nonnegative_int:
+ return "integer (non-negative)"
+ if func is directives.positive_int_list:
+ return "space/comma-delimited list of integers (positive)"
+ if func is directives.percentage:
+ return "percentage"
+ if func is directives.length_or_unitless:
+ return "length or unitless"
+ if func is directives.length_or_percentage_or_unitless:
+ return "length, percentage or unitless"
+ if func is other.int_or_nothing:
+ return "integer"
+ return ""
diff --git a/myst_parser/cli.py b/myst_parser/cli.py
new file mode 100644
index 0000000..b9bb1ba
--- /dev/null
+++ b/myst_parser/cli.py
@@ -0,0 +1,42 @@
+import argparse
+import sys
+
+from markdown_it.renderer import RendererHTML
+
+from myst_parser.config.main import MdParserConfig
+from myst_parser.parsers.mdit import create_md_parser
+
+
+def print_anchors(args=None):
+ """ """
+ arg_parser = argparse.ArgumentParser()
+ arg_parser.add_argument(
+ "input",
+ nargs="?",
+ type=argparse.FileType("r", encoding="utf8"),
+ default=sys.stdin,
+ help="Input file (default stdin)",
+ )
+ arg_parser.add_argument(
+ "-o",
+ "--output",
+ type=argparse.FileType("w", encoding="utf8"),
+ default=sys.stdout,
+ help="Output file (default stdout)",
+ )
+ arg_parser.add_argument(
+ "-l", "--level", type=int, default=2, help="Maximum heading level."
+ )
+ args = arg_parser.parse_args(args)
+ parser = create_md_parser(MdParserConfig(heading_anchors=args.level), RendererHTML)
+
+ def _filter_plugin(state):
+ state.tokens = [
+ t
+ for t in state.tokens
+ if t.type.startswith("heading_") and int(t.tag[1]) <= args.level
+ ]
+
+ parser.use(lambda p: p.core.ruler.push("filter", _filter_plugin))
+ text = parser.render(args.input.read())
+ args.output.write(text)
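
`print_anchors` backs a console entry point, but it can also be driven directly from Python; a short sketch (file names are placeholders):

```python
# roughly equivalent to the CLI: myst-anchors -l 3 README.md -o anchors.html
from myst_parser.cli import print_anchors

print_anchors(["README.md", "-l", "3", "-o", "anchors.html"])
```
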
diff --git a/myst_parser/config/__init__.py b/myst_parser/config/__init__.py
new file mode 100644
index 0000000..898f9ce
--- /dev/null
+++ b/myst_parser/config/__init__.py
@@ -0,0 +1 @@
+"""This module holds the global configuration for the parser ``MdParserConfig``."""
diff --git a/myst_parser/config/dc_validators.py b/myst_parser/config/dc_validators.py
new file mode 100644
index 0000000..765cfb9
--- /dev/null
+++ b/myst_parser/config/dc_validators.py
@@ -0,0 +1,161 @@
+"""Validators for dataclasses, mirroring those of https://github.com/python-attrs/attrs."""
+from __future__ import annotations
+
+import dataclasses as dc
+from typing import Any, Sequence
+
+from typing_extensions import Protocol
+
+
+def validate_field(inst: Any, field: dc.Field, value: Any) -> None:
+ """Validate the field of a dataclass,
+ according to a `validator` function set in the field.metadata.
+
+ The validator function should take as input (inst, field, value) and
+ raise an exception if the value is invalid.
+ """
+ if "validator" not in field.metadata:
+ return
+ if isinstance(field.metadata["validator"], list):
+ for validator in field.metadata["validator"]:
+ validator(inst, field, value)
+ else:
+ field.metadata["validator"](inst, field, value)
+
+
+def validate_fields(inst: Any) -> None:
+ """Validate the fields of a dataclass,
+ according to `validator` functions set in the field metadata.
+
+ This function should be called in the `__post_init__` of the dataclass.
+
+ The validator function should take as input (inst, field, value) and
+ raise an exception if the value is invalid.
+ """
+ for field in dc.fields(inst):
+ validate_field(inst, field, getattr(inst, field.name))
+
+
+class ValidatorType(Protocol):
+ def __call__(
+ self, inst: Any, field: dc.Field, value: Any, suffix: str = ""
+ ) -> None:
+ ...
+
+
+def instance_of(type: type[Any] | tuple[type[Any], ...]) -> ValidatorType:
+ """
+ A validator that raises a `TypeError` if the initializer is called
+ with a wrong type for this particular attribute (checks are performed using
+ `isinstance` therefore it's also valid to pass a tuple of types).
+
+ :param type: The type to check for.
+ """
+
+ def _validator(inst, field, value, suffix=""):
+ """
+ We use a callable class to be able to change the ``__repr__``.
+ """
+ if not isinstance(value, type):
+ raise TypeError(
+ f"'{field.name}{suffix}' must be of type {type!r} "
+ f"(got {value!r} that is a {value.__class__!r})."
+ )
+
+ return _validator
+
+
+def optional(validator: ValidatorType) -> ValidatorType:
+ """
+ A validator that makes an attribute optional. An optional attribute is one
+ which can be set to ``None`` in addition to satisfying the requirements of
+ the sub-validator.
+ """
+
+ def _validator(inst, field, value, suffix=""):
+ if value is None:
+ return
+
+ validator(inst, field, value, suffix=suffix)
+
+ return _validator
+
+
+def is_callable(inst, field, value, suffix=""):
+ """
+ A validator that raises a `TypeError` if the
+ initializer is called with a value for this particular attribute
+ that is not callable.
+ """
+ if not callable(value):
+ raise TypeError(
+ f"'{field.name}{suffix}' must be callable "
+ f"(got {value!r} that is a {value.__class__!r})."
+ )
+
+
+def in_(options: Sequence) -> ValidatorType:
+ """
+ A validator that raises a `ValueError` if the initializer is called
+ with a value that does not belong in the options provided. The check is
+ performed using ``value in options``.
+
+ :param options: Allowed options.
+ """
+
+ def _validator(inst, field, value, suffix=""):
+ try:
+ in_options = value in options
+ except TypeError: # e.g. `1 in "abc"`
+ in_options = False
+
+ if not in_options:
+ raise ValueError(
+ f"'{field.name}{suffix}' must be in {options!r} (got {value!r})"
+ )
+
+ return _validator
+
+
+def deep_iterable(
+ member_validator: ValidatorType, iterable_validator: ValidatorType | None = None
+) -> ValidatorType:
+ """
+ A validator that performs deep validation of an iterable.
+
+ :param member_validator: Validator to apply to iterable members
+ :param iterable_validator: Validator to apply to iterable itself
+ """
+
+ def _validator(inst, field, value, suffix=""):
+ if iterable_validator is not None:
+ iterable_validator(inst, field, value, suffix=suffix)
+
+ for idx, member in enumerate(value):
+ member_validator(inst, field, member, suffix=f"{suffix}[{idx}]")
+
+ return _validator
+
+
+def deep_mapping(
+ key_validator: ValidatorType,
+ value_validator: ValidatorType,
+ mapping_validator: ValidatorType | None = None,
+) -> ValidatorType:
+ """
+ A validator that performs deep validation of a dictionary.
+
+ :param key_validator: Validator to apply to dictionary keys
+ :param value_validator: Validator to apply to dictionary values
+ :param mapping_validator: Validator to apply to top-level mapping attribute (optional)
+ """
+
+ def _validator(inst, field: dc.Field, value, suffix=""):
+ if mapping_validator is not None:
+ mapping_validator(inst, field, value)
+
+ for key in value:
+ key_validator(inst, field, key, suffix=f"{suffix}[{key!r}]")
+ value_validator(inst, field, value[key], suffix=f"{suffix}[{key!r}]")
+
+ return _validator
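
To make the contract concrete, a minimal sketch of a dataclass wired up with these validators (the `Example` class is hypothetical, for illustration only):

```python
import dataclasses as dc
from typing import List, Optional

from myst_parser.config.dc_validators import (
    deep_iterable,
    instance_of,
    optional,
    validate_fields,
)


@dc.dataclass
class Example:  # hypothetical dataclass, not part of the package
    names: List[str] = dc.field(
        default_factory=list,
        metadata={
            "validator": deep_iterable(instance_of(str), instance_of((list, tuple)))
        },
    )
    limit: Optional[int] = dc.field(
        default=None,
        metadata={"validator": optional(instance_of(int))},
    )

    def __post_init__(self):
        validate_fields(self)  # runs each field's metadata["validator"]


Example(names=["a", "b"], limit=3)  # passes
# Example(names=["a", 1])  # TypeError: 'names[1]' must be of type <class 'str'> ...
```
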
diff --git a/myst_parser/config/main.py b/myst_parser/config/main.py
new file mode 100644
index 0000000..a134ea7
--- /dev/null
+++ b/myst_parser/config/main.py
@@ -0,0 +1,409 @@
+"""The configuration for the myst parser."""
+import dataclasses as dc
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ Iterable,
+ Iterator,
+ Optional,
+ Sequence,
+ Tuple,
+ Union,
+ cast,
+)
+
+from .dc_validators import (
+ deep_iterable,
+ deep_mapping,
+ in_,
+ instance_of,
+ is_callable,
+ optional,
+ validate_field,
+ validate_fields,
+)
+
+
+def check_extensions(_, __, value):
+ if not isinstance(value, Iterable):
+ raise TypeError(f"'enable_extensions' not iterable: {value}")
+ diff = set(value).difference(
+ [
+ "amsmath",
+ "attrs_image",
+ "colon_fence",
+ "deflist",
+ "dollarmath",
+ "fieldlist",
+ "html_admonition",
+ "html_image",
+ "linkify",
+ "replacements",
+ "smartquotes",
+ "strikethrough",
+ "substitution",
+ "tasklist",
+ ]
+ )
+ if diff:
+ raise ValueError(f"'enable_extensions' items not recognised: {diff}")
+
+
+def check_sub_delimiters(_, __, value):
+ if (not isinstance(value, (tuple, list))) or len(value) != 2:
+ raise TypeError(f"myst_sub_delimiters is not a tuple of length 2: {value}")
+ for delim in value:
+ if (not isinstance(delim, str)) or len(delim) != 1:
+ raise TypeError(
+ f"myst_sub_delimiters does not contain strings of length 1: {value}"
+ )
+
+
+@dc.dataclass()
+class MdParserConfig:
+ """Configuration options for the Markdown Parser.
+
+ Note in the sphinx configuration these option names are prepended with ``myst_``
+ """
+
+ # TODO replace commonmark_only, gfm_only with a single option
+
+ commonmark_only: bool = dc.field(
+ default=False,
+ metadata={
+ "validator": instance_of(bool),
+ "help": "Use strict CommonMark parser",
+ },
+ )
+ gfm_only: bool = dc.field(
+ default=False,
+ metadata={
+ "validator": instance_of(bool),
+ "help": "Use strict Github Flavoured Markdown parser",
+ },
+ )
+
+ enable_extensions: Sequence[str] = dc.field(
+ default_factory=list,
+ metadata={"validator": check_extensions, "help": "Enable syntax extensions"},
+ )
+
+ disable_syntax: Iterable[str] = dc.field(
+ default_factory=list,
+ metadata={
+ "validator": deep_iterable(instance_of(str), instance_of((list, tuple))),
+ "help": "Disable Commonmark syntax elements",
+ },
+ )
+
+ all_links_external: bool = dc.field(
+ default=False,
+ metadata={
+ "validator": instance_of(bool),
+ "help": "Parse all links as simple hyperlinks",
+ },
+ )
+
+ # see https://en.wikipedia.org/wiki/List_of_URI_schemes
+ url_schemes: Optional[Iterable[str]] = dc.field(
+ default=cast(Optional[Iterable[str]], ("http", "https", "mailto", "ftp")),
+ metadata={
+ "validator": optional(
+ deep_iterable(instance_of(str), instance_of((list, tuple)))
+ ),
+ "help": "URL scheme prefixes identified as external links",
+ },
+ )
+
+ ref_domains: Optional[Iterable[str]] = dc.field(
+ default=None,
+ metadata={
+ "validator": optional(
+ deep_iterable(instance_of(str), instance_of((list, tuple)))
+ ),
+ "help": "Sphinx domain names to search in for link references",
+ },
+ )
+
+ highlight_code_blocks: bool = dc.field(
+ default=True,
+ metadata={
+ "validator": instance_of(bool),
+ "help": "Syntax highlight code blocks with pygments",
+ "docutils_only": True,
+ },
+ )
+
+ number_code_blocks: Sequence[str] = dc.field(
+ default_factory=list,
+ metadata={
+ "validator": deep_iterable(instance_of(str), instance_of((list, tuple))),
+ "help": "Add line numbers to code blocks with these languages",
+ },
+ )
+
+ title_to_header: bool = dc.field(
+ default=False,
+ metadata={
+ "validator": instance_of(bool),
+ "help": "Convert a `title` field in the top-matter to a H1 header",
+ },
+ )
+
+ heading_anchors: Optional[int] = dc.field(
+ default=None,
+ metadata={
+ "validator": optional(in_([1, 2, 3, 4, 5, 6, 7])),
+ "help": "Heading level depth to assign HTML anchors",
+ },
+ )
+
+ heading_slug_func: Optional[Callable[[str], str]] = dc.field(
+ default=None,
+ metadata={
+ "validator": optional(is_callable),
+ "help": "Function for creating heading anchors",
+ "global_only": True,
+ },
+ )
+
+ html_meta: Dict[str, str] = dc.field(
+ default_factory=dict,
+ repr=False,
+ metadata={
+ "validator": deep_mapping(
+ instance_of(str), instance_of(str), instance_of(dict)
+ ),
+ "merge_topmatter": True,
+ "help": "HTML meta tags",
+ },
+ )
+
+ footnote_transition: bool = dc.field(
+ default=True,
+ metadata={
+ "validator": instance_of(bool),
+ "help": "Place a transition before any footnotes",
+ },
+ )
+
+ words_per_minute: int = dc.field(
+ default=200,
+ metadata={
+ "validator": instance_of(int),
+ "help": "For reading speed calculations",
+ },
+ )
+
+ # Extension specific
+
+ substitutions: Dict[str, Union[str, int, float]] = dc.field(
+ default_factory=dict,
+ repr=False,
+ metadata={
+ "validator": deep_mapping(
+ instance_of(str), instance_of((str, int, float)), instance_of(dict)
+ ),
+ "merge_topmatter": True,
+ "help": "Substitutions mapping",
+ "extension": "substitutions",
+ },
+ )
+
+ sub_delimiters: Tuple[str, str] = dc.field(
+ default=("{", "}"),
+ metadata={
+ "validator": check_sub_delimiters,
+ "help": "Substitution delimiters",
+ "extension": "substitutions",
+ },
+ )
+
+ linkify_fuzzy_links: bool = dc.field(
+ default=True,
+ metadata={
+ "validator": instance_of(bool),
+ "help": "Recognise URLs without schema prefixes",
+ "extension": "linkify",
+ },
+ )
+
+ dmath_allow_labels: bool = dc.field(
+ default=True,
+ metadata={
+ "validator": instance_of(bool),
+ "help": "Parse `$$...$$ (label)`",
+ "extension": "dollarmath",
+ },
+ )
+ dmath_allow_space: bool = dc.field(
+ default=True,
+ metadata={
+ "validator": instance_of(bool),
+ "help": "Allow initial/final spaces in `$ ... $`",
+ "extension": "dollarmath",
+ },
+ )
+ dmath_allow_digits: bool = dc.field(
+ default=True,
+ metadata={
+ "validator": instance_of(bool),
+ "help": "Allow initial/final digits `1$ ...$2`",
+ "extension": "dollarmath",
+ },
+ )
+ dmath_double_inline: bool = dc.field(
+ default=False,
+ metadata={
+ "validator": instance_of(bool),
+ "help": "Parse inline `$$ ... $$`",
+ "extension": "dollarmath",
+ },
+ )
+
+ update_mathjax: bool = dc.field(
+ default=True,
+ metadata={
+ "validator": instance_of(bool),
+ "help": "Update sphinx.ext.mathjax configuration to ignore `$` delimiters",
+ "extension": "dollarmath",
+ "global_only": True,
+ },
+ )
+
+ mathjax_classes: str = dc.field(
+ default="tex2jax_process|mathjax_process|math|output_area",
+ metadata={
+ "validator": instance_of(str),
+ "help": "MathJax classes to add to math HTML",
+ "extension": "dollarmath",
+ "global_only": True,
+ },
+ )
+
+ def __post_init__(self):
+ validate_fields(self)
+
+ def copy(self, **kwargs: Any) -> "MdParserConfig":
+ """Return a new object replacing specified fields with new values.
+
+ Note: initiating the copy will also validate the new fields.
+ """
+ return dc.replace(self, **kwargs)
+
+ @classmethod
+ def get_fields(cls) -> Tuple[dc.Field, ...]:
+ """Return all attribute fields in this class."""
+ return dc.fields(cls)
+
+ def as_dict(self, dict_factory=dict) -> dict:
+ """Return a dictionary of field name -> value."""
+ return dc.asdict(self, dict_factory=dict_factory)
+
+ def as_triple(self) -> Iterable[Tuple[str, Any, dc.Field]]:
+ """Yield triples of (name, value, field)."""
+ fields = {f.name: f for f in dc.fields(self.__class__)}
+ for name, value in dc.asdict(self).items():
+ yield name, value, fields[name]
+
+
+def merge_file_level(
+ config: MdParserConfig,
+ topmatter: Dict[str, Any],
+ warning: Callable[[str, str], None],
+) -> MdParserConfig:
+ """Merge the file-level topmatter with the global config.
+
+ :param config: Global config.
+ :param topmatter: Topmatter from the file.
+ :param warning: Function to call with a warning (type, message).
+ :returns: A new config object
+ """
+ # get updates
+ updates: Dict[str, Any] = {}
+ myst = topmatter.get("myst", {})
+ if not isinstance(myst, dict):
+ warning("topmatter", f"'myst' key not a dict: {type(myst)}")
+ else:
+ updates = myst
+
+ # allow html_meta and substitutions at top-level for back-compatibility
+ if "html_meta" in topmatter:
+ warning(
+ "topmatter",
+ "top-level 'html_meta' key is deprecated, "
+ "place under 'myst' key instead",
+ )
+ updates["html_meta"] = topmatter["html_meta"]
+ if "substitutions" in topmatter:
+ warning(
+ "topmatter",
+ "top-level 'substitutions' key is deprecated, "
+ "place under 'myst' key instead",
+ )
+ updates["substitutions"] = topmatter["substitutions"]
+
+ new = config.copy()
+
+ # validate each update
+ fields = {name: (value, field) for name, value, field in config.as_triple()}
+ for name, value in updates.items():
+
+ if name not in fields:
+ warning("topmatter", f"Unknown field: {name}")
+ continue
+
+ old_value, field = fields[name]
+
+ try:
+ validate_field(new, field, value)
+ except Exception as exc:
+ warning("topmatter", str(exc))
+ continue
+
+ if field.metadata.get("merge_topmatter"):
+ value = {**old_value, **value}
+
+ setattr(new, name, value)
+
+ return new
+
+
+class TopmatterReadError(Exception):
+ """Topmatter parsing error."""
+
+
+def read_topmatter(text: Union[str, Iterator[str]]) -> Optional[Dict[str, Any]]:
+ """Read the (optional) YAML topmatter from a source string.
+
+ This is identified by the first line starting with `---`,
+ then read up to a terminating line of `---`, or `...`.
+
+ :param text: The source string (or iterator of lines) to read from
+ :return: The topmatter
+ """
+ import yaml
+
+ if isinstance(text, str):
+ if not text.startswith("---"): # skip creating the line list in memory
+ return None
+ text = (line for line in text.splitlines())
+ try:
+ if not next(text).startswith("---"):
+ return None
+ except StopIteration:
+ return None
+ top_matter = []
+ for line in text:
+ if line.startswith("---") or line.startswith("..."):
+ break
+ top_matter.append(line.rstrip() + "\n")
+ try:
+ metadata = yaml.safe_load("".join(top_matter))
+ assert isinstance(metadata, dict)
+ except (yaml.parser.ParserError, yaml.scanner.ScannerError) as err:
+ raise TopmatterReadError("Malformed YAML") from err
+ if not isinstance(metadata, dict):
+ raise TopmatterReadError(f"YAML is not a dict: {type(metadata)}")
+ return metadata
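
A hedged round-trip sketch of the pieces above, constructing a validated config and reading topmatter from a source string:

```python
from myst_parser.config.main import MdParserConfig, read_topmatter

config = MdParserConfig(enable_extensions=["dollarmath"], heading_anchors=2)
assert config.heading_anchors == 2

# invalid values are rejected at construction time, via __post_init__
# MdParserConfig(heading_anchors=10)  # ValueError: must be in [1, 2, ..., 7]

metadata = read_topmatter("---\ntitle: My page\n---\n\nSome content\n")
assert metadata == {"title": "My page"}
```
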
diff --git a/myst_parser/docutils_.py b/myst_parser/docutils_.py
new file mode 100644
index 0000000..6f2cc84
--- /dev/null
+++ b/myst_parser/docutils_.py
@@ -0,0 +1,6 @@
+"""A module for compatibility with the docutils>=0.17 `include` directive, in RST documents::
+
+ .. include:: path/to/file.md
+ :parser: myst_parser.docutils_
+"""
+from myst_parser.parsers.docutils_ import Parser # noqa: F401
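
Beyond the `include` directive shown in the docstring, the same parser can be handed directly to the docutils publisher; a minimal sketch:

```python
# rendering MyST Markdown to HTML via the docutils API
from docutils.core import publish_string
from myst_parser.docutils_ import Parser

html = publish_string(
    source="# Title\n\nSome *emphasised* text.",
    parser=Parser(),
    writer_name="html5",
)
print(html.decode("utf8"))
```
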
diff --git a/myst_parser/mdit_to_docutils/__init__.py b/myst_parser/mdit_to_docutils/__init__.py
new file mode 100644
index 0000000..0b9307f
--- /dev/null
+++ b/myst_parser/mdit_to_docutils/__init__.py
@@ -0,0 +1 @@
+"""Conversion of Markdown-it tokens to docutils AST."""
diff --git a/myst_parser/mdit_to_docutils/base.py b/myst_parser/mdit_to_docutils/base.py
new file mode 100644
index 0000000..cedd6c3
--- /dev/null
+++ b/myst_parser/mdit_to_docutils/base.py
@@ -0,0 +1,1483 @@
+"""Convert Markdown-it tokens to docutils nodes."""
+from __future__ import annotations
+
+import inspect
+import json
+import os
+import re
+from collections import OrderedDict
+from contextlib import contextmanager
+from datetime import date, datetime
+from types import ModuleType
+from typing import TYPE_CHECKING, Any, Iterator, MutableMapping, Sequence, cast
+from urllib.parse import urlparse
+
+import jinja2
+import yaml
+from docutils import nodes
+from docutils.frontend import OptionParser
+from docutils.languages import get_language
+from docutils.parsers.rst import Directive, DirectiveError
+from docutils.parsers.rst import Parser as RSTParser
+from docutils.parsers.rst import directives, roles
+from docutils.parsers.rst.directives.misc import Include
+from docutils.parsers.rst.languages import get_language as get_language_rst
+from docutils.statemachine import StringList
+from docutils.transforms.components import Filter
+from docutils.utils import Reporter, new_document
+from docutils.utils.code_analyzer import Lexer, LexerError, NumberLines
+from markdown_it import MarkdownIt
+from markdown_it.common.utils import escapeHtml
+from markdown_it.renderer import RendererProtocol
+from markdown_it.token import Token
+from markdown_it.tree import SyntaxTreeNode
+
+from myst_parser._compat import findall
+from myst_parser.config.main import MdParserConfig
+from myst_parser.mocking import (
+ MockIncludeDirective,
+ MockingError,
+ MockInliner,
+ MockRSTParser,
+ MockState,
+ MockStateMachine,
+)
+from myst_parser.parsers.directives import DirectiveParsingError, parse_directive_text
+from .html_to_nodes import html_to_nodes
+from .utils import is_external_url
+
+if TYPE_CHECKING:
+ from sphinx.environment import BuildEnvironment
+
+
+def make_document(source_path="notset", parser_cls=RSTParser) -> nodes.document:
+ """Create a new docutils document, with the parser classes' default settings."""
+ settings = OptionParser(components=(parser_cls,)).get_default_values()
+ return new_document(source_path, settings=settings)
+
+
+REGEX_DIRECTIVE_START = re.compile(r"^[\s]{0,3}([`]{3,10}|[~]{3,10}|[:]{3,10})\{")
+
+
+def token_line(token: SyntaxTreeNode, default: int | None = None) -> int:
+ """Retrieve the initial line of a token."""
+ if not getattr(token, "map", None):
+ if default is not None:
+ return default
+ raise ValueError(f"token map not set: {token}")
+ return token.map[0] # type: ignore[index]
+
+
+def create_warning(
+ document: nodes.document,
+ message: str,
+ *,
+ line: int | None = None,
+ append_to: nodes.Element | None = None,
+ wtype: str = "myst",
+ subtype: str = "other",
+) -> nodes.system_message | None:
+ """Generate a warning, logging if it is necessary.
+
+ Note this is overridden in the ``SphinxRenderer``,
+ to handle suppressed warning types.
+ """
+ kwargs = {"line": line} if line is not None else {}
+ msg_node = document.reporter.warning(f"{message} [{wtype}.{subtype}]", **kwargs)
+ if append_to is not None:
+ append_to.append(msg_node)
+ return msg_node
+
+
+class DocutilsRenderer(RendererProtocol):
+ """A markdown-it-py renderer to populate (in-place) a `docutils.document` AST.
+
+ Note, this renderer is not dependent on Sphinx.
+ """
+
+ __output__ = "docutils"
+
+ def __init__(self, parser: MarkdownIt) -> None:
+ """Load the renderer (called by ``MarkdownIt``)"""
+ self.md = parser
+ self.rules = {
+ k: v
+ for k, v in inspect.getmembers(self, predicate=inspect.ismethod)
+ if k.startswith("render_") and k != "render_children"
+ }
+
+ def __getattr__(self, name: str):
+ """Warn when the renderer has not been setup yet."""
+ if name in (
+ "md_env",
+ "md_config",
+ "md_options",
+ "document",
+ "current_node",
+ "reporter",
+ "language_module_rst",
+ "_level_to_elem",
+ ):
+ raise AttributeError(
+ f"'{name}' attribute is not available until setup_render() is called"
+ )
+ raise AttributeError(
+ f"'{type(self).__name__}' object has no attribute '{name}'"
+ )
+
+ def setup_render(
+ self, options: dict[str, Any], env: MutableMapping[str, Any]
+ ) -> None:
+ """Setup the renderer with per render variables."""
+ self.md_env = env
+ self.md_options = options
+ self.md_config: MdParserConfig = options["myst_config"]
+ self.document: nodes.document = options.get("document", make_document())
+ self.current_node: nodes.Element = options.get("current_node", self.document)
+ self.reporter: Reporter = self.document.reporter
+ # note there are actually two possible language modules:
+ # one from docutils.languages, and one from docutils.parsers.rst.languages
+ self.language_module_rst: ModuleType = get_language_rst(
+ self.document.settings.language_code
+ )
+ # a mapping of heading levels to its currently associated node
+ self._level_to_elem: dict[int, nodes.document | nodes.section] = {
+ 0: self.document
+ }
+
+ @property
+ def sphinx_env(self) -> BuildEnvironment | None:
+ """Return the sphinx env, if using Sphinx."""
+ try:
+ return self.document.settings.env
+ except AttributeError:
+ return None
+
+ def create_warning(
+ self,
+ message: str,
+ *,
+ line: int | None = None,
+ append_to: nodes.Element | None = None,
+ wtype: str = "myst",
+ subtype: str = "other",
+ ) -> nodes.system_message | None:
+ """Generate a warning, logging if it is necessary.
+
+ Note this is overridden in the ``SphinxRenderer``,
+ to handle suppressed warning types.
+ """
+ return create_warning(
+ self.document,
+ message,
+ line=line,
+ append_to=append_to,
+ wtype=wtype,
+ subtype=subtype,
+ )
+
+ def _render_tokens(self, tokens: list[Token]) -> None:
+ """Render the tokens."""
+ # propagate line number down to inline elements
+ for token in tokens:
+ if not token.map:
+ continue
+ # For docutils we want 1 based line numbers (not 0)
+ token.map = [token.map[0] + 1, token.map[1] + 1]
+ for token_child in token.children or []:
+ token_child.map = token.map
+
+ # nest tokens
+ node_tree = SyntaxTreeNode(tokens)
+
+ # move footnote definitions to env
+ self.md_env.setdefault("foot_refs", {})
+ for node in node_tree.walk(include_self=True):
+ new_children = []
+ for child in node.children:
+ if child.type == "footnote_reference":
+ label = child.meta["label"]
+ self.md_env["foot_refs"].setdefault(label, []).append(child)
+ else:
+ new_children.append(child)
+
+ node.children = new_children
+
+ # render
+ for child in node_tree.children:
+ # skip hidden?
+ if f"render_{child.type}" in self.rules:
+ self.rules[f"render_{child.type}"](child)
+ else:
+ self.create_warning(
+ f"No render method for: {child.type}",
+ line=token_line(child, default=0),
+ subtype="render",
+ append_to=self.current_node,
+ )
+
+ def render(
+ self, tokens: Sequence[Token], options, md_env: MutableMapping[str, Any]
+ ) -> nodes.document:
+ """Run the render on a token stream.
+
+ :param tokens: list of block tokens to render
+ :param options: params of parser instance
+ :param md_env: the markdown-it environment sandbox associated with the tokens,
+ containing additional metadata like reference info
+ """
+ self.setup_render(options, md_env)
+ self._render_initialise()
+ self._render_tokens(list(tokens))
+ self._render_finalise()
+ return self.document
+
+ def _render_initialise(self) -> None:
+ """Initialise the render of the document."""
+ self.current_node.extend(
+ html_meta_to_nodes(
+ self.md_config.html_meta,
+ document=self.document,
+ line=0,
+ reporter=self.reporter,
+ )
+ )
+
+ def _render_finalise(self) -> None:
+ """Finalise the render of the document."""
+
+ # log warnings for duplicate reference definitions
+ # "duplicate_refs": [{"href": "ijk", "label": "B", "map": [4, 5], "title": ""}],
+ for dup_ref in self.md_env.get("duplicate_refs", []):
+ self.create_warning(
+ f"Duplicate reference definition: {dup_ref['label']}",
+ line=dup_ref["map"][0] + 1,
+ subtype="ref",
+ append_to=self.document,
+ )
+
+ # we don't use the foot_references stored in the env
+ # since references within directives/roles will have been added after
+ # those from the initial markdown parse
+ # instead we gather them from a walk of the created document
+ foot_refs = OrderedDict()
+ for refnode in findall(self.document)(nodes.footnote_reference):
+ if refnode["refname"] not in foot_refs:
+ foot_refs[refnode["refname"]] = True
+
+ if foot_refs and self.md_config.footnote_transition:
+ self.current_node.append(nodes.transition(classes=["footnotes"]))
+ for footref in foot_refs:
+ foot_ref_tokens = self.md_env["foot_refs"].get(footref, [])
+ if len(foot_ref_tokens) > 1:
+ self.create_warning(
+ f"Multiple footnote definitions found for label: '{footref}'",
+ subtype="footnote",
+ append_to=self.current_node,
+ )
+
+ if len(foot_ref_tokens) < 1:
+ self.create_warning(
+ f"No footnote definitions found for label: '{footref}'",
+ subtype="footnote",
+ append_to=self.current_node,
+ )
+ else:
+ self.render_footnote_reference(foot_ref_tokens[0])
+
+ # Add the wordcount, generated by the ``mdit_py_plugins.wordcount_plugin``.
+ wordcount_metadata = self.md_env.get("wordcount", {})
+ if wordcount_metadata:
+
+ # save the wordcount to the sphinx BuildEnvironment metadata
+ if self.sphinx_env is not None:
+ meta = self.sphinx_env.metadata.setdefault(self.sphinx_env.docname, {})
+ meta["wordcount"] = wordcount_metadata
+
+ # now add the wordcount as substitution definitions,
+ # so we can reference them in the document
+ for key in ("words", "minutes"):
+ value = wordcount_metadata.get(key, None)
+ if value is None:
+ continue
+ substitution_node = nodes.substitution_definition(
+ str(value), nodes.Text(str(value))
+ )
+ substitution_node.source = self.document["source"]
+ substitution_node["names"].append(f"wordcount-{key}")
+ self.document.note_substitution_def(
+ substitution_node, f"wordcount-{key}"
+ )
+
+ def nested_render_text(
+ self, text: str, lineno: int, inline: bool = False, allow_headings: bool = True
+ ) -> None:
+ """Render unparsed text (appending to the current node).
+
+ :param text: the text to render
+ :param lineno: the starting line number of the text, within the full source
+ :param inline: whether the text is inline or block
+ :param allow_headings: whether to allow headings in the text
+ """
+ if inline:
+ tokens = self.md.parseInline(text, self.md_env)
+ else:
+ tokens = self.md.parse(text + "\n", self.md_env)
+
+ # remove front matter, if present, e.g. from included documents
+ if tokens and tokens[0].type == "front_matter":
+ tokens.pop(0)
+
+ # update the line numbers
+ for token in tokens:
+ if token.map:
+ token.map = [token.map[0] + lineno, token.map[1] + lineno]
+
+ current_match_titles = self.md_env.get("match_titles", None)
+ try:
+ self.md_env["match_titles"] = allow_headings
+ self._render_tokens(tokens)
+ finally:
+ self.md_env["match_titles"] = current_match_titles
+
+ @contextmanager
+ def current_node_context(
+ self, node: nodes.Element, append: bool = False
+ ) -> Iterator:
+ """Context manager for temporarily setting the current node."""
+ if append:
+ self.current_node.append(node)
+ current_node = self.current_node
+ self.current_node = node
+ yield
+ self.current_node = current_node
+
+ def render_children(self, token: SyntaxTreeNode) -> None:
+ """Render the children of a token."""
+ for child in token.children or []:
+ if f"render_{child.type}" in self.rules:
+ self.rules[f"render_{child.type}"](child)
+ else:
+ self.create_warning(
+ f"No render method for: {child.type}",
+ line=token_line(child, default=0),
+ subtype="render",
+ append_to=self.current_node,
+ )
+
+ def add_line_and_source_path(self, node, token: SyntaxTreeNode) -> None:
+ """Copy the line number and document source path to the docutils node."""
+ try:
+ node.line = token_line(token)
+ except ValueError:
+ pass
+ node.source = self.document["source"]
+
+ def add_line_and_source_path_r(
+ self, nodes: list[nodes.Element], token: SyntaxTreeNode
+ ) -> None:
+ """Copy the line number and document source path to the docutils nodes,
+ and recursively to all descendants.
+ """
+ for node in nodes:
+ self.add_line_and_source_path(node, token)
+ for child in findall(node)():
+ self.add_line_and_source_path(child, token)
+
+ def update_section_level_state(self, section: nodes.section, level: int) -> None:
+ """Update the section level state, with the new current section and level."""
+ # find the closest parent section
+ parent_level = max(
+ section_level
+ for section_level in self._level_to_elem
+ if level > section_level
+ )
+ parent = self._level_to_elem[parent_level]
+
+ # if we are jumping up to a non-consecutive level,
+ # then warn about this, since this will not be propagated in the docutils AST
+ if (level > parent_level) and (parent_level + 1 != level):
+ msg = f"Non-consecutive header level increase; H{parent_level} to H{level}"
+ if parent_level == 0:
+ msg = f"Document headings start at H{level}, not H1"
+ self.create_warning(
+ msg,
+ line=section.line,
+ subtype="header",
+ append_to=self.current_node,
+ )
+
+ # append the new section to the parent
+ parent.append(section)
+ # update the state for this section level
+ self._level_to_elem[level] = section
+
+ # Remove all descendant sections from the section level state
+ self._level_to_elem = {
+ section_level: section
+ for section_level, section in self._level_to_elem.items()
+ if section_level <= level
+ }
+
+ def renderInlineAsText(self, tokens: list[SyntaxTreeNode]) -> str:
+ """Special kludge for image `alt` attributes to conform CommonMark spec.
+
+ Don't try to use it! Spec requires to show `alt` content with stripped markup,
+ instead of simple escaping.
+ """
+ result = ""
+
+ for token in tokens or []:
+ if token.type == "text":
+ result += token.content
+ # elif token.type == "image":
+ # result += self.renderInlineAsText(token.children)
+ else:
+ result += self.renderInlineAsText(token.children or [])
+ return result
+
+ # ### render methods for commonmark tokens
+
+ def render_paragraph(self, token: SyntaxTreeNode) -> None:
+ para = nodes.paragraph(token.children[0].content if token.children else "")
+ self.add_line_and_source_path(para, token)
+ with self.current_node_context(para, append=True):
+ self.render_children(token)
+
+ def render_inline(self, token: SyntaxTreeNode) -> None:
+ self.render_children(token)
+
+ def render_text(self, token: SyntaxTreeNode) -> None:
+ self.current_node.append(nodes.Text(token.content))
+
+ def render_bullet_list(self, token: SyntaxTreeNode) -> None:
+ list_node = nodes.bullet_list()
+ if token.markup:
+ list_node["bullet"] = token.markup
+ if token.attrs.get("class"):
+ # this is used e.g. by tasklist
+ list_node["classes"] = str(token.attrs["class"]).split()
+ self.add_line_and_source_path(list_node, token)
+ with self.current_node_context(list_node, append=True):
+ self.render_children(token)
+
+ def render_ordered_list(self, token: SyntaxTreeNode) -> None:
+ list_node = nodes.enumerated_list(enumtype="arabic", prefix="")
+ list_node["suffix"] = token.markup # for CommonMark, this should be "." or ")"
+ if "start" in token.attrs: # starting number
+ list_node["start"] = token.attrs["start"]
+ self.add_line_and_source_path(list_node, token)
+ with self.current_node_context(list_node, append=True):
+ self.render_children(token)
+
+ def render_list_item(self, token: SyntaxTreeNode) -> None:
+ item_node = nodes.list_item()
+ if token.attrs.get("class"):
+ # this is used e.g. by tasklist
+ item_node["classes"] = str(token.attrs["class"]).split()
+ self.add_line_and_source_path(item_node, token)
+ with self.current_node_context(item_node, append=True):
+ self.render_children(token)
+
+ def render_em(self, token: SyntaxTreeNode) -> None:
+ node = nodes.emphasis()
+ self.add_line_and_source_path(node, token)
+ with self.current_node_context(node, append=True):
+ self.render_children(token)
+
+ def render_softbreak(self, token: SyntaxTreeNode) -> None:
+ self.current_node.append(nodes.Text("\n"))
+
+ def render_hardbreak(self, token: SyntaxTreeNode) -> None:
+ self.current_node.append(nodes.raw("", "<br />\n", format="html"))
+ self.current_node.append(nodes.raw("", "\\\\\n", format="latex"))
+
+ def render_strong(self, token: SyntaxTreeNode) -> None:
+ node = nodes.strong()
+ self.add_line_and_source_path(node, token)
+ with self.current_node_context(node, append=True):
+ self.render_children(token)
+
+ def render_blockquote(self, token: SyntaxTreeNode) -> None:
+ quote = nodes.block_quote()
+ self.add_line_and_source_path(quote, token)
+ with self.current_node_context(quote, append=True):
+ self.render_children(token)
+
+ def render_hr(self, token: SyntaxTreeNode) -> None:
+ node = nodes.transition()
+ self.add_line_and_source_path(node, token)
+ self.current_node.append(node)
+
+ def render_code_inline(self, token: SyntaxTreeNode) -> None:
+ node = nodes.literal(token.content, token.content)
+ self.add_line_and_source_path(node, token)
+ self.current_node.append(node)
+
+ def create_highlighted_code_block(
+ self,
+ text: str,
+ lexer_name: str | None,
+ number_lines: bool = False,
+ lineno_start: int = 1,
+ source: str | None = None,
+ line: int | None = None,
+ node_cls: type[nodes.Element] = nodes.literal_block,
+ ) -> nodes.Element:
+ """Create a literal block with syntax highlighting.
+
+ This mimics the behaviour of the `code-block` directive.
+
+ In docutils, this directive directly parses the text with the pygments lexer,
+ whereas in sphinx, the lexer name is only recorded as the `language` attribute,
+ and the text is lexed later by pygments within the `visit_literal_block`
+ method of the output format ``SphinxTranslator``.
+
+ Note, this function does not add the literal block to the document.
+ """
+ if self.sphinx_env is not None:
+ node = node_cls(text, text, language=lexer_name or "none")
+ if number_lines:
+ node["linenos"] = True
+ if lineno_start != 1:
+ node["highlight_args"] = {"linenostart": lineno_start}
+ else:
+ node = node_cls(
+ text, classes=["code"] + ([lexer_name] if lexer_name else [])
+ )
+ try:
+ lex_tokens = Lexer(
+ text,
+ lexer_name or "",
+ "short" if self.md_config.highlight_code_blocks else "none",
+ )
+ except LexerError as err:
+ self.reporter.warning(
+ str(err),
+ **{
+ name: value
+ for name, value in (("source", source), ("line", line))
+ if value is not None
+ },
+ )
+ lex_tokens = Lexer(text, lexer_name or "", "none")
+
+ if number_lines:
+ lex_tokens = NumberLines(
+ lex_tokens, lineno_start, lineno_start + len(text.splitlines())
+ )
+
+ for classes, value in lex_tokens:
+ if classes:
+ node += nodes.inline(value, value, classes=classes)
+ else:
+ # insert as Text to decrease the verbosity of the output
+ node += nodes.Text(value)
+
+ if source is not None:
+ node.source = source
+ if line is not None:
+ node.line = line
+ return node
+
+ def render_code_block(self, token: SyntaxTreeNode) -> None:
+ lexer = token.info.split()[0] if token.info else None
+ node = self.create_highlighted_code_block(
+ token.content,
+ lexer,
+ source=self.document["source"],
+ line=token_line(token, 0) or None,
+ )
+ self.current_node.append(node)
+
+ def render_fence(self, token: SyntaxTreeNode) -> None:
+ text = token.content
+ # Ensure that we'll have an empty string if info exists but is only spaces
+ info = token.info.strip() if token.info else token.info
+ language = info.split()[0] if info else ""
+
+ if (not self.md_config.commonmark_only) and (not self.md_config.gfm_only):
+ if language == "{eval-rst}":
+ return self.render_restructuredtext(token)
+ if language.startswith("{") and language.endswith("}"):
+ return self.render_directive(token)
+
+ if not language and self.sphinx_env is not None:
+ # use the current highlight setting, via the ``highlight`` directive,
+ # or ``highlight_language`` configuration.
+ language = self.sphinx_env.temp_data.get(
+ "highlight_language", self.sphinx_env.config.highlight_language
+ )
+
+ node = self.create_highlighted_code_block(
+ text,
+ language,
+ number_lines=language in self.md_config.number_code_blocks,
+ source=self.document["source"],
+ line=token_line(token, 0) or None,
+ )
+ self.current_node.append(node)
+
+ @property
+ def blocks_mathjax_processing(self) -> bool:
+ """Only add mathjax ignore classes if using sphinx,
+ and using the ``dollarmath`` extension, and ``myst_update_mathjax=True``.
+ """
+ return (
+ self.sphinx_env is not None
+ and "dollarmath" in self.md_config.enable_extensions
+ and self.md_config.update_mathjax
+ )
+
+ def render_heading(self, token: SyntaxTreeNode) -> None:
+ """Render a heading, e.g. `# Heading`."""
+
+ if self.md_env.get("match_titles", None) is False:
+ # this can occur if a nested parse is performed by a directive
+ # (such as an admonition) which contains a header.
+ # this would break the document structure
+ self.create_warning(
+ "Disallowed nested header found, converting to rubric",
+ line=token_line(token, default=0),
+ subtype="nested_header",
+ append_to=self.current_node,
+ )
+ rubric = nodes.rubric(token.content, "")
+ self.add_line_and_source_path(rubric, token)
+ with self.current_node_context(rubric, append=True):
+ self.render_children(token)
+ return
+
+ level = int(token.tag[1])
+
+ # create the section node
+ new_section = nodes.section()
+ self.add_line_and_source_path(new_section, token)
+ # if a top level section,
+ # then add classes to set default mathjax processing to false
+ # we then turn it back on, on a per-node basis
+ if level == 1 and self.blocks_mathjax_processing:
+ new_section["classes"].extend(["tex2jax_ignore", "mathjax_ignore"])
+
+ # update the state of the section levels
+ self.update_section_level_state(new_section, level)
+
+ # create the title for this section
+ title_node = nodes.title(token.children[0].content if token.children else "")
+ self.add_line_and_source_path(title_node, token)
+ new_section.append(title_node)
+ # render the heading children into the title
+ with self.current_node_context(title_node):
+ self.render_children(token)
+
+ # create a target reference for the section, based on the heading text
+ name = nodes.fully_normalize_name(title_node.astext())
+ new_section["names"].append(name)
+ self.document.note_implicit_target(new_section, new_section)
+
+ # set the section as the current node for subsequent rendering
+ self.current_node = new_section
+
+ def render_link(self, token: SyntaxTreeNode) -> None:
+ """Parse `<http://link.com>` or `[text](link "title")` syntax to docutils AST:
+
+ - If `<>` autolink, forward to `render_autolink`
+ - If `myst_all_links_external` is True, forward to `render_external_url`
+ - If link is an external URL, forward to `render_external_url`
+ - External URLs start with a scheme (e.g. `http:`) in `myst_url_schemes`,
+ or any scheme if `myst_url_schemes` is None.
+ - Otherwise, forward to `render_internal_link`
+ """
+ if token.info == "auto": # handles both autolink and linkify
+ return self.render_autolink(token)
+
+ if (
+ self.md_config.commonmark_only
+ or self.md_config.gfm_only
+ or self.md_config.all_links_external
+ ):
+ return self.render_external_url(token)
+
+ # Check for external URL
+ url_scheme = urlparse(cast(str, token.attrGet("href") or "")).scheme
+ allowed_url_schemes = self.md_config.url_schemes
+ if (allowed_url_schemes is None and url_scheme) or (
+ allowed_url_schemes is not None and url_scheme in allowed_url_schemes
+ ):
+ return self.render_external_url(token)
+
+ return self.render_internal_link(token)
+
+ def render_external_url(self, token: SyntaxTreeNode) -> None:
+ """Render link token `[text](link "title")`,
+ where the link has been identified as an external URL::
+
+ <reference refuri="link" title="title">
+ text
+
+ `text` can contain nested syntax, e.g. `[**bold**](url "title")`.
+ """
+ ref_node = nodes.reference()
+ self.add_line_and_source_path(ref_node, token)
+ ref_node["refuri"] = cast(str, token.attrGet("href") or "")
+ title = token.attrGet("title")
+ if title:
+ ref_node["title"] = title
+ with self.current_node_context(ref_node, append=True):
+ self.render_children(token)
+
+ def render_internal_link(self, token: SyntaxTreeNode) -> None:
+ """Render link token `[text](link "title")`,
+ where the link has not been identified as an external URL::
+
+ <reference refname="link" title="title">
+ text
+
+ `text` can contain nested syntax, e.g. `[**bold**](link "title")`.
+
+ Note, this is overridden by `SphinxRenderer`, to use `pending_xref` nodes.
+ """
+ ref_node = nodes.reference()
+ self.add_line_and_source_path(ref_node, token)
+ ref_node["refname"] = cast(str, token.attrGet("href") or "")
+ self.document.note_refname(ref_node)
+ title = token.attrGet("title")
+ if title:
+ ref_node["title"] = title
+ with self.current_node_context(ref_node, append=True):
+ self.render_children(token)
+
+ def render_autolink(self, token: SyntaxTreeNode) -> None:
+ refuri = escapeHtml(token.attrGet("href") or "") # type: ignore[arg-type]
+ ref_node = nodes.reference()
+ ref_node["refuri"] = refuri
+ self.add_line_and_source_path(ref_node, token)
+ with self.current_node_context(ref_node, append=True):
+ self.render_children(token)
+
+ def render_html_inline(self, token: SyntaxTreeNode) -> None:
+ self.render_html_block(token)
+
+ def render_html_block(self, token: SyntaxTreeNode) -> None:
+ node_list = html_to_nodes(token.content, token_line(token), self)
+ self.current_node.extend(node_list)
+
+ def render_image(self, token: SyntaxTreeNode) -> None:
+ img_node = nodes.image()
+ self.add_line_and_source_path(img_node, token)
+ destination = cast(str, token.attrGet("src") or "")
+
+ if self.md_env.get("relative-images", None) is not None and not is_external_url(
+ destination, None, True
+ ):
+ # make the path relative to an "including" document
+ # this is set when using the `relative-images` option of the MyST `include` directive
+ destination = os.path.normpath(
+ os.path.join(
+ self.md_env.get("relative-images", ""),
+ os.path.normpath(destination),
+ )
+ )
+
+ img_node["uri"] = destination
+
+ img_node["alt"] = self.renderInlineAsText(token.children or [])
+ title = token.attrGet("title")
+ if title:
+ img_node["title"] = token.attrGet("title")
+
+ # apply other attributes that can be set on the image
+ if "class" in token.attrs:
+ img_node["classes"].extend(str(token.attrs["class"]).split())
+ if "width" in token.attrs:
+ try:
+ width = directives.length_or_percentage_or_unitless(
+ str(token.attrs["width"])
+ )
+ except ValueError:
+ self.create_warning(
+ f"Invalid width value for image: {token.attrs['width']!r}",
+ line=token_line(token, default=0),
+ subtype="image",
+ append_to=self.current_node,
+ )
+ else:
+ img_node["width"] = width
+ if "height" in token.attrs:
+ try:
+ height = directives.length_or_unitless(str(token.attrs["height"]))
+ except ValueError:
+ self.create_warning(
+ f"Invalid height value for image: {token.attrs['height']!r}",
+ line=token_line(token, default=0),
+ subtype="image",
+ append_to=self.current_node,
+ )
+ else:
+ img_node["height"] = height
+ if "align" in token.attrs:
+ if token.attrs["align"] not in ("left", "center", "right"):
+ self.create_warning(
+ f"Invalid align value for image: {token.attrs['align']!r}",
+ line=token_line(token, default=0),
+ subtype="image",
+ append_to=self.current_node,
+ )
+ else:
+ img_node["align"] = token.attrs["align"]
+ if "id" in token.attrs:
+ name = nodes.fully_normalize_name(str(token.attrs["id"]))
+ img_node["names"].append(name)
+ self.document.note_explicit_target(img_node, img_node)
+
+ self.current_node.append(img_node)
+
+ # ### render methods for plugin tokens
+
+ def render_front_matter(self, token: SyntaxTreeNode) -> None:
+ """Pass document front matter data."""
+ position = token_line(token, default=0)
+
+ if isinstance(token.content, str):
+ try:
+ data = yaml.safe_load(token.content)
+ except (yaml.parser.ParserError, yaml.scanner.ScannerError):
+ self.create_warning(
+ "Malformed YAML",
+ line=position,
+ append_to=self.current_node,
+ subtype="topmatter",
+ )
+ return
+ else:
+ data = token.content
+
+ if not isinstance(data, dict):
+ self.create_warning(
+ f"YAML is not a dict: {type(data)}",
+ line=position,
+ append_to=self.current_node,
+ subtype="topmatter",
+ )
+ return
+
+ fields = {
+ k: v
+ for k, v in data.items()
+ if k not in ("myst", "mystnb", "substitutions", "html_meta")
+ }
+ if fields:
+ field_list = self.dict_to_fm_field_list(
+ fields, language_code=self.document.settings.language_code
+ )
+ self.current_node.append(field_list)
+
+ if data.get("title") and self.md_config.title_to_header:
+ self.nested_render_text(f"# {data['title']}", 0)
+
+ def dict_to_fm_field_list(
+ self, data: dict[str, Any], language_code: str, line: int = 0
+ ) -> nodes.field_list:
+ """Render each key/val pair as a docutils ``field_node``.
+
+ Bibliographic keys below will be parsed as Markdown,
+ all others will be left as literal text.
+
+ The field list should be at the start of the document,
+ and will then be converted to a `docinfo` node during the
+ `docutils.docutils.transforms.frontmatter.DocInfo` transform (priority 340),
+ and bibliographic keys (or their translation) will be converted to nodes::
+
+ {'author': docutils.nodes.author,
+ 'authors': docutils.nodes.authors,
+ 'organization': docutils.nodes.organization,
+ 'address': docutils.nodes.address,
+ 'contact': docutils.nodes.contact,
+ 'version': docutils.nodes.version,
+ 'revision': docutils.nodes.revision,
+ 'status': docutils.nodes.status,
+ 'date': docutils.nodes.date,
+ 'copyright': docutils.nodes.copyright,
+ 'dedication': docutils.nodes.topic,
+ 'abstract': docutils.nodes.topic}
+
+ Also, the 'dedication' and 'abstract' will be placed outside the `docinfo`,
+ and so will always be shown in the document.
+
+ If using sphinx, this `docinfo` node will later be extracted from the AST,
+ by the `DoctreeReadEvent` transform (priority 880),
+ calling `MetadataCollector.process_doc`.
+ In this case keys and values will be converted to strings and stored in
+ `app.env.metadata[app.env.docname]`
+
+ See
+ https://www.sphinx-doc.org/en/master/usage/restructuredtext/field-lists.html
+ for docinfo fields used by sphinx.
+
+ """
+ field_list = nodes.field_list()
+ field_list.source, field_list.line = self.document["source"], line
+
+ bibliofields = get_language(language_code).bibliographic_fields
+
+ for key, value in data.items():
+ if not isinstance(value, (str, int, float, date, datetime)):
+ value = json.dumps(value)
+ value = str(value)
+ body = nodes.paragraph()
+ body.source, body.line = self.document["source"], line
+ if key in bibliofields:
+ with self.current_node_context(body):
+ self.nested_render_text(value, line, inline=True)
+ else:
+ body += nodes.literal(value, value)
+
+ field_node = nodes.field()
+ field_node.source = value
+ field_node += nodes.field_name(key, "", nodes.Text(key))
+ field_node += nodes.field_body(value, *[body])
+ field_list += field_node
+
+ return field_list
+
+ def render_table(self, token: SyntaxTreeNode) -> None:
+
+ # markdown-it table always contains at least a header:
+ assert token.children
+ header = token.children[0]
+ # with one header row
+ assert header.children
+ header_row = header.children[0]
+ assert header_row.children
+
+ # top-level element
+ table = nodes.table()
+ table["classes"] += ["colwidths-auto"]
+ self.add_line_and_source_path(table, token)
+ self.current_node.append(table)
+
+ # column settings element
+ maxcols = len(header_row.children)
+ colwidths = [100 // maxcols] * maxcols
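+        # e.g. a table with 3 header columns gets colwidths [33, 33, 33]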
+ tgroup = nodes.tgroup(cols=len(colwidths))
+ table += tgroup
+ for colwidth in colwidths:
+ colspec = nodes.colspec(colwidth=colwidth)
+ tgroup += colspec
+
+ # header
+ thead = nodes.thead()
+ tgroup += thead
+ with self.current_node_context(thead):
+ self.render_table_row(header_row)
+
+ # body
+ if len(token.children) > 1:
+ body = token.children[1]
+ tbody = nodes.tbody()
+ tgroup += tbody
+ with self.current_node_context(tbody):
+ for body_row in body.children or []:
+ self.render_table_row(body_row)
+
+ def render_table_row(self, token: SyntaxTreeNode) -> None:
+ row = nodes.row()
+ with self.current_node_context(row, append=True):
+ for child in token.children or []:
+ entry = nodes.entry()
+ para = nodes.paragraph(
+ child.children[0].content if child.children else ""
+ )
+ style = child.attrGet("style") # i.e. the alignment when using e.g. :--
+ if style and style in (
+ "text-align:left",
+ "text-align:right",
+ "text-align:center",
+ ):
+ entry["classes"].append(f"text-{cast(str, style).split(':')[1]}")
+ with self.current_node_context(entry, append=True):
+ with self.current_node_context(para, append=True):
+ self.render_children(child)
+
+ def render_s(self, token: SyntaxTreeNode) -> None:
+ """Render a strikethrough token."""
+ # TODO strikethrough not currently directly supported in docutils
+ self.create_warning(
+ "Strikethrough is currently only supported in HTML output",
+ line=token_line(token, 0),
+ subtype="strikethrough",
+ append_to=self.current_node,
+ )
+ self.current_node.append(nodes.raw("", "<s>", format="html"))
+ self.render_children(token)
+ self.current_node.append(nodes.raw("", "</s>", format="html"))
+
+ def render_math_inline(self, token: SyntaxTreeNode) -> None:
+ content = token.content
+ node = nodes.math(content, content)
+ self.add_line_and_source_path(node, token)
+ self.current_node.append(node)
+
+ def render_math_inline_double(self, token: SyntaxTreeNode) -> None:
+ content = token.content
+ node = nodes.math_block(content, content, nowrap=False, number=None)
+ self.add_line_and_source_path(node, token)
+ self.current_node.append(node)
+
+ def render_math_single(self, token: SyntaxTreeNode) -> None:
+ content = token.content
+ node = nodes.math(content, content)
+ self.add_line_and_source_path(node, token)
+ self.current_node.append(node)
+
+ def render_math_block(self, token: SyntaxTreeNode) -> None:
+ content = token.content
+ node = nodes.math_block(content, content, nowrap=False, number=None)
+ self.add_line_and_source_path(node, token)
+ self.current_node.append(node)
+
+ def render_amsmath(self, token: SyntaxTreeNode) -> None:
+ # note docutils does not currently support the nowrap attribute
+ # or equation numbering, so this is overridden in the sphinx renderer
+ node = nodes.math_block(
+ token.content, token.content, nowrap=True, classes=["amsmath"]
+ )
+ if token.meta["numbered"] != "*":
+ node["numbered"] = True
+ self.add_line_and_source_path(node, token)
+ self.current_node.append(node)
+
+ def render_footnote_ref(self, token: SyntaxTreeNode) -> None:
+ """Footnote references are added as auto-numbered,
+        i.e. `[^a]` is read as rST `[#a]_`
+ """
+ target = token.meta["label"]
+
+ refnode = nodes.footnote_reference(f"[^{target}]")
+ self.add_line_and_source_path(refnode, token)
+ if not target.isdigit():
+ refnode["auto"] = 1
+ self.document.note_autofootnote_ref(refnode)
+ else:
+ refnode += nodes.Text(target)
+
+ refnode["refname"] = target
+ self.document.note_footnote_ref(refnode)
+
+ self.current_node.append(refnode)
+
+ def render_footnote_reference(self, token: SyntaxTreeNode) -> None:
+ target = token.meta["label"]
+
+ footnote = nodes.footnote()
+ self.add_line_and_source_path(footnote, token)
+ footnote["names"].append(target)
+ if not target.isdigit():
+ footnote["auto"] = 1
+ self.document.note_autofootnote(footnote)
+ else:
+ footnote += nodes.label("", target)
+ self.document.note_footnote(footnote)
+ self.document.note_explicit_target(footnote, footnote)
+ with self.current_node_context(footnote, append=True):
+ self.render_children(token)
+
+ def render_myst_block_break(self, token: SyntaxTreeNode) -> None:
+ block_break = nodes.comment(token.content, token.content)
+ block_break["classes"] += ["block_break"]
+ self.add_line_and_source_path(block_break, token)
+ self.current_node.append(block_break)
+
+ def render_myst_target(self, token: SyntaxTreeNode) -> None:
+ text = token.content
+ name = nodes.fully_normalize_name(text)
+ target = nodes.target(text)
+ target["names"].append(name)
+ self.add_line_and_source_path(target, token)
+ self.document.note_explicit_target(target, self.current_node)
+ self.current_node.append(target)
+
+ def render_myst_line_comment(self, token: SyntaxTreeNode) -> None:
+ self.current_node.append(nodes.comment(token.content, token.content.strip()))
+
+ def render_myst_role(self, token: SyntaxTreeNode) -> None:
+ name = token.meta["name"]
+ text = token.content
+ rawsource = f":{name}:`{token.content}`"
+ lineno = token_line(token) if token.map else 0
+ role_func, messages = roles.role(
+ name, self.language_module_rst, lineno, self.reporter
+ )
+ inliner = MockInliner(self)
+        if role_func:
+            # named to avoid shadowing the ``docutils.nodes`` module
+            role_nodes, messages2 = role_func(name, rawsource, text, lineno, inliner)
+            self.current_node += role_nodes
+ else:
+ message = self.reporter.error(
+ f'Unknown interpreted text role "{name}".', line=lineno
+ )
+ problematic = inliner.problematic(text, rawsource, message)
+ self.current_node += problematic
+
+ def render_colon_fence(self, token: SyntaxTreeNode) -> None:
+ """Render a code fence with ``:`` colon delimiters."""
+
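+        # Illustrative example: an outer fence such as::
+        #
+        #     ::::{admonition} Outer
+        #     :::{note}
+        #     Inner
+        #     :::
+        #     ::::
+        #
+        # has content beginning with ``:::``, handled below.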
+ if token.content.startswith(":::"):
+            # the content itself starts with a nested colon fence;
+            # prepend a newline so its opening ``:::`` is not parsed
+            # as a ``:option:`` line
+ assert token.token is not None, '"colon_fence" must have a `token`'
+ linear_token = token.token.copy()
+ linear_token.content = "\n" + linear_token.content
+ token.token = linear_token
+
+ return self.render_fence(token)
+
+ def render_dl(self, token: SyntaxTreeNode) -> None:
+ """Render a definition list."""
+ node = nodes.definition_list(classes=["simple", "myst"])
+ self.add_line_and_source_path(node, token)
+ with self.current_node_context(node, append=True):
+ item = None
+ for child in token.children or []:
+ if child.type == "dt":
+ item = nodes.definition_list_item()
+ self.add_line_and_source_path(item, child)
+ with self.current_node_context(item, append=True):
+ term = nodes.term(
+ child.children[0].content if child.children else ""
+ )
+ self.add_line_and_source_path(term, child)
+ with self.current_node_context(term, append=True):
+ self.render_children(child)
+ elif child.type == "dd":
+ if item is None:
+ error = self.reporter.error(
+ (
+ "Found a definition in a definition list, "
+ "with no preceding term"
+ ),
+ # nodes.literal_block(content, content),
+ line=token_line(child),
+ )
+ self.current_node += [error]
+ with self.current_node_context(item):
+ definition = nodes.definition()
+ self.add_line_and_source_path(definition, child)
+ with self.current_node_context(definition, append=True):
+ self.render_children(child)
+ else:
+ error_msg = self.reporter.error(
+ (
+ "Expected a term/definition as a child of a definition list"
+ f", but found a: {child.type}"
+ ),
+ # nodes.literal_block(content, content),
+ line=token_line(child),
+ )
+ self.current_node += [error_msg]
+
+ def render_field_list(self, token: SyntaxTreeNode) -> None:
+ """Render a field list."""
+ field_list = nodes.field_list(classes=["myst"])
+ self.add_line_and_source_path(field_list, token)
+ with self.current_node_context(field_list, append=True):
+ children = (token.children or [])[:]
+ while children:
+ child = children.pop(0)
+ if not child.type == "fieldlist_name":
+ error_msg = self.reporter.error(
+ (
+ "Expected a fieldlist_name as a child of a field_list"
+ f", but found a: {child.type}"
+ ),
+ # nodes.literal_block(content, content),
+ line=token_line(child),
+ )
+ self.current_node += [error_msg]
+ break
+ field = nodes.field()
+ self.add_line_and_source_path(field, child)
+ field_list += field
+ field_name = nodes.field_name()
+ self.add_line_and_source_path(field_name, child)
+ field += field_name
+ with self.current_node_context(field_name):
+ self.render_children(child)
+ field_body = nodes.field_body()
+            self.add_line_and_source_path(field_body, child)
+ field += field_body
+ if children and children[0].type == "fieldlist_body":
+ child = children.pop(0)
+ with self.current_node_context(field_body):
+ self.render_children(child)
+
+ def render_restructuredtext(self, token: SyntaxTreeNode) -> None:
+ """Render the content of the token as restructuredtext."""
+ # copy necessary elements (source, line no, env, reporter)
+ newdoc = make_document()
+ newdoc["source"] = self.document["source"]
+ newdoc.settings = self.document.settings
+ newdoc.reporter = self.reporter
+ # pad the line numbers artificially so they offset with the fence block
+ pseudosource = ("\n" * token_line(token)) + token.content
+ # actually parse the rst into our document
+ MockRSTParser().parse(pseudosource, newdoc)
+ for node in newdoc:
+ if node["names"]:
+ self.document.note_explicit_target(node, node)
+ self.current_node.extend(newdoc.children)
+
+ def render_directive(self, token: SyntaxTreeNode) -> None:
+ """Render special fenced code blocks as directives."""
+ first_line = token.info.split(maxsplit=1)
+ name = first_line[0][1:-1]
+ arguments = "" if len(first_line) == 1 else first_line[1]
+ content = token.content
+ position = token_line(token)
+ nodes_list = self.run_directive(name, arguments, content, position)
+ self.current_node += nodes_list
+
+ def run_directive(
+ self, name: str, first_line: str, content: str, position: int
+ ) -> list[nodes.Element]:
+ """Run a directive and return the generated nodes.
+
+ :param name: the name of the directive
+ :param first_line: The text on the same line as the directive name.
+            May be an argument or body text, depending on the directive
+ :param content: All text after the first line. Can include options.
+ :param position: The line number of the first line
+
+ """
+ # TODO directive name white/black lists
+
+ self.document.current_line = position
+
+ # get directive class
+ output: tuple[Directive, list] = directives.directive(
+ name, self.language_module_rst, self.document
+ )
+ directive_class, messages = output
+ if not directive_class:
+ error = self.reporter.error(
+ f'Unknown directive type "{name}".\n',
+ # nodes.literal_block(content, content),
+ line=position,
+ )
+ return [error] + messages
+
+ if issubclass(directive_class, Include):
+ # this is a Markdown only option,
+ # to allow for altering relative image reference links
+ directive_class.option_spec["relative-images"] = directives.flag
+ directive_class.option_spec["relative-docs"] = directives.path
+
+ try:
+ arguments, options, body_lines, content_offset = parse_directive_text(
+ directive_class, first_line, content
+ )
+ except DirectiveParsingError as error:
+ error = self.reporter.error(
+ f"Directive '{name}': {error}",
+ nodes.literal_block(content, content),
+ line=position,
+ )
+ return [error]
+
+ # initialise directive
+ if issubclass(directive_class, Include):
+ directive_instance = MockIncludeDirective(
+ self,
+ name=name,
+ klass=directive_class,
+ arguments=arguments,
+ options=options,
+ body=body_lines,
+ lineno=position,
+ )
+ else:
+ state_machine = MockStateMachine(self, position)
+ state = MockState(self, state_machine, position)
+ directive_instance = directive_class(
+ name=name,
+ # the list of positional arguments
+ arguments=arguments,
+ # a dictionary mapping option names to values
+ options=options,
+ # the directive content line by line
+ content=StringList(body_lines, self.document["source"]),
+ # the absolute line number of the first line of the directive
+ lineno=position,
+ # the line offset of the first line of the content
+ content_offset=content_offset,
+                # a string containing the entire directive (here, just the body)
+ block_text="\n".join(body_lines),
+ state=state,
+ state_machine=state_machine,
+ )
+
+ # run directive
+ try:
+ result = directive_instance.run()
+ except DirectiveError as error:
+ msg_node = self.reporter.system_message(
+ error.level, error.msg, line=position
+ )
+ msg_node += nodes.literal_block(content, content)
+ result = [msg_node]
+ except MockingError as exc:
+ error_msg = self.reporter.error(
+ "Directive '{}' cannot be mocked: {}: {}".format(
+ name, exc.__class__.__name__, exc
+ ),
+ nodes.literal_block(content, content),
+ line=position,
+ )
+ return [error_msg]
+
+ assert isinstance(
+ result, list
+ ), f'Directive "{name}" must return a list of nodes.'
+        for i, item in enumerate(result):
+            assert isinstance(
+                item, nodes.Node
+            ), f'Directive "{name}" returned non-Node object (index {i}): {item}'
+ return result
+
+ def render_substitution_inline(self, token: SyntaxTreeNode) -> None:
+ """Render inline substitution {{key}}."""
+ self.render_substitution(token, inline=True)
+
+ def render_substitution_block(self, token: SyntaxTreeNode) -> None:
+ """Render block substitution {{key}}."""
+ self.render_substitution(token, inline=False)
+
+ def render_substitution(self, token: SyntaxTreeNode, inline: bool) -> None:
+ """Substitutions are rendered by:
+
+ 1. Combining global substitutions with front-matter substitutions
+ to create a variable context (front-matter takes priority)
+ 2. Add the sphinx `env` to the variable context (if available)
+ 3. Create the string content with Jinja2 (passing it the variable context)
+ 4. If the substitution is inline and not a directive,
+ parse to nodes ignoring block syntaxes (like lists or block-quotes),
+ otherwise parse to nodes with all syntax rules.
+
+ """
+ position = token_line(token)
+
+ # front-matter substitutions take priority over config ones
+ variable_context: dict[str, Any] = {**self.md_config.substitutions}
+ if self.sphinx_env is not None:
+ variable_context["env"] = self.sphinx_env
+
+ # fail on undefined variables
+ env = jinja2.Environment(undefined=jinja2.StrictUndefined)
+
+ # try rendering
+ try:
+ rendered = env.from_string(f"{{{{{token.content}}}}}").render(
+ variable_context
+ )
+ except Exception as error:
+ error_msg = self.reporter.error(
+ f"Substitution error:{error.__class__.__name__}: {error}",
+ line=position,
+ )
+ self.current_node += [error_msg]
+ return
+
+ # handle circular references
+ ast = env.parse(f"{{{{{token.content}}}}}")
+ references = {
+ n.name for n in ast.find_all(jinja2.nodes.Name) if n.name != "env"
+ }
+ self.document.sub_references = getattr(self.document, "sub_references", set())
+ cyclic = references.intersection(self.document.sub_references)
+ if cyclic:
+ error_msg = self.reporter.error(
+ f"circular substitution reference: {cyclic}",
+ line=position,
+ )
+ self.current_node += [error_msg]
+ return
+
+ # TODO improve error reporting;
+ # at present, for a multi-line substitution,
+ # an error may point to a line lower than the substitution
+ # should it point to the source of the substitution?
+        # or the error message should at least indicate that it's a substitution
+
+ # we record used references before nested parsing, then remove them after
+ self.document.sub_references.update(references)
+ try:
+ if inline and not REGEX_DIRECTIVE_START.match(rendered):
+ self.nested_render_text(rendered, position, inline=True)
+ else:
+ self.nested_render_text(rendered, position, allow_headings=False)
+ finally:
+ self.document.sub_references.difference_update(references)
+
+
+def html_meta_to_nodes(
+ data: dict[str, Any], document: nodes.document, line: int, reporter: Reporter
+) -> list[nodes.pending | nodes.system_message]:
+ """Replicate the `meta` directive,
+ by converting a dictionary to a list of pending meta nodes
+
+ See:
+ https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#html-metadata
+ """
+ if not data:
+ return []
+
+ try:
+ # if sphinx available
+ from sphinx.addnodes import meta as meta_cls
+ except ImportError:
+ try:
+ # docutils >= 0.19
+ meta_cls = nodes.meta # type: ignore
+ except AttributeError:
+ from docutils.parsers.rst.directives.html import MetaBody
+
+ meta_cls = MetaBody.meta # type: ignore
+
+ output = []
+
+ for key, value in data.items():
+ content = str(value or "")
+ meta_node = meta_cls(content)
+ meta_node.source = document["source"]
+ meta_node.line = line
+ meta_node["content"] = content
+ try:
+ if not content:
+ raise ValueError("No content")
+ for i, key_part in enumerate(key.split()):
+ if "=" not in key_part and i == 0:
+ meta_node["name"] = key_part
+ continue
+ if "=" not in key_part:
+ raise ValueError(f"no '=' in {key_part}")
+ attr_name, attr_val = key_part.split("=", 1)
+ if not (attr_name and attr_val):
+ raise ValueError(f"malformed {key_part}")
+ meta_node[attr_name.lower()] = attr_val
+ except ValueError as error:
+ msg = reporter.error(f'Error parsing meta tag attribute "{key}": {error}.')
+ output.append(msg)
+ continue
+
+ pending = nodes.pending(
+ Filter,
+ {"component": "writer", "format": "html", "nodes": [meta_node]},
+ )
+ document.note_pending(pending)
+ output.append(pending)
+
+ return output
diff --git a/myst_parser/mdit_to_docutils/html_to_nodes.py b/myst_parser/mdit_to_docutils/html_to_nodes.py
new file mode 100644
index 0000000..2cc3066
--- /dev/null
+++ b/myst_parser/mdit_to_docutils/html_to_nodes.py
@@ -0,0 +1,139 @@
+"""Convert HTML to docutils nodes."""
+from __future__ import annotations
+
+import re
+from typing import TYPE_CHECKING
+
+from docutils import nodes
+
+from myst_parser.parsers.parse_html import Data, tokenize_html
+
+if TYPE_CHECKING:
+ from .base import DocutilsRenderer
+
+
+def make_error(
+ document: nodes.document, error_msg: str, text: str, line_number: int
+) -> nodes.system_message:
+ return document.reporter.error(
+ error_msg,
+ nodes.literal_block(text, text),
+ line=line_number,
+ )
+
+
+OPTION_KEYS_IMAGE = {"class", "alt", "height", "width", "align", "name"}
+# note: docutils also has scale and target
+
+OPTION_KEYS_ADMONITION = {"class", "name"}
+
+# See https://github.com/micromark/micromark-extension-gfm-tagfilter
+RE_FLOW = re.compile(
+ r"<(\/?)(iframe|noembed|noframes|plaintext|script|style|title|textarea|xmp)(?=[\t\n\f\r />])",
+ re.IGNORECASE,
+)
+
+
+def default_html(text: str, source: str, line_number: int) -> list[nodes.Element]:
+ raw_html = nodes.raw("", text, format="html")
+ raw_html.source = source
+ raw_html.line = line_number
+ return [raw_html]
+
+
+def html_to_nodes(
+ text: str, line_number: int, renderer: DocutilsRenderer
+) -> list[nodes.Element]:
+ """Convert HTML to docutils nodes."""
+ if renderer.md_config.gfm_only:
+ text, _ = RE_FLOW.subn(lambda s: s.group(0).replace("<", "&lt;"), text)
+
+ enable_html_img = "html_image" in renderer.md_config.enable_extensions
+ enable_html_admonition = "html_admonition" in renderer.md_config.enable_extensions
+ if not (enable_html_img or enable_html_admonition):
+ return default_html(text, renderer.document["source"], line_number)
+
+ # parse the HTML to AST
+ try:
+ root = tokenize_html(text).strip(inplace=True, recurse=False)
+ except Exception:
+ msg_node = renderer.create_warning(
+ "HTML could not be parsed", line=line_number, subtype="html"
+ )
+ return ([msg_node] if msg_node else []) + default_html(
+ text, renderer.document["source"], line_number
+ )
+
+ if len(root) < 1:
+ # if empty
+ return default_html(text, renderer.document["source"], line_number)
+
+ if not all(
+ (enable_html_img and child.name == "img")
+ or (
+ enable_html_admonition
+ and child.name == "div"
+ and "admonition" in child.attrs.classes
+ )
+ for child in root
+ ):
+ return default_html(text, renderer.document["source"], line_number)
+
+ nodes_list = []
+ for child in root:
+
+ if child.name == "img":
+ if "src" not in child.attrs:
+ return [
+ renderer.reporter.error(
+ "<img> missing 'src' attribute", line=line_number
+ )
+ ]
+ content = "\n".join(
+ f":{k}: {v}"
+ for k, v in sorted(child.attrs.items())
+ if k in OPTION_KEYS_IMAGE
+ )
+ nodes_list.extend(
+ renderer.run_directive(
+ "image", child.attrs["src"], content, line_number
+ )
+ )
+
+ else:
+ children = child.strip().children
+ if (
+ children
+ and children[0].name in ("div", "p")
+ and (
+ "title" in children[0].attrs.classes
+ or "admonition-title" in children[0].attrs.classes
+ )
+ ):
+ title = "".join(child.render() for child in children.pop(0))
+ else:
+ title = "Note"
+
+ options = "\n".join(
+ f":{k}: {v}"
+ for k, v in sorted(child.attrs.items())
+ if k in OPTION_KEYS_ADMONITION
+ ).rstrip()
+ new_children = []
+            # note: a distinct loop variable avoids shadowing the outer ``child``
+            for sub_child in children:
+                if sub_child.name == "p":
+                    new_children.extend(sub_child.children)
+                    new_children.append(Data("\n\n"))
+                else:
+                    new_children.append(sub_child)
+ content = (
+ options
+ + ("\n\n" if options else "")
+ + "".join(child.render() for child in new_children).lstrip()
+ )
+
+ nodes_list.extend(
+ renderer.run_directive("admonition", title, content, line_number)
+ )
+
+ return nodes_list
diff --git a/myst_parser/mdit_to_docutils/sphinx_.py b/myst_parser/mdit_to_docutils/sphinx_.py
new file mode 100644
index 0000000..3c1bc23
--- /dev/null
+++ b/myst_parser/mdit_to_docutils/sphinx_.py
@@ -0,0 +1,245 @@
+"""Convert Markdown-it tokens to docutils nodes, including sphinx specific elements."""
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from typing import cast
+from urllib.parse import unquote
+from uuid import uuid4
+
+from docutils import nodes
+from markdown_it.tree import SyntaxTreeNode
+from sphinx import addnodes
+from sphinx.domains.math import MathDomain
+from sphinx.domains.std import StandardDomain
+from sphinx.environment import BuildEnvironment
+from sphinx.util import logging
+from sphinx.util.nodes import clean_astext
+
+from myst_parser.mdit_to_docutils.base import DocutilsRenderer
+
+LOGGER = logging.getLogger(__name__)
+
+
+def create_warning(
+ document: nodes.document,
+ message: str,
+ *,
+ line: int | None = None,
+ append_to: nodes.Element | None = None,
+ wtype: str = "myst",
+ subtype: str = "other",
+) -> nodes.system_message | None:
+ """Generate a warning, logging it if necessary.
+
+ If the warning type is listed in the ``suppress_warnings`` configuration,
+ then ``None`` will be returned and no warning logged.
+ """
+ message = f"{message} [{wtype}.{subtype}]"
+ kwargs = {"line": line} if line is not None else {}
+
+ if logging.is_suppressed_warning(
+ wtype, subtype, document.settings.env.app.config.suppress_warnings
+ ):
+ return None
+
+ msg_node = document.reporter.warning(message, **kwargs)
+ if append_to is not None:
+ append_to.append(msg_node)
+
+    return msg_node
+
+
+class SphinxRenderer(DocutilsRenderer):
+ """A markdown-it-py renderer to populate (in-place) a `docutils.document` AST.
+
+    This is a sub-class of `DocutilsRenderer` that handles sphinx-specific aspects,
+ such as cross-referencing.
+ """
+
+ @property
+ def doc_env(self) -> BuildEnvironment:
+ return self.document.settings.env
+
+ def create_warning(
+ self,
+ message: str,
+ *,
+ line: int | None = None,
+ append_to: nodes.Element | None = None,
+ wtype: str = "myst",
+ subtype: str = "other",
+ ) -> nodes.system_message | None:
+ """Generate a warning, logging it if necessary.
+
+ If the warning type is listed in the ``suppress_warnings`` configuration,
+ then ``None`` will be returned and no warning logged.
+ """
+ return create_warning(
+ self.document,
+ message,
+ line=line,
+ append_to=append_to,
+ wtype=wtype,
+ subtype=subtype,
+ )
+
+ def render_internal_link(self, token: SyntaxTreeNode) -> None:
+ """Render link token `[text](link "title")`,
+ where the link has not been identified as an external URL.
+ """
+ destination = unquote(cast(str, token.attrGet("href") or ""))
+
+ # make the path relative to an "including" document
+ # this is set when using the `relative-docs` option of the MyST `include` directive
+ relative_include = self.md_env.get("relative-docs", None)
+ if relative_include is not None and destination.startswith(relative_include[0]):
+ source_dir, include_dir = relative_include[1:]
+ destination = os.path.relpath(
+ os.path.join(include_dir, os.path.normpath(destination)), source_dir
+ )
+
+ potential_path = (
+ Path(self.doc_env.doc2path(self.doc_env.docname)).parent / destination
+ if self.doc_env.srcdir # not set in some test situations
+ else None
+ )
+ if (
+ potential_path
+ and potential_path.is_file()
+ and not any(
+ destination.endswith(suffix)
+ for suffix in self.doc_env.config.source_suffix
+ )
+ ):
+ wrap_node = addnodes.download_reference(
+ refdoc=self.doc_env.docname,
+ reftarget=destination,
+ reftype="myst",
+ refdomain=None, # Added to enable cross-linking
+ refexplicit=len(token.children or []) > 0,
+ refwarn=False,
+ )
+ classes = ["xref", "download", "myst"]
+ text = destination if not token.children else ""
+ else:
+ wrap_node = addnodes.pending_xref(
+ refdoc=self.doc_env.docname,
+ reftarget=destination,
+ reftype="myst",
+ refdomain=None, # Added to enable cross-linking
+ refexplicit=len(token.children or []) > 0,
+ refwarn=True,
+ )
+ classes = ["xref", "myst"]
+ text = ""
+
+ self.add_line_and_source_path(wrap_node, token)
+ title = token.attrGet("title")
+ if title:
+ wrap_node["title"] = title
+ self.current_node.append(wrap_node)
+
+ inner_node = nodes.inline("", text, classes=classes)
+ wrap_node.append(inner_node)
+ with self.current_node_context(inner_node):
+ self.render_children(token)
+
+ def render_heading(self, token: SyntaxTreeNode) -> None:
+ """This extends the docutils method, to allow for the addition of heading ids.
+ These ids are computed by the ``markdown-it-py`` ``anchors_plugin``
+ as "slugs" which are unique to a document.
+
+ The approach is similar to ``sphinx.ext.autosectionlabel``
+ """
+ super().render_heading(token)
+
+ if not isinstance(self.current_node, nodes.section):
+ return
+
+ # create the slug string
+ slug = cast(str, token.attrGet("id"))
+ if slug is None:
+ return
+
+ section = self.current_node
+ doc_slug = self.doc_env.doc2path(self.doc_env.docname, base=False) + "#" + slug
+
+ # save the reference in the standard domain, so that it can be handled properly
+ domain = cast(StandardDomain, self.doc_env.get_domain("std"))
+ if doc_slug in domain.labels:
+ other_doc = self.doc_env.doc2path(domain.labels[doc_slug][0])
+ self.create_warning(
+ f"duplicate label {doc_slug}, other instance in {other_doc}",
+ line=section.line,
+ subtype="anchor",
+ )
+ labelid = section["ids"][0]
+ domain.anonlabels[doc_slug] = self.doc_env.docname, labelid
+ domain.labels[doc_slug] = (
+ self.doc_env.docname,
+ labelid,
+ clean_astext(section[0]),
+ )
+
+ self.doc_env.metadata[self.doc_env.docname]["myst_anchors"] = True
+ section["myst-anchor"] = doc_slug
+
+ def render_math_block_label(self, token: SyntaxTreeNode) -> None:
+ """Render math with referencable labels, e.g. ``$a=1$ (label)``."""
+ label = token.info
+ content = token.content
+ node = nodes.math_block(
+ content, content, nowrap=False, number=None, label=label
+ )
+ target = self.add_math_target(node)
+ self.add_line_and_source_path(target, token)
+ self.current_node.append(target)
+ self.add_line_and_source_path(node, token)
+ self.current_node.append(node)
+
+ def _random_label(self) -> str:
+ return str(uuid4())
+
+ def render_amsmath(self, token: SyntaxTreeNode) -> None:
+ """Renderer for the amsmath extension."""
+ # environment = token.meta["environment"]
+ content = token.content
+
+ if token.meta["numbered"] != "*":
+ # TODO how to parse and reference labels within environment?
+            # for now we create a unique label, so the equation will be numbered
+ # but there will be no reference clashes
+ label = self._random_label()
+ node = nodes.math_block(
+ content,
+ content,
+ nowrap=True,
+ number=None,
+ classes=["amsmath"],
+ label=label,
+ )
+ target = self.add_math_target(node)
+ self.add_line_and_source_path(target, token)
+ self.current_node.append(target)
+ else:
+ node = nodes.math_block(
+ content, content, nowrap=True, number=None, classes=["amsmath"]
+ )
+ self.add_line_and_source_path(node, token)
+ self.current_node.append(node)
+
+ def add_math_target(self, node: nodes.math_block) -> nodes.target:
+ # Code mainly copied from sphinx.directives.patches.MathDirective
+
+ # register label to domain
+ domain = cast(MathDomain, self.doc_env.get_domain("math"))
+ domain.note_equation(self.doc_env.docname, node["label"], location=node)
+ node["number"] = domain.get_equation_number_for(node["label"])
+ node["docname"] = self.doc_env.docname
+
+ # create target node
+ node_id = nodes.make_id("equation-%s" % node["label"])
+ target = nodes.target("", "", ids=[node_id])
+ self.document.note_explicit_target(target)
+ return target
diff --git a/myst_parser/mdit_to_docutils/utils.py b/myst_parser/mdit_to_docutils/utils.py
new file mode 100644
index 0000000..b31d8c7
--- /dev/null
+++ b/myst_parser/mdit_to_docutils/utils.py
@@ -0,0 +1,36 @@
+import html
+from typing import Iterable, Optional
+from urllib.parse import quote, urlparse
+
+
+def escape_url(raw: str) -> str:
+ """
+ Escape urls to prevent code injection craziness. (Hopefully.)
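+
+    For example (illustrative)::
+
+        escape_url('https://example.com/a b?q="x"')
+        # -> 'https://example.com/a%20b?q=%22x%22'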
+ """
+ return html.escape(quote(html.unescape(raw), safe="/#:()*?=%@+,&"))
+
+
+def is_external_url(
+ reference: str,
+ known_url_schemes: Optional[Iterable[str]],
+ match_fragment: bool = False,
+) -> bool:
+ """Return if a reference should be recognised as an external URL.
+
+ URLs are of the format: scheme://netloc/path;parameters?query#fragment
+
+ This checks if there is a url scheme (e.g. 'https') and, if so,
+    if the scheme is in the list of known_url_schemes (if supplied).
+
+ :param known_url_schemes: e.g. ["http", "https", "mailto"]
+ If None, match all schemes
+ :param match_fragment: If True and a fragment found, then True will be returned,
+ irrespective of a scheme match
+
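+    For example (illustrative)::
+
+        is_external_url("https://example.com", ["http", "https"])  # True
+        is_external_url("other.md", ["http", "https"])  # False
+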
+ """
+ url_check = urlparse(reference)
+ if known_url_schemes is not None:
+ scheme_known = url_check.scheme in known_url_schemes
+ else:
+ scheme_known = bool(url_check.scheme)
+ return scheme_known or (match_fragment and url_check.fragment != "")
diff --git a/myst_parser/mocking.py b/myst_parser/mocking.py
new file mode 100644
index 0000000..b22475d
--- /dev/null
+++ b/myst_parser/mocking.py
@@ -0,0 +1,514 @@
+"""This module provides classes to Mock the core components of the docutils.RSTParser,
+the key difference being that nested parsing treats the text as Markdown not rST.
+"""
+from __future__ import annotations
+
+import os
+import re
+import sys
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from docutils import nodes
+from docutils.parsers.rst import Directive, DirectiveError
+from docutils.parsers.rst import Parser as RSTParser
+from docutils.parsers.rst.directives.misc import Include
+from docutils.parsers.rst.states import Body, Inliner, RSTStateMachine
+from docutils.statemachine import StringList
+from docutils.utils import unescape
+
+from .parsers.directives import parse_directive_text
+
+if TYPE_CHECKING:
+ from .mdit_to_docutils.base import DocutilsRenderer
+
+
+class MockingError(Exception):
+ """An exception to signal an error during mocking of docutils components."""
+
+
+class MockInliner:
+ """A mock version of `docutils.parsers.rst.states.Inliner`.
+
+    This is passed to role functions.
+ """
+
+ def __init__(self, renderer: DocutilsRenderer):
+ """Initialize the mock inliner."""
+ self._renderer = renderer
+ # here we mock that the `parse` method has already been called
+ # which is where these attributes are set (via the RST state Memo)
+ self.document = renderer.document
+ self.reporter = renderer.document.reporter
+ self.language = renderer.language_module_rst
+ self.parent = renderer.current_node
+
+ if not hasattr(self.reporter, "get_source_and_line"):
+ # In docutils this is set by `RSTState.runtime_init`
+ self.reporter.get_source_and_line = lambda l: (self.document["source"], l)
+
+ self.rfc_url = "rfc%d.html"
+
+ def problematic(
+ self, text: str, rawsource: str, message: nodes.system_message
+ ) -> nodes.problematic:
+ """Record a system message from parsing."""
+ msgid = self.document.set_id(message, self.parent)
+ problematic = nodes.problematic(rawsource, text, refid=msgid)
+ prbid = self.document.set_id(problematic)
+ message.add_backref(prbid)
+ return problematic
+
+ def parse(
+ self, text: str, lineno: int, memo: Any, parent: nodes.Node
+ ) -> tuple[list[nodes.Node], list[nodes.system_message]]:
+ """Parse the text and return a list of nodes."""
+ # note the only place this is normally called,
+ # is by `RSTState.inline_text`, or in directives: `self.state.inline_text`,
+ # and there the state parses its own parent
+ # self.reporter = memo.reporter
+ # self.document = memo.document
+ # self.language = memo.language
+ with self._renderer.current_node_context(parent):
+ # the parent is never actually appended to though,
+ # so we make a temporary parent to parse into
+ container = nodes.Element()
+ with self._renderer.current_node_context(container):
+ self._renderer.nested_render_text(text, lineno, inline=True)
+
+ return container.children, []
+
+ def __getattr__(self, name: str):
+ """This method is only be called if the attribute requested has not
+ been defined. Defined attributes will not be overridden.
+ """
+ # TODO use document.reporter mechanism?
+ if hasattr(Inliner, name):
+ msg = "{cls} has not yet implemented attribute '{name}'".format(
+ cls=type(self).__name__, name=name
+ )
+ raise MockingError(msg).with_traceback(sys.exc_info()[2])
+ msg = f"{type(self).__name__} has no attribute {name}"
+ raise MockingError(msg).with_traceback(sys.exc_info()[2])
+
+
+class MockState:
+ """A mock version of `docutils.parsers.rst.states.RSTState`.
+
+    This is passed to the `Directive.run()` method,
+    so that directives may run nested parses on their content,
+    which will be parsed as Markdown rather than RST.
+ """
+
+ def __init__(
+ self,
+ renderer: DocutilsRenderer,
+ state_machine: MockStateMachine,
+ lineno: int,
+ ):
+ self._renderer = renderer
+ self._lineno = lineno
+ self.document = renderer.document
+ self.reporter = renderer.document.reporter
+ self.state_machine = state_machine
+ self.inliner = MockInliner(renderer)
+
+ class Struct:
+ document = self.document
+ reporter = self.document.reporter
+ language = renderer.language_module_rst
+ title_styles: list[str] = []
+ section_level = max(renderer._level_to_elem)
+ section_bubble_up_kludge = False
+ inliner = self.inliner
+
+ self.memo = Struct
+
+ def parse_directive_block(
+ self,
+ content: StringList,
+ line_offset: int,
+ directive: type[Directive],
+ option_presets: dict,
+ ) -> tuple[list, dict, StringList, int]:
+ """Parse the full directive text
+
+ :returns: (arguments, options, content, content_offset)
+ """
+ if option_presets:
+ raise MockingError("parse_directive_block: option_presets not implemented")
+ # TODO should argument_str always be ""?
+ arguments, options, body_lines, content_offset = parse_directive_text(
+ directive, "", "\n".join(content)
+ )
+ return (
+ arguments,
+ options,
+ StringList(body_lines, source=content.source),
+ line_offset + content_offset,
+ )
+
+ def nested_parse(
+ self,
+ block: StringList,
+ input_offset: int,
+ node: nodes.Element,
+ match_titles: bool = False,
+ state_machine_class=None,
+ state_machine_kwargs=None,
+ ) -> None:
+ """Perform a nested parse of the input block, with ``node`` as the parent.
+
+ :param block: The block of lines to parse.
+ :param input_offset: The offset of the first line of block,
+ to the starting line of the state (i.e. directive).
+ :param node: The parent node to attach the parsed content to.
+        :param match_titles: Whether to allow the parsing of headings
+            (normally this is false,
+            since nested headings would break the document structure)
+ """
+ sm_match_titles = self.state_machine.match_titles
+ with self._renderer.current_node_context(node):
+ self._renderer.nested_render_text(
+ "\n".join(block),
+ self._lineno + input_offset,
+ allow_headings=match_titles,
+ )
+ self.state_machine.match_titles = sm_match_titles
+
+ def parse_target(self, block, block_text, lineno: int):
+ """
+ Taken from https://github.com/docutils-mirror/docutils/blob/e88c5fb08d5cdfa8b4ac1020dd6f7177778d5990/docutils/parsers/rst/states.py#L1927 # noqa: E501
+ """
+ # Commenting out this code because it only applies to rST
+ # if block and block[-1].strip()[-1:] == "_": # possible indirect target
+ # reference = " ".join([line.strip() for line in block])
+ # refname = self.is_reference(reference)
+ # if refname:
+ # return "refname", refname
+ reference = "".join(["".join(line.split()) for line in block])
+ return "refuri", unescape(reference)
+
+ def inline_text(
+ self, text: str, lineno: int
+ ) -> tuple[list[nodes.Element], list[nodes.Element]]:
+ """Parse text with only inline rules.
+
+ :returns: (list of nodes, list of messages)
+ """
+ return self.inliner.parse(text, lineno, self.memo, self._renderer.current_node)
+
+ # U+2014 is an em-dash:
+ attribution_pattern = re.compile("^((?:---?(?!-)|\u2014) *)(.+)")
+
+ def block_quote(self, lines: list[str], line_offset: int) -> list[nodes.Element]:
+ """Parse a block quote, which is a block of text,
+ followed by an (optional) attribution.
+
+ ::
+
+ No matter where you go, there you are.
+
+ -- Buckaroo Banzai
+ """
+ elements = []
+ # split attribution
+ last_line_blank = False
+ blockquote_lines = lines
+ attribution_lines = []
+ attribution_line_offset = None
+ # First line after a blank line must begin with a dash
+ for i, line in enumerate(lines):
+ if not line.strip():
+ last_line_blank = True
+ continue
+            if not last_line_blank:
+                continue
+ last_line_blank = False
+ match = self.attribution_pattern.match(line)
+ if not match:
+ continue
+ attribution_line_offset = i
+ attribution_lines = [match.group(2)]
+ for at_line in lines[i + 1 :]:
+ indented_line = at_line[len(match.group(1)) :]
+ if len(indented_line) != len(at_line.lstrip()):
+ break
+ attribution_lines.append(indented_line)
+ blockquote_lines = lines[:i]
+ break
+ # parse block
+ blockquote = nodes.block_quote()
+ self.nested_parse(blockquote_lines, line_offset, blockquote)
+ elements.append(blockquote)
+ # parse attribution
+ if attribution_lines:
+ attribution_text = "\n".join(attribution_lines)
+ lineno = self._lineno + line_offset + (attribution_line_offset or 0)
+ textnodes, messages = self.inline_text(attribution_text, lineno)
+ attribution = nodes.attribution(attribution_text, "", *textnodes)
+ (
+ attribution.source,
+ attribution.line,
+ ) = self.state_machine.get_source_and_line(lineno)
+ blockquote += attribution
+ elements += messages
+ return elements
+
+ def build_table(self, tabledata, tableline, stub_columns: int = 0, widths=None):
+ return Body.build_table(self, tabledata, tableline, stub_columns, widths)
+
+ def build_table_row(self, rowdata, tableline):
+ return Body.build_table_row(self, rowdata, tableline)
+
+ def __getattr__(self, name: str):
+ """This method is only be called if the attribute requested has not
+ been defined. Defined attributes will not be overridden.
+ """
+ cls = type(self).__name__
+ if hasattr(Body, name):
+ msg = (
+ f"{cls} has not yet implemented attribute '{name}'. "
+ "You can parse RST directly via the `{eval-rst}` directive: "
+ "https://myst-parser.readthedocs.io/en/latest/syntax/syntax.html#how-directives-parse-content" # noqa: E501
+ )
+ else:
+ # The requested `name` is not a docutils Body element
+ # (such as "footnote", "block_quote", "paragraph", …)
+ msg = f"{cls} has no attribute '{name}'"
+ raise MockingError(msg).with_traceback(sys.exc_info()[2])
+
+
+class MockStateMachine:
+ """A mock version of `docutils.parsers.rst.states.RSTStateMachine`.
+
+    This is passed to the `Directive.run()` method.
+ """
+
+ def __init__(self, renderer: DocutilsRenderer, lineno: int):
+ self._renderer = renderer
+ self._lineno = lineno
+ self.document = renderer.document
+ self.language = renderer.language_module_rst
+ self.reporter = self.document.reporter
+ self.node: nodes.Element = renderer.current_node
+ self.match_titles: bool = True
+
+ def get_source(self, lineno: int | None = None):
+ """Return document source path."""
+ return self.document["source"]
+
+ def get_source_and_line(self, lineno: int | None = None):
+ """Return (source path, line) tuple for current or given line number."""
+ return self.document["source"], lineno or self._lineno
+
+ def __getattr__(self, name: str):
+ """This method is only be called if the attribute requested has not
+ been defined. Defined attributes will not be overridden.
+ """
+ if hasattr(RSTStateMachine, name):
+ msg = "{cls} has not yet implemented attribute '{name}'".format(
+ cls=type(self).__name__, name=name
+ )
+ raise MockingError(msg).with_traceback(sys.exc_info()[2])
+ msg = f"{type(self).__name__} has no attribute {name}"
+ raise MockingError(msg).with_traceback(sys.exc_info()[2])
+
+
+class MockIncludeDirective:
+ """This directive uses a lot of statemachine logic that is not yet mocked.
+ Therefore, we treat it as a special case (at least for now).
+
+ See:
+ https://docutils.sourceforge.io/docs/ref/rst/directives.html#including-an-external-document-fragment
+ """
+
+ def __init__(
+ self,
+ renderer: DocutilsRenderer,
+ name: str,
+ klass: Include,
+ arguments: list,
+ options: dict,
+ body: list[str],
+ lineno: int,
+ ):
+ self.renderer = renderer
+ self.document = renderer.document
+ self.name = name
+ self.klass = klass
+ self.arguments = arguments
+ self.options = options
+ self.body = body
+ self.lineno = lineno
+
+ def run(self) -> list[nodes.Element]:
+
+ from docutils.parsers.rst.directives.body import CodeBlock, NumberLines
+
+ if not self.document.settings.file_insertion_enabled:
+ raise DirectiveError(2, f'Directive "{self.name}" disabled.')
+
+ source_dir = Path(self.document["source"]).absolute().parent
+ include_arg = "".join([s.strip() for s in self.arguments[0].splitlines()])
+
+ if include_arg.startswith("<") and include_arg.endswith(">"):
+            # docutils "standard" includes
+ path = Path(self.klass.standard_include_path).joinpath(include_arg[1:-1])
+ else:
+ # if using sphinx interpret absolute paths "correctly",
+ # i.e. relative to source directory
+ try:
+ sphinx_env = self.document.settings.env
+ except AttributeError:
+ pass
+ else:
+ _, include_arg = sphinx_env.relfn2path(self.arguments[0])
+ sphinx_env.note_included(include_arg)
+ path = Path(include_arg)
+ path = source_dir.joinpath(path)
+ # this ensures that the parent file is rebuilt if the included file changes
+ self.document.settings.record_dependencies.add(str(path))
+
+ # read file
+ encoding = self.options.get("encoding", self.document.settings.input_encoding)
+ error_handler = self.document.settings.input_encoding_error_handler
+ # tab_width = self.options.get("tab-width", self.document.settings.tab_width)
+ try:
+ file_content = path.read_text(encoding=encoding, errors=error_handler)
+ except Exception as error:
+ raise DirectiveError(
+ 4,
+ 'Directive "{}": error reading file: {}\n{}.'.format(
+ self.name, path, error
+ ),
+ )
+
+ # get required section of text
+ startline = self.options.get("start-line", None)
+ endline = self.options.get("end-line", None)
+ file_content = "\n".join(file_content.splitlines()[startline:endline])
+ startline = startline or 0
+ for split_on_type in ["start-after", "end-before"]:
+ split_on = self.options.get(split_on_type, None)
+ if not split_on:
+ continue
+ split_index = file_content.find(split_on)
+ if split_index < 0:
+ raise DirectiveError(
+ 4,
+ 'Directive "{}"; option "{}": text not found "{}".'.format(
+ self.name, split_on_type, split_on
+ ),
+ )
+ if split_on_type == "start-after":
+ startline += split_index + len(split_on)
+ file_content = file_content[split_index + len(split_on) :]
+ else:
+ file_content = file_content[:split_index]
+
+ if "literal" in self.options:
+ literal_block = nodes.literal_block(
+ file_content, source=str(path), classes=self.options.get("class", [])
+ )
+            literal_block.line = 1  # TODO don't think this should be 1?
+ self.add_name(literal_block)
+ if "number-lines" in self.options:
+ try:
+ startline = int(self.options["number-lines"] or 1)
+ except ValueError:
+ raise DirectiveError(
+ 3, ":number-lines: with non-integer " "start value"
+ )
+ endline = startline + len(file_content.splitlines())
+ if file_content.endswith("\n"):
+ file_content = file_content[:-1]
+ tokens = NumberLines([([], file_content)], startline, endline)
+ for classes, value in tokens:
+ if classes:
+ literal_block += nodes.inline(value, value, classes=classes)
+ else:
+ literal_block += nodes.Text(value)
+ else:
+ literal_block += nodes.Text(file_content)
+ return [literal_block]
+ if "code" in self.options:
+ self.options["source"] = str(path)
+ state_machine = MockStateMachine(self.renderer, self.lineno)
+ state = MockState(self.renderer, state_machine, self.lineno)
+ codeblock = CodeBlock(
+ name=self.name,
+ arguments=[self.options.pop("code")],
+ options=self.options,
+ content=file_content.splitlines(),
+ lineno=self.lineno,
+ content_offset=0,
+ block_text=file_content,
+ state=state,
+ state_machine=state_machine,
+ )
+ return codeblock.run()
+
+ # Here we perform a nested render, but temporarily setup the document/reporter
+ # with the correct document path and lineno for the included file.
+ source = self.renderer.document["source"]
+ rsource = self.renderer.reporter.source
+ line_func = getattr(self.renderer.reporter, "get_source_and_line", None)
+ try:
+ self.renderer.document["source"] = str(path)
+ self.renderer.reporter.source = str(path)
+ self.renderer.reporter.get_source_and_line = lambda l: (str(path), l)
+ if "relative-images" in self.options:
+ self.renderer.md_env["relative-images"] = os.path.relpath(
+ path.parent, source_dir
+ )
+ if "relative-docs" in self.options:
+ self.renderer.md_env["relative-docs"] = (
+ self.options["relative-docs"],
+ source_dir,
+ path.parent,
+ )
+ self.renderer.nested_render_text(
+ file_content, startline + 1, allow_headings=True
+ )
+ finally:
+ self.renderer.document["source"] = source
+ self.renderer.reporter.source = rsource
+ self.renderer.md_env.pop("relative-images", None)
+ self.renderer.md_env.pop("relative-docs", None)
+ if line_func is not None:
+ self.renderer.reporter.get_source_and_line = line_func
+ else:
+ del self.renderer.reporter.get_source_and_line
+ return []
+
+ def add_name(self, node: nodes.Element):
+ """Append self.options['name'] to node['names'] if it exists.
+
+ Also normalize the name string and register it as explicit target.
+ """
+ if "name" in self.options:
+ name = nodes.fully_normalize_name(self.options.pop("name"))
+ if "name" in node:
+ del node["name"]
+ node["names"].append(name)
+ self.renderer.document.note_explicit_target(node, node)
+
+
+class MockRSTParser(RSTParser):
+ """RSTParser which avoids a negative side effect."""
+
+ def parse(self, inputstring: str, document: nodes.document):
+ """Parse the input to populate the document AST."""
+ from docutils.parsers.rst import roles
+
+ should_restore = False
+ if "" in roles._roles:
+ should_restore = True
+ blankrole = roles._roles[""]
+
+ super().parse(inputstring, document)
+
+ if should_restore:
+ roles._roles[""] = blankrole
diff --git a/myst_parser/parsers/__init__.py b/myst_parser/parsers/__init__.py
new file mode 100644
index 0000000..26fbfca
--- /dev/null
+++ b/myst_parser/parsers/__init__.py
@@ -0,0 +1 @@
+"""Parsers of MyST Markdown source text to docutils AST."""
diff --git a/myst_parser/parsers/directives.py b/myst_parser/parsers/directives.py
new file mode 100644
index 0000000..5637254
--- /dev/null
+++ b/myst_parser/parsers/directives.py
@@ -0,0 +1,190 @@
+"""Fenced code blocks are parsed as directives,
+if the block starts with ``{directive_name}``,
+followed by arguments on the same line.
+
+Directive options are read from a YAML block,
+if the first content line starts with ``---``, e.g.
+
+::
+
+ ```{directive_name} arguments
+ ---
+ option1: name
+ option2: |
+ Longer text block
+ ---
+ content...
+ ```
+
+Or the option block will be parsed if the first content line starts with ``:``,
+as a YAML block consisting of every line that starts with a ``:``, e.g.
+
+::
+
+ ```{directive_name} arguments
+ :option1: name
+ :option2: other
+
+ content...
+ ```
+
+If the first line of a directive's content is blank, this will be stripped
+from the content.
+This is to allow for separation between the option block and content.
+
+"""
+from __future__ import annotations
+
+import datetime
+import re
+from textwrap import dedent
+from typing import Any, Callable
+
+import yaml
+from docutils.parsers.rst import Directive
+from docutils.parsers.rst.directives.misc import TestDirective
+
+
+class DirectiveParsingError(Exception):
+ """Raise on parsing/validation error."""
+
+ pass
+
+
+def parse_directive_text(
+ directive_class: type[Directive],
+ first_line: str,
+ content: str,
+ validate_options: bool = True,
+) -> tuple[list[str], dict, list[str], int]:
+ """Parse (and validate) the full directive text.
+
+ :param first_line: The text on the same line as the directive name.
+        May be an argument or body text, depending on the directive
+ :param content: All text after the first line. Can include options.
+ :param validate_options: Whether to validate the values of options
+
+ :returns: (arguments, options, body_lines, content_offset)
+ """
+ if directive_class.option_spec:
+ body, options = parse_directive_options(
+ content, directive_class, validate=validate_options
+ )
+ body_lines = body.splitlines()
+ content_offset = len(content.splitlines()) - len(body_lines)
+ else:
+ # If there are no possible options, we do not look for a YAML block
+ options = {}
+ body_lines = content.splitlines()
+ content_offset = 0
+
+ if not (directive_class.required_arguments or directive_class.optional_arguments):
+ # If there are no possible arguments, then the body starts on the argument line
+ if first_line:
+ body_lines.insert(0, first_line)
+ arguments = []
+ else:
+ arguments = parse_directive_arguments(directive_class, first_line)
+
+ # remove first line of body if blank
+ # this is to allow space between the options and the content
+ if body_lines and not body_lines[0].strip():
+ body_lines = body_lines[1:]
+ content_offset += 1
+
+ # check for body content
+ if body_lines and not directive_class.has_content:
+ raise DirectiveParsingError("No content permitted")
+
+ return arguments, options, body_lines, content_offset
+
+
+def parse_directive_options(
+ content: str, directive_class: type[Directive], validate: bool = True
+):
+ """Parse (and validate) the directive option section."""
+ options: dict[str, Any] = {}
+ if content.startswith("---"):
+ content = "\n".join(content.splitlines()[1:])
+ match = re.search(r"^-{3,}", content, re.MULTILINE)
+ if match:
+ yaml_block = content[: match.start()]
+ content = content[match.end() + 1 :] # TODO advance line number
+ else:
+ yaml_block = content
+ content = ""
+ yaml_block = dedent(yaml_block)
+ try:
+ options = yaml.safe_load(yaml_block) or {}
+ except (yaml.parser.ParserError, yaml.scanner.ScannerError) as error:
+ raise DirectiveParsingError("Invalid options YAML: " + str(error))
+ elif content.lstrip().startswith(":"):
+ content_lines = content.splitlines() # type: list
+ yaml_lines = []
+ while content_lines:
+ if not content_lines[0].lstrip().startswith(":"):
+ break
+ yaml_lines.append(content_lines.pop(0).lstrip()[1:])
+ yaml_block = "\n".join(yaml_lines)
+ content = "\n".join(content_lines)
+ try:
+ options = yaml.safe_load(yaml_block) or {}
+ except (yaml.parser.ParserError, yaml.scanner.ScannerError) as error:
+ raise DirectiveParsingError("Invalid options YAML: " + str(error))
+ if not isinstance(options, dict):
+ raise DirectiveParsingError(f"Invalid options (not dict): {options}")
+
+ if (not validate) or issubclass(directive_class, TestDirective):
+ # technically this directive spec only accepts one option ('option')
+        # but since it's for testing only we accept all options
+ return content, options
+
+ # check options against spec
+ options_spec: dict[str, Callable] = directive_class.option_spec
+ for name, value in list(options.items()):
+ try:
+ convertor = options_spec[name]
+ except KeyError:
+ raise DirectiveParsingError(f"Unknown option: {name}")
+ if not isinstance(value, str):
+ if value is True or value is None:
+ value = None # flag converter requires no argument
+ elif isinstance(value, (int, float, datetime.date, datetime.datetime)):
+ # convertor always requires string input
+ value = str(value)
+ else:
+ raise DirectiveParsingError(
+ f'option "{name}" value not string (enclose with ""): {value}'
+ )
+ try:
+ converted_value = convertor(value)
+ except (ValueError, TypeError) as error:
+ raise DirectiveParsingError(
+ "Invalid option value: (option: '{}'; value: {})\n{}".format(
+ name, value, error
+ )
+ )
+ options[name] = converted_value
+
+ return content, options
+
+
+def parse_directive_arguments(directive, arg_text):
+ """Parse (and validate) the directive argument section."""
+ required = directive.required_arguments
+ optional = directive.optional_arguments
+ arguments = arg_text.split()
+ if len(arguments) < required:
+ raise DirectiveParsingError(
+ f"{required} argument(s) required, {len(arguments)} supplied"
+ )
+ elif len(arguments) > required + optional:
+ if directive.final_argument_whitespace:
+ arguments = arg_text.split(None, required + optional - 1)
+ else:
+ raise DirectiveParsingError(
+ "maximum {} argument(s) allowed, {} supplied".format(
+ required + optional, len(arguments)
+ )
+ )
+ return arguments
diff --git a/myst_parser/parsers/docutils_.py b/myst_parser/parsers/docutils_.py
new file mode 100644
index 0000000..aaef5e2
--- /dev/null
+++ b/myst_parser/parsers/docutils_.py
@@ -0,0 +1,275 @@
+"""MyST Markdown parser for docutils."""
+from dataclasses import Field
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
+
+from docutils import frontend, nodes
+from docutils.core import default_description, publish_cmdline
+from docutils.parsers.rst import Parser as RstParser
+from typing_extensions import Literal, get_args, get_origin
+
+from myst_parser.config.main import (
+ MdParserConfig,
+ TopmatterReadError,
+ merge_file_level,
+ read_topmatter,
+)
+from myst_parser.mdit_to_docutils.base import DocutilsRenderer, create_warning
+from myst_parser.parsers.mdit import create_md_parser
+
+
+def _validate_int(
+ setting, value, option_parser, config_parser=None, config_section=None
+) -> int:
+ """Validate an integer setting."""
+ return int(value)
+
+
+def _create_validate_tuple(length: int) -> Callable[..., Tuple[str, ...]]:
+ """Create a validator for a tuple of length `length`."""
+
+ def _validate(
+ setting, value, option_parser, config_parser=None, config_section=None
+ ):
+ string_list = frontend.validate_comma_separated_list(
+ setting, value, option_parser, config_parser, config_section
+ )
+ if len(string_list) != length:
+ raise ValueError(
+ f"Expecting {length} items in {setting}, got {len(string_list)}."
+ )
+ return tuple(string_list)
+
+ return _validate
+
+
+class Unset:
+ """A sentinel class for unset settings."""
+
+ def __repr__(self):
+ return "UNSET"
+
+
+DOCUTILS_UNSET = Unset()
+"""Sentinel for arguments not set through docutils.conf."""
+
+
+DOCUTILS_EXCLUDED_ARGS = (
+ # docutils.conf can't represent callables
+ "heading_slug_func",
+ # docutils.conf can't represent dicts
+ "html_meta",
+ "substitutions",
+ # we can't add substitutions so not needed
+ "sub_delimiters",
+ # sphinx only options
+ "heading_anchors",
+ "ref_domains",
+ "update_mathjax",
+ "mathjax_classes",
+)
+"""Names of settings that cannot be set in docutils.conf."""
+
+
+def _attr_to_optparse_option(at: Field, default: Any) -> Tuple[dict, str]:
+ """Convert a field into a Docutils optparse options dict."""
+ if at.type is int:
+ return {"metavar": "<int>", "validator": _validate_int}, f"(default: {default})"
+ if at.type is bool:
+ return {
+ "metavar": "<boolean>",
+ "validator": frontend.validate_boolean,
+ }, f"(default: {default})"
+ if at.type is str:
+ return {
+ "metavar": "<str>",
+ }, f"(default: '{default}')"
+ if get_origin(at.type) is Literal and all(
+ isinstance(a, str) for a in get_args(at.type)
+ ):
+ args = get_args(at.type)
+ return {
+ "metavar": f"<{'|'.join(repr(a) for a in args)}>",
+ "type": "choice",
+ "choices": args,
+ }, f"(default: {default!r})"
+ if at.type in (Iterable[str], Sequence[str]):
+ return {
+ "metavar": "<comma-delimited>",
+ "validator": frontend.validate_comma_separated_list,
+ }, f"(default: '{','.join(default)}')"
+ if at.type == Tuple[str, str]:
+ return {
+ "metavar": "<str,str>",
+ "validator": _create_validate_tuple(2),
+ }, f"(default: '{','.join(default)}')"
+ if at.type == Union[int, type(None)]:
+ return {
+ "metavar": "<null|int>",
+ "validator": _validate_int,
+ }, f"(default: {default})"
+ if at.type == Union[Iterable[str], type(None)]:
+ default_str = ",".join(default) if default else ""
+ return {
+ "metavar": "<null|comma-delimited>",
+ "validator": frontend.validate_comma_separated_list,
+ }, f"(default: {default_str!r})"
+ raise AssertionError(
+ f"Configuration option {at.name} not set up for use in docutils.conf."
+ )
+
+
+def attr_to_optparse_option(
+ attribute: Field, default: Any, prefix: str = "myst_"
+) -> Tuple[str, List[str], Dict[str, Any]]:
+ """Convert an ``MdParserConfig`` attribute into a Docutils setting tuple.
+
+ :returns: A tuple of ``(help string, option flags, optparse kwargs)``.
+ """
+ name = f"{prefix}{attribute.name}"
+ flag = "--" + name.replace("_", "-")
+ options = {"dest": name, "default": DOCUTILS_UNSET}
+ at_options, type_str = _attr_to_optparse_option(attribute, default)
+ options.update(at_options)
+ help_str = attribute.metadata.get("help", "") if attribute.metadata else ""
+ return (f"{help_str} {type_str}", [flag], options)
+
+
+def create_myst_settings_spec(
+ excluded: Sequence[str], config_cls=MdParserConfig, prefix: str = "myst_"
+):
+ """Return a list of Docutils setting for the docutils MyST section."""
+ defaults = config_cls()
+ return tuple(
+ attr_to_optparse_option(at, getattr(defaults, at.name), prefix)
+ for at in config_cls.get_fields()
+ if at.name not in excluded
+ )
+
+
+def create_myst_config(
+ settings: frontend.Values,
+ excluded: Sequence[str],
+ config_cls=MdParserConfig,
+ prefix: str = "myst_",
+):
+ """Create a configuration instance from the given settings."""
+ values = {}
+ for attribute in config_cls.get_fields():
+ if attribute.name in excluded:
+ continue
+ setting = f"{prefix}{attribute.name}"
+ val = getattr(settings, setting, DOCUTILS_UNSET)
+ if val is not DOCUTILS_UNSET:
+ values[attribute.name] = val
+ return config_cls(**values)
+
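+# Illustrative sketch (not part of the upstream module): only settings that were
+# explicitly set (i.e. not ``DOCUTILS_UNSET``) are passed through to the config::
+#
+#     from docutils.frontend import Values
+#
+#     settings = Values(defaults={"myst_gfm_only": True})
+#     config = create_myst_config(settings, DOCUTILS_EXCLUDED_ARGS)
+#     assert config.gfm_only is True
+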
+
+class Parser(RstParser):
+ """Docutils parser for Markedly Structured Text (MyST)."""
+
+ supported: Tuple[str, ...] = ("md", "markdown", "myst")
+ """Aliases this parser supports."""
+
+ settings_spec = (
+ "MyST options",
+ None,
+ create_myst_settings_spec(DOCUTILS_EXCLUDED_ARGS),
+ *RstParser.settings_spec,
+ )
+ """Runtime settings specification."""
+
+ config_section = "myst parser"
+ config_section_dependencies = ("parsers",)
+ translate_section_name = None
+
+ def parse(self, inputstring: str, document: nodes.document) -> None:
+ """Parse source text.
+
+ :param inputstring: The source string to parse
+ :param document: The root docutils node to add AST elements to
+ """
+
+ self.setup_parse(inputstring, document)
+
+ # check for exorbitantly long lines
+ if hasattr(document.settings, "line_length_limit"):
+ for i, line in enumerate(inputstring.split("\n")):
+ if len(line) > document.settings.line_length_limit:
+ error = document.reporter.error(
+ f"Line {i+1} exceeds the line-length-limit:"
+ f" {document.settings.line_length_limit}."
+ )
+ document.append(error)
+ return
+
+ # create parsing configuration from the global config
+ try:
+ config = create_myst_config(document.settings, DOCUTILS_EXCLUDED_ARGS)
+ except Exception as exc:
+ error = document.reporter.error(f"Global myst configuration invalid: {exc}")
+ document.append(error)
+ config = MdParserConfig()
+
+ # update the global config with the file-level config
+ try:
+ topmatter = read_topmatter(inputstring)
+ except TopmatterReadError:
+ pass # this will be reported during the render
+ else:
+ if topmatter:
+ warning = lambda wtype, msg: create_warning( # noqa: E731
+ document, msg, line=1, append_to=document, subtype=wtype
+ )
+ config = merge_file_level(config, topmatter, warning)
+
+ # parse content
+ parser = create_md_parser(config, DocutilsRenderer)
+ parser.options["document"] = document
+ parser.render(inputstring)
+
+ # post-processing
+
+ # replace raw nodes if raw is not allowed
+ if not getattr(document.settings, "raw_enabled", True):
+ for node in document.traverse(nodes.raw):
+ warning = document.reporter.warning("Raw content disabled.")
+ node.parent.replace(node, warning)
+
+ self.finish_parse()
+
+
+def _run_cli(writer_name: str, writer_description: str, argv: Optional[List[str]]):
+ """Run the command line interface for a particular writer."""
+ publish_cmdline(
+ parser=Parser(),
+ writer_name=writer_name,
+ description=(
+ f"Generates {writer_description} from standalone MyST sources.\n{default_description}"
+ ),
+ argv=argv,
+ )
+
+
+def cli_html(argv: Optional[List[str]] = None) -> None:
+ """Cmdline entrypoint for converting MyST to HTML."""
+ _run_cli("html", "(X)HTML documents", argv)
+
+
+def cli_html5(argv: Optional[List[str]] = None):
+ """Cmdline entrypoint for converting MyST to HTML5."""
+ _run_cli("html5", "HTML5 documents", argv)
+
+
+def cli_latex(argv: Optional[List[str]] = None):
+ """Cmdline entrypoint for converting MyST to LaTeX."""
+ _run_cli("latex", "LaTeX documents", argv)
+
+
+def cli_xml(argv: Optional[List[str]] = None):
+ """Cmdline entrypoint for converting MyST to XML."""
+ _run_cli("xml", "Docutils-native XML", argv)
+
+
+def cli_pseudoxml(argv: Optional[List[str]] = None):
+ """Cmdline entrypoint for converting MyST to pseudo-XML."""
+ _run_cli("pseudoxml", "pseudo-XML", argv)
diff --git a/myst_parser/parsers/mdit.py b/myst_parser/parsers/mdit.py
new file mode 100644
index 0000000..8476495
--- /dev/null
+++ b/myst_parser/parsers/mdit.py
@@ -0,0 +1,123 @@
+"""This module holds the ``create_md_parser`` function,
+which creates a parser from the config.
+"""
+from __future__ import annotations
+
+from typing import Callable
+
+from markdown_it import MarkdownIt
+from markdown_it.renderer import RendererProtocol
+from mdit_py_plugins.amsmath import amsmath_plugin
+from mdit_py_plugins.anchors import anchors_plugin
+from mdit_py_plugins.attrs import attrs_plugin
+from mdit_py_plugins.colon_fence import colon_fence_plugin
+from mdit_py_plugins.deflist import deflist_plugin
+from mdit_py_plugins.dollarmath import dollarmath_plugin
+from mdit_py_plugins.field_list import fieldlist_plugin
+from mdit_py_plugins.footnote import footnote_plugin
+from mdit_py_plugins.front_matter import front_matter_plugin
+from mdit_py_plugins.myst_blocks import myst_block_plugin
+from mdit_py_plugins.myst_role import myst_role_plugin
+from mdit_py_plugins.substitution import substitution_plugin
+from mdit_py_plugins.tasklists import tasklists_plugin
+from mdit_py_plugins.wordcount import wordcount_plugin
+
+from myst_parser.config.main import MdParserConfig
+
+
+def create_md_parser(
+ config: MdParserConfig, renderer: Callable[[MarkdownIt], RendererProtocol]
+) -> MarkdownIt:
+ """Return a Markdown parser with the required MyST configuration."""
+
+ # TODO warn if linkify is required and linkify-it-py is not installed
+ # (currently the parse will fail with an unhandled exception)
+
+ if config.commonmark_only:
+ # see https://spec.commonmark.org/
+ md = MarkdownIt("commonmark", renderer_cls=renderer).use(
+ wordcount_plugin, per_minute=config.words_per_minute
+ )
+ md.options.update({"myst_config": config})
+ return md
+
+ if config.gfm_only:
+ # see https://github.github.com/gfm/
+ md = (
+ MarkdownIt("commonmark", renderer_cls=renderer)
+ # note, strikethrough currently only supported tentatively for HTML
+ .enable("strikethrough")
+ .enable("table")
+ .use(tasklists_plugin)
+ .enable("linkify")
+ .use(wordcount_plugin, per_minute=config.words_per_minute)
+ )
+ md.options.update({"linkify": True, "myst_config": config})
+ return md
+
+ md = (
+ MarkdownIt("commonmark", renderer_cls=renderer)
+ .enable("table")
+ .use(front_matter_plugin)
+ .use(myst_block_plugin)
+ .use(myst_role_plugin)
+ .use(footnote_plugin)
+ .use(wordcount_plugin, per_minute=config.words_per_minute)
+ .disable("footnote_inline")
+ # disable this for now, because it needs a new implementation in the renderer
+ .disable("footnote_tail")
+ )
+
+ typographer = False
+ if "smartquotes" in config.enable_extensions:
+ md.enable("smartquotes")
+ typographer = True
+ if "replacements" in config.enable_extensions:
+ md.enable("replacements")
+ typographer = True
+ if "linkify" in config.enable_extensions:
+ md.enable("linkify")
+ if md.linkify is not None:
+ md.linkify.set({"fuzzy_link": config.linkify_fuzzy_links})
+ if "strikethrough" in config.enable_extensions:
+ md.enable("strikethrough")
+ if "dollarmath" in config.enable_extensions:
+ md.use(
+ dollarmath_plugin,
+ allow_labels=config.dmath_allow_labels,
+ allow_space=config.dmath_allow_space,
+ allow_digits=config.dmath_allow_digits,
+ double_inline=config.dmath_double_inline,
+ )
+ if "colon_fence" in config.enable_extensions:
+ md.use(colon_fence_plugin)
+ if "amsmath" in config.enable_extensions:
+ md.use(amsmath_plugin)
+ if "deflist" in config.enable_extensions:
+ md.use(deflist_plugin)
+ if "fieldlist" in config.enable_extensions:
+ md.use(fieldlist_plugin)
+ if "tasklist" in config.enable_extensions:
+ md.use(tasklists_plugin)
+ if "substitution" in config.enable_extensions:
+ md.use(substitution_plugin, *config.sub_delimiters)
+ if "attrs_image" in config.enable_extensions:
+ md.use(attrs_plugin, after=("image",))
+ if config.heading_anchors is not None:
+ md.use(
+ anchors_plugin,
+ max_level=config.heading_anchors,
+ slug_func=config.heading_slug_func,
+ )
+ for name in config.disable_syntax:
+ md.disable(name, True)
+
+ md.options.update(
+ {
+ "typographer": typographer,
+ "linkify": "linkify" in config.enable_extensions,
+ "myst_config": config,
+ }
+ )
+
+ return md
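+
+
+# Illustrative usage (not part of the upstream module): build a parser with the
+# docutils renderer and render a source string (assuming the renderer creates a
+# default document when none is supplied in ``parser.options``)::
+#
+#     from myst_parser.config.main import MdParserConfig
+#     from myst_parser.mdit_to_docutils.base import DocutilsRenderer
+#
+#     parser = create_md_parser(MdParserConfig(), DocutilsRenderer)
+#     document = parser.render("Some *Markdown* text")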
diff --git a/myst_parser/parsers/parse_html.py b/myst_parser/parsers/parse_html.py
new file mode 100644
index 0000000..7539e42
--- /dev/null
+++ b/myst_parser/parsers/parse_html.py
@@ -0,0 +1,440 @@
+"""A simple but complete HTML to Abstract Syntax Tree (AST) parser.
+
+The AST can also reproduce the HTML text.
+
+Example::
+
+ >> text = '<div class="note"><p>text</p></div>'
+ >> ast = tokenize_html(text)
+ >> list(ast.walk(include_self=True))
+ [Root(''), Tag('div', {'class': 'note'}), Tag('p'), Data('text')]
+ >> str(ast)
+ '<div class="note"><p>text</p></div>'
+ >> str(ast[0][0])
+ '<p>text</p>'
+
+Note: optional tags are not accounted for
+(see https://html.spec.whatwg.org/multipage/syntax.html#optional-tags)
+
+"""
+from __future__ import annotations
+
+import inspect
+import itertools
+from collections import abc, deque
+from html.parser import HTMLParser
+from typing import Any, Callable, Iterable, Iterator
+
+
+class Attribute(dict):
+ """This class holds the tags's attributes."""
+
+ def __getitem__(self, key: str) -> str:
+ """If self doesn't have the key it returns ''."""
+ return self.get(key, "")
+
+ @property
+ def classes(self) -> list[str]:
+ """Return 'class' attribute as list."""
+ return self["class"].split()
+
+ def __str__(self) -> str:
+ """Return a htmlized representation for attributes."""
+ return " ".join(f'{key}="{value}"' for key, value in self.items())
+
+
+class Element(abc.MutableSequence):
+ """An Element of the xml/html document.
+
+ All xml/html entities inherit from this class.
+ """
+
+ def __init__(self, name: str = "", attr: dict | None = None) -> None:
+ """Initialise the element."""
+ self.name = name
+ self.attrs: Attribute = Attribute(attr or {})
+ self._parent: Element | None = None
+ self._children: list[Element] = []
+
+ @property
+ def parent(self) -> Element | None:
+ """Return parent."""
+ return self._parent
+
+ @property
+ def children(self) -> list[Element]:
+ """Return copy of children."""
+ return self._children[:]
+
+ def reset_children(self, children: list[Element], deepcopy: bool = False):
+ """Replace the children of the element, setting their parent to self."""
+ new_children = []
+ for i, item in enumerate(children):
+ assert isinstance(item, Element)
+ if deepcopy:
+ item = item.deepcopy()
+ if item._parent is None:
+ item._parent = self
+ elif item._parent != self:
+ raise AssertionError(f"different parent already set for item {i}")
+ new_children.append(item)
+ self._children = new_children
+
+ def __getitem__(self, index: int) -> Element: # type: ignore[override]
+ return self._children[index]
+
+ def __setitem__(self, index: int, item: Element): # type: ignore[override]
+ assert isinstance(item, Element)
+ if item._parent is not None and item._parent != self:
+ raise AssertionError(f"different parent already set for: {item!r}")
+ item._parent = self
+ return self._children.__setitem__(index, item)
+
+ def __delitem__(self, index: int): # type: ignore[override]
+ return self._children.__delitem__(index)
+
+ def __len__(self) -> int:
+ return self._children.__len__()
+
+ def __iter__(self) -> Iterator[Element]:
+ yield from self._children
+
+ def insert(self, index: int, item: Element):
+ assert isinstance(item, Element)
+ if item._parent is not None and item._parent != self:
+ raise AssertionError(f"different parent already set for: {item!r}")
+ item._parent = self
+ return self._children.insert(index, item)
+
+ def deepcopy(self) -> Element:
+ """Recursively copy and remove parent."""
+ _copy = self.__class__(self.name, self.attrs)
+ for child in self:
+ _copy_child = child.deepcopy()
+ _copy.append(_copy_child)
+ return _copy
+
+ def __repr__(self) -> str:
+ text = f"{self.__class__.__name__}({self.name!r}"
+ if self.attrs:
+ text += f", {self.attrs!r}"
+ text += ")"
+ return text
+
+ def render(
+ self,
+ tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
+ **kwargs,
+ ) -> str:
+ """Returns a HTML string representation of the element.
+
+ :param tag_overrides: Provide a dictionary of render function
+ for specific tag names, to override the normal render format
+
+ """
+ raise NotImplementedError
+
+ def __str__(self) -> str:
+ return self.render()
+
+ def __eq__(self, item: Any) -> bool:
+ return item is self
+
+ def walk(self, include_self: bool = False) -> Iterator[Element]:
+ """Walk through the xml/html AST."""
+ if include_self:
+ yield self
+ for child in self:
+ yield child
+ yield from child.walk()
+
+ def strip(self, inplace: bool = False, recurse: bool = False) -> Element:
+ """Return copy with all `Data` tokens
+ that only contain whitespace / newlines removed.
+ """
+ element = self
+ if not inplace:
+ element = self.deepcopy()
+ element.reset_children(
+ [
+ e
+ for e in element.children
+ if not (isinstance(e, Data) and e.data.strip() == "")
+ ]
+ )
+ if recurse:
+ for child in element:
+ child.strip(inplace=True, recurse=True)
+ return element
+
+ def find(
+ self,
+ identifier: str | type[Element],
+ attrs: dict | None = None,
+ classes: Iterable[str] | None = None,
+ include_self: bool = False,
+ recurse: bool = True,
+ ) -> Iterator[Element]:
+ """Find all elements that match name and specific attributes."""
+ iterator = self.walk() if recurse else self
+ if include_self:
+ iterator = itertools.chain([self], iterator)
+ if inspect.isclass(identifier):
+ test_func = lambda c: isinstance(c, identifier) # noqa: E731
+ else:
+ test_func = lambda c: c.name == identifier # noqa: E731
+ classes = set(classes) if classes is not None else classes
+ for child in iterator:
+ if test_func(child):
+ if classes is not None and not classes.issubset(child.attrs.classes):
+ continue
+ for key, value in (attrs or {}).items():
+ if child.attrs[key] != value:
+ break
+ else:
+ yield child
+
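+# Illustrative sketch (not part of the upstream module): ``Element.find`` walks
+# the tree, filtering by tag name, attributes and classes::
+#
+#     ast = tokenize_html('<div><img class="a b" src="t.gif"></div>')
+#     images = list(ast.find("img", classes=["a"]))
+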
+
+class Root(Element):
+ """The root of the AST tree."""
+
+ def render(self, **kwargs) -> str: # type: ignore[override]
+ """Returns a string HTML representation of the structure."""
+ return "".join(child.render(**kwargs) for child in self)
+
+
+class Tag(Element):
+ """Represent xml/html tags under the form: <name key="value" ...> ... </name>."""
+
+ def render(
+ self,
+ tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
+ **kwargs,
+ ) -> str:
+ if tag_overrides and self.name in tag_overrides:
+ return tag_overrides[self.name](self, tag_overrides)
+ return (
+ f"<{self.name}{' ' if self.attrs else ''}{self.attrs}>"
+ + "".join(
+ child.render(tag_overrides=tag_overrides, **kwargs) for child in self
+ )
+ + f"</{self.name}>"
+ )
+
+
+class XTag(Element):
+ """Represent XHTML style tags with no children, like `<img src="t.gif" />`"""
+
+ def render(
+ self,
+ tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
+ **kwargs,
+ ) -> str:
+ if tag_overrides is not None and self.name in tag_overrides:
+ return tag_overrides[self.name](self, tag_overrides)
+ return f"<{self.name}{' ' if self.attrs else ''}{self.attrs}/>"
+
+
+class VoidTag(Element):
+ """Represent tags with no children, only start tag, like `<img src="t.gif" >`"""
+
+ def render(self, **kwargs) -> str: # type: ignore[override]
+ return f"<{self.name}{' ' if self.attrs else ''}{self.attrs}>"
+
+
+class TerminalElement(Element):
+ """Base class for terminal elements that hold only text data (no children)."""
+
+ def __init__(self, data: str):
+ super().__init__("")
+ self.data: str = data
+
+ def __repr__(self) -> str:
+ text = self.data
+ if len(text) > 20:
+ text = text[:17] + "..."
+ return f"{self.__class__.__name__}({text!r})"
+
+ def deepcopy(self) -> TerminalElement:
+ """Copy and remove parent."""
+ _copy = self.__class__(self.data)
+ return _copy
+
+
+class Data(TerminalElement):
+ """Represent data inside xml/html documents, like raw text."""
+
+ def render(self, **kwargs) -> str: # type: ignore[override]
+ return self.data
+
+
+class Declaration(TerminalElement):
+ """Represent declarations, like `<!DOCTYPE html>`"""
+
+ def render(self, **kwargs) -> str: # type: ignore[override]
+ return f"<!{self.data}>"
+
+
+class Comment(TerminalElement):
+ """Represent HTML comments"""
+
+ def render(self, **kwargs) -> str: # type: ignore[override]
+ return f"<!--{self.data}-->"
+
+
+class Pi(TerminalElement):
+ """Represent processing instructions like `<?xml-stylesheet ?>`"""
+
+ def render(self, **kwargs) -> str: # type: ignore[override]
+ return f"<?{self.data}>"
+
+
+class Char(TerminalElement):
+ """Represent character codes like: `&#0`"""
+
+ def render(self, **kwargs) -> str: # type: ignore[override]
+ return f"&#{self.data};"
+
+
+class Entity(TerminalElement):
+ """Represent entities like `&amp`"""
+
+ def render(self, **kwargs) -> str: # type: ignore[override]
+ return f"&{self.data};"
+
+
+class Tree:
+ """The engine class to generate the AST tree."""
+
+ def __init__(self, name: str = ""):
+ """Initialise Tree"""
+ self.name = name
+ self.outmost = Root(name)
+ self.stack: deque = deque()
+ self.stack.append(self.outmost)
+
+ def clear(self):
+ """Clear the outmost and stack for a new parsing."""
+ self.outmost = Root(self.name)
+ self.stack.clear()
+ self.stack.append(self.outmost)
+
+ def last(self) -> Element:
+ """Return the last pointer which point to the actual tag scope."""
+ return self.stack[-1]
+
+ def nest_tag(self, name: str, attrs: dict):
+ """Nest a given tag at the bottom of the tree using
+ the last stack's pointer.
+ """
+ pointer = self.stack.pop()
+ item = Tag(name, attrs)
+ pointer.append(item)
+ self.stack.append(pointer)
+ self.stack.append(item)
+
+ def nest_xtag(self, name: str, attrs: dict):
+ """Nest an XTag onto the tree."""
+ top = self.last()
+ item = XTag(name, attrs)
+ top.append(item)
+
+ def nest_vtag(self, name: str, attrs: dict):
+ """Nest a VoidTag onto the tree."""
+ top = self.last()
+ item = VoidTag(name, attrs)
+ top.append(item)
+
+ def nest_terminal(self, klass: type[TerminalElement], data: str):
+ """Nest the data onto the tree."""
+ top = self.last()
+ item = klass(data)
+ top.append(item)
+
+ def enclose(self, name: str):
+ """When a closing tag is found, pop the pointer's scope from the stack,
+ to then point to the earlier scope's tag.
+ """
+ count = 0
+ for ind in reversed(self.stack):
+ count = count + 1
+ if ind.name == name:
+ break
+ else:
+ count = 0
+
+ # Pop every scope up to and including the matching tag (a no-op if unmatched).
+ for _ in range(count):
+ self.stack.pop()
+
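+# Worked example (illustrative): when parsing ``<div><p>text</div>``, the stack
+# is ``[Root, div, p]`` at the point the ``</div>`` is found; ``enclose("div")``
+# counts back to the matching ``div`` and pops both scopes, so subsequent
+# content is appended to the root again.
+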
+
+class HtmlToAst(HTMLParser):
+ """The tokenizer class."""
+
+ # see https://html.spec.whatwg.org/multipage/syntax.html#void-elements
+ void_elements = {
+ "area",
+ "base",
+ "br",
+ "col",
+ "embed",
+ "hr",
+ "img",
+ "input",
+ "link",
+ "meta",
+ "param",
+ "source",
+ "track",
+ "wbr",
+ }
+
+ def __init__(self, name: str = "", convert_charrefs: bool = False):
+ super().__init__(convert_charrefs=convert_charrefs)
+ self.struct = Tree(name)
+
+ def feed(self, source: str) -> Root: # type: ignore[override]
+ """Parse the source string."""
+ self.struct.clear()
+ super().feed(source)
+ return self.struct.outmost
+
+ def handle_starttag(self, name: str, attr):
+ """When found an opening tag then nest it onto the tree."""
+ if name in self.void_elements:
+ self.struct.nest_vtag(name, attr)
+ else:
+ self.struct.nest_tag(name, attr)
+
+ def handle_startendtag(self, name: str, attr):
+ """When found a XHTML tag style then nest it up to the tree."""
+ self.struct.nest_xtag(name, attr)
+
+ def handle_endtag(self, name: str):
+ """When found a closing tag then makes it point to the right scope."""
+ if name not in self.void_elements:
+ self.struct.enclose(name)
+
+ def handle_data(self, data: str):
+ """Nest data onto the tree."""
+ self.struct.nest_terminal(Data, data)
+
+ def handle_decl(self, decl: str):
+ self.struct.nest_terminal(Declaration, decl)
+
+ def unknown_decl(self, decl: str):
+ self.struct.nest_terminal(Declaration, decl)
+
+ def handle_charref(self, data: str):
+ self.struct.nest_terminal(Char, data)
+
+ def handle_entityref(self, data: str):
+ self.struct.nest_terminal(Entity, data)
+
+ def handle_pi(self, data: str):
+ self.struct.nest_terminal(Pi, data)
+
+ def handle_comment(self, data: str):
+ self.struct.nest_terminal(Comment, data)
+
+
+def tokenize_html(text: str, name: str = "", convert_charrefs: bool = False) -> Root:
+ """Parse the HTML text and return the root of the AST."""
+ parser = HtmlToAst(name, convert_charrefs=convert_charrefs)
+ return parser.feed(text)
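+
+
+# Illustrative usage (not part of the upstream module): ``render`` accepts
+# per-tag overrides, e.g. to rewrite anchor tags while leaving others intact::
+#
+#     def render_a(element, tag_overrides):
+#         return f"<a>{''.join(str(child) for child in element)}</a>"
+#
+#     ast = tokenize_html('<p><a href="x">link</a></p>')
+#     ast.render(tag_overrides={"a": render_a})  # -> '<p><a>link</a></p>'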
diff --git a/myst_parser/parsers/sphinx_.py b/myst_parser/parsers/sphinx_.py
new file mode 100644
index 0000000..fff098f
--- /dev/null
+++ b/myst_parser/parsers/sphinx_.py
@@ -0,0 +1,69 @@
+"""MyST Markdown parser for sphinx."""
+from __future__ import annotations
+
+from docutils import nodes
+from docutils.parsers.rst import Parser as RstParser
+from sphinx.parsers import Parser as SphinxParser
+from sphinx.util import logging
+
+from myst_parser.config.main import (
+ MdParserConfig,
+ TopmatterReadError,
+ merge_file_level,
+ read_topmatter,
+)
+from myst_parser.mdit_to_docutils.sphinx_ import SphinxRenderer, create_warning
+from myst_parser.parsers.mdit import create_md_parser
+
+SPHINX_LOGGER = logging.getLogger(__name__)
+
+
+class MystParser(SphinxParser):
+ """Sphinx parser for Markedly Structured Text (MyST)."""
+
+ supported: tuple[str, ...] = ("md", "markdown", "myst")
+ """Aliases this parser supports."""
+
+ settings_spec = RstParser.settings_spec
+ """Runtime settings specification.
+
+ Defines runtime settings and associated command-line options, as used by
+ `docutils.frontend.OptionParser`. This is a concatenation of tuples of:
+
+ - Option group title (string or `None` which implies no group, just a list
+ of single options).
+
+ - Description (string or `None`).
+
+ - A sequence of option tuples
+ """
+
+ config_section = "myst parser"
+ config_section_dependencies = ("parsers",)
+ translate_section_name = None
+
+ def parse(self, inputstring: str, document: nodes.document) -> None:
+ """Parse source text.
+
+ :param inputstring: The source string to parse
+ :param document: The root docutils node to add AST elements to
+
+ """
+ # get the global config
+ config: MdParserConfig = document.settings.env.myst_config
+
+ # update the global config with the file-level config
+ try:
+ topmatter = read_topmatter(inputstring)
+ except TopmatterReadError:
+ pass # this will be reported during the render
+ else:
+ if topmatter:
+ warning = lambda wtype, msg: create_warning( # noqa: E731
+ document, msg, line=1, append_to=document, subtype=wtype
+ )
+ config = merge_file_level(config, topmatter, warning)
+
+ parser = create_md_parser(config, SphinxRenderer)
+ parser.options["document"] = document
+ parser.render(inputstring)
diff --git a/myst_parser/py.typed b/myst_parser/py.typed
new file mode 100644
index 0000000..7632ecf
--- /dev/null
+++ b/myst_parser/py.typed
@@ -0,0 +1 @@
+# Marker file for PEP 561
diff --git a/myst_parser/sphinx_.py b/myst_parser/sphinx_.py
new file mode 100644
index 0000000..b085086
--- /dev/null
+++ b/myst_parser/sphinx_.py
@@ -0,0 +1,6 @@
+"""A module for compatibility with the docutils>=0.17 `include` directive, in RST documents::
+
+ .. include:: path/to/file.md
+ :parser: myst_parser.sphinx_
+"""
+from myst_parser.parsers.sphinx_ import MystParser as Parser # noqa: F401
diff --git a/myst_parser/sphinx_ext/__init__.py b/myst_parser/sphinx_ext/__init__.py
new file mode 100644
index 0000000..1bfeb71
--- /dev/null
+++ b/myst_parser/sphinx_ext/__init__.py
@@ -0,0 +1 @@
+"""Sphinx extension for myst_parser."""
diff --git a/myst_parser/sphinx_ext/directives.py b/myst_parser/sphinx_ext/directives.py
new file mode 100644
index 0000000..39ca2c6
--- /dev/null
+++ b/myst_parser/sphinx_ext/directives.py
@@ -0,0 +1,136 @@
+"""MyST specific directives"""
+from copy import copy
+from typing import List, Tuple, cast
+
+from docutils import nodes
+from docutils.parsers.rst import directives
+from sphinx.directives import SphinxDirective
+from sphinx.util.docutils import SphinxRole
+
+from myst_parser.mocking import MockState
+
+
+def align(argument):
+ return directives.choice(argument, ("left", "center", "right"))
+
+
+def figwidth_value(argument):
+ if argument.lower() == "image":
+ return "image"
+ else:
+ return directives.length_or_percentage_or_unitless(argument, "px")
+
+
+class SubstitutionReferenceRole(SphinxRole):
+ """Implement substitution references as a role.
+
+ Note, in ``docutils/parsers/rst/roles.py`` this is left unimplemented.
+ """
+
+ def run(self) -> Tuple[List[nodes.Node], List[nodes.system_message]]:
+ subref_node = nodes.substitution_reference(self.rawtext, self.text)
+ self.set_source_info(subref_node, self.lineno)
+ subref_node["refname"] = nodes.fully_normalize_name(self.text)
+ return [subref_node], []
+
+
+class FigureMarkdown(SphinxDirective):
+ """Directive for creating a figure with Markdown compatible syntax.
+
+ Example::
+
+ :::{figure-md} target
+ <img src="img/fun-fish.png" alt="fishy" class="bg-primary mb-1" width="200px">
+
+ This is a caption in **Markdown**
+ :::
+
+ """
+
+ required_arguments = 0
+ optional_arguments = 1 # image target
+ final_argument_whitespace = True
+ has_content = True
+
+ option_spec = {
+ "width": figwidth_value,
+ "class": directives.class_option,
+ "align": align,
+ "name": directives.unchanged,
+ }
+
+ def run(self) -> List[nodes.Node]:
+ figwidth = self.options.pop("width", None)
+ figclasses = self.options.pop("class", None)
+ align = self.options.pop("align", None)
+
+ if not isinstance(self.state, MockState):
+ return [self.figure_error("Directive is only supported in myst parser")]
+ state = cast(MockState, self.state)
+
+ # ensure html image enabled
+ myst_extensions = copy(state._renderer.md_config.enable_extensions)
+ node = nodes.Element()
+ try:
+ state._renderer.md_config.enable_extensions = list(
+ state._renderer.md_config.enable_extensions
+ ) + ["html_image"]
+ state.nested_parse(self.content, self.content_offset, node)
+ finally:
+ state._renderer.md_config.enable_extensions = myst_extensions
+
+ if len(node.children) != 2:
+ return [
+ self.figure_error(
+ "content should be one image, "
+ "followed by a single paragraph caption"
+ )
+ ]
+
+ image_node, caption_para = node.children
+ if isinstance(image_node, nodes.paragraph):
+ image_node = image_node[0]
+
+ if not isinstance(image_node, nodes.image):
+ return [
+ self.figure_error(
+ "content should be one image (not found), "
+ "followed by single paragraph caption"
+ )
+ ]
+
+ if not isinstance(caption_para, nodes.paragraph):
+ return [
+ self.figure_error(
+ "content should be one image, "
+ "followed by single paragraph caption (not found)"
+ )
+ ]
+
+ caption_node = nodes.caption(caption_para.rawsource, "", *caption_para.children)
+ caption_node.source = caption_para.source
+ caption_node.line = caption_para.line
+
+ figure_node = nodes.figure("", image_node, caption_node)
+ self.set_source_info(figure_node)
+
+ if figwidth is not None:
+ figure_node["width"] = figwidth
+ if figclasses:
+ figure_node["classes"] += figclasses
+ if align:
+ figure_node["align"] = align
+ if self.arguments:
+ self.options["name"] = self.arguments[0]
+ self.add_name(figure_node)
+
+ return [figure_node]
+
+ def figure_error(self, message):
+ """A warning for reporting an invalid figure."""
+ error = self.state_machine.reporter.error(
+ message,
+ nodes.literal_block(self.block_text, self.block_text),
+ line=self.lineno,
+ )
+ return error
diff --git a/myst_parser/sphinx_ext/main.py b/myst_parser/sphinx_ext/main.py
new file mode 100644
index 0000000..f5aeffc
--- /dev/null
+++ b/myst_parser/sphinx_ext/main.py
@@ -0,0 +1,60 @@
+"""The setup for the sphinx extension."""
+from typing import Any
+
+from sphinx.application import Sphinx
+
+
+def setup_sphinx(app: Sphinx, load_parser=False):
+ """Initialize all settings and transforms in Sphinx."""
+ # we do this separately from `setup`,
+ # so that it can be called by external packages like myst_nb
+ from myst_parser.config.main import MdParserConfig
+ from myst_parser.parsers.sphinx_ import MystParser
+ from myst_parser.sphinx_ext.directives import (
+ FigureMarkdown,
+ SubstitutionReferenceRole,
+ )
+ from myst_parser.sphinx_ext.mathjax import override_mathjax
+ from myst_parser.sphinx_ext.myst_refs import MystReferenceResolver
+
+ if load_parser:
+ app.add_source_suffix(".md", "markdown")
+ app.add_source_parser(MystParser)
+
+ app.add_role("sub-ref", SubstitutionReferenceRole())
+ app.add_directive("figure-md", FigureMarkdown)
+
+ app.add_post_transform(MystReferenceResolver)
+
+ for name, default, field in MdParserConfig().as_triple():
+ if not field.metadata.get("docutils_only", False):
+ # TODO add types?
+ app.add_config_value(f"myst_{name}", default, "env", types=Any)
+
+ app.connect("builder-inited", create_myst_config)
+ app.connect("builder-inited", override_mathjax)
+
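+# Typical activation in a Sphinx project's ``conf.py`` (illustrative)::
+#
+#     extensions = ["myst_parser"]
+#     myst_enable_extensions = ["colon_fence", "deflist"]
+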
+
+def create_myst_config(app):
+ from sphinx.util import logging
+
+ # Ignore type checkers because the attribute is dynamically assigned
+ from sphinx.util.console import bold # type: ignore[attr-defined]
+
+ from myst_parser import __version__
+ from myst_parser.config.main import MdParserConfig
+
+ logger = logging.getLogger(__name__)
+
+ values = {
+ name: app.config[f"myst_{name}"]
+ for name, _, field in MdParserConfig().as_triple()
+ if not field.metadata.get("docutils_only", False)
+ }
+
+ try:
+ app.env.myst_config = MdParserConfig(**values)
+ logger.info(bold("myst v%s:") + " %s", __version__, app.env.myst_config)
+ except (TypeError, ValueError) as error:
+ logger.error("myst configuration invalid: %s", error.args[0])
+ app.env.myst_config = MdParserConfig()
diff --git a/myst_parser/sphinx_ext/mathjax.py b/myst_parser/sphinx_ext/mathjax.py
new file mode 100644
index 0000000..260f008
--- /dev/null
+++ b/myst_parser/sphinx_ext/mathjax.py
@@ -0,0 +1,118 @@
+"""Overrides to ``sphinx.ext.mathjax``
+
+This fixes two issues:
+
+1. Mathjax should not search for ``$`` delimiters, nor LaTeX amsmath environments,
+ since we already achieve this with the dollarmath and amsmath markdown-it-py plugins
+2. amsmath math blocks should be wrapped in mathjax delimiters (default ``\\[...\\]``),
+ and assigned an equation number
+
+"""
+from docutils import nodes
+from sphinx.application import Sphinx
+from sphinx.ext import mathjax
+from sphinx.locale import _
+from sphinx.util import logging
+from sphinx.util.math import get_node_equation_number
+from sphinx.writers.html import HTMLTranslator
+
+logger = logging.getLogger(__name__)
+
+
+def log_override_warning(app: Sphinx, version: int, current: str, new: str) -> None:
+ """Log a warning if MathJax configuration being overridden."""
+ if logging.is_suppressed_warning("myst", "mathjax", app.config.suppress_warnings):
+ return
+ config_name = (
+ "mathjax3_config['options']['processHtmlClass']"
+ if version == 3
+ else "mathjax_config['tex2jax']['processClass']"
+ )
+ logger.warning(
+ f"`{config_name}` is being overridden by myst-parser: '{current}' -> '{new}'. "
+ "Set `suppress_warnings=['myst.mathjax']` to ignore this warning, or "
+ "`myst_update_mathjax=False` if this is undesirable."
+ )
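+
+
+# For example (illustrative), a user who deliberately manages their own MathJax
+# configuration can silence this warning in ``conf.py`` with::
+#
+#     suppress_warnings = ["myst.mathjax"]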
+
+
+def override_mathjax(app: Sphinx):
+ """Override aspects of the mathjax extension.
+
+ MyST-Parser parses dollar and LaTeX math via markdown-it-py plugins.
+ Therefore, we tell MathJax to only render the resulting math HTML elements.
+ This is accompanied by setting the `ignoreClass` on the top-level section of each MyST document.
+ """
+ if (
+ "amsmath" in app.config["myst_enable_extensions"]
+ and "mathjax" in app.registry.html_block_math_renderers
+ ):
+ app.registry.html_block_math_renderers["mathjax"] = (
+ html_visit_displaymath, # type: ignore[assignment]
+ None,
+ )
+
+ if "dollarmath" not in app.config["myst_enable_extensions"]:
+ return
+ if not app.env.myst_config.update_mathjax: # type: ignore
+ return
+
+ mjax_classes = app.env.myst_config.mathjax_classes # type: ignore
+
+ if "mathjax3_config" in app.config:
+ # sphinx 4 + mathjax 3
+ app.config.mathjax3_config = app.config.mathjax3_config or {} # type: ignore
+ app.config.mathjax3_config.setdefault("options", {})
+ if (
+ "processHtmlClass" in app.config.mathjax3_config["options"]
+ and app.config.mathjax3_config["options"]["processHtmlClass"]
+ != mjax_classes
+ ):
+ log_override_warning(
+ app,
+ 3,
+ app.config.mathjax3_config["options"]["processHtmlClass"],
+ mjax_classes,
+ )
+ app.config.mathjax3_config["options"]["processHtmlClass"] = mjax_classes
+ elif "mathjax_config" in app.config:
+ # sphinx 3 + mathjax 2
+ app.config.mathjax_config = app.config.mathjax_config or {} # type: ignore[attr-defined]
+ app.config.mathjax_config.setdefault("tex2jax", {})
+ if (
+ "processClass" in app.config.mathjax_config["tex2jax"]
+ and app.config.mathjax_config["tex2jax"]["processClass"] != mjax_classes
+ ):
+ log_override_warning(
+ app,
+ 2,
+ app.config.mathjax_config["tex2jax"]["processClass"],
+ mjax_classes,
+ )
+ app.config.mathjax_config["tex2jax"]["processClass"] = mjax_classes
+
+
+def html_visit_displaymath(self: HTMLTranslator, node: nodes.math_block) -> None:
+ """Override for sphinx.ext.mathjax.html_visit_displaymath to handle amsmath.
+
+ By default, displaymath nodes are wrapped in a prefix/suffix
+ defined by ``mathjax_display``, and labelled nodes are numbered.
+ However, this is not the case if the math_block is set to 'nowrap', as for amsmath.
+ Therefore, we need to override this behaviour.
+ """
+ if "amsmath" in node.get("classes", []):
+ self.body.append(
+ self.starttag(node, "div", CLASS="math notranslate nohighlight amsmath")
+ )
+ if node["number"]:
+ number = get_node_equation_number(self, node)
+ self.body.append('<span class="eqno">(%s)' % number)
+ self.add_permalink_ref(node, _("Permalink to this equation"))
+ self.body.append("</span>")
+ prefix, suffix = self.builder.config.mathjax_display
+ self.body.append(prefix)
+ self.body.append(self.encode(node.astext()))
+ self.body.append(suffix)
+ self.body.append("</div>\n")
+ raise nodes.SkipNode
+
+ return mathjax.html_visit_displaymath(self, node)
diff --git a/myst_parser/sphinx_ext/myst_refs.py b/myst_parser/sphinx_ext/myst_refs.py
new file mode 100644
index 0000000..f364345
--- /dev/null
+++ b/myst_parser/sphinx_ext/myst_refs.py
@@ -0,0 +1,282 @@
+"""A post-transform for overriding the behaviour of sphinx reference resolution.
+
+This is applied to MyST type references only, such as ``[text](target)``,
+and allows for nested syntax
+"""
+import os
+from typing import Any, List, Optional, Tuple, cast
+
+from docutils import nodes
+from docutils.nodes import Element, document
+from sphinx import addnodes, version_info
+from sphinx.addnodes import pending_xref
+from sphinx.domains.std import StandardDomain
+from sphinx.locale import __
+from sphinx.transforms.post_transforms import ReferencesResolver
+from sphinx.util import docname_join, logging
+from sphinx.util.nodes import clean_astext, make_refnode
+
+from myst_parser._compat import findall
+
+try:
+ from sphinx.errors import NoUri
+except ImportError:
+ # sphinx < 2.1
+ from sphinx.environment import NoUri # type: ignore
+
+logger = logging.getLogger(__name__)
+
+
+class MystReferenceResolver(ReferencesResolver):
+ """Resolves cross-references on doctrees.
+
+ Overrides the default sphinx implementation, to allow for nested syntax.
+ """
+
+ default_priority = 9 # higher priority than ReferencesResolver (10)
+
+ def run(self, **kwargs: Any) -> None:
+ self.document: document
+ for node in findall(self.document)(addnodes.pending_xref):
+ if node["reftype"] != "myst":
+ continue
+
+ contnode = cast(nodes.TextElement, node[0].deepcopy())
+ newnode = None
+
+ target = node["reftarget"]
+ refdoc = node.get("refdoc", self.env.docname)
+ domain = None
+
+ try:
+ newnode = self.resolve_myst_ref(refdoc, node, contnode)
+ if newnode is None:
+ # no new node found? try the missing-reference event
+ # but first we change the reftype to 'any'
+ # this means it is picked up by extensions like intersphinx
+ node["reftype"] = "any"
+ try:
+ newnode = self.app.emit_firstresult(
+ "missing-reference",
+ self.env,
+ node,
+ contnode,
+ **(
+ {"allowed_exceptions": (NoUri,)}
+ if version_info[0] > 2
+ else {}
+ ),
+ )
+ finally:
+ node["reftype"] = "myst"
+ # still not found? warn if node wishes to be warned about or
+ # we are in nit-picky mode
+ if newnode is None:
+ node["refdomain"] = ""
+ # TODO ideally we would override the warning message here,
+ # to show the [ref.myst] for suppressing warning
+ self.warn_missing_reference(
+ refdoc, node["reftype"], target, node, domain
+ )
+ except NoUri:
+ newnode = contnode
+
+ node.replace_self(newnode or contnode)
+
+ def resolve_myst_ref(
+ self, refdoc: str, node: pending_xref, contnode: Element
+ ) -> Element:
+ """Resolve reference generated by the "myst" role; ``[text](reference)``.
+
+ This builds on the sphinx ``any`` role to also resolve:
+
+ - Document references with extensions; ``[text](./doc.md)``
+ - Document references with anchors; ``[text](./doc.md#target)``
+ - Nested syntax for explicit text with std:doc and std:ref;
+ ``[**nested**](reference)``
+
+ """
+ target = node["reftarget"] # type: str
+ results = [] # type: List[Tuple[str, Element]]
+
+ res_anchor = self._resolve_anchor(node, refdoc)
+ if res_anchor:
+ results.append(("std:doc", res_anchor))
+ else:
+ # if we've already found an anchored doc,
+ # don't search in the std:ref/std:doc (leads to duplication)
+
+ # resolve standard references
+ res = self._resolve_ref_nested(node, refdoc)
+ if res:
+ results.append(("std:ref", res))
+
+ # resolve doc names
+ res = self._resolve_doc_nested(node, refdoc)
+ if res:
+ results.append(("std:doc", res))
+
+ # get allowed domains for referencing
+ ref_domains = self.env.config.myst_ref_domains
+
+ assert self.app.builder
+
+ # next resolve for any other standard reference objects
+ if ref_domains is None or "std" in ref_domains:
+ stddomain = cast(StandardDomain, self.env.get_domain("std"))
+ for objtype in stddomain.object_types:
+ key = (objtype, target)
+ if objtype == "term":
+ key = (objtype, target.lower())
+ if key in stddomain.objects:
+ docname, labelid = stddomain.objects[key]
+ domain_role = "std:" + stddomain.role_for_objtype(objtype)
+ ref_node = make_refnode(
+ self.app.builder, refdoc, docname, labelid, contnode
+ )
+ results.append((domain_role, ref_node))
+
+ # finally resolve for any other type of allowed reference domain
+ for domain in self.env.domains.values():
+ if domain.name == "std":
+ continue # we did this one already
+ if ref_domains is not None and domain.name not in ref_domains:
+ continue
+ try:
+ results.extend(
+ domain.resolve_any_xref(
+ self.env, refdoc, self.app.builder, target, node, contnode
+ )
+ )
+ except NotImplementedError:
+ # the domain doesn't yet support the new interface
+ # we have to manually collect possible references (SLOW)
+ if not (getattr(domain, "__module__", "").startswith("sphinx.")):
+ logger.warning(
+ f"Domain '{domain.__module__}::{domain.name}' has not "
+ "implemented a `resolve_any_xref` method [myst.domains]",
+ type="myst",
+ subtype="domains",
+ once=True,
+ )
+ for role in domain.roles:
+ res = domain.resolve_xref(
+ self.env, refdoc, self.app.builder, role, target, node, contnode
+ )
+ if res and len(res) and isinstance(res[0], nodes.Element):
+ results.append((f"{domain.name}:{role}", res))
+
+ # now, see how many matches we got...
+ if not results:
+ return None
+ if len(results) > 1:
+
+ def stringify(name, node):
+ reftitle = node.get("reftitle", node.astext())
+ return f":{name}:`{reftitle}`"
+
+ candidates = " or ".join(stringify(name, role) for name, role in results)
+ logger.warning(
+ __(
+ f"more than one target found for 'myst' cross-reference {target}: "
+ f"could be {candidates} [myst.ref]"
+ ),
+ location=node,
+ type="myst",
+ subtype="ref",
+ )
+
+ res_role, newnode = results[0]
+ # Override "myst" class with the actual role type to get the styling
+ # approximately correct.
+ res_domain = res_role.split(":")[0]
+ if len(newnode) > 0 and isinstance(newnode[0], nodes.Element):
+ newnode[0]["classes"] = newnode[0].get("classes", []) + [
+ res_domain,
+ res_role.replace(":", "-"),
+ ]
+
+ return newnode
+
+ def _resolve_anchor(
+ self, node: pending_xref, fromdocname: str
+ ) -> Optional[Element]:
+ """Resolve doc with anchor."""
+ if self.env.config.myst_heading_anchors is None:
+ # no target anchors will have been created, so we don't look for them
+ return None
+ target = node["reftarget"] # type: str
+ if "#" not in target:
+ return None
+ # the link may be a heading anchor; we need to first get the relative path
+ rel_path, anchor = target.rsplit("#", 1)
+ rel_path = os.path.normpath(rel_path)
+ if rel_path == ".":
+ # anchor in the same doc as the node
+ doc_path = self.env.doc2path(node.get("refdoc", fromdocname), base=False)
+ else:
+ # anchor in a different doc from the node
+ doc_path = os.path.normpath(
+ os.path.join(node.get("refdoc", fromdocname), "..", rel_path)
+ )
+ return self._resolve_ref_nested(node, fromdocname, doc_path + "#" + anchor)
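+
+ # Illustrative example: from ``docs/section/index.md``, a target of
+ # ``../other.md#my-heading`` normalises to ``docs/other.md#my-heading``,
+ # which is then resolved like any other standard reference.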
+
+ def _resolve_ref_nested(
+ self, node: pending_xref, fromdocname: str, target=None
+ ) -> Optional[Element]:
+ """This is the same as ``sphinx.domains.std._resolve_ref_xref``,
+ but allows for nested syntax, rather than converting the inner node to raw text.
+ """
+ stddomain = cast(StandardDomain, self.env.get_domain("std"))
+ target = target or node["reftarget"].lower()
+
+ if node["refexplicit"]:
+ # reference to anonymous label; the reference uses
+ # the supplied link caption
+ docname, labelid = stddomain.anonlabels.get(target, ("", ""))
+ sectname = node.astext()
+ innernode = nodes.inline(sectname, "")
+ innernode.extend(node[0].children)
+ else:
+ # reference to named label; the final node will
+ # contain the section name after the label
+ docname, labelid, sectname = stddomain.labels.get(target, ("", "", ""))
+ innernode = nodes.inline(sectname, sectname)
+
+ if not docname:
+ return None
+
+ assert self.app.builder
+ return make_refnode(self.app.builder, fromdocname, docname, labelid, innernode)
+
+ def _resolve_doc_nested(
+ self, node: pending_xref, fromdocname: str
+ ) -> Optional[Element]:
+ """This is the same as ``sphinx.domains.std._resolve_doc_xref``,
+ but allows for nested syntax, rather than converting the inner node to raw text.
+
+ It also allows for extensions on document names.
+ """
+ # directly reference to document by source name; can be absolute or relative
+ refdoc = node.get("refdoc", fromdocname)
+ docname = docname_join(refdoc, node["reftarget"])
+
+ if docname not in self.env.all_docs:
+ # try stripping known extensions from doc name
+ if os.path.splitext(docname)[1] in self.env.config.source_suffix:
+ docname = os.path.splitext(docname)[0]
+ if docname not in self.env.all_docs:
+ return None
+
+ if node["refexplicit"]:
+ # reference with explicit title
+ caption = node.astext()
+ innernode = nodes.inline(caption, "", classes=["doc"])
+ innernode.extend(node[0].children)
+ else:
+ # TODO do we want nested syntax for titles?
+ caption = clean_astext(self.env.titles[docname])
+ innernode = nodes.inline(caption, caption, classes=["doc"])
+
+ assert self.app.builder
+ return make_refnode(self.app.builder, fromdocname, docname, "", innernode)