Diffstat (limited to 'myst_parser/parsers')

-rw-r--r--  myst_parser/parsers/__init__.py       1
-rw-r--r--  myst_parser/parsers/directives.py   190
-rw-r--r--  myst_parser/parsers/docutils_.py    275
-rw-r--r--  myst_parser/parsers/mdit.py         123
-rw-r--r--  myst_parser/parsers/parse_html.py   440
-rw-r--r--  myst_parser/parsers/sphinx_.py       69

6 files changed, 1098 insertions, 0 deletions
diff --git a/myst_parser/parsers/__init__.py b/myst_parser/parsers/__init__.py
new file mode 100644
index 0000000..26fbfca
--- /dev/null
+++ b/myst_parser/parsers/__init__.py
@@ -0,0 +1 @@
+"""Parsers of MyST Markdown source text to docutils AST."""
diff --git a/myst_parser/parsers/directives.py b/myst_parser/parsers/directives.py
new file mode 100644
index 0000000..5637254
--- /dev/null
+++ b/myst_parser/parsers/directives.py
@@ -0,0 +1,190 @@
+"""Fenced code blocks are parsed as directives,
+if the block starts with ``{directive_name}``,
+followed by arguments on the same line.
+
+Directive options are read from a YAML block,
+if the first content line starts with ``---``, e.g.
+
+::
+
+ ```{directive_name} arguments
+ ---
+ option1: name
+ option2: |
+ Longer text block
+ ---
+ content...
+ ```
+
+Or the option block will be parsed if the first content line starts with ``:``,
+as a YAML block consisting of every line that starts with a ``:``, e.g.
+
+::
+
+ ```{directive_name} arguments
+ :option1: name
+ :option2: other
+
+ content...
+ ```
+
+If the first line of a directive's content is blank, this will be stripped
+from the content.
+This is to allow for separation between the option block and content.
+
+"""
+from __future__ import annotations
+
+import datetime
+import re
+from textwrap import dedent
+from typing import Any, Callable
+
+import yaml
+from docutils.parsers.rst import Directive
+from docutils.parsers.rst.directives.misc import TestDirective
+
+
+class DirectiveParsingError(Exception):
+ """Raise on parsing/validation error."""
+
+ pass
+
+
+def parse_directive_text(
+ directive_class: type[Directive],
+ first_line: str,
+ content: str,
+ validate_options: bool = True,
+) -> tuple[list[str], dict, list[str], int]:
+ """Parse (and validate) the full directive text.
+
+ :param first_line: The text on the same line as the directive name.
+        May be an argument or body text, depending on the directive
+ :param content: All text after the first line. Can include options.
+ :param validate_options: Whether to validate the values of options
+
+ :returns: (arguments, options, body_lines, content_offset)
+ """
+ if directive_class.option_spec:
+ body, options = parse_directive_options(
+ content, directive_class, validate=validate_options
+ )
+ body_lines = body.splitlines()
+ content_offset = len(content.splitlines()) - len(body_lines)
+ else:
+ # If there are no possible options, we do not look for a YAML block
+ options = {}
+ body_lines = content.splitlines()
+ content_offset = 0
+
+ if not (directive_class.required_arguments or directive_class.optional_arguments):
+ # If there are no possible arguments, then the body starts on the argument line
+ if first_line:
+ body_lines.insert(0, first_line)
+ arguments = []
+ else:
+ arguments = parse_directive_arguments(directive_class, first_line)
+
+ # remove first line of body if blank
+ # this is to allow space between the options and the content
+ if body_lines and not body_lines[0].strip():
+ body_lines = body_lines[1:]
+ content_offset += 1
+
+ # check for body content
+ if body_lines and not directive_class.has_content:
+ raise DirectiveParsingError("No content permitted")
+
+ return arguments, options, body_lines, content_offset
+
+
+def parse_directive_options(
+ content: str, directive_class: type[Directive], validate: bool = True
+):
+ """Parse (and validate) the directive option section."""
+ options: dict[str, Any] = {}
+ if content.startswith("---"):
+ content = "\n".join(content.splitlines()[1:])
+ match = re.search(r"^-{3,}", content, re.MULTILINE)
+ if match:
+ yaml_block = content[: match.start()]
+ content = content[match.end() + 1 :] # TODO advance line number
+ else:
+ yaml_block = content
+ content = ""
+ yaml_block = dedent(yaml_block)
+ try:
+ options = yaml.safe_load(yaml_block) or {}
+ except (yaml.parser.ParserError, yaml.scanner.ScannerError) as error:
+ raise DirectiveParsingError("Invalid options YAML: " + str(error))
+ elif content.lstrip().startswith(":"):
+ content_lines = content.splitlines() # type: list
+ yaml_lines = []
+ while content_lines:
+ if not content_lines[0].lstrip().startswith(":"):
+ break
+ yaml_lines.append(content_lines.pop(0).lstrip()[1:])
+ yaml_block = "\n".join(yaml_lines)
+ content = "\n".join(content_lines)
+ try:
+ options = yaml.safe_load(yaml_block) or {}
+ except (yaml.parser.ParserError, yaml.scanner.ScannerError) as error:
+ raise DirectiveParsingError("Invalid options YAML: " + str(error))
+ if not isinstance(options, dict):
+ raise DirectiveParsingError(f"Invalid options (not dict): {options}")
+
+ if (not validate) or issubclass(directive_class, TestDirective):
+ # technically this directive spec only accepts one option ('option')
+        # but since it's for testing only we accept all options
+ return content, options
+
+ # check options against spec
+ options_spec: dict[str, Callable] = directive_class.option_spec
+ for name, value in list(options.items()):
+ try:
+ convertor = options_spec[name]
+ except KeyError:
+ raise DirectiveParsingError(f"Unknown option: {name}")
+ if not isinstance(value, str):
+ if value is True or value is None:
+ value = None # flag converter requires no argument
+ elif isinstance(value, (int, float, datetime.date, datetime.datetime)):
+ # convertor always requires string input
+ value = str(value)
+ else:
+ raise DirectiveParsingError(
+ f'option "{name}" value not string (enclose with ""): {value}'
+ )
+ try:
+ converted_value = convertor(value)
+ except (ValueError, TypeError) as error:
+ raise DirectiveParsingError(
+ "Invalid option value: (option: '{}'; value: {})\n{}".format(
+ name, value, error
+ )
+ )
+ options[name] = converted_value
+
+ return content, options
+
+
+def parse_directive_arguments(directive, arg_text):
+ """Parse (and validate) the directive argument section."""
+ required = directive.required_arguments
+ optional = directive.optional_arguments
+ arguments = arg_text.split()
+ if len(arguments) < required:
+ raise DirectiveParsingError(
+ f"{required} argument(s) required, {len(arguments)} supplied"
+ )
+ elif len(arguments) > required + optional:
+ if directive.final_argument_whitespace:
+ arguments = arg_text.split(None, required + optional - 1)
+ else:
+ raise DirectiveParsingError(
+ "maximum {} argument(s) allowed, {} supplied".format(
+ required + optional, len(arguments)
+ )
+ )
+ return arguments
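
As a minimal usage sketch of the helpers above (the docutils ``note`` admonition is
used purely as an assumed example directive, and the commented result values are
indicative only)::

    # Hypothetical driver for parse_directive_text (not part of the diff).
    from docutils.parsers.rst.directives.admonitions import Note

    from myst_parser.parsers.directives import parse_directive_text

    first_line = ""  # text after ``{note}`` on the opening fence line
    content = ":class: tip\n\nAlways validate the option block."

    arguments, options, body, offset = parse_directive_text(Note, first_line, content)
    # arguments -> []
    # options   -> {'class': ['tip']} (after the option-spec convertor runs)
    # body      -> ['Always validate the option block.']
    # offset    -> 2 (the option line plus the stripped blank line)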
diff --git a/myst_parser/parsers/docutils_.py b/myst_parser/parsers/docutils_.py
new file mode 100644
index 0000000..aaef5e2
--- /dev/null
+++ b/myst_parser/parsers/docutils_.py
@@ -0,0 +1,275 @@
+"""MyST Markdown parser for docutils."""
+from dataclasses import Field
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
+
+from docutils import frontend, nodes
+from docutils.core import default_description, publish_cmdline
+from docutils.parsers.rst import Parser as RstParser
+from typing_extensions import Literal, get_args, get_origin
+
+from myst_parser.config.main import (
+ MdParserConfig,
+ TopmatterReadError,
+ merge_file_level,
+ read_topmatter,
+)
+from myst_parser.mdit_to_docutils.base import DocutilsRenderer, create_warning
+from myst_parser.parsers.mdit import create_md_parser
+
+
+def _validate_int(
+ setting, value, option_parser, config_parser=None, config_section=None
+) -> int:
+ """Validate an integer setting."""
+ return int(value)
+
+
+def _create_validate_tuple(length: int) -> Callable[..., Tuple[str, ...]]:
+ """Create a validator for a tuple of length `length`."""
+
+ def _validate(
+ setting, value, option_parser, config_parser=None, config_section=None
+ ):
+ string_list = frontend.validate_comma_separated_list(
+ setting, value, option_parser, config_parser, config_section
+ )
+ if len(string_list) != length:
+ raise ValueError(
+ f"Expecting {length} items in {setting}, got {len(string_list)}."
+ )
+ return tuple(string_list)
+
+ return _validate
+
+
+class Unset:
+ """A sentinel class for unset settings."""
+
+ def __repr__(self):
+ return "UNSET"
+
+
+DOCUTILS_UNSET = Unset()
+"""Sentinel for arguments not set through docutils.conf."""
+
+
+DOCUTILS_EXCLUDED_ARGS = (
+ # docutils.conf can't represent callables
+ "heading_slug_func",
+ # docutils.conf can't represent dicts
+ "html_meta",
+ "substitutions",
+ # we can't add substitutions so not needed
+ "sub_delimiters",
+ # sphinx only options
+ "heading_anchors",
+ "ref_domains",
+ "update_mathjax",
+ "mathjax_classes",
+)
+"""Names of settings that cannot be set in docutils.conf."""
+
+
+def _attr_to_optparse_option(at: Field, default: Any) -> Tuple[dict, str]:
+    """Convert a field into a Docutils optparse options dict, plus a help string for its default."""
+ if at.type is int:
+ return {"metavar": "<int>", "validator": _validate_int}, f"(default: {default})"
+ if at.type is bool:
+ return {
+ "metavar": "<boolean>",
+ "validator": frontend.validate_boolean,
+ }, f"(default: {default})"
+ if at.type is str:
+ return {
+ "metavar": "<str>",
+ }, f"(default: '{default}')"
+ if get_origin(at.type) is Literal and all(
+ isinstance(a, str) for a in get_args(at.type)
+ ):
+ args = get_args(at.type)
+ return {
+ "metavar": f"<{'|'.join(repr(a) for a in args)}>",
+ "type": "choice",
+ "choices": args,
+ }, f"(default: {default!r})"
+ if at.type in (Iterable[str], Sequence[str]):
+ return {
+ "metavar": "<comma-delimited>",
+ "validator": frontend.validate_comma_separated_list,
+ }, f"(default: '{','.join(default)}')"
+ if at.type == Tuple[str, str]:
+ return {
+ "metavar": "<str,str>",
+ "validator": _create_validate_tuple(2),
+ }, f"(default: '{','.join(default)}')"
+ if at.type == Union[int, type(None)]:
+ return {
+ "metavar": "<null|int>",
+ "validator": _validate_int,
+ }, f"(default: {default})"
+ if at.type == Union[Iterable[str], type(None)]:
+ default_str = ",".join(default) if default else ""
+ return {
+ "metavar": "<null|comma-delimited>",
+ "validator": frontend.validate_comma_separated_list,
+ }, f"(default: {default_str!r})"
+ raise AssertionError(
+ f"Configuration option {at.name} not set up for use in docutils.conf."
+ )
+
+
+def attr_to_optparse_option(
+ attribute: Field, default: Any, prefix: str = "myst_"
+) -> Tuple[str, List[str], Dict[str, Any]]:
+ """Convert an ``MdParserConfig`` attribute into a Docutils setting tuple.
+
+ :returns: A tuple of ``(help string, option flags, optparse kwargs)``.
+ """
+ name = f"{prefix}{attribute.name}"
+ flag = "--" + name.replace("_", "-")
+ options = {"dest": name, "default": DOCUTILS_UNSET}
+ at_options, type_str = _attr_to_optparse_option(attribute, default)
+ options.update(at_options)
+ help_str = attribute.metadata.get("help", "") if attribute.metadata else ""
+ return (f"{help_str} {type_str}", [flag], options)
+
+
+def create_myst_settings_spec(
+ excluded: Sequence[str], config_cls=MdParserConfig, prefix: str = "myst_"
+):
+    """Return a tuple of Docutils settings for the docutils MyST section."""
+ defaults = config_cls()
+ return tuple(
+ attr_to_optparse_option(at, getattr(defaults, at.name), prefix)
+ for at in config_cls.get_fields()
+ if at.name not in excluded
+ )
+
+
+def create_myst_config(
+ settings: frontend.Values,
+ excluded: Sequence[str],
+ config_cls=MdParserConfig,
+ prefix: str = "myst_",
+):
+ """Create a configuration instance from the given settings."""
+ values = {}
+ for attribute in config_cls.get_fields():
+ if attribute.name in excluded:
+ continue
+ setting = f"{prefix}{attribute.name}"
+ val = getattr(settings, setting, DOCUTILS_UNSET)
+ if val is not DOCUTILS_UNSET:
+ values[attribute.name] = val
+ return config_cls(**values)
+
+
+class Parser(RstParser):
+ """Docutils parser for Markedly Structured Text (MyST)."""
+
+ supported: Tuple[str, ...] = ("md", "markdown", "myst")
+ """Aliases this parser supports."""
+
+ settings_spec = (
+ "MyST options",
+ None,
+ create_myst_settings_spec(DOCUTILS_EXCLUDED_ARGS),
+ *RstParser.settings_spec,
+ )
+ """Runtime settings specification."""
+
+ config_section = "myst parser"
+ config_section_dependencies = ("parsers",)
+ translate_section_name = None
+
+ def parse(self, inputstring: str, document: nodes.document) -> None:
+ """Parse source text.
+
+ :param inputstring: The source string to parse
+ :param document: The root docutils node to add AST elements to
+ """
+
+ self.setup_parse(inputstring, document)
+
+ # check for exorbitantly long lines
+ if hasattr(document.settings, "line_length_limit"):
+ for i, line in enumerate(inputstring.split("\n")):
+ if len(line) > document.settings.line_length_limit:
+ error = document.reporter.error(
+ f"Line {i+1} exceeds the line-length-limit:"
+ f" {document.settings.line_length_limit}."
+ )
+ document.append(error)
+ return
+
+ # create parsing configuration from the global config
+ try:
+ config = create_myst_config(document.settings, DOCUTILS_EXCLUDED_ARGS)
+ except Exception as exc:
+ error = document.reporter.error(f"Global myst configuration invalid: {exc}")
+ document.append(error)
+ config = MdParserConfig()
+
+ # update the global config with the file-level config
+ try:
+ topmatter = read_topmatter(inputstring)
+ except TopmatterReadError:
+ pass # this will be reported during the render
+ else:
+ if topmatter:
+ warning = lambda wtype, msg: create_warning( # noqa: E731
+ document, msg, line=1, append_to=document, subtype=wtype
+ )
+ config = merge_file_level(config, topmatter, warning)
+
+ # parse content
+ parser = create_md_parser(config, DocutilsRenderer)
+ parser.options["document"] = document
+ parser.render(inputstring)
+
+ # post-processing
+
+ # replace raw nodes if raw is not allowed
+ if not getattr(document.settings, "raw_enabled", True):
+ for node in document.traverse(nodes.raw):
+ warning = document.reporter.warning("Raw content disabled.")
+ node.parent.replace(node, warning)
+
+ self.finish_parse()
+
+
+def _run_cli(writer_name: str, writer_description: str, argv: Optional[List[str]]):
+ """Run the command line interface for a particular writer."""
+ publish_cmdline(
+ parser=Parser(),
+ writer_name=writer_name,
+ description=(
+ f"Generates {writer_description} from standalone MyST sources.\n{default_description}"
+ ),
+ argv=argv,
+ )
+
+
+def cli_html(argv: Optional[List[str]] = None) -> None:
+ """Cmdline entrypoint for converting MyST to HTML."""
+ _run_cli("html", "(X)HTML documents", argv)
+
+
+def cli_html5(argv: Optional[List[str]] = None):
+ """Cmdline entrypoint for converting MyST to HTML5."""
+ _run_cli("html5", "HTML5 documents", argv)
+
+
+def cli_latex(argv: Optional[List[str]] = None):
+ """Cmdline entrypoint for converting MyST to LaTeX."""
+ _run_cli("latex", "LaTeX documents", argv)
+
+
+def cli_xml(argv: Optional[List[str]] = None):
+ """Cmdline entrypoint for converting MyST to XML."""
+ _run_cli("xml", "Docutils-native XML", argv)
+
+
+def cli_pseudoxml(argv: Optional[List[str]] = None):
+ """Cmdline entrypoint for converting MyST to pseudo-XML."""
+ _run_cli("pseudoxml", "pseudo-XML", argv)
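
The entry points above are thin wrappers around ``docutils.core.publish_cmdline``,
so they can also be called from Python. A minimal sketch (the file name is
illustrative)::

    # Hypothetical invocation: render a standalone MyST file with the docutils parser.
    from myst_parser.parsers.docutils_ import cli_html

    cli_html(["example.md"])  # reads example.md and writes (X)HTML to stdout

The non-excluded ``MdParserConfig`` fields are exposed as ``--myst-*`` command-line
flags and as options in the ``[myst parser]`` section of docutils.conf.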
diff --git a/myst_parser/parsers/mdit.py b/myst_parser/parsers/mdit.py
new file mode 100644
index 0000000..8476495
--- /dev/null
+++ b/myst_parser/parsers/mdit.py
@@ -0,0 +1,123 @@
+"""This module holds the ``create_md_parser`` function,
+which creates a parser from the config.
+"""
+from __future__ import annotations
+
+from typing import Callable
+
+from markdown_it import MarkdownIt
+from markdown_it.renderer import RendererProtocol
+from mdit_py_plugins.amsmath import amsmath_plugin
+from mdit_py_plugins.anchors import anchors_plugin
+from mdit_py_plugins.attrs import attrs_plugin
+from mdit_py_plugins.colon_fence import colon_fence_plugin
+from mdit_py_plugins.deflist import deflist_plugin
+from mdit_py_plugins.dollarmath import dollarmath_plugin
+from mdit_py_plugins.field_list import fieldlist_plugin
+from mdit_py_plugins.footnote import footnote_plugin
+from mdit_py_plugins.front_matter import front_matter_plugin
+from mdit_py_plugins.myst_blocks import myst_block_plugin
+from mdit_py_plugins.myst_role import myst_role_plugin
+from mdit_py_plugins.substitution import substitution_plugin
+from mdit_py_plugins.tasklists import tasklists_plugin
+from mdit_py_plugins.wordcount import wordcount_plugin
+
+from myst_parser.config.main import MdParserConfig
+
+
+def create_md_parser(
+ config: MdParserConfig, renderer: Callable[[MarkdownIt], RendererProtocol]
+) -> MarkdownIt:
+ """Return a Markdown parser with the required MyST configuration."""
+
+ # TODO warn if linkify required and linkify-it-py not installed
+    # (currently the parse will unceremoniously raise an exception)
+
+ if config.commonmark_only:
+ # see https://spec.commonmark.org/
+ md = MarkdownIt("commonmark", renderer_cls=renderer).use(
+ wordcount_plugin, per_minute=config.words_per_minute
+ )
+ md.options.update({"myst_config": config})
+ return md
+
+ if config.gfm_only:
+ # see https://github.github.com/gfm/
+ md = (
+ MarkdownIt("commonmark", renderer_cls=renderer)
+ # note, strikethrough currently only supported tentatively for HTML
+ .enable("strikethrough")
+ .enable("table")
+ .use(tasklists_plugin)
+ .enable("linkify")
+ .use(wordcount_plugin, per_minute=config.words_per_minute)
+ )
+ md.options.update({"linkify": True, "myst_config": config})
+ return md
+
+ md = (
+ MarkdownIt("commonmark", renderer_cls=renderer)
+ .enable("table")
+ .use(front_matter_plugin)
+ .use(myst_block_plugin)
+ .use(myst_role_plugin)
+ .use(footnote_plugin)
+ .use(wordcount_plugin, per_minute=config.words_per_minute)
+ .disable("footnote_inline")
+        # disable this for now, because it needs a new implementation in the renderer
+ .disable("footnote_tail")
+ )
+
+ typographer = False
+ if "smartquotes" in config.enable_extensions:
+ md.enable("smartquotes")
+ typographer = True
+ if "replacements" in config.enable_extensions:
+ md.enable("replacements")
+ typographer = True
+ if "linkify" in config.enable_extensions:
+ md.enable("linkify")
+ if md.linkify is not None:
+ md.linkify.set({"fuzzy_link": config.linkify_fuzzy_links})
+ if "strikethrough" in config.enable_extensions:
+ md.enable("strikethrough")
+ if "dollarmath" in config.enable_extensions:
+ md.use(
+ dollarmath_plugin,
+ allow_labels=config.dmath_allow_labels,
+ allow_space=config.dmath_allow_space,
+ allow_digits=config.dmath_allow_digits,
+ double_inline=config.dmath_double_inline,
+ )
+ if "colon_fence" in config.enable_extensions:
+ md.use(colon_fence_plugin)
+ if "amsmath" in config.enable_extensions:
+ md.use(amsmath_plugin)
+ if "deflist" in config.enable_extensions:
+ md.use(deflist_plugin)
+ if "fieldlist" in config.enable_extensions:
+ md.use(fieldlist_plugin)
+ if "tasklist" in config.enable_extensions:
+ md.use(tasklists_plugin)
+ if "substitution" in config.enable_extensions:
+ md.use(substitution_plugin, *config.sub_delimiters)
+ if "attrs_image" in config.enable_extensions:
+ md.use(attrs_plugin, after=("image",))
+ if config.heading_anchors is not None:
+ md.use(
+ anchors_plugin,
+ max_level=config.heading_anchors,
+ slug_func=config.heading_slug_func,
+ )
+ for name in config.disable_syntax:
+ md.disable(name, True)
+
+ md.options.update(
+ {
+ "typographer": typographer,
+ "linkify": "linkify" in config.enable_extensions,
+ "myst_config": config,
+ }
+ )
+
+ return md
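
A short sketch of driving the factory above; ``RendererHTML`` is used only so the
example is self-contained, since the MyST renderers (``DocutilsRenderer`` /
``SphinxRenderer``) additionally require a docutils document::

    # Hypothetical sketch: build a parser from the default config and
    # inspect the markdown-it token stream without rendering.
    from markdown_it.renderer import RendererHTML

    from myst_parser.config.main import MdParserConfig
    from myst_parser.parsers.mdit import create_md_parser

    md = create_md_parser(MdParserConfig(), RendererHTML)
    tokens = md.parse("# A title\n\nSome *emphasised* text")
    print([token.type for token in tokens])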
diff --git a/myst_parser/parsers/parse_html.py b/myst_parser/parsers/parse_html.py
new file mode 100644
index 0000000..7539e42
--- /dev/null
+++ b/myst_parser/parsers/parse_html.py
@@ -0,0 +1,440 @@
+"""A simple but complete HTML to Abstract Syntax Tree (AST) parser.
+
+The AST can also reproduce the HTML text.
+
+Example::
+
+ >> text = '<div class="note"><p>text</p></div>'
+ >> ast = tokenize_html(text)
+ >> list(ast.walk(include_self=True))
+ [Root(''), Tag('div', {'class': 'note'}), Tag('p'), Data('text')]
+ >> str(ast)
+ '<div class="note"><p>text</p></div>'
+ >> str(ast[0][0])
+ '<p>text</p>'
+
+Note: optional tags are not accounted for
+(see https://html.spec.whatwg.org/multipage/syntax.html#optional-tags)
+
+"""
+from __future__ import annotations
+
+import inspect
+import itertools
+from collections import abc, deque
+from html.parser import HTMLParser
+from typing import Any, Callable, Iterable, Iterator
+
+
+class Attribute(dict):
+    """This class holds the tag's attributes."""
+
+ def __getitem__(self, key: str) -> str:
+        """Return the value for the key, or '' if the key is not present."""
+ return self.get(key, "")
+
+ @property
+ def classes(self) -> list[str]:
+ """Return 'class' attribute as list."""
+ return self["class"].split()
+
+ def __str__(self) -> str:
+        """Return an HTML string representation of the attributes."""
+ return " ".join(f'{key}="{value}"' for key, value in self.items())
+
+
+class Element(abc.MutableSequence):
+ """An Element of the xml/html document.
+
+ All xml/html entities inherit from this class.
+ """
+
+ def __init__(self, name: str = "", attr: dict | None = None) -> None:
+ """Initialise the element."""
+ self.name = name
+ self.attrs: Attribute = Attribute(attr or {})
+ self._parent: Element | None = None
+ self._children: list[Element] = []
+
+ @property
+ def parent(self) -> Element | None:
+ """Return parent."""
+ return self._parent
+
+ @property
+ def children(self) -> list[Element]:
+ """Return copy of children."""
+ return self._children[:]
+
+ def reset_children(self, children: list[Element], deepcopy: bool = False):
+ new_children = []
+ for i, item in enumerate(children):
+ assert isinstance(item, Element)
+ if deepcopy:
+ item = item.deepcopy()
+ if item._parent is None:
+ item._parent = self
+ elif item._parent != self:
+ raise AssertionError(f"different parent already set for item {i}")
+ new_children.append(item)
+ self._children = new_children
+
+ def __getitem__(self, index: int) -> Element: # type: ignore[override]
+ return self._children[index]
+
+ def __setitem__(self, index: int, item: Element): # type: ignore[override]
+ assert isinstance(item, Element)
+ if item._parent is not None and item._parent != self:
+ raise AssertionError(f"different parent already set for: {item!r}")
+ item._parent = self
+ return self._children.__setitem__(index, item)
+
+ def __delitem__(self, index: int): # type: ignore[override]
+ return self._children.__delitem__(index)
+
+ def __len__(self) -> int:
+ return self._children.__len__()
+
+ def __iter__(self) -> Iterator[Element]:
+ yield from self._children
+
+ def insert(self, index: int, item: Element):
+ assert isinstance(item, Element)
+ if item._parent is not None and item._parent != self:
+ raise AssertionError(f"different parent already set for: {item!r}")
+ item._parent = self
+ return self._children.insert(index, item)
+
+ def deepcopy(self) -> Element:
+ """Recursively copy and remove parent."""
+ _copy = self.__class__(self.name, self.attrs)
+ for child in self:
+ _copy_child = child.deepcopy()
+ _copy.append(_copy_child)
+ return _copy
+
+ def __repr__(self) -> str:
+ text = f"{self.__class__.__name__}({self.name!r}"
+ if self.attrs:
+ text += f", {self.attrs!r}"
+ text += ")"
+ return text
+
+ def render(
+ self,
+ tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
+ **kwargs,
+ ) -> str:
+        """Return an HTML string representation of the element.
+
+ :param tag_overrides: Provide a dictionary of render function
+ for specific tag names, to override the normal render format
+
+ """
+ raise NotImplementedError
+
+ def __str__(self) -> str:
+ return self.render()
+
+ def __eq__(self, item: Any) -> bool:
+ return item is self
+
+ def walk(self, include_self: bool = False) -> Iterator[Element]:
+ """Walk through the xml/html AST."""
+ if include_self:
+ yield self
+ for child in self:
+ yield child
+ yield from child.walk()
+
+ def strip(self, inplace: bool = False, recurse: bool = False) -> Element:
+ """Return copy with all `Data` tokens
+ that only contain whitespace / newlines removed.
+ """
+ element = self
+ if not inplace:
+ element = self.deepcopy()
+ element.reset_children(
+ [
+ e
+ for e in element.children
+ if not (isinstance(e, Data) and e.data.strip() == "")
+ ]
+ )
+ if recurse:
+ for child in element:
+ child.strip(inplace=True, recurse=True)
+ return element
+
+ def find(
+ self,
+ identifier: str | type[Element],
+ attrs: dict | None = None,
+ classes: Iterable[str] | None = None,
+ include_self: bool = False,
+ recurse: bool = True,
+ ) -> Iterator[Element]:
+ """Find all elements that match name and specific attributes."""
+ iterator = self.walk() if recurse else self
+ if include_self:
+ iterator = itertools.chain([self], iterator)
+ if inspect.isclass(identifier):
+ test_func = lambda c: isinstance(c, identifier) # noqa: E731
+ else:
+ test_func = lambda c: c.name == identifier # noqa: E731
+ classes = set(classes) if classes is not None else classes
+ for child in iterator:
+ if test_func(child):
+ if classes is not None and not classes.issubset(child.attrs.classes):
+ continue
+ for key, value in (attrs or {}).items():
+ if child.attrs[key] != value:
+ break
+ else:
+ yield child
+
+
+class Root(Element):
+ """The root of the AST tree."""
+
+ def render(self, **kwargs) -> str: # type: ignore[override]
+        """Return an HTML string representation of the structure."""
+ return "".join(child.render(**kwargs) for child in self)
+
+
+class Tag(Element):
+    """Represent xml/html tags of the form: <name key="value" ...> ... </name>."""
+
+ def render(
+ self,
+ tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
+ **kwargs,
+ ) -> str:
+ if tag_overrides and self.name in tag_overrides:
+ return tag_overrides[self.name](self, tag_overrides)
+ return (
+ f"<{self.name}{' ' if self.attrs else ''}{self.attrs}>"
+ + "".join(
+ child.render(tag_overrides=tag_overrides, **kwargs) for child in self
+ )
+ + f"</{self.name}>"
+ )
+
+
+class XTag(Element):
+    """Represent XHTML-style self-closing tags with no children, like `<img src="t.gif" />`"""
+
+ def render(
+ self,
+ tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
+ **kwargs,
+ ) -> str:
+ if tag_overrides is not None and self.name in tag_overrides:
+ return tag_overrides[self.name](self, tag_overrides)
+ return f"<{self.name}{' ' if self.attrs else ''}{self.attrs}/>"
+
+
+class VoidTag(Element):
+    """Represent void tags that have only a start tag and no children, like `<img src="t.gif" >`"""
+
+ def render(self, **kwargs) -> str: # type: ignore[override]
+ return f"<{self.name}{' ' if self.attrs else ''}{self.attrs}>"
+
+
+class TerminalElement(Element):
+ def __init__(self, data: str):
+ super().__init__("")
+ self.data: str = data
+
+ def __repr__(self) -> str:
+ text = self.data
+ if len(text) > 20:
+ text = text[:17] + "..."
+ return f"{self.__class__.__name__}({text!r})"
+
+ def deepcopy(self) -> TerminalElement:
+ """Copy and remove parent."""
+ _copy = self.__class__(self.data)
+ return _copy
+
+
+class Data(TerminalElement):
+ """Represent data inside xml/html documents, like raw text."""
+
+ def render(self, **kwargs) -> str: # type: ignore[override]
+ return self.data
+
+
+class Declaration(TerminalElement):
+ """Represent declarations, like `<!DOCTYPE html>`"""
+
+ def render(self, **kwargs) -> str: # type: ignore[override]
+ return f"<!{self.data}>"
+
+
+class Comment(TerminalElement):
+ """Represent HTML comments"""
+
+ def render(self, **kwargs) -> str: # type: ignore[override]
+ return f"<!--{self.data}-->"
+
+
+class Pi(TerminalElement):
+ """Represent processing instructions like `<?xml-stylesheet ?>`"""
+
+ def render(self, **kwargs) -> str: # type: ignore[override]
+ return f"<?{self.data}>"
+
+
+class Char(TerminalElement):
+ """Represent character codes like: `&#0`"""
+
+ def render(self, **kwargs) -> str: # type: ignore[override]
+ return f"&#{self.data};"
+
+
+class Entity(TerminalElement):
+ """Represent entities like `&amp`"""
+
+ def render(self, **kwargs) -> str: # type: ignore[override]
+ return f"&{self.data};"
+
+
+class Tree:
+ """The engine class to generate the AST tree."""
+
+ def __init__(self, name: str = ""):
+ """Initialise Tree"""
+ self.name = name
+ self.outmost = Root(name)
+ self.stack: deque = deque()
+ self.stack.append(self.outmost)
+
+ def clear(self):
+        """Clear the outmost element and the stack, ready for a new parse."""
+ self.outmost = Root(self.name)
+ self.stack.clear()
+ self.stack.append(self.outmost)
+
+ def last(self) -> Element:
+        """Return the last pointer, which points to the current tag scope."""
+ return self.stack[-1]
+
+ def nest_tag(self, name: str, attrs: dict):
+        """Nest a given tag at the bottom of the tree,
+        using the pointer at the top of the stack.
+        """
+ pointer = self.stack.pop()
+ item = Tag(name, attrs)
+ pointer.append(item)
+ self.stack.append(pointer)
+ self.stack.append(item)
+
+ def nest_xtag(self, name: str, attrs: dict):
+ """Nest an XTag onto the tree."""
+ top = self.last()
+ item = XTag(name, attrs)
+ top.append(item)
+
+ def nest_vtag(self, name: str, attrs: dict):
+ """Nest a VoidTag onto the tree."""
+ top = self.last()
+ item = VoidTag(name, attrs)
+ top.append(item)
+
+ def nest_terminal(self, klass: type[TerminalElement], data: str):
+ """Nest the data onto the tree."""
+ top = self.last()
+ item = klass(data)
+ top.append(item)
+
+ def enclose(self, name: str):
+        """When a closing tag is found, pop scopes off the stack
+        until the matching opening tag's scope is reached.
+        """
+ count = 0
+ for ind in reversed(self.stack):
+ count = count + 1
+ if ind.name == name:
+ break
+ else:
+ count = 0
+
+        # Pop every scope up to and including the matching tag (nothing is popped if no match was found).
+ for _ in range(0, count):
+ self.stack.pop()
+
+
+class HtmlToAst(HTMLParser):
+ """The tokenizer class."""
+
+ # see https://html.spec.whatwg.org/multipage/syntax.html#void-elements
+ void_elements = {
+ "area",
+ "base",
+ "br",
+ "col",
+ "embed",
+ "hr",
+ "img",
+ "input",
+ "link",
+ "meta",
+ "param",
+ "source",
+ "track",
+ "wbr",
+ }
+
+ def __init__(self, name: str = "", convert_charrefs: bool = False):
+ super().__init__(convert_charrefs=convert_charrefs)
+ self.struct = Tree(name)
+
+ def feed(self, source: str) -> Root: # type: ignore[override]
+ """Parse the source string."""
+ self.struct.clear()
+ super().feed(source)
+ return self.struct.outmost
+
+ def handle_starttag(self, name: str, attr):
+        """When an opening tag is found, nest it onto the tree."""
+ if name in self.void_elements:
+ self.struct.nest_vtag(name, attr)
+ else:
+ self.struct.nest_tag(name, attr)
+
+ def handle_startendtag(self, name: str, attr):
+        """When an XHTML-style self-closing tag is found, nest it onto the tree."""
+ self.struct.nest_xtag(name, attr)
+
+ def handle_endtag(self, name: str):
+        """When a closing tag is found, close back to the matching scope."""
+ if name not in self.void_elements:
+ self.struct.enclose(name)
+
+ def handle_data(self, data: str):
+ """Nest data onto the tree."""
+ self.struct.nest_terminal(Data, data)
+
+ def handle_decl(self, decl: str):
+ self.struct.nest_terminal(Declaration, decl)
+
+ def unknown_decl(self, decl: str):
+ self.struct.nest_terminal(Declaration, decl)
+
+ def handle_charref(self, data: str):
+ self.struct.nest_terminal(Char, data)
+
+ def handle_entityref(self, data: str):
+ self.struct.nest_terminal(Entity, data)
+
+ def handle_pi(self, data: str):
+ self.struct.nest_terminal(Pi, data)
+
+ def handle_comment(self, data: str):
+ self.struct.nest_terminal(Comment, data)
+
+
+def tokenize_html(text: str, name: str = "", convert_charrefs: bool = False) -> Root:
+ parser = HtmlToAst(name, convert_charrefs=convert_charrefs)
+ return parser.feed(text)
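
A small usage sketch of the tokenizer and traversal helpers above (the printed
values are indicative)::

    # Hypothetical usage of tokenize_html and Element.find (not part of the diff).
    from myst_parser.parsers.parse_html import tokenize_html

    ast = tokenize_html('<div class="note"><p>text</p><img src="t.gif"></div>')
    div = next(ast.find("div", classes=["note"]))
    print(div.attrs.classes)  # ['note']
    print(str(div))           # renders the element back to its HTML form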
diff --git a/myst_parser/parsers/sphinx_.py b/myst_parser/parsers/sphinx_.py
new file mode 100644
index 0000000..fff098f
--- /dev/null
+++ b/myst_parser/parsers/sphinx_.py
@@ -0,0 +1,69 @@
+"""MyST Markdown parser for sphinx."""
+from __future__ import annotations
+
+from docutils import nodes
+from docutils.parsers.rst import Parser as RstParser
+from sphinx.parsers import Parser as SphinxParser
+from sphinx.util import logging
+
+from myst_parser.config.main import (
+ MdParserConfig,
+ TopmatterReadError,
+ merge_file_level,
+ read_topmatter,
+)
+from myst_parser.mdit_to_docutils.sphinx_ import SphinxRenderer, create_warning
+from myst_parser.parsers.mdit import create_md_parser
+
+SPHINX_LOGGER = logging.getLogger(__name__)
+
+
+class MystParser(SphinxParser):
+ """Sphinx parser for Markedly Structured Text (MyST)."""
+
+ supported: tuple[str, ...] = ("md", "markdown", "myst")
+ """Aliases this parser supports."""
+
+ settings_spec = RstParser.settings_spec
+ """Runtime settings specification.
+
+ Defines runtime settings and associated command-line options, as used by
+ `docutils.frontend.OptionParser`. This is a concatenation of tuples of:
+
+ - Option group title (string or `None` which implies no group, just a list
+ of single options).
+
+ - Description (string or `None`).
+
+ - A sequence of option tuples
+ """
+
+ config_section = "myst parser"
+ config_section_dependencies = ("parsers",)
+ translate_section_name = None
+
+ def parse(self, inputstring: str, document: nodes.document) -> None:
+ """Parse source text.
+
+ :param inputstring: The source string to parse
+ :param document: The root docutils node to add AST elements to
+
+ """
+ # get the global config
+ config: MdParserConfig = document.settings.env.myst_config
+
+ # update the global config with the file-level config
+ try:
+ topmatter = read_topmatter(inputstring)
+ except TopmatterReadError:
+ pass # this will be reported during the render
+ else:
+ if topmatter:
+ warning = lambda wtype, msg: create_warning( # noqa: E731
+ document, msg, line=1, append_to=document, subtype=wtype
+ )
+ config = merge_file_level(config, topmatter, warning)
+
+ parser = create_md_parser(config, SphinxRenderer)
+ parser.options["document"] = document
+ parser.render(inputstring)
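
In a Sphinx project this class is not instantiated directly: the ``myst_parser``
extension registers it for Markdown sources. A minimal ``conf.py`` sketch (the
option values are illustrative)::

    # conf.py -- the myst_parser extension wires MystParser to the .md suffix.
    extensions = ["myst_parser"]
    source_suffix = {".rst": "restructuredtext", ".md": "markdown"}
    myst_enable_extensions = ["colon_fence", "deflist"]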