6 files changed, 1098 insertions, 0 deletions
diff --git a/myst_parser/parsers/__init__.py b/myst_parser/parsers/__init__.py
new file mode 100644
index 0000000..26fbfca
--- /dev/null
+++ b/myst_parser/parsers/__init__.py
@@ -0,0 +1 @@
+"""Parsers of MyST Markdown source text to docutils AST."""
diff --git a/myst_parser/parsers/directives.py b/myst_parser/parsers/directives.py
new file mode 100644
index 0000000..5637254
--- /dev/null
+++ b/myst_parser/parsers/directives.py
@@ -0,0 +1,190 @@
+"""Fenced code blocks are parsed as directives,
+if the block starts with ``{directive_name}``,
+followed by arguments on the same line.
+
+Directive options are read from a YAML block,
+if the first content line starts with ``---``, e.g.
+
+::
+
+    ```{directive_name} arguments
+    ---
+    option1: name
+    option2: |
+        Longer text block
+    ---
+    content...
+    ```
+
+Or the option block will be parsed if the first content line starts with ``:``,
+as a YAML block consisting of every line that starts with a ``:``, e.g.
+
+::
+
+    ```{directive_name} arguments
+    :option1: name
+    :option2: other
+
+    content...
+    ```
+
+If the first line of a directive's content is blank, this will be stripped
+from the content.
+This is to allow for separation between the option block and content.
+
+"""
+from __future__ import annotations
+
+import datetime
+import re
+from textwrap import dedent
+from typing import Any, Callable
+
+import yaml
+from docutils.parsers.rst import Directive
+from docutils.parsers.rst.directives.misc import TestDirective
+
+
+class DirectiveParsingError(Exception):
+    """Raised when directive arguments, options or content fail validation."""
+
+    pass
+
+
+def parse_directive_text(
+    directive_class: type[Directive],
+    first_line: str,
+    content: str,
+    validate_options: bool = True,
+) -> tuple[list[str], dict, list[str], int]:
+    """Parse (and validate) the full directive text.
+
+    :param directive_class: The directive class, whose spec drives the parsing
+    :param first_line: Text on the directive-name line (an argument or body text)
+    :param content: All text after the first line. Can include options.
+    :param validate_options: Whether to validate the values of options
+
+    :returns: (arguments, options, body_lines, content_offset)
+    """
+    if directive_class.option_spec:
+        body, options = parse_directive_options(
+            content, directive_class, validate=validate_options
+        )
+        body_lines = body.splitlines()
+        content_offset = len(content.splitlines()) - len(body_lines)
+    else:
+        # If there are no possible options, we do not look for a YAML block
+        options = {}
+        body_lines = content.splitlines()
+        content_offset = 0
+
+    if not (directive_class.required_arguments or directive_class.optional_arguments):
+        # If there are no possible arguments, then the body starts on the argument line
+        if first_line:
+            body_lines.insert(0, first_line)
+        arguments = []
+    else:
+        arguments = parse_directive_arguments(directive_class, first_line)
+
+    # remove first line of body if blank
+    # this is to allow space between the options and the content
+    if body_lines and not body_lines[0].strip():
+        body_lines = body_lines[1:]
+        content_offset += 1
+
+    # check for body content
+    if body_lines and not directive_class.has_content:
+        raise DirectiveParsingError("No content permitted")
+
+    return arguments, options, body_lines, content_offset
+
+
+def parse_directive_options(
+    content: str, directive_class: type[Directive], validate: bool = True
+) -> tuple[str, dict]:
+    """Parse (and validate) the directive option section.
+
+    :returns: (content with the option block removed, options)
+    :raises DirectiveParsingError: on invalid YAML or options that fail the spec
+    """
+    options: dict[str, Any] = {}
+    yaml_block: str | None = None
+    if content.startswith("---"):
+        content = "\n".join(content.splitlines()[1:])
+        match = re.search(r"^-{3,}", content, re.MULTILINE)
+        if match:
+            yaml_block = content[: match.start()]
+            content = content[match.end() + 1 :]  # TODO advance line number
+        else:
+            yaml_block = content
+            content = ""
+        yaml_block = dedent(yaml_block)
+    elif content.lstrip().startswith(":"):
+        content_lines = content.splitlines()  # type: list
+        yaml_lines = []
+        while content_lines and content_lines[0].lstrip().startswith(":"):
+            yaml_lines.append(content_lines.pop(0).lstrip()[1:])
+        yaml_block = "\n".join(yaml_lines)
+        content = "\n".join(content_lines)
+
+    if yaml_block is not None:
+        try:
+            options = yaml.safe_load(yaml_block) or {}
+        except (yaml.parser.ParserError, yaml.scanner.ScannerError) as error:
+            raise DirectiveParsingError(f"Invalid options YAML: {error}") from error
+        # applies to both block styles: a YAML list/scalar is not an option mapping
+        if not isinstance(options, dict):
+            raise DirectiveParsingError(f"Invalid options (not dict): {options}")
+
+    if (not validate) or issubclass(directive_class, TestDirective):
+        # technically this directive spec only accepts one option ('option')
+        # but since its for testing only we accept all options
+        return content, options
+
+    # check options against spec
+    options_spec: dict[str, Callable] = directive_class.option_spec
+    for name, value in list(options.items()):
+        try:
+            convertor = options_spec[name]
+        except KeyError:
+            raise DirectiveParsingError(f"Unknown option: {name}") from None
+        if not isinstance(value, str):
+            if value is True or value is None:
+                value = None  # flag converter requires no argument
+            elif isinstance(value, (int, float, datetime.date, datetime.datetime)):
+                # convertor always requires string input
+                value = str(value)
+            else:
+                raise DirectiveParsingError(
+                    f'option "{name}" value not string (enclose with ""): {value}'
+                )
+        try:
+            converted_value = convertor(value)
+        except (ValueError, TypeError) as error:
+            raise DirectiveParsingError(
+                f"Invalid option value: (option: '{name}'; value: {value})\n{error}"
+            ) from error
+        options[name] = converted_value
+
+    return content, options
+
+
+def parse_directive_arguments(directive, arg_text):
+    """Split the argument text and check the count against the directive spec.
+
+    Surplus words fold into the last argument if ``final_argument_whitespace``.
+    """
+    minimum = directive.required_arguments
+    maximum = minimum + directive.optional_arguments
+    words = arg_text.split()
+    supplied = len(words)
+    if supplied < minimum:
+        raise DirectiveParsingError(
+            f"{minimum} argument(s) required, {supplied} supplied"
+        )
+    if supplied <= maximum:
+        return words
+    if not directive.final_argument_whitespace:
+        message = f"maximum {maximum} argument(s) allowed, {supplied} supplied"
+        raise DirectiveParsingError(message)
+    return arg_text.split(None, maximum - 1)
diff --git a/myst_parser/parsers/docutils_.py b/myst_parser/parsers/docutils_.py
new file mode 100644
index 0000000..aaef5e2
--- /dev/null
+++ b/myst_parser/parsers/docutils_.py
@@ -0,0 +1,275 @@
+"""MyST Markdown parser for docutils."""
+from dataclasses import Field
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
+
+from docutils import frontend, nodes
+from docutils.core import default_description, publish_cmdline
+from docutils.parsers.rst import Parser as RstParser
+from typing_extensions import Literal, get_args, get_origin
+
+from myst_parser.config.main import (
+    MdParserConfig,
+    TopmatterReadError,
+    merge_file_level,
+    read_topmatter,
+)
+from myst_parser.mdit_to_docutils.base import DocutilsRenderer, create_warning
+from myst_parser.parsers.mdit import create_md_parser
+
+
+def _validate_int(
+    setting, value, option_parser, config_parser=None, config_section=None
+) -> int:
+    """Docutils setting validator: coerce ``value`` with ``int()``."""
+    return int(value)
+
+
+def _create_validate_tuple(length: int) -> Callable[..., Tuple[str, ...]]:
+    """Return a validator requiring exactly `length` comma-separated items."""
+
+    def _validate(
+        setting, value, option_parser, config_parser=None, config_section=None
+    ):
+        # delegate the splitting to docutils, then enforce the item count
+        items = frontend.validate_comma_separated_list(
+            setting, value, option_parser, config_parser, config_section
+        )
+        found = len(items)
+        if found != length:
+            raise ValueError(f"Expecting {length} items in {setting}, got {found}.")
+        return tuple(items)
+
+    return _validate
+
+
+class Unset:
+    """Sentinel type for settings that were not explicitly set."""
+
+    def __repr__(self) -> str:
+        return "UNSET"
+
+
+DOCUTILS_UNSET = Unset()
+"""Sentinel for arguments not set through docutils.conf."""
+
+
+DOCUTILS_EXCLUDED_ARGS = (
+    # docutils.conf can't represent callables
+    "heading_slug_func",
+    # docutils.conf can't represent dicts
+    "html_meta",
+    "substitutions",
+    # we can't add substitutions so not needed
+    "sub_delimiters",
+    # sphinx only options
+    "heading_anchors",
+    "ref_domains",
+    "update_mathjax",
+    "mathjax_classes",
+)
+"""Names of settings that cannot be set in docutils.conf."""
+
+
+def _attr_to_optparse_option(at: Field, default: Any) -> Tuple[dict, str]:
+    """Map a field's type to (optparse kwargs, default-value help text)."""
+    if at.type is int:
+        return {"metavar": "<int>", "validator": _validate_int}, f"(default: {default})"
+    if at.type is bool:
+        return {
+            "metavar": "<boolean>",
+            "validator": frontend.validate_boolean,
+        }, f"(default: {default})"
+    if at.type is str:
+        return {
+            "metavar": "<str>",
+        }, f"(default: '{default}')"
+    if get_origin(at.type) is Literal and all(
+        isinstance(a, str) for a in get_args(at.type)
+    ):
+        args = get_args(at.type)  # only all-string Literal types reach this branch
+        return {
+            "metavar": f"<{'|'.join(repr(a) for a in args)}>",
+            "type": "choice",
+            "choices": args,
+        }, f"(default: {default!r})"
+    if at.type in (Iterable[str], Sequence[str]):
+        return {
+            "metavar": "<comma-delimited>",
+            "validator": frontend.validate_comma_separated_list,
+        }, f"(default: '{','.join(default)}')"
+    if at.type == Tuple[str, str]:
+        return {
+            "metavar": "<str,str>",
+            "validator": _create_validate_tuple(2),
+        }, f"(default: '{','.join(default)}')"
+    if at.type == Union[int, type(None)]:
+        return {
+            "metavar": "<null|int>",
+            "validator": _validate_int,  # NOTE(review): int() rejects a literal "null"
+        }, f"(default: {default})"
+    if at.type == Union[Iterable[str], type(None)]:
+        default_str = ",".join(default) if default else ""  # default may be None
+        return {
+            "metavar": "<null|comma-delimited>",
+            "validator": frontend.validate_comma_separated_list,
+        }, f"(default: {default_str!r})"
+    raise AssertionError(
+        f"Configuration option {at.name} not set up for use in docutils.conf."
+    )
+
+
+def attr_to_optparse_option(
+    attribute: Field, default: Any, prefix: str = "myst_"
+) -> Tuple[str, List[str], Dict[str, Any]]:
+    """Build the Docutils setting tuple for one ``MdParserConfig`` field.
+
+    :returns: A tuple of ``(help string, option flags, optparse kwargs)``.
+    """
+    dest = f"{prefix}{attribute.name}"
+    type_kwargs, default_text = _attr_to_optparse_option(attribute, default)
+    # type-specific kwargs are applied last, so they win over the base keys
+    kwargs = {"dest": dest, "default": DOCUTILS_UNSET, **type_kwargs}
+    metadata = attribute.metadata or {}
+    description = metadata.get("help", "")
+    return (f"{description} {default_text}", ["--" + dest.replace("_", "-")], kwargs)
+
+
+def create_myst_settings_spec(
+    excluded: Sequence[str], config_cls=MdParserConfig, prefix: str = "myst_"
+):
+    """Return a tuple of Docutils settings, one per non-excluded config field."""
+    defaults = config_cls()
+    return tuple(
+        attr_to_optparse_option(at, getattr(defaults, at.name), prefix)
+        for at in config_cls.get_fields()
+        if at.name not in excluded
+    )
+
+
+def create_myst_config(
+    settings: frontend.Values,
+    excluded: Sequence[str],
+    config_cls=MdParserConfig,
+    prefix: str = "myst_",
+):
+    """Build a ``config_cls`` instance from the explicitly set docutils settings."""
+    explicit = {}
+    for field in config_cls.get_fields():
+        if field.name not in excluded:
+            # a missing attribute and the sentinel are both treated as "not set",
+            # so the field is left out of the constructor keyword arguments
+            found = getattr(settings, f"{prefix}{field.name}", DOCUTILS_UNSET)
+            if found is not DOCUTILS_UNSET:
+                explicit[field.name] = found
+    return config_cls(**explicit)
+
+
+class Parser(RstParser):
+    """Docutils parser for Markedly Structured Text (MyST)."""
+
+    supported: Tuple[str, ...] = ("md", "markdown", "myst")
+    """Aliases this parser supports."""
+
+    settings_spec = (
+        "MyST options",
+        None,
+        create_myst_settings_spec(DOCUTILS_EXCLUDED_ARGS),
+        *RstParser.settings_spec,
+    )
+    """Runtime settings specification."""
+
+    config_section = "myst parser"
+    config_section_dependencies = ("parsers",)
+    translate_section_name = None
+
+    def parse(self, inputstring: str, document: nodes.document) -> None:
+        """Parse source text.
+
+        :param inputstring: The source string to parse
+        :param document: The root docutils node to add AST elements to
+        """
+        self.setup_parse(inputstring, document)
+
+        # check for exorbitantly long lines
+        if hasattr(document.settings, "line_length_limit"):
+            for i, line in enumerate(inputstring.split("\n")):
+                if len(line) > document.settings.line_length_limit:
+                    error = document.reporter.error(
+                        f"Line {i+1} exceeds the line-length-limit:"
+                        f" {document.settings.line_length_limit}."
+                    )
+                    document.append(error)
+                    # always pair setup_parse() with finish_parse(), even on early exit
+                    self.finish_parse()
+                    return
+
+        # create parsing configuration from the global config
+        try:
+            config = create_myst_config(document.settings, DOCUTILS_EXCLUDED_ARGS)
+        except Exception as exc:
+            error = document.reporter.error(f"Global myst configuration invalid: {exc}")
+            document.append(error)
+            config = MdParserConfig()
+
+        # update the global config with the file-level config
+        try:
+            topmatter = read_topmatter(inputstring)
+        except TopmatterReadError:
+            pass  # this will be reported during the render
+        else:
+            if topmatter:
+                warning = lambda wtype, msg: create_warning(  # noqa: E731
+                    document, msg, line=1, append_to=document, subtype=wtype
+                )
+                config = merge_file_level(config, topmatter, warning)
+
+        # parse content
+        parser = create_md_parser(config, DocutilsRenderer)
+        parser.options["document"] = document
+        parser.render(inputstring)
+
+        # post-processing: replace raw nodes if raw is not allowed
+        if not getattr(document.settings, "raw_enabled", True):
+            # snapshot the matches, since the tree is mutated inside the loop
+            for node in list(document.traverse(nodes.raw)):
+                warning = document.reporter.warning("Raw content disabled.")
+                node.parent.replace(node, warning)
+
+        self.finish_parse()
+
+
+def _run_cli(writer_name: str, writer_description: str, argv: Optional[List[str]]):
+    """Run the docutils publisher command line with this parser and one writer."""
+    publish_cmdline(
+        parser=Parser(),
+        writer_name=writer_name,
+        description=(
+            f"Generates {writer_description} from standalone MyST sources.\n{default_description}"
+        ),
+        argv=argv,
+    )
+
+
+def cli_html(argv: Optional[List[str]] = None) -> None:
+    """Cmdline entrypoint for converting MyST to HTML (``html`` writer)."""
+    _run_cli("html", "(X)HTML documents", argv)
+
+
+def cli_html5(argv: Optional[List[str]] = None) -> None:
+    """Cmdline entrypoint for converting MyST to HTML5 (``html5`` writer)."""
+    _run_cli("html5", "HTML5 documents", argv)
+
+
+def cli_latex(argv: Optional[List[str]] = None) -> None:
+    """Cmdline entrypoint for converting MyST to LaTeX (``latex`` writer)."""
+    _run_cli("latex", "LaTeX documents", argv)
+
+
+def cli_xml(argv: Optional[List[str]] = None) -> None:
+    """Cmdline entrypoint for converting MyST to XML (``xml`` writer)."""
+    _run_cli("xml", "Docutils-native XML", argv)
+
+
+def cli_pseudoxml(argv: Optional[List[str]] = None) -> None:
+    """Cmdline entrypoint for converting MyST to pseudo-XML (``pseudoxml`` writer)."""
+    _run_cli("pseudoxml", "pseudo-XML", argv)
diff --git a/myst_parser/parsers/mdit.py b/myst_parser/parsers/mdit.py
new file mode 100644
index 0000000..8476495
--- /dev/null
+++ b/myst_parser/parsers/mdit.py
@@ -0,0 +1,123 @@
+"""This module holds the ``create_md_parser`` function,
+which creates a parser from the config.
+"""
+from __future__ import annotations
+
+from typing import Callable
+
+from markdown_it import MarkdownIt
+from markdown_it.renderer import RendererProtocol
+from mdit_py_plugins.amsmath import amsmath_plugin
+from mdit_py_plugins.anchors import anchors_plugin
+from mdit_py_plugins.attrs import attrs_plugin
+from mdit_py_plugins.colon_fence import colon_fence_plugin
+from mdit_py_plugins.deflist import deflist_plugin
+from mdit_py_plugins.dollarmath import dollarmath_plugin
+from mdit_py_plugins.field_list import fieldlist_plugin
+from mdit_py_plugins.footnote import footnote_plugin
+from mdit_py_plugins.front_matter import front_matter_plugin
+from mdit_py_plugins.myst_blocks import myst_block_plugin
+from mdit_py_plugins.myst_role import myst_role_plugin
+from mdit_py_plugins.substitution import substitution_plugin
+from mdit_py_plugins.tasklists import tasklists_plugin
+from mdit_py_plugins.wordcount import wordcount_plugin
+
+from myst_parser.config.main import MdParserConfig
+
+
+def create_md_parser(
+    config: MdParserConfig, renderer: Callable[[MarkdownIt], RendererProtocol]
+) -> MarkdownIt:
+    """Return a Markdown parser with the required MyST configuration."""
+
+    # TODO warn if linkify is required but linkify-it-py is not installed
+    # (currently the parse will unceremoniously raise an exception)
+
+    if config.commonmark_only:
+        # see https://spec.commonmark.org/
+        md = MarkdownIt("commonmark", renderer_cls=renderer).use(
+            wordcount_plugin, per_minute=config.words_per_minute
+        )
+        md.options.update({"myst_config": config})
+        return md
+
+    if config.gfm_only:
+        # see https://github.github.com/gfm/
+        md = (
+            MarkdownIt("commonmark", renderer_cls=renderer)
+            # note, strikethrough currently only supported tentatively for HTML
+            .enable("strikethrough")
+            .enable("table")
+            .use(tasklists_plugin)
+            .enable("linkify")
+            .use(wordcount_plugin, per_minute=config.words_per_minute)
+        )
+        md.options.update({"linkify": True, "myst_config": config})
+        return md
+
+    md = (
+        MarkdownIt("commonmark", renderer_cls=renderer)
+        .enable("table")
+        .use(front_matter_plugin)
+        .use(myst_block_plugin)
+        .use(myst_role_plugin)
+        .use(footnote_plugin)
+        .use(wordcount_plugin, per_minute=config.words_per_minute)
+        .disable("footnote_inline")
+        # disable this for now, because it needs a new implementation in the renderer
+        .disable("footnote_tail")
+    )
+
+    typographer = False  # switched on below by "smartquotes" or "replacements"
+    if "smartquotes" in config.enable_extensions:
+        md.enable("smartquotes")
+        typographer = True
+    if "replacements" in config.enable_extensions:
+        md.enable("replacements")
+        typographer = True
+    if "linkify" in config.enable_extensions:
+        md.enable("linkify")
+        if md.linkify is not None:
+            md.linkify.set({"fuzzy_link": config.linkify_fuzzy_links})
+    if "strikethrough" in config.enable_extensions:
+        md.enable("strikethrough")
+    if "dollarmath" in config.enable_extensions:
+        md.use(
+            dollarmath_plugin,
+            allow_labels=config.dmath_allow_labels,
+            allow_space=config.dmath_allow_space,
+            allow_digits=config.dmath_allow_digits,
+            double_inline=config.dmath_double_inline,
+        )
+    if "colon_fence" in config.enable_extensions:
+        md.use(colon_fence_plugin)
+    if "amsmath" in config.enable_extensions:
+        md.use(amsmath_plugin)
+    if "deflist" in config.enable_extensions:
+        md.use(deflist_plugin)
+    if "fieldlist" in config.enable_extensions:
+        md.use(fieldlist_plugin)
+    if "tasklist" in config.enable_extensions:
+        md.use(tasklists_plugin)
+    if "substitution" in config.enable_extensions:
+        md.use(substitution_plugin, *config.sub_delimiters)
+    if "attrs_image" in config.enable_extensions:
+        md.use(attrs_plugin, after=("image",))
+    if config.heading_anchors is not None:
+        md.use(
+            anchors_plugin,
+            max_level=config.heading_anchors,
+            slug_func=config.heading_slug_func,
+        )
+    for name in config.disable_syntax:
+        md.disable(name, True)
+
+    md.options.update(
+        {
+            "typographer": typographer,
+            "linkify": "linkify" in config.enable_extensions,
+            "myst_config": config,
+        }
+    )
+
+    return md
diff --git a/myst_parser/parsers/parse_html.py b/myst_parser/parsers/parse_html.py
new file mode 100644
index 0000000..7539e42
--- /dev/null
+++ b/myst_parser/parsers/parse_html.py
@@ -0,0 +1,440 @@
+"""A simple but complete HTML to Abstract Syntax Tree (AST) parser.
+
+The AST can also reproduce the HTML text.
+
+Example::
+
+    >> text = '<div class="note"><p>text</p></div>'
+    >> ast = tokenize_html(text)
+    >> list(ast.walk(include_self=True))
+    [Root(''), Tag('div', {'class': 'note'}), Tag('p'), Data('text')]
+    >> str(ast)
+    '<div class="note"><p>text</p></div>'
+    >> str(ast[0][0])
+    '<p>text</p>'
+
+Note: optional tags are not accounted for
+(see https://html.spec.whatwg.org/multipage/syntax.html#optional-tags)
+
+"""
+from __future__ import annotations
+
+import inspect
+import itertools
+from collections import abc, deque
+from html.parser import HTMLParser
+from typing import Any, Callable, Iterable, Iterator
+
+
+class Attribute(dict):
+    """This class holds the tag's attributes."""
+
+    def __getitem__(self, key: str) -> str:
+        """If self doesn't have the key it returns ''."""
+        return self.get(key, "")
+
+    @property
+    def classes(self) -> list[str]:
+        """Return 'class' attribute as list (empty if missing or value-less)."""
+        return (self["class"] or "").split()
+
+    def __str__(self) -> str:
+        """Return the attributes as HTML text; a ``None`` value gives a bare key."""
+        return " ".join(k if v is None else f'{k}="{v}"' for k, v in self.items())
+
+
+class Element(abc.MutableSequence):
+    """An Element of the xml/html document.
+
+    All xml/html entities inherit from this class.
+    """
+
+    def __init__(self, name: str = "", attr: dict | None = None) -> None:
+        """Initialise the element."""
+        self.name = name
+        self.attrs: Attribute = Attribute(attr or {})
+        self._parent: Element | None = None
+        self._children: list[Element] = []
+
+    @property
+    def parent(self) -> Element | None:
+        """Return parent."""
+        return self._parent
+
+    @property
+    def children(self) -> list[Element]:
+        """Return copy of children."""
+        return self._children[:]
+
+    def reset_children(self, children: list[Element], deepcopy: bool = False):
+        """Replace all children (optionally with deep copies), adopting each one."""
+        new_children = []
+        for i, item in enumerate(children):
+            assert isinstance(item, Element)
+            if deepcopy:
+                item = item.deepcopy()
+            if item._parent is None:
+                item._parent = self
+            elif item._parent != self:
+                raise AssertionError(f"different parent already set for item {i}")
+            new_children.append(item)
+        self._children = new_children
+
+    def __getitem__(self, index: int) -> Element:  # type: ignore[override]
+        return self._children[index]
+
+    def __setitem__(self, index: int, item: Element):  # type: ignore[override]
+        assert isinstance(item, Element)
+        if item._parent is not None and item._parent != self:
+            raise AssertionError(f"different parent already set for: {item!r}")
+        item._parent = self
+        return self._children.__setitem__(index, item)
+
+    def __delitem__(self, index: int):  # type: ignore[override]
+        return self._children.__delitem__(index)
+
+    def __len__(self) -> int:
+        return self._children.__len__()
+
+    def __iter__(self) -> Iterator[Element]:
+        yield from self._children
+
+    def insert(self, index: int, item: Element):
+        """Insert ``item`` before ``index``; it must not belong to another parent."""
+        assert isinstance(item, Element)
+        if item._parent is not None and item._parent != self:
+            raise AssertionError(f"different parent already set for: {item!r}")
+        item._parent = self
+        return self._children.insert(index, item)
+
+    def deepcopy(self) -> Element:
+        """Recursively copy and remove parent."""
+        _copy = self.__class__(self.name, self.attrs)
+        for child in self:
+            _copy.append(child.deepcopy())
+        return _copy
+
+    def __repr__(self) -> str:
+        text = f"{self.__class__.__name__}({self.name!r}"
+        if self.attrs:
+            text += f", {self.attrs!r}"
+        text += ")"
+        return text
+
+    def render(
+        self,
+        tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
+        **kwargs,
+    ) -> str:
+        """Returns a HTML string representation of the element.
+
+        :param tag_overrides: Provide a dictionary of render function
+            for specific tag names, to override the normal render format
+
+        """
+        raise NotImplementedError
+
+    def __str__(self) -> str:
+        return self.render()
+
+    def __eq__(self, item: Any) -> bool:
+        return item is self
+
+    __hash__ = object.__hash__  # __eq__ alone would make elements unhashable
+
+    def walk(self, include_self: bool = False) -> Iterator[Element]:
+        """Yield all descendants depth-first, optionally starting with self."""
+        if include_self:
+            yield self
+        for child in self:
+            yield from child.walk(include_self=True)
+
+    def strip(self, inplace: bool = False, recurse: bool = False) -> Element:
+        """Return copy with all `Data` tokens
+        that only contain whitespace / newlines removed.
+        """
+        element = self if inplace else self.deepcopy()
+        element.reset_children(
+            [
+                e
+                for e in element.children
+                if not (isinstance(e, Data) and e.data.strip() == "")
+            ]
+        )
+        if recurse:
+            for child in element:
+                child.strip(inplace=True, recurse=True)
+        return element
+
+    def find(
+        self,
+        identifier: str | type[Element],
+        attrs: dict | None = None,
+        classes: Iterable[str] | None = None,
+        include_self: bool = False,
+        recurse: bool = True,
+    ) -> Iterator[Element]:
+        """Find all elements that match name and specific attributes."""
+        iterator = self.walk() if recurse else self
+        if include_self:
+            iterator = itertools.chain([self], iterator)
+        if inspect.isclass(identifier):
+            test_func = lambda c: isinstance(c, identifier)  # noqa: E731
+        else:
+            test_func = lambda c: c.name == identifier  # noqa: E731
+        classes = set(classes) if classes is not None else classes
+        for child in iterator:
+            if test_func(child):
+                if classes is not None and not classes.issubset(child.attrs.classes):
+                    continue
+                for key, value in (attrs or {}).items():
+                    if child.attrs[key] != value:
+                        break
+                else:
+                    yield child
+
+
+class Root(Element):
+    """The root of the AST tree."""
+
+    def render(self, **kwargs) -> str:  # type: ignore[override]
+        """Return the concatenated renderings of all children (no wrapping tag)."""
+        return "".join(child.render(**kwargs) for child in self)
+
+
+class Tag(Element):
+    """A paired xml/html tag: ``<name key="value" ...> ... </name>``."""
+
+    def render(
+        self,
+        tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
+        **kwargs,
+    ) -> str:
+        """Render the tag and its children as HTML text.
+
+        An entry in ``tag_overrides`` for this tag's name replaces the output.
+        """
+        if tag_overrides and self.name in tag_overrides:
+            return tag_overrides[self.name](self, tag_overrides)
+        opening = f"<{self.name}{' ' if self.attrs else ''}{self.attrs}>"
+        inner = [child.render(tag_overrides=tag_overrides, **kwargs) for child in self]
+        return opening + "".join(inner) + f"</{self.name}>"
+
+
+class XTag(Element):
+    """A self-closed XHTML style tag without children, e.g. `<img src="t.gif" />`"""
+
+    def render(
+        self,
+        tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
+        **kwargs,
+    ) -> str:
+        if tag_overrides is None or self.name not in tag_overrides:
+            return f"<{self.name}{' ' if self.attrs else ''}{self.attrs}/>"
+        return tag_overrides[self.name](self, tag_overrides)
+
+
+class VoidTag(Element):
+    """Represent tags with no children, only start tag, like `<img src="t.gif" >`"""
+
+    def render(self, **kwargs) -> str:  # type: ignore[override]
+        return f"<{self.name}{' ' if self.attrs else ''}{self.attrs}>"  # kwargs unused
+
+
+class TerminalElement(Element):
+    def __init__(self, data: str) -> None:
+        super().__init__("")  # terminal elements have an empty name
+        self.data: str = data
+
+    def __repr__(self) -> str:
+        text = self.data
+        if len(text) > 20:
+            text = text[:17] + "..."  # truncate long data to 20 characters
+        return f"{self.__class__.__name__}({text!r})"
+
+    def deepcopy(self) -> TerminalElement:
+        """Return a parent-less copy holding the same data."""
+        _copy = self.__class__(self.data)
+        return _copy
+
+
+class Data(TerminalElement):
+    """Represent data inside xml/html documents, like raw text."""
+
+    def render(self, **kwargs) -> str:  # type: ignore[override]
+        return self.data  # emitted verbatim, without any escaping
+
+
+class Declaration(TerminalElement):
+    """Represent declarations, like `<!DOCTYPE html>` (data is `DOCTYPE html`)."""
+
+    def render(self, **kwargs) -> str:  # type: ignore[override]
+        return f"<!{self.data}>"
+
+
+class Comment(TerminalElement):
+    """Represent HTML comments; data is the text between `<!--` and `-->`."""
+
+    def render(self, **kwargs) -> str:  # type: ignore[override]
+        return f"<!--{self.data}-->"
+
+
+class Pi(TerminalElement):
+    """Represent processing instructions like `<?xml-stylesheet ?>`"""
+
+    def render(self, **kwargs) -> str:  # type: ignore[override]
+        return f"<?{self.data}>"  # only `<?` and `>` are added around the data
+
+
+class Char(TerminalElement):
+    """Represent character references like `&#0;` (data is the part after `&#`)."""
+
+    def render(self, **kwargs) -> str:  # type: ignore[override]
+        return f"&#{self.data};"
+
+
+class Entity(TerminalElement):
+    """Represent entity references like `&amp;` (data is the entity name)."""
+
+    def render(self, **kwargs) -> str:  # type: ignore[override]
+        return f"&{self.data};"
+
+
+class Tree:
+    """Builder that assembles parse events into an element tree under a ``Root``.
+
+    ``stack`` tracks the currently open scopes: the root sits at the bottom and
+    the innermost open tag on top. New elements are appended to the top item.
+    """
+
+    def __init__(self, name: str = ""):
+        """Create the root element from ``name`` and open it on the stack."""
+        self.name = name
+        self.outmost = Root(name)
+        # The root is the bottom stack entry and is never popped by ``enclose``.
+        self.stack: deque = deque([self.outmost])
+
+    def clear(self):
+        """Drop the current tree and start over from a fresh root."""
+        self.outmost = Root(self.name)
+        self.stack.clear()
+        self.stack.append(self.outmost)
+
+    def last(self) -> Element:
+        """Return the innermost open element, i.e. the top of the stack."""
+        return self.stack[-1]
+
+    def nest_tag(self, name: str, attrs: dict):
+        """Append a new ``Tag`` to the innermost open element and make that
+        tag the new innermost scope, so later elements nest inside it.
+        """
+        item = Tag(name, attrs)
+        self.last().append(item)
+        self.stack.append(item)
+
+    def nest_xtag(self, name: str, attrs: dict):
+        """Append an ``XTag`` to the innermost open element (scope unchanged)."""
+        self.last().append(XTag(name, attrs))
+
+    def nest_vtag(self, name: str, attrs: dict):
+        """Append a ``VoidTag`` to the innermost open element (scope unchanged)."""
+        self.last().append(VoidTag(name, attrs))
+
+    def nest_terminal(self, klass: type[TerminalElement], data: str):
+        """Append a ``klass(data)`` terminal to the innermost open element."""
+        self.last().append(klass(data))
+
+    def enclose(self, name: str):
+        """Close the innermost open tag called ``name``.
+
+        Tags opened after it are closed along with it. If no open tag has that
+        name, nothing is closed. The root is never a candidate, even when its
+        own name equals ``name``: popping it would empty the stack and every
+        later ``last()``/``nest_*`` call would fail with ``IndexError``.
+        """
+        # Everything above index 0 is an open tag; index 0 is the root.
+        open_tags = list(self.stack)[1:]
+        for count, element in enumerate(reversed(open_tags), start=1):
+            if element.name == name:
+                break
+        else:
+            return
+
+        # Pop the matching tag together with everything nested inside it.
+        for _ in range(count):
+            self.stack.pop()
+
+
+class HtmlToAst(HTMLParser):
+    """Tokenizer that turns ``HTMLParser`` events into nodes of a ``Tree``."""
+
+    # Tags that are nested as ``VoidTag`` and whose end tags are ignored,
+    # see https://html.spec.whatwg.org/multipage/syntax.html#void-elements
+    # fmt: off
+    void_elements = {
+        "area", "base", "br", "col", "embed", "hr", "img",
+        "input", "link", "meta", "param", "source", "track", "wbr",
+    }
+    # fmt: on
+
+    def __init__(self, name: str = "", convert_charrefs: bool = False):
+        """Set up the parser and the tree builder; ``name`` is passed to ``Tree``."""
+        super().__init__(convert_charrefs=convert_charrefs)
+        self.struct = Tree(name)
+
+    def feed(self, source: str) -> Root:  # type: ignore[override]
+        """Parse ``source`` into a new tree and return that tree's root."""
+        # Reset the builder first so the result only reflects this call's events.
+        tree = self.struct
+        tree.clear()
+        super().feed(source)
+        return tree.outmost
+
+    def handle_starttag(self, name: str, attr):
+        """Nest an opening tag: void elements as leaves, others as open scopes."""
+        is_void = name in self.void_elements
+        nest = self.struct.nest_vtag if is_void else self.struct.nest_tag
+        nest(name, attr)
+
+    def handle_startendtag(self, name: str, attr):
+        """Nest an XHTML-style tag as an ``XTag`` without opening a scope."""
+        self.struct.nest_xtag(name, attr)
+
+    def handle_endtag(self, name: str):
+        """Close the scope of ``name``; end tags of void elements are ignored."""
+        if name in self.void_elements:
+            return
+        self.struct.enclose(name)
+
+    def handle_data(self, data: str):
+        """Nest text data onto the tree as a ``Data`` terminal."""
+        self.struct.nest_terminal(Data, data)
+
+    def handle_decl(self, decl: str):
+        """Nest a declaration onto the tree as a ``Declaration`` terminal."""
+        self.struct.nest_terminal(Declaration, decl)
+
+    def unknown_decl(self, decl: str):
+        """Nest an unrecognised declaration as a ``Declaration`` terminal too."""
+        self.struct.nest_terminal(Declaration, decl)
+
+    def handle_charref(self, data: str):
+        """Nest a numeric character reference as a ``Char`` terminal."""
+        self.struct.nest_terminal(Char, data)
+
+    def handle_entityref(self, data: str):
+        """Nest a named entity reference as an ``Entity`` terminal."""
+        self.struct.nest_terminal(Entity, data)
+
+    def handle_pi(self, data: str):
+        """Nest a processing instruction as a ``Pi`` terminal."""
+        self.struct.nest_terminal(Pi, data)
+
+    def handle_comment(self, data: str):
+        """Nest a comment as a ``Comment`` terminal."""
+        self.struct.nest_terminal(Comment, data)
+
+
+def tokenize_html(text: str, name: str = "", convert_charrefs: bool = False) -> Root:
+    """Parse ``text`` with a fresh ``HtmlToAst`` and return the resulting root."""
+    return HtmlToAst(name, convert_charrefs=convert_charrefs).feed(text)
diff --git a/myst_parser/parsers/sphinx_.py b/myst_parser/parsers/sphinx_.py
new file mode 100644
index 0000000..fff098f
--- /dev/null
+++ b/myst_parser/parsers/sphinx_.py
@@ -0,0 +1,69 @@
+"""MyST Markdown parser for sphinx."""
+from __future__ import annotations
+
+from docutils import nodes
+from docutils.parsers.rst import Parser as RstParser
+from sphinx.parsers import Parser as SphinxParser
+from sphinx.util import logging
+
+from myst_parser.config.main import (
+    MdParserConfig,
+    TopmatterReadError,
+    merge_file_level,
+    read_topmatter,
+)
+from myst_parser.mdit_to_docutils.sphinx_ import SphinxRenderer, create_warning
+from myst_parser.parsers.mdit import create_md_parser
+
+SPHINX_LOGGER = logging.getLogger(__name__)
+
+
+class MystParser(SphinxParser):
+    """Sphinx parser for Markedly Structured Text (MyST) source files."""
+
+    supported: tuple[str, ...] = ("md", "markdown", "myst")
+    """Aliases under which this parser is registered."""
+
+    settings_spec = RstParser.settings_spec
+    """Runtime settings specification, taken from the docutils RST parser.
+
+    Defines runtime settings and associated command-line options, as used by
+    `docutils.frontend.OptionParser`.  This is a concatenation of tuples of:
+
+    - Option group title (string or `None` which implies no group, just a list
+      of single options).
+
+    - Description (string or `None`).
+
+    - A sequence of option tuples.
+    """
+
+    config_section = "myst parser"
+    config_section_dependencies = ("parsers",)
+    translate_section_name = None
+
+    def parse(self, inputstring: str, document: nodes.document) -> None:
+        """Render MyST Markdown source text into the given docutils document.
+
+        :param inputstring: The source string to parse
+        :param document: The root docutils node to add AST elements to
+        """
+        # start from the global config held on ``document.settings.env``
+        config: MdParserConfig = document.settings.env.myst_config
+
+        def warning(wtype, msg):
+            return create_warning(
+                document, msg, line=1, append_to=document, subtype=wtype
+            )
+
+        # overlay the file-level config read from the topmatter, if there is any
+        try:
+            topmatter = read_topmatter(inputstring)
+        except TopmatterReadError:
+            topmatter = None  # this will be reported during the render
+        if topmatter:
+            config = merge_file_level(config, topmatter, warning)
+
+        md_parser = create_md_parser(config, SphinxRenderer)
+        md_parser.options["document"] = document
+        md_parser.render(inputstring)