Diffstat (limited to 'myst_parser/parsers')
-rw-r--r-- | myst_parser/parsers/__init__.py   |   1
-rw-r--r-- | myst_parser/parsers/directives.py | 190
-rw-r--r-- | myst_parser/parsers/docutils_.py  | 275
-rw-r--r-- | myst_parser/parsers/mdit.py       | 123
-rw-r--r-- | myst_parser/parsers/parse_html.py | 440
-rw-r--r-- | myst_parser/parsers/sphinx_.py    |  69
6 files changed, 1098 insertions(+), 0 deletions(-)
diff --git a/myst_parser/parsers/__init__.py b/myst_parser/parsers/__init__.py
new file mode 100644
index 0000000..26fbfca
--- /dev/null
+++ b/myst_parser/parsers/__init__.py
@@ -0,0 +1 @@
+"""Parsers of MyST Markdown source text to docutils AST."""
diff --git a/myst_parser/parsers/directives.py b/myst_parser/parsers/directives.py
new file mode 100644
index 0000000..5637254
--- /dev/null
+++ b/myst_parser/parsers/directives.py
@@ -0,0 +1,190 @@
+"""Fenced code blocks are parsed as directives,
+if the block starts with ``{directive_name}``,
+followed by arguments on the same line.
+
+Directive options are read from a YAML block,
+if the first content line starts with ``---``, e.g.
+
+::
+
+    ```{directive_name} arguments
+    ---
+    option1: name
+    option2: |
+        Longer text block
+    ---
+    content...
+    ```
+
+Or the option block will be parsed if the first content line starts with ``:``,
+as a YAML block consisting of every line that starts with a ``:``, e.g.
+
+::
+
+    ```{directive_name} arguments
+    :option1: name
+    :option2: other
+
+    content...
+    ```
+
+If the first line of a directive's content is blank, this will be stripped
+from the content.
+This is to allow for separation between the option block and content.
+
+"""
+from __future__ import annotations
+
+import datetime
+import re
+from textwrap import dedent
+from typing import Any, Callable
+
+import yaml
+from docutils.parsers.rst import Directive
+from docutils.parsers.rst.directives.misc import TestDirective
+
+
+class DirectiveParsingError(Exception):
+    """Raise on parsing/validation error."""
+
+    pass
+
+
+def parse_directive_text(
+    directive_class: type[Directive],
+    first_line: str,
+    content: str,
+    validate_options: bool = True,
+) -> tuple[list[str], dict, list[str], int]:
+    """Parse (and validate) the full directive text.
+
+    :param first_line: The text on the same line as the directive name.
+        May be an argument or body text, dependent on the directive
+    :param content: All text after the first line. Can include options.
+    :param validate_options: Whether to validate the values of options
+
+    :returns: (arguments, options, body_lines, content_offset)
+    """
+    if directive_class.option_spec:
+        body, options = parse_directive_options(
+            content, directive_class, validate=validate_options
+        )
+        body_lines = body.splitlines()
+        content_offset = len(content.splitlines()) - len(body_lines)
+    else:
+        # If there are no possible options, we do not look for a YAML block
+        options = {}
+        body_lines = content.splitlines()
+        content_offset = 0
+
+    if not (directive_class.required_arguments or directive_class.optional_arguments):
+        # If there are no possible arguments, then the body starts on the argument line
+        if first_line:
+            body_lines.insert(0, first_line)
+        arguments = []
+    else:
+        arguments = parse_directive_arguments(directive_class, first_line)
+
+    # remove first line of body if blank
+    # this is to allow space between the options and the content
+    if body_lines and not body_lines[0].strip():
+        body_lines = body_lines[1:]
+        content_offset += 1
+
+    # check for body content
+    if body_lines and not directive_class.has_content:
+        raise DirectiveParsingError("No content permitted")
+
+    return arguments, options, body_lines, content_offset
+
+
+def parse_directive_options(
+    content: str, directive_class: type[Directive], validate: bool = True
+):
+    """Parse (and validate) the directive option section."""
+    options: dict[str, Any] = {}
+    if content.startswith("---"):
+        content = "\n".join(content.splitlines()[1:])
+        match = re.search(r"^-{3,}", content, re.MULTILINE)
+        if match:
+            yaml_block = content[: match.start()]
+            content = content[match.end() + 1 :]  # TODO advance line number
+        else:
+            yaml_block = content
+            content = ""
+        yaml_block = dedent(yaml_block)
+        try:
+            options = yaml.safe_load(yaml_block) or {}
+        except (yaml.parser.ParserError, yaml.scanner.ScannerError) as error:
+            raise DirectiveParsingError("Invalid options YAML: " + str(error))
+    elif content.lstrip().startswith(":"):
+        content_lines = content.splitlines()  # type: list
+        yaml_lines = []
+        while content_lines:
+            if not content_lines[0].lstrip().startswith(":"):
+                break
+            yaml_lines.append(content_lines.pop(0).lstrip()[1:])
+        yaml_block = "\n".join(yaml_lines)
+        content = "\n".join(content_lines)
+        try:
+            options = yaml.safe_load(yaml_block) or {}
+        except (yaml.parser.ParserError, yaml.scanner.ScannerError) as error:
+            raise DirectiveParsingError("Invalid options YAML: " + str(error))
+    if not isinstance(options, dict):
+        raise DirectiveParsingError(f"Invalid options (not dict): {options}")
+
+    if (not validate) or issubclass(directive_class, TestDirective):
+        # technically this directive spec only accepts one option ('option')
+        # but since its for testing only we accept all options
+        return content, options
+
+    # check options against spec
+    options_spec: dict[str, Callable] = directive_class.option_spec
+    for name, value in list(options.items()):
+        try:
+            convertor = options_spec[name]
+        except KeyError:
+            raise DirectiveParsingError(f"Unknown option: {name}")
+        if not isinstance(value, str):
+            if value is True or value is None:
+                value = None  # flag converter requires no argument
+            elif isinstance(value, (int, float, datetime.date, datetime.datetime)):
+                # convertor always requires string input
+                value = str(value)
+            else:
+                raise DirectiveParsingError(
+                    f'option "{name}" value not string (enclose with ""): {value}'
+                )
+        try:
+            converted_value = convertor(value)
+        except (ValueError, TypeError) as error:
+            raise DirectiveParsingError(
+                "Invalid option value: (option: '{}'; value: {})\n{}".format(
+                    name, value, error
+                )
+            )
+        options[name] = converted_value
+
+    return content, options
+
+
+def parse_directive_arguments(directive, arg_text):
+    """Parse (and validate) the directive argument section."""
+    required = directive.required_arguments
+    optional = directive.optional_arguments
+    arguments = arg_text.split()
+    if len(arguments) < required:
+        raise DirectiveParsingError(
+            f"{required} argument(s) required, {len(arguments)} supplied"
+        )
+    elif len(arguments) > required + optional:
+        if directive.final_argument_whitespace:
+            arguments = arg_text.split(None, required + optional - 1)
+        else:
+            raise DirectiveParsingError(
+                "maximum {} argument(s) allowed, {} supplied".format(
                    required + optional, len(arguments)
+                )
+            )
+    return arguments
diff --git a/myst_parser/parsers/docutils_.py b/myst_parser/parsers/docutils_.py
new file mode 100644
index 0000000..aaef5e2
--- /dev/null
+++ b/myst_parser/parsers/docutils_.py
@@ -0,0 +1,275 @@
+"""MyST Markdown parser for docutils."""
+from dataclasses import Field
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
+
+from docutils import frontend, nodes
+from docutils.core import default_description, publish_cmdline
+from docutils.parsers.rst import Parser as RstParser
+from typing_extensions import Literal, get_args, get_origin
+
+from myst_parser.config.main import (
+    MdParserConfig,
+    TopmatterReadError,
+    merge_file_level,
+    read_topmatter,
+)
+from myst_parser.mdit_to_docutils.base import DocutilsRenderer, create_warning
+from myst_parser.parsers.mdit import create_md_parser
+
+
+def _validate_int(
+    setting, value, option_parser, config_parser=None, config_section=None
+) -> int:
+    """Validate an integer setting."""
+    return int(value)
+
+
+def _create_validate_tuple(length: int) -> Callable[..., Tuple[str, ...]]:
+    """Create a validator for a tuple of length `length`."""
+
+    def _validate(
+        setting, value, option_parser, config_parser=None, config_section=None
+    ):
+        string_list = frontend.validate_comma_separated_list(
+            setting, value, option_parser, config_parser, config_section
+        )
+        if len(string_list) != length:
+            raise ValueError(
+                f"Expecting {length} items in {setting}, got {len(string_list)}."
+            )
+        return tuple(string_list)
+
+    return _validate
+
+
+class Unset:
+    """A sentinel class for unset settings."""
+
+    def __repr__(self):
+        return "UNSET"
+
+
+DOCUTILS_UNSET = Unset()
+"""Sentinel for arguments not set through docutils.conf."""
+
+
+DOCUTILS_EXCLUDED_ARGS = (
+    # docutils.conf can't represent callables
+    "heading_slug_func",
+    # docutils.conf can't represent dicts
+    "html_meta",
+    "substitutions",
+    # we can't add substitutions so not needed
+    "sub_delimiters",
+    # sphinx only options
+    "heading_anchors",
+    "ref_domains",
+    "update_mathjax",
+    "mathjax_classes",
+)
+"""Names of settings that cannot be set in docutils.conf."""
+
+
+def _attr_to_optparse_option(at: Field, default: Any) -> Tuple[dict, str]:
+    """Convert a field into a Docutils optparse options dict."""
+    if at.type is int:
+        return {"metavar": "<int>", "validator": _validate_int}, f"(default: {default})"
+    if at.type is bool:
+        return {
+            "metavar": "<boolean>",
+            "validator": frontend.validate_boolean,
+        }, f"(default: {default})"
+    if at.type is str:
+        return {
+            "metavar": "<str>",
+        }, f"(default: '{default}')"
+    if get_origin(at.type) is Literal and all(
+        isinstance(a, str) for a in get_args(at.type)
+    ):
+        args = get_args(at.type)
+        return {
+            "metavar": f"<{'|'.join(repr(a) for a in args)}>",
+            "type": "choice",
+            "choices": args,
+        }, f"(default: {default!r})"
+    if at.type in (Iterable[str], Sequence[str]):
+        return {
+            "metavar": "<comma-delimited>",
+            "validator": frontend.validate_comma_separated_list,
+        }, f"(default: '{','.join(default)}')"
+    if at.type == Tuple[str, str]:
+        return {
+            "metavar": "<str,str>",
+            "validator": _create_validate_tuple(2),
+        }, f"(default: '{','.join(default)}')"
+    if at.type == Union[int, type(None)]:
+        return {
+            "metavar": "<null|int>",
+            "validator": _validate_int,
+        }, f"(default: {default})"
+    if at.type == Union[Iterable[str], type(None)]:
+        default_str = ",".join(default) if default else ""
+        return {
+            "metavar": "<null|comma-delimited>",
+            "validator": frontend.validate_comma_separated_list,
+        }, f"(default: {default_str!r})"
+    raise AssertionError(
+        f"Configuration option {at.name} not set up for use in docutils.conf."
+    )
+
+
+def attr_to_optparse_option(
+    attribute: Field, default: Any, prefix: str = "myst_"
+) -> Tuple[str, List[str], Dict[str, Any]]:
+    """Convert an ``MdParserConfig`` attribute into a Docutils setting tuple.
+
+    :returns: A tuple of ``(help string, option flags, optparse kwargs)``.
+    """
+    name = f"{prefix}{attribute.name}"
+    flag = "--" + name.replace("_", "-")
+    options = {"dest": name, "default": DOCUTILS_UNSET}
+    at_options, type_str = _attr_to_optparse_option(attribute, default)
+    options.update(at_options)
+    help_str = attribute.metadata.get("help", "") if attribute.metadata else ""
+    return (f"{help_str} {type_str}", [flag], options)
+
+
+def create_myst_settings_spec(
+    excluded: Sequence[str], config_cls=MdParserConfig, prefix: str = "myst_"
+):
+    """Return a list of Docutils setting for the docutils MyST section."""
+    defaults = config_cls()
+    return tuple(
+        attr_to_optparse_option(at, getattr(defaults, at.name), prefix)
+        for at in config_cls.get_fields()
+        if at.name not in excluded
+    )
+
+
+def create_myst_config(
+    settings: frontend.Values,
+    excluded: Sequence[str],
+    config_cls=MdParserConfig,
+    prefix: str = "myst_",
+):
+    """Create a configuration instance from the given settings."""
+    values = {}
+    for attribute in config_cls.get_fields():
+        if attribute.name in excluded:
+            continue
+        setting = f"{prefix}{attribute.name}"
+        val = getattr(settings, setting, DOCUTILS_UNSET)
+        if val is not DOCUTILS_UNSET:
+            values[attribute.name] = val
+    return config_cls(**values)
+
+
+class Parser(RstParser):
+    """Docutils parser for Markedly Structured Text (MyST)."""
+
+    supported: Tuple[str, ...] = ("md", "markdown", "myst")
+    """Aliases this parser supports."""
+
+    settings_spec = (
+        "MyST options",
+        None,
+        create_myst_settings_spec(DOCUTILS_EXCLUDED_ARGS),
+        *RstParser.settings_spec,
+    )
+    """Runtime settings specification."""
+
+    config_section = "myst parser"
+    config_section_dependencies = ("parsers",)
+    translate_section_name = None
+
+    def parse(self, inputstring: str, document: nodes.document) -> None:
+        """Parse source text.
+
+        :param inputstring: The source string to parse
+        :param document: The root docutils node to add AST elements to
+        """
+
+        self.setup_parse(inputstring, document)
+
+        # check for exorbitantly long lines
+        if hasattr(document.settings, "line_length_limit"):
+            for i, line in enumerate(inputstring.split("\n")):
+                if len(line) > document.settings.line_length_limit:
+                    error = document.reporter.error(
+                        f"Line {i+1} exceeds the line-length-limit:"
+                        f" {document.settings.line_length_limit}."
+                    )
+                    document.append(error)
+                    return
+
+        # create parsing configuration from the global config
+        try:
+            config = create_myst_config(document.settings, DOCUTILS_EXCLUDED_ARGS)
+        except Exception as exc:
+            error = document.reporter.error(f"Global myst configuration invalid: {exc}")
+            document.append(error)
+            config = MdParserConfig()
+
+        # update the global config with the file-level config
+        try:
+            topmatter = read_topmatter(inputstring)
+        except TopmatterReadError:
+            pass  # this will be reported during the render
+        else:
+            if topmatter:
+                warning = lambda wtype, msg: create_warning(  # noqa: E731
+                    document, msg, line=1, append_to=document, subtype=wtype
+                )
+                config = merge_file_level(config, topmatter, warning)
+
+        # parse content
+        parser = create_md_parser(config, DocutilsRenderer)
+        parser.options["document"] = document
+        parser.render(inputstring)
+
+        # post-processing
+
+        # replace raw nodes if raw is not allowed
+        if not getattr(document.settings, "raw_enabled", True):
+            for node in document.traverse(nodes.raw):
+                warning = document.reporter.warning("Raw content disabled.")
+                node.parent.replace(node, warning)
+
+        self.finish_parse()
+
+
+def _run_cli(writer_name: str, writer_description: str, argv: Optional[List[str]]):
+    """Run the command line interface for a particular writer."""
+    publish_cmdline(
+        parser=Parser(),
+        writer_name=writer_name,
+        description=(
+            f"Generates {writer_description} from standalone MyST sources.\n{default_description}"
+        ),
+        argv=argv,
+    )
+
+
+def cli_html(argv: Optional[List[str]] = None) -> None:
+    """Cmdline entrypoint for converting MyST to HTML."""
+    _run_cli("html", "(X)HTML documents", argv)
+
+
+def cli_html5(argv: Optional[List[str]] = None):
+    """Cmdline entrypoint for converting MyST to HTML5."""
+    _run_cli("html5", "HTML5 documents", argv)
+
+
+def cli_latex(argv: Optional[List[str]] = None):
+    """Cmdline entrypoint for converting MyST to LaTeX."""
+    _run_cli("latex", "LaTeX documents", argv)
+
+
+def cli_xml(argv: Optional[List[str]] = None):
+    """Cmdline entrypoint for converting MyST to XML."""
+    _run_cli("xml", "Docutils-native XML", argv)
+
+
+def cli_pseudoxml(argv: Optional[List[str]] = None):
+    """Cmdline entrypoint for converting MyST to pseudo-XML."""
+    _run_cli("pseudoxml", "pseudo-XML", argv)
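As a rough usage sketch (not part of the diff), the ``Parser`` above can be passed to ``docutils.core.publish_string``; the ``cli_*`` functions wrap the same pipeline for console scripts (the exact entry-point names, e.g. ``myst-docutils-html5``, are assumed from the project packaging). The sample source and settings override below are illustrative::

    # Minimal sketch: render a MyST string through the docutils Parser.
    from docutils.core import publish_string

    from myst_parser.parsers.docutils_ import Parser

    source = "# Title\n\nSome *MyST* Markdown."
    html = publish_string(
        source=source,
        parser=Parser(),
        writer_name="html5",
        settings_overrides={"embed_stylesheet": False},  # standard docutils setting
    )
    print(html.decode()[:80])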
+""" +from __future__ import annotations + +from typing import Callable + +from markdown_it import MarkdownIt +from markdown_it.renderer import RendererProtocol +from mdit_py_plugins.amsmath import amsmath_plugin +from mdit_py_plugins.anchors import anchors_plugin +from mdit_py_plugins.attrs import attrs_plugin +from mdit_py_plugins.colon_fence import colon_fence_plugin +from mdit_py_plugins.deflist import deflist_plugin +from mdit_py_plugins.dollarmath import dollarmath_plugin +from mdit_py_plugins.field_list import fieldlist_plugin +from mdit_py_plugins.footnote import footnote_plugin +from mdit_py_plugins.front_matter import front_matter_plugin +from mdit_py_plugins.myst_blocks import myst_block_plugin +from mdit_py_plugins.myst_role import myst_role_plugin +from mdit_py_plugins.substitution import substitution_plugin +from mdit_py_plugins.tasklists import tasklists_plugin +from mdit_py_plugins.wordcount import wordcount_plugin + +from myst_parser.config.main import MdParserConfig + + +def create_md_parser( + config: MdParserConfig, renderer: Callable[[MarkdownIt], RendererProtocol] +) -> MarkdownIt: + """Return a Markdown parser with the required MyST configuration.""" + + # TODO warn if linkify required and linkify-it-py not installed + # (currently the parse will unceremoniously except) + + if config.commonmark_only: + # see https://spec.commonmark.org/ + md = MarkdownIt("commonmark", renderer_cls=renderer).use( + wordcount_plugin, per_minute=config.words_per_minute + ) + md.options.update({"myst_config": config}) + return md + + if config.gfm_only: + # see https://github.github.com/gfm/ + md = ( + MarkdownIt("commonmark", renderer_cls=renderer) + # note, strikethrough currently only supported tentatively for HTML + .enable("strikethrough") + .enable("table") + .use(tasklists_plugin) + .enable("linkify") + .use(wordcount_plugin, per_minute=config.words_per_minute) + ) + md.options.update({"linkify": True, "myst_config": config}) + return md + + md = ( + MarkdownIt("commonmark", renderer_cls=renderer) + .enable("table") + .use(front_matter_plugin) + .use(myst_block_plugin) + .use(myst_role_plugin) + .use(footnote_plugin) + .use(wordcount_plugin, per_minute=config.words_per_minute) + .disable("footnote_inline") + # disable this for now, because it need a new implementation in the renderer + .disable("footnote_tail") + ) + + typographer = False + if "smartquotes" in config.enable_extensions: + md.enable("smartquotes") + typographer = True + if "replacements" in config.enable_extensions: + md.enable("replacements") + typographer = True + if "linkify" in config.enable_extensions: + md.enable("linkify") + if md.linkify is not None: + md.linkify.set({"fuzzy_link": config.linkify_fuzzy_links}) + if "strikethrough" in config.enable_extensions: + md.enable("strikethrough") + if "dollarmath" in config.enable_extensions: + md.use( + dollarmath_plugin, + allow_labels=config.dmath_allow_labels, + allow_space=config.dmath_allow_space, + allow_digits=config.dmath_allow_digits, + double_inline=config.dmath_double_inline, + ) + if "colon_fence" in config.enable_extensions: + md.use(colon_fence_plugin) + if "amsmath" in config.enable_extensions: + md.use(amsmath_plugin) + if "deflist" in config.enable_extensions: + md.use(deflist_plugin) + if "fieldlist" in config.enable_extensions: + md.use(fieldlist_plugin) + if "tasklist" in config.enable_extensions: + md.use(tasklists_plugin) + if "substitution" in config.enable_extensions: + md.use(substitution_plugin, *config.sub_delimiters) + if "attrs_image" in 
config.enable_extensions: + md.use(attrs_plugin, after=("image",)) + if config.heading_anchors is not None: + md.use( + anchors_plugin, + max_level=config.heading_anchors, + slug_func=config.heading_slug_func, + ) + for name in config.disable_syntax: + md.disable(name, True) + + md.options.update( + { + "typographer": typographer, + "linkify": "linkify" in config.enable_extensions, + "myst_config": config, + } + ) + + return md diff --git a/myst_parser/parsers/parse_html.py b/myst_parser/parsers/parse_html.py new file mode 100644 index 0000000..7539e42 --- /dev/null +++ b/myst_parser/parsers/parse_html.py @@ -0,0 +1,440 @@ +"""A simple but complete HTML to Abstract Syntax Tree (AST) parser. + +The AST can also reproduce the HTML text. + +Example:: + + >> text = '<div class="note"><p>text</p></div>' + >> ast = tokenize_html(text) + >> list(ast.walk(include_self=True)) + [Root(''), Tag('div', {'class': 'note'}), Tag('p'), Data('text')] + >> str(ast) + '<div class="note"><p>text</p></div>' + >> str(ast[0][0]) + '<p>text</p>' + +Note: optional tags are not accounted for +(see https://html.spec.whatwg.org/multipage/syntax.html#optional-tags) + +""" +from __future__ import annotations + +import inspect +import itertools +from collections import abc, deque +from html.parser import HTMLParser +from typing import Any, Callable, Iterable, Iterator + + +class Attribute(dict): + """This class holds the tags's attributes.""" + + def __getitem__(self, key: str) -> str: + """If self doesn't have the key it returns ''.""" + return self.get(key, "") + + @property + def classes(self) -> list[str]: + """Return 'class' attribute as list.""" + return self["class"].split() + + def __str__(self) -> str: + """Return a htmlized representation for attributes.""" + return " ".join(f'{key}="{value}"' for key, value in self.items()) + + +class Element(abc.MutableSequence): + """An Element of the xml/html document. + + All xml/html entities inherit from this class. 
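A small sketch (not part of the diff) of driving the factory directly; only the token stream is inspected here, so the renderer class is supplied just to satisfy the signature, and the config values and sample text are assumptions::

    # Minimal sketch: build a MyST-configured markdown-it parser and inspect tokens.
    from myst_parser.config.main import MdParserConfig
    from myst_parser.mdit_to_docutils.base import DocutilsRenderer
    from myst_parser.parsers.mdit import create_md_parser

    config = MdParserConfig(enable_extensions=["dollarmath", "colon_fence"])
    md = create_md_parser(config, DocutilsRenderer)

    tokens = md.parse("Euler: $e^{i\\pi} + 1 = 0$\n")
    print([t.type for t in tokens])
    # ['paragraph_open', 'inline', 'paragraph_close'], with a math_inline child
    # token inside the 'inline' token (added by the dollarmath plugin)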
+ """ + + def __init__(self, name: str = "", attr: dict | None = None) -> None: + """Initialise the element.""" + self.name = name + self.attrs: Attribute = Attribute(attr or {}) + self._parent: Element | None = None + self._children: list[Element] = [] + + @property + def parent(self) -> Element | None: + """Return parent.""" + return self._parent + + @property + def children(self) -> list[Element]: + """Return copy of children.""" + return self._children[:] + + def reset_children(self, children: list[Element], deepcopy: bool = False): + new_children = [] + for i, item in enumerate(children): + assert isinstance(item, Element) + if deepcopy: + item = item.deepcopy() + if item._parent is None: + item._parent = self + elif item._parent != self: + raise AssertionError(f"different parent already set for item {i}") + new_children.append(item) + self._children = new_children + + def __getitem__(self, index: int) -> Element: # type: ignore[override] + return self._children[index] + + def __setitem__(self, index: int, item: Element): # type: ignore[override] + assert isinstance(item, Element) + if item._parent is not None and item._parent != self: + raise AssertionError(f"different parent already set for: {item!r}") + item._parent = self + return self._children.__setitem__(index, item) + + def __delitem__(self, index: int): # type: ignore[override] + return self._children.__delitem__(index) + + def __len__(self) -> int: + return self._children.__len__() + + def __iter__(self) -> Iterator[Element]: + yield from self._children + + def insert(self, index: int, item: Element): + assert isinstance(item, Element) + if item._parent is not None and item._parent != self: + raise AssertionError(f"different parent already set for: {item!r}") + item._parent = self + return self._children.insert(index, item) + + def deepcopy(self) -> Element: + """Recursively copy and remove parent.""" + _copy = self.__class__(self.name, self.attrs) + for child in self: + _copy_child = child.deepcopy() + _copy.append(_copy_child) + return _copy + + def __repr__(self) -> str: + text = f"{self.__class__.__name__}({self.name!r}" + if self.attrs: + text += f", {self.attrs!r}" + text += ")" + return text + + def render( + self, + tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None, + **kwargs, + ) -> str: + """Returns a HTML string representation of the element. + + :param tag_overrides: Provide a dictionary of render function + for specific tag names, to override the normal render format + + """ + raise NotImplementedError + + def __str__(self) -> str: + return self.render() + + def __eq__(self, item: Any) -> bool: + return item is self + + def walk(self, include_self: bool = False) -> Iterator[Element]: + """Walk through the xml/html AST.""" + if include_self: + yield self + for child in self: + yield child + yield from child.walk() + + def strip(self, inplace: bool = False, recurse: bool = False) -> Element: + """Return copy with all `Data` tokens + that only contain whitespace / newlines removed. 
+ """ + element = self + if not inplace: + element = self.deepcopy() + element.reset_children( + [ + e + for e in element.children + if not (isinstance(e, Data) and e.data.strip() == "") + ] + ) + if recurse: + for child in element: + child.strip(inplace=True, recurse=True) + return element + + def find( + self, + identifier: str | type[Element], + attrs: dict | None = None, + classes: Iterable[str] | None = None, + include_self: bool = False, + recurse: bool = True, + ) -> Iterator[Element]: + """Find all elements that match name and specific attributes.""" + iterator = self.walk() if recurse else self + if include_self: + iterator = itertools.chain([self], iterator) + if inspect.isclass(identifier): + test_func = lambda c: isinstance(c, identifier) # noqa: E731 + else: + test_func = lambda c: c.name == identifier # noqa: E731 + classes = set(classes) if classes is not None else classes + for child in iterator: + if test_func(child): + if classes is not None and not classes.issubset(child.attrs.classes): + continue + for key, value in (attrs or {}).items(): + if child.attrs[key] != value: + break + else: + yield child + + +class Root(Element): + """The root of the AST tree.""" + + def render(self, **kwargs) -> str: # type: ignore[override] + """Returns a string HTML representation of the structure.""" + return "".join(child.render(**kwargs) for child in self) + + +class Tag(Element): + """Represent xml/html tags under the form: <name key="value" ...> ... </name>.""" + + def render( + self, + tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None, + **kwargs, + ) -> str: + if tag_overrides and self.name in tag_overrides: + return tag_overrides[self.name](self, tag_overrides) + return ( + f"<{self.name}{' ' if self.attrs else ''}{self.attrs}>" + + "".join( + child.render(tag_overrides=tag_overrides, **kwargs) for child in self + ) + + f"</{self.name}>" + ) + + +class XTag(Element): + """Represent XHTML style tags with no children, like `<img src="t.gif" />`""" + + def render( + self, + tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None, + **kwargs, + ) -> str: + if tag_overrides is not None and self.name in tag_overrides: + return tag_overrides[self.name](self, tag_overrides) + return f"<{self.name}{' ' if self.attrs else ''}{self.attrs}/>" + + +class VoidTag(Element): + """Represent tags with no children, only start tag, like `<img src="t.gif" >`""" + + def render(self, **kwargs) -> str: # type: ignore[override] + return f"<{self.name}{' ' if self.attrs else ''}{self.attrs}>" + + +class TerminalElement(Element): + def __init__(self, data: str): + super().__init__("") + self.data: str = data + + def __repr__(self) -> str: + text = self.data + if len(text) > 20: + text = text[:17] + "..." 
+ return f"{self.__class__.__name__}({text!r})" + + def deepcopy(self) -> TerminalElement: + """Copy and remove parent.""" + _copy = self.__class__(self.data) + return _copy + + +class Data(TerminalElement): + """Represent data inside xml/html documents, like raw text.""" + + def render(self, **kwargs) -> str: # type: ignore[override] + return self.data + + +class Declaration(TerminalElement): + """Represent declarations, like `<!DOCTYPE html>`""" + + def render(self, **kwargs) -> str: # type: ignore[override] + return f"<!{self.data}>" + + +class Comment(TerminalElement): + """Represent HTML comments""" + + def render(self, **kwargs) -> str: # type: ignore[override] + return f"<!--{self.data}-->" + + +class Pi(TerminalElement): + """Represent processing instructions like `<?xml-stylesheet ?>`""" + + def render(self, **kwargs) -> str: # type: ignore[override] + return f"<?{self.data}>" + + +class Char(TerminalElement): + """Represent character codes like: `�`""" + + def render(self, **kwargs) -> str: # type: ignore[override] + return f"&#{self.data};" + + +class Entity(TerminalElement): + """Represent entities like `&`""" + + def render(self, **kwargs) -> str: # type: ignore[override] + return f"&{self.data};" + + +class Tree: + """The engine class to generate the AST tree.""" + + def __init__(self, name: str = ""): + """Initialise Tree""" + self.name = name + self.outmost = Root(name) + self.stack: deque = deque() + self.stack.append(self.outmost) + + def clear(self): + """Clear the outmost and stack for a new parsing.""" + self.outmost = Root(self.name) + self.stack.clear() + self.stack.append(self.outmost) + + def last(self) -> Element: + """Return the last pointer which point to the actual tag scope.""" + return self.stack[-1] + + def nest_tag(self, name: str, attrs: dict): + """Nest a given tag at the bottom of the tree using + the last stack's pointer. + """ + pointer = self.stack.pop() + item = Tag(name, attrs) + pointer.append(item) + self.stack.append(pointer) + self.stack.append(item) + + def nest_xtag(self, name: str, attrs: dict): + """Nest an XTag onto the tree.""" + top = self.last() + item = XTag(name, attrs) + top.append(item) + + def nest_vtag(self, name: str, attrs: dict): + """Nest a VoidTag onto the tree.""" + top = self.last() + item = VoidTag(name, attrs) + top.append(item) + + def nest_terminal(self, klass: type[TerminalElement], data: str): + """Nest the data onto the tree.""" + top = self.last() + item = klass(data) + top.append(item) + + def enclose(self, name: str): + """When a closing tag is found, pop the pointer's scope from the stack, + to then point to the earlier scope's tag. + """ + count = 0 + for ind in reversed(self.stack): + count = count + 1 + if ind.name == name: + break + else: + count = 0 + + # It pops all the items which do not match with the closing tag. 
+ for _ in range(0, count): + self.stack.pop() + + +class HtmlToAst(HTMLParser): + """The tokenizer class.""" + + # see https://html.spec.whatwg.org/multipage/syntax.html#void-elements + void_elements = { + "area", + "base", + "br", + "col", + "embed", + "hr", + "img", + "input", + "link", + "meta", + "param", + "source", + "track", + "wbr", + } + + def __init__(self, name: str = "", convert_charrefs: bool = False): + super().__init__(convert_charrefs=convert_charrefs) + self.struct = Tree(name) + + def feed(self, source: str) -> Root: # type: ignore[override] + """Parse the source string.""" + self.struct.clear() + super().feed(source) + return self.struct.outmost + + def handle_starttag(self, name: str, attr): + """When found an opening tag then nest it onto the tree.""" + if name in self.void_elements: + self.struct.nest_vtag(name, attr) + else: + self.struct.nest_tag(name, attr) + + def handle_startendtag(self, name: str, attr): + """When found a XHTML tag style then nest it up to the tree.""" + self.struct.nest_xtag(name, attr) + + def handle_endtag(self, name: str): + """When found a closing tag then makes it point to the right scope.""" + if name not in self.void_elements: + self.struct.enclose(name) + + def handle_data(self, data: str): + """Nest data onto the tree.""" + self.struct.nest_terminal(Data, data) + + def handle_decl(self, decl: str): + self.struct.nest_terminal(Declaration, decl) + + def unknown_decl(self, decl: str): + self.struct.nest_terminal(Declaration, decl) + + def handle_charref(self, data: str): + self.struct.nest_terminal(Char, data) + + def handle_entityref(self, data: str): + self.struct.nest_terminal(Entity, data) + + def handle_pi(self, data: str): + self.struct.nest_terminal(Pi, data) + + def handle_comment(self, data: str): + self.struct.nest_terminal(Comment, data) + + +def tokenize_html(text: str, name: str = "", convert_charrefs: bool = False) -> Root: + parser = HtmlToAst(name, convert_charrefs=convert_charrefs) + return parser.feed(text) diff --git a/myst_parser/parsers/sphinx_.py b/myst_parser/parsers/sphinx_.py new file mode 100644 index 0000000..fff098f --- /dev/null +++ b/myst_parser/parsers/sphinx_.py @@ -0,0 +1,69 @@ +"""MyST Markdown parser for sphinx.""" +from __future__ import annotations + +from docutils import nodes +from docutils.parsers.rst import Parser as RstParser +from sphinx.parsers import Parser as SphinxParser +from sphinx.util import logging + +from myst_parser.config.main import ( + MdParserConfig, + TopmatterReadError, + merge_file_level, + read_topmatter, +) +from myst_parser.mdit_to_docutils.sphinx_ import SphinxRenderer, create_warning +from myst_parser.parsers.mdit import create_md_parser + +SPHINX_LOGGER = logging.getLogger(__name__) + + +class MystParser(SphinxParser): + """Sphinx parser for Markedly Structured Text (MyST).""" + + supported: tuple[str, ...] = ("md", "markdown", "myst") + """Aliases this parser supports.""" + + settings_spec = RstParser.settings_spec + """Runtime settings specification. + + Defines runtime settings and associated command-line options, as used by + `docutils.frontend.OptionParser`. This is a concatenation of tuples of: + + - Option group title (string or `None` which implies no group, just a list + of single options). + + - Description (string or `None`). 
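Beyond the round trip shown in the module docstring, a short sketch (not part of the diff) of the ``find`` and ``tag_overrides`` hooks; the sample HTML is an assumption::

    # Minimal sketch: query the parsed tree and override how one tag renders.
    from myst_parser.parsers.parse_html import tokenize_html

    ast = tokenize_html('<div class="note"><img src="t.gif"/><p>text</p></div>')

    # find all <p> tags anywhere in the tree
    print(list(ast.find("p")))  # [Tag('p')]

    # render, rewriting every <p> tag via an override
    upper_p = lambda el, overrides: f"<P>{''.join(str(c) for c in el)}</P>"
    print(ast.render(tag_overrides={"p": upper_p}))
    # <div class="note"><img src="t.gif"/><P>text</P></div>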
diff --git a/myst_parser/parsers/sphinx_.py b/myst_parser/parsers/sphinx_.py
new file mode 100644
index 0000000..fff098f
--- /dev/null
+++ b/myst_parser/parsers/sphinx_.py
@@ -0,0 +1,69 @@
+"""MyST Markdown parser for sphinx."""
+from __future__ import annotations
+
+from docutils import nodes
+from docutils.parsers.rst import Parser as RstParser
+from sphinx.parsers import Parser as SphinxParser
+from sphinx.util import logging
+
+from myst_parser.config.main import (
+    MdParserConfig,
+    TopmatterReadError,
+    merge_file_level,
+    read_topmatter,
+)
+from myst_parser.mdit_to_docutils.sphinx_ import SphinxRenderer, create_warning
+from myst_parser.parsers.mdit import create_md_parser
+
+SPHINX_LOGGER = logging.getLogger(__name__)
+
+
+class MystParser(SphinxParser):
+    """Sphinx parser for Markedly Structured Text (MyST)."""
+
+    supported: tuple[str, ...] = ("md", "markdown", "myst")
+    """Aliases this parser supports."""
+
+    settings_spec = RstParser.settings_spec
+    """Runtime settings specification.
+
+    Defines runtime settings and associated command-line options, as used by
+    `docutils.frontend.OptionParser`. This is a concatenation of tuples of:
+
+    - Option group title (string or `None` which implies no group, just a list
+      of single options).
+
+    - Description (string or `None`).
+
+    - A sequence of option tuples
+    """
+
+    config_section = "myst parser"
+    config_section_dependencies = ("parsers",)
+    translate_section_name = None
+
+    def parse(self, inputstring: str, document: nodes.document) -> None:
+        """Parse source text.
+
+        :param inputstring: The source string to parse
+        :param document: The root docutils node to add AST elements to
+
+        """
+        # get the global config
+        config: MdParserConfig = document.settings.env.myst_config
+
+        # update the global config with the file-level config
+        try:
+            topmatter = read_topmatter(inputstring)
+        except TopmatterReadError:
+            pass  # this will be reported during the render
+        else:
+            if topmatter:
+                warning = lambda wtype, msg: create_warning(  # noqa: E731
+                    document, msg, line=1, append_to=document, subtype=wtype
+                )
+                config = merge_file_level(config, topmatter, warning)
+
+        parser = create_md_parser(config, SphinxRenderer)
+        parser.options["document"] = document
+        parser.render(inputstring)
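For context (not part of the diff), a Sphinx extension would typically register this parser in its ``setup`` hook via the standard ``add_source_suffix``/``add_source_parser`` APIs; in practice ``myst_parser``'s own ``setup`` performs this wiring, so the snippet below is only an illustrative sketch::

    # Minimal sketch: register the MyST parser with a Sphinx application.
    from sphinx.application import Sphinx

    from myst_parser.parsers.sphinx_ import MystParser


    def setup(app: Sphinx) -> dict:
        app.add_source_suffix(".md", "myst")  # file-type name is an assumption
        app.add_source_parser(MystParser)
        return {"parallel_read_safe": True}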