diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-29 04:24:24 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-29 04:24:24 +0000 |
commit | 12e8343068b906f8b2afddc5569968a8a91fa5b0 (patch) | |
tree | 75cc5e05a4392ea0292251898f992a15a16b172b /markdown_it | |
parent | Initial commit. (diff) | |
download | markdown-it-py-upstream/2.1.0.tar.xz markdown-it-py-upstream/2.1.0.zip |
Adding upstream version 2.1.0. (refs: upstream/2.1.0, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'markdown_it')
66 files changed, 6663 insertions, 0 deletions
diff --git a/markdown_it/__init__.py b/markdown_it/__init__.py new file mode 100644 index 0000000..5cc232a --- /dev/null +++ b/markdown_it/__init__.py @@ -0,0 +1,5 @@ +"""A Python port of Markdown-It""" +__all__ = ("MarkdownIt",) +__version__ = "2.1.0" + +from .main import MarkdownIt diff --git a/markdown_it/_compat.py b/markdown_it/_compat.py new file mode 100644 index 0000000..12df1aa --- /dev/null +++ b/markdown_it/_compat.py @@ -0,0 +1,10 @@ +from __future__ import annotations + +from collections.abc import Mapping +import sys +from typing import Any + +if sys.version_info >= (3, 10): + DATACLASS_KWARGS: Mapping[str, Any] = {"slots": True} +else: + DATACLASS_KWARGS: Mapping[str, Any] = {} diff --git a/markdown_it/_punycode.py b/markdown_it/_punycode.py new file mode 100644 index 0000000..9ad2442 --- /dev/null +++ b/markdown_it/_punycode.py @@ -0,0 +1,66 @@ +# Copyright 2014 Mathias Bynens <https://mathiasbynens.be/> +# Copyright 2021 Taneli Hukkinen +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import codecs +import re + +REGEX_SEPARATORS = re.compile(r"[\x2E\u3002\uFF0E\uFF61]") +REGEX_NON_ASCII = re.compile(r"[^\0-\x7E]") + + +def encode(uni: str) -> str: + return codecs.encode(uni, encoding="punycode").decode() + + +def decode(ascii: str) -> str: + return codecs.decode(ascii, encoding="punycode") # type: ignore[call-overload] + + +def map_domain(string, fn): + parts = string.split("@") + result = "" + if len(parts) > 1: + # In email addresses, only the domain name should be punycoded. Leave + # the local part (i.e. everything up to `@`) intact. + result = parts[0] + "@" + string = parts[1] + labels = REGEX_SEPARATORS.split(string) + encoded = ".".join(fn(label) for label in labels) + return result + encoded + + +def to_unicode(obj: str) -> str: + def mapping(obj: str) -> str: + if obj.startswith("xn--"): + return decode(obj[4:].lower()) + return obj + + return map_domain(obj, mapping) + + +def to_ascii(obj: str) -> str: + def mapping(obj: str) -> str: + if REGEX_NON_ASCII.search(obj): + return "xn--" + encode(obj) + return obj + + return map_domain(obj, mapping) diff --git a/markdown_it/cli/__init__.py b/markdown_it/cli/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/markdown_it/cli/__init__.py diff --git a/markdown_it/cli/parse.py b/markdown_it/cli/parse.py new file mode 100644 index 0000000..2d74f55 --- /dev/null +++ b/markdown_it/cli/parse.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python +""" +CLI interface to markdown-it-py + +Parse one or more markdown files, convert each to HTML, and print to stdout. 
+""" +from __future__ import annotations + +import argparse +from collections.abc import Iterable, Sequence +import sys + +from markdown_it import __version__ +from markdown_it.main import MarkdownIt + +version_str = "markdown-it-py [version {}]".format(__version__) + + +def main(args: Sequence[str] | None = None) -> int: + namespace = parse_args(args) + if namespace.filenames: + convert(namespace.filenames) + else: + interactive() + return 0 + + +def convert(filenames: Iterable[str]) -> None: + for filename in filenames: + convert_file(filename) + + +def convert_file(filename: str) -> None: + """ + Parse a Markdown file and dump the output to stdout. + """ + try: + with open(filename, "r") as fin: + rendered = MarkdownIt().render(fin.read()) + print(rendered, end="") + except OSError: + sys.stderr.write(f'Cannot open file "{filename}".\n') + sys.exit(1) + + +def interactive() -> None: + """ + Parse user input, dump to stdout, rinse and repeat. + Python REPL style. + """ + print_heading() + contents = [] + more = False + while True: + try: + prompt, more = ("... ", True) if more else (">>> ", True) + contents.append(input(prompt) + "\n") + except EOFError: + print("\n" + MarkdownIt().render("\n".join(contents)), end="") + more = False + contents = [] + except KeyboardInterrupt: + print("\nExiting.") + break + + +def parse_args(args: Sequence[str] | None) -> argparse.Namespace: + """Parse input CLI arguments.""" + parser = argparse.ArgumentParser( + description="Parse one or more markdown files, " + "convert each to HTML, and print to stdout", + # NOTE: Remember to update README.md w/ the output of `markdown-it -h` + epilog=( + f""" +Interactive: + + $ markdown-it + markdown-it-py [version {__version__}] (interactive) + Type Ctrl-D to complete input, or Ctrl-C to exit. + >>> # Example + ... > markdown *input* + ... 
+ <h1>Example</h1> + <blockquote> + <p>markdown <em>input</em></p> + </blockquote> + +Batch: + + $ markdown-it README.md README.footer.md > index.html +""" + ), + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument("-v", "--version", action="version", version=version_str) + parser.add_argument( + "filenames", nargs="*", help="specify an optional list of files to convert" + ) + return parser.parse_args(args) + + +def print_heading() -> None: + print("{} (interactive)".format(version_str)) + print("Type Ctrl-D to complete input, or Ctrl-C to exit.") + + +if __name__ == "__main__": + exit_code = main(sys.argv[1:]) + sys.exit(exit_code) diff --git a/markdown_it/common/__init__.py b/markdown_it/common/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/markdown_it/common/__init__.py diff --git a/markdown_it/common/entities.py b/markdown_it/common/entities.py new file mode 100644 index 0000000..6bb2d34 --- /dev/null +++ b/markdown_it/common/entities.py @@ -0,0 +1,4 @@ +"""HTML5 entities map: { name -> characters }.""" +import html.entities + +entities = {name.rstrip(";"): chars for name, chars in html.entities.html5.items()} diff --git a/markdown_it/common/html_blocks.py b/markdown_it/common/html_blocks.py new file mode 100644 index 0000000..8b199af --- /dev/null +++ b/markdown_it/common/html_blocks.py @@ -0,0 +1,68 @@ +"""List of valid html blocks names, according to commonmark spec +http://jgm.github.io/CommonMark/spec.html#html-blocks +""" + +block_names = [ + "address", + "article", + "aside", + "base", + "basefont", + "blockquote", + "body", + "caption", + "center", + "col", + "colgroup", + "dd", + "details", + "dialog", + "dir", + "div", + "dl", + "dt", + "fieldset", + "figcaption", + "figure", + "footer", + "form", + "frame", + "frameset", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "head", + "header", + "hr", + "html", + "iframe", + "legend", + "li", + "link", + "main", + "menu", + "menuitem", + "nav", + 
"noframes", + "ol", + "optgroup", + "option", + "p", + "param", + "section", + "source", + "summary", + "table", + "tbody", + "td", + "tfoot", + "th", + "thead", + "title", + "tr", + "track", + "ul", +] diff --git a/markdown_it/common/html_re.py b/markdown_it/common/html_re.py new file mode 100644 index 0000000..f0c336d --- /dev/null +++ b/markdown_it/common/html_re.py @@ -0,0 +1,40 @@ +"""Regexps to match html elements +""" + +import re + +attr_name = "[a-zA-Z_:][a-zA-Z0-9:._-]*" + +unquoted = "[^\"'=<>`\\x00-\\x20]+" +single_quoted = "'[^']*'" +double_quoted = '"[^"]*"' + +attr_value = "(?:" + unquoted + "|" + single_quoted + "|" + double_quoted + ")" + +attribute = "(?:\\s+" + attr_name + "(?:\\s*=\\s*" + attr_value + ")?)" + +open_tag = "<[A-Za-z][A-Za-z0-9\\-]*" + attribute + "*\\s*\\/?>" + +close_tag = "<\\/[A-Za-z][A-Za-z0-9\\-]*\\s*>" +comment = "<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->" +processing = "<[?][\\s\\S]*?[?]>" +declaration = "<![A-Z]+\\s+[^>]*>" +cdata = "<!\\[CDATA\\[[\\s\\S]*?\\]\\]>" + +HTML_TAG_RE = re.compile( + "^(?:" + + open_tag + + "|" + + close_tag + + "|" + + comment + + "|" + + processing + + "|" + + declaration + + "|" + + cdata + + ")" +) +HTML_OPEN_CLOSE_TAG_STR = "^(?:" + open_tag + "|" + close_tag + ")" +HTML_OPEN_CLOSE_TAG_RE = re.compile(HTML_OPEN_CLOSE_TAG_STR) diff --git a/markdown_it/common/normalize_url.py b/markdown_it/common/normalize_url.py new file mode 100644 index 0000000..afec928 --- /dev/null +++ b/markdown_it/common/normalize_url.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +from collections.abc import Callable +import re +from urllib.parse import quote, unquote, urlparse, urlunparse # noqa: F401 + +import mdurl + +from .. 
import _punycode + +RECODE_HOSTNAME_FOR = ("http:", "https:", "mailto:") + + +def normalizeLink(url: str) -> str: + """Normalize destination URLs in links + + :: + + [label]: destination 'title' + ^^^^^^^^^^^ + """ + parsed = mdurl.parse(url, slashes_denote_host=True) + + if parsed.hostname: + # Encode hostnames in urls like: + # `http://host/`, `https://host/`, `mailto:user@host`, `//host/` + # + # We don't encode unknown schemas, because it's likely that we encode + # something we shouldn't (e.g. `skype:name` treated as `skype:host`) + # + if not parsed.protocol or parsed.protocol in RECODE_HOSTNAME_FOR: + try: + parsed = parsed._replace(hostname=_punycode.to_ascii(parsed.hostname)) + except Exception: + pass + + return mdurl.encode(mdurl.format(parsed)) + + +def normalizeLinkText(url: str) -> str: + """Normalize autolink content + + :: + + <destination> + ~~~~~~~~~~~ + """ + parsed = mdurl.parse(url, slashes_denote_host=True) + + if parsed.hostname: + # Encode hostnames in urls like: + # `http://host/`, `https://host/`, `mailto:user@host`, `//host/` + # + # We don't encode unknown schemas, because it's likely that we encode + # something we shouldn't (e.g. `skype:name` treated as `skype:host`) + # + if not parsed.protocol or parsed.protocol in RECODE_HOSTNAME_FOR: + try: + parsed = parsed._replace(hostname=_punycode.to_unicode(parsed.hostname)) + except Exception: + pass + + # add '%' to exclude list because of https://github.com/markdown-it/markdown-it/issues/720 + return mdurl.decode(mdurl.format(parsed), mdurl.DECODE_DEFAULT_CHARS + "%") + + +BAD_PROTO_RE = re.compile(r"^(vbscript|javascript|file|data):") +GOOD_DATA_RE = re.compile(r"^data:image\/(gif|png|jpeg|webp);") + + +def validateLink(url: str, validator: Callable | None = None) -> bool: + """Validate URL link is allowed in output. + + This validator can prohibit more than really needed to prevent XSS. + It's a tradeoff to keep code simple and to be secure by default. 
+ + Note: url should be normalized at this point, and existing entities decoded. + """ + if validator is not None: + return validator(url) + url = url.strip().lower() + return bool(GOOD_DATA_RE.search(url)) if BAD_PROTO_RE.search(url) else True diff --git a/markdown_it/common/utils.py b/markdown_it/common/utils.py new file mode 100644 index 0000000..edc24ca --- /dev/null +++ b/markdown_it/common/utils.py @@ -0,0 +1,334 @@ +"""Utilities for parsing source text +""" +import html +import re +from typing import Any + +from .entities import entities + + +def charCodeAt(src: str, pos: int) -> Any: + """ + Returns the Unicode value of the character at the specified location. + + @param - index The zero-based index of the desired character. + If there is no character at the specified index, NaN is returned. + + This was added for compatibility with python + """ + try: + return ord(src[pos]) + except IndexError: + return None + + +# Merge objects +# +def assign(obj): + """Merge objects /*from1, from2, from3, ...*/)""" + raise NotImplementedError + # sources = Array.prototype.slice.call(arguments, 1) + + # sources.forEach(function (source) { + # if (!source) { return; } + + # if (typeof source !== 'object') { + # throw new TypeError(source + 'must be object') + # } + + # Object.keys(source).forEach(function (key) { + # obj[key] = source[key] + # }) + # }) + + # return obj + + +def arrayReplaceAt(src: list, pos: int, newElements: list) -> list: + """ + Remove element from array and put another array at those position. 
+ Useful for some operations with tokens + """ + return src[:pos] + newElements + src[pos + 1 :] + + +###################################################################### + + +def isValidEntityCode(c: int) -> bool: + + # broken sequence + if c >= 0xD800 and c <= 0xDFFF: + return False + # never used + if c >= 0xFDD0 and c <= 0xFDEF: + return False + if ((c & 0xFFFF) == 0xFFFF) or ((c & 0xFFFF) == 0xFFFE): + return False + # control codes + if c >= 0x00 and c <= 0x08: + return False + if c == 0x0B: + return False + if c >= 0x0E and c <= 0x1F: + return False + if c >= 0x7F and c <= 0x9F: + return False + # out of range + if c > 0x10FFFF: + return False + return True + + +def fromCodePoint(c: int) -> str: + """Convert ordinal to unicode. + + Note, in the original Javascript two string characters were required, + for codepoints larger than `0xFFFF`. + But Python 3 can represent any unicode codepoint in one character. + """ + return chr(c) + + +UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])') +# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE) +UNESCAPE_ALL_RE = re.compile( + r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});", + re.IGNORECASE, +) +DIGITAL_ENTITY_TEST_RE = re.compile(r"^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))", re.IGNORECASE) + + +def replaceEntityPattern(match: str, name: str) -> str: + """Convert HTML entity patterns + + :: + + https://www.google.com -> https%3A//www.google.com + + """ + code = 0 + + if name in entities: + return entities[name] + + if ord(name[0]) == 0x23 and DIGITAL_ENTITY_TEST_RE.search(name): + code = int(name[2:], 16) if name[1].lower() == "x" else int(name[1:], 10) + if isValidEntityCode(code): + return fromCodePoint(code) + + return match + + +# def replaceEntities(string): +# if (string.indexOf('&') < 0): +# return string +# return string.replace(ENTITY_RE, replaceEntityPattern) + + +def unescapeMd(string: str) -> str: + raise NotImplementedError + # if 
"\\" in string: + # return string + # return string.replace(UNESCAPE_MD_RE, "$1") + + +def unescapeAll(string: str) -> str: + def replacer_func(match): + escaped = match.group(1) + if escaped: + return escaped + entity = match.group(2) + return replaceEntityPattern(match.group(), entity) + + if "\\" not in string and "&" not in string: + return string + return UNESCAPE_ALL_RE.sub(replacer_func, string) + + +ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-""" +ESCAPE_CHAR = re.compile(r"\\([" + ESCAPABLE + r"])") + + +def stripEscape(string: str) -> str: + """Strip escape \\ characters""" + return ESCAPE_CHAR.sub(r"\1", string) + + +# ////////////////////////////////////////////////////////////////////////////// + +# TODO This section changed quite a lot, should re-check + +# UNESCAPE_HTML_RE = re.compile(r"\\&(?=(amp\;|lt\;|gt\;|quot\;))") +# ESCAPE_AND_HTML = re.compile(r"&(?!(amp\;|lt\;|gt\;|quot\;))") +# HTML_ESCAPE_REPLACE_RE = re.compile(r'[&<>"]') + + +# def escapeHtml(string: str): + +# if HTML_ESCAPE_REPLACE_RE.search(string): + +# string = UNESCAPE_HTML_RE.sub("&", string) +# string = ESCAPE_AND_HTML.sub("&", string) +# for k, v in {"<": "<", ">": ">", '"': """}.items(): +# string = string.replace(k, v) + +# return string + + +def escapeHtml(raw: str) -> str: + # return html.escape(html.unescape(raw)).replace("'", "'") + return html.escape(raw).replace("'", "'") + + +# ////////////////////////////////////////////////////////////////////////////// + +REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]") + + +def escapeRE(string: str) -> str: + string = REGEXP_ESCAPE_RE.sub("\\$&", string) + return string + + +# ////////////////////////////////////////////////////////////////////////////// + + +def isSpace(code: object) -> bool: + return code in {0x09, 0x20} + + +MD_WHITESPACE = { + 0x09, # \t + 0x0A, # \n + 0x0B, # \v + 0x0C, # \f + 0x0D, # \r + 0x20, + 0xA0, + 0x1680, + 0x202F, + 0x205F, + 0x3000, +} + + +def isWhiteSpace(code: int) -> bool: + r"""Zs 
(unicode class) || [\t\f\v\r\n]""" + if code >= 0x2000 and code <= 0x200A: + return True + return code in MD_WHITESPACE + + +# ////////////////////////////////////////////////////////////////////////////// + +UNICODE_PUNCT_RE = re.compile( + r"[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDF55-\uDF59]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]|\uD809[\uDC
70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]" # noqa: E501 +) + + +# Currently without astral characters support. +def isPunctChar(ch: str) -> bool: + return UNICODE_PUNCT_RE.search(ch) is not None + + +MD_ASCII_PUNCT = { + 0x21, # /* ! */ + 0x22, # /* " */ + 0x23, # /* # */ + 0x24, # /* $ */ + 0x25, # /* % */ + 0x26, # /* & */ + 0x27, # /* ' */ + 0x28, # /* ( */ + 0x29, # /* ) */ + 0x2A, # /* * */ + 0x2B, # /* + */ + 0x2C, # /* , */ + 0x2D, # /* - */ + 0x2E, # /* . */ + 0x2F, # /* / */ + 0x3A, # /* : */ + 0x3B, # /* ; */ + 0x3C, # /* < */ + 0x3D, # /* = */ + 0x3E, # /* > */ + 0x3F, # /* ? */ + 0x40, # /* @ */ + 0x5B, # /* [ */ + 0x5C, # /* \ */ + 0x5D, # /* ] */ + 0x5E, # /* ^ */ + 0x5F, # /* _ */ + 0x60, # /* ` */ + 0x7B, # /* { */ + 0x7C, # /* | */ + 0x7D, # /* } */ + 0x7E, # /* ~ */ +} + + +def isMdAsciiPunct(ch: int) -> bool: + """Markdown ASCII punctuation characters. + + :: + + !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \\, ], ^, _, `, {, |, }, or ~ + + See http://spec.commonmark.org/0.15/#ascii-punctuation-character + + Don't confuse with unicode punctuation !!! It lacks some chars in ascii range. + + """ # noqa: E501 + return ch in MD_ASCII_PUNCT + + +def normalizeReference(string: str) -> str: + """Helper to unify [reference labels].""" + # Trim and collapse whitespace + # + string = re.sub(r"\s+", " ", string.strip()) + + # In node v10 'ẞ'.toLowerCase() === 'Ṿ', which is presumed to be a bug + # fixed in v12 (couldn't find any details). + # + # So treat this one as a special case + # (remove this when node v10 is no longer supported). + # + # if ('ẞ'.toLowerCase() === 'Ṿ') { + # str = str.replace(/ẞ/g, 'ß') + # } + + # .toLowerCase().toUpperCase() should get rid of all differences + # between letter variants. 
+ # + # Simple .toLowerCase() doesn't normalize 125 code points correctly, + # and .toUpperCase doesn't normalize 6 of them (list of exceptions: + # İ, ϴ, ẞ, Ω, K, Å - those are already uppercased, but have differently + # uppercased versions). + # + # Here's an example showing how it happens. Lets take greek letter omega: + # uppercase U+0398 (Θ), U+03f4 (ϴ) and lowercase U+03b8 (θ), U+03d1 (ϑ) + # + # Unicode entries: + # 0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8 + # 03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398 + # 03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398 + # 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8 + # + # Case-insensitive comparison should treat all of them as equivalent. + # + # But .toLowerCase() doesn't change ϑ (it's already lowercase), + # and .toUpperCase() doesn't change ϴ (already uppercase). + # + # Applying first lower then upper case normalizes any character: + # '\u0398\u03f4\u03b8\u03d1'.toLowerCase().toUpperCase() === '\u0398\u0398\u0398\u0398' + # + # Note: this is equivalent to unicode case folding; unicode normalization + # is a different step that is not required here. 
+ # + # Final result should be uppercased, because it's later stored in an object + # (this avoid a conflict with Object.prototype members, + # most notably, `__proto__`) + # + return string.lower().upper() diff --git a/markdown_it/helpers/__init__.py b/markdown_it/helpers/__init__.py new file mode 100644 index 0000000..3dbbdd1 --- /dev/null +++ b/markdown_it/helpers/__init__.py @@ -0,0 +1,6 @@ +"""Functions for parsing Links +""" +__all__ = ("parseLinkLabel", "parseLinkDestination", "parseLinkTitle") +from .parse_link_destination import parseLinkDestination +from .parse_link_label import parseLinkLabel +from .parse_link_title import parseLinkTitle diff --git a/markdown_it/helpers/parse_link_destination.py b/markdown_it/helpers/parse_link_destination.py new file mode 100644 index 0000000..58b76f3 --- /dev/null +++ b/markdown_it/helpers/parse_link_destination.py @@ -0,0 +1,86 @@ +""" +Parse link destination +""" + +from ..common.utils import charCodeAt, unescapeAll + + +class _Result: + __slots__ = ("ok", "pos", "lines", "str") + + def __init__(self): + self.ok = False + self.pos = 0 + self.lines = 0 + self.str = "" + + +def parseLinkDestination(string: str, pos: int, maximum: int) -> _Result: + lines = 0 + start = pos + result = _Result() + + if charCodeAt(string, pos) == 0x3C: # /* < */ + pos += 1 + while pos < maximum: + code = charCodeAt(string, pos) + if code == 0x0A: # /* \n */) + return result + if code == 0x3C: # / * < * / + return result + if code == 0x3E: # /* > */) { + result.pos = pos + 1 + result.str = unescapeAll(string[start + 1 : pos]) + result.ok = True + return result + + if code == 0x5C and pos + 1 < maximum: # \ + pos += 2 + continue + + pos += 1 + + # no closing '>' + return result + + # this should be ... } else { ... 
branch + + level = 0 + while pos < maximum: + code = charCodeAt(string, pos) + + if code == 0x20: + break + + # ascii control characters + if code < 0x20 or code == 0x7F: + break + + if code == 0x5C and pos + 1 < maximum: + if charCodeAt(string, pos + 1) == 0x20: + break + pos += 2 + continue + + if code == 0x28: # /* ( */) + level += 1 + if level > 32: + return result + + if code == 0x29: # /* ) */) + if level == 0: + break + level -= 1 + + pos += 1 + + if start == pos: + return result + if level != 0: + return result + + result.str = unescapeAll(string[start:pos]) + result.lines = lines + result.pos = pos + result.ok = True + return result diff --git a/markdown_it/helpers/parse_link_label.py b/markdown_it/helpers/parse_link_label.py new file mode 100644 index 0000000..20e3c14 --- /dev/null +++ b/markdown_it/helpers/parse_link_label.py @@ -0,0 +1,44 @@ +""" +Parse link label + +this function assumes that first character ("[") already matches +returns the end of the label + +""" +from markdown_it.rules_inline import StateInline + + +def parseLinkLabel(state: StateInline, start: int, disableNested: bool = False) -> int: + + labelEnd = -1 + oldPos = state.pos + found = False + + state.pos = start + 1 + level = 1 + + while state.pos < state.posMax: + marker = state.srcCharCode[state.pos] + if marker == 0x5D: # /* ] */) + level -= 1 + if level == 0: + found = True + break + + prevPos = state.pos + state.md.inline.skipToken(state) + if marker == 0x5B: # /* [ */) + if prevPos == state.pos - 1: + # increase level if we find text `[`, + # which is not a part of any token + level += 1 + elif disableNested: + state.pos = oldPos + return -1 + if found: + labelEnd = state.pos + + # restore old state + state.pos = oldPos + + return labelEnd diff --git a/markdown_it/helpers/parse_link_title.py b/markdown_it/helpers/parse_link_title.py new file mode 100644 index 0000000..842c83b --- /dev/null +++ b/markdown_it/helpers/parse_link_title.py @@ -0,0 +1,60 @@ +"""Parse link title +""" 
+from ..common.utils import charCodeAt, unescapeAll + + +class _Result: + __slots__ = ("ok", "pos", "lines", "str") + + def __init__(self): + self.ok = False + self.pos = 0 + self.lines = 0 + self.str = "" + + def __str__(self): + return self.str + + +def parseLinkTitle(string: str, pos: int, maximum: int) -> _Result: + lines = 0 + start = pos + result = _Result() + + if pos >= maximum: + return result + + marker = charCodeAt(string, pos) + + # /* " */ /* ' */ /* ( */ + if marker != 0x22 and marker != 0x27 and marker != 0x28: + return result + + pos += 1 + + # if opening marker is "(", switch it to closing marker ")" + if marker == 0x28: + marker = 0x29 + + while pos < maximum: + code = charCodeAt(string, pos) + if code == marker: + title = string[start + 1 : pos] + title = unescapeAll(title) + result.pos = pos + 1 + result.lines = lines + result.str = title + result.ok = True + return result + elif code == 0x28 and marker == 0x29: # /* ( */ /* ) */ + return result + elif code == 0x0A: + lines += 1 + elif code == 0x5C and pos + 1 < maximum: # /* \ */ + pos += 1 + if charCodeAt(string, pos) == 0x0A: + lines += 1 + + pos += 1 + + return result diff --git a/markdown_it/main.py b/markdown_it/main.py new file mode 100644 index 0000000..7faac5a --- /dev/null +++ b/markdown_it/main.py @@ -0,0 +1,331 @@ +from __future__ import annotations + +from collections.abc import Callable, Generator, Iterable, Mapping, MutableMapping +from contextlib import contextmanager +from typing import Any + +from . 
class MarkdownIt:
    """Main parser class, tying together the core, block and inline parsers
    and a renderer."""

    def __init__(
        self,
        config: str | Mapping = "commonmark",
        options_update: Mapping | None = None,
        *,
        renderer_cls: Callable[[MarkdownIt], RendererProtocol] = RendererHTML,
    ):
        """Main parser class

        :param config: name of configuration to load or a pre-defined dictionary
        :param options_update: dictionary that will be merged into ``config["options"]``
        :param renderer_cls: the class to load as the renderer:
            ``self.renderer = renderer_cls(self)``
        """
        # add modules
        self.utils = utils
        self.helpers: Any = helpers

        # initialise classes
        self.inline = ParserInline()
        self.block = ParserBlock()
        self.core = ParserCore()
        self.renderer = renderer_cls(self)
        self.linkify = linkify_it.LinkifyIt() if linkify_it else None

        # set the configuration
        if options_update and not isinstance(options_update, Mapping):
            # catch signature change where renderer_cls was not used as a key-word
            raise TypeError(
                f"options_update should be a mapping: {options_update}"
                "\n(Perhaps you intended this to be the renderer_cls?)"
            )
        self.configure(config, options_update=options_update)

    def __repr__(self) -> str:
        return f"{self.__class__.__module__}.{self.__class__.__name__}()"

    def __getitem__(self, name: str) -> Any:
        """Return the component called ``name``
        (``inline``, ``block``, ``core`` or ``renderer``).

        :raises KeyError: for any other name
        """
        return {
            "inline": self.inline,
            "block": self.block,
            "core": self.core,
            "renderer": self.renderer,
        }[name]

    def set(self, options: MutableMapping) -> None:
        """Set parser options (in the same format as in constructor).
        Probably, you will never need it, but you can change options after constructor call.

        __Note:__ To achieve the best possible performance, don't modify a
        `markdown-it` instance options on the fly. If you need multiple configurations
        it's best to create multiple instances and initialize each with separate config.
        """
        self.options = OptionsDict(options)

    def configure(
        self, presets: str | Mapping, options_update: Mapping | None = None
    ) -> MarkdownIt:
        """Batch load of all options and component settings.
        This is an internal method, and you probably will not need it.
        But if you will - see available presets and data structure
        [here](https://github.com/markdown-it/markdown-it/tree/master/lib/presets)

        We strongly recommend to use presets instead of direct config loads.
        That will give better compatibility with next versions.

        :param presets: preset name, or a configuration mapping
        :param options_update: options merged over the preset's ``"options"``
        :raises KeyError: if ``presets`` is an unknown preset name
        :raises ValueError: if the resulting configuration is empty
        """
        if isinstance(presets, str):
            if presets not in _PRESETS:
                raise KeyError(f"Wrong `markdown-it` preset '{presets}', check name")
            config = _PRESETS[presets]
        else:
            config = presets

        if not config:
            raise ValueError("Wrong `markdown-it` config, can't be empty")

        options = config.get("options", {}) or {}
        if options_update:
            options = {**options, **options_update}

        self.set(options)

        if "components" in config:
            for name, component in config["components"].items():
                rules = component.get("rules", None)
                if rules:
                    self[name].ruler.enableOnly(rules)
                rules2 = component.get("rules2", None)
                if rules2:
                    self[name].ruler2.enableOnly(rules2)

        return self

    def get_all_rules(self) -> dict[str, list[str]]:
        """Return the names of all rules (active or not), keyed by chain."""
        rules = {
            chain: self[chain].ruler.get_all_rules()
            for chain in ["core", "block", "inline"]
        }
        rules["inline2"] = self.inline.ruler2.get_all_rules()
        return rules

    def get_active_rules(self) -> dict[str, list[str]]:
        """Return the names of all active rules, keyed by chain."""
        rules = {
            chain: self[chain].ruler.get_active_rules()
            for chain in ["core", "block", "inline"]
        }
        rules["inline2"] = self.inline.ruler2.get_active_rules()
        return rules

    def enable(
        self, names: str | Iterable[str], ignoreInvalid: bool = False
    ) -> MarkdownIt:
        """Enable list or rules. (chainable)

        :param names: rule name or list of rule names to enable.
        :param ignoreInvalid: set `true` to ignore errors when rule not found.

        It will automatically find appropriate components,
        containing rules with given names. If rule not found, and `ignoreInvalid`
        not set - throws exception.

        Example::

            md = MarkdownIt().enable(['sub', 'sup']).disable('smartquotes')

        """
        # Materialise once: `names` is walked by every ruler and again for the
        # "missed" check, so a one-shot iterable would otherwise be exhausted.
        names = [names] if isinstance(names, str) else list(names)

        result = []
        for chain in ["core", "block", "inline"]:
            result.extend(self[chain].ruler.enable(names, True))
        result.extend(self.inline.ruler2.enable(names, True))

        missed = [name for name in names if name not in result]
        if missed and not ignoreInvalid:
            raise ValueError(f"MarkdownIt. Failed to enable unknown rule(s): {missed}")

        return self

    def disable(
        self, names: str | Iterable[str], ignoreInvalid: bool = False
    ) -> MarkdownIt:
        """The same as [[MarkdownIt.enable]], but turn specified rules off. (chainable)

        :param names: rule name or list of rule names to disable.
        :param ignoreInvalid: set `true` to ignore errors when rule not found.

        """
        # See `enable`: materialise so that one-shot iterables work.
        names = [names] if isinstance(names, str) else list(names)

        result = []
        for chain in ["core", "block", "inline"]:
            result.extend(self[chain].ruler.disable(names, True))
        result.extend(self.inline.ruler2.disable(names, True))

        missed = [name for name in names if name not in result]
        if missed and not ignoreInvalid:
            raise ValueError(f"MarkdownIt. Failed to disable unknown rule(s): {missed}")
        return self

    @contextmanager
    def reset_rules(self) -> Generator[None, None, None]:
        """A context manager, that will reset the current enabled rules on exit.

        The rules are restored even when the managed block raises.
        """
        chain_rules = self.get_active_rules()
        try:
            yield
        finally:
            # An exception from the `with` body is re-raised at the `yield`,
            # so the restore has to live in `finally` to run in that case too.
            for chain, rules in chain_rules.items():
                if chain != "inline2":
                    self[chain].ruler.enableOnly(rules)
            self.inline.ruler2.enableOnly(chain_rules["inline2"])

    def add_render_rule(self, name: str, function: Callable, fmt: str = "html") -> None:
        """Add a rule for rendering a particular Token type.

        Only applied when ``renderer.__output__ == fmt``
        """
        if self.renderer.__output__ == fmt:
            # bind the plain function to the renderer so it receives it as `self`
            self.renderer.rules[name] = function.__get__(self.renderer)  # type: ignore

    def use(self, plugin: Callable, *params, **options) -> MarkdownIt:
        """Load specified plugin with given params into current parser instance. (chainable)

        It's just a sugar to call `plugin(md, params)` with curring.

        Example::

            def func(tokens, idx):
                tokens[idx].content = tokens[idx].content.replace('foo', 'bar')
            md = MarkdownIt().use(plugin, 'foo_replace', 'text', func)

        """
        plugin(self, *params, **options)
        return self

    def parse(self, src: str, env: MutableMapping | None = None) -> list[Token]:
        """Parse the source string to a token stream

        :param src: source string
        :param env: environment sandbox

        Parse input string and return list of block tokens (special token type
        "inline" will contain list of inline tokens).

        `env` is used to pass data between "distributed" rules and return additional
        metadata like reference info, needed for the renderer. It also can be used to
        inject data in specific cases. Usually, you will be ok to pass `{}`,
        and then pass updated object to renderer.

        :raises TypeError: if ``env`` is not a MutableMapping or ``src`` not a string
        """
        env = {} if env is None else env
        if not isinstance(env, MutableMapping):
            raise TypeError(f"Input data should be a MutableMapping, not {type(env)}")
        if not isinstance(src, str):
            raise TypeError(f"Input data should be a string, not {type(src)}")
        state = StateCore(src, self, env)
        self.core.process(state)
        return state.tokens

    def render(self, src: str, env: MutableMapping | None = None) -> Any:
        """Render markdown string into html. It does all magic for you :).

        :param src: source string
        :param env: environment sandbox
        :returns: The output of the loaded renderer

        `env` can be used to inject additional metadata (`{}` by default).
        But you will not need it with high probability. See also comment
        in [[MarkdownIt.parse]].
        """
        env = {} if env is None else env
        return self.renderer.render(self.parse(src, env), self.options, env)

    def parseInline(self, src: str, env: MutableMapping | None = None) -> list[Token]:
        """The same as [[MarkdownIt.parse]] but skip all block rules.

        :param src: source string
        :param env: environment sandbox

        It returns the
        block tokens list with the single `inline` element, containing parsed inline
        tokens in `children` property. Also updates `env` object.

        :raises TypeError: if ``env`` is not a MutableMapping or ``src`` not a string
        """
        env = {} if env is None else env
        if not isinstance(env, MutableMapping):
            raise TypeError(f"Input data should be an MutableMapping, not {type(env)}")
        if not isinstance(src, str):
            raise TypeError(f"Input data should be a string, not {type(src)}")
        state = StateCore(src, self, env)
        state.inlineMode = True
        self.core.process(state)
        return state.tokens

    def renderInline(self, src: str, env: MutableMapping | None = None) -> Any:
        """Similar to [[MarkdownIt.render]] but for single paragraph content.

        :param src: source string
        :param env: environment sandbox

        Similar to [[MarkdownIt.render]] but for single paragraph content. Result
        will NOT be wrapped into `<p>` tags.
        """
        env = {} if env is None else env
        return self.renderer.render(self.parseInline(src, env), self.options, env)

    # link methods

    def validateLink(self, url: str) -> bool:
        """Validate if the URL link is allowed in output.

        This validator can prohibit more than really needed to prevent XSS.
        It's a tradeoff to keep code simple and to be secure by default.

        Note: the url should be normalized at this point, and existing entities decoded.
        """
        return normalize_url.validateLink(url)

    def normalizeLink(self, url: str) -> str:
        """Normalize destination URLs in links

        ::

            [label]:   destination   'title'
                    ^^^^^^^^^^^
        """
        return normalize_url.normalizeLink(url)

    def normalizeLinkText(self, link: str) -> str:
        """Normalize autolink content

        ::

            <destination>
            ~~~~~~~~~~~
        """
        return normalize_url.normalizeLinkText(link)
class ParserBlock:
    """
    ParserBlock#ruler -> Ruler

    [[Ruler]] instance. Keep configuration of block rules.
    """

    def __init__(self):
        # Register every entry of the module-level `_rules` table, in order.
        self.ruler = Ruler()
        for data in _rules:
            name = data[0]
            rule = data[1]
            # the optional third element lists the rules this one can terminate
            self.ruler.push(name, rule, {"alt": data[2] if len(data) > 2 else []})

    def tokenize(
        self, state: StateBlock, startLine: int, endLine: int, silent: bool = False
    ) -> None:
        """Generate tokens for input range.

        :param state: block state; its ``line`` and ``tight`` are updated here
        :param startLine: first line to process
        :param endLine: line at which processing stops (exclusive)
        :param silent: not read by this method; rules are always called
            with ``False``
        """
        rules = self.ruler.getRules("")
        line = startLine
        maxNesting = state.md.options.maxNesting
        hasEmptyLines = False

        while line < endLine:
            state.line = line = state.skipEmptyLines(line)
            if line >= endLine:
                break
            if state.sCount[line] < state.blkIndent:
                # Termination condition for nested calls.
                # Nested calls currently used for blockquotes & lists
                break
            if state.level >= maxNesting:
                # If nesting level exceeded - skip tail to the end.
                # That's not ordinary situation and we should not care about content.
                state.line = endLine
                break

            # Try all possible rules.
            # On success, rule should:
            # - update `state.line`
            # - update `state.tokens`
            # - return True
            for rule in rules:
                if rule(state, line, endLine, False):
                    break

            # set state.tight if we had an empty line before current tag
            # i.e. latest empty line should not count
            state.tight = not hasEmptyLines

            # continue from wherever the rule left the state
            line = state.line

            # paragraph might "eat" one newline after it in nested lists
            if (line - 1) < endLine and state.isEmpty(line - 1):
                hasEmptyLines = True

            if line < endLine and state.isEmpty(line):
                hasEmptyLines = True
                line += 1
                state.line = line

    def parse(
        self,
        src: str,
        md,
        env,
        outTokens: list[Token],
        ords: tuple[int, ...] | None = None,
    ) -> list[Token] | None:
        """Process input string and push block tokens into `outTokens`.

        :returns: the tokens of the state that was built, or ``None`` when
            ``src`` is empty (nothing is tokenized in that case)
        """
        if not src:
            return None
        state = StateBlock(src, md, env, outTokens, ords)
        self.tokenize(state, state.line, state.lineMax)
        return state.tokens


class ParserCore:
    """Top-level rules executor. Glues block/inline parsers and does
    intermediate transformations."""

    def __init__(self):
        # Register every entry of the module-level `_rules` table, in order.
        self.ruler = Ruler()
        for name, rule in _rules:
            self.ruler.push(name, rule)

    def process(self, state: StateCore) -> None:
        """Executes core chain rules."""
        for rule in self.ruler.getRules(""):
            rule(state)
class ParserInline:
    """Tokenizes paragraph content.

    ``ruler`` holds the tokenizing rules; ``ruler2`` holds the rules that are
    run once over the finished state for post-processing.
    """

    def __init__(self):
        self.ruler = Ruler()
        for name, rule in _rules:
            self.ruler.push(name, rule)
        # Second ruler used for post-processing (e.g. in emphasis-like rules)
        self.ruler2 = Ruler()
        for name, rule2 in _rules2:
            self.ruler2.push(name, rule2)

    def skipToken(self, state: StateInline) -> None:
        """Skip single token by running all rules in validation mode.

        Nothing is returned; the outcome is the new ``state.pos``, which is
        also memoised in ``state.cache`` keyed by the starting position.
        """
        ok = False
        pos = state.pos
        rules = self.ruler.getRules("")
        maxNesting = state.md.options["maxNesting"]
        cache = state.cache

        if pos in cache:
            state.pos = cache[pos]
            return

        if state.level < maxNesting:
            for rule in rules:
                # Increment state.level and decrement it later to limit recursion.
                # It's harmless to do here, because no tokens are created.
                # But ideally, we'd need a separate private state variable for this purpose.
                state.level += 1
                ok = rule(state, True)
                state.level -= 1
                if ok:
                    break
        else:
            # Too much nesting, just skip until the end of the paragraph.
            #
            # NOTE: this will cause links to behave incorrectly in the following case,
            #       when an amount of `[` is exactly equal to `maxNesting + 1`:
            #
            #       [[[[[[[[[[[[[[[[[[[[[foo]()
            #
            # TODO: remove this workaround when CM standard will allow nested links
            #       (we can replace it by preventing links from being parsed in
            #       validation mode)
            #
            state.pos = state.posMax

        if not ok:
            state.pos += 1
        cache[pos] = state.pos

    def tokenize(self, state: StateInline) -> None:
        """Generate tokens for input range.

        Characters that no rule accepts are accumulated in ``state.pending``,
        which is flushed with ``state.pushPending()`` at the end.
        """
        rules = self.ruler.getRules("")
        end = state.posMax
        maxNesting = state.md.options["maxNesting"]

        while state.pos < end:
            # Reset on every pass: when the nesting limit is reached the rules
            # are not run at all, and a stale `True` from an earlier pass would
            # make the loop `continue` forever without advancing `state.pos`.
            ok = False

            # Try all possible rules.
            # On success, rule should:
            #
            # - update `state.pos`
            # - update `state.tokens`
            # - return true

            if state.level < maxNesting:
                for rule in rules:
                    ok = rule(state, False)
                    if ok:
                        break

            if ok:
                if state.pos >= end:
                    break
                continue

            state.pending += state.src[state.pos]
            state.pos += 1

        if state.pending:
            state.pushPending()

    def parse(self, src: str, md, env, tokens: list[Token]) -> list[Token]:
        """Process input string and push inline tokens into `tokens`"""
        state = StateInline(src, md, env, tokens)
        self.tokenize(state)
        rules2 = self.ruler2.getRules("")
        for rule in rules2:
            rule(state)
        return state.tokens
`for {i=1;i<x;i++} {}` + - | + `env` is a common Python dictionary, and so does not have attribute access to keys, + as with JavaScript dictionaries. + `options` have attribute access only to core markdownit configuration options + - | + `Token.attrs` is a dictionary, instead of a list of lists. + Upstream the list format is only used to guarantee order: https://github.com/markdown-it/markdown-it/issues/142, + but in Python 3.7+ order of dictionaries is guaranteed. + One should anyhow use the `attrGet`, `attrSet`, `attrPush` and `attrJoin` methods + to manipulate `Token.attrs`, which have an identical signature to those upstream. + - Use python version of `charCodeAt` + - | + Reduce use of charCodeAt() by storing char codes in a srcCharCodes attribute for state + objects and sharing those whenever possible + This provides a significant performance boost + - | + In markdown_it/rules_block/reference.py, + record line range in state.env["references"] and add state.env["duplicate_refs"] + This is to allow renderers to report on issues regarding references + - | + The `MarkdownIt.__init__` signature is slightly different for updating options, + since you must always specify the config first, e.g. + use `MarkdownIt("commonmark", {"html": False})` instead of `MarkdownIt({"html": False})` + - The default configuration preset for `MarkdownIt` is "commonmark" not "default" + - Allow custom renderer to be passed to `MarkdownIt` + - | + change render method signatures + `func(tokens, idx, options, env, slf)` to + `func(self, tokens, idx, options, env)` + - | + Extensions add render methods by format + `MarkdownIt.add_render_rule(name, function, fmt="html")`, + rather than `MarkdownIt.renderer.rules[name] = function` + and renderers should declare a class property `__output__ = "html"`. 
class gfm_like:
    """GitHub Flavoured Markdown (GFM) like.

    This adds the linkify, table and strikethrough components to CommonMark.

    Note, it lacks task-list items and raw HTML filtering,
    to meet the full GFM specification
    (see https://github.github.com/gfm/#autolinks-extension-).
    """

    @staticmethod
    def make():
        """Return the commonmark configuration extended with the GFM-like
        rules and with the ``linkify`` and ``html`` options switched on."""
        config = commonmark.make()
        components = config["components"]

        components["core"]["rules"].append("linkify")
        components["block"]["rules"].append("table")
        # strikethrough has both a tokenizing and a post-processing rule
        for chain in ("rules", "rules2"):
            components["inline"][chain].append("strikethrough")

        options = config["options"]
        options["linkify"] = True
        options["html"] = True
        return config
def make():
    """Build the Commonmark preset.

    A new dictionary (with new nested lists) is returned on every call.
    """
    options = {
        # Internal protection, recursion limit
        "maxNesting": 20,
        # Enable HTML tags in source,
        # this is just a shorthand for .enable(["html_inline", "html_block"])
        "html": True,
        # used by the linkify rule: autoconvert URL-like texts to links
        "linkify": False,
        # used by the replacements and smartquotes rules:
        # enable some language-neutral replacements + quotes beautification
        "typographer": False,
        # used by the smartquotes rule:
        # Double + single quotes replacement pairs, when typographer enabled,
        # and smartquotes on. Could be either a String or an Array.
        #
        # For example, you can use '«»„“' for Russian, '„“‚‘' for German,
        # and ['«\xA0', '\xA0»', '‹\xA0', '\xA0›'] for French (including nbsp).
        "quotes": "\u201c\u201d\u2018\u2019",  # /* “”‘’ */
        # Renderer specific; these options are used directly in the HTML renderer
        "xhtmlOut": True,  # Use '/' to close single tags (<br />)
        "breaks": False,  # Convert '\n' in paragraphs into <br>
        "langPrefix": "language-",  # CSS language prefix for fenced blocks
        # Highlighter function. Should return escaped HTML,
        # or '' if the source string is not changed and should be escaped externally.
        # If result starts with <pre... internal wrapper is skipped.
        #
        # function (/*str, lang, attrs*/) { return ''; }
        #
        "highlight": None,
    }

    core_rules = ["normalize", "block", "inline"]
    block_rules = [
        "blockquote",
        "code",
        "fence",
        "heading",
        "hr",
        "html_block",
        "lheading",
        "list",
        "reference",
        "paragraph",
    ]
    inline_rules = [
        "autolink",
        "backticks",
        "emphasis",
        "entity",
        "escape",
        "html_inline",
        "image",
        "link",
        "newline",
        "text",
    ]
    inline_rules2 = ["balance_pairs", "emphasis", "text_collapse"]

    components = {
        "core": {"rules": core_rules},
        "block": {"rules": block_rules},
        "inline": {"rules": inline_rules, "rules2": inline_rules2},
    }
    return {"options": options, "components": components}
+ # If result starts with <pre... internal wrapper is skipped. + # + # function (/*str, lang, attrs*/) { return ''; } + # + "highlight": None, + }, + "components": {"core": {}, "block": {}, "inline": {}}, + } diff --git a/markdown_it/presets/zero.py b/markdown_it/presets/zero.py new file mode 100644 index 0000000..af1d9c7 --- /dev/null +++ b/markdown_it/presets/zero.py @@ -0,0 +1,39 @@ +""" +"Zero" preset, with nothing enabled. Useful for manual configuring of simple +modes. For example, to parse bold/italic only. +""" + + +def make(): + return { + "options": { + "maxNesting": 20, # Internal protection, recursion limit + "html": False, # Enable HTML tags in source + # this is just a shorthand for .disable(["html_inline", "html_block"]) + # used by the linkify rule: + "linkify": False, # autoconvert URL-like texts to links + # used by the replacements and smartquotes rules: + # Enable some language-neutral replacements + quotes beautification + "typographer": False, + # used by the smartquotes rule: + # Double + single quotes replacement pairs, when typographer enabled, + # and smartquotes on. Could be either a String or an Array. + # For example, you can use '«»„“' for Russian, '„“‚‘' for German, + # and ['«\xA0', '\xA0»', '‹\xA0', '\xA0›'] for French (including nbsp). + "quotes": "\u201c\u201d\u2018\u2019", # /* “”‘’ */ + # Renderer specific; these options are used directly in the HTML renderer + "xhtmlOut": False, # Use '/' to close single tags (<br />) + "breaks": False, # Convert '\n' in paragraphs into <br> + "langPrefix": "language-", # CSS language prefix for fenced blocks + # Highlighter function. Should return escaped HTML, + # or '' if the source string is not changed and should be escaped externally. + # If result starts with <pre... internal wrapper is skipped. 
+ # function (/*str, lang, attrs*/) { return ''; } + "highlight": None, + }, + "components": { + "core": {"rules": ["normalize", "block", "inline"]}, + "block": {"rules": ["paragraph"]}, + "inline": {"rules": ["text"], "rules2": ["balance_pairs", "text_collapse"]}, + }, + } diff --git a/markdown_it/py.typed b/markdown_it/py.typed new file mode 100644 index 0000000..7632ecf --- /dev/null +++ b/markdown_it/py.typed @@ -0,0 +1 @@ +# Marker file for PEP 561 diff --git a/markdown_it/renderer.py b/markdown_it/renderer.py new file mode 100644 index 0000000..b8bfe4d --- /dev/null +++ b/markdown_it/renderer.py @@ -0,0 +1,339 @@ +""" +class Renderer + +Generates HTML from parsed token stream. Each instance has independent +copy of rules. Those can be rewritten with ease. Also, you can add new +rules if you create plugin and adds new token types. +""" +from __future__ import annotations + +from collections.abc import MutableMapping, Sequence +import inspect +from typing import Any, ClassVar + +from .common.utils import escapeHtml, unescapeAll +from .token import Token +from .utils import OptionsDict + +try: + from typing import Protocol +except ImportError: # Python <3.8 doesn't have `Protocol` in the stdlib + from typing_extensions import Protocol # type: ignore[misc] + + +class RendererProtocol(Protocol): + __output__: ClassVar[str] + + def render( + self, tokens: Sequence[Token], options: OptionsDict, env: MutableMapping + ) -> Any: + ... + + +class RendererHTML(RendererProtocol): + """Contains render rules for tokens. Can be updated and extended. + + Example: + + Each rule is called as independent static function with fixed signature: + + :: + + class Renderer: + def token_type_name(self, tokens, idx, options, env) { + # ... 
+ return renderedHTML + + :: + + class CustomRenderer(RendererHTML): + def strong_open(self, tokens, idx, options, env): + return '<b>' + def strong_close(self, tokens, idx, options, env): + return '</b>' + + md = MarkdownIt(renderer_cls=CustomRenderer) + + result = md.render(...) + + See https://github.com/markdown-it/markdown-it/blob/master/lib/renderer.js + for more details and examples. + """ + + __output__ = "html" + + def __init__(self, parser=None): + self.rules = { + k: v + for k, v in inspect.getmembers(self, predicate=inspect.ismethod) + if not (k.startswith("render") or k.startswith("_")) + } + + def render( + self, tokens: Sequence[Token], options: OptionsDict, env: MutableMapping + ) -> str: + """Takes token stream and generates HTML. + + :param tokens: list on block tokens to render + :param options: params of parser instance + :param env: additional data from parsed input + + """ + result = "" + + for i, token in enumerate(tokens): + + if token.type == "inline": + assert token.children is not None + result += self.renderInline(token.children, options, env) + elif token.type in self.rules: + result += self.rules[token.type](tokens, i, options, env) + else: + result += self.renderToken(tokens, i, options, env) + + return result + + def renderInline( + self, tokens: Sequence[Token], options: OptionsDict, env: MutableMapping + ) -> str: + """The same as ``render``, but for single token of `inline` type. + + :param tokens: list on block tokens to render + :param options: params of parser instance + :param env: additional data from parsed input (references, for example) + """ + result = "" + + for i, token in enumerate(tokens): + if token.type in self.rules: + result += self.rules[token.type](tokens, i, options, env) + else: + result += self.renderToken(tokens, i, options, env) + + return result + + def renderToken( + self, + tokens: Sequence[Token], + idx: int, + options: OptionsDict, + env: MutableMapping, + ) -> str: + """Default token renderer. 
+ + Can be overridden by custom function + + :param idx: token index to render + :param options: params of parser instance + """ + result = "" + needLf = False + token = tokens[idx] + + # Tight list paragraphs + if token.hidden: + return "" + + # Insert a newline between hidden paragraph and subsequent opening + # block-level tag. + # + # For example, here we should insert a newline before blockquote: + # - a + # > + # + if token.block and token.nesting != -1 and idx and tokens[idx - 1].hidden: + result += "\n" + + # Add token name, e.g. `<img` + result += ("</" if token.nesting == -1 else "<") + token.tag + + # Encode attributes, e.g. `<img src="foo"` + result += self.renderAttrs(token) + + # Add a slash for self-closing tags, e.g. `<img src="foo" /` + if token.nesting == 0 and options["xhtmlOut"]: + result += " /" + + # Check if we need to add a newline after this tag + if token.block: + needLf = True + + if token.nesting == 1: + if idx + 1 < len(tokens): + nextToken = tokens[idx + 1] + + if nextToken.type == "inline" or nextToken.hidden: + # Block-level tag containing an inline tag. + # + needLf = False + + elif nextToken.nesting == -1 and nextToken.tag == token.tag: + # Opening tag + closing tag of the same type. E.g. `<li></li>`. + # + needLf = False + + result += ">\n" if needLf else ">" + + return result + + @staticmethod + def renderAttrs(token: Token) -> str: + """Render token attributes to string.""" + result = "" + + for key, value in token.attrItems(): + result += " " + escapeHtml(key) + '="' + escapeHtml(str(value)) + '"' + + return result + + def renderInlineAsText( + self, + tokens: Sequence[Token] | None, + options: OptionsDict, + env: MutableMapping, + ) -> str: + """Special kludge for image `alt` attributes to conform CommonMark spec. + + Don't try to use it! Spec requires to show `alt` content with stripped markup, + instead of simple escaping. 
def code_inline(self, tokens: Sequence[Token], idx: int, options, env) -> str:
    """Render the inline code token at ``idx`` as a ``<code>`` element.

    The token's attributes are rendered with ``self.renderAttrs`` and its
    content is escaped with ``escapeHtml``. ``options`` and ``env`` are
    accepted to match the common rule signature but are not read here.
    """
    tok = tokens[idx]
    attrs = self.renderAttrs(tok)
    body = escapeHtml(tok.content)
    return "<code" + attrs + ">" + body + "</code>"
def softbreak(
    self, tokens: Sequence[Token], idx: int, options: OptionsDict, *args
) -> str:
    """Render a soft line break.

    A plain newline is produced unless ``options.breaks`` is truthy; in that
    case a ``<br>`` tag followed by a newline is produced, written in the
    self-closing form when ``options.xhtmlOut`` is truthy.
    """
    if not options.breaks:
        return "\n"
    if options.xhtmlOut:
        return "<br />\n"
    return "<br>\n"
class Ruler:
    """Ordered registry of named rule functions.

    Rules are kept in insertion order, can be enabled or disabled by name and
    may be assigned to additional named chains through their ``alt`` list.
    The per-chain lists of active rule functions are cached and rebuilt
    lazily by :meth:`getRules` after any modification.
    """

    def __init__(self):
        # List of added rules.
        self.__rules__: list[Rule] = []
        # Cached rule chains: chain name ('' for the default chain) mapped to
        # the list of enabled rule functions belonging to that chain.
        # ``None`` means the cache is stale and must be recompiled.
        self.__cache__: dict[str, list[RuleFunc]] | None = None

    def __find__(self, name: str) -> int:
        """Find rule index by name; return -1 when there is no such rule."""
        for i, rule in enumerate(self.__rules__):
            if rule.name == name:
                return i
        return -1

    def __compile__(self) -> None:
        """Build rules lookup cache"""
        enabled = [rule for rule in self.__rules__ if rule.enabled]
        # Collect unique chain names; '' (the default chain) always exists.
        chains = {""}
        for rule in enabled:
            chains.update(rule.alt)
        # The default chain holds every enabled rule; a named chain holds
        # only the enabled rules that list it in ``alt``.
        self.__cache__ = {
            chain: [rule.fn for rule in enabled if not chain or chain in rule.alt]
            for chain in chains
        }

    def _set_enabled(
        self, names: str | Iterable[str], enabled: bool, ignoreInvalid: bool
    ) -> list[str]:
        """Set the ``enabled`` flag of the named rules; shared by enable/disable."""
        if isinstance(names, str):
            names = [names]
        # Invalidate up front: if a KeyError is raised part-way through, the
        # flags already flipped must not be hidden behind a stale cache.
        self.__cache__ = None
        result = []
        for name in names:
            idx = self.__find__(name)
            if idx < 0:
                if ignoreInvalid:
                    continue
                raise KeyError(f"Rules manager: invalid rule name {name}")
            self.__rules__[idx].enabled = enabled
            result.append(name)
        return result

    def at(self, ruleName: str, fn: RuleFunc, options=None):
        """Replace rule by name with new function & options.

        :param ruleName: rule name to replace.
        :param fn: new rule function.
        :param options: new rule options (not mandatory).
        :raises: KeyError if name not found
        """
        index = self.__find__(ruleName)
        options = options or {}
        if index == -1:
            raise KeyError(f"Parser rule not found: {ruleName}")
        self.__rules__[index].fn = fn
        self.__rules__[index].alt = options.get("alt", [])
        self.__cache__ = None

    def before(self, beforeName: str, ruleName: str, fn: RuleFunc, options=None):
        """Add new rule to chain before one with given name.

        :param beforeName: new rule will be added before this one.
        :param ruleName: name of the new rule.
        :param fn: new rule function.
        :param options: new rule options (not mandatory).
        :raises: KeyError if name not found
        """
        index = self.__find__(beforeName)
        options = options or {}
        if index == -1:
            raise KeyError(f"Parser rule not found: {beforeName}")
        self.__rules__.insert(index, Rule(ruleName, True, fn, options.get("alt", [])))
        self.__cache__ = None

    def after(self, afterName: str, ruleName: str, fn: RuleFunc, options=None):
        """Add new rule to chain after one with given name.

        :param afterName: new rule will be added after this one.
        :param ruleName: name of the new rule.
        :param fn: new rule function.
        :param options: new rule options (not mandatory).
        :raises: KeyError if name not found
        """
        index = self.__find__(afterName)
        options = options or {}
        if index == -1:
            raise KeyError(f"Parser rule not found: {afterName}")
        self.__rules__.insert(
            index + 1, Rule(ruleName, True, fn, options.get("alt", []))
        )
        self.__cache__ = None

    def push(self, ruleName: str, fn: RuleFunc, options=None):
        """Push new rule to the end of chain.

        :param ruleName: name of the new rule.
        :param fn: new rule function.
        :param options: new rule options (not mandatory).

        """
        self.__rules__.append(Rule(ruleName, True, fn, (options or {}).get("alt", [])))
        self.__cache__ = None

    def enable(self, names: str | Iterable[str], ignoreInvalid: bool = False):
        """Enable rules with given names.

        :param names: name or list of rule names to enable.
        :param ignoreInvalid: ignore errors when rule not found
        :raises: KeyError if name not found and not ignoreInvalid
        :return: list of found rule names
        """
        return self._set_enabled(names, True, ignoreInvalid)

    def enableOnly(self, names: str | Iterable[str], ignoreInvalid: bool = False):
        """Enable rules with given names, and disable everything else.

        Every rule is disabled first, so if an unknown name raises, the rules
        stay disabled except for the names processed before the failure.

        :param names: name or list of rule names to enable.
        :param ignoreInvalid: ignore errors when rule not found
        :raises: KeyError if name not found and not ignoreInvalid
        :return: list of found rule names
        """
        if isinstance(names, str):
            names = [names]
        for rule in self.__rules__:
            rule.enabled = False
        # Flags changed above, so the cache is stale even if enable() raises.
        self.__cache__ = None
        return self.enable(names, ignoreInvalid)

    def disable(self, names: str | Iterable[str], ignoreInvalid: bool = False):
        """Disable rules with given names.

        :param names: name or list of rule names to disable.
        :param ignoreInvalid: ignore errors when rule not found
        :raises: KeyError if name not found and not ignoreInvalid
        :return: list of found rule names
        """
        return self._set_enabled(names, False, ignoreInvalid)

    def getRules(self, chainName: str) -> list[RuleFunc]:
        """Return array of active functions (rules) for given chain name.
        It analyzes rules configuration, compiles caches if not exists and returns result.

        Default chain name is `''` (empty string). It can't be skipped.
        That's done intentionally, to keep signature monomorphic for high speed.

        """
        if self.__cache__ is None:
            self.__compile__()
            assert self.__cache__ is not None
        # Chain can be empty, if rules disabled. But we still have to return Array.
        return self.__cache__.get(chainName, []) or []

    def get_all_rules(self) -> list[str]:
        """Return all available rule names."""
        return [r.name for r in self.__rules__]

    def get_active_rules(self) -> list[str]:
        """Return the active rule names."""
        return [r.name for r in self.__rules__ if r.enabled]
+= 1 + + # we know that it's going to be a valid blockquote, + # so no point trying to find the end of it in silent mode + if silent: + return True + + # set offset past spaces and ">" + initial = offset = state.sCount[startLine] + 1 + + try: + second_char_code: int | None = state.srcCharCode[pos] + except IndexError: + second_char_code = None + + # skip one optional space after '>' + if second_char_code == 0x20: # /* space */ + # ' > test ' + # ^ -- position start of line here: + pos += 1 + initial += 1 + offset += 1 + adjustTab = False + spaceAfterMarker = True + elif second_char_code == 0x09: # /* tab */ + spaceAfterMarker = True + + if (state.bsCount[startLine] + offset) % 4 == 3: + # ' >\t test ' + # ^ -- position start of line here (tab has width==1) + pos += 1 + initial += 1 + offset += 1 + adjustTab = False + else: + # ' >\t test ' + # ^ -- position start of line here + shift bsCount slightly + # to make extra space appear + adjustTab = True + + else: + spaceAfterMarker = False + + oldBMarks = [state.bMarks[startLine]] + state.bMarks[startLine] = pos + + while pos < max: + ch = state.srcCharCode[pos] + + if isSpace(ch): + if ch == 0x09: # / tab / + offset += ( + 4 + - (offset + state.bsCount[startLine] + (1 if adjustTab else 0)) % 4 + ) + else: + offset += 1 + + else: + break + + pos += 1 + + oldBSCount = [state.bsCount[startLine]] + state.bsCount[startLine] = ( + state.sCount[startLine] + 1 + (1 if spaceAfterMarker else 0) + ) + + lastLineEmpty = pos >= max + + oldSCount = [state.sCount[startLine]] + state.sCount[startLine] = offset - initial + + oldTShift = [state.tShift[startLine]] + state.tShift[startLine] = pos - state.bMarks[startLine] + + terminatorRules = state.md.block.ruler.getRules("blockquote") + + oldParentType = state.parentType + state.parentType = "blockquote" + + # Search the end of the block + # + # Block ends with either: + # 1. an empty line outside: + # ``` + # > test + # + # ``` + # 2. 
an empty line inside: + # ``` + # > + # test + # ``` + # 3. another tag: + # ``` + # > test + # - - - + # ``` + + # for (nextLine = startLine + 1; nextLine < endLine; nextLine++) { + nextLine = startLine + 1 + while nextLine < endLine: + + # check if it's outdented, i.e. it's inside list item and indented + # less than said list item: + # + # ``` + # 1. anything + # > current blockquote + # 2. checking this line + # ``` + isOutdented = state.sCount[nextLine] < state.blkIndent + + pos = state.bMarks[nextLine] + state.tShift[nextLine] + max = state.eMarks[nextLine] + + if pos >= max: + # Case 1: line is not inside the blockquote, and this line is empty. + break + + evaluatesTrue = state.srcCharCode[pos] == 0x3E and not isOutdented # /* > */ + pos += 1 + if evaluatesTrue: + # This line is inside the blockquote. + + # set offset past spaces and ">" + initial = offset = state.sCount[nextLine] + 1 + + try: + next_char: int | None = state.srcCharCode[pos] + except IndexError: + next_char = None + + # skip one optional space after '>' + if next_char == 0x20: # /* space */ + # ' > test ' + # ^ -- position start of line here: + pos += 1 + initial += 1 + offset += 1 + adjustTab = False + spaceAfterMarker = True + elif next_char == 0x09: # /* tab */ + spaceAfterMarker = True + + if (state.bsCount[nextLine] + offset) % 4 == 3: + # ' >\t test ' + # ^ -- position start of line here (tab has width==1) + pos += 1 + initial += 1 + offset += 1 + adjustTab = False + else: + # ' >\t test ' + # ^ -- position start of line here + shift bsCount slightly + # to make extra space appear + adjustTab = True + + else: + spaceAfterMarker = False + + oldBMarks.append(state.bMarks[nextLine]) + state.bMarks[nextLine] = pos + + while pos < max: + ch = state.srcCharCode[pos] + + if isSpace(ch): + if ch == 0x09: + offset += ( + 4 + - ( + offset + + state.bsCount[nextLine] + + (1 if adjustTab else 0) + ) + % 4 + ) + else: + offset += 1 + else: + break + + pos += 1 + + lastLineEmpty = pos >= max + + 
oldBSCount.append(state.bsCount[nextLine]) + state.bsCount[nextLine] = ( + state.sCount[nextLine] + 1 + (1 if spaceAfterMarker else 0) + ) + + oldSCount.append(state.sCount[nextLine]) + state.sCount[nextLine] = offset - initial + + oldTShift.append(state.tShift[nextLine]) + state.tShift[nextLine] = pos - state.bMarks[nextLine] + + nextLine += 1 + continue + + # Case 2: line is not inside the blockquote, and the last line was empty. + if lastLineEmpty: + break + + # Case 3: another tag found. + terminate = False + + for terminatorRule in terminatorRules: + if terminatorRule(state, nextLine, endLine, True): + terminate = True + break + + if terminate: + # Quirk to enforce "hard termination mode" for paragraphs; + # normally if you call `tokenize(state, startLine, nextLine)`, + # paragraphs will look below nextLine for paragraph continuation, + # but if blockquote is terminated by another tag, they shouldn't + state.lineMax = nextLine + + if state.blkIndent != 0: + # state.blkIndent was non-zero, we now set it to zero, + # so we need to re-calculate all offsets to appear as + # if indent wasn't changed + oldBMarks.append(state.bMarks[nextLine]) + oldBSCount.append(state.bsCount[nextLine]) + oldTShift.append(state.tShift[nextLine]) + oldSCount.append(state.sCount[nextLine]) + state.sCount[nextLine] -= state.blkIndent + + break + + oldBMarks.append(state.bMarks[nextLine]) + oldBSCount.append(state.bsCount[nextLine]) + oldTShift.append(state.tShift[nextLine]) + oldSCount.append(state.sCount[nextLine]) + + # A negative indentation means that this is a paragraph continuation + # + state.sCount[nextLine] = -1 + + nextLine += 1 + + oldIndent = state.blkIndent + state.blkIndent = 0 + + token = state.push("blockquote_open", "blockquote", 1) + token.markup = ">" + token.map = lines = [startLine, 0] + + state.md.block.tokenize(state, startLine, nextLine) + + token = state.push("blockquote_close", "blockquote", -1) + token.markup = ">" + + state.lineMax = oldLineMax + 
def fence(state: StateBlock, startLine: int, endLine: int, silent: bool):
    """Block rule for fenced code (``` lang, ~~~ lang).

    :param state: block parser state
    :param startLine: line on which to look for an opening fence
    :param endLine: first line that is out of bounds for this block
    :param silent: validation mode; report a match without emitting tokens
    :return: ``True`` if ``startLine`` opens a fence, ``False`` otherwise.
        Outside silent mode a ``fence`` token is pushed and ``state.line``
        is moved past the block.
    """
    LOGGER.debug("entering fence: %s, %s, %s, %s", state, startLine, endLine, silent)

    haveEndMarker = False
    pos = state.bMarks[startLine] + state.tShift[startLine]
    maximum = state.eMarks[startLine]

    # if it's indented more than 3 spaces, it should be a code block
    if state.sCount[startLine] - state.blkIndent >= 4:
        return False

    if pos + 3 > maximum:
        return False

    marker = state.srcCharCode[pos]

    # /* ~ */  /* ` */
    if marker != 0x7E and marker != 0x60:
        return False

    # scan marker length
    mem = pos
    pos = state.skipChars(pos, marker)

    length = pos - mem

    if length < 3:
        return False

    markup = state.src[mem:pos]
    params = state.src[pos:maximum]

    # /* ` */ - a backtick fence may not have backticks in its info string
    if marker == 0x60:
        if chr(marker) in params:
            return False

    # Since start is found, we can report success here in validation mode
    if silent:
        return True

    # search end of block
    nextLine = startLine
    srcLen = len(state.srcCharCode)

    while True:
        nextLine += 1
        if nextLine >= endLine:
            # unclosed block should be autoclosed by end of document.
            # also block seems to be autoclosed by end of parent
            break

        pos = mem = state.bMarks[nextLine] + state.tShift[nextLine]
        maximum = state.eMarks[nextLine]

        if pos < maximum and state.sCount[nextLine] < state.blkIndent:
            # non-empty line with negative indent should stop the list:
            # - ```
            #  test
            break

        # The content start of a line can sit exactly at the end of the
        # source; treat that as "not a closing fence" instead of raising
        # IndexError.
        if pos >= srcLen or state.srcCharCode[pos] != marker:
            continue

        if state.sCount[nextLine] - state.blkIndent >= 4:
            # closing fence should be indented less than 4 spaces
            continue

        pos = state.skipChars(pos, marker)

        # closing code fence must be at least as long as the opening one
        if pos - mem < length:
            continue

        # make sure tail has spaces only
        pos = state.skipSpaces(pos)

        if pos < maximum:
            continue

        haveEndMarker = True
        # found!
        break

    # If a fence has heading spaces, they should be removed from its inner block
    length = state.sCount[startLine]

    state.line = nextLine + (1 if haveEndMarker else 0)

    token = state.push("fence", "code", 0)
    token.info = params
    token.content = state.getLines(startLine + 1, nextLine, length, True)
    token.markup = markup
    token.map = [startLine, state.line]

    return True
def hr(state: StateBlock, startLine: int, endLine: int, silent: bool):
    """Block rule for a horizontal rule: at least 3 of ``*``, ``-`` or ``_``.

    The markers must all be the same character and may be mixed with spaces.

    :param state: block parser state
    :param startLine: line to test
    :param endLine: first line that is out of bounds (only logged here)
    :param silent: validation mode; report a match without emitting tokens
    :return: ``True`` if the line is a horizontal rule, ``False`` otherwise.
        Outside silent mode an ``hr`` token is pushed and ``state.line``
        is moved to the following line.
    """
    LOGGER.debug("entering hr: %s, %s, %s, %s", state, startLine, endLine, silent)

    pos = state.bMarks[startLine] + state.tShift[startLine]
    maximum = state.eMarks[startLine]

    # if it's indented more than 3 spaces, it should be a code block
    if state.sCount[startLine] - state.blkIndent >= 4:
        return False

    # A line without content cannot be a rule; bailing out here also avoids
    # indexing past the end of the source.
    if pos >= maximum:
        return False

    marker = state.srcCharCode[pos]
    pos += 1

    # Check hr marker: /* * */ /* - */ /* _ */
    if marker not in (0x2A, 0x2D, 0x5F):
        return False

    # markers can be mixed with spaces, but there should be at least 3 of them

    cnt = 1  # the marker consumed above
    while pos < maximum:
        ch = state.srcCharCode[pos]
        pos += 1
        if ch != marker and not isSpace(ch):
            return False
        if ch == marker:
            cnt += 1

    if cnt < 3:
        return False

    if silent:
        return True

    state.line = startLine + 1

    token = state.push("hr", "hr", 0)
    token.map = [startLine, state.line]
    # One marker character per marker counted. The JS original writes
    # ``Array(cnt + 1).join(ch)``, which yields ``cnt`` copies, not ``cnt + 1``.
    token.markup = chr(marker) * cnt

    return True
def html_block(state: StateBlock, startLine: int, endLine: int, silent: bool):
    """Block rule for raw HTML blocks.

    The first line is matched against the opening patterns in
    ``HTML_SEQUENCES``; the block then extends until a line matches the
    paired closing pattern, a line is indented less than ``state.blkIndent``,
    or ``endLine`` is reached.

    :param state: block parser state
    :param startLine: line to test
    :param endLine: first line that is out of bounds for this block
    :param silent: validation mode; returns the matched sequence's
        "can terminate a paragraph" flag instead of emitting a token
    :return: ``True`` when an ``html_block`` token was pushed, ``False`` when
        the line does not start an HTML block
    """
    LOGGER.debug(
        "entering html_block: %s, %s, %s, %s", state, startLine, endLine, silent
    )
    begin = state.bMarks[startLine] + state.tShift[startLine]
    stop = state.eMarks[startLine]

    # if it's indented more than 3 spaces, it should be a code block
    if state.sCount[startLine] - state.blkIndent >= 4:
        return False

    if not state.md.options.get("html", None):
        return False

    if state.srcCharCode[begin] != 0x3C:  # /* < */
        return False

    text = state.src[begin:stop]

    # First sequence whose opening pattern matches the line, if any.
    matched = next((seq for seq in HTML_SEQUENCES if seq[0].search(text)), None)
    if matched is None:
        return False

    closer = matched[1]

    if silent:
        # true if this sequence can be a terminator, false otherwise
        return matched[2]

    line = startLine + 1

    # If we are here - we detected HTML block.
    # Let's roll down till block end.
    if not closer.search(text):
        while line < endLine and state.sCount[line] >= state.blkIndent:
            begin = state.bMarks[line] + state.tShift[line]
            stop = state.eMarks[line]
            text = state.src[begin:stop]

            if closer.search(text):
                # A non-empty closing line belongs to the block; an empty
                # one only ends it.
                if len(text) != 0:
                    line += 1
                break

            line += 1

    state.line = line

    token = state.push("html_block", "", 0)
    token.map = [startLine, line]
    token.content = state.getLines(startLine, line, state.blkIndent, True)

    return True
+ + # /* - */ /* = */ + if marker == 0x2D or marker == 0x3D: + pos = state.skipChars(pos, marker) + pos = state.skipSpaces(pos) + + # /* = */ + if pos >= maximum: + level = 1 if marker == 0x3D else 2 + break + + # quirk for blockquotes, this line should already be checked by that rule + if state.sCount[nextLine] < 0: + nextLine += 1 + continue + + # Some tags can terminate paragraph without empty line. + terminate = False + for terminatorRule in terminatorRules: + if terminatorRule(state, nextLine, endLine, True): + terminate = True + break + if terminate: + break + + nextLine += 1 + + if not level: + # Didn't find valid underline + return False + + content = state.getLines(startLine, nextLine, state.blkIndent, False).strip() + + state.line = nextLine + 1 + + token = state.push("heading_open", "h" + str(level), 1) + token.markup = chr(marker) + token.map = [startLine, state.line] + + token = state.push("inline", "", 0) + token.content = content + token.map = [startLine, state.line - 1] + token.children = [] + + token = state.push("heading_close", "h" + str(level), -1) + token.markup = chr(marker) + + state.parentType = oldParentType + + return True diff --git a/markdown_it/rules_block/list.py b/markdown_it/rules_block/list.py new file mode 100644 index 0000000..a7617ad --- /dev/null +++ b/markdown_it/rules_block/list.py @@ -0,0 +1,344 @@ +# Lists +import logging + +from ..common.utils import isSpace +from .state_block import StateBlock + +LOGGER = logging.getLogger(__name__) + + +# Search `[-+*][\n ]`, returns next pos after marker on success +# or -1 on fail. 
def skipBulletListMarker(state: StateBlock, startLine: int):
    """Search ``[-+*][\\n ]`` at the content start of ``startLine``.

    :return: the position right after the marker on success, or -1 when the
        line does not start with a bullet marker or the marker is directly
        followed by a non-space character.
    """
    markerPos = state.bMarks[startLine] + state.tShift[startLine]
    lineEnd = state.eMarks[startLine]

    # Check bullet /* * */ /* - */ /* + */
    if state.srcCharCode[markerPos] not in (0x2A, 0x2D, 0x2B):
        return -1

    afterMarker = markerPos + 1

    # " -test " - is not a list item
    if afterMarker < lineEnd and not isSpace(state.srcCharCode[afterMarker]):
        return -1

    return afterMarker
def markTightParagraphs(state: StateBlock, idx: int):
    """Hide the paragraph wrappers of a tight list.

    Scans ``state.tokens`` starting two tokens after ``idx`` and stopping two
    tokens before the end.  Every ``paragraph_open`` found exactly two levels
    below ``state.level`` is flagged ``hidden``, together with the token two
    positions after it.
    """
    tokens = state.tokens
    wantedLevel = state.level + 2
    stop = len(tokens) - 2

    cursor = idx + 2
    while cursor < stop:
        candidate = tokens[cursor]
        if candidate.type == "paragraph_open" and candidate.level == wantedLevel:
            candidate.hidden = True
            tokens[cursor + 2].hidden = True
            # jump over the open / inline / close triple just handled
            cursor += 3
        else:
            cursor += 1
+ if isTerminatingParagraph and markerValue != 1: + return False + else: + posAfterMarker = skipBulletListMarker(state, startLine) + if posAfterMarker >= 0: + isOrdered = False + else: + return False + + # If we're starting a new unordered list right after + # a paragraph, first line should not be empty. + if isTerminatingParagraph: + if state.skipSpaces(posAfterMarker) >= state.eMarks[startLine]: + return False + + # We should terminate list on style change. Remember first one to compare. + markerCharCode = state.srcCharCode[posAfterMarker - 1] + + # For validation mode we can terminate immediately + if silent: + return True + + # Start list + listTokIdx = len(state.tokens) + + if isOrdered: + token = state.push("ordered_list_open", "ol", 1) + if markerValue != 1: + token.attrs = {"start": markerValue} + + else: + token = state.push("bullet_list_open", "ul", 1) + + token.map = listLines = [startLine, 0] + token.markup = chr(markerCharCode) + + # + # Iterate list items + # + + nextLine = startLine + prevEmptyEnd = False + terminatorRules = state.md.block.ruler.getRules("list") + + oldParentType = state.parentType + state.parentType = "list" + + while nextLine < endLine: + pos = posAfterMarker + maximum = state.eMarks[nextLine] + + initial = offset = ( + state.sCount[nextLine] + + posAfterMarker + - (state.bMarks[startLine] + state.tShift[startLine]) + ) + + while pos < maximum: + ch = state.srcCharCode[pos] + + if ch == 0x09: # \t + offset += 4 - (offset + state.bsCount[nextLine]) % 4 + elif ch == 0x20: # \s + offset += 1 + else: + break + + pos += 1 + + contentStart = pos + + if contentStart >= maximum: + # trimming space in "- \n 3" case, indent is 1 here + indentAfterMarker = 1 + else: + indentAfterMarker = offset - initial + + # If we have more than 4 spaces, the indent is 1 + # (the rest is just indented code block) + if indentAfterMarker > 4: + indentAfterMarker = 1 + + # " - test" + # ^^^^^ - calculating total length of this thing + indent = initial + 
indentAfterMarker + + # Run subparser & write tokens + token = state.push("list_item_open", "li", 1) + token.markup = chr(markerCharCode) + token.map = itemLines = [startLine, 0] + if isOrdered: + token.info = state.src[start : posAfterMarker - 1] + + # change current state, then restore it after parser subcall + oldTight = state.tight + oldTShift = state.tShift[startLine] + oldSCount = state.sCount[startLine] + + # - example list + # ^ listIndent position will be here + # ^ blkIndent position will be here + # + oldListIndent = state.listIndent + state.listIndent = state.blkIndent + state.blkIndent = indent + + state.tight = True + state.tShift[startLine] = contentStart - state.bMarks[startLine] + state.sCount[startLine] = offset + + if contentStart >= maximum and state.isEmpty(startLine + 1): + # workaround for this case + # (list item is empty, list terminates before "foo"): + # ~~~~~~~~ + # - + # + # foo + # ~~~~~~~~ + state.line = min(state.line + 2, endLine) + else: + # NOTE in list.js this was: + # state.md.block.tokenize(state, startLine, endLine, True) + # but tokeniz does not take the final parameter + state.md.block.tokenize(state, startLine, endLine) + + # If any of list item is tight, mark list as tight + if (not state.tight) or prevEmptyEnd: + tight = False + + # Item become loose if finish with empty line, + # but we should filter last element, because it means list finish + prevEmptyEnd = (state.line - startLine) > 1 and state.isEmpty(state.line - 1) + + state.blkIndent = state.listIndent + state.listIndent = oldListIndent + state.tShift[startLine] = oldTShift + state.sCount[startLine] = oldSCount + state.tight = oldTight + + token = state.push("list_item_close", "li", -1) + token.markup = chr(markerCharCode) + + nextLine = startLine = state.line + itemLines[1] = nextLine + + if nextLine >= endLine: + break + + contentStart = state.bMarks[startLine] + + # + # Try to check if list is terminated or continued. 
def paragraph(state: StateBlock, startLine: int, endLine: int, silent: bool = False):
    """Consume consecutive lines as one paragraph and push its tokens.

    Lines are taken from ``startLine`` until an empty line, the end of the
    document, or a line accepted by one of the "paragraph" terminator rules.
    Over-indented lines and lines with a negative indent count are always
    treated as continuations.  ``silent`` is only logged, and the rule always
    returns True.
    """
    LOGGER.debug(
        "entering paragraph: %s, %s, %s, %s", state, startLine, endLine, silent
    )

    blockRuler: Ruler = state.md.block.ruler
    terminators = blockRuler.getRules("paragraph")
    # the bound passed by the caller is replaced: scanning may run to the
    # last line of the document
    endLine = state.lineMax

    savedParentType = state.parentType
    state.parentType = "paragraph"

    # jump line-by-line until empty one or EOF
    nextLine = startLine + 1
    while nextLine < endLine and not state.isEmpty(nextLine):
        lineIndent = state.sCount[nextLine]
        # Either a would-be code block (a lazy continuation after a
        # paragraph) or the blockquote quirk: that rule has already
        # checked such a line.
        isContinuation = lineIndent - state.blkIndent > 3 or lineIndent < 0

        # Some tags can terminate paragraph without empty line.
        if not isContinuation and any(
            rule(state, nextLine, endLine, True) for rule in terminators
        ):
            break

        nextLine += 1

    content = state.getLines(startLine, nextLine, state.blkIndent, False).strip()
    state.line = nextLine

    openToken = state.push("paragraph_open", "p", 1)
    openToken.map = [startLine, state.line]

    inlineToken = state.push("inline", "", 0)
    inlineToken.content = content
    inlineToken.map = [startLine, state.line]
    inlineToken.children = []

    state.push("paragraph_close", "p", -1)

    state.parentType = savedParentType
    return True
+ # Can be useful on practice: https:#github.com/markdown-it/markdown-it/issues/54 + while pos < maximum: + # /* ] */ /* \ */ /* : */ + if state.srcCharCode[pos] == 0x5D and state.srcCharCode[pos - 1] != 0x5C: + if pos + 1 == maximum: + return False + if state.srcCharCode[pos + 1] != 0x3A: + return False + break + pos += 1 + + endLine = state.lineMax + + # jump line-by-line until empty one or EOF + terminatorRules = state.md.block.ruler.getRules("reference") + + oldParentType = state.parentType + state.parentType = "reference" + + while nextLine < endLine and not state.isEmpty(nextLine): + # this would be a code block normally, but after paragraph + # it's considered a lazy continuation regardless of what's there + if state.sCount[nextLine] - state.blkIndent > 3: + nextLine += 1 + continue + + # quirk for blockquotes, this line should already be checked by that rule + if state.sCount[nextLine] < 0: + nextLine += 1 + continue + + # Some tags can terminate paragraph without empty line. + terminate = False + for terminatorRule in terminatorRules: + if terminatorRule(state, nextLine, endLine, True): + terminate = True + break + + if terminate: + break + + nextLine += 1 + + string = state.getLines(startLine, nextLine, state.blkIndent, False).strip() + maximum = len(string) + + labelEnd = None + pos = 1 + while pos < maximum: + ch = charCodeAt(string, pos) + if ch == 0x5B: # /* [ */ + return False + elif ch == 0x5D: # /* ] */ + labelEnd = pos + break + elif ch == 0x0A: # /* \n */ + lines += 1 + elif ch == 0x5C: # /* \ */ + pos += 1 + if pos < maximum and charCodeAt(string, pos) == 0x0A: + lines += 1 + pos += 1 + + if ( + labelEnd is None or labelEnd < 0 or charCodeAt(string, labelEnd + 1) != 0x3A + ): # /* : */ + return False + + # [label]: destination 'title' + # ^^^ skip optional whitespace here + pos = labelEnd + 2 + while pos < maximum: + ch = charCodeAt(string, pos) + if ch == 0x0A: + lines += 1 + elif isSpace(ch): + pass + else: + break + pos += 1 + + # [label]: 
destination 'title' + # ^^^^^^^^^^^ parse this + res = state.md.helpers.parseLinkDestination(string, pos, maximum) + if not res.ok: + return False + + href = state.md.normalizeLink(res.str) + if not state.md.validateLink(href): + return False + + pos = res.pos + lines += res.lines + + # save cursor state, we could require to rollback later + destEndPos = pos + destEndLineNo = lines + + # [label]: destination 'title' + # ^^^ skipping those spaces + start = pos + while pos < maximum: + ch = charCodeAt(string, pos) + if ch == 0x0A: + lines += 1 + elif isSpace(ch): + pass + else: + break + pos += 1 + + # [label]: destination 'title' + # ^^^^^^^ parse this + res = state.md.helpers.parseLinkTitle(string, pos, maximum) + if pos < maximum and start != pos and res.ok: + title = res.str + pos = res.pos + lines += res.lines + else: + title = "" + pos = destEndPos + lines = destEndLineNo + + # skip trailing spaces until the rest of the line + while pos < maximum: + ch = charCodeAt(string, pos) + if not isSpace(ch): + break + pos += 1 + + if pos < maximum and charCodeAt(string, pos) != 0x0A: + if title: + # garbage at the end of the line after title, + # but it could still be a valid reference if we roll back + title = "" + pos = destEndPos + lines = destEndLineNo + while pos < maximum: + ch = charCodeAt(string, pos) + if not isSpace(ch): + break + pos += 1 + + if pos < maximum and charCodeAt(string, pos) != 0x0A: + # garbage at the end of the line + return False + + label = normalizeReference(string[1:labelEnd]) + if not label: + # CommonMark 0.20 disallows empty labels + return False + + # Reference can not terminate anything. This check is for safety only. 
class StateBlock(StateBase):
    """Mutable state shared by the block-level rules while parsing one source.

    On construction the source is scanned once to build per-line caches
    (``bMarks``, ``eMarks``, ``tShift``, ``sCount``, ``bsCount``) that the
    rules index by line number.
    """

    def __init__(
        self,
        src: str,
        md: MarkdownIt,
        env,
        tokens: list[Token],
        srcCharCode: tuple[int, ...] | None = None,
    ):
        """Store the parser context and build the per-line caches.

        :param src: source text to parse
        :param md: parser instance, kept as ``self.md``
        :param env: environment object, kept as ``self.env``
        :param tokens: list that pushed tokens are appended to
        :param srcCharCode: optional precomputed character codes of ``src``;
            when given, ``_src`` is assigned directly instead of ``src``
        """

        if srcCharCode is not None:
            self._src = src
            self.srcCharCode = srcCharCode
        else:
            # NOTE(review): presumably the `src` setter on StateBase derives
            # `srcCharCode`, which the cache loop below reads - confirm.
            self.src = src

        # link to parser instance
        self.md = md

        self.env = env

        #
        # Internal state variables
        #

        self.tokens = tokens

        self.bMarks = []  # line begin offsets for fast jumps
        self.eMarks = []  # line end offsets for fast jumps
        # offsets of the first non-space characters (tabs not expanded)
        self.tShift = []
        self.sCount = []  # indents for each line (tabs expanded)

        # An amount of virtual spaces (tabs expanded) between beginning
        # of each line (bMarks) and real beginning of that line.
        #
        # It exists only as a hack because blockquotes override bMarks
        # losing information in the process.
        #
        # It's used only when expanding tabs, you can think about it as
        # an initial tab length, e.g. bsCount=21 applied to string `\t123`
        # means first tab should be expanded to 4-21%4 === 3 spaces.
        #
        self.bsCount = []

        # block parser variables
        self.blkIndent = 0  # required block content indent (for example, if we are
        # inside a list, it would be positioned after list marker)
        self.line = 0  # line index in src
        self.lineMax = 0  # lines count
        self.tight = False  # loose/tight mode for lists
        self.ddIndent = -1  # indent of the current dd block (-1 if there isn't any)
        self.listIndent = -1  # indent of the current list block (-1 if there isn't any)

        # can be 'blockquote', 'list', 'root', 'paragraph' or 'reference'
        # used in lists to determine if they interrupt a paragraph
        self.parentType = "root"

        self.level = 0

        # renderer
        self.result = ""

        # Create caches
        # Generate markers.
        indent_found = False

        start = pos = indent = offset = 0
        length = len(self.src)

        for pos, character in enumerate(self.srcCharCode):
            if not indent_found:
                if isSpace(character):
                    # still inside the leading whitespace of the line
                    indent += 1

                    if character == 0x09:
                        # a tab advances to the next multiple of 4
                        offset += 4 - offset % 4
                    else:
                        offset += 1
                    continue
                else:
                    indent_found = True

            if character == 0x0A or pos == length - 1:
                # end of a line: either a newline or the final character
                if character != 0x0A:
                    # last line without trailing newline: the end mark sits
                    # one past the final character
                    pos += 1
                self.bMarks.append(start)
                self.eMarks.append(pos)
                self.tShift.append(indent)
                self.sCount.append(offset)
                self.bsCount.append(0)

                indent_found = False
                indent = 0
                offset = 0
                start = pos + 1

        # Push fake entry to simplify cache bounds checks
        self.bMarks.append(length)
        self.eMarks.append(length)
        self.tShift.append(0)
        self.sCount.append(0)
        self.bsCount.append(0)

        self.lineMax = len(self.bMarks) - 1  # don't count last fake line

    def __repr__(self):
        """Return a short summary: current line, nesting level, token count."""
        return (
            f"{self.__class__.__name__}"
            f"(line={self.line},level={self.level},tokens={len(self.tokens)})"
        )

    def push(self, ttype: str, tag: str, nesting: int) -> Token:
        """Push new token to "stream".

        The token is marked as a block token and appended to ``self.tokens``.
        A closing token (``nesting < 0``) lowers ``self.level`` before the
        token's level is recorded; an opening token (``nesting > 0``) raises
        it afterwards.
        """
        token = Token(ttype, tag, nesting)
        token.block = True
        if nesting < 0:
            self.level -= 1  # closing tag
        token.level = self.level
        if nesting > 0:
            self.level += 1  # opening tag
        self.tokens.append(token)
        return token

    def isEmpty(self, line: int) -> bool:
        """Return True if ``line`` has nothing after its leading whitespace."""
        return (self.bMarks[line] + self.tShift[line]) >= self.eMarks[line]

    def skipEmptyLines(self, from_pos: int) -> int:
        """Return the index of the first non-empty line at or after ``from_pos``.

        Returns ``self.lineMax`` or ``from_pos`` itself, whichever is larger,
        if no such line exists.
        """
        while from_pos < self.lineMax:
            try:
                if (self.bMarks[from_pos] + self.tShift[from_pos]) < self.eMarks[
                    from_pos
                ]:
                    break
            except IndexError:
                # a line index outside the caches is stepped over like an
                # empty line
                pass
            from_pos += 1
        return from_pos

    def skipSpaces(self, pos: int) -> int:
        """Skip spaces from given position."""
        while pos < len(self.src):
            if not isSpace(self.srcCharCode[pos]):
                break
            pos += 1
        return pos

    def skipSpacesBack(self, pos: int, minimum: int) -> int:
        """Skip spaces from given position in reverse.

        Scans backwards from ``pos - 1`` and never goes below ``minimum``.
        """
        if pos <= minimum:
            return pos
        while pos > minimum:
            pos -= 1
            if not isSpace(self.srcCharCode[pos]):
                return pos + 1
        return pos

    def skipChars(self, pos: int, code: int) -> int:
        """Skip char codes from given position."""
        while pos < len(self.src):
            if self.srcCharCode[pos] != code:
                break
            pos += 1
        return pos

    def skipCharsBack(self, pos: int, code: int, minimum: int) -> int:
        """Skip char codes reverse from given position - 1."""
        if pos <= minimum:
            return pos
        while pos > minimum:
            pos -= 1
            if code != self.srcCharCode[pos]:
                return pos + 1
        return pos

    def getLines(self, begin: int, end: int, indent: int, keepLastLF: bool) -> str:
        """Cut lines range from source.

        Returns lines ``begin`` (inclusive) to ``end`` (exclusive) joined into
        one string, with up to ``indent`` columns of leading indentation
        removed from each line.  The newline after the final line is included
        only when ``keepLastLF`` is true.
        """
        line = begin
        if begin >= end:
            return ""

        queue = [""] * (end - begin)

        i = 1
        while line < end:
            lineIndent = 0
            lineStart = first = self.bMarks[line]
            if line + 1 < end or keepLastLF:
                # include the character at the end mark as well
                last = self.eMarks[line] + 1
            else:
                last = self.eMarks[line]

            # advance `first` until `indent` columns have been consumed
            while (first < last) and (lineIndent < indent):
                ch = self.srcCharCode[first]
                if isSpace(ch):
                    if ch == 0x09:
                        lineIndent += 4 - (lineIndent + self.bsCount[line]) % 4
                    else:
                        lineIndent += 1
                elif first - lineStart < self.tShift[line]:
                    # non-space character that still lies within the recorded
                    # shift of this line: counted as one column
                    lineIndent += 1
                else:
                    break
                first += 1

            if lineIndent > indent:
                # partially expanding tabs in code blocks, e.g '\t\tfoobar'
                # with indent=2 becomes '  \tfoobar'
                queue[i - 1] = (" " * (lineIndent - indent)) + self.src[first:last]
            else:
                queue[i - 1] = self.src[first:last]

            line += 1
            i += 1

        return "".join(queue)
def escapedSplit(string):
    """Split a table row into cells on unescaped ``|`` characters.

    A pipe immediately preceded by a backslash does not separate cells;
    instead that backslash is dropped and the pipe is kept as cell text.
    Leading and trailing pipes produce empty first/last cells, which callers
    strip themselves.

    :param string: the text of one table row
    :return: list of cell strings; always contains at least one element
    """
    cells = []
    current = ""  # text already collected for the cell being built
    lastPos = 0  # start of the not-yet-collected part of the current cell
    isEscaped = False

    for pos, char in enumerate(string):
        if char == "|":
            if not isEscaped:
                # pipe separating cells, '|'
                cells.append(current + string[lastPos:pos])
                current = ""
                lastPos = pos + 1
            else:
                # escaped pipe, '\|': keep the text before the backslash and
                # restart collection at the pipe itself
                current += string[lastPos : pos - 1]
                lastPos = pos

        isEscaped = char == "\\"

    cells.append(current + string[lastPos:])

    return cells
first_ch == 0x2D and isSpace(second_ch): + return False + + while pos < state.eMarks[nextLine]: + ch = state.srcCharCode[pos] + + # /* | */ /* - */ /* : */ + if ch not in {0x7C, 0x2D, 0x3A} and not isSpace(ch): + return False + + pos += 1 + + lineText = getLine(state, startLine + 1) + + columns = lineText.split("|") + aligns = [] + for i in range(len(columns)): + t = columns[i].strip() + if not t: + # allow empty columns before and after table, but not in between columns; + # e.g. allow ` |---| `, disallow ` ---||--- ` + if i == 0 or i == len(columns) - 1: + continue + else: + return False + + if not headerLineRe.search(t): + return False + if charCodeAt(t, len(t) - 1) == 0x3A: # /* : */ + # /* : */ + aligns.append("center" if charCodeAt(t, 0) == 0x3A else "right") + elif charCodeAt(t, 0) == 0x3A: # /* : */ + aligns.append("left") + else: + aligns.append("") + + lineText = getLine(state, startLine).strip() + if "|" not in lineText: + return False + if state.sCount[startLine] - state.blkIndent >= 4: + return False + columns = escapedSplit(lineText) + if columns and columns[0] == "": + columns.pop(0) + if columns and columns[-1] == "": + columns.pop() + + # header row will define an amount of columns in the entire table, + # and align row should be exactly the same (the rest of the rows can differ) + columnCount = len(columns) + if columnCount == 0 or columnCount != len(aligns): + return False + + if silent: + return True + + oldParentType = state.parentType + state.parentType = "table" + + # use 'blockquote' lists for termination because it's + # the most similar to tables + terminatorRules = state.md.block.ruler.getRules("blockquote") + + token = state.push("table_open", "table", 1) + token.map = tableLines = [startLine, 0] + + token = state.push("thead_open", "thead", 1) + token.map = [startLine, startLine + 1] + + token = state.push("tr_open", "tr", 1) + token.map = [startLine, startLine + 1] + + for i in range(len(columns)): + token = state.push("th_open", "th", 
1) + if aligns[i]: + token.attrs = {"style": "text-align:" + aligns[i]} + + token = state.push("inline", "", 0) + # note in markdown-it this map was removed in v12.0.0 however, we keep it, + # since it is helpful to propagate to children tokens + token.map = [startLine, startLine + 1] + token.content = columns[i].strip() + token.children = [] + + token = state.push("th_close", "th", -1) + + token = state.push("tr_close", "tr", -1) + token = state.push("thead_close", "thead", -1) + + nextLine = startLine + 2 + while nextLine < endLine: + if state.sCount[nextLine] < state.blkIndent: + break + + terminate = False + for i in range(len(terminatorRules)): + if terminatorRules[i](state, nextLine, endLine, True): + terminate = True + break + + if terminate: + break + lineText = getLine(state, nextLine).strip() + if not lineText: + break + if state.sCount[nextLine] - state.blkIndent >= 4: + break + columns = escapedSplit(lineText) + if columns and columns[0] == "": + columns.pop(0) + if columns and columns[-1] == "": + columns.pop() + + if nextLine == startLine + 2: + token = state.push("tbody_open", "tbody", 1) + token.map = tbodyLines = [startLine + 2, 0] + + token = state.push("tr_open", "tr", 1) + token.map = [nextLine, nextLine + 1] + + for i in range(columnCount): + token = state.push("td_open", "td", 1) + if aligns[i]: + token.attrs = {"style": "text-align:" + aligns[i]} + + token = state.push("inline", "", 0) + # note in markdown-it this map was removed in v12.0.0 however, we keep it, + # since it is helpful to propagate to children tokens + token.map = [nextLine, nextLine + 1] + try: + token.content = columns[i].strip() if columns[i] else "" + except IndexError: + token.content = "" + token.children = [] + + token = state.push("td_close", "td", -1) + + token = state.push("tr_close", "tr", -1) + + nextLine += 1 + + if tbodyLines: + token = state.push("tbody_close", "tbody", -1) + tbodyLines[1] = nextLine + + token = state.push("table_close", "table", -1) + + 
def inline(state: StateCore) -> None:
    """Run the inline parser over every ``inline`` token in ``state.tokens``.

    A token whose ``children`` is None gets a fresh empty list first; the
    parser then receives that list to fill.
    """
    inlineTokens = (tok for tok in state.tokens if tok.type == "inline")
    for tok in inlineTokens:
        if tok.children is None:
            tok.children = []
        state.md.inline.parse(tok.content, state.md, state.env, tok.children)
def linkify(state: StateCore) -> None:
    """Replace link-like text inside inline tokens with autolink tokens.

    Does nothing unless ``state.md.options.linkify`` is truthy.  For every
    ``inline`` token whose content passes ``state.md.linkify.pretest``, each
    ``text`` child accepted by ``state.md.linkify.test`` is replaced by a
    sequence of ``text`` / ``link_open`` / ``text`` / ``link_close`` tokens
    built from ``state.md.linkify.match``.  Text that already sits inside a
    markdown link or an HTML ``<a>`` element is left untouched.

    :raises ModuleNotFoundError: if the option is enabled but
        ``state.md.linkify`` is falsy.
    """
    if not state.md.options.linkify:
        return

    if not state.md.linkify:
        raise ModuleNotFoundError("Linkify enabled but not installed.")

    for blockToken in state.tokens:
        if blockToken.type != "inline" or not state.md.linkify.pretest(
            blockToken.content
        ):
            continue

        tokens = blockToken.children

        htmlLinkLevel = 0

        # We scan from the end, to keep position when new tags added.
        # Use reversed logic in links start/end match
        assert tokens is not None
        i = len(tokens)
        while i >= 1:
            i -= 1
            assert isinstance(tokens, list)
            currentToken = tokens[i]

            # Skip content of markdown links
            if currentToken.type == "link_close":
                i -= 1
                while (
                    tokens[i].level != currentToken.level
                    and tokens[i].type != "link_open"
                ):
                    i -= 1
                continue

            # Skip content of html tag links
            # (scanning backwards, so a closing tag is met before its opener)
            if currentToken.type == "html_inline":
                if isLinkOpen(currentToken.content) and htmlLinkLevel > 0:
                    htmlLinkLevel -= 1
                if isLinkClose(currentToken.content):
                    htmlLinkLevel += 1
            if htmlLinkLevel > 0:
                continue

            if currentToken.type == "text" and state.md.linkify.test(
                currentToken.content
            ):
                text = currentToken.content
                links = state.md.linkify.match(text)

                # Now split string to nodes
                nodes = []
                level = currentToken.level
                lastPos = 0

                for link in links:
                    url = link.url
                    fullUrl = state.md.normalizeLink(url)
                    if not state.md.validateLink(fullUrl):
                        continue

                    urlText = link.text

                    # Linkifier might send raw hostnames like "example.com",
                    # where url starts with domain name. So we prepend http://
                    # in those cases, and remove it afterwards.
                    if not link.schema:
                        urlText = HTTP_RE.sub(
                            "", state.md.normalizeLinkText("http://" + urlText)
                        )
                    elif link.schema == "mailto:" and not TEST_MAILTO_RE.search(
                        urlText
                    ):
                        # Same trick for bare e-mail addresses: the scheme is
                        # added only when the matched text lacks it, so the
                        # text is normalized as a mailto link and the prefix
                        # is stripped again.
                        urlText = MAILTO_RE.sub(
                            "", state.md.normalizeLinkText("mailto:" + urlText)
                        )
                    else:
                        urlText = state.md.normalizeLinkText(urlText)

                    pos = link.index

                    if pos > lastPos:
                        # plain text between the previous match and this one
                        token = Token("text", "", 0)
                        token.content = text[lastPos:pos]
                        token.level = level
                        nodes.append(token)

                    token = Token("link_open", "a", 1)
                    token.attrs = {"href": fullUrl}
                    token.level = level
                    level += 1
                    token.markup = "linkify"
                    token.info = "auto"
                    nodes.append(token)

                    token = Token("text", "", 0)
                    token.content = urlText
                    token.level = level
                    nodes.append(token)

                    token = Token("link_close", "a", -1)
                    level -= 1
                    token.level = level
                    token.markup = "linkify"
                    token.info = "auto"
                    nodes.append(token)

                    lastPos = link.last_index

                if lastPos < len(text):
                    # trailing text after the last match
                    token = Token("text", "", 0)
                    token.content = text[lastPos:]
                    token.level = level
                    nodes.append(token)

                blockToken.children = tokens = arrayReplaceAt(tokens, i, nodes)
LOGGER = logging.getLogger(__name__)

# TODO:
# - fractionals 1/2, 1/4, 3/4 -> ½, ¼, ¾
# - multiplication 2 x 4 -> 2 × 4

# Cheap pre-check: only text containing one of these sequences is run
# through the individual "rare" substitutions below.
RARE_RE = re.compile(r"\+-|\.\.|\?\?\?\?|!!!!|,,|--")

SCOPED_ABBR_RE = re.compile(r"\((c|tm|r|p)\)", flags=re.IGNORECASE)

PLUS_MINUS_RE = re.compile(r"\+-")

ELLIPSIS_RE = re.compile(r"\.{2,}")

ELLIPSIS_QUESTION_EXCLAMATION_RE = re.compile(r"([?!])…")

QUESTION_EXCLAMATION_RE = re.compile(r"([?!]){4,}")

COMMA_RE = re.compile(r",{2,}")

EM_DASH_RE = re.compile(r"(^|[^-])---(?=[^-]|$)", flags=re.MULTILINE)

EN_DASH_RE = re.compile(r"(^|\s)--(?=\s|$)", flags=re.MULTILINE)

EN_DASH_INDENT_RE = re.compile(r"(^|[^-\s])--(?=[^-\s]|$)", flags=re.MULTILINE)


SCOPED_ABBR = {"c": "©", "r": "®", "p": "§", "tm": "™"}


def replaceFn(match: re.Match[str]) -> str:
    """Map a ``SCOPED_ABBR_RE`` match to its symbol, ignoring case."""
    return SCOPED_ABBR[match.group(1).lower()]


def replace_scoped(inlineTokens: list[Token]) -> None:
    """Replace ``(c)``, ``(tm)``, ``(r)``, ``(p)`` in text tokens, in place.

    Text between a ``link_open`` / ``link_close`` pair whose ``info`` is
    ``"auto"`` is left untouched.
    """
    # Tokens are visited front to back, so an opening autolink raises the
    # depth and the matching close lowers it again.
    autolink_depth = 0

    for token in inlineTokens:
        if token.type == "text" and not autolink_depth:
            token.content = SCOPED_ABBR_RE.sub(replaceFn, token.content)

        if token.type == "link_open" and token.info == "auto":
            autolink_depth += 1

        if token.type == "link_close" and token.info == "auto":
            autolink_depth -= 1


def replace_rare(inlineTokens: list[Token]) -> None:
    """Apply the punctuation replacements to text tokens, in place.

    Covers ``+-``, runs of dots, repeated ``?``/``!``, repeated commas and
    em/en dashes.  Text between a ``link_open`` / ``link_close`` pair whose
    ``info`` is ``"auto"`` is left untouched.
    """
    autolink_depth = 0

    for token in inlineTokens:
        if token.type == "text" and not autolink_depth:
            if RARE_RE.search(token.content):
                content = token.content

                # +- -> ±
                content = PLUS_MINUS_RE.sub("±", content)

                # .., ..., ....... -> …
                content = ELLIPSIS_RE.sub("…", content)

                # but ?..... & !..... -> ?.. & !..
                content = ELLIPSIS_QUESTION_EXCLAMATION_RE.sub("\\1..", content)
                content = QUESTION_EXCLAMATION_RE.sub("\\1\\1\\1", content)

                # ,, ,,, ,,,, -> ,
                content = COMMA_RE.sub(",", content)

                # em-dash
                content = EM_DASH_RE.sub("\\1\u2014", content)

                # en-dash
                content = EN_DASH_RE.sub("\\1\u2013", content)
                content = EN_DASH_INDENT_RE.sub("\\1\u2013", content)

                token.content = content

        if token.type == "link_open" and token.info == "auto":
            autolink_depth += 1

        if token.type == "link_close" and token.info == "auto":
            autolink_depth -= 1


def replace(state: StateCore) -> None:
    """Core rule: run the typographic replacements over all inline tokens.

    Does nothing unless ``state.md.options.typographer`` is truthy.  The
    regexes are first tested against the whole inline content so children
    are only walked when a replacement is possible.
    """
    if not state.md.options.typographer:
        return

    for token in state.tokens:
        if token.type != "inline":
            continue
        assert token.children is not None

        if SCOPED_ABBR_RE.search(token.content):
            replace_scoped(token.children)

        if RARE_RE.search(token.content):
            replace_rare(token.children)
When the index is negative, the behavior is different from the js version. + # But basically, the index will not be negative. + assert index >= 0 + return string[:index] + ch + string[index + 1 :] + + +def process_inlines(tokens: list[Token], state: StateCore) -> None: + stack: list[dict[str, Any]] = [] + + for i in range(len(tokens)): + token = tokens[i] + + thisLevel = token.level + + j = 0 + for j in range(len(stack))[::-1]: + if stack[j]["level"] <= thisLevel: + break + else: + # When the loop is terminated without a "break". + # Subtract 1 to get the same index as the js version. + j -= 1 + + stack = stack[: j + 1] + + if token.type != "text": + continue + + text = token.content + pos = 0 + maximum = len(text) + + while pos < maximum: + goto_outer = False + lastIndex = pos + t = QUOTE_RE.search(text[lastIndex:]) + if not t: + break + + canOpen = canClose = True + pos = t.start(0) + lastIndex + 1 + isSingle = t.group(0) == "'" + + # Find previous character, + # default to space if it's the beginning of the line + lastChar = 0x20 + + if t.start(0) + lastIndex - 1 >= 0: + lastChar = charCodeAt(text, t.start(0) + lastIndex - 1) + else: + for j in range(i)[::-1]: + # lastChar defaults to 0x20 + if tokens[j].type == "softbreak" or tokens[j].type == "hardbreak": + break + # should skip all tokens except 'text', 'html_inline' or 'code_inline' + if not tokens[j].content: + continue + + lastChar = charCodeAt(tokens[j].content, len(tokens[j].content) - 1) + break + + # Find next character, + # default to space if it's the end of the line + nextChar = 0x20 + + if pos < maximum: + nextChar = charCodeAt(text, pos) + else: + for j in range(i + 1, len(tokens)): + # nextChar defaults to 0x20 + if tokens[j].type == "softbreak" or tokens[j].type == "hardbreak": + break + # should skip all tokens except 'text', 'html_inline' or 'code_inline' + if not tokens[j].content: + continue + + nextChar = charCodeAt(tokens[j].content, 0) + break + + isLastPunctChar = 
isMdAsciiPunct(lastChar) or isPunctChar(chr(lastChar)) + isNextPunctChar = isMdAsciiPunct(nextChar) or isPunctChar(chr(nextChar)) + + isLastWhiteSpace = isWhiteSpace(lastChar) + isNextWhiteSpace = isWhiteSpace(nextChar) + + if isNextWhiteSpace: + canOpen = False + elif isNextPunctChar: + if not (isLastWhiteSpace or isLastPunctChar): + canOpen = False + + if isLastWhiteSpace: + canClose = False + elif isLastPunctChar: + if not (isNextWhiteSpace or isNextPunctChar): + canClose = False + + if nextChar == 0x22 and t.group(0) == '"': # 0x22: " + if lastChar >= 0x30 and lastChar <= 0x39: # 0x30: 0, 0x39: 9 + # special case: 1"" - count first quote as an inch + canClose = canOpen = False + + if canOpen and canClose: + # Replace quotes in the middle of punctuation sequence, but not + # in the middle of the words, i.e.: + # + # 1. foo " bar " baz - not replaced + # 2. foo-"-bar-"-baz - replaced + # 3. foo"bar"baz - not replaced + canOpen = isLastPunctChar + canClose = isNextPunctChar + + if not canOpen and not canClose: + # middle of word + if isSingle: + token.content = replaceAt( + token.content, t.start(0) + lastIndex, APOSTROPHE + ) + continue + + if canClose: + # this could be a closing quote, rewind the stack to get a match + for j in range(len(stack))[::-1]: + item = stack[j] + if stack[j]["level"] < thisLevel: + break + if item["single"] == isSingle and stack[j]["level"] == thisLevel: + item = stack[j] + + if isSingle: + openQuote = state.md.options.quotes[2] + closeQuote = state.md.options.quotes[3] + else: + openQuote = state.md.options.quotes[0] + closeQuote = state.md.options.quotes[1] + + # replace token.content *before* tokens[item.token].content, + # because, if they are pointing at the same token, replaceAt + # could mess up indices when quote length != 1 + token.content = replaceAt( + token.content, t.start(0) + lastIndex, closeQuote + ) + tokens[item["token"]].content = replaceAt( + tokens[item["token"]].content, item["pos"], openQuote + ) + + pos += 
len(closeQuote) - 1 + if item["token"] == i: + pos += len(openQuote) - 1 + + text = token.content + maximum = len(text) + + stack = stack[:j] + goto_outer = True + break + if goto_outer: + goto_outer = False + continue + + if canOpen: + stack.append( + { + "token": i, + "pos": t.start(0) + lastIndex, + "single": isSingle, + "level": thisLevel, + } + ) + elif canClose and isSingle: + token.content = replaceAt( + token.content, t.start(0) + lastIndex, APOSTROPHE + ) + + +def smartquotes(state: StateCore) -> None: + if not state.md.options.typographer: + return + + for token in state.tokens: + + if token.type != "inline" or not QUOTE_RE.search(token.content): + continue + assert token.children is not None + process_inlines(token.children, state) diff --git a/markdown_it/rules_core/state_core.py b/markdown_it/rules_core/state_core.py new file mode 100644 index 0000000..15b7c60 --- /dev/null +++ b/markdown_it/rules_core/state_core.py @@ -0,0 +1,25 @@ +from __future__ import annotations + +from collections.abc import MutableMapping +from typing import TYPE_CHECKING + +from ..ruler import StateBase +from ..token import Token + +if TYPE_CHECKING: + from markdown_it import MarkdownIt + + +class StateCore(StateBase): + def __init__( + self, + src: str, + md: MarkdownIt, + env: MutableMapping, + tokens: list[Token] | None = None, + ): + self.src = src + self.md = md # link to parser instance + self.env = env + self.tokens: list[Token] = tokens or [] + self.inlineMode = False diff --git a/markdown_it/rules_inline/__init__.py b/markdown_it/rules_inline/__init__.py new file mode 100644 index 0000000..f27907c --- /dev/null +++ b/markdown_it/rules_inline/__init__.py @@ -0,0 +1,29 @@ +__all__ = ( + "StateInline", + "text", + "text_collapse", + "link_pairs", + "escape", + "newline", + "backtick", + "emphasis", + "image", + "link", + "autolink", + "entity", + "html_inline", + "strikethrough", +) +from . 
EMAIL_RE = re.compile(
    r"^([a-zA-Z0-9.!#$%&\'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)$"  # noqa: E501
)
AUTOLINK_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9+.\-]{1,31}):([^<>\x00-\x20]*)$")


def _push_autolink(state: StateInline, fullUrl: str, url: str) -> None:
    """Push the link_open / text / link_close triple for one autolink."""
    token = state.push("link_open", "a", 1)
    token.attrs = {"href": fullUrl}
    token.markup = "autolink"
    token.info = "auto"

    token = state.push("text", "", 0)
    token.content = state.md.normalizeLinkText(url)

    token = state.push("link_close", "a", -1)
    token.markup = "autolink"
    token.info = "auto"


def autolink(state: StateInline, silent: bool) -> bool:
    """Inline rule for autolinks written as ``<scheme:...>`` or ``<email>``.

    Starting at ``state.pos``, looks for ``<`` followed by a ``>`` with no
    other ``<`` in between, all before ``state.posMax``.  The enclosed text
    must match ``AUTOLINK_RE`` (used as-is) or ``EMAIL_RE`` (prefixed with
    ``mailto:``), and the normalized URL must pass ``state.md.validateLink``.

    :param silent: when true, only advance ``state.pos``; push no tokens.
    :return: True if an autolink was consumed, otherwise False (in which
        case ``state.pos`` is unchanged).
    """
    pos = state.pos

    if state.srcCharCode[pos] != 0x3C:  # /* < */
        return False

    start = state.pos
    maximum = state.posMax

    while True:
        pos += 1
        if pos >= maximum:
            return False

        ch = state.srcCharCode[pos]

        if ch == 0x3C:  # /* < */
            return False
        if ch == 0x3E:  # /* > */
            break

    url = state.src[start + 1 : pos]

    # The URL form takes precedence; the e-mail form is only tried when
    # the text is not a scheme-prefixed URL at all.
    if AUTOLINK_RE.search(url) is not None:
        fullUrl = state.md.normalizeLink(url)
    elif EMAIL_RE.search(url) is not None:
        fullUrl = state.md.normalizeLink("mailto:" + url)
    else:
        return False

    if not state.md.validateLink(fullUrl):
        return False

    if not silent:
        _push_autolink(state, fullUrl, url)

    # +2 accounts for the surrounding angle brackets.
    state.pos += len(url) + 2
    return True
+ if not silent: + token = state.push("code_inline", "code", 0) + token.markup = marker + token.content = state.src[pos:matchStart].replace("\n", " ") + if ( + token.content.startswith(" ") + and token.content.endswith(" ") + and len(token.content.strip()) > 0 + ): + token.content = token.content[1:-1] + state.pos = matchEnd + return True + + # Some different length found, put it in cache as upper limit of where closer can be found + state.backticks[closerLength] = matchStart + + # Scanned through the end, didn't find anything + state.backticksScanned = True + + if not silent: + state.pending += marker + state.pos += openerLength + return True diff --git a/markdown_it/rules_inline/balance_pairs.py b/markdown_it/rules_inline/balance_pairs.py new file mode 100644 index 0000000..db622f0 --- /dev/null +++ b/markdown_it/rules_inline/balance_pairs.py @@ -0,0 +1,114 @@ +# For each opening emphasis-like marker find a matching closing one +# +from .state_inline import StateInline + + +def processDelimiters(state: StateInline, delimiters, *args): + + openersBottom = {} + maximum = len(delimiters) + + closerIdx = 0 + while closerIdx < maximum: + closer = delimiters[closerIdx] + + # Length is only used for emphasis-specific "rule of 3", + # if it's not defined (in strikethrough or 3rd party plugins), + # we can default it to 0 to disable those checks. 
+ # + closer.length = closer.length or 0 + + if not closer.close: + closerIdx += 1 + continue + + # Previously calculated lower bounds (previous fails) + # for each marker, each delimiter length modulo 3, + # and for whether this closer can be an opener; + # https://github.com/commonmark/cmark/commit/34250e12ccebdc6372b8b49c44fab57c72443460 + if closer.marker not in openersBottom: + openersBottom[closer.marker] = [-1, -1, -1, -1, -1, -1] + + minOpenerIdx = openersBottom[closer.marker][ + (3 if closer.open else 0) + (closer.length % 3) + ] + + openerIdx = closerIdx - closer.jump - 1 + + # avoid crash if `closer.jump` is pointing outside of the array, + # e.g. for strikethrough + if openerIdx < -1: + openerIdx = -1 + + newMinOpenerIdx = openerIdx + + while openerIdx > minOpenerIdx: + opener = delimiters[openerIdx] + + if opener.marker != closer.marker: + openerIdx -= opener.jump + 1 + continue + + if opener.open and opener.end < 0: + + isOddMatch = False + + # from spec: + # + # If one of the delimiters can both open and close emphasis, then the + # sum of the lengths of the delimiter runs containing the opening and + # closing delimiters must not be a multiple of 3 unless both lengths + # are multiples of 3. + # + if opener.close or closer.open: + if (opener.length + closer.length) % 3 == 0: + if opener.length % 3 != 0 or closer.length % 3 != 0: + isOddMatch = True + + if not isOddMatch: + # If previous delimiter cannot be an opener, we can safely skip + # the entire sequence in future checks. This is required to make + # sure algorithm has linear complexity (see *_*_*_*_*_... case). 
+ # + if openerIdx > 0 and not delimiters[openerIdx - 1].open: + lastJump = delimiters[openerIdx - 1].jump + 1 + else: + lastJump = 0 + + closer.jump = closerIdx - openerIdx + lastJump + closer.open = False + opener.end = closerIdx + opener.jump = lastJump + opener.close = False + newMinOpenerIdx = -1 + break + + openerIdx -= opener.jump + 1 + + if newMinOpenerIdx != -1: + # If match for this delimiter run failed, we want to set lower bound for + # future lookups. This is required to make sure algorithm has linear + # complexity. + # + # See details here: + # https:#github.com/commonmark/cmark/issues/178#issuecomment-270417442 + # + openersBottom[closer.marker][ + (3 if closer.open else 0) + ((closer.length or 0) % 3) + ] = newMinOpenerIdx + + closerIdx += 1 + + +def link_pairs(state: StateInline) -> None: + tokens_meta = state.tokens_meta + maximum = len(state.tokens_meta) + + processDelimiters(state, state.delimiters) + + curr = 0 + while curr < maximum: + curr_meta = tokens_meta[curr] + if curr_meta and "delimiters" in curr_meta: + processDelimiters(state, curr_meta["delimiters"]) + curr += 1 diff --git a/markdown_it/rules_inline/emphasis.py b/markdown_it/rules_inline/emphasis.py new file mode 100644 index 0000000..9001b09 --- /dev/null +++ b/markdown_it/rules_inline/emphasis.py @@ -0,0 +1,102 @@ +# Process *this* and _that_ +# + +from .state_inline import Delimiter, StateInline + + +def tokenize(state: StateInline, silent: bool): + """Insert each marker as a separate text token, and add it to delimiter list""" + start = state.pos + marker = state.srcCharCode[start] + + if silent: + return False + + # /* _ */ /* * */ + if marker != 0x5F and marker != 0x2A: + return False + + scanned = state.scanDelims(state.pos, marker == 0x2A) + + for i in range(scanned.length): + token = state.push("text", "", 0) + token.content = chr(marker) + state.delimiters.append( + Delimiter( + marker=marker, + length=scanned.length, + jump=i, + token=len(state.tokens) - 1, + end=-1, + 
open=scanned.can_open, + close=scanned.can_close, + ) + ) + + state.pos += scanned.length + + return True + + +def _postProcess(state, delimiters): + + i = len(delimiters) - 1 + while i >= 0: + startDelim = delimiters[i] + + # /* _ */ /* * */ + if startDelim.marker != 0x5F and startDelim.marker != 0x2A: + i -= 1 + continue + + # Process only opening markers + if startDelim.end == -1: + i -= 1 + continue + + endDelim = delimiters[startDelim.end] + + # If the previous delimiter has the same marker and is adjacent to this one, + # merge those into one strong delimiter. + # + # `<em><em>whatever</em></em>` -> `<strong>whatever</strong>` + # + isStrong = ( + i > 0 + and delimiters[i - 1].end == startDelim.end + 1 + and delimiters[i - 1].token == startDelim.token - 1 + and delimiters[startDelim.end + 1].token == endDelim.token + 1 + and delimiters[i - 1].marker == startDelim.marker + ) + + ch = chr(startDelim.marker) + + token = state.tokens[startDelim.token] + token.type = "strong_open" if isStrong else "em_open" + token.tag = "strong" if isStrong else "em" + token.nesting = 1 + token.markup = ch + ch if isStrong else ch + token.content = "" + + token = state.tokens[endDelim.token] + token.type = "strong_close" if isStrong else "em_close" + token.tag = "strong" if isStrong else "em" + token.nesting = -1 + token.markup = ch + ch if isStrong else ch + token.content = "" + + if isStrong: + state.tokens[delimiters[i - 1].token].content = "" + state.tokens[delimiters[startDelim.end + 1].token].content = "" + i -= 1 + + i -= 1 + + +def postProcess(state: StateInline): + """Walk through delimiter list and replace text tokens with tags.""" + _postProcess(state, state.delimiters) + + for token in state.tokens_meta: + if token and "delimiters" in token: + _postProcess(state, token["delimiters"]) diff --git a/markdown_it/rules_inline/entity.py b/markdown_it/rules_inline/entity.py new file mode 100644 index 0000000..883a966 --- /dev/null +++ b/markdown_it/rules_inline/entity.py @@ 
-0,0 +1,54 @@ +# Process html entity - {, ¯, ", ... +import re + +from ..common.entities import entities +from ..common.utils import fromCodePoint, isValidEntityCode +from .state_inline import StateInline + +DIGITAL_RE = re.compile(r"^&#((?:x[a-f0-9]{1,6}|[0-9]{1,7}));", re.IGNORECASE) +NAMED_RE = re.compile(r"^&([a-z][a-z0-9]{1,31});", re.IGNORECASE) + + +def entity(state: StateInline, silent: bool): + + pos = state.pos + maximum = state.posMax + + if state.srcCharCode[pos] != 0x26: # /* & */ + return False + + if (pos + 1) < maximum: + ch = state.srcCharCode[pos + 1] + + if ch == 0x23: # /* # */ + match = DIGITAL_RE.search(state.src[pos:]) + if match: + if not silent: + match1 = match.group(1) + code = ( + int(match1[1:], 16) + if match1[0].lower() == "x" + else int(match1, 10) + ) + state.pending += ( + fromCodePoint(code) + if isValidEntityCode(code) + else fromCodePoint(0xFFFD) + ) + + state.pos += len(match.group(0)) + return True + + else: + match = NAMED_RE.search(state.src[pos:]) + if match: + if match.group(1) in entities: + if not silent: + state.pending += entities[match.group(1)] + state.pos += len(match.group(0)) + return True + + if not silent: + state.pending += "&" + state.pos += 1 + return True diff --git a/markdown_it/rules_inline/escape.py b/markdown_it/rules_inline/escape.py new file mode 100644 index 0000000..36bd040 --- /dev/null +++ b/markdown_it/rules_inline/escape.py @@ -0,0 +1,49 @@ +""" +Process escaped chars and hardbreaks +""" +from ..common.utils import isSpace +from .state_inline import StateInline + +ESCAPED = [0 for _ in range(256)] +for ch in "\\!\"#$%&'()*+,./:;<=>?@[]^_`{|}~-": + ESCAPED[ord(ch)] = 1 + + +def escape(state: StateInline, silent: bool): + pos = state.pos + maximum = state.posMax + + # /* \ */ + if state.srcCharCode[pos] != 0x5C: + return False + + pos += 1 + + if pos < maximum: + ch = state.srcCharCode[pos] + + if ch < 256 and ESCAPED[ch] != 0: + if not silent: + state.pending += state.src[pos] + state.pos += 2 + 
return True + + if ch == 0x0A: + if not silent: + state.push("hardbreak", "br", 0) + + pos += 1 + # skip leading whitespaces from next line + while pos < maximum: + ch = state.srcCharCode[pos] + if not isSpace(ch): + break + pos += 1 + + state.pos = pos + return True + + if not silent: + state.pending += "\\" + state.pos += 1 + return True diff --git a/markdown_it/rules_inline/html_inline.py b/markdown_it/rules_inline/html_inline.py new file mode 100644 index 0000000..295cc5c --- /dev/null +++ b/markdown_it/rules_inline/html_inline.py @@ -0,0 +1,43 @@ +# Process html tags +from ..common.html_re import HTML_TAG_RE +from .state_inline import StateInline + + +def isLetter(ch: int): + lc = ch | 0x20 # to lower case + # /* a */ and /* z */ + return (lc >= 0x61) and (lc <= 0x7A) + + +def html_inline(state: StateInline, silent: bool): + + pos = state.pos + + if not state.md.options.get("html", None): + return False + + # Check start + maximum = state.posMax + if state.srcCharCode[pos] != 0x3C or pos + 2 >= maximum: # /* < */ + return False + + # Quick fail on second char + ch = state.srcCharCode[pos + 1] + if ( + ch != 0x21 + and ch != 0x3F # /* ! */ + and ch != 0x2F # /* ? 
def image(state: StateInline, silent: bool):
    """Inline rule for images: ``![alt](<src> "title")`` and reference forms.

    Handles the inline form and the full / collapsed / shortcut reference
    forms, the latter resolved through ``state.env["references"]``.

    :param silent: when true, only advance ``state.pos``; push no token.
    :return: True if an image was consumed, otherwise False.
    """
    label = None
    href = ""
    oldPos = state.pos
    maximum = state.posMax

    # /* ! */
    if state.srcCharCode[state.pos] != 0x21:
        return False
    # /* [ */ - an image needs both "!" and "[" inside the parse window
    if state.pos + 1 >= maximum or state.srcCharCode[state.pos + 1] != 0x5B:
        return False

    labelStart = state.pos + 2
    labelEnd = state.md.helpers.parseLinkLabel(state, state.pos + 1, False)

    # parser failed to find ']', so it's not a valid link
    if labelEnd < 0:
        return False

    pos = labelEnd + 1
    # /* ( */
    if pos < maximum and state.srcCharCode[pos] == 0x28:
        #
        # Inline link
        #

        # [link](  <href>  "title"  )
        #        ^^ skipping these spaces
        pos += 1
        while pos < maximum:
            code = state.srcCharCode[pos]
            if not isSpace(code) and code != 0x0A:
                break
            pos += 1

        if pos >= maximum:
            return False

        # [link](  <href>  "title"  )
        #          ^^^^^^ parsing link destination
        start = pos
        res = state.md.helpers.parseLinkDestination(state.src, pos, state.posMax)
        if res.ok:
            href = state.md.normalizeLink(res.str)
            if state.md.validateLink(href):
                pos = res.pos
            else:
                href = ""

        # [link](  <href>  "title"  )
        #                ^^ skipping these spaces
        start = pos
        while pos < maximum:
            code = state.srcCharCode[pos]
            if not isSpace(code) and code != 0x0A:
                break
            pos += 1

        # [link](  <href>  "title"  )
        #                  ^^^^^^^ parsing link title
        res = state.md.helpers.parseLinkTitle(state.src, pos, state.posMax)
        # `start != pos` requires whitespace between destination and title
        if pos < maximum and start != pos and res.ok:
            title = res.str
            pos = res.pos

            # [link](  <href>  "title"  )
            #                         ^^ skipping these spaces
            while pos < maximum:
                code = state.srcCharCode[pos]
                if not isSpace(code) and code != 0x0A:
                    break
                pos += 1
        else:
            title = ""

        # /* ) */
        if pos >= maximum or state.srcCharCode[pos] != 0x29:
            state.pos = oldPos
            return False

        pos += 1

    else:
        #
        # Link reference
        #
        if "references" not in state.env:
            return False

        # /* [ */
        if pos < maximum and state.srcCharCode[pos] == 0x5B:
            start = pos + 1
            pos = state.md.helpers.parseLinkLabel(state, pos)
            if pos >= 0:
                label = state.src[start:pos]
                pos += 1
            else:
                pos = labelEnd + 1
        else:
            pos = labelEnd + 1

        # covers label == '' and label == undefined
        # (collapsed reference link and shortcut reference link respectively)
        if not label:
            label = state.src[labelStart:labelEnd]

        label = normalizeReference(label)

        ref = state.env["references"].get(label, None)
        if not ref:
            state.pos = oldPos
            return False

        href = ref["href"]
        title = ref["title"]

    #
    # We found the end of the link, and know for a fact it's a valid link
    # so all that's left to do is to call tokenizer.
    #
    if not silent:
        content = state.src[labelStart:labelEnd]

        # The alt text is parsed as inline markdown into child tokens.
        tokens: list[Token] = []
        state.md.inline.parse(content, state.md, state.env, tokens)

        token = state.push("image", "img", 0)
        token.attrs = {"src": href, "alt": ""}
        token.children = tokens or None
        token.content = content

        if title:
            token.attrSet("title", title)

        # note, this is not part of markdown-it JS, but is useful for renderers
        if label and state.md.options.get("store_labels", False):
            token.meta["label"] = label

    state.pos = pos
    state.posMax = maximum
    return True
state.md.validateLink(href): + pos = res.pos + else: + href = "" + + # [link]( <href> "title" ) + # ^^ skipping these spaces + start = pos + while pos < maximum: + code = state.srcCharCode[pos] + if not isSpace(code) and code != 0x0A: + break + pos += 1 + + # [link]( <href> "title" ) + # ^^^^^^^ parsing link title + res = state.md.helpers.parseLinkTitle(state.src, pos, state.posMax) + if pos < maximum and start != pos and res.ok: + title = res.str + pos = res.pos + + # [link]( <href> "title" ) + # ^^ skipping these spaces + while pos < maximum: + code = state.srcCharCode[pos] + if not isSpace(code) and code != 0x0A: + break + pos += 1 + + if pos >= maximum or state.srcCharCode[pos] != 0x29: # /* ) */ + # parsing a valid shortcut link failed, fallback to reference + parseReference = True + + pos += 1 + + if parseReference: + # + # Link reference + # + if "references" not in state.env: + return False + + if pos < maximum and state.srcCharCode[pos] == 0x5B: # /* [ */ + start = pos + 1 + pos = state.md.helpers.parseLinkLabel(state, pos) + if pos >= 0: + label = state.src[start:pos] + pos += 1 + else: + pos = labelEnd + 1 + + else: + pos = labelEnd + 1 + + # covers label == '' and label == undefined + # (collapsed reference link and shortcut reference link respectively) + if not label: + label = state.src[labelStart:labelEnd] + + label = normalizeReference(label) + + ref = ( + state.env["references"][label] if label in state.env["references"] else None + ) + if not ref: + state.pos = oldPos + return False + + href = ref["href"] + title = ref["title"] + + # + # We found the end of the link, and know for a fact it's a valid link + # so all that's left to do is to call tokenizer. 
endSpace = re.compile(r" +$")


def newline(state: StateInline, silent: bool):
    """Inline rule for a ``\\n`` character at ``state.pos``.

    Unless *silent*, pushes a ``hardbreak`` when the pending text ends with
    two or more spaces (which are stripped), otherwise a ``softbreak`` (a
    single trailing space is dropped).  In both modes the newline and any
    following characters accepted by ``isSpace`` are skipped.

    :return: False if the current character is not a newline, else True.
    """
    cursor = state.pos

    if state.srcCharCode[cursor] != 0x0A:  # /* \n */
        return False

    last = len(state.pending) - 1
    limit = state.posMax

    # '  \n' -> hardbreak
    # Looking into the pending string is discouraged for other rules; it is
    # only done here to detect the trailing-space hard break syntax.
    if not silent:
        ends_with_space = last >= 0 and charCodeAt(state.pending, last) == 0x20
        if not ends_with_space:
            state.push("softbreak", "br", 0)
        elif last >= 1 and charCodeAt(state.pending, last - 1) == 0x20:
            state.pending = endSpace.sub("", state.pending)
            state.push("hardbreak", "br", 0)
        else:
            state.pending = state.pending[:-1]
            state.push("softbreak", "br", 0)

    cursor += 1

    # skip leading spaces of the next line
    while cursor < limit and isSpace(state.srcCharCode[cursor]):
        cursor += 1

    state.pos = cursor
    return True
@dataclass(**DATACLASS_KWARGS)
class Delimiter:
    """Record describing one emphasis-like delimiter pushed by an inline rule."""

    # Char code of the starting marker (number).
    marker: int

    # Total length of these series of delimiters.
    length: int

    # An amount of characters before this one that's equivalent to
    # current one. In plain English: if this delimiter does not open
    # an emphasis, neither do previous `jump` characters.
    #
    # Used to skip sequences like "*****" in one step, for 1st asterisk
    # value will be 0, for 2nd it's 1 and so on.
    jump: int

    # A position of the token this delimiter corresponds to.
    token: int

    # If this delimiter is matched as a valid opener, `end` will be
    # equal to its position, otherwise it's `-1`.
    end: int

    # Boolean flags that determine if this delimiter could open or close
    # an emphasis.
    open: bool
    close: bool

    level: bool | None = None


# Result of `StateInline.scanDelims`: whether the scanned marker run may open
# and/or close an emphasis-like span, and how many markers the run contains.
Scanned = namedtuple("Scanned", ["can_open", "can_close", "length"])


class StateInline(StateBase):
    """Mutable state threaded through the inline rules while one source is parsed."""

    def __init__(
        self, src: str, md: MarkdownIt, env: MutableMapping, outTokens: list[Token]
    ):
        self.src = src
        self.env = env
        self.md = md
        self.tokens = outTokens
        self.tokens_meta: list[dict | None] = [None] * len(outTokens)

        self.pos = 0
        self.posMax = len(self.src)
        self.level = 0
        self.pending = ""
        self.pendingLevel = 0

        # Stores { start: end } pairs. Useful for backtrack
        # optimization of pairs parse (emphasis, strikes).
        self.cache: dict[int, int] = {}

        # List of emphasis-like delimiters for current tag
        self.delimiters: list[Delimiter] = []

        # Stack of delimiter lists for upper level tags
        self._prev_delimiters: list[list[Delimiter]] = []

        # backticklength => last seen position
        self.backticks: dict[int, int] = {}
        self.backticksScanned = False

    def __repr__(self):
        name = self.__class__.__name__
        return f"{name}(pos=[{self.pos} of {self.posMax}], token={len(self.tokens)})"

    def pushPending(self):
        """Flush the pending text buffer into a new ``text`` token and return it."""
        flushed = Token("text", "", 0, content=self.pending, level=self.pendingLevel)
        self.tokens.append(flushed)
        self.pending = ""
        return flushed

    def push(self, ttype, tag, nesting):
        """Append a new token to the output stream and return it.

        Any pending text is flushed as a text token first. A closing token
        (``nesting < 0``) restores the delimiter list saved by its opener;
        an opening token (``nesting > 0``) saves the current delimiter list
        and starts an empty one, which is also recorded in ``tokens_meta``.
        """
        if self.pending:
            self.pushPending()

        created = Token(ttype, tag, nesting)
        meta = None

        if nesting < 0:
            # closing tag: step out before the level is recorded
            self.level -= 1
            self.delimiters = self._prev_delimiters.pop()

        created.level = self.level

        if nesting > 0:
            # opening tag: step in after the level is recorded
            self.level += 1
            self._prev_delimiters.append(self.delimiters)
            self.delimiters = []
            meta = {"delimiters": self.delimiters}

        self.pendingLevel = self.level
        self.tokens.append(created)
        self.tokens_meta.append(meta)
        return created

    def scanDelims(self, start, canSplitWord):
        """
        Scan a run of identical emphasis-like markers and decide whether the
        run can open and/or close an emphasis sequence.

        - start - position to scan from (it should point at a valid marker);
        - canSplitWord - determine if these markers can be found inside a word

        Returns a ``Scanned(can_open, can_close, length)`` tuple.
        """
        codes = self.srcCharCode
        limit = self.posMax
        marker = codes[start]

        # treat beginning of the line as a whitespace
        lastChar = codes[start - 1] if start > 0 else 0x20

        end = start
        while end < limit and codes[end] == marker:
            end += 1

        # treat end of the line as a whitespace
        nextChar = codes[end] if end < limit else 0x20

        isLastPunctChar = isMdAsciiPunct(lastChar) or isPunctChar(chr(lastChar))
        isNextPunctChar = isMdAsciiPunct(nextChar) or isPunctChar(chr(nextChar))

        isLastWhiteSpace = isWhiteSpace(lastChar)
        isNextWhiteSpace = isWhiteSpace(nextChar)

        # A run is not left-flanking when it is followed by whitespace, or by
        # punctuation without whitespace/punctuation in front of it.
        left_flanking = not (
            isNextWhiteSpace
            or (isNextPunctChar and not (isLastWhiteSpace or isLastPunctChar))
        )
        # Mirror image of the rule above for the right-flanking test.
        right_flanking = not (
            isLastWhiteSpace
            or (isLastPunctChar and not (isNextWhiteSpace or isNextPunctChar))
        )

        if canSplitWord:
            can_open = left_flanking
            can_close = right_flanking
        else:
            can_open = left_flanking and ((not right_flanking) or isLastPunctChar)
            can_close = right_flanking and ((not left_flanking) or isNextPunctChar)

        return Scanned(can_open, can_close, end - start)
# ~~strike through~~
def tokenize(state: StateInline, silent: bool):
    """Insert each marker as a separate text token, and add it to delimiter list.

    Returns ``False`` in silent mode, when the character at ``state.pos`` is
    not ``~``, or when fewer than two markers are found. Otherwise pushes one
    ``text`` token per ``~~`` pair (preceded by a single ``~`` token when the
    run has odd length), records a ``Delimiter`` for every pair, advances
    ``state.pos`` past the whole run and returns ``True``.
    """
    start = state.pos
    marker = state.srcCharCode[start]

    if silent:
        return False

    if marker != 0x7E:  # /* ~ */
        return False

    scanned = state.scanDelims(state.pos, True)
    length = scanned.length
    ch = chr(marker)

    if length < 2:
        return False

    if length % 2:
        # odd run: the extra marker stays a plain text token
        token = state.push("text", "", 0)
        token.content = ch
        length -= 1

    for offset in range(0, length, 2):
        token = state.push("text", "", 0)
        token.content = ch + ch
        state.delimiters.append(
            Delimiter(
                marker=marker,
                length=0,  # disable "rule of 3" length checks meant for emphasis
                jump=offset // 2,  # for `~~` 1 marker = 2 characters
                token=len(state.tokens) - 1,
                end=-1,
                open=scanned.can_open,
                close=scanned.can_close,
            )
        )

    state.pos += scanned.length

    return True


def _postProcess(state: StateInline, delimiters: list[Delimiter]):
    """Turn matched ``~~`` delimiter pairs of one delimiter list into ``s`` tags.

    For every ``~`` delimiter whose ``end`` is set, the text tokens of the
    opener and of the delimiter at index ``end`` are rewritten in place into
    ``s_open`` / ``s_close`` tokens. Lone ``~`` text tokens found directly
    before a closer are then moved behind the run of ``s_close`` tokens.
    """
    loneMarkers = []

    for startDelim in delimiters:
        if startDelim.marker != 0x7E:  # /* ~ */
            continue

        if startDelim.end == -1:
            continue

        endDelim = delimiters[startDelim.end]

        token = state.tokens[startDelim.token]
        token.type = "s_open"
        token.tag = "s"
        token.nesting = 1
        token.markup = "~~"
        token.content = ""

        token = state.tokens[endDelim.token]
        token.type = "s_close"
        token.tag = "s"
        token.nesting = -1
        token.markup = "~~"
        token.content = ""

        before_close = state.tokens[endDelim.token - 1]
        if before_close.type == "text" and before_close.content == "~":
            loneMarkers.append(endDelim.token - 1)

    # If a marker sequence has an odd number of characters, it's split
    # like this: `~~~~~` -> `~` + `~~` + `~~`, leaving one marker at the
    # start of the sequence.
    #
    # So, we have to move all those markers after subsequent s_close tags.
    #
    while loneMarkers:
        i = loneMarkers.pop()
        j = i + 1

        while (j < len(state.tokens)) and (state.tokens[j].type == "s_close"):
            j += 1

        j -= 1

        if i != j:
            state.tokens[i], state.tokens[j] = state.tokens[j], state.tokens[i]


def postProcess(state: StateInline):
    """Walk through delimiter list and replace text tokens with tags.

    Processes ``state.delimiters`` first, then every delimiter list stored
    under the ``"delimiters"`` key of an entry in ``state.tokens_meta``.
    """
    _postProcess(state, state.delimiters)

    # Entries are either None or a dict; only dicts that carry a delimiter
    # list need processing.
    for curr_meta in state.tokens_meta:
        if curr_meta and "delimiters" in curr_meta:
            _postProcess(state, curr_meta["delimiters"])
# Skip text characters for text token, place those to pending buffer
# and increment current pos

# Rule to skip pure text
# '{}$%@~+=:' reserved for extensions

# !!!! Don't confuse with "Markdown ASCII Punctuation" chars
# http://spec.commonmark.org/0.15/#ascii-punctuation-character

# Character codes that end a run of plain text.
_TERMINATOR_CODES = frozenset(
    (
        0x0A,  # \n
        0x21,  # !
        0x23,  # #
        0x24,  # $
        0x25,  # %
        0x26,  # &
        0x2A,  # *
        0x2B,  # +
        0x2D,  # -
        0x3A,  # :
        0x3C,  # <
        0x3D,  # =
        0x3E,  # >
        0x40,  # @
        0x5B,  # [
        0x5C,  # \
        0x5D,  # ]
        0x5E,  # ^
        0x5F,  # _
        0x60,  # `
        0x7B,  # {
        0x7D,  # }
        0x7E,  # ~
    )
)


def isTerminatorChar(ch):
    """Return True when character code ``ch`` ends a run of plain text."""
    return ch in _TERMINATOR_CODES


def text(state: StateInline, silent: bool, **args):
    """Consume characters from ``state.pos`` up to the next terminator character.

    Returns ``False`` when nothing can be consumed. Otherwise moves
    ``state.pos`` to the terminator (or to ``state.posMax``) and, unless
    ``silent`` is set, appends the consumed slice of ``state.src`` to
    ``state.pending``.
    """
    begin = state.pos
    limit = state.posMax
    codes = state.srcCharCode

    stop = begin
    while stop < limit and not isTerminatorChar(codes[stop]):
        stop += 1

    if stop == begin:
        return False

    if not silent:
        state.pending += state.src[begin:stop]

    state.pos = stop
    return True


def text_collapse(state: StateInline, *args):
    """
    Clean up tokens after emphasis and strikethrough postprocessing:
    merge adjacent text nodes into one and re-calculate all token levels

    This is necessary because initially emphasis delimiter markers (``*, _, ~``)
    are treated as their own separate text tokens. Then emphasis rule either
    leaves them as text (needed to merge with adjacent text) or turns them
    into opening/closing tags (which messes up levels inside).
    """
    tokens = state.tokens
    total = len(tokens)
    depth = 0
    write = 0

    for read in range(total):
        current = tokens[read]

        # re-calculate levels after emphasis/strikethrough turns some text
        # nodes into opening/closing tags
        if current.nesting < 0:
            depth -= 1  # closing tag
        current.level = depth
        if current.nesting > 0:
            depth += 1  # opening tag

        following = tokens[read + 1] if read + 1 < total else None
        if (
            current.type == "text"
            and following is not None
            and following.type == "text"
        ):
            # collapse two adjacent text nodes: fold this one into the next
            following.content = current.content + following.content
            continue

        # keep this token, compacting the list towards the front
        if read != write:
            tokens[write] = current
        write += 1

    if total != write:
        del tokens[write:]
def convert_attrs(value: Any) -> Any:
    """Convert Token.attrs set as ``None`` or ``[[key, value], ...]`` to a dict.

    This improves compatibility with upstream markdown-it.
    Any falsy value becomes an empty dict, a list is passed to ``dict()``,
    and every other value is returned unchanged.
    """
    if not value:
        return {}
    if isinstance(value, list):
        return dict(value)
    return value


@dc.dataclass(**DATACLASS_KWARGS)
class Token:

    type: str
    """Type of the token (string, e.g. "paragraph_open")"""

    tag: str
    """HTML tag name, e.g. 'p'"""

    nesting: int
    """Level change (number in {-1, 0, 1} set), where:
    - `1` means the tag is opening
    - `0` means the tag is self-closing
    - `-1` means the tag is closing
    """

    attrs: dict[str, str | int | float] = dc.field(default_factory=dict)
    """HTML attributes.
    Note this differs from the upstream "list of lists" format,
    although an instance can still be initialised with this format.
    """

    map: list[int] | None = None
    """Source map info. Format: `[ line_begin, line_end ]`"""

    level: int = 0
    """Nesting level, the same as `state.level`"""

    children: list[Token] | None = None
    """Array of child nodes (inline and img tokens)."""

    content: str = ""
    """Inner content, in the case of a self-closing tag (code, html, fence, etc.),"""

    markup: str = ""
    """'*' or '_' for emphasis, fence string for fence, etc."""

    info: str = ""
    """Additional information:
    - Info string for "fence" tokens
    - The value "auto" for autolink "link_open" and "link_close" tokens
    - The string value of the item marker for ordered-list "list_item_open" tokens
    """

    meta: dict = dc.field(default_factory=dict)
    """A place for plugins to store any arbitrary data"""

    block: bool = False
    """True for block-level tokens, false for inline tokens.
    Used in renderer to calculate line breaks
    """

    hidden: bool = False
    """If true, ignore this element when rendering.
    Used for tight lists to hide paragraphs.
    """

    def __post_init__(self):
        # Accept the upstream ``None`` / list-of-pairs forms and store a dict.
        self.attrs = convert_attrs(self.attrs)

    def attrIndex(self, name: str) -> int:
        """Return the insertion index of attribute `name`, or -1 if it is absent.

        Always emits a ``UserWarning``, since attrs are stored in a dict.
        """
        warnings.warn(
            "Token.attrIndex should not be used, since Token.attrs is a dictionary",
            UserWarning,
        )
        if name not in self.attrs:
            return -1
        return list(self.attrs.keys()).index(name)

    def attrItems(self) -> list[tuple[str, str | int | float]]:
        """Get (key, value) list of attrs."""
        return list(self.attrs.items())

    def attrPush(self, attrData: tuple[str, str | int | float]) -> None:
        """Add `[ name, value ]` attribute to list. Init attrs if necessary."""
        name, value = attrData
        self.attrSet(name, value)

    def attrSet(self, name: str, value: str | int | float) -> None:
        """Set `name` attribute to `value`. Override old value if exists."""
        self.attrs[name] = value

    def attrGet(self, name: str) -> None | str | int | float:
        """Get the value of attribute `name`, or null if it does not exist."""
        return self.attrs.get(name, None)

    def attrJoin(self, name: str, value: str) -> None:
        """Join value to existing attribute via space.
        Or create new attribute if not exists.
        Useful to operate with token classes.

        :raises TypeError: if the existing value of `name` is not a string
        """
        if name in self.attrs:
            current = self.attrs[name]
            if not isinstance(current, str):
                # Report the actual attribute name, not the literal 'name'.
                raise TypeError(f"existing attr '{name}' is not a str: {current}")
            self.attrs[name] = f"{current} {value}"
        else:
            self.attrs[name] = value

    def copy(self, **changes: Any) -> Token:
        """Return a shallow copy of the instance, with `changes` applied as
        field overrides."""
        return dc.replace(self, **changes)

    def as_dict(
        self,
        *,
        children: bool = True,
        as_upstream: bool = True,
        meta_serializer: Callable[[dict], Any] | None = None,
        filter: Callable[[str, Any], bool] | None = None,
        dict_factory: Callable[..., MutableMapping[str, Any]] = dict,
    ) -> MutableMapping[str, Any]:
        """Return the token as a dictionary.

        :param children: Also convert children to dicts
        :param as_upstream: Ensure the output dictionary is equal to that created by markdown-it
            For example, attrs are converted to null or lists
        :param meta_serializer: hook for serializing ``Token.meta``
        :param filter: A callable whose return code determines whether an
            attribute or element is included (``True``) or dropped (``False``).
            Is called with the (key, value) pair.
        :param dict_factory: A callable to produce dictionaries from.
            For example, to produce ordered dictionaries instead of normal Python
            dictionaries, pass in ``collections.OrderedDict``.

        """
        mapping = dict_factory((f.name, getattr(self, f.name)) for f in dc.fields(self))
        if filter:
            mapping = dict_factory((k, v) for k, v in mapping.items() if filter(k, v))
        if as_upstream and "attrs" in mapping:
            # upstream stores attrs as ``null`` or a list of [key, value] pairs
            mapping["attrs"] = (
                None
                if not mapping["attrs"]
                else [[k, v] for k, v in mapping["attrs"].items()]
            )
        if meta_serializer and "meta" in mapping:
            mapping["meta"] = meta_serializer(mapping["meta"])
        if children and mapping.get("children", None):
            mapping["children"] = [
                child.as_dict(
                    children=children,
                    filter=filter,
                    dict_factory=dict_factory,
                    as_upstream=as_upstream,
                    meta_serializer=meta_serializer,
                )
                for child in mapping["children"]
            ]
        return mapping

    @classmethod
    def from_dict(cls, dct: MutableMapping[str, Any]) -> Token:
        """Convert a dict to a Token, converting child dicts recursively."""
        token = cls(**dct)
        if token.children:
            token.children = [cls.from_dict(c) for c in token.children]  # type: ignore[arg-type]
        return token
class _NesterTokens(NamedTuple):
    """The opening and closing `Token` pair that brackets a container node."""

    opening: Token
    closing: Token


_NodeType = TypeVar("_NodeType", bound="SyntaxTreeNode")


class SyntaxTreeNode:
    """A Markdown syntax tree node.

    A class that can be used to construct a tree representation of a linear
    `markdown-it-py` token stream.

    Each node in the tree represents either:
      - root of the Markdown document
      - a single unnested `Token`
      - a `Token` "_open" and "_close" token pair, and the tokens nested in
          between
    """

    def __init__(
        self, tokens: Sequence[Token] = (), *, create_root: bool = True
    ) -> None:
        """Initialize a `SyntaxTreeNode` from a token stream.

        If `create_root` is True, create a root node for the document.
        Otherwise `tokens` must hold either exactly one unnested token or a
        full opening..closing sequence; a `ValueError` is raised for an empty
        sequence or for a single token with non-zero nesting.
        """
        # Only nodes representing an unnested token have self.token
        self.token: Token | None = None

        # Only containers have nester tokens
        self.nester_tokens: _NesterTokens | None = None

        # Root node does not have self.parent
        self._parent: Any = None

        # Empty list unless a non-empty container, or unnested token that has
        # children (i.e. inline or img)
        self._children: list = []

        if create_root:
            self._set_children_from_tokens(tokens)
            return

        if not tokens:
            raise ValueError(
                "Can only create root from empty token sequence."
                " Set `create_root=True`."
            )

        if len(tokens) > 1:
            # container: first and last tokens bracket the nested content
            self.nester_tokens = _NesterTokens(tokens[0], tokens[-1])
            self._set_children_from_tokens(tokens[1:-1])
            return

        lone_token = tokens[0]
        if lone_token.nesting:
            raise ValueError(
                "Unequal nesting level at the start and end of token stream."
            )
        self.token = lone_token
        if lone_token.children:
            self._set_children_from_tokens(lone_token.children)

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self.type})"

    @overload
    def __getitem__(self: _NodeType, item: int) -> _NodeType:
        ...

    @overload
    def __getitem__(self: _NodeType, item: slice) -> list[_NodeType]:
        ...

    def __getitem__(self: _NodeType, item: int | slice) -> _NodeType | list[_NodeType]:
        return self.children[item]

    def to_tokens(self: _NodeType) -> list[Token]:
        """Recover the linear token stream."""
        collected: list[Token] = []

        def visit(node: _NodeType) -> None:
            # root contributes only its children; a leaf contributes its token;
            # a container contributes opening, children, closing.
            if node.type == "root":
                for child in node.children:
                    visit(child)
            elif node.token:
                collected.append(node.token)
            else:
                assert node.nester_tokens
                collected.append(node.nester_tokens.opening)
                for child in node.children:
                    visit(child)
                collected.append(node.nester_tokens.closing)

        visit(self)
        return collected

    @property
    def children(self: _NodeType) -> list[_NodeType]:
        return self._children

    @children.setter
    def children(self: _NodeType, value: list[_NodeType]) -> None:
        self._children = value

    @property
    def parent(self: _NodeType) -> _NodeType | None:
        return self._parent

    @parent.setter
    def parent(self: _NodeType, value: _NodeType | None) -> None:
        self._parent = value

    @property
    def is_root(self) -> bool:
        """Is the node a special root node?"""
        return not (self.token or self.nester_tokens)

    @property
    def is_nested(self) -> bool:
        """Is this node nested?.

        Returns `True` if the node represents a `Token` pair and tokens in the
        sequence between them, where `Token.nesting` of the first `Token` in
        the pair is 1 and nesting of the other `Token` is -1.
        """
        return bool(self.nester_tokens)

    @property
    def siblings(self: _NodeType) -> Sequence[_NodeType]:
        """Get siblings of the node.

        Gets the whole group of siblings, including self.
        """
        if not self.parent:
            return [self]
        return self.parent.children

    @property
    def type(self) -> str:
        """Get a string type of the represented syntax.

        - "root" for root nodes
        - `Token.type` if the node represents an unnested token
        - `Token.type` of the opening token, with "_open" suffix stripped, if
            the node represents a nester token pair
        """
        if self.is_root:
            return "root"
        if self.token:
            return self.token.type
        assert self.nester_tokens
        return _removesuffix(self.nester_tokens.opening.type, "_open")

    @property
    def next_sibling(self: _NodeType) -> _NodeType | None:
        """Get the next node in the sequence of siblings.

        Returns `None` if this is the last sibling.
        """
        group = self.siblings
        following = group.index(self) + 1
        return group[following] if following < len(group) else None

    @property
    def previous_sibling(self: _NodeType) -> _NodeType | None:
        """Get the previous node in the sequence of siblings.

        Returns `None` if this is the first sibling.
        """
        group = self.siblings
        preceding = group.index(self) - 1
        return group[preceding] if preceding >= 0 else None

    def _add_child(
        self,
        tokens: Sequence[Token],
    ) -> None:
        """Make a child node for `self`."""
        node = type(self)(tokens, create_root=False)
        node.parent = self
        self.children.append(node)

    def _set_children_from_tokens(self, tokens: Sequence[Token]) -> None:
        """Convert the token stream to a tree structure and set the resulting
        nodes as children of `self`."""
        stream = iter(tokens)
        for token in stream:
            if not token.nesting:
                self._add_child([token])
                continue
            if token.nesting != 1:
                raise ValueError("Invalid token nesting")

            # Gather everything up to the token that brings nesting back to 0.
            group = [token]
            depth = 1
            for inner in stream:
                group.append(inner)
                depth += inner.nesting
                if not depth:
                    break
            if depth:
                raise ValueError(f"unclosed tokens starting {group[0]}")

            self._add_child(group)

    def pretty(
        self, *, indent: int = 2, show_text: bool = False, _current: int = 0
    ) -> str:
        """Create an XML style string of the tree."""
        pad = " " * _current
        pieces = [f"{pad}<{self.type}"]
        if not self.is_root and self.attrs:
            pieces.append(" " + " ".join(f"{k}={v!r}" for k, v in self.attrs.items()))
        pieces.append(">")
        if show_text and not self.is_root and self.type == "text" and self.content:
            pieces.append("\n" + textwrap.indent(self.content, pad + " " * indent))
        for child in self.children:
            pieces.append(
                "\n"
                + child.pretty(
                    indent=indent, show_text=show_text, _current=_current + indent
                )
            )
        return "".join(pieces)

    def walk(
        self: _NodeType, *, include_self: bool = True
    ) -> Generator[_NodeType, None, None]:
        """Recursively yield all descendant nodes in the tree starting at self.

        The order mimics the order of the underlying linear token
        stream (i.e. depth first).
        """
        if include_self:
            yield self
        for child in self.children:
            yield from child.walk(include_self=True)

    # NOTE:
    # The values of the properties defined below directly map to properties
    # of the underlying `Token`s. A root node does not translate to a `Token`
    # object, so calling these property getters on a root node will raise an
    # `AttributeError`.
    #
    # There is no mapping for `Token.nesting` because the `is_nested` property
    # provides that data, and can be called on any node type, including root.

    def _attribute_token(self) -> Token:
        """Return the `Token` that is used as the data source for the
        properties defined below."""
        if self.token:
            return self.token
        if self.nester_tokens:
            return self.nester_tokens.opening
        raise AttributeError("Root node does not have the accessed attribute")

    @property
    def tag(self) -> str:
        """HTML tag name, e.g. 'p'"""
        return self._attribute_token().tag

    @property
    def attrs(self) -> dict[str, str | int | float]:
        """HTML attributes."""
        return self._attribute_token().attrs

    def attrGet(self, name: str) -> None | str | int | float:
        """Get the value of attribute `name`, or null if it does not exist."""
        return self._attribute_token().attrGet(name)

    @property
    def map(self) -> tuple[int, int] | None:
        """Source map info. Format: `tuple[ line_begin, line_end ]`"""
        source_map = self._attribute_token().map
        if not source_map:
            return None
        # Type ignore because `Token`s attribute types are not perfect
        return tuple(source_map)  # type: ignore

    @property
    def level(self) -> int:
        """Nesting level, the same as `state.level`"""
        return self._attribute_token().level

    @property
    def content(self) -> str:
        """In a case of self-closing tag (code, html, fence, etc.), it
        has contents of this tag."""
        return self._attribute_token().content

    @property
    def markup(self) -> str:
        """'*' or '_' for emphasis, fence string for fence, etc."""
        return self._attribute_token().markup

    @property
    def info(self) -> str:
        """Fence infostring"""
        return self._attribute_token().info

    @property
    def meta(self) -> dict:
        """A place for plugins to store an arbitrary data."""
        return self._attribute_token().meta

    @property
    def block(self) -> bool:
        """True for block-level tokens, false for inline tokens."""
        return self._attribute_token().block

    @property
    def hidden(self) -> bool:
        """If it's true, ignore this element when rendering.
        Used for tight lists to hide paragraphs."""
        return self._attribute_token().hidden
+ Used for tight lists to hide paragraphs.""" + return self._attribute_token().hidden diff --git a/markdown_it/utils.py b/markdown_it/utils.py new file mode 100644 index 0000000..2ba2995 --- /dev/null +++ b/markdown_it/utils.py @@ -0,0 +1,122 @@ +from __future__ import annotations + +from collections.abc import Callable +from pathlib import Path + + +class OptionsDict(dict): + """A dictionary, with attribute access to core markdownit configuration options.""" + + @property + def maxNesting(self) -> int: + """Internal protection, recursion limit.""" + return self["maxNesting"] + + @maxNesting.setter + def maxNesting(self, value: int): + self["maxNesting"] = value + + @property + def html(self) -> bool: + """Enable HTML tags in source.""" + return self["html"] + + @html.setter + def html(self, value: bool): + self["html"] = value + + @property + def linkify(self) -> bool: + """Enable autoconversion of URL-like texts to links.""" + return self["linkify"] + + @linkify.setter + def linkify(self, value: bool): + self["linkify"] = value + + @property + def typographer(self) -> bool: + """Enable smartquotes and replacements.""" + return self["typographer"] + + @typographer.setter + def typographer(self, value: bool): + self["typographer"] = value + + @property + def quotes(self) -> str: + """Quote characters.""" + return self["quotes"] + + @quotes.setter + def quotes(self, value: str): + self["quotes"] = value + + @property + def xhtmlOut(self) -> bool: + """Use '/' to close single tags (<br />).""" + return self["xhtmlOut"] + + @xhtmlOut.setter + def xhtmlOut(self, value: bool): + self["xhtmlOut"] = value + + @property + def breaks(self) -> bool: + """Convert newlines in paragraphs into <br>.""" + return self["breaks"] + + @breaks.setter + def breaks(self, value: bool): + self["breaks"] = value + + @property + def langPrefix(self) -> str: + """CSS language prefix for fenced blocks.""" + return self["langPrefix"] + + @langPrefix.setter + def langPrefix(self, value: str): + 
self["langPrefix"] = value + + @property + def highlight(self) -> Callable[[str, str, str], str] | None: + """Highlighter function: (content, langName, langAttrs) -> escaped HTML.""" + return self["highlight"] + + @highlight.setter + def highlight(self, value: Callable[[str, str, str], str] | None): + self["highlight"] = value + + +def read_fixture_file(path: str | Path) -> list[list]: + text = Path(path).read_text(encoding="utf-8") + tests = [] + section = 0 + last_pos = 0 + lines = text.splitlines(keepends=True) + for i in range(len(lines)): + if lines[i].rstrip() == ".": + if section == 0: + tests.append([i, lines[i - 1].strip()]) + section = 1 + elif section == 1: + tests[-1].append("".join(lines[last_pos + 1 : i])) + section = 2 + elif section == 2: + tests[-1].append("".join(lines[last_pos + 1 : i])) + section = 0 + + last_pos = i + return tests + + +def _removesuffix(string: str, suffix: str) -> str: + """Remove a suffix from a string. + + Replace this with str.removesuffix() from stdlib when minimum Python + version is 3.9. + """ + if suffix and string.endswith(suffix): + return string[: -len(suffix)] + return string |