author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-29 04:24:24 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-29 04:24:24 +0000
commit     12e8343068b906f8b2afddc5569968a8a91fa5b0 (patch)
tree       75cc5e05a4392ea0292251898f992a15a16b172b /markdown_it/rules_core
parent     Initial commit. (diff)
Adding upstream version 2.1.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'markdown_it/rules_core')
-rw-r--r--   markdown_it/rules_core/__init__.py        17
-rw-r--r--   markdown_it/rules_core/block.py            16
-rw-r--r--   markdown_it/rules_core/inline.py           10
-rw-r--r--   markdown_it/rules_core/linkify.py         141
-rw-r--r--   markdown_it/rules_core/normalize.py        19
-rw-r--r--   markdown_it/rules_core/replacements.py    125
-rw-r--r--   markdown_it/rules_core/smartquotes.py     202
-rw-r--r--   markdown_it/rules_core/state_core.py       25
8 files changed, 555 insertions, 0 deletions
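For orientation before the per-file diffs: these eight files implement the core rule chain that markdown-it-py runs over the whole token stream, between block-level and inline-level parsing. A minimal sketch of exercising the chain, assuming markdown-it-py 2.1.0 is installed (plus the optional linkify-it-py package for linkify); the input string is illustrative only:

```python
from markdown_it import MarkdownIt

# Core rules run in registration order: normalize, block, inline,
# then the option-gated linkify, replacements and smartquotes.
md = MarkdownIt("commonmark", {"typographer": True, "linkify": True})
md.enable(["replacements", "smartquotes", "linkify"])

tokens = md.parse('"Smart" quotes, (c) marks, and http://commonmark.org')
print([t.type for t in tokens])  # paragraph_open, inline, paragraph_close
```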
diff --git a/markdown_it/rules_core/__init__.py b/markdown_it/rules_core/__init__.py
new file mode 100644
index 0000000..f80034c
--- /dev/null
+++ b/markdown_it/rules_core/__init__.py
@@ -0,0 +1,17 @@
+__all__ = (
+    "StateCore",
+    "normalize",
+    "block",
+    "inline",
+    "replace",
+    "smartquotes",
+    "linkify",
+)
+
+from .block import block
+from .inline import inline
+from .linkify import linkify
+from .normalize import normalize
+from .replacements import replace
+from .smartquotes import smartquotes
+from .state_core import StateCore
diff --git a/markdown_it/rules_core/block.py b/markdown_it/rules_core/block.py
new file mode 100644
index 0000000..fa1c52c
--- /dev/null
+++ b/markdown_it/rules_core/block.py
@@ -0,0 +1,16 @@
+from ..token import Token
+from .state_core import StateCore
+
+
+def block(state: StateCore) -> None:
+
+    if state.inlineMode:
+        token = Token("inline", "", 0)
+        token.content = state.src
+        token.map = [0, 1]
+        token.children = []
+        state.tokens.append(token)
+    else:
+        state.md.block.parse(
+            state.src, state.md, state.env, state.tokens, state.srcCharCode
+        )
diff --git a/markdown_it/rules_core/inline.py b/markdown_it/rules_core/inline.py
new file mode 100644
index 0000000..c3fd0b5
--- /dev/null
+++ b/markdown_it/rules_core/inline.py
@@ -0,0 +1,10 @@
+from .state_core import StateCore
+
+
+def inline(state: StateCore) -> None:
+    """Parse inlines"""
+    for token in state.tokens:
+        if token.type == "inline":
+            if token.children is None:
+                token.children = []
+            state.md.inline.parse(token.content, state.md, state.env, token.children)
diff --git a/markdown_it/rules_core/linkify.py b/markdown_it/rules_core/linkify.py
new file mode 100644
index 0000000..49bb4ef
--- /dev/null
+++ b/markdown_it/rules_core/linkify.py
@@ -0,0 +1,141 @@
+import re
+
+from ..common.utils import arrayReplaceAt
+from ..token import Token
+from .state_core import StateCore
+
+LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
+LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)
+
+HTTP_RE = re.compile(r"^http://")
+MAILTO_RE = re.compile(r"^mailto:")
+TEST_MAILTO_RE = re.compile(r"^mailto:", flags=re.IGNORECASE)
+
+
+def isLinkOpen(string: str) -> bool:
+    return bool(LINK_OPEN_RE.search(string))
+
+
+def isLinkClose(string: str) -> bool:
+    return bool(LINK_CLOSE_RE.search(string))
+
+
+def linkify(state: StateCore) -> None:
+    blockTokens = state.tokens
+
+    if not state.md.options.linkify:
+        return
+
+    if not state.md.linkify:
+        raise ModuleNotFoundError("Linkify enabled but not installed.")
+
+    for j in range(len(blockTokens)):
+        if blockTokens[j].type != "inline" or not state.md.linkify.pretest(
+            blockTokens[j].content
+        ):
+            continue
+
+        tokens = blockTokens[j].children
+
+        htmlLinkLevel = 0
+
+        # We scan from the end, to keep position when new tags added.
+        # Use reversed logic in links start/end match
+        assert tokens is not None
+        i = len(tokens)
+        while i >= 1:
+            i -= 1
+            assert isinstance(tokens, list)
+            currentToken = tokens[i]
+
+            # Skip content of markdown links
+            if currentToken.type == "link_close":
+                i -= 1
+                while (
+                    tokens[i].level != currentToken.level
+                    and tokens[i].type != "link_open"
+                ):
+                    i -= 1
+                continue
+
+            # Skip content of html tag links
+            if currentToken.type == "html_inline":
+                if isLinkOpen(currentToken.content) and htmlLinkLevel > 0:
+                    htmlLinkLevel -= 1
+                if isLinkClose(currentToken.content):
+                    htmlLinkLevel += 1
+            if htmlLinkLevel > 0:
+                continue
+
+            if currentToken.type == "text" and state.md.linkify.test(
+                currentToken.content
+            ):
+                text = currentToken.content
+                links = state.md.linkify.match(text)
+
+                # Now split string to nodes
+                nodes = []
+                level = currentToken.level
+                lastPos = 0
+
+                for ln in range(len(links)):
+                    url = links[ln].url
+                    fullUrl = state.md.normalizeLink(url)
+                    if not state.md.validateLink(fullUrl):
+                        continue
+
+                    urlText = links[ln].text
+
+                    # Linkifier might send raw hostnames like "example.com", where url
+                    # starts with domain name. So we prepend http:// in those cases,
+                    # and remove it afterwards.
+                    if not links[ln].schema:
+                        urlText = HTTP_RE.sub(
+                            "", state.md.normalizeLinkText("http://" + urlText)
+                        )
+                    elif links[ln].schema == "mailto:" and TEST_MAILTO_RE.search(
+                        urlText
+                    ):
+                        urlText = MAILTO_RE.sub(
+                            "", state.md.normalizeLinkText("mailto:" + urlText)
+                        )
+                    else:
+                        urlText = state.md.normalizeLinkText(urlText)
+
+                    pos = links[ln].index
+
+                    if pos > lastPos:
+                        token = Token("text", "", 0)
+                        token.content = text[lastPos:pos]
+                        token.level = level
+                        nodes.append(token)
+
+                    token = Token("link_open", "a", 1)
+                    token.attrs = {"href": fullUrl}
+                    token.level = level
+                    level += 1
+                    token.markup = "linkify"
+                    token.info = "auto"
+                    nodes.append(token)
+
+                    token = Token("text", "", 0)
+                    token.content = urlText
+                    token.level = level
+                    nodes.append(token)
+
+                    token = Token("link_close", "a", -1)
+                    level -= 1
+                    token.level = level
+                    token.markup = "linkify"
+                    token.info = "auto"
+                    nodes.append(token)
+
+                    lastPos = links[ln].last_index
+
+                if lastPos < len(text):
+                    token = Token("text", "", 0)
+                    token.content = text[lastPos:]
+                    token.level = level
+                    nodes.append(token)
+
+                blockTokens[j].children = tokens = arrayReplaceAt(tokens, i, nodes)
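A usage sketch for the linkify rule above, assuming the optional linkify-it-py dependency is installed (sample input chosen for illustration):

```python
from markdown_it import MarkdownIt

# Without linkify-it-py installed, the rule raises the
# ModuleNotFoundError above at parse time.
md = MarkdownIt("commonmark", {"linkify": True}).enable("linkify")

# Bare URLs in text tokens are split into link_open / text / link_close
# triples carrying markup="linkify" and info="auto".
print(md.render("See http://commonmark.org and example.com for details."))
```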
diff --git a/markdown_it/rules_core/normalize.py b/markdown_it/rules_core/normalize.py
new file mode 100644
index 0000000..bf16fd7
--- /dev/null
+++ b/markdown_it/rules_core/normalize.py
@@ -0,0 +1,19 @@
+"""Normalize input string."""
+import re
+
+from .state_core import StateCore
+
+# https://spec.commonmark.org/0.29/#line-ending
+NEWLINES_RE = re.compile(r"\r\n?|\n")
+NULL_RE = re.compile(r"\0")
+
+
+def normalize(state: StateCore) -> None:
+
+    # Normalize newlines
+    string = NEWLINES_RE.sub("\n", state.src)
+
+    # Replace NULL characters
+    string = NULL_RE.sub("\uFFFD", string)
+
+    state.src = string
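A small check of what the normalize rule guarantees to everything downstream: line endings collapse to "\n" and NUL becomes U+FFFD (sketch; contrived input):

```python
from markdown_it import MarkdownIt

md = MarkdownIt()
tokens = md.parse("line one\r\nline two\rstray \x00 null")

# tokens[1] is the paragraph's inline token; after normalize() its
# content joins lines with "\n" and carries U+FFFD instead of NUL.
assert "\r" not in tokens[1].content
assert "\uFFFD" in tokens[1].content
```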
diff --git a/markdown_it/rules_core/replacements.py b/markdown_it/rules_core/replacements.py
new file mode 100644
index 0000000..45377d3
--- /dev/null
+++ b/markdown_it/rules_core/replacements.py
@@ -0,0 +1,125 @@
+"""Simple typographic replacements
+
+* ``(c)``, ``(C)`` → ©
+* ``(tm)``, ``(TM)`` → ™
+* ``(r)``, ``(R)`` → ®
+* ``(p)``, ``(P)`` → §
+* ``+-`` → ±
+* ``...`` → …
+* ``?....`` → ?..
+* ``!....`` → !..
+* ``????????`` → ???
+* ``!!!!!`` → !!!
+* ``,,,`` → ,
+* ``--`` → – (en dash)
+* ``---`` → — (em dash)
+"""
+from __future__ import annotations
+
+import logging
+import re
+
+from ..token import Token
+from .state_core import StateCore
+
+LOGGER = logging.getLogger(__name__)
+
+# TODO:
+# - fractionals 1/2, 1/4, 3/4 -> ½, ¼, ¾
+# - multiplication 2 x 4 -> 2 × 4
+
+RARE_RE = re.compile(r"\+-|\.\.|\?\?\?\?|!!!!|,,|--")
+
+# Workaround for phantomjs - need regex without /g flag,
+# or root check will fail every second time
+# SCOPED_ABBR_TEST_RE = r"\((c|tm|r|p)\)"
+
+SCOPED_ABBR_RE = re.compile(r"\((c|tm|r|p)\)", flags=re.IGNORECASE)
+
+PLUS_MINUS_RE = re.compile(r"\+-")
+
+ELLIPSIS_RE = re.compile(r"\.{2,}")
+
+ELLIPSIS_QUESTION_EXCLAMATION_RE = re.compile(r"([?!])…")
+
+QUESTION_EXCLAMATION_RE = re.compile(r"([?!]){4,}")
+
+COMMA_RE = re.compile(r",{2,}")
+
+EM_DASH_RE = re.compile(r"(^|[^-])---(?=[^-]|$)", flags=re.MULTILINE)
+
+EN_DASH_RE = re.compile(r"(^|\s)--(?=\s|$)", flags=re.MULTILINE)
+
+EN_DASH_INDENT_RE = re.compile(r"(^|[^-\s])--(?=[^-\s]|$)", flags=re.MULTILINE)
+
+
+SCOPED_ABBR = {"c": "©", "r": "®", "p": "§", "tm": "™"}
+
+
+def replaceFn(match: re.Match[str]):
+    return SCOPED_ABBR[match.group(1).lower()]
+
+
+def replace_scoped(inlineTokens: list[Token]) -> None:
+    inside_autolink = 0
+
+    for token in inlineTokens:
+        if token.type == "text" and not inside_autolink:
+            token.content = SCOPED_ABBR_RE.sub(replaceFn, token.content)
+
+        if token.type == "link_open" and token.info == "auto":
+            inside_autolink -= 1
+
+        if token.type == "link_close" and token.info == "auto":
+            inside_autolink += 1
+
+
+def replace_rare(inlineTokens: list[Token]) -> None:
+    inside_autolink = 0
+
+    for token in inlineTokens:
+        if token.type == "text" and not inside_autolink:
+            if RARE_RE.search(token.content):
+                # +- -> ±
+                token.content = PLUS_MINUS_RE.sub("±", token.content)
+
+                # .., ..., ....... -> …
+                token.content = ELLIPSIS_RE.sub("…", token.content)
+
+                # but ?..... & !..... -> ?.. & !..
+                token.content = ELLIPSIS_QUESTION_EXCLAMATION_RE.sub(
+                    "\\1..", token.content
+                )
+                token.content = QUESTION_EXCLAMATION_RE.sub("\\1\\1\\1", token.content)
+
+                # ,, ,,, ,,,, -> ,
+                token.content = COMMA_RE.sub(",", token.content)
+
+                # em-dash
+                token.content = EM_DASH_RE.sub("\\1\u2014", token.content)
+
+                # en-dash
+                token.content = EN_DASH_RE.sub("\\1\u2013", token.content)
+                token.content = EN_DASH_INDENT_RE.sub("\\1\u2013", token.content)
+
+        if token.type == "link_open" and token.info == "auto":
+            inside_autolink -= 1
+
+        if token.type == "link_close" and token.info == "auto":
+            inside_autolink += 1
+
+
+def replace(state: StateCore) -> None:
+    if not state.md.options.typographer:
+        return
+
+    for token in state.tokens:
+        if token.type != "inline":
+            continue
+        assert token.children is not None
+
+        if SCOPED_ABBR_RE.search(token.content):
+            replace_scoped(token.children)
+
+        if RARE_RE.search(token.content):
+            replace_rare(token.children)
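The replacements rule only runs with the typographer option on and the rule enabled; a minimal sketch, with the expected output shown approximately:

```python
from markdown_it import MarkdownIt

md = MarkdownIt("commonmark", {"typographer": True}).enable("replacements")

# (c) -> ©, spaced -- -> en dash, trailing dots -> ellipsis
print(md.render("(c) 2024 -- see the docs..."))
# roughly: <p>© 2024 – see the docs…</p>
```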
diff --git a/markdown_it/rules_core/smartquotes.py b/markdown_it/rules_core/smartquotes.py
new file mode 100644
index 0000000..93f8be2
--- /dev/null
+++ b/markdown_it/rules_core/smartquotes.py
@@ -0,0 +1,202 @@
+"""Convert straight quotation marks to typographic ones
+"""
+from __future__ import annotations
+
+import re
+from typing import Any
+
+from ..common.utils import charCodeAt, isMdAsciiPunct, isPunctChar, isWhiteSpace
+from ..token import Token
+from .state_core import StateCore
+
+QUOTE_TEST_RE = re.compile(r"['\"]")
+QUOTE_RE = re.compile(r"['\"]")
+APOSTROPHE = "\u2019"  # ’
+
+
+def replaceAt(string: str, index: int, ch: str) -> str:
+    # When the index is negative, the behavior is different from the js version.
+    # But basically, the index will not be negative.
+    assert index >= 0
+    return string[:index] + ch + string[index + 1 :]
+
+
+def process_inlines(tokens: list[Token], state: StateCore) -> None:
+    stack: list[dict[str, Any]] = []
+
+    for i in range(len(tokens)):
+        token = tokens[i]
+
+        thisLevel = token.level
+
+        j = 0
+        for j in range(len(stack))[::-1]:
+            if stack[j]["level"] <= thisLevel:
+                break
+        else:
+            # When the loop is terminated without a "break".
+            # Subtract 1 to get the same index as the js version.
+            j -= 1
+
+        stack = stack[: j + 1]
+
+        if token.type != "text":
+            continue
+
+        text = token.content
+        pos = 0
+        maximum = len(text)
+
+        while pos < maximum:
+            goto_outer = False
+            lastIndex = pos
+            t = QUOTE_RE.search(text[lastIndex:])
+            if not t:
+                break
+
+            canOpen = canClose = True
+            pos = t.start(0) + lastIndex + 1
+            isSingle = t.group(0) == "'"
+
+            # Find previous character,
+            # default to space if it's the beginning of the line
+            lastChar = 0x20
+
+            if t.start(0) + lastIndex - 1 >= 0:
+                lastChar = charCodeAt(text, t.start(0) + lastIndex - 1)
+            else:
+                for j in range(i)[::-1]:
+                    # lastChar defaults to 0x20
+                    if tokens[j].type == "softbreak" or tokens[j].type == "hardbreak":
+                        break
+                    # should skip all tokens except 'text', 'html_inline' or 'code_inline'
+                    if not tokens[j].content:
+                        continue
+
+                    lastChar = charCodeAt(tokens[j].content, len(tokens[j].content) - 1)
+                    break
+
+            # Find next character,
+            # default to space if it's the end of the line
+            nextChar = 0x20
+
+            if pos < maximum:
+                nextChar = charCodeAt(text, pos)
+            else:
+                for j in range(i + 1, len(tokens)):
+                    # nextChar defaults to 0x20
+                    if tokens[j].type == "softbreak" or tokens[j].type == "hardbreak":
+                        break
+                    # should skip all tokens except 'text', 'html_inline' or 'code_inline'
+                    if not tokens[j].content:
+                        continue
+
+                    nextChar = charCodeAt(tokens[j].content, 0)
+                    break
+
+            isLastPunctChar = isMdAsciiPunct(lastChar) or isPunctChar(chr(lastChar))
+            isNextPunctChar = isMdAsciiPunct(nextChar) or isPunctChar(chr(nextChar))
+
+            isLastWhiteSpace = isWhiteSpace(lastChar)
+            isNextWhiteSpace = isWhiteSpace(nextChar)
+
+            if isNextWhiteSpace:
+                canOpen = False
+            elif isNextPunctChar:
+                if not (isLastWhiteSpace or isLastPunctChar):
+                    canOpen = False
+
+            if isLastWhiteSpace:
+                canClose = False
+            elif isLastPunctChar:
+                if not (isNextWhiteSpace or isNextPunctChar):
+                    canClose = False
+
+            if nextChar == 0x22 and t.group(0) == '"':  # 0x22: "
+                if lastChar >= 0x30 and lastChar <= 0x39:  # 0x30: 0, 0x39: 9
+                    # special case: 1"" - count first quote as an inch
+                    canClose = canOpen = False
+
+            if canOpen and canClose:
+                # Replace quotes in the middle of punctuation sequence, but not
+                # in the middle of the words, i.e.:
+                #
+                # 1. foo " bar " baz - not replaced
+                # 2. foo-"-bar-"-baz - replaced
+                # 3. foo"bar"baz     - not replaced
+                canOpen = isLastPunctChar
+                canClose = isNextPunctChar
+
+            if not canOpen and not canClose:
+                # middle of word
+                if isSingle:
+                    token.content = replaceAt(
+                        token.content, t.start(0) + lastIndex, APOSTROPHE
+                    )
+                continue
+
+            if canClose:
+                # this could be a closing quote, rewind the stack to get a match
+                for j in range(len(stack))[::-1]:
+                    item = stack[j]
+                    if stack[j]["level"] < thisLevel:
+                        break
+                    if item["single"] == isSingle and stack[j]["level"] == thisLevel:
+                        item = stack[j]
+
+                        if isSingle:
+                            openQuote = state.md.options.quotes[2]
+                            closeQuote = state.md.options.quotes[3]
+                        else:
+                            openQuote = state.md.options.quotes[0]
+                            closeQuote = state.md.options.quotes[1]
+
+                        # replace token.content *before* tokens[item.token].content,
+                        # because, if they are pointing at the same token, replaceAt
+                        # could mess up indices when quote length != 1
+                        token.content = replaceAt(
+                            token.content, t.start(0) + lastIndex, closeQuote
+                        )
+                        tokens[item["token"]].content = replaceAt(
+                            tokens[item["token"]].content, item["pos"], openQuote
+                        )
+
+                        pos += len(closeQuote) - 1
+                        if item["token"] == i:
+                            pos += len(openQuote) - 1
+
+                        text = token.content
+                        maximum = len(text)
+
+                        stack = stack[:j]
+                        goto_outer = True
+                        break
+                if goto_outer:
+                    goto_outer = False
+                    continue
+
+            if canOpen:
+                stack.append(
+                    {
+                        "token": i,
+                        "pos": t.start(0) + lastIndex,
+                        "single": isSingle,
+                        "level": thisLevel,
+                    }
+                )
+            elif canClose and isSingle:
+                token.content = replaceAt(
+                    token.content, t.start(0) + lastIndex, APOSTROPHE
+                )
+
+
+def smartquotes(state: StateCore) -> None:
+    if not state.md.options.typographer:
+        return
+
+    for token in state.tokens:
+        if token.type != "inline" or not QUOTE_RE.search(token.content):
+            continue
+        assert token.children is not None
+        process_inlines(token.children, state)
diff --git a/markdown_it/rules_core/state_core.py b/markdown_it/rules_core/state_core.py
new file mode 100644
index 0000000..15b7c60
--- /dev/null
+++ b/markdown_it/rules_core/state_core.py
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from collections.abc import MutableMapping
+from typing import TYPE_CHECKING
+
+from ..ruler import StateBase
+from ..token import Token
+
+if TYPE_CHECKING:
+    from markdown_it import MarkdownIt
+
+
+class StateCore(StateBase):
+    def __init__(
+        self,
+        src: str,
+        md: MarkdownIt,
+        env: MutableMapping,
+        tokens: list[Token] | None = None,
+    ):
+        self.src = src
+        self.md = md  # link to parser instance
+        self.env = env
+        self.tokens: list[Token] = tokens or []
+        self.inlineMode = False
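Finally, a sketch for smartquotes, which is gated the same way as replacements and reads its quote characters from options["quotes"] (double-quote pair first, then single-quote pair); output shown approximately:

```python
from markdown_it import MarkdownIt

md = MarkdownIt("commonmark", {"typographer": True}).enable("smartquotes")

# Opening/closing pairs resolve via the stack in process_inlines();
# lone apostrophes fall back to U+2019.
print(md.render("\"Isn't this 'neat'?\""))
# roughly: <p>“Isn’t this ‘neat’?”</p>

# The quotes option is a four-character sequence, e.g. German-style:
md_de = MarkdownIt(
    "commonmark", {"typographer": True, "quotes": "„“‚‘"}
).enable("smartquotes")
```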