author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-29 04:24:24 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-29 04:24:24 +0000
commit     12e8343068b906f8b2afddc5569968a8a91fa5b0 (patch)
tree       75cc5e05a4392ea0292251898f992a15a16b172b /markdown_it/rules_core
parent     Initial commit. (diff)
Adding upstream version 2.1.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'markdown_it/rules_core')
-rw-r--r--   markdown_it/rules_core/__init__.py        17
-rw-r--r--   markdown_it/rules_core/block.py            16
-rw-r--r--   markdown_it/rules_core/inline.py           10
-rw-r--r--   markdown_it/rules_core/linkify.py         141
-rw-r--r--   markdown_it/rules_core/normalize.py        19
-rw-r--r--   markdown_it/rules_core/replacements.py    125
-rw-r--r--   markdown_it/rules_core/smartquotes.py     202
-rw-r--r--   markdown_it/rules_core/state_core.py       25
8 files changed, 555 insertions, 0 deletions
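For orientation before the per-file diffs: these eight files implement the core rule chain that markdown-it-py runs over the whole token stream, between block-level and inline-level parsing. A minimal sketch of exercising the chain, assuming markdown-it-py 2.1.0 is installed (plus the optional linkify-it-py package for linkify); the input string is illustrative only:

```python
from markdown_it import MarkdownIt

# Core rules run in registration order: normalize, block, inline,
# then the option-gated linkify, replacements and smartquotes.
md = MarkdownIt("commonmark", {"typographer": True, "linkify": True})
md.enable(["replacements", "smartquotes", "linkify"])

tokens = md.parse('"Smart" quotes, (c) marks, and http://commonmark.org')
print([t.type for t in tokens])  # paragraph_open, inline, paragraph_close
```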
diff --git a/markdown_it/rules_core/__init__.py b/markdown_it/rules_core/__init__.py
new file mode 100644
index 0000000..f80034c
--- /dev/null
+++ b/markdown_it/rules_core/__init__.py
@@ -0,0 +1,17 @@
+__all__ = (
+    "StateCore",
+    "normalize",
+    "block",
+    "inline",
+    "replace",
+    "smartquotes",
+    "linkify",
+)
+
+from .block import block
+from .inline import inline
+from .linkify import linkify
+from .normalize import normalize
+from .replacements import replace
+from .smartquotes import smartquotes
+from .state_core import StateCore
diff --git a/markdown_it/rules_core/block.py b/markdown_it/rules_core/block.py
new file mode 100644
index 0000000..fa1c52c
--- /dev/null
+++ b/markdown_it/rules_core/block.py
@@ -0,0 +1,16 @@
+from ..token import Token
+from .state_core import StateCore
+
+
+def block(state: StateCore) -> None:
+
+    if state.inlineMode:
+        token = Token("inline", "", 0)
+        token.content = state.src
+        token.map = [0, 1]
+        token.children = []
+        state.tokens.append(token)
+    else:
+        state.md.block.parse(
+            state.src, state.md, state.env, state.tokens, state.srcCharCode
+        )
diff --git a/markdown_it/rules_core/inline.py b/markdown_it/rules_core/inline.py
new file mode 100644
index 0000000..c3fd0b5
--- /dev/null
+++ b/markdown_it/rules_core/inline.py
@@ -0,0 +1,10 @@
+from .state_core import StateCore
+
+
+def inline(state: StateCore) -> None:
+    """Parse inlines"""
+    for token in state.tokens:
+        if token.type == "inline":
+            if token.children is None:
+                token.children = []
+            state.md.inline.parse(token.content, state.md, state.env, token.children)
diff --git a/markdown_it/rules_core/linkify.py b/markdown_it/rules_core/linkify.py
new file mode 100644
index 0000000..49bb4ef
--- /dev/null
+++ b/markdown_it/rules_core/linkify.py
@@ -0,0 +1,141 @@
+import re
+
+from ..common.utils import arrayReplaceAt
+from ..token import Token
+from .state_core import StateCore
+
+LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
+LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)
+
+HTTP_RE = re.compile(r"^http://")
+MAILTO_RE = re.compile(r"^mailto:")
+TEST_MAILTO_RE = re.compile(r"^mailto:", flags=re.IGNORECASE)
+
+
+def isLinkOpen(string: str) -> bool:
+    return bool(LINK_OPEN_RE.search(string))
+
+
+def isLinkClose(string: str) -> bool:
+    return bool(LINK_CLOSE_RE.search(string))
+
+
+def linkify(state: StateCore) -> None:
+    blockTokens = state.tokens
+
+    if not state.md.options.linkify:
+        return
+
+    if not state.md.linkify:
+        raise ModuleNotFoundError("Linkify enabled but not installed.")
+
+    for j in range(len(blockTokens)):
+        if blockTokens[j].type != "inline" or not state.md.linkify.pretest(
+            blockTokens[j].content
+        ):
+            continue
+
+        tokens = blockTokens[j].children
+
+        htmlLinkLevel = 0
+
+        # We scan from the end, to keep position when new tags added.
+        # Use reversed logic in links start/end match
+        assert tokens is not None
+        i = len(tokens)
+        while i >= 1:
+            i -= 1
+            assert isinstance(tokens, list)
+            currentToken = tokens[i]
+
+            # Skip content of markdown links
+            if currentToken.type == "link_close":
+                i -= 1
+                while (
+                    tokens[i].level != currentToken.level
+                    and tokens[i].type != "link_open"
+                ):
+                    i -= 1
+                continue
+
+            # Skip content of html tag links
+            if currentToken.type == "html_inline":
+                if isLinkOpen(currentToken.content) and htmlLinkLevel > 0:
+                    htmlLinkLevel -= 1
+                if isLinkClose(currentToken.content):
+                    htmlLinkLevel += 1
+            if htmlLinkLevel > 0:
+                continue
+
+            if currentToken.type == "text" and state.md.linkify.test(
+                currentToken.content
+            ):
+                text = currentToken.content
+                links = state.md.linkify.match(text)
+
+                # Now split string to nodes
+                nodes = []
+                level = currentToken.level
+                lastPos = 0
+
+                for ln in range(len(links)):
+                    url = links[ln].url
+                    fullUrl = state.md.normalizeLink(url)
+                    if not state.md.validateLink(fullUrl):
+                        continue
+
+                    urlText = links[ln].text
+
+                    # Linkifier might send raw hostnames like "example.com", where url
+                    # starts with domain name. So we prepend http:// in those cases,
+                    # and remove it afterwards.
+                    if not links[ln].schema:
+                        urlText = HTTP_RE.sub(
+                            "", state.md.normalizeLinkText("http://" + urlText)
+                        )
+                    elif links[ln].schema == "mailto:" and TEST_MAILTO_RE.search(
+                        urlText
+                    ):
+                        urlText = MAILTO_RE.sub(
+                            "", state.md.normalizeLinkText("mailto:" + urlText)
+                        )
+                    else:
+                        urlText = state.md.normalizeLinkText(urlText)
+
+                    pos = links[ln].index
+
+                    if pos > lastPos:
+                        token = Token("text", "", 0)
+                        token.content = text[lastPos:pos]
+                        token.level = level
+                        nodes.append(token)
+
+                    token = Token("link_open", "a", 1)
+                    token.attrs = {"href": fullUrl}
+                    token.level = level
+                    level += 1
+                    token.markup = "linkify"
+                    token.info = "auto"
+                    nodes.append(token)
+
+                    token = Token("text", "", 0)
+                    token.content = urlText
+                    token.level = level
+                    nodes.append(token)
+
+                    token = Token("link_close", "a", -1)
+                    level -= 1
+                    token.level = level
+                    token.markup = "linkify"
+                    token.info = "auto"
+                    nodes.append(token)
+
+                    lastPos = links[ln].last_index
+
+                if lastPos < len(text):
+                    token = Token("text", "", 0)
+                    token.content = text[lastPos:]
+                    token.level = level
+                    nodes.append(token)
+
+                blockTokens[j].children = tokens = arrayReplaceAt(tokens, i, nodes)
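A usage sketch for the linkify rule above, assuming the optional linkify-it-py dependency is installed (sample input chosen for illustration):

```python
from markdown_it import MarkdownIt

# Without linkify-it-py installed, the rule raises the
# ModuleNotFoundError above at parse time.
md = MarkdownIt("commonmark", {"linkify": True}).enable("linkify")

# Bare URLs in text tokens are split into link_open / text / link_close
# triples carrying markup="linkify" and info="auto".
print(md.render("See http://commonmark.org and example.com for details."))
```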
diff --git a/markdown_it/rules_core/normalize.py b/markdown_it/rules_core/normalize.py
new file mode 100644
index 0000000..bf16fd7
--- /dev/null
+++ b/markdown_it/rules_core/normalize.py
@@ -0,0 +1,19 @@
+"""Normalize input string."""
+import re
+
+from .state_core import StateCore
+
+# https://spec.commonmark.org/0.29/#line-ending
+NEWLINES_RE = re.compile(r"\r\n?|\n")
+NULL_RE = re.compile(r"\0")
+
+
+def normalize(state: StateCore) -> None:
+
+    # Normalize newlines
+    string = NEWLINES_RE.sub("\n", state.src)
+
+    # Replace NULL characters
+    string = NULL_RE.sub("\uFFFD", string)
+
+    state.src = string
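A small check of what the normalize rule guarantees to everything downstream: line endings collapse to "\n" and NUL becomes U+FFFD (sketch; contrived input):

```python
from markdown_it import MarkdownIt

md = MarkdownIt()
tokens = md.parse("line one\r\nline two\rstray \x00 null")

# tokens[1] is the paragraph's inline token; after normalize() its
# content joins lines with "\n" and carries U+FFFD instead of NUL.
assert "\r" not in tokens[1].content
assert "\uFFFD" in tokens[1].content
```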
diff --git a/markdown_it/rules_core/replacements.py b/markdown_it/rules_core/replacements.py
new file mode 100644
index 0000000..45377d3
--- /dev/null
+++ b/markdown_it/rules_core/replacements.py
@@ -0,0 +1,125 @@
+"""Simple typographic replacements
+
+* ``(c)``, ``(C)`` → ©
+* ``(tm)``, ``(TM)`` → ™
+* ``(r)``, ``(R)`` → ®
+* ``(p)``, ``(P)`` → §
+* ``+-`` → ±
+* ``...`` → …
+* ``?....`` → ?..
+* ``!....`` → !..
+* ``????????`` → ???
+* ``!!!!!`` → !!!
+* ``,,,`` → ,
+* ``--`` → – (en dash)
+* ``---`` → — (em dash)
+"""
+from __future__ import annotations
+
+import logging
+import re
+
+from ..token import Token
+from .state_core import StateCore
+
+LOGGER = logging.getLogger(__name__)
+
+# TODO:
+# - fractionals 1/2, 1/4, 3/4 -> ½, ¼, ¾
+# - multiplication 2 x 4 -> 2 × 4
+
+RARE_RE = re.compile(r"\+-|\.\.|\?\?\?\?|!!!!|,,|--")
+
+# Workaround for phantomjs - need regex without /g flag,
+# or root check will fail every second time
+# SCOPED_ABBR_TEST_RE = r"\((c|tm|r|p)\)"
+
+SCOPED_ABBR_RE = re.compile(r"\((c|tm|r|p)\)", flags=re.IGNORECASE)
+
+PLUS_MINUS_RE = re.compile(r"\+-")
+
+ELLIPSIS_RE = re.compile(r"\.{2,}")
+
+ELLIPSIS_QUESTION_EXCLAMATION_RE = re.compile(r"([?!])…")
+
+QUESTION_EXCLAMATION_RE = re.compile(r"([?!]){4,}")
+
+COMMA_RE = re.compile(r",{2,}")
+
+EM_DASH_RE = re.compile(r"(^|[^-])---(?=[^-]|$)", flags=re.MULTILINE)
+
+EN_DASH_RE = re.compile(r"(^|\s)--(?=\s|$)", flags=re.MULTILINE)
+
+EN_DASH_INDENT_RE = re.compile(r"(^|[^-\s])--(?=[^-\s]|$)", flags=re.MULTILINE)
+
+
+SCOPED_ABBR = {"c": "©", "r": "®", "p": "§", "tm": "™"}
+
+
+def replaceFn(match: re.Match[str]):
+    return SCOPED_ABBR[match.group(1).lower()]
+
+
+def replace_scoped(inlineTokens: list[Token]) -> None:
+    inside_autolink = 0
+
+    for token in inlineTokens:
+        if token.type == "text" and not inside_autolink:
+            token.content = SCOPED_ABBR_RE.sub(replaceFn, token.content)
+
+        if token.type == "link_open" and token.info == "auto":
+            inside_autolink -= 1
+
+        if token.type == "link_close" and token.info == "auto":
+            inside_autolink += 1
+
+
+def replace_rare(inlineTokens: list[Token]) -> None:
+    inside_autolink = 0
+
+    for token in inlineTokens:
+        if token.type == "text" and not inside_autolink:
+            if RARE_RE.search(token.content):
+                # +- -> ±
+                token.content = PLUS_MINUS_RE.sub("±", token.content)
+
+                # .., ..., ....... -> …
+                token.content = ELLIPSIS_RE.sub("…", token.content)
+
+                # but ?..... & !..... -> ?.. & !..
+                token.content = ELLIPSIS_QUESTION_EXCLAMATION_RE.sub(
+                    "\\1..", token.content
+                )
+                token.content = QUESTION_EXCLAMATION_RE.sub("\\1\\1\\1", token.content)
+
+                # ,, ,,, ,,,, -> ,
+                token.content = COMMA_RE.sub(",", token.content)
+
+                # em-dash
+                token.content = EM_DASH_RE.sub("\\1\u2014", token.content)
+
+                # en-dash
+                token.content = EN_DASH_RE.sub("\\1\u2013", token.content)
+                token.content = EN_DASH_INDENT_RE.sub("\\1\u2013", token.content)
+
+        if token.type == "link_open" and token.info == "auto":
+            inside_autolink -= 1
+
+        if token.type == "link_close" and token.info == "auto":
+            inside_autolink += 1
+
+
+def replace(state: StateCore) -> None:
+    if not state.md.options.typographer:
+        return
+
+    for token in state.tokens:
+        if token.type != "inline":
+            continue
+        assert token.children is not None
+
+        if SCOPED_ABBR_RE.search(token.content):
+            replace_scoped(token.children)
+
+        if RARE_RE.search(token.content):
+            replace_rare(token.children)
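The replacements rule only runs with the typographer option on and the rule enabled; a minimal sketch, with the expected output shown approximately:

```python
from markdown_it import MarkdownIt

md = MarkdownIt("commonmark", {"typographer": True}).enable("replacements")

# (c) -> ©, spaced -- -> en dash, trailing dots -> ellipsis
print(md.render("(c) 2024 -- see the docs..."))
# roughly: <p>© 2024 – see the docs…</p>
```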
diff --git a/markdown_it/rules_core/smartquotes.py b/markdown_it/rules_core/smartquotes.py
new file mode 100644
index 0000000..93f8be2
--- /dev/null
+++ b/markdown_it/rules_core/smartquotes.py
@@ -0,0 +1,202 @@
+"""Convert straight quotation marks to typographic ones
+"""
+from __future__ import annotations
+
+import re
+from typing import Any
+
+from ..common.utils import charCodeAt, isMdAsciiPunct, isPunctChar, isWhiteSpace
+from ..token import Token
+from .state_core import StateCore
+
+QUOTE_TEST_RE = re.compile(r"['\"]")
+QUOTE_RE = re.compile(r"['\"]")
+APOSTROPHE = "\u2019"  # ’
+
+
+def replaceAt(string: str, index: int, ch: str) -> str:
+    # When the index is negative, the behavior is different from the js version.
+    # But basically, the index will not be negative.
+    assert index >= 0
+    return string[:index] + ch + string[index + 1 :]
+
+
+def process_inlines(tokens: list[Token], state: StateCore) -> None:
+    stack: list[dict[str, Any]] = []
+
+    for i in range(len(tokens)):
+        token = tokens[i]
+
+        thisLevel = token.level
+
+        j = 0
+        for j in range(len(stack))[::-1]:
+            if stack[j]["level"] <= thisLevel:
+                break
+        else:
+            # When the loop is terminated without a "break".
+            # Subtract 1 to get the same index as the js version.
+            j -= 1
+
+        stack = stack[: j + 1]
+
+        if token.type != "text":
+            continue
+
+        text = token.content
+        pos = 0
+        maximum = len(text)
+
+        while pos < maximum:
+            goto_outer = False
+            lastIndex = pos
+            t = QUOTE_RE.search(text[lastIndex:])
+            if not t:
+                break
+
+            canOpen = canClose = True
+            pos = t.start(0) + lastIndex + 1
+            isSingle = t.group(0) == "'"
+
+            # Find previous character,
+            # default to space if it's the beginning of the line
+            lastChar = 0x20
+
+            if t.start(0) + lastIndex - 1 >= 0:
+                lastChar = charCodeAt(text, t.start(0) + lastIndex - 1)
+            else:
+                for j in range(i)[::-1]:
+                    # lastChar defaults to 0x20
+                    if tokens[j].type == "softbreak" or tokens[j].type == "hardbreak":
+                        break
+                    # should skip all tokens except 'text', 'html_inline' or 'code_inline'
+                    if not tokens[j].content:
+                        continue
+
+                    lastChar = charCodeAt(tokens[j].content, len(tokens[j].content) - 1)
+                    break
+
+            # Find next character,
+            # default to space if it's the end of the line
+            nextChar = 0x20
+
+            if pos < maximum:
+                nextChar = charCodeAt(text, pos)
+            else:
+                for j in range(i + 1, len(tokens)):
+                    # nextChar defaults to 0x20
+                    if tokens[j].type == "softbreak" or tokens[j].type == "hardbreak":
+                        break
+                    # should skip all tokens except 'text', 'html_inline' or 'code_inline'
+                    if not tokens[j].content:
+                        continue
+
+                    nextChar = charCodeAt(tokens[j].content, 0)
+                    break
+
+            isLastPunctChar = isMdAsciiPunct(lastChar) or isPunctChar(chr(lastChar))
+            isNextPunctChar = isMdAsciiPunct(nextChar) or isPunctChar(chr(nextChar))
+
+            isLastWhiteSpace = isWhiteSpace(lastChar)
+            isNextWhiteSpace = isWhiteSpace(nextChar)
+
+            if isNextWhiteSpace:
+                canOpen = False
+            elif isNextPunctChar:
+                if not (isLastWhiteSpace or isLastPunctChar):
+                    canOpen = False
+
+            if isLastWhiteSpace:
+                canClose = False
+            elif isLastPunctChar:
+                if not (isNextWhiteSpace or isNextPunctChar):
+                    canClose = False
+
+            if nextChar == 0x22 and t.group(0) == '"':  # 0x22: "
+                if lastChar >= 0x30 and lastChar <= 0x39:  # 0x30: 0, 0x39: 9
+                    # special case: 1"" - count first quote as an inch
+                    canClose = canOpen = False
+
+            if canOpen and canClose:
+                # Replace quotes in the middle of punctuation sequence, but not
+                # in the middle of the words, i.e.:
+                #
+                # 1. foo " bar " baz - not replaced
+                # 2. foo-"-bar-"-baz - replaced
+                # 3. foo"bar"baz     - not replaced
+                canOpen = isLastPunctChar
+                canClose = isNextPunctChar
+
+            if not canOpen and not canClose:
+                # middle of word
+                if isSingle:
+                    token.content = replaceAt(
+                        token.content, t.start(0) + lastIndex, APOSTROPHE
+                    )
+                continue
+
+            if canClose:
+                # this could be a closing quote, rewind the stack to get a match
+                for j in range(len(stack))[::-1]:
+                    item = stack[j]
+                    if stack[j]["level"] < thisLevel:
+                        break
+                    if item["single"] == isSingle and stack[j]["level"] == thisLevel:
+                        item = stack[j]
+
+                        if isSingle:
+                            openQuote = state.md.options.quotes[2]
+                            closeQuote = state.md.options.quotes[3]
+                        else:
+                            openQuote = state.md.options.quotes[0]
+                            closeQuote = state.md.options.quotes[1]
+
+                        # replace token.content *before* tokens[item.token].content,
+                        # because, if they are pointing at the same token, replaceAt
+                        # could mess up indices when quote length != 1
+                        token.content = replaceAt(
+                            token.content, t.start(0) + lastIndex, closeQuote
+                        )
+                        tokens[item["token"]].content = replaceAt(
+                            tokens[item["token"]].content, item["pos"], openQuote
+                        )
+
+                        pos += len(closeQuote) - 1
+                        if item["token"] == i:
+                            pos += len(openQuote) - 1
+
+                        text = token.content
+                        maximum = len(text)
+
+                        stack = stack[:j]
+                        goto_outer = True
+                        break
+                if goto_outer:
+                    goto_outer = False
+                    continue
+
+            if canOpen:
+                stack.append(
+                    {
+                        "token": i,
+                        "pos": t.start(0) + lastIndex,
+                        "single": isSingle,
+                        "level": thisLevel,
+                    }
+                )
+            elif canClose and isSingle:
+                token.content = replaceAt(
+                    token.content, t.start(0) + lastIndex, APOSTROPHE
+                )
+
+
+def smartquotes(state: StateCore) -> None:
+    if not state.md.options.typographer:
+        return
+
+    for token in state.tokens:
+        if token.type != "inline" or not QUOTE_RE.search(token.content):
+            continue
+        assert token.children is not None
+        process_inlines(token.children, state)
diff --git a/markdown_it/rules_core/state_core.py b/markdown_it/rules_core/state_core.py
new file mode 100644
index 0000000..15b7c60
--- /dev/null
+++ b/markdown_it/rules_core/state_core.py
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from collections.abc import MutableMapping
+from typing import TYPE_CHECKING
+
+from ..ruler import StateBase
+from ..token import Token
+
+if TYPE_CHECKING:
+    from markdown_it import MarkdownIt
+
+
+class StateCore(StateBase):
+    def __init__(
+        self,
+        src: str,
+        md: MarkdownIt,
+        env: MutableMapping,
+        tokens: list[Token] | None = None,
+    ):
+        self.src = src
+        self.md = md  # link to parser instance
+        self.env = env
+        self.tokens: list[Token] = tokens or []
+        self.inlineMode = False
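Finally, a sketch for smartquotes, which is gated the same way as replacements and reads its quote characters from options["quotes"] (double-quote pair first, then single-quote pair); output shown approximately:

```python
from markdown_it import MarkdownIt

md = MarkdownIt("commonmark", {"typographer": True}).enable("smartquotes")

# Opening/closing pairs resolve via the stack in process_inlines();
# lone apostrophes fall back to U+2019.
print(md.render("\"Isn't this 'neat'?\""))
# roughly: <p>“Isn’t this ‘neat’?”</p>

# The quotes option is a four-character sequence, e.g. German-style:
md_de = MarkdownIt(
    "commonmark", {"typographer": True, "quotes": "„“‚‘"}
).enable("smartquotes")
```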