summaryrefslogtreecommitdiffstats
path: root/markdown_it/rules_core
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-29 04:24:24 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-29 04:24:24 +0000
commit12e8343068b906f8b2afddc5569968a8a91fa5b0 (patch)
tree75cc5e05a4392ea0292251898f992a15a16b172b /markdown_it/rules_core
parentInitial commit. (diff)
downloadmarkdown-it-py-upstream.tar.xz
markdown-it-py-upstream.zip
Adding upstream version 2.1.0.upstream/2.1.0upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'markdown_it/rules_core')
-rw-r--r--markdown_it/rules_core/__init__.py17
-rw-r--r--markdown_it/rules_core/block.py16
-rw-r--r--markdown_it/rules_core/inline.py10
-rw-r--r--markdown_it/rules_core/linkify.py141
-rw-r--r--markdown_it/rules_core/normalize.py19
-rw-r--r--markdown_it/rules_core/replacements.py125
-rw-r--r--markdown_it/rules_core/smartquotes.py202
-rw-r--r--markdown_it/rules_core/state_core.py25
8 files changed, 555 insertions, 0 deletions
diff --git a/markdown_it/rules_core/__init__.py b/markdown_it/rules_core/__init__.py
new file mode 100644
index 0000000..f80034c
--- /dev/null
+++ b/markdown_it/rules_core/__init__.py
@@ -0,0 +1,17 @@
+__all__ = (
+ "StateCore",
+ "normalize",
+ "block",
+ "inline",
+ "replace",
+ "smartquotes",
+ "linkify",
+)
+
+from .block import block
+from .inline import inline
+from .linkify import linkify
+from .normalize import normalize
+from .replacements import replace
+from .smartquotes import smartquotes
+from .state_core import StateCore
diff --git a/markdown_it/rules_core/block.py b/markdown_it/rules_core/block.py
new file mode 100644
index 0000000..fa1c52c
--- /dev/null
+++ b/markdown_it/rules_core/block.py
@@ -0,0 +1,16 @@
+from ..token import Token
+from .state_core import StateCore
+
+
def block(state: StateCore) -> None:
    """Tokenize the block-level structure of ``state.src``.

    In inline-only mode the whole source is wrapped in a single ``inline``
    token instead of being run through the block parser.
    """
    if not state.inlineMode:
        state.md.block.parse(
            state.src, state.md, state.env, state.tokens, state.srcCharCode
        )
        return

    # Inline mode: one synthetic "inline" token carrying the raw source.
    inline_token = Token("inline", "", 0)
    inline_token.content = state.src
    inline_token.map = [0, 1]
    inline_token.children = []
    state.tokens.append(inline_token)
diff --git a/markdown_it/rules_core/inline.py b/markdown_it/rules_core/inline.py
new file mode 100644
index 0000000..c3fd0b5
--- /dev/null
+++ b/markdown_it/rules_core/inline.py
@@ -0,0 +1,10 @@
+from .state_core import StateCore
+
+
def inline(state: StateCore) -> None:
    """Run the inline tokenizer over the content of every ``inline`` token."""
    for tok in state.tokens:
        if tok.type != "inline":
            continue
        # Lazily create the children list the inline parser appends into.
        if tok.children is None:
            tok.children = []
        state.md.inline.parse(tok.content, state.md, state.env, tok.children)
diff --git a/markdown_it/rules_core/linkify.py b/markdown_it/rules_core/linkify.py
new file mode 100644
index 0000000..49bb4ef
--- /dev/null
+++ b/markdown_it/rules_core/linkify.py
@@ -0,0 +1,141 @@
+import re
+
+from ..common.utils import arrayReplaceAt
+from ..token import Token
+from .state_core import StateCore
+
+LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
+LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)
+
+HTTP_RE = re.compile(r"^http://")
+MAILTO_RE = re.compile(r"^mailto:")
+TEST_MAILTO_RE = re.compile(r"^mailto:", flags=re.IGNORECASE)
+
+
def isLinkOpen(string: str) -> bool:
    """Return True if *string* begins with an opening ``<a ...>`` HTML tag."""
    return re.search(r"^<a[>\s]", string, flags=re.IGNORECASE) is not None
+
+
def isLinkClose(string: str) -> bool:
    """Return True if *string* begins with a closing ``</a>`` HTML tag."""
    return re.search(r"^</a\s*>", string, flags=re.IGNORECASE) is not None
+
+
def linkify(state: StateCore) -> None:
    """Convert bare URLs / e-mail addresses in text tokens into links.

    Uses ``state.md.linkify`` (a linkify-it style matcher exposing
    ``pretest``/``test``/``match``) to locate linkifiable fragments, then
    replaces each matching ``text`` token with a sequence of
    ``text`` / ``link_open`` / ``text`` / ``link_close`` tokens.
    No-op unless the ``linkify`` option is enabled; raises
    ``ModuleNotFoundError`` if enabled but no matcher is installed.
    """
    blockTokens = state.tokens

    if not state.md.options.linkify:
        return

    if not state.md.linkify:
        raise ModuleNotFoundError("Linkify enabled but not installed.")

    for j in range(len(blockTokens)):
        # `pretest` is a cheap precheck to skip tokens with no candidates.
        if blockTokens[j].type != "inline" or not state.md.linkify.pretest(
            blockTokens[j].content
        ):
            continue

        tokens = blockTokens[j].children

        # Depth counter for raw <a ...>...</a> HTML; content inside must
        # not be linkified. Counted in reverse because of the backward scan.
        htmlLinkLevel = 0

        # We scan from the end, to keep position when new tags added.
        # Use reversed logic in links start/end match
        assert tokens is not None
        i = len(tokens)
        while i >= 1:
            i -= 1
            assert isinstance(tokens, list)
            currentToken = tokens[i]

            # Skip content of markdown links
            if currentToken.type == "link_close":
                i -= 1
                # Rewind until the matching link_open at the same nesting level.
                while (
                    tokens[i].level != currentToken.level
                    and tokens[i].type != "link_open"
                ):
                    i -= 1
                continue

            # Skip content of html tag links
            if currentToken.type == "html_inline":
                if isLinkOpen(currentToken.content) and htmlLinkLevel > 0:
                    htmlLinkLevel -= 1
                if isLinkClose(currentToken.content):
                    htmlLinkLevel += 1
            if htmlLinkLevel > 0:
                continue

            if currentToken.type == "text" and state.md.linkify.test(
                currentToken.content
            ):
                text = currentToken.content
                links = state.md.linkify.match(text)

                # Now split string to nodes
                nodes = []
                level = currentToken.level
                lastPos = 0

                for ln in range(len(links)):
                    url = links[ln].url
                    fullUrl = state.md.normalizeLink(url)
                    if not state.md.validateLink(fullUrl):
                        continue

                    urlText = links[ln].text

                    # Linkifier might send raw hostnames like "example.com", where url
                    # starts with domain name. So we prepend http:// in those cases,
                    # and remove it afterwards.
                    if not links[ln].schema:
                        urlText = HTTP_RE.sub(
                            "", state.md.normalizeLinkText("http://" + urlText)
                        )
                    elif links[ln].schema == "mailto:" and TEST_MAILTO_RE.search(
                        urlText
                    ):
                        urlText = MAILTO_RE.sub(
                            "", state.md.normalizeLinkText("mailto:" + urlText)
                        )
                    else:
                        urlText = state.md.normalizeLinkText(urlText)

                    pos = links[ln].index

                    # Plain text between the previous match and this one.
                    if pos > lastPos:
                        token = Token("text", "", 0)
                        token.content = text[lastPos:pos]
                        token.level = level
                        nodes.append(token)

                    token = Token("link_open", "a", 1)
                    token.attrs = {"href": fullUrl}
                    token.level = level
                    level += 1
                    token.markup = "linkify"
                    token.info = "auto"
                    nodes.append(token)

                    token = Token("text", "", 0)
                    token.content = urlText
                    token.level = level
                    nodes.append(token)

                    token = Token("link_close", "a", -1)
                    level -= 1
                    token.level = level
                    token.markup = "linkify"
                    token.info = "auto"
                    nodes.append(token)

                    lastPos = links[ln].last_index

                # Trailing text after the final match.
                if lastPos < len(text):
                    token = Token("text", "", 0)
                    token.content = text[lastPos:]
                    token.level = level
                    nodes.append(token)

                # Splice the replacement nodes in; rebind `tokens` since
                # arrayReplaceAt returns a new list.
                blockTokens[j].children = tokens = arrayReplaceAt(tokens, i, nodes)
diff --git a/markdown_it/rules_core/normalize.py b/markdown_it/rules_core/normalize.py
new file mode 100644
index 0000000..bf16fd7
--- /dev/null
+++ b/markdown_it/rules_core/normalize.py
@@ -0,0 +1,19 @@
+"""Normalize input string."""
+import re
+
+from .state_core import StateCore
+
+# https://spec.commonmark.org/0.29/#line-ending
+NEWLINES_RE = re.compile(r"\r\n?|\n")
+NULL_RE = re.compile(r"\0")
+
+
def normalize(state: StateCore) -> None:
    """Canonicalize the raw source before any other core rule runs."""
    # CRLF / lone CR -> LF (https://spec.commonmark.org/0.29/#line-ending)
    text = re.sub(r"\r\n?|\n", "\n", state.src)

    # NUL bytes become U+FFFD REPLACEMENT CHARACTER
    state.src = text.replace("\0", "\uFFFD")
diff --git a/markdown_it/rules_core/replacements.py b/markdown_it/rules_core/replacements.py
new file mode 100644
index 0000000..45377d3
--- /dev/null
+++ b/markdown_it/rules_core/replacements.py
@@ -0,0 +1,125 @@
+"""Simple typographic replacements
+
+* ``(c)``, ``(C)`` → ©
+* ``(tm)``, ``(TM)`` → ™
+* ``(r)``, ``(R)`` → ®
+* ``(p)``, ``(P)`` → §
+* ``+-`` → ±
+* ``...`` → …
+* ``?....`` → ?..
+* ``!....`` → !..
+* ``????????`` → ???
+* ``!!!!!`` → !!!
+* ``,,,`` → ,
+* ``--`` → –
+* ``---`` → —
+"""
+from __future__ import annotations
+
+import logging
+import re
+
+from ..token import Token
+from .state_core import StateCore
+
+LOGGER = logging.getLogger(__name__)
+
+# TODO:
+# - fractionals 1/2, 1/4, 3/4 -> ½, ¼, ¾
+# - miltiplication 2 x 4 -> 2 × 4
+
+RARE_RE = re.compile(r"\+-|\.\.|\?\?\?\?|!!!!|,,|--")
+
+# Workaround for phantomjs - need regex without /g flag,
+# or root check will fail every second time
+# SCOPED_ABBR_TEST_RE = r"\((c|tm|r|p)\)"
+
+SCOPED_ABBR_RE = re.compile(r"\((c|tm|r|p)\)", flags=re.IGNORECASE)
+
+PLUS_MINUS_RE = re.compile(r"\+-")
+
+ELLIPSIS_RE = re.compile(r"\.{2,}")
+
+ELLIPSIS_QUESTION_EXCLAMATION_RE = re.compile(r"([?!])…")
+
+QUESTION_EXCLAMATION_RE = re.compile(r"([?!]){4,}")
+
+COMMA_RE = re.compile(r",{2,}")
+
+EM_DASH_RE = re.compile(r"(^|[^-])---(?=[^-]|$)", flags=re.MULTILINE)
+
+EN_DASH_RE = re.compile(r"(^|\s)--(?=\s|$)", flags=re.MULTILINE)
+
+EN_DASH_INDENT_RE = re.compile(r"(^|[^-\s])--(?=[^-\s]|$)", flags=re.MULTILINE)
+
+
+SCOPED_ABBR = {"c": "©", "r": "®", "p": "§", "tm": "™"}
+
+
def replaceFn(match: re.Match[str]) -> str:
    """Map a matched ``(c)``/``(tm)``/``(r)``/``(p)`` abbreviation to its symbol."""
    return SCOPED_ABBR[match.group(1).lower()]
+
+
def replace_scoped(inlineTokens: list[Token]) -> None:
    """Replace ``(c)``/``(tm)``/``(r)``/``(p)`` in text tokens outside autolinks."""
    inside_autolink = 0

    for token in inlineTokens:
        if token.type == "text" and not inside_autolink:
            token.content = SCOPED_ABBR_RE.sub(replaceFn, token.content)

        # NOTE: iteration is forward, so the counter goes *negative* while
        # inside an autolink (link_open: -1, link_close: +1). Any non-zero
        # value is truthy, which disables the replacement above.
        if token.type == "link_open" and token.info == "auto":
            inside_autolink -= 1

        if token.type == "link_close" and token.info == "auto":
            inside_autolink += 1
+
+
def replace_rare(inlineTokens: list[Token]) -> None:
    """Apply the "rare" typographic substitutions (±, …, !!/??, commas, dashes)
    to text tokens that are not inside an autolink.

    The substitution order matters: the ellipsis pass runs before the
    ``?…``/``!…`` fix-up, and the spaced en-dash pass runs before the
    unspaced (indent) one.
    """
    inside_autolink = 0

    for token in inlineTokens:
        if token.type == "text" and not inside_autolink:
            # RARE_RE is a cheap pre-test; the full pipeline below only runs
            # when at least one candidate sequence is present.
            if RARE_RE.search(token.content):
                # +- -> ±
                token.content = PLUS_MINUS_RE.sub("±", token.content)

                # .., ..., ....... -> …
                token.content = ELLIPSIS_RE.sub("…", token.content)

                # but ?..... & !..... -> ?.. & !..
                token.content = ELLIPSIS_QUESTION_EXCLAMATION_RE.sub(
                    "\\1..", token.content
                )
                token.content = QUESTION_EXCLAMATION_RE.sub("\\1\\1\\1", token.content)

                # ,, ,,, ,,,, -> ,
                token.content = COMMA_RE.sub(",", token.content)

                # em-dash
                token.content = EM_DASH_RE.sub("\\1\u2014", token.content)

                # en-dash
                token.content = EN_DASH_RE.sub("\\1\u2013", token.content)
                token.content = EN_DASH_INDENT_RE.sub("\\1\u2013", token.content)

        # Forward iteration: counter goes negative inside autolinks
        # (link_open: -1, link_close: +1); non-zero disables replacement.
        if token.type == "link_open" and token.info == "auto":
            inside_autolink -= 1

        if token.type == "link_close" and token.info == "auto":
            inside_autolink += 1
+
+
def replace(state: StateCore) -> None:
    """Run typographic replacements over every inline token.

    Does nothing unless the ``typographer`` option is enabled.
    """
    if not state.md.options.typographer:
        return

    for token in (t for t in state.tokens if t.type == "inline"):
        assert token.children is not None

        content = token.content
        if SCOPED_ABBR_RE.search(content):
            replace_scoped(token.children)
        if RARE_RE.search(content):
            replace_rare(token.children)
diff --git a/markdown_it/rules_core/smartquotes.py b/markdown_it/rules_core/smartquotes.py
new file mode 100644
index 0000000..93f8be2
--- /dev/null
+++ b/markdown_it/rules_core/smartquotes.py
@@ -0,0 +1,202 @@
+"""Convert straight quotation marks to typographic ones
+"""
+from __future__ import annotations
+
+import re
+from typing import Any
+
+from ..common.utils import charCodeAt, isMdAsciiPunct, isPunctChar, isWhiteSpace
+from ..token import Token
+from .state_core import StateCore
+
+QUOTE_TEST_RE = re.compile(r"['\"]")
+QUOTE_RE = re.compile(r"['\"]")
+APOSTROPHE = "\u2019" # ’
+
+
def replaceAt(string: str, index: int, ch: str) -> str:
    """Return *string* with the single character at *index* replaced by *ch*.

    Unlike the JS original, negative indices are not supported here (they
    would slice differently in Python), so they are asserted away; in
    practice the caller never passes one.
    """
    assert index >= 0
    head, tail = string[:index], string[index + 1 :]
    return head + ch + tail
+
+
def process_inlines(tokens: list[Token], state: StateCore) -> None:
    """Replace straight quotes with typographic ones across a run of inline tokens.

    ``state.md.options.quotes`` supplies the replacement strings in the
    order [double-open, double-close, single-open, single-close].  A stack
    of pending open-quote candidates (dicts with ``token``/``pos``/
    ``single``/``level`` keys) is kept so that an opening quote in one
    token can be matched by a closing quote in a later token at the same
    nesting level.  Mutates token contents in place.
    """
    stack: list[dict[str, Any]] = []

    for i in range(len(tokens)):
        token = tokens[i]

        thisLevel = token.level

        # Pop stack entries deeper than the current nesting level.
        # j is pre-set so an empty stack leaves j == 0 -> else branch -> -1.
        j = 0
        for j in range(len(stack))[::-1]:
            if stack[j]["level"] <= thisLevel:
                break
        else:
            # When the loop is terminated without a "break".
            # Subtract 1 to get the same index as the js version.
            j -= 1

        stack = stack[: j + 1]

        if token.type != "text":
            continue

        text = token.content
        pos = 0
        maximum = len(text)

        # Scan for quote characters left to right within this token.
        while pos < maximum:
            # goto_outer emulates the labelled `continue OUTER` of the JS port.
            goto_outer = False
            lastIndex = pos
            t = QUOTE_RE.search(text[lastIndex:])
            if not t:
                break

            canOpen = canClose = True
            pos = t.start(0) + lastIndex + 1
            isSingle = t.group(0) == "'"

            # Find previous character,
            # default to space if it's the beginning of the line
            lastChar = 0x20

            if t.start(0) + lastIndex - 1 >= 0:
                lastChar = charCodeAt(text, t.start(0) + lastIndex - 1)
            else:
                # Quote is at the start of this token: look back through
                # earlier tokens for the preceding character.
                for j in range(i)[::-1]:
                    # lastChar defaults to 0x20
                    if tokens[j].type == "softbreak" or tokens[j].type == "hardbreak":
                        break
                    # should skip all tokens except 'text', 'html_inline' or 'code_inline'
                    if not tokens[j].content:
                        continue

                    lastChar = charCodeAt(tokens[j].content, len(tokens[j].content) - 1)
                    break

            # Find next character,
            # default to space if it's the end of the line
            nextChar = 0x20

            if pos < maximum:
                nextChar = charCodeAt(text, pos)
            else:
                # Quote is at the end of this token: look forward through
                # later tokens for the following character.
                for j in range(i + 1, len(tokens)):
                    # nextChar defaults to 0x20
                    if tokens[j].type == "softbreak" or tokens[j].type == "hardbreak":
                        break
                    # should skip all tokens except 'text', 'html_inline' or 'code_inline'
                    if not tokens[j].content:
                        continue

                    nextChar = charCodeAt(tokens[j].content, 0)
                    break

            # Classify the surrounding characters to decide whether this
            # quote may open and/or close a quotation.
            isLastPunctChar = isMdAsciiPunct(lastChar) or isPunctChar(chr(lastChar))
            isNextPunctChar = isMdAsciiPunct(nextChar) or isPunctChar(chr(nextChar))

            isLastWhiteSpace = isWhiteSpace(lastChar)
            isNextWhiteSpace = isWhiteSpace(nextChar)

            if isNextWhiteSpace:
                canOpen = False
            elif isNextPunctChar:
                if not (isLastWhiteSpace or isLastPunctChar):
                    canOpen = False

            if isLastWhiteSpace:
                canClose = False
            elif isLastPunctChar:
                if not (isNextWhiteSpace or isNextPunctChar):
                    canClose = False

            if nextChar == 0x22 and t.group(0) == '"':  # 0x22: "
                if lastChar >= 0x30 and lastChar <= 0x39:  # 0x30: 0, 0x39: 9
                    # special case: 1"" - count first quote as an inch
                    canClose = canOpen = False

            if canOpen and canClose:
                # Replace quotes in the middle of punctuation sequence, but not
                # in the middle of the words, i.e.:
                #
                # 1. foo " bar " baz - not replaced
                # 2. foo-"-bar-"-baz - replaced
                # 3. foo"bar"baz - not replaced
                canOpen = isLastPunctChar
                canClose = isNextPunctChar

            if not canOpen and not canClose:
                # middle of word
                if isSingle:
                    # A lone ' inside a word is an apostrophe (don't, it's).
                    token.content = replaceAt(
                        token.content, t.start(0) + lastIndex, APOSTROPHE
                    )
                continue

            if canClose:
                # this could be a closing quote, rewind the stack to get a match
                for j in range(len(stack))[::-1]:
                    item = stack[j]
                    if stack[j]["level"] < thisLevel:
                        break
                    if item["single"] == isSingle and stack[j]["level"] == thisLevel:
                        item = stack[j]

                        # quotes option layout: [2]/[3] single pair, [0]/[1] double pair.
                        if isSingle:
                            openQuote = state.md.options.quotes[2]
                            closeQuote = state.md.options.quotes[3]
                        else:
                            openQuote = state.md.options.quotes[0]
                            closeQuote = state.md.options.quotes[1]

                        # replace token.content *before* tokens[item.token].content,
                        # because, if they are pointing at the same token, replaceAt
                        # could mess up indices when quote length != 1
                        token.content = replaceAt(
                            token.content, t.start(0) + lastIndex, closeQuote
                        )
                        tokens[item["token"]].content = replaceAt(
                            tokens[item["token"]].content, item["pos"], openQuote
                        )

                        # Advance past the (possibly multi-char) replacements.
                        pos += len(closeQuote) - 1
                        if item["token"] == i:
                            pos += len(openQuote) - 1

                        text = token.content
                        maximum = len(text)

                        stack = stack[:j]
                        goto_outer = True
                        break
                if goto_outer:
                    goto_outer = False
                    continue

            if canOpen:
                stack.append(
                    {
                        "token": i,
                        "pos": t.start(0) + lastIndex,
                        "single": isSingle,
                        "level": thisLevel,
                    }
                )
            elif canClose and isSingle:
                # Unmatched closing single quote -> apostrophe.
                token.content = replaceAt(
                    token.content, t.start(0) + lastIndex, APOSTROPHE
                )
+
+
def smartquotes(state: StateCore) -> None:
    """Run smart-quote substitution on inline tokens that contain a quote.

    Skipped entirely unless the ``typographer`` option is enabled.
    """
    if not state.md.options.typographer:
        return

    for token in state.tokens:
        if token.type != "inline":
            continue
        if not QUOTE_RE.search(token.content):
            continue
        assert token.children is not None
        process_inlines(token.children, state)
diff --git a/markdown_it/rules_core/state_core.py b/markdown_it/rules_core/state_core.py
new file mode 100644
index 0000000..15b7c60
--- /dev/null
+++ b/markdown_it/rules_core/state_core.py
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from collections.abc import MutableMapping
+from typing import TYPE_CHECKING
+
+from ..ruler import StateBase
+from ..token import Token
+
+if TYPE_CHECKING:
+ from markdown_it import MarkdownIt
+
+
class StateCore(StateBase):
    """Mutable state threaded through the chain of core rules.

    Every core rule receives the same instance and reads/writes its
    attributes (``src``, ``tokens``, ``inlineMode``).
    """

    def __init__(
        self,
        src: str,
        md: MarkdownIt,
        env: MutableMapping,
        tokens: list[Token] | None = None,
    ):
        self.src = src  # source text being parsed (rules may rewrite it)
        self.md = md  # link to parser instance
        self.env = env  # mutable environment shared between rules
        # NOTE: any falsy `tokens` argument (None or an empty list) is
        # replaced by a fresh list, so an empty list passed in is NOT aliased.
        self.tokens: list[Token] = tokens or []
        self.inlineMode = False  # when True, block-level parsing is skipped