From 12e8343068b906f8b2afddc5569968a8a91fa5b0 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Mon, 29 Apr 2024 06:24:24 +0200
Subject: Adding upstream version 2.1.0.

Signed-off-by: Daniel Baumann
---
 markdown_it/rules_inline/__init__.py      |  29 +++++
 markdown_it/rules_inline/autolink.py      |  78 +++++++++++++
 markdown_it/rules_inline/backticks.py     |  75 +++++++++++++
 markdown_it/rules_inline/balance_pairs.py | 114 +++++++++++++++++++
 markdown_it/rules_inline/emphasis.py      | 102 +++++++++++++++++
 markdown_it/rules_inline/entity.py        |  54 +++++++++
 markdown_it/rules_inline/escape.py        |  49 +++++++++
 markdown_it/rules_inline/html_inline.py   |  43 ++++++++
 markdown_it/rules_inline/image.py         | 151 ++++++++++++++++++++++++++
 markdown_it/rules_inline/link.py          | 150 +++++++++++++++++++++++++
 markdown_it/rules_inline/newline.py       |  43 ++++++++
 markdown_it/rules_inline/state_inline.py  | 175 ++++++++++++++++++++++++++++++
 markdown_it/rules_inline/strikethrough.py | 133 +++++++++++++++++++++++
 markdown_it/rules_inline/text.py          |  57 ++++++++++
 markdown_it/rules_inline/text_collapse.py |  43 ++++++++
 15 files changed, 1296 insertions(+)
 create mode 100644 markdown_it/rules_inline/__init__.py
 create mode 100644 markdown_it/rules_inline/autolink.py
 create mode 100644 markdown_it/rules_inline/backticks.py
 create mode 100644 markdown_it/rules_inline/balance_pairs.py
 create mode 100644 markdown_it/rules_inline/emphasis.py
 create mode 100644 markdown_it/rules_inline/entity.py
 create mode 100644 markdown_it/rules_inline/escape.py
 create mode 100644 markdown_it/rules_inline/html_inline.py
 create mode 100644 markdown_it/rules_inline/image.py
 create mode 100644 markdown_it/rules_inline/link.py
 create mode 100644 markdown_it/rules_inline/newline.py
 create mode 100644 markdown_it/rules_inline/state_inline.py
 create mode 100644 markdown_it/rules_inline/strikethrough.py
 create mode 100644 markdown_it/rules_inline/text.py
 create mode 100644 markdown_it/rules_inline/text_collapse.py

(limited to 'markdown_it/rules_inline')

diff --git a/markdown_it/rules_inline/__init__.py b/markdown_it/rules_inline/__init__.py
new file mode 100644
index 0000000..f27907c
--- /dev/null
+++ b/markdown_it/rules_inline/__init__.py
@@ -0,0 +1,29 @@
+__all__ = (
+    "StateInline",
+    "text",
+    "text_collapse",
+    "link_pairs",
+    "escape",
+    "newline",
+    "backtick",
+    "emphasis",
+    "image",
+    "link",
+    "autolink",
+    "entity",
+    "html_inline",
+    "strikethrough",
+)
+from .
import emphasis, strikethrough +from .autolink import autolink +from .backticks import backtick +from .balance_pairs import link_pairs +from .entity import entity +from .escape import escape +from .html_inline import html_inline +from .image import image +from .link import link +from .newline import newline +from .state_inline import StateInline +from .text import text +from .text_collapse import text_collapse diff --git a/markdown_it/rules_inline/autolink.py b/markdown_it/rules_inline/autolink.py new file mode 100644 index 0000000..a4ee61c --- /dev/null +++ b/markdown_it/rules_inline/autolink.py @@ -0,0 +1,78 @@ +# Process autolinks '' +import re + +from .state_inline import StateInline + +EMAIL_RE = re.compile( + r"^([a-zA-Z0-9.!#$%&\'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)$" # noqa: E501 +) +AUTOLINK_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9+.\-]{1,31}):([^<>\x00-\x20]*)$") + + +def autolink(state: StateInline, silent: bool) -> bool: + + pos = state.pos + + if state.srcCharCode[pos] != 0x3C: # /* < */ + return False + + start = state.pos + maximum = state.posMax + + while True: + pos += 1 + if pos >= maximum: + return False + + ch = state.srcCharCode[pos] + + if ch == 0x3C: # /* < */ + return False + if ch == 0x3E: # /* > */ + break + + url = state.src[start + 1 : pos] + + if AUTOLINK_RE.search(url) is not None: + fullUrl = state.md.normalizeLink(url) + if not state.md.validateLink(fullUrl): + return False + + if not silent: + token = state.push("link_open", "a", 1) + token.attrs = {"href": fullUrl} + token.markup = "autolink" + token.info = "auto" + + token = state.push("text", "", 0) + token.content = state.md.normalizeLinkText(url) + + token = state.push("link_close", "a", -1) + token.markup = "autolink" + token.info = "auto" + + state.pos += len(url) + 2 + return True + + if EMAIL_RE.search(url) is not None: + fullUrl = state.md.normalizeLink("mailto:" + url) + if not state.md.validateLink(fullUrl): + return False + + if not silent: + token = state.push("link_open", "a", 1) + token.attrs = {"href": fullUrl} + token.markup = "autolink" + token.info = "auto" + + token = state.push("text", "", 0) + token.content = state.md.normalizeLinkText(url) + + token = state.push("link_close", "a", -1) + token.markup = "autolink" + token.info = "auto" + + state.pos += len(url) + 2 + return True + + return False diff --git a/markdown_it/rules_inline/backticks.py b/markdown_it/rules_inline/backticks.py new file mode 100644 index 0000000..7bff12f --- /dev/null +++ b/markdown_it/rules_inline/backticks.py @@ -0,0 +1,75 @@ +# Parse backticks +import re + +from .state_inline import StateInline + +regex = re.compile("^ (.+) $") + + +def backtick(state: StateInline, silent: bool) -> bool: + + pos = state.pos + ch = state.srcCharCode[pos] + + # /* ` */ + if ch != 0x60: + return False + + start = pos + pos += 1 + maximum = state.posMax + + # scan marker length + while pos < maximum and (state.srcCharCode[pos] == 0x60): # /* ` */ + pos += 1 + + marker = state.src[start:pos] + openerLength = len(marker) + + if state.backticksScanned and state.backticks.get(openerLength, 0) <= start: + if not silent: + state.pending += marker + state.pos += openerLength + return True + + matchStart = matchEnd = pos + + # Nothing found in the cache, scan until the end of the line (or until marker is found) + while True: + try: + matchStart = state.src.index("`", matchEnd) + except ValueError: + break + matchEnd = matchStart + 1 + + # scan marker length + 
while matchEnd < maximum and (state.srcCharCode[matchEnd] == 0x60): # /* ` */ + matchEnd += 1 + + closerLength = matchEnd - matchStart + + if closerLength == openerLength: + # Found matching closer length. + if not silent: + token = state.push("code_inline", "code", 0) + token.markup = marker + token.content = state.src[pos:matchStart].replace("\n", " ") + if ( + token.content.startswith(" ") + and token.content.endswith(" ") + and len(token.content.strip()) > 0 + ): + token.content = token.content[1:-1] + state.pos = matchEnd + return True + + # Some different length found, put it in cache as upper limit of where closer can be found + state.backticks[closerLength] = matchStart + + # Scanned through the end, didn't find anything + state.backticksScanned = True + + if not silent: + state.pending += marker + state.pos += openerLength + return True diff --git a/markdown_it/rules_inline/balance_pairs.py b/markdown_it/rules_inline/balance_pairs.py new file mode 100644 index 0000000..db622f0 --- /dev/null +++ b/markdown_it/rules_inline/balance_pairs.py @@ -0,0 +1,114 @@ +# For each opening emphasis-like marker find a matching closing one +# +from .state_inline import StateInline + + +def processDelimiters(state: StateInline, delimiters, *args): + + openersBottom = {} + maximum = len(delimiters) + + closerIdx = 0 + while closerIdx < maximum: + closer = delimiters[closerIdx] + + # Length is only used for emphasis-specific "rule of 3", + # if it's not defined (in strikethrough or 3rd party plugins), + # we can default it to 0 to disable those checks. + # + closer.length = closer.length or 0 + + if not closer.close: + closerIdx += 1 + continue + + # Previously calculated lower bounds (previous fails) + # for each marker, each delimiter length modulo 3, + # and for whether this closer can be an opener; + # https://github.com/commonmark/cmark/commit/34250e12ccebdc6372b8b49c44fab57c72443460 + if closer.marker not in openersBottom: + openersBottom[closer.marker] = [-1, -1, -1, -1, -1, -1] + + minOpenerIdx = openersBottom[closer.marker][ + (3 if closer.open else 0) + (closer.length % 3) + ] + + openerIdx = closerIdx - closer.jump - 1 + + # avoid crash if `closer.jump` is pointing outside of the array, + # e.g. for strikethrough + if openerIdx < -1: + openerIdx = -1 + + newMinOpenerIdx = openerIdx + + while openerIdx > minOpenerIdx: + opener = delimiters[openerIdx] + + if opener.marker != closer.marker: + openerIdx -= opener.jump + 1 + continue + + if opener.open and opener.end < 0: + + isOddMatch = False + + # from spec: + # + # If one of the delimiters can both open and close emphasis, then the + # sum of the lengths of the delimiter runs containing the opening and + # closing delimiters must not be a multiple of 3 unless both lengths + # are multiples of 3. + # + if opener.close or closer.open: + if (opener.length + closer.length) % 3 == 0: + if opener.length % 3 != 0 or closer.length % 3 != 0: + isOddMatch = True + + if not isOddMatch: + # If previous delimiter cannot be an opener, we can safely skip + # the entire sequence in future checks. This is required to make + # sure algorithm has linear complexity (see *_*_*_*_*_... case). 
+ # + if openerIdx > 0 and not delimiters[openerIdx - 1].open: + lastJump = delimiters[openerIdx - 1].jump + 1 + else: + lastJump = 0 + + closer.jump = closerIdx - openerIdx + lastJump + closer.open = False + opener.end = closerIdx + opener.jump = lastJump + opener.close = False + newMinOpenerIdx = -1 + break + + openerIdx -= opener.jump + 1 + + if newMinOpenerIdx != -1: + # If match for this delimiter run failed, we want to set lower bound for + # future lookups. This is required to make sure algorithm has linear + # complexity. + # + # See details here: + # https:#github.com/commonmark/cmark/issues/178#issuecomment-270417442 + # + openersBottom[closer.marker][ + (3 if closer.open else 0) + ((closer.length or 0) % 3) + ] = newMinOpenerIdx + + closerIdx += 1 + + +def link_pairs(state: StateInline) -> None: + tokens_meta = state.tokens_meta + maximum = len(state.tokens_meta) + + processDelimiters(state, state.delimiters) + + curr = 0 + while curr < maximum: + curr_meta = tokens_meta[curr] + if curr_meta and "delimiters" in curr_meta: + processDelimiters(state, curr_meta["delimiters"]) + curr += 1 diff --git a/markdown_it/rules_inline/emphasis.py b/markdown_it/rules_inline/emphasis.py new file mode 100644 index 0000000..9001b09 --- /dev/null +++ b/markdown_it/rules_inline/emphasis.py @@ -0,0 +1,102 @@ +# Process *this* and _that_ +# + +from .state_inline import Delimiter, StateInline + + +def tokenize(state: StateInline, silent: bool): + """Insert each marker as a separate text token, and add it to delimiter list""" + start = state.pos + marker = state.srcCharCode[start] + + if silent: + return False + + # /* _ */ /* * */ + if marker != 0x5F and marker != 0x2A: + return False + + scanned = state.scanDelims(state.pos, marker == 0x2A) + + for i in range(scanned.length): + token = state.push("text", "", 0) + token.content = chr(marker) + state.delimiters.append( + Delimiter( + marker=marker, + length=scanned.length, + jump=i, + token=len(state.tokens) - 1, + end=-1, + open=scanned.can_open, + close=scanned.can_close, + ) + ) + + state.pos += scanned.length + + return True + + +def _postProcess(state, delimiters): + + i = len(delimiters) - 1 + while i >= 0: + startDelim = delimiters[i] + + # /* _ */ /* * */ + if startDelim.marker != 0x5F and startDelim.marker != 0x2A: + i -= 1 + continue + + # Process only opening markers + if startDelim.end == -1: + i -= 1 + continue + + endDelim = delimiters[startDelim.end] + + # If the previous delimiter has the same marker and is adjacent to this one, + # merge those into one strong delimiter. 
+ # + # `whatever` -> `whatever` + # + isStrong = ( + i > 0 + and delimiters[i - 1].end == startDelim.end + 1 + and delimiters[i - 1].token == startDelim.token - 1 + and delimiters[startDelim.end + 1].token == endDelim.token + 1 + and delimiters[i - 1].marker == startDelim.marker + ) + + ch = chr(startDelim.marker) + + token = state.tokens[startDelim.token] + token.type = "strong_open" if isStrong else "em_open" + token.tag = "strong" if isStrong else "em" + token.nesting = 1 + token.markup = ch + ch if isStrong else ch + token.content = "" + + token = state.tokens[endDelim.token] + token.type = "strong_close" if isStrong else "em_close" + token.tag = "strong" if isStrong else "em" + token.nesting = -1 + token.markup = ch + ch if isStrong else ch + token.content = "" + + if isStrong: + state.tokens[delimiters[i - 1].token].content = "" + state.tokens[delimiters[startDelim.end + 1].token].content = "" + i -= 1 + + i -= 1 + + +def postProcess(state: StateInline): + """Walk through delimiter list and replace text tokens with tags.""" + _postProcess(state, state.delimiters) + + for token in state.tokens_meta: + if token and "delimiters" in token: + _postProcess(state, token["delimiters"]) diff --git a/markdown_it/rules_inline/entity.py b/markdown_it/rules_inline/entity.py new file mode 100644 index 0000000..883a966 --- /dev/null +++ b/markdown_it/rules_inline/entity.py @@ -0,0 +1,54 @@ +# Process html entity - {, ¯, ", ... +import re + +from ..common.entities import entities +from ..common.utils import fromCodePoint, isValidEntityCode +from .state_inline import StateInline + +DIGITAL_RE = re.compile(r"^&#((?:x[a-f0-9]{1,6}|[0-9]{1,7}));", re.IGNORECASE) +NAMED_RE = re.compile(r"^&([a-z][a-z0-9]{1,31});", re.IGNORECASE) + + +def entity(state: StateInline, silent: bool): + + pos = state.pos + maximum = state.posMax + + if state.srcCharCode[pos] != 0x26: # /* & */ + return False + + if (pos + 1) < maximum: + ch = state.srcCharCode[pos + 1] + + if ch == 0x23: # /* # */ + match = DIGITAL_RE.search(state.src[pos:]) + if match: + if not silent: + match1 = match.group(1) + code = ( + int(match1[1:], 16) + if match1[0].lower() == "x" + else int(match1, 10) + ) + state.pending += ( + fromCodePoint(code) + if isValidEntityCode(code) + else fromCodePoint(0xFFFD) + ) + + state.pos += len(match.group(0)) + return True + + else: + match = NAMED_RE.search(state.src[pos:]) + if match: + if match.group(1) in entities: + if not silent: + state.pending += entities[match.group(1)] + state.pos += len(match.group(0)) + return True + + if not silent: + state.pending += "&" + state.pos += 1 + return True diff --git a/markdown_it/rules_inline/escape.py b/markdown_it/rules_inline/escape.py new file mode 100644 index 0000000..36bd040 --- /dev/null +++ b/markdown_it/rules_inline/escape.py @@ -0,0 +1,49 @@ +""" +Process escaped chars and hardbreaks +""" +from ..common.utils import isSpace +from .state_inline import StateInline + +ESCAPED = [0 for _ in range(256)] +for ch in "\\!\"#$%&'()*+,./:;<=>?@[]^_`{|}~-": + ESCAPED[ord(ch)] = 1 + + +def escape(state: StateInline, silent: bool): + pos = state.pos + maximum = state.posMax + + # /* \ */ + if state.srcCharCode[pos] != 0x5C: + return False + + pos += 1 + + if pos < maximum: + ch = state.srcCharCode[pos] + + if ch < 256 and ESCAPED[ch] != 0: + if not silent: + state.pending += state.src[pos] + state.pos += 2 + return True + + if ch == 0x0A: + if not silent: + state.push("hardbreak", "br", 0) + + pos += 1 + # skip leading whitespaces from next line + while pos < maximum: + 
ch = state.srcCharCode[pos] + if not isSpace(ch): + break + pos += 1 + + state.pos = pos + return True + + if not silent: + state.pending += "\\" + state.pos += 1 + return True diff --git a/markdown_it/rules_inline/html_inline.py b/markdown_it/rules_inline/html_inline.py new file mode 100644 index 0000000..295cc5c --- /dev/null +++ b/markdown_it/rules_inline/html_inline.py @@ -0,0 +1,43 @@ +# Process html tags +from ..common.html_re import HTML_TAG_RE +from .state_inline import StateInline + + +def isLetter(ch: int): + lc = ch | 0x20 # to lower case + # /* a */ and /* z */ + return (lc >= 0x61) and (lc <= 0x7A) + + +def html_inline(state: StateInline, silent: bool): + + pos = state.pos + + if not state.md.options.get("html", None): + return False + + # Check start + maximum = state.posMax + if state.srcCharCode[pos] != 0x3C or pos + 2 >= maximum: # /* < */ + return False + + # Quick fail on second char + ch = state.srcCharCode[pos + 1] + if ( + ch != 0x21 + and ch != 0x3F # /* ! */ + and ch != 0x2F # /* ? */ + and not isLetter(ch) # /* / */ + ): + return False + + match = HTML_TAG_RE.search(state.src[pos:]) + if not match: + return False + + if not silent: + token = state.push("html_inline", "", 0) + token.content = state.src[pos : pos + len(match.group(0))] + + state.pos += len(match.group(0)) + return True diff --git a/markdown_it/rules_inline/image.py b/markdown_it/rules_inline/image.py new file mode 100644 index 0000000..d2a08d4 --- /dev/null +++ b/markdown_it/rules_inline/image.py @@ -0,0 +1,151 @@ +# Process ![image]( "title") +from __future__ import annotations + +from ..common.utils import isSpace, normalizeReference +from ..token import Token +from .state_inline import StateInline + + +def image(state: StateInline, silent: bool): + + label = None + href = "" + oldPos = state.pos + max = state.posMax + + # /* ! 
*/ + if state.srcCharCode[state.pos] != 0x21: + return False + # /* [ */ + if state.pos + 1 < state.posMax and state.srcCharCode[state.pos + 1] != 0x5B: + return False + + labelStart = state.pos + 2 + labelEnd = state.md.helpers.parseLinkLabel(state, state.pos + 1, False) + + # parser failed to find ']', so it's not a valid link + if labelEnd < 0: + return False + + pos = labelEnd + 1 + # /* ( */ + if pos < max and state.srcCharCode[pos] == 0x28: + # + # Inline link + # + + # [link]( "title" ) + # ^^ skipping these spaces + pos += 1 + while pos < max: + code = state.srcCharCode[pos] + if not isSpace(code) and code != 0x0A: + break + pos += 1 + + if pos >= max: + return False + + # [link]( "title" ) + # ^^^^^^ parsing link destination + start = pos + res = state.md.helpers.parseLinkDestination(state.src, pos, state.posMax) + if res.ok: + href = state.md.normalizeLink(res.str) + if state.md.validateLink(href): + pos = res.pos + else: + href = "" + + # [link]( "title" ) + # ^^ skipping these spaces + start = pos + while pos < max: + code = state.srcCharCode[pos] + if not isSpace(code) and code != 0x0A: + break + pos += 1 + + # [link]( "title" ) + # ^^^^^^^ parsing link title + res = state.md.helpers.parseLinkTitle(state.src, pos, state.posMax) + if pos < max and start != pos and res.ok: + title = res.str + pos = res.pos + + # [link]( "title" ) + # ^^ skipping these spaces + while pos < max: + code = state.srcCharCode[pos] + if not isSpace(code) and code != 0x0A: + break + pos += 1 + else: + title = "" + + # /* ) */ + if pos >= max or state.srcCharCode[pos] != 0x29: + state.pos = oldPos + return False + + pos += 1 + + else: + # + # Link reference + # + if "references" not in state.env: + return False + + # /* [ */ + if pos < max and state.srcCharCode[pos] == 0x5B: + start = pos + 1 + pos = state.md.helpers.parseLinkLabel(state, pos) + if pos >= 0: + label = state.src[start:pos] + pos += 1 + else: + pos = labelEnd + 1 + else: + pos = labelEnd + 1 + + # covers label == '' and label == undefined + # (collapsed reference link and shortcut reference link respectively) + if not label: + label = state.src[labelStart:labelEnd] + + label = normalizeReference(label) + + ref = state.env["references"].get(label, None) + if not ref: + state.pos = oldPos + return False + + href = ref["href"] + title = ref["title"] + + # + # We found the end of the link, and know for a fact it's a valid link + # so all that's left to do is to call tokenizer. 
+ # + if not silent: + content = state.src[labelStart:labelEnd] + + tokens: list[Token] = [] + state.md.inline.parse(content, state.md, state.env, tokens) + + token = state.push("image", "img", 0) + token.attrs = {"src": href, "alt": ""} + token.children = tokens or None + token.content = content + + if title: + token.attrSet("title", title) + + # note, this is not part of markdown-it JS, but is useful for renderers + if label and state.md.options.get("store_labels", False): + token.meta["label"] = label + + state.pos = pos + state.posMax = max + return True diff --git a/markdown_it/rules_inline/link.py b/markdown_it/rules_inline/link.py new file mode 100644 index 0000000..2394d6c --- /dev/null +++ b/markdown_it/rules_inline/link.py @@ -0,0 +1,150 @@ +# Process [link]( "stuff") + +from ..common.utils import isSpace, normalizeReference +from .state_inline import StateInline + + +def link(state: StateInline, silent: bool): + + href = "" + title = "" + label = None + oldPos = state.pos + maximum = state.posMax + start = state.pos + parseReference = True + + if state.srcCharCode[state.pos] != 0x5B: # /* [ */ + return False + + labelStart = state.pos + 1 + labelEnd = state.md.helpers.parseLinkLabel(state, state.pos, True) + + # parser failed to find ']', so it's not a valid link + if labelEnd < 0: + return False + + pos = labelEnd + 1 + + if pos < maximum and state.srcCharCode[pos] == 0x28: # /* ( */ + # + # Inline link + # + + # might have found a valid shortcut link, disable reference parsing + parseReference = False + + # [link]( "title" ) + # ^^ skipping these spaces + pos += 1 + while pos < maximum: + code = state.srcCharCode[pos] + if not isSpace(code) and code != 0x0A: + break + pos += 1 + + if pos >= maximum: + return False + + # [link]( "title" ) + # ^^^^^^ parsing link destination + start = pos + res = state.md.helpers.parseLinkDestination(state.src, pos, state.posMax) + if res.ok: + href = state.md.normalizeLink(res.str) + if state.md.validateLink(href): + pos = res.pos + else: + href = "" + + # [link]( "title" ) + # ^^ skipping these spaces + start = pos + while pos < maximum: + code = state.srcCharCode[pos] + if not isSpace(code) and code != 0x0A: + break + pos += 1 + + # [link]( "title" ) + # ^^^^^^^ parsing link title + res = state.md.helpers.parseLinkTitle(state.src, pos, state.posMax) + if pos < maximum and start != pos and res.ok: + title = res.str + pos = res.pos + + # [link]( "title" ) + # ^^ skipping these spaces + while pos < maximum: + code = state.srcCharCode[pos] + if not isSpace(code) and code != 0x0A: + break + pos += 1 + + if pos >= maximum or state.srcCharCode[pos] != 0x29: # /* ) */ + # parsing a valid shortcut link failed, fallback to reference + parseReference = True + + pos += 1 + + if parseReference: + # + # Link reference + # + if "references" not in state.env: + return False + + if pos < maximum and state.srcCharCode[pos] == 0x5B: # /* [ */ + start = pos + 1 + pos = state.md.helpers.parseLinkLabel(state, pos) + if pos >= 0: + label = state.src[start:pos] + pos += 1 + else: + pos = labelEnd + 1 + + else: + pos = labelEnd + 1 + + # covers label == '' and label == undefined + # (collapsed reference link and shortcut reference link respectively) + if not label: + label = state.src[labelStart:labelEnd] + + label = normalizeReference(label) + + ref = ( + state.env["references"][label] if label in state.env["references"] else None + ) + if not ref: + state.pos = oldPos + return False + + href = ref["href"] + title = ref["title"] + + # + # We found the end of the 
link, and know for a fact it's a valid link + # so all that's left to do is to call tokenizer. + # + if not silent: + state.pos = labelStart + state.posMax = labelEnd + + token = state.push("link_open", "a", 1) + token.attrs = {"href": href} + + if title: + token.attrSet("title", title) + + # note, this is not part of markdown-it JS, but is useful for renderers + if label and state.md.options.get("store_labels", False): + token.meta["label"] = label + + state.md.inline.tokenize(state) + + token = state.push("link_close", "a", -1) + + state.pos = pos + state.posMax = maximum + return True diff --git a/markdown_it/rules_inline/newline.py b/markdown_it/rules_inline/newline.py new file mode 100644 index 0000000..3034e40 --- /dev/null +++ b/markdown_it/rules_inline/newline.py @@ -0,0 +1,43 @@ +# Proceess '\n' +import re + +from ..common.utils import charCodeAt, isSpace +from .state_inline import StateInline + +endSpace = re.compile(r" +$") + + +def newline(state: StateInline, silent: bool): + pos = state.pos + + # /* \n */ + if state.srcCharCode[pos] != 0x0A: + return False + + pmax = len(state.pending) - 1 + maximum = state.posMax + + # ' \n' -> hardbreak + # Lookup in pending chars is bad practice! Don't copy to other rules! + # Pending string is stored in concat mode, indexed lookups will cause + # conversion to flat mode. + if not silent: + if pmax >= 0 and charCodeAt(state.pending, pmax) == 0x20: + if pmax >= 1 and charCodeAt(state.pending, pmax - 1) == 0x20: + state.pending = endSpace.sub("", state.pending) + state.push("hardbreak", "br", 0) + else: + state.pending = state.pending[:-1] + state.push("softbreak", "br", 0) + + else: + state.push("softbreak", "br", 0) + + pos += 1 + + # skip heading spaces for next line + while pos < maximum and isSpace(state.srcCharCode[pos]): + pos += 1 + + state.pos = pos + return True diff --git a/markdown_it/rules_inline/state_inline.py b/markdown_it/rules_inline/state_inline.py new file mode 100644 index 0000000..283532c --- /dev/null +++ b/markdown_it/rules_inline/state_inline.py @@ -0,0 +1,175 @@ +from __future__ import annotations + +from collections import namedtuple +from collections.abc import MutableMapping +from dataclasses import dataclass +from typing import TYPE_CHECKING + +from .._compat import DATACLASS_KWARGS +from ..common.utils import isMdAsciiPunct, isPunctChar, isWhiteSpace +from ..ruler import StateBase +from ..token import Token + +if TYPE_CHECKING: + from markdown_it import MarkdownIt + + +@dataclass(**DATACLASS_KWARGS) +class Delimiter: + # Char code of the starting marker (number). + marker: int + + # Total length of these series of delimiters. + length: int + + # An amount of characters before this one that's equivalent to + # current one. In plain English: if this delimiter does not open + # an emphasis, neither do previous `jump` characters. + # + # Used to skip sequences like "*****" in one step, for 1st asterisk + # value will be 0, for 2nd it's 1 and so on. + jump: int + + # A position of the token this delimiter corresponds to. + token: int + + # If this delimiter is matched as a valid opener, `end` will be + # equal to its position, otherwise it's `-1`. + end: int + + # Boolean flags that determine if this delimiter could open or close + # an emphasis. 
+ open: bool + close: bool + + level: bool | None = None + + +Scanned = namedtuple("Scanned", ["can_open", "can_close", "length"]) + + +class StateInline(StateBase): + def __init__( + self, src: str, md: MarkdownIt, env: MutableMapping, outTokens: list[Token] + ): + self.src = src + self.env = env + self.md = md + self.tokens = outTokens + self.tokens_meta: list[dict | None] = [None] * len(outTokens) + + self.pos = 0 + self.posMax = len(self.src) + self.level = 0 + self.pending = "" + self.pendingLevel = 0 + + # Stores { start: end } pairs. Useful for backtrack + # optimization of pairs parse (emphasis, strikes). + self.cache: dict[int, int] = {} + + # List of emphasis-like delimiters for current tag + self.delimiters: list[Delimiter] = [] + + # Stack of delimiter lists for upper level tags + self._prev_delimiters: list[list[Delimiter]] = [] + + # backticklength => last seen position + self.backticks: dict[int, int] = {} + self.backticksScanned = False + + def __repr__(self): + return ( + f"{self.__class__.__name__}" + f"(pos=[{self.pos} of {self.posMax}], token={len(self.tokens)})" + ) + + def pushPending(self): + token = Token("text", "", 0) + token.content = self.pending + token.level = self.pendingLevel + self.tokens.append(token) + self.pending = "" + return token + + def push(self, ttype, tag, nesting): + """Push new token to "stream". + If pending text exists - flush it as text token + """ + if self.pending: + self.pushPending() + + token = Token(ttype, tag, nesting) + token_meta = None + + if nesting < 0: + # closing tag + self.level -= 1 + self.delimiters = self._prev_delimiters.pop() + + token.level = self.level + + if nesting > 0: + # opening tag + self.level += 1 + self._prev_delimiters.append(self.delimiters) + self.delimiters = [] + token_meta = {"delimiters": self.delimiters} + + self.pendingLevel = self.level + self.tokens.append(token) + self.tokens_meta.append(token_meta) + return token + + def scanDelims(self, start, canSplitWord): + """ + Scan a sequence of emphasis-like markers, and determine whether + it can start an emphasis sequence or end an emphasis sequence. 
+ + - start - position to scan from (it should point at a valid marker); + - canSplitWord - determine if these markers can be found inside a word + + """ + pos = start + left_flanking = True + right_flanking = True + maximum = self.posMax + marker = self.srcCharCode[start] + + # treat beginning of the line as a whitespace + lastChar = self.srcCharCode[start - 1] if start > 0 else 0x20 + + while pos < maximum and self.srcCharCode[pos] == marker: + pos += 1 + + count = pos - start + + # treat end of the line as a whitespace + nextChar = self.srcCharCode[pos] if pos < maximum else 0x20 + + isLastPunctChar = isMdAsciiPunct(lastChar) or isPunctChar(chr(lastChar)) + isNextPunctChar = isMdAsciiPunct(nextChar) or isPunctChar(chr(nextChar)) + + isLastWhiteSpace = isWhiteSpace(lastChar) + isNextWhiteSpace = isWhiteSpace(nextChar) + + if isNextWhiteSpace: + left_flanking = False + elif isNextPunctChar: + if not (isLastWhiteSpace or isLastPunctChar): + left_flanking = False + + if isLastWhiteSpace: + right_flanking = False + elif isLastPunctChar: + if not (isNextWhiteSpace or isNextPunctChar): + right_flanking = False + + if not canSplitWord: + can_open = left_flanking and ((not right_flanking) or isLastPunctChar) + can_close = right_flanking and ((not left_flanking) or isNextPunctChar) + else: + can_open = left_flanking + can_close = right_flanking + + return Scanned(can_open, can_close, count) diff --git a/markdown_it/rules_inline/strikethrough.py b/markdown_it/rules_inline/strikethrough.py new file mode 100644 index 0000000..107ea26 --- /dev/null +++ b/markdown_it/rules_inline/strikethrough.py @@ -0,0 +1,133 @@ +# ~~strike through~~ +from __future__ import annotations + +from .state_inline import Delimiter, StateInline + + +def tokenize(state: StateInline, silent: bool): + """Insert each marker as a separate text token, and add it to delimiter list""" + start = state.pos + marker = state.srcCharCode[start] + + if silent: + return False + + if marker != 0x7E: # /* ~ */ + return False + + scanned = state.scanDelims(state.pos, True) + length = scanned.length + ch = chr(marker) + + if length < 2: + return False + + if length % 2: + token = state.push("text", "", 0) + token.content = ch + length -= 1 + + i = 0 + while i < length: + token = state.push("text", "", 0) + token.content = ch + ch + state.delimiters.append( + Delimiter( + **{ + "marker": marker, + "length": 0, # disable "rule of 3" length checks meant for emphasis + "jump": i // 2, # for `~~` 1 marker = 2 characters + "token": len(state.tokens) - 1, + "end": -1, + "open": scanned.can_open, + "close": scanned.can_close, + } + ) + ) + + i += 2 + + state.pos += scanned.length + + return True + + +def _postProcess(state: StateInline, delimiters: list[Delimiter]): + + loneMarkers = [] + maximum = len(delimiters) + + i = 0 + while i < maximum: + startDelim = delimiters[i] + + if startDelim.marker != 0x7E: # /* ~ */ + i += 1 + continue + + if startDelim.end == -1: + i += 1 + continue + + endDelim = delimiters[startDelim.end] + + token = state.tokens[startDelim.token] + token.type = "s_open" + token.tag = "s" + token.nesting = 1 + token.markup = "~~" + token.content = "" + + token = state.tokens[endDelim.token] + token.type = "s_close" + token.tag = "s" + token.nesting = -1 + token.markup = "~~" + token.content = "" + + if ( + state.tokens[endDelim.token - 1].type == "text" + and state.tokens[endDelim.token - 1].content == "~" + ): + + loneMarkers.append(endDelim.token - 1) + + i += 1 + + # If a marker sequence has an odd number of characters, it's 
split + # like this: `~~~~~` -> `~` + `~~` + `~~`, leaving one marker at the + # start of the sequence. + # + # So, we have to move all those markers after subsequent s_close tags. + # + while loneMarkers: + i = loneMarkers.pop() + j = i + 1 + + while (j < len(state.tokens)) and (state.tokens[j].type == "s_close"): + j += 1 + + j -= 1 + + if i != j: + token = state.tokens[j] + state.tokens[j] = state.tokens[i] + state.tokens[i] = token + + +def postProcess(state: StateInline): + """Walk through delimiter list and replace text tokens with tags.""" + tokens_meta = state.tokens_meta + maximum = len(state.tokens_meta) + _postProcess(state, state.delimiters) + + curr = 0 + while curr < maximum: + try: + curr_meta = tokens_meta[curr] + except IndexError: + pass + else: + if curr_meta and "delimiters" in curr_meta: + _postProcess(state, curr_meta["delimiters"]) + curr += 1 diff --git a/markdown_it/rules_inline/text.py b/markdown_it/rules_inline/text.py new file mode 100644 index 0000000..ec6ee0f --- /dev/null +++ b/markdown_it/rules_inline/text.py @@ -0,0 +1,57 @@ +# Skip text characters for text token, place those to pending buffer +# and increment current pos + +from .state_inline import StateInline + +# Rule to skip pure text +# '{}$%@~+=:' reserved for extensions + +# !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~ + +# !!!! Don't confuse with "Markdown ASCII Punctuation" chars +# http://spec.commonmark.org/0.15/#ascii-punctuation-character + + +def isTerminatorChar(ch): + return ch in { + 0x0A, # /* \n */: + 0x21, # /* ! */: + 0x23, # /* # */: + 0x24, # /* $ */: + 0x25, # /* % */: + 0x26, # /* & */: + 0x2A, # /* * */: + 0x2B, # /* + */: + 0x2D, # /* - */: + 0x3A, # /* : */: + 0x3C, # /* < */: + 0x3D, # /* = */: + 0x3E, # /* > */: + 0x40, # /* @ */: + 0x5B, # /* [ */: + 0x5C, # /* \ */: + 0x5D, # /* ] */: + 0x5E, # /* ^ */: + 0x5F, # /* _ */: + 0x60, # /* ` */: + 0x7B, # /* { */: + 0x7D, # /* } */: + 0x7E, # /* ~ */: + } + + +def text(state: StateInline, silent: bool, **args): + pos = state.pos + posMax = state.posMax + while (pos < posMax) and not isTerminatorChar(state.srcCharCode[pos]): + pos += 1 + + if pos == state.pos: + return False + + if not silent: + state.pending += state.src[state.pos : pos] + + state.pos = pos + + return True diff --git a/markdown_it/rules_inline/text_collapse.py b/markdown_it/rules_inline/text_collapse.py new file mode 100644 index 0000000..6d0c0ab --- /dev/null +++ b/markdown_it/rules_inline/text_collapse.py @@ -0,0 +1,43 @@ +from .state_inline import StateInline + + +def text_collapse(state: StateInline, *args): + """ + Clean up tokens after emphasis and strikethrough postprocessing: + merge adjacent text nodes into one and re-calculate all token levels + + This is necessary because initially emphasis delimiter markers (``*, _, ~``) + are treated as their own separate text tokens. Then emphasis rule either + leaves them as text (needed to merge with adjacent text) or turns them + into opening/closing tags (which messes up levels inside). 
+ """ + level = 0 + maximum = len(state.tokens) + + curr = last = 0 + while curr < maximum: + # re-calculate levels after emphasis/strikethrough turns some text nodes + # into opening/closing tags + if state.tokens[curr].nesting < 0: + level -= 1 # closing tag + state.tokens[curr].level = level + if state.tokens[curr].nesting > 0: + level += 1 # opening tag + + if ( + state.tokens[curr].type == "text" + and curr + 1 < maximum + and state.tokens[curr + 1].type == "text" + ): + # collapse two adjacent text nodes + state.tokens[curr + 1].content = ( + state.tokens[curr].content + state.tokens[curr + 1].content + ) + else: + if curr != last: + state.tokens[last] = state.tokens[curr] + last += 1 + curr += 1 + + if curr != last: + del state.tokens[last:] -- cgit v1.2.3