diff options
Diffstat (limited to 'sphinx/util/smartypants.py')
-rw-r--r-- | sphinx/util/smartypants.py | 374 |
1 files changed, 374 insertions, 0 deletions
"""Deprecated backport of docutils.utils.smartquotes.

This is extracted (with minor adaptations for flake8 compliance) from
docutils’ docutils/utils/smartquotes.py as of revision 8097 (30 May 2017),
in order to backport for Sphinx usage with Docutils < 0.14 extra language
configurations and fixes. Replaces earlier smartypants version as used up
to Sphinx 1.5.6.

:copyright: © 2010 Günter Milde,
            original `SmartyPants`_: © 2003 John Gruber
            smartypants.py:          © 2004, 2007 Chad Miller
:license: Released under the terms of the `2-Clause BSD license`_, in short:

   Copying and distribution of this file, with or without modification,
   are permitted in any medium without royalty provided the copyright
   notices and this notice are preserved.
   This file is offered as-is, without any warranty.

.. _SmartyPants: https://daringfireball.net/projects/smartypants/
.. _2-Clause BSD license: https://spdx.org/licenses/BSD-2-Clause

See the LICENSE file and the original docutils code for details.

"""

import re
import warnings
from typing import Generator, Iterable, Tuple

from docutils.utils import smartquotes

from sphinx.deprecation import RemovedInSphinx60Warning

warnings.warn('sphinx.util.smartypants is deprecated.',
              RemovedInSphinx60Warning)

# Quote characters per BCP 47 language tag, in the order
# (primary open, primary close, secondary open, secondary close).
# A value is either a 4-character string or a 4-tuple of strings (used when
# a quote mark consists of more than one character, e.g. French).
# No-break spaces are written as escapes because they are invisible and are
# easily turned into ordinary spaces by editors and copy/paste.
langquotes = {'af':           '“”‘’',
              'af-x-altquot': '„”‚’',
              'bg':           '„“‚‘',  # Bulgarian, https://bg.wikipedia.org/wiki/Кавички
              'ca':           '«»“”',
              'ca-x-altquot': '“”‘’',
              'cs':           '„“‚‘',
              'cs-x-altquot': '»«›‹',
              'da':           '»«›‹',
              'da-x-altquot': '„“‚‘',
              # 'da-x-altquot2': '””’’',
              'de':           '„“‚‘',
              'de-x-altquot': '»«›‹',
              'de-ch':        '«»‹›',
              'el':           '«»“”',
              'en':           '“”‘’',
              'en-uk-x-altquot': '‘’“”',  # Attention: " → ‘ and ' → “ !
              'eo':           '“”‘’',
              'es':           '«»“”',
              'es-x-altquot': '“”‘’',
              'et':           '„“‚‘',  # no secondary quote listed in
              'et-x-altquot': '«»‹›',  # the sources above (wikipedia.org)
              'eu':           '«»‹›',
              'fi':           '””’’',
              'fi-x-altquot': '»»››',
              # full no-break space (U+00A0)
              'fr':           ('«\u00a0', '\u00a0»', '“', '”'),
              # narrow no-break space (U+202F)
              'fr-x-altquot': ('«\u202f', '\u202f»', '“', '”'),
              'fr-ch':        '«»‹›',
              # narrow no-break space (U+202F), http://typoguide.ch/
              'fr-ch-x-altquot': ('«\u202f', '\u202f»', '‹\u202f', '\u202f›'),
              'gl':           '«»“”',
              'he':           '”“»«',  # Hebrew is RTL, test position:
              'he-x-altquot': '„”‚’',  # low quotation marks are opening.
              # 'he-x-altquot': '“„‘‚',  # RTL: low quotation marks opening
              'hr':           '„”‘’',  # https://hrvatska-tipografija.com/polunavodnici/
              'hr-x-altquot': '»«›‹',
              'hsb':          '„“‚‘',
              'hsb-x-altquot': '»«›‹',
              'hu':           '„”«»',
              'is':           '„“‚‘',
              'it':           '«»“”',
              'it-ch':        '«»‹›',
              'it-x-altquot': '“”‘’',
              # 'it-x-altquot2': '“„‘‚',  # [7] in headlines
              'ja':           '「」『』',
              'lt':           '„“‚‘',
              'lv':           '„“‚‘',
              'mk':           '„“‚‘',  # Macedonian,
              # https://mk.wikipedia.org/wiki/Правопис_и_правоговор_на_македонскиот_јазик
              'nl':           '“”‘’',
              'nl-x-altquot': '„”‚’',
              # 'nl-x-altquot2': '””’’',
              'nb':           '«»’’',  # Norsk bokmål (canonical form 'no')
              'nn':           '«»’’',  # Nynorsk [10]
              'nn-x-altquot': '«»‘’',  # [8], [10]
              # 'nn-x-altquot2': '«»«»',  # [9], [10]
              # 'nn-x-altquot3': '„“‚‘',  # [10]
              'no':           '«»’’',  # Norsk bokmål [10]
              'no-x-altquot': '«»‘’',  # [8], [10]
              # 'no-x-altquot2': '«»«»',  # [9], [10]
              # 'no-x-altquot3': '„“‚‘',  # [10]
              'pl':           '„”«»',
              'pl-x-altquot': '«»‚’',
              # 'pl-x-altquot2': '„”‚’',
              # https://pl.wikipedia.org/wiki/Cudzys%C5%82%C3%B3w
              'pt':           '«»“”',
              'pt-br':        '“”‘’',
              'ro':           '„”«»',
              'ru':           '«»„“',
              'sh':           '„”‚’',  # Serbo-Croatian
              'sh-x-altquot': '»«›‹',
              'sk':           '„“‚‘',  # Slovak
              'sk-x-altquot': '»«›‹',
              'sl':           '„“‚‘',  # Slovenian
              'sl-x-altquot': '»«›‹',
              'sq':           '«»‹›',  # Albanian
              'sq-x-altquot': '“„‘‚',
              'sr':           '„”’’',
              'sr-x-altquot': '»«›‹',
              'sv':           '””’’',
              'sv-x-altquot': '»»››',
              'tr':           '“”‘’',
              'tr-x-altquot': '«»‹›',
              # 'tr-x-altquot2': '“„‘‚',  # [7] antiquated?
              'uk':           '«»„“',
              'uk-x-altquot': '„“‚‘',
              'zh-cn':        '“”‘’',
              'zh-tw':        '「」『』',
              }

# --- Regular expressions used by educateQuotes() --------------------------
#
# None of these depend on the language, so they are built once at import.

# ASCII punctuation characters.
_PUNCT_CLASS = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""

# Any character after which a quote is a *closing* quote, i.e. anything but
# whitespace, an opening bracket or a hyphen.
_CLOSE_CLASS = r"""[^\ \t\r\n\[\{\(\-]"""

# En and em dash, as decimal character entities and as literal characters.
# The "#" must be escaped: the string is interpolated into re.VERBOSE
# patterns, where a bare "#" would start a comment and truncate the pattern.
_DEC_DASHES = r"""&\#8211;|&\#8212;|\u2013|\u2014"""

# A quote at the very start of the text, followed by punctuation that is
# not at a word boundary.  (``\B`` is the regex non-word-boundary assertion;
# it must not be double-escaped inside a raw string.)
_LEADING_SINGLE_RE = re.compile(r"""^'(?=%s\B)""" % (_PUNCT_CLASS,))
_LEADING_DOUBLE_RE = re.compile(r"""^"(?=%s\B)""" % (_PUNCT_CLASS,))

# Directly adjacent double and single quote in front of a word.
_DOUBLE_SINGLE_RE = re.compile(r""""'(?=\w)""")
_SINGLE_DOUBLE_RE = re.compile(r"""'"(?=\w)""")

# Decade abbreviations (the '80s).
_DECADE_RE = re.compile(r"""'(?=\d{2}s)""")

# Apostrophe inside a word (``\w`` already includes digits).
_APOSTROPHE_RE = re.compile(r"(?<=\w)'(?=\w)")

# Template for "quote that opens a quotation"; the first placeholder takes
# the dash alternatives, the second one the quote character.
_OPENING_QUOTE_TEMPLATE = r"""
    (
        \s          |   # a whitespace char, or
        &nbsp;      |   # a non-breaking space entity, or
        --          |   # dashes, or
        &[mn]dash;  |   # named dash entities, or
        %s          |   # decimal dash entities or literal dashes, or
        &\#x201[34];    # hexadecimal dash entities
    )
    %s                  # the quote
    (?=\w)              # followed by a word character
    """
_OPENING_SINGLE_RE = re.compile(_OPENING_QUOTE_TEMPLATE % (_DEC_DASHES, "'"),
                                re.VERBOSE)
_OPENING_DOUBLE_RE = re.compile(_OPENING_QUOTE_TEMPLATE % (_DEC_DASHES, '"'),
                                re.VERBOSE)

_CLOSING_SINGLE_RE = re.compile(r"""
    (%s)            # character that indicates the quote should be closing
    '
    (?!\s  |        # not followed by whitespace,
       s\b |        # by a trailing "s",
       \d           # or by a digit ('80s)
    )
    """ % (_CLOSE_CLASS,), re.VERBOSE)

_CLOSING_SINGLE_TAIL_RE = re.compile(r"""
    (%s)            # character that indicates the quote should be closing
    '
    (\s | s\b)      # followed by whitespace or a trailing "s"
    """ % (_CLOSE_CLASS,), re.VERBOSE)

_CLOSING_DOUBLE_BEFORE_SPACE_RE = re.compile(r'"(?=\s)')
_CLOSING_DOUBLE_RE = re.compile(r'(%s)"' % (_CLOSE_CLASS,))


def educateQuotes(text: str, language: str = 'en') -> str:
    """Replace straight quotes in `text` by typographic ("curly") quotes.

    :param text: the text to convert.
    :param language: `BCP 47` language tag, passed to
        ``smartquotes.smartchars`` to select the quote characters.
    :returns: the `text`, with "educated" curly quote characters.

    Example input:  "Isn't this fun?"
    Example output: “Isn’t this fun?”
    """
    smart = smartquotes.smartchars(language)
    # Fall back to U+2019 if this ``smartchars`` has no ``apostrophe``.
    apostrophe = getattr(smart, 'apostrophe', '’')

    # Special case if the very first character is a quote
    # followed by punctuation at a non-word-break.
    # Close the quotes by brute force:
    text = _LEADING_SINGLE_RE.sub(smart.csquote, text)
    text = _LEADING_DOUBLE_RE.sub(smart.cpquote, text)

    # Special case for double sets of quotes, e.g.:
    #   <p>He said, "'Quoted' words in a larger quote."</p>
    text = _DOUBLE_SINGLE_RE.sub(smart.opquote + smart.osquote, text)
    text = _SINGLE_DOUBLE_RE.sub(smart.osquote + smart.opquote, text)

    # Special case for decade abbreviations (the '80s):
    if language.startswith('en'):  # TODO similar cases in other languages?
        text = _DECADE_RE.sub(apostrophe, text)

    # Get most opening single quotes:
    text = _OPENING_SINGLE_RE.sub(r'\1' + smart.osquote, text)

    # In many locales, single closing quotes are different from apostrophe:
    if smart.csquote != apostrophe:
        text = _APOSTROPHE_RE.sub(apostrophe, text)
    # TODO: keep track of quoting level to recognize apostrophe in, e.g.,
    # "Ich fass' es nicht."

    # Closing single quotes, first those not followed by whitespace, "s"
    # or a digit, then those followed by whitespace or a trailing "s":
    text = _CLOSING_SINGLE_RE.sub(r'\1' + smart.csquote, text)
    text = _CLOSING_SINGLE_TAIL_RE.sub(r'\1%s\2' % smart.csquote, text)

    # Any remaining single quotes should be opening ones:
    text = text.replace("'", smart.osquote)

    # Get most opening double quotes:
    text = _OPENING_DOUBLE_RE.sub(r'\1' + smart.opquote, text)

    # Double closing quotes:
    text = _CLOSING_DOUBLE_BEFORE_SPACE_RE.sub(smart.cpquote, text)
    text = _CLOSING_DOUBLE_RE.sub(r'\1' + smart.cpquote, text)

    # Any remaining quotes should be opening ones.
    text = text.replace('"', smart.opquote)

    return text


def educate_tokens(text_tokens: Iterable[Tuple[str, str]],
                   attr: str = smartquotes.default_smartypants_attr,
                   language: str = 'en'
                   ) -> Generator[str, None, None]:
    """Return iterator that "educates" the items of `text_tokens`.

    This is modified to intercept the ``attr='2'`` as it was used by the
    Docutils 0.13.1 SmartQuotes transform in a hard coded way. Docutils 0.14
    uses ``'qDe'`` and is configurable, and its choice is backported here
    for use by Sphinx with earlier Docutils releases. Similarly ``'1'`` is
    replaced by ``'qde'``.

    Use ``attr='qDbe'``, resp. ``'qdbe'`` to recover Docutils effect of ``'2'``,
    resp. ``'1'``.

    refs: https://sourceforge.net/p/docutils/mailman/message/35869025/

    :param text_tokens: iterable of ``(token type, text)`` pairs.  Tokens of
        type ``'tag'`` and ``'literal'`` (and empty texts) are passed through
        unchanged.
    :param attr: selects the conversions to apply, see the table below.
    :param language: `BCP 47` language tag selecting the quote characters.
    :returns: generator yielding one (possibly converted) text per token.
    """

    # Parse attributes:
    # 0 : do nothing
    # 1 : set all (but backticks)
    # 2 : set all (but backticks), using old school en- and em- dash shortcuts
    # 3 : set all, using inverted old school en and em- dash shortcuts
    #
    # q : quotes
    # b : backtick quotes (``double'' only)
    # B : backtick quotes (``double'' and `single')
    # d : dashes
    # D : old school dashes
    # i : inverted old school dashes
    # e : ellipses
    # w : convert &quot; entities to " for Dreamweaver users

    convert_quot = False  # translate &quot; entities into normal quotes?
    do_dashes = 0
    do_backticks = 0
    do_quotes = False
    do_ellipses = False
    do_stupefy = False

    if attr == "1":  # Do everything but backticks (same as 'qde').
        do_quotes = True
        do_dashes = 1
        do_ellipses = True
    elif attr == "2":
        # Do everything but backticks, use old school dash shorthand
        # (same as 'qDe').
        do_quotes = True
        do_dashes = 2
        do_ellipses = True
    elif attr == "3":
        # Do everything, use inverted old school dash shorthand.
        do_quotes = True
        do_backticks = 1
        do_dashes = 3
        do_ellipses = True
    elif attr == "-1":  # Special "stupefy" mode.
        do_stupefy = True
    else:
        # Individual flags; where several flags set the same option,
        # the one tested last wins.
        if "q" in attr:
            do_quotes = True
        if "b" in attr:
            do_backticks = 1
        if "B" in attr:
            do_backticks = 2
        if "d" in attr:
            do_dashes = 1
        if "D" in attr:
            do_dashes = 2
        if "i" in attr:
            do_dashes = 3
        if "e" in attr:
            do_ellipses = True
        if "w" in attr:
            convert_quot = True

    prev_token_last_char = " "
    # Last character of the previous text token. Used as
    # context to curl leading quote characters correctly.

    for (ttype, text) in text_tokens:

        # skip HTML and/or XML tags as well as empty text tokens
        # without updating the last character
        if ttype == 'tag' or not text:
            yield text
            continue

        # skip literal text (math, literal, raw, ...)
        if ttype == 'literal':
            prev_token_last_char = text[-1:]
            yield text
            continue

        last_char = text[-1:]  # Remember last char before processing.

        text = smartquotes.processEscapes(text)

        if convert_quot:
            text = text.replace('&quot;', '"')

        if do_dashes == 1:
            text = smartquotes.educateDashes(text)
        elif do_dashes == 2:
            text = smartquotes.educateDashesOldSchool(text)
        elif do_dashes == 3:
            text = smartquotes.educateDashesOldSchoolInverted(text)

        if do_ellipses:
            text = smartquotes.educateEllipses(text)

        # Note: backticks need to be processed before quotes.
        if do_backticks:
            text = smartquotes.educateBackticks(text, language)

        if do_backticks == 2:
            text = smartquotes.educateSingleBackticks(text, language)

        if do_quotes:
            # Prepend the previous token's last character as context and
            # strip it again afterwards.  Replace plain quotes in the context
            # to prevent conversion to 2-character sequence in French.
            context = prev_token_last_char.replace('"', ';').replace("'", ';')
            text = educateQuotes(context + text, language)[1:]

        if do_stupefy:
            text = smartquotes.stupefyEntities(text, language)

        # Remember last char as context for the next token
        prev_token_last_char = last_char

        text = smartquotes.processEscapes(text, restore=True)

        yield text