sphinx/util/texescape.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

"""TeX escaping helper."""

from __future__ import annotations

import re

tex_replacements = [
    # map TeX special chars
    ('$', r'\$'),
    ('%', r'\%'),
    ('&', r'\&'),
    ('#', r'\#'),
    ('_', r'\_'),
    ('{', r'\{'),
    ('}', r'\}'),
    ('\\', r'\textbackslash{}'),
    ('~', r'\textasciitilde{}'),
    ('^', r'\textasciicircum{}'),
    # map chars to avoid mis-interpretation in LaTeX
    ('[', r'{[}'),
    (']', r'{]}'),
    # map special Unicode characters to TeX commands
    ('✓', r'\(\checkmark\)'),
    ('✔', r'\(\pmb{\checkmark}\)'),
    ('✕', r'\(\times\)'),
    ('✖', r'\(\pmb{\times}\)'),
    # used to separate -- in options
    ('', r'{}'),
    # map some special Unicode characters to similar ASCII ones
    # (even for Unicode LaTeX as may not be supported by OpenType font)
    ('⎽', r'\_'),
    ('ℯ', r'e'),
    ('ⅈ', r'i'),
    # Greek alphabet not escaped: pdflatex handles it via textalpha and inputenc
    # OHM SIGN U+2126 is handled by LaTeX textcomp package
]

# A map to avoid TeX ligatures or character replacements in PDF output
# xelatex/lualatex/uplatex are handled differently (#5790, #6888)
ascii_tex_replacements = [
    # Note: the " renders curly in OT1 encoding but straight in T1, T2A, LY1...
    #       escaping it to \textquotedbl would break documents using OT1
    #       Sphinx does \shorthandoff{"} to avoid problems with some languages
    # There is no \text... LaTeX escape for the hyphen character -
    ('-', r'\sphinxhyphen{}'),  # -- and --- are TeX ligatures
    # ,, is a TeX ligature in T1 encoding, but escaping the comma adds
    # complications (whether by {}, or a macro) and is not done
    # the next two require textcomp package
    ("'", r'\textquotesingle{}'),  # else ' renders curly, and '' is a ligature
    ('`', r'\textasciigrave{}'),   # else \` and \`\` render curly
    ('<', r'\textless{}'),     # < is inv. exclam in OT1, << is a T1-ligature
    ('>', r'\textgreater{}'),  # > is inv. quest. mark in 0T1, >> a T1-ligature
]

# A map Unicode characters to LaTeX representation
# (for LaTeX engines which don't support unicode)
unicode_tex_replacements = [
    # map some more common Unicode characters to TeX commands
    ('¶', r'\P{}'),
    ('§', r'\S{}'),
    ('€', r'\texteuro{}'),
    ('∞', r'\(\infty\)'),
    ('±', r'\(\pm\)'),
    ('→', r'\(\rightarrow\)'),
    ('‣', r'\(\rightarrow\)'),
    ('–', r'\textendash{}'),
    # superscript
    ('⁰', r'\(\sp{\text{0}}\)'),
    ('¹', r'\(\sp{\text{1}}\)'),
    ('²', r'\(\sp{\text{2}}\)'),
    ('³', r'\(\sp{\text{3}}\)'),
    ('⁴', r'\(\sp{\text{4}}\)'),
    ('⁵', r'\(\sp{\text{5}}\)'),
    ('⁶', r'\(\sp{\text{6}}\)'),
    ('⁷', r'\(\sp{\text{7}}\)'),
    ('⁸', r'\(\sp{\text{8}}\)'),
    ('⁹', r'\(\sp{\text{9}}\)'),
    # subscript
    ('₀', r'\(\sb{\text{0}}\)'),
    ('₁', r'\(\sb{\text{1}}\)'),
    ('₂', r'\(\sb{\text{2}}\)'),
    ('₃', r'\(\sb{\text{3}}\)'),
    ('₄', r'\(\sb{\text{4}}\)'),
    ('₅', r'\(\sb{\text{5}}\)'),
    ('₆', r'\(\sb{\text{6}}\)'),
    ('₇', r'\(\sb{\text{7}}\)'),
    ('₈', r'\(\sb{\text{8}}\)'),
    ('₉', r'\(\sb{\text{9}}\)'),
]

# TODO: this should be called tex_idescape_map because its only use is in
#       sphinx.writers.latex.LaTeXTranslator.idescape()
# %, {, }, \, #, and ~ are the only ones which must be replaced by _ character
# It would be simpler to define it entirely here rather than in init().
# Unicode replacements are superfluous, as idescape() uses backslashreplace
tex_replace_map: dict[int, str] = {}

_tex_escape_map: dict[int, str] = {}
_tex_escape_map_without_unicode: dict[int, str] = {}
_tex_hlescape_map: dict[int, str] = {}
_tex_hlescape_map_without_unicode: dict[int, str] = {}


def escape(s: str, latex_engine: str | None = None) -> str:
    """Escape text for LaTeX output."""
    if latex_engine in ('lualatex', 'xelatex'):
        # unicode based LaTeX engine
        return s.translate(_tex_escape_map_without_unicode)
    else:
        return s.translate(_tex_escape_map)


def hlescape(s: str, latex_engine: str | None = None) -> str:
    """Escape text for LaTeX highlighter."""
    if latex_engine in ('lualatex', 'xelatex'):
        # unicode based LaTeX engine
        return s.translate(_tex_hlescape_map_without_unicode)
    else:
        return s.translate(_tex_hlescape_map)


def escape_abbr(text: str) -> str:
    """Adjust spacing after abbreviations. Works with @ letter or other."""
    return re.sub(r'\.(?=\s|$)', r'.\@{}', text)


def init() -> None:
    for a, b in tex_replacements:
        _tex_escape_map[ord(a)] = b
        _tex_escape_map_without_unicode[ord(a)] = b
        tex_replace_map[ord(a)] = '_'

    # no reason to do this for _tex_escape_map_without_unicode
    for a, b in ascii_tex_replacements:
        _tex_escape_map[ord(a)] = b

    # but the hyphen has a specific PDF bookmark problem
    # https://github.com/latex3/hyperref/issues/112
    _tex_escape_map_without_unicode[ord('-')] = r'\sphinxhyphen{}'

    for a, b in unicode_tex_replacements:
        _tex_escape_map[ord(a)] = b
        #  This is actually unneeded:
        tex_replace_map[ord(a)] = '_'

    for a, b in tex_replacements:
        if a in '[]{}\\':
            continue
        _tex_hlescape_map[ord(a)] = b
        _tex_hlescape_map_without_unicode[ord(a)] = b

    for a, b in unicode_tex_replacements:
        _tex_hlescape_map[ord(a)] = b