1 files changed, 153 insertions, 0 deletions
diff --git a/sphinx/util/texescape.py b/sphinx/util/texescape.py
new file mode 100644
index 0000000..8527441
--- /dev/null
+++ b/sphinx/util/texescape.py
@@ -0,0 +1,153 @@
+"""TeX escaping helper."""
+
+from __future__ import annotations
+
+import re
+
+tex_replacements = [
+    # (char, replacement) pairs used for every engine; first the TeX special chars
+    ('$', r'\$'),
+    ('%', r'\%'),
+    ('&', r'\&'),
+    ('#', r'\#'),
+    ('_', r'\_'),
+    ('{', r'\{'),
+    ('}', r'\}'),
+    ('\\', r'\textbackslash{}'),
+    ('~', r'\textasciitilde{}'),
+    ('^', r'\textasciicircum{}'),
+    # map chars to avoid mis-interpretation in LaTeX
+    ('[', r'{[}'),
+    (']', r'{]}'),
+    # map special Unicode characters to TeX commands
+    ('✓', r'\(\checkmark\)'),
+    ('✔', r'\(\pmb{\checkmark}\)'),
+    ('✕', r'\(\times\)'),
+    ('✖', r'\(\pmb{\times}\)'),
+    # used to separate -- in options
+    ('\ufeff', r'{}'),  # invisible char spelled as an escape; NOTE(review): confirm U+FEFF
+    # map some special Unicode characters to similar ASCII ones
+    # (even for Unicode LaTeX as may not be supported by OpenType font)
+    ('⎽', r'\_'),
+    ('ℯ', r'e'),
+    ('ⅈ', r'i'),
+    # Greek alphabet not escaped: pdflatex handles it via textalpha and inputenc
+    # OHM SIGN U+2126 is handled by LaTeX textcomp package
+]
+
+# Replacements that avoid TeX ligatures or character substitutions in PDF output;
+# xelatex/lualatex/uplatex are handled differently (#5790, #6888)
+ascii_tex_replacements = [
+    # Note: the " renders curly in OT1 encoding but straight in T1, T2A, LY1...
+    #       escaping it to \textquotedbl would break documents using OT1
+    #       Sphinx does \shorthandoff{"} to avoid problems with some languages
+    # There is no \text... LaTeX escape for the hyphen character -
+    ('-', r'\sphinxhyphen{}'),  # -- and --- are TeX ligatures
+    # ,, is a TeX ligature in T1 encoding, but escaping the comma adds
+    # complications (whether by {}, or a macro) and is not done
+    # the next two require textcomp package
+    ("'", r'\textquotesingle{}'),  # else ' renders curly, and '' is a ligature
+    ('`', r'\textasciigrave{}'),   # else \` and \`\` render curly
+    ('<', r'\textless{}'),     # < is inv. exclam in OT1, << is a T1-ligature
+    ('>', r'\textgreater{}'),  # > is inv. quest. mark in OT1, >> a T1-ligature
+]
+
+# Maps Unicode characters to their LaTeX representation
+# (for LaTeX engines which don't support Unicode)
+unicode_tex_replacements = [
+    # map some more common Unicode characters to TeX commands
+    ('¶', r'\P{}'),
+    ('§', r'\S{}'),
+    ('€', r'\texteuro{}'),
+    ('∞', r'\(\infty\)'),
+    ('±', r'\(\pm\)'),
+    ('→', r'\(\rightarrow\)'),
+    ('‣', r'\(\rightarrow\)'),
+    ('–', r'\textendash{}'),
+    # superscript digits, typeset as text inside inline math
+    ('⁰', r'\(\sp{\text{0}}\)'),
+    ('¹', r'\(\sp{\text{1}}\)'),
+    ('²', r'\(\sp{\text{2}}\)'),
+    ('³', r'\(\sp{\text{3}}\)'),
+    ('⁴', r'\(\sp{\text{4}}\)'),
+    ('⁵', r'\(\sp{\text{5}}\)'),
+    ('⁶', r'\(\sp{\text{6}}\)'),
+    ('⁷', r'\(\sp{\text{7}}\)'),
+    ('⁸', r'\(\sp{\text{8}}\)'),
+    ('⁹', r'\(\sp{\text{9}}\)'),
+    # subscript digits, typeset as text inside inline math
+    ('₀', r'\(\sb{\text{0}}\)'),
+    ('₁', r'\(\sb{\text{1}}\)'),
+    ('₂', r'\(\sb{\text{2}}\)'),
+    ('₃', r'\(\sb{\text{3}}\)'),
+    ('₄', r'\(\sb{\text{4}}\)'),
+    ('₅', r'\(\sb{\text{5}}\)'),
+    ('₆', r'\(\sb{\text{6}}\)'),
+    ('₇', r'\(\sb{\text{7}}\)'),
+    ('₈', r'\(\sb{\text{8}}\)'),
+    ('₉', r'\(\sb{\text{9}}\)'),
+]
+
+# TODO: this should be called tex_idescape_map because its only use is in
+#       sphinx.writers.latex.LaTeXTranslator.idescape()
+# %, {, }, \, #, and ~ are the only ones which must be replaced by _ character
+# It would be simpler to define it entirely here rather than in init().
+# Unicode replacements are superfluous, as idescape() uses backslashreplace
+tex_replace_map: dict[int, str] = {}  # code point -> '_'; filled by init()
+
+_tex_escape_map: dict[int, str] = {}  # escape() table; filled by init()
+_tex_escape_map_without_unicode: dict[int, str] = {}  # escape(), lualatex/xelatex
+_tex_hlescape_map: dict[int, str] = {}  # hlescape() table; filled by init()
+_tex_hlescape_map_without_unicode: dict[int, str] = {}  # hlescape(), lualatex/xelatex
+
+
+def escape(s: str, latex_engine: str | None = None) -> str:
+    """Translate *s* for LaTeX output using the table that fits *latex_engine*."""
+    if latex_engine not in ('lualatex', 'xelatex'):
+        table = _tex_escape_map
+    else:
+        table = _tex_escape_map_without_unicode  # unicode based LaTeX engine
+    return s.translate(table)
+
+
+def hlescape(s: str, latex_engine: str | None = None) -> str:
+    """Translate *s* for the LaTeX highlighter using the table for *latex_engine*."""
+    if latex_engine not in ('lualatex', 'xelatex'):
+        table = _tex_hlescape_map
+    else:
+        table = _tex_hlescape_map_without_unicode  # unicode based LaTeX engine
+    return s.translate(table)
+
+
+def escape_abbr(text: str) -> str:
+    r"""Adjust spacing after abbreviations: add \@{} after a '.' before whitespace/end."""
+    return re.compile(r'\.(?=\s|$)').sub(lambda _dot: r'.\@{}', text)
+
+
+def init() -> None:
+    """Fill the module-level translation tables from the replacement lists.
+
+    Keys are code points (``ord(char)``), the form ``str.translate`` expects.
+    """
+    hl_skipped = frozenset('[]{}\\')  # not escaped in the highlighter tables
+
+    # tex_replacements feed every table, minus the skipped chars for the highlighter
+    for char, repl in tex_replacements:
+        cp = ord(char)
+        _tex_escape_map[cp] = _tex_escape_map_without_unicode[cp] = repl
+        tex_replace_map[cp] = '_'
+        if char not in hl_skipped:
+            _tex_hlescape_map[cp] = _tex_hlescape_map_without_unicode[cp] = repl
+
+    # no reason to do this for _tex_escape_map_without_unicode
+    _tex_escape_map.update((ord(char), repl) for char, repl in ascii_tex_replacements)
+
+    # but the hyphen has a specific PDF bookmark problem
+    # https://github.com/latex3/hyperref/issues/112
+    _tex_escape_map_without_unicode[ord('-')] = r'\sphinxhyphen{}'
+
+    for char, repl in unicode_tex_replacements:
+        cp = ord(char)
+        _tex_escape_map[cp] = _tex_hlescape_map[cp] = repl
+        # the '_' entry below is actually unneeded
+        tex_replace_map[cp] = '_'