src/prompt_toolkit/lexers/pygments.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335

"""
Adaptor classes for using Pygments lexers within prompt_toolkit.

This includes syntax synchronization code, so that we don't have to start
lexing at the beginning of a document, when displaying a very large text.
"""
import re
from abc import ABCMeta, abstractmethod
from typing import (
    TYPE_CHECKING,
    Callable,
    Dict,
    Generator,
    Iterable,
    Optional,
    Tuple,
    Type,
)

from prompt_toolkit.document import Document
from prompt_toolkit.filters import FilterOrBool, to_filter
from prompt_toolkit.formatted_text.base import StyleAndTextTuples
from prompt_toolkit.formatted_text.utils import split_lines
from prompt_toolkit.styles.pygments import pygments_token_to_classname

from .base import Lexer, SimpleLexer

if TYPE_CHECKING:
    from pygments.lexer import Lexer as PygmentsLexerCls

__all__ = [
    "PygmentsLexer",
    "SyntaxSync",
    "SyncFromStart",
    "RegexSync",
]


class SyntaxSync(metaclass=ABCMeta):
    """
    Abstract base class for syntax synchronisers.

    A syntax synchroniser finds a start position for the lexer. This is
    especially important when editing big documents; we don't want to start
    the highlighting by running the lexer from the beginning of the file on
    every change, because that is very slow when editing.

    Subclasses have to implement :meth:`get_sync_start_position`.
    """

    @abstractmethod
    def get_sync_start_position(
        self, document: Document, lineno: int
    ) -> Tuple[int, int]:
        """
        Return the position from where we can start lexing as a (row, column)
        tuple.

        :param document: `Document` instance that contains all the lines.
        :param lineno: The line that we want to highlight. (We need to return
            this line, or an earlier position.)
        :returns: A ``(row, column)`` tuple that tells where lexing should
            start.
        """


class SyncFromStart(SyntaxSync):
    """
    Synchroniser that never skips anything: the syntax highlighting always
    starts at the very first character of the document.
    """

    def get_sync_start_position(
        self, document: Document, lineno: int
    ) -> Tuple[int, int]:
        """
        Return ``(0, 0)``, regardless of the given `document` and `lineno`.
        """
        start_row, start_column = 0, 0
        return start_row, start_column


class RegexSync(SyntaxSync):
    """
    Synchronize by starting at a line that matches the given regex pattern.

    :param pattern: Regular expression (as a string). It is compiled once and
        applied to individual lines with ``re.match``, which means that it is
        anchored at the start of each line.
    """

    # Never go more than this amount of lines backwards for synchronisation.
    # That would be too CPU intensive.
    MAX_BACKWARDS = 500

    # Start lexing at the start, if we are in the first 'n' lines and no
    # synchronisation position was found.
    FROM_START_IF_NO_SYNC_POS_FOUND = 100

    def __init__(self, pattern: str) -> None:
        self._compiled_pattern = re.compile(pattern)

    def get_sync_start_position(
        self, document: Document, lineno: int
    ) -> Tuple[int, int]:
        """
        Scan backwards, and find a possible position to start.

        :param document: `Document` instance; only its `lines` are read.
        :param lineno: The line that we want to highlight. This may lie past
            the end of the document; in that case the scan starts at the last
            existing line instead of raising `IndexError`.
        :returns: ``(row, column)`` of the closest matching line at or above
            `lineno` (looking back less than `MAX_BACKWARDS` lines from
            `lineno`). When nothing matches: ``(0, 0)`` if `lineno` is below
            `FROM_START_IF_NO_SYNC_POS_FOUND`, otherwise ``(lineno, 0)``.
        """
        pattern = self._compiled_pattern
        lines = document.lines

        # Never index past the end of the document. For every `lineno` that
        # is within the document, this is exactly `lineno`.
        scan_from = min(lineno, len(lines) - 1)

        # The lookback window is measured from the requested line. (The stop
        # value of `range` is exclusive, hence -1 to include line 0.)
        scan_until = max(-1, lineno - self.MAX_BACKWARDS)

        # Scan upwards, until we find a point where we can start the syntax
        # synchronisation.
        for i in range(scan_from, scan_until, -1):
            match = pattern.match(lines[i])
            if match:
                return i, match.start()

        # No synchronisation point found. If we aren't that far from the
        # beginning, start at the very beginning, otherwise, just try to start
        # at the current line.
        if lineno < self.FROM_START_IF_NO_SYNC_POS_FOUND:
            return 0, 0
        else:
            return lineno, 0

    @classmethod
    def from_pygments_lexer_cls(cls, lexer_cls: "PygmentsLexerCls") -> "RegexSync":
        """
        Create a :class:`.RegexSync` instance for this Pygments lexer class.

        The pattern is looked up by the `name` attribute of `lexer_cls`. For
        names that are not in the table below, the pattern ``"^"`` is used,
        which matches every line.
        """
        patterns = {
            # For Python, start highlighting at any class/def block.
            "Python": r"^\s*(class|def)\s+",
            "Python 3": r"^\s*(class|def)\s+",
            # For HTML, start at any open/close tag definition.
            "HTML": r"<[/a-zA-Z]",
            # For javascript, start at a function.
            "JavaScript": r"\bfunction\b",
            # TODO: Add definitions for other languages.
            #       By default, we start at every possible line.
        }
        p = patterns.get(lexer_cls.name, "^")
        return cls(p)


class _TokenCache(Dict[Tuple[str, ...], str]):
    """
    Memoising dictionary that maps Pygments tokens to `prompt_toolkit` style
    strings.

    Looking up a token that is not in the dictionary yet computes its style
    string (``"class:"`` followed by whatever `pygments_token_to_classname`
    returns for that token), stores it, and returns it. Later lookups of the
    same token are plain dictionary hits.
    """

    def __missing__(self, key: Tuple[str, ...]) -> str:
        # Called by `dict` for unknown keys only: compute once, remember.
        self[key] = style = "class:" + pygments_token_to_classname(key)
        return style


# Module-level cache instance, shared by everything in this module.
_token_cache = _TokenCache()


class PygmentsLexer(Lexer):
    """
    Lexer that calls a pygments lexer.

    Example::

        from pygments.lexers.html import HtmlLexer
        lexer = PygmentsLexer(HtmlLexer)

    Note: Don't forget to also load a Pygments compatible style. E.g.::

        from prompt_toolkit.styles.from_pygments import style_from_pygments_cls
        from pygments.styles import get_style_by_name
        style = style_from_pygments_cls(get_style_by_name('monokai'))

    :param pygments_lexer_cls: A `Lexer` class from Pygments. (The class
        itself, not an instance; it is instantiated in `__init__`.)
    :param sync_from_start: Start lexing at the start of the document. This
        will always give the best results, but it will be slow for bigger
        documents. (When the last part of the document is displayed, then the
        whole document will be lexed by Pygments on every key stroke.) It is
        recommended to disable this for inputs that are expected to be more
        than 1,000 lines.
    :param syntax_sync: `SyntaxSync` object. It is only consulted when
        `sync_from_start` evaluates to false. When not given, a `RegexSync`
        is created for the given Pygments lexer class.
    """

    # Minimum amount of lines to go backwards when starting the parser.
    # This is important when the lines are retrieved in reverse order, or when
    # scrolling upwards. (Due to the complexity of calculating the vertical
    # scroll offset in the `Window` class, lines are not always retrieved in
    # order.)
    MIN_LINES_BACKWARDS = 50

    # When a parser was started this amount of lines back, read the parser
    # until we get the current line. Otherwise, start a new parser.
    # (This should probably be bigger than MIN_LINES_BACKWARDS.)
    REUSE_GENERATOR_MAX_DISTANCE = 100

    def __init__(
        self,
        pygments_lexer_cls: Type["PygmentsLexerCls"],
        sync_from_start: FilterOrBool = True,
        syntax_sync: Optional[SyntaxSync] = None,
    ) -> None:

        self.pygments_lexer_cls = pygments_lexer_cls
        # Turned into a filter, so that it can be evaluated (called) each
        # time a synchroniser has to be chosen in `lex_document`.
        self.sync_from_start = to_filter(sync_from_start)

        # Instantiate the Pygments lexer.
        # (All the Pygments newline/whitespace normalisation options are
        # turned off here.)
        self.pygments_lexer = pygments_lexer_cls(
            stripnl=False, stripall=False, ensurenl=False
        )

        # Create syntax sync instance.
        self.syntax_sync = syntax_sync or RegexSync.from_pygments_lexer_cls(
            pygments_lexer_cls
        )

    @classmethod
    def from_filename(
        cls, filename: str, sync_from_start: FilterOrBool = True
    ) -> "Lexer":
        """
        Create a `Lexer` from a filename.

        :param filename: Name of the file; Pygments uses it to pick a lexer.
        :param sync_from_start: Passed on to the constructor.
        :returns: An instance of this class for the lexer that Pygments
            found, or a `SimpleLexer` when Pygments raises `ClassNotFound`
            for this filename.
        """
        # Inline imports: the Pygments dependency is optional!
        from pygments.lexers import get_lexer_for_filename
        from pygments.util import ClassNotFound

        try:
            pygments_lexer = get_lexer_for_filename(filename)
        except ClassNotFound:
            return SimpleLexer()
        else:
            # Pygments hands back a lexer instance; our constructor wants the
            # class.
            return cls(pygments_lexer.__class__, sync_from_start=sync_from_start)

    def lex_document(self, document: Document) -> Callable[[int], StyleAndTextTuples]:
        """
        Create a lexer function that takes a line number and returns the list
        of (style_str, text) tuples as the Pygments lexer returns for that line.

        The returned function lexes lazily: lines that were already lexed are
        served from a cache, and running Pygments generators are kept around
        so that they can be continued for later lines. Both the cache and the
        generators belong to this single `lex_document` call.
        """
        LineGenerator = Generator[Tuple[int, StyleAndTextTuples], None, None]

        # Cache of already lexed lines.
        cache: Dict[int, StyleAndTextTuples] = {}

        # Pygments generators that are currently lexing.
        # Map lexer generator to the line number.
        line_generators: Dict[LineGenerator, int] = {}

        def get_syntax_sync() -> SyntaxSync:
            "The Syntax synchronisation object that we currently use."
            if self.sync_from_start():
                return SyncFromStart()
            else:
                return self.syntax_sync

        def find_closest_generator(i: int) -> Optional[LineGenerator]:
            "Return a generator close to line 'i', or None if none was found."
            # NOTE(review): this returns the first registered generator that
            # is before line 'i' and within REUSE_GENERATOR_MAX_DISTANCE, not
            # necessarily the nearest one.
            for generator, lineno in line_generators.items():
                if lineno < i and i - lineno < self.REUSE_GENERATOR_MAX_DISTANCE:
                    return generator
            return None

        def create_line_generator(start_lineno: int, column: int = 0) -> LineGenerator:
            """
            Create a generator that yields the lexed lines.
            Each iteration it yields a (line_number, [(style_str, text), ...]) tuple.

            Lexing covers everything from `start_lineno` until the end of the
            document, minus the first `column` characters of that text.
            """

            def get_text_fragments() -> Iterable[Tuple[str, str]]:
                text = "\n".join(document.lines[start_lineno:])[column:]

                # We call `get_tokens_unprocessed`, because `get_tokens` will
                # still replace \r\n and \r by \n.  (We don't want that,
                # Pygments should return exactly the same amount of text, as we
                # have given as input.)
                for _, t, v in self.pygments_lexer.get_tokens_unprocessed(text):
                    # Turn Pygments `Token` object into prompt_toolkit style
                    # strings.
                    yield _token_cache[t], v

            # The fragments are collected into a list first, then split into
            # lines, which are numbered starting at `start_lineno`.
            yield from enumerate(split_lines(list(get_text_fragments())), start_lineno)

        def get_generator(i: int) -> LineGenerator:
            """
            Find an already started generator that is close, or create a new one.
            """
            # Find closest line generator.
            generator = find_closest_generator(i)
            if generator:
                return generator

            # No generator found. Determine starting point for the syntax
            # synchronisation first.

            # Go at least x lines back. (Make scrolling upwards more
            # efficient.)
            # (From here on, 'i' is the moved-back line, not the line that
            # was requested.)
            i = max(0, i - self.MIN_LINES_BACKWARDS)

            if i == 0:
                row = 0
                column = 0
            else:
                row, column = get_syntax_sync().get_sync_start_position(document, i)

            # Find generator close to this point, or otherwise create a new one.
            generator = find_closest_generator(i)
            if generator:
                return generator
            else:
                generator = create_line_generator(row, column)

            # If the column is not 0, ignore the first line. (Which is
            # incomplete. This happens when the synchronisation algorithm tells
            # us to start parsing in the middle of a line.)
            if column:
                next(generator)
                row += 1

            # Register the new generator under the row where it starts.
            line_generators[generator] = row
            return generator

        def get_line(i: int) -> StyleAndTextTuples:
            "Return the tokens for a given line number."
            try:
                return cache[i]
            except KeyError:
                generator = get_generator(i)

                # Exhaust the generator, until we find the requested line.
                for num, line in generator:
                    cache[num] = line
                    if num == i:
                        # Remember how far this generator got.
                        line_generators[generator] = i

                        # Remove the next item from the cache.
                        # (It could happen that it's already there, because of
                        # another generator that started filling these lines,
                        # but we want to synchronise these lines with the
                        # current lexer's state.)
                        if num + 1 in cache:
                            del cache[num + 1]

                        return cache[num]
            # The generator ran out before it produced line 'i'.
            return []

        return get_line