3 files changed, 431 insertions, 0 deletions
diff --git a/src/prompt_toolkit/lexers/__init__.py b/src/prompt_toolkit/lexers/__init__.py
new file mode 100644
index 0000000..9bdc599
--- /dev/null
+++ b/src/prompt_toolkit/lexers/__init__.py
@@ -0,0 +1,20 @@
+"""
+Lexer interface and implementations.
+Used for syntax highlighting.
+"""
+from __future__ import annotations
+
+from .base import DynamicLexer, Lexer, SimpleLexer
+from .pygments import PygmentsLexer, RegexSync, SyncFromStart, SyntaxSync
+
+__all__ = [
+    # Base.
+    "Lexer",
+    "SimpleLexer",
+    "DynamicLexer",
+    # Pygments.
+    "PygmentsLexer",
+    "RegexSync",
+    "SyncFromStart",
+    "SyntaxSync",
+]
diff --git a/src/prompt_toolkit/lexers/base.py b/src/prompt_toolkit/lexers/base.py
new file mode 100644
index 0000000..3f65f8e
--- /dev/null
+++ b/src/prompt_toolkit/lexers/base.py
@@ -0,0 +1,84 @@
+"""
+Base classes for prompt_toolkit lexers.
+"""
+from __future__ import annotations
+
+from abc import ABCMeta, abstractmethod
+from typing import Callable, Hashable
+
+from prompt_toolkit.document import Document
+from prompt_toolkit.formatted_text.base import StyleAndTextTuples
+
+__all__ = [
+    "Lexer",
+    "SimpleLexer",
+    "DynamicLexer",
+]
+
+
+class Lexer(metaclass=ABCMeta):
+    """
+    Base class for all lexers.
+    """
+
+    @abstractmethod
+    def lex_document(self, document: Document) -> Callable[[int], StyleAndTextTuples]:
+        """
+        Takes a :class:`~prompt_toolkit.document.Document` and returns a
+        callable that takes a line number and returns a list of
+        ``(style_str, text)`` tuples for that line.
+
+        XXX: Note that in the past, this was supposed to return a list
+             of ``(Token, text)`` tuples, just like a Pygments lexer.
+        """
+
+    def invalidation_hash(self) -> Hashable:
+        """
+        When this changes, `lex_document` could give a different output.
+        (Only used for `DynamicLexer`.)
+        """
+        return id(self)
+
+
+class SimpleLexer(Lexer):
+    """
+    Lexer that doesn't do any tokenizing and returns the whole input as one
+    token.
+
+    :param style: The style string for this lexer.
+    """
+
+    def __init__(self, style: str = "") -> None:
+        self.style = style
+
+    def lex_document(self, document: Document) -> Callable[[int], StyleAndTextTuples]:
+        lines = document.lines
+
+        def get_line(lineno: int) -> StyleAndTextTuples:
+            "Return the tokens for the given line."
+            try:
+                return [(self.style, lines[lineno])]
+            except IndexError:
+                return []
+
+        return get_line
+
+
+class DynamicLexer(Lexer):
+    """
+    Lexer class that can dynamically returns any Lexer.
+
+    :param get_lexer: Callable that returns a :class:`.Lexer` instance.
+    """
+
+    def __init__(self, get_lexer: Callable[[], Lexer | None]) -> None:
+        self.get_lexer = get_lexer
+        self._dummy = SimpleLexer()
+
+    def lex_document(self, document: Document) -> Callable[[int], StyleAndTextTuples]:
+        lexer = self.get_lexer() or self._dummy
+        return lexer.lex_document(document)
+
+    def invalidation_hash(self) -> Hashable:
+        lexer = self.get_lexer() or self._dummy
+        return id(lexer)
diff --git a/src/prompt_toolkit/lexers/pygments.py b/src/prompt_toolkit/lexers/pygments.py
new file mode 100644
index 0000000..4721d73
--- /dev/null
+++ b/src/prompt_toolkit/lexers/pygments.py
@@ -0,0 +1,327 @@
+"""
+Adaptor classes for using Pygments lexers within prompt_toolkit.
+
+This includes syntax synchronization code, so that we don't have to start
+lexing at the beginning of a document, when displaying a very large text.
+"""
+from __future__ import annotations
+
+import re
+from abc import ABCMeta, abstractmethod
+from typing import TYPE_CHECKING, Callable, Dict, Generator, Iterable, Tuple
+
+from prompt_toolkit.document import Document
+from prompt_toolkit.filters import FilterOrBool, to_filter
+from prompt_toolkit.formatted_text.base import StyleAndTextTuples
+from prompt_toolkit.formatted_text.utils import split_lines
+from prompt_toolkit.styles.pygments import pygments_token_to_classname
+
+from .base import Lexer, SimpleLexer
+
+if TYPE_CHECKING:
+    from pygments.lexer import Lexer as PygmentsLexerCls
+
+__all__ = [
+    "PygmentsLexer",
+    "SyntaxSync",
+    "SyncFromStart",
+    "RegexSync",
+]
+
+
+class SyntaxSync(metaclass=ABCMeta):
+    """
+    Syntax synchronizer. This is a tool that finds a start position for the
+    lexer. This is especially important when editing big documents; we don't
+    want to start the highlighting by running the lexer from the beginning of
+    the file. That is very slow when editing.
+    """
+
+    @abstractmethod
+    def get_sync_start_position(
+        self, document: Document, lineno: int
+    ) -> tuple[int, int]:
+        """
+        Return the position from where we can start lexing as a (row, column)
+        tuple.
+
+        :param document: `Document` instance that contains all the lines.
+        :param lineno: The line that we want to highlight. (We need to return
+            this line, or an earlier position.)
+        """
+
+
+class SyncFromStart(SyntaxSync):
+    """
+    Always start the syntax highlighting from the beginning.
+    """
+
+    def get_sync_start_position(
+        self, document: Document, lineno: int
+    ) -> tuple[int, int]:
+        return 0, 0
+
+
+class RegexSync(SyntaxSync):
+    """
+    Synchronize by starting at a line that matches the given regex pattern.
+    """
+
+    # Never go more than this amount of lines backwards for synchronization.
+    # That would be too CPU intensive.
+    MAX_BACKWARDS = 500
+
+    # Start lexing at the start, if we are in the first 'n' lines and no
+    # synchronization position was found.
+    FROM_START_IF_NO_SYNC_POS_FOUND = 100
+
+    def __init__(self, pattern: str) -> None:
+        self._compiled_pattern = re.compile(pattern)
+
+    def get_sync_start_position(
+        self, document: Document, lineno: int
+    ) -> tuple[int, int]:
+        """
+        Scan backwards, and find a possible position to start.
+        """
+        pattern = self._compiled_pattern
+        lines = document.lines
+
+        # Scan upwards, until we find a point where we can start the syntax
+        # synchronization.
+        for i in range(lineno, max(-1, lineno - self.MAX_BACKWARDS), -1):
+            match = pattern.match(lines[i])
+            if match:
+                return i, match.start()
+
+        # No synchronization point found. If we aren't that far from the
+        # beginning, start at the very beginning, otherwise, just try to start
+        # at the current line.
+        if lineno < self.FROM_START_IF_NO_SYNC_POS_FOUND:
+            return 0, 0
+        else:
+            return lineno, 0
+
+    @classmethod
+    def from_pygments_lexer_cls(cls, lexer_cls: PygmentsLexerCls) -> RegexSync:
+        """
+        Create a :class:`.RegexSync` instance for this Pygments lexer class.
+        """
+        patterns = {
+            # For Python, start highlighting at any class/def block.
+            "Python": r"^\s*(class|def)\s+",
+            "Python 3": r"^\s*(class|def)\s+",
+            # For HTML, start at any open/close tag definition.
+            "HTML": r"<[/a-zA-Z]",
+            # For javascript, start at a function.
+            "JavaScript": r"\bfunction\b",
+            # TODO: Add definitions for other languages.
+            #       By default, we start at every possible line.
+        }
+        p = patterns.get(lexer_cls.name, "^")
+        return cls(p)
+
+
+class _TokenCache(Dict[Tuple[str, ...], str]):
+    """
+    Cache that converts Pygments tokens into `prompt_toolkit` style objects.
+
+    ``Token.A.B.C`` will be converted into:
+    ``class:pygments,pygments.A,pygments.A.B,pygments.A.B.C``
+    """
+
+    def __missing__(self, key: tuple[str, ...]) -> str:
+        result = "class:" + pygments_token_to_classname(key)
+        self[key] = result
+        return result
+
+
+_token_cache = _TokenCache()
+
+
+class PygmentsLexer(Lexer):
+    """
+    Lexer that calls a pygments lexer.
+
+    Example::
+
+        from pygments.lexers.html import HtmlLexer
+        lexer = PygmentsLexer(HtmlLexer)
+
+    Note: Don't forget to also load a Pygments compatible style. E.g.::
+
+        from prompt_toolkit.styles.from_pygments import style_from_pygments_cls
+        from pygments.styles import get_style_by_name
+        style = style_from_pygments_cls(get_style_by_name('monokai'))
+
+    :param pygments_lexer_cls: A `Lexer` from Pygments.
+    :param sync_from_start: Start lexing at the start of the document. This
+        will always give the best results, but it will be slow for bigger
+        documents. (When the last part of the document is display, then the
+        whole document will be lexed by Pygments on every key stroke.) It is
+        recommended to disable this for inputs that are expected to be more
+        than 1,000 lines.
+    :param syntax_sync: `SyntaxSync` object.
+    """
+
+    # Minimum amount of lines to go backwards when starting the parser.
+    # This is important when the lines are retrieved in reverse order, or when
+    # scrolling upwards. (Due to the complexity of calculating the vertical
+    # scroll offset in the `Window` class, lines are not always retrieved in
+    # order.)
+    MIN_LINES_BACKWARDS = 50
+
+    # When a parser was started this amount of lines back, read the parser
+    # until we get the current line. Otherwise, start a new parser.
+    # (This should probably be bigger than MIN_LINES_BACKWARDS.)
+    REUSE_GENERATOR_MAX_DISTANCE = 100
+
+    def __init__(
+        self,
+        pygments_lexer_cls: type[PygmentsLexerCls],
+        sync_from_start: FilterOrBool = True,
+        syntax_sync: SyntaxSync | None = None,
+    ) -> None:
+        self.pygments_lexer_cls = pygments_lexer_cls
+        self.sync_from_start = to_filter(sync_from_start)
+
+        # Instantiate the Pygments lexer.
+        self.pygments_lexer = pygments_lexer_cls(
+            stripnl=False, stripall=False, ensurenl=False
+        )
+
+        # Create syntax sync instance.
+        self.syntax_sync = syntax_sync or RegexSync.from_pygments_lexer_cls(
+            pygments_lexer_cls
+        )
+
+    @classmethod
+    def from_filename(
+        cls, filename: str, sync_from_start: FilterOrBool = True
+    ) -> Lexer:
+        """
+        Create a `Lexer` from a filename.
+        """
+        # Inline imports: the Pygments dependency is optional!
+        from pygments.lexers import get_lexer_for_filename
+        from pygments.util import ClassNotFound
+
+        try:
+            pygments_lexer = get_lexer_for_filename(filename)
+        except ClassNotFound:
+            return SimpleLexer()
+        else:
+            return cls(pygments_lexer.__class__, sync_from_start=sync_from_start)
+
+    def lex_document(self, document: Document) -> Callable[[int], StyleAndTextTuples]:
+        """
+        Create a lexer function that takes a line number and returns the list
+        of (style_str, text) tuples as the Pygments lexer returns for that line.
+        """
+        LineGenerator = Generator[Tuple[int, StyleAndTextTuples], None, None]
+
+        # Cache of already lexed lines.
+        cache: dict[int, StyleAndTextTuples] = {}
+
+        # Pygments generators that are currently lexing.
+        # Map lexer generator to the line number.
+        line_generators: dict[LineGenerator, int] = {}
+
+        def get_syntax_sync() -> SyntaxSync:
+            "The Syntax synchronization object that we currently use."
+            if self.sync_from_start():
+                return SyncFromStart()
+            else:
+                return self.syntax_sync
+
+        def find_closest_generator(i: int) -> LineGenerator | None:
+            "Return a generator close to line 'i', or None if none was found."
+            for generator, lineno in line_generators.items():
+                if lineno < i and i - lineno < self.REUSE_GENERATOR_MAX_DISTANCE:
+                    return generator
+            return None
+
+        def create_line_generator(start_lineno: int, column: int = 0) -> LineGenerator:
+            """
+            Create a generator that yields the lexed lines.
+            Each iteration it yields a (line_number, [(style_str, text), ...]) tuple.
+            """
+
+            def get_text_fragments() -> Iterable[tuple[str, str]]:
+                text = "\n".join(document.lines[start_lineno:])[column:]
+
+                # We call `get_text_fragments_unprocessed`, because `get_tokens` will
+                # still replace \r\n and \r by \n.  (We don't want that,
+                # Pygments should return exactly the same amount of text, as we
+                # have given as input.)
+                for _, t, v in self.pygments_lexer.get_tokens_unprocessed(text):
+                    # Turn Pygments `Token` object into prompt_toolkit style
+                    # strings.
+                    yield _token_cache[t], v
+
+            yield from enumerate(split_lines(list(get_text_fragments())), start_lineno)
+
+        def get_generator(i: int) -> LineGenerator:
+            """
+            Find an already started generator that is close, or create a new one.
+            """
+            # Find closest line generator.
+            generator = find_closest_generator(i)
+            if generator:
+                return generator
+
+            # No generator found. Determine starting point for the syntax
+            # synchronization first.
+
+            # Go at least x lines back. (Make scrolling upwards more
+            # efficient.)
+            i = max(0, i - self.MIN_LINES_BACKWARDS)
+
+            if i == 0:
+                row = 0
+                column = 0
+            else:
+                row, column = get_syntax_sync().get_sync_start_position(document, i)
+
+            # Find generator close to this point, or otherwise create a new one.
+            generator = find_closest_generator(i)
+            if generator:
+                return generator
+            else:
+                generator = create_line_generator(row, column)
+
+            # If the column is not 0, ignore the first line. (Which is
+            # incomplete. This happens when the synchronization algorithm tells
+            # us to start parsing in the middle of a line.)
+            if column:
+                next(generator)
+                row += 1
+
+            line_generators[generator] = row
+            return generator
+
+        def get_line(i: int) -> StyleAndTextTuples:
+            "Return the tokens for a given line number."
+            try:
+                return cache[i]
+            except KeyError:
+                generator = get_generator(i)
+
+                # Exhaust the generator, until we find the requested line.
+                for num, line in generator:
+                    cache[num] = line
+                    if num == i:
+                        line_generators[generator] = i
+
+                        # Remove the next item from the cache.
+                        # (It could happen that it's already there, because of
+                        # another generator that started filling these lines,
+                        # but we want to synchronize these lines with the
+                        # current lexer's state.)
+                        if num + 1 in cache:
+                            del cache[num + 1]
+
+                        return cache[num]
+            return []
+
+        return get_line