""" Lexical analysis is the breaking of a string into tokens. """

import re
import linecache
from builtins import SyntaxError as BaseSyntaxError


class SyntaxError(BaseSyntaxError):
    pass


class UnexpectedEndError(SyntaxError):
    pass


class LexicalGrammar:
    """Quick and dirty lexer implementation.

    In order to support multi-part lexing (multiple calls to .write()),
    both 1. the `ignore` regular expression; and 2. the union of the family of
    regular expressions given by `tokens` and `regexps`; must have the
    following property: if they match a string s, they also match every prefix
    of that string.

    This requirement is not enforced by assertions; if it's not met, the
    tokenizer will just have bugs when sent multiple chunks of data.
    """
    def __init__(self, tokens, ignore=r'[ \t]*', **regexps):
        def token_to_re(token):
            s = re.escape(token)
            if s.isalpha():
                s += r'\b'
            return s

        token_list = sorted(tokens.split(), key=len, reverse=True)
        self.ignore_re = re.compile(ignore)
        self.token_re = re.compile("|".join(token_to_re(token) for token in token_list))
        self.parser_pairs = [(k, re.compile(v)) for k, v in regexps.items()]

    def __call__(self, parser, filename=None):
        return Tokenizer(self, parser, filename)
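
# For example, a token class like NUM=r'[0-9]+' satisfies the prefix condition
# above: every nonempty prefix of a digit run is still a digit run, so a NUM
# that reaches the end of a chunk can safely be held back until more input (or
# close()) arrives. A pattern like COMMENT=r'/\*.*?\*/' would not: a chunk
# ending in "/* partial" is a prefix of a full match but is not itself a
# match, so a comment split across .write() calls can be mis-lexed or
# rejected. (NUM and COMMENT are illustrative names only; a runnable sketch
# appears at the end of this module.)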


class FlatStringLexer:
    def __init__(self, parser, filename=None):
        self.parser = parser
        self.src = ''
        self.previous_token_end = 0
        self.current_token_start = 0
        self.start_lineno = 1
        self.start_column = 0
        self.point = 0
        self.filename = filename
        self.closed = False

    def write(self, text):
        assert not self.closed
        self.src += text
        self._drain()

    def close(self):
        assert not self.closed
        self.closed = True
        self._drain()
        assert self.src == ''
        return self.parser.close(self)

    def _drain(self):
        assert self.previous_token_end == 0
        assert self.current_token_start == 0
        assert self.point == 0
        closing = self.closed

        terminal_id = self._match(closing)
        while terminal_id is not None:
            self.parser.write_terminal(self, terminal_id)
            terminal_id = self._match(closing)

        # Update position info.
        discarded_text = self.src[:self.point]
        newline_count = discarded_text.count('\n')
        self.start_lineno += newline_count
        if newline_count > 0:
            # Column of the first character kept in self.src: 0-based, one
            # past the last discarded newline (same convention as
            # current_token_position).
            self.start_column = self.point - (discarded_text.rindex('\n') + 1)
        else:
            self.start_column += self.point

        # Drop the parsed text and reset counters. Note that setting
        # self.previous_token_end to 0 really is correct. Setting
        # self.current_token_start to 0 is as good as anything else, because
        # there is no current token.
        self.src = self.src[self.point:]
        self.point = 0
        self.previous_token_end = 0
        self.current_token_start = 0

    def current_token_position(self):
        src_pre = self.src[:self.current_token_start]
        lineno = self.start_lineno + src_pre.count("\n")
        if '\n' in src_pre:
            line_start_index = src_pre.rfind("\n") + 1
            column = self.current_token_start - line_start_index  # can be zero
        else:
            column = self.start_column + self.current_token_start
        return lineno, column

    def current_line(self):
        # OK, this is gruesome, but we return the current line if we have the
        # whole thing and otherwise we ... try loading it from disk.
        if '\n' in self.src[:self.current_token_start]:
            line_start = self.src.rindex('\n', 0, self.current_token_start) + 1
        elif self.start_column == 0:
            line_start = 0
        else:
            line_start = -1

        if line_start != -1:
            line_end = self.src.find('\n', line_start)
            if line_end == -1:
                if self.closed:
                    return self.src[line_start:] + '\n'
            else:
                return self.src[line_start:line_end] + '\n'

        # Fallback case. Python's linecache.getline() deliberately silences all
        # errors.
        lineno = self.current_token_position()[0]
        return linecache.getline(self.filename, lineno)

    def throw(self, msg_or_exception):
        lineno, column = self.current_token_position()
        if isinstance(msg_or_exception, Exception):
            e = msg_or_exception
            e.filename = self.filename
            e.lineno = lineno
            e.offset = column + 1
        else:
            # Apparently this is the secret handshake to create a Python
            # SyntaxError and get a good error message when Python prints it.
            line = self.current_line()
            args = (self.filename, lineno, column + 1, line)
            e = SyntaxError(msg_or_exception, args)
        raise e

    def throw_unexpected_end(self):
        self.throw(UnexpectedEndError("unexpected end of input"))


class Tokenizer(FlatStringLexer):
    def __init__(self, lexical_grammar, parser, filename=None):
        super().__init__(parser, filename)
        self.ignore_re = lexical_grammar.ignore_re
        self.token_re = lexical_grammar.token_re
        self.parser_pairs = lexical_grammar.parser_pairs
        self.src = ''
        self.filename = filename
        self.last_point = 0
        self.point = 0
        self._current_match = None

    def take(self):
        return self._current_match.group()

    def saw_line_terminator(self):
        """True if there's a LineTerminator before the current token."""
        i = self.previous_token_end
        j = self.current_token_start
        ws_between = self.src[i:j]
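        # \r, \n, U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR are the
        # ECMAScript LineTerminator code points.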
        return any(c in ws_between for c in '\r\n\u2028\u2029')

    def _match(self, closing):
        # Advance over text matching ignore_re.
        ignore_match = self.ignore_re.match(self.src, self.point)
        if ignore_match is None:
            raise ValueError("ignore_re should always match")
        point = ignore_match.end()
        if point == len(self.src):
            if closing:
                self.point = point
            self._current_match = None
            return None

        # Try the token_re.
        token_match = self.token_re.match(self.src, point)

        # Try all the parser_pairs.
        for name, pattern in self.parser_pairs:
            match = pattern.match(self.src, point)
            if match is not None:
                break
        else:
            name = match = None

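        # Prefer the longer match: the parser_pairs match wins only when it is
        # strictly longer than the token_re match. On a tie, or when only
        # token_re matched, the terminal id is the matched text itself (a
        # keyword or punctuator). If neither matched, the input is erroneous.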
        if match is not None and token_match is not None and match.end() > token_match.end():
            pass
        elif token_match is not None:
            name, match = token_match.group(0), token_match
        elif match is not None:
            pass
        else:
            self.throw("unexpected characters {!r}"
                       .format(self.src[point:point + 12]))

        # But how do we know subsequent .write() calls won't provide more text,
        # extending this token? Here we take advantage of the odd requirement
        # LexicalGrammar imposes on its users. Every prefix of a match is a
        # match. So if this hypothetical "extended" token would match, then the
        # entire remainder of self.src is a match.
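        # For example, if the text written so far ends with "12" and digit
        # runs form a token, the match reaching the end of self.src is refused
        # here; a later write() might extend the token to "123".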
        if not closing and match.end() == len(self.src):
            # This token might be extensible. Refuse to match.
            self._current_match = None
            return None

        # This token definitely is not extensible.
        self.previous_token_end = self.point
        self.current_token_start = match.start()
        self.point = match.end()
        self._current_match = match
        return name
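

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; `TokenCollector` is a hypothetical
# stand-in for a real parser). The only interface the lexer relies on is
# parser.write_terminal(lexer, terminal_id) and parser.close(lexer), as called
# from FlatStringLexer above. Run this file directly to see the token stream.
if __name__ == '__main__':
    class TokenCollector:
        """Collect (terminal_id, text) pairs instead of parsing them."""
        def __init__(self):
            self.tokens = []

        def write_terminal(self, lexer, terminal_id):
            # For fixed tokens, terminal_id is the token text itself; for
            # keyword-argument regexps, it is the regexp's name (e.g. 'NUM').
            self.tokens.append((terminal_id, lexer.take()))

        def close(self, lexer):
            return self.tokens

    arith = LexicalGrammar("+ - * / ( )",
                           NUM=r'[0-9]+',
                           VAR=r'[A-Za-z_][A-Za-z_0-9]*')
    tokenizer = arith(TokenCollector())
    tokenizer.write("2 + al")   # "al" is held back: later input may extend it
    tokenizer.write("pha * 3")  # ...and here it becomes the VAR token "alpha"
    print(tokenizer.close())
    # [('NUM', '2'), ('+', '+'), ('VAR', 'alpha'), ('*', '*'), ('NUM', '3')]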