Diffstat (limited to 'third_party/rust/jsparagus/jsparagus/lexer.py')
-rw-r--r-- | third_party/rust/jsparagus/jsparagus/lexer.py | 219 |
1 files changed, 219 insertions, 0 deletions
diff --git a/third_party/rust/jsparagus/jsparagus/lexer.py b/third_party/rust/jsparagus/jsparagus/lexer.py
new file mode 100644
index 0000000000..865068e9c7
--- /dev/null
+++ b/third_party/rust/jsparagus/jsparagus/lexer.py
@@ -0,0 +1,219 @@
+""" Lexical analysis is the breaking of a string into tokens. """
+
+import re
+import linecache
+from builtins import SyntaxError as BaseSyntaxError
+
+
+class SyntaxError(BaseSyntaxError):
+    pass
+
+
+class UnexpectedEndError(SyntaxError):
+    pass
+
+
+class LexicalGrammar:
+    """Quick and dirty lexer implementation.
+
+    In order to support multi-part lexing (multiple calls to .write()),
+    both 1. the `ignore` regular expression; and 2. the union of the family of
+    regular expressions given by `tokens` and `regexps`; must have the
+    following property: if they match a string s, they also match every prefix
+    of that string.
+
+    This requirement is not enforced by assertions; if it's not met, the
+    tokenizer will just have bugs when sent multiple chunks of data.
+    """
+    def __init__(self, tokens, ignore=r'[ \t]*', **regexps):
+        def token_to_re(token):
+            s = re.escape(token)
+            if s.isalpha():
+                s += r'\b'
+            return s
+
+        token_list = sorted(tokens.split(), key=len, reverse=True)
+        self.ignore_re = re.compile(ignore)
+        self.token_re = re.compile("|".join(token_to_re(token) for token in token_list))
+        self.parser_pairs = [(k, re.compile(v)) for k, v in regexps.items()]
+
+    def __call__(self, parser, filename=None):
+        return Tokenizer(self, parser, filename)
+
+
+class FlatStringLexer:
+    def __init__(self, parser, filename=None):
+        self.parser = parser
+        self.src = ''
+        self.previous_token_end = 0
+        self.current_token_start = 0
+        self.start_lineno = 1
+        self.start_column = 0
+        self.point = 0
+        self.filename = filename
+        self.closed = False
+
+    def write(self, text):
+        assert not self.closed
+        self.src += text
+        self._drain()
+
+    def close(self):
+        assert not self.closed
+        self.closed = True
+        self._drain()
+        assert self.src == ''
+        return self.parser.close(self)
+
+    def _drain(self):
+        assert self.previous_token_end == 0
+        assert self.current_token_start == 0
+        assert self.point == 0
+        closing = self.closed
+
+        terminal_id = self._match(closing)
+        while terminal_id is not None:
+            self.parser.write_terminal(self, terminal_id)
+            terminal_id = self._match(closing)
+
+        # Update position info.
+        discarded_text = self.src[:self.point]
+        newline_count = self.src[:self.point].count('\n')
+        self.start_lineno += newline_count
+        if newline_count > 0:
+            self.start_column = self.point - discarded_text.rindex('\n')
+        else:
+            self.start_column += self.point
+
+        # Drop the parsed text and reset counters. Note that setting
+        # self.previous_token_end to 0 really is correct. Setting
+        # self.current_token_start to 0 is as good as anything else, because
+        # there is no current token.
+        self.src = self.src[self.point:]
+        self.point = 0
+        self.previous_token_end = 0
+        self.current_token_start = 0
+
+    def current_token_position(self):
+        src_pre = self.src[:self.current_token_start]
+        lineno = self.start_lineno + src_pre.count("\n")
+        if '\n' in src_pre:
+            line_start_index = src_pre.rfind("\n") + 1
+            column = self.current_token_start - line_start_index  # can be zero
+        else:
+            column = self.start_column + self.current_token_start
+        return lineno, column
+
+    def current_line(self):
+        # OK, this is gruesome, but we return the current line if we have the
+        # whole thing and otherwise we ... try loading it from disk.
+        if '\n' in self.src[:self.current_token_start]:
+            line_start = self.src.rindex('\n', 0, self.current_token_start) + 1
+        elif self.start_column == 0:
+            line_start = 0
+        else:
+            line_start = -1
+
+        if line_start != -1:
+            line_end = self.src.find('\n', line_start)
+            if line_end == -1:
+                if self.closed:
+                    return self.src[line_start:] + '\n'
+            else:
+                return self.src[line_start:line_end] + '\n'
+
+        # Fallback case. Python's linecache.getline() deliberately silences all
+        # errors.
+        lineno = self.current_token_position()[0]
+        return linecache.getline(self.filename, lineno)
+
+    def throw(self, msg_or_exception):
+        lineno, column = self.current_token_position()
+        if isinstance(msg_or_exception, Exception):
+            e = msg_or_exception
+            e.filename = self.filename
+            e.lineno = lineno
+            e.offset = column + 1
+        else:
+            # Apparently this is the secret handshake to create a Python
+            # SyntaxError and get a good error message when Python prints it.
+            line = self.current_line()
+            args = (self.filename, lineno, column + 1, line)
+            e = SyntaxError(msg_or_exception, args)
+        raise e
+
+    def throw_unexpected_end(self):
+        self.throw(UnexpectedEndError("unexpected end of input"))
+
+
+class Tokenizer(FlatStringLexer):
+    def __init__(self, lexical_grammar, parser, filename=None):
+        super().__init__(parser, filename)
+        self.ignore_re = lexical_grammar.ignore_re
+        self.token_re = lexical_grammar.token_re
+        self.parser_pairs = lexical_grammar.parser_pairs
+        self.src = ''
+        self.filename = filename
+        self.last_point = 0
+        self.point = 0
+        self._current_match = None
+
+    def take(self):
+        return self._current_match.group()
+
+    def saw_line_terminator(self):
+        """True if there's a LineTerminator before the current token."""
+        i = self.previous_token_end
+        j = self.current_token_start
+        ws_between = self.src[i:j]
+        return any(c in ws_between for c in '\r\n\u2028\u2029')
+
+    def _match(self, closing):
+        # Advance over text matching ignore_re.
+        ignore_match = self.ignore_re.match(self.src, self.point)
+        if ignore_match is None:
+            raise ValueError("ignore_re should always match")
+        point = ignore_match.end()
+        if point == len(self.src):
+            if closing:
+                self.point = point
+            self._current_match = None
+            return None
+
+        # Try the token_re.
+        token_match = self.token_re.match(self.src, point)
+
+        # Try all the parser_pairs.
+        for name, pattern in self.parser_pairs:
+            match = pattern.match(self.src, point)
+            if match is not None:
+                break
+        else:
+            name = match = None
+
+        if match is not None and token_match is not None and match.end() > token_match.end():
+            pass
+        elif token_match is not None:
+            name, match = token_match.group(0), token_match
+        elif match is not None:
+            pass
+        else:
+            self.throw("unexpected characters {!r}"
+                       .format(self.src[point:point + 12]))

+        # But how do we know subsequent .write() calls won't provide more text,
+        # extending this token? Here we take advantage of the odd requirement
+        # LexicalGrammar imposes on its users. Every prefix of a match is a
+        # match. So if this hypothetical "extended" token would match, then the
+        # entire remainder of self.src is a match.
+        if not closing and match.end() == len(self.src):
+            # This token might be extensible. Refuse to match.
+            self._current_match = None
+            return None
+
+        # This token definitely is not extensible.
+        self.previous_token_end = self.point
+        self.current_token_start = match.start()
+        self.point = match.end()
+        self._current_match = match
+        return name
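
The classes in this file are meant to be driven together: a LexicalGrammar is built once per language, and calling it produces a Tokenizer bound to a parser-like object that receives terminals as they are recognized. The following is a minimal sketch, not part of the patch: CalcSink, the token set, and the `jsparagus.lexer` import path are assumptions for illustration. The only interface the sketch relies on is the write_terminal(lexer, terminal_id) / close(lexer) protocol and Tokenizer.take() that appear in the diff above.

    # Hypothetical example, not part of the patch.
    from jsparagus.lexer import LexicalGrammar  # assumed import path for the package

    class CalcSink:
        """Stands in for the parser: records (terminal_id, text) pairs."""
        def __init__(self):
            self.terminals = []

        def write_terminal(self, lexer, terminal_id):
            # lexer.take() returns the text of the token just matched.
            self.terminals.append((terminal_id, lexer.take()))

        def close(self, lexer):
            return self.terminals

    # Literal tokens go in the space-separated `tokens` string; keyword
    # arguments name regexp terminals. NUM and VAR satisfy the prefix
    # property the LexicalGrammar docstring demands: every nonempty prefix
    # of a match is itself a match.
    calc = LexicalGrammar("+ - * / ( )", NUM=r'[0-9]+', VAR=r'[A-Za-z]+')

    lexer = calc(CalcSink(), filename="<calc>")
    lexer.write("12 + 3")   # "3" touches the end of the chunk, so it is held back
    lexer.write("4 * x")    # ...and it extends into the single NUM token "34"
    print(lexer.close())
    # [('NUM', '12'), ('+', '+'), ('NUM', '34'), ('*', '*'), ('VAR', 'x')]

The second write() call illustrates why _match() refuses any token that ends exactly at the end of the buffer: the "3" from the first chunk and the "4" from the second are emitted as one NUM terminal, and the trailing "x" is only released when close() signals that no more input is coming.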