Diffstat (limited to 'third_party/rust/jsparagus/jsparagus/lexer.py')
-rw-r--r--  third_party/rust/jsparagus/jsparagus/lexer.py  219
1 file changed, 219 insertions(+), 0 deletions(-)
diff --git a/third_party/rust/jsparagus/jsparagus/lexer.py b/third_party/rust/jsparagus/jsparagus/lexer.py
new file mode 100644
index 0000000000..865068e9c7
--- /dev/null
+++ b/third_party/rust/jsparagus/jsparagus/lexer.py
@@ -0,0 +1,219 @@
+""" Lexical analysis is the breaking of a string into tokens. """
+
+import re
+import linecache
+from builtins import SyntaxError as BaseSyntaxError
+
+
+class SyntaxError(BaseSyntaxError):
+    pass
+
+
+class UnexpectedEndError(SyntaxError):
+    pass
+
+
+class LexicalGrammar:
+ """Quick and dirty lexer implementation.
+
+ In order to support multi-part lexing (multiple calls to .write()),
+ both 1. the `ignore` regular expression; and 2. the union of the family of
+ regular expressions given by `tokens` and `regexp`; must have have the
+ following property: if they match a string s, they also match every prefix
+ of that string.
+
+ This requirement is not enforced by assertions; if it's not met, the
+ tokenizer will just have bugs when sent multiple chunks of data.
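+
+    For example (illustrative), a regexp like r'[0-9]+' qualifies: every
+    non-empty prefix of a match is itself a match. A regexp that only
+    matches a complete block comment ("/* ... */") does not: it matches
+    "/* x */" but not the prefix "/* x", so a comment split across two
+    .write() calls would be mis-lexed.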
+ """
+    def __init__(self, tokens, ignore=r'[ \t]*', **regexps):
+        def token_to_re(token):
+            s = re.escape(token)
+            if s.isalpha():
+                s += r'\b'
+            return s
+
+        token_list = sorted(tokens.split(), key=len, reverse=True)
+        self.ignore_re = re.compile(ignore)
+        self.token_re = re.compile(
+            "|".join(token_to_re(token) for token in token_list))
+        self.parser_pairs = [(k, re.compile(v)) for k, v in regexps.items()]
+
+    def __call__(self, parser, filename=None):
+        return Tokenizer(self, parser, filename)
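+
+# A minimal usage sketch (illustrative only: this toy token set and the
+# `MyParser` class are hypothetical, not part of jsparagus):
+#
+#     arith = LexicalGrammar("+ - * ( )", NUM=r'[0-9]+', VAR=r'[A-Za-z]+')
+#     tokenizer = arith(MyParser(), filename="example.txt")
+#     tokenizer.write("2 + 2")
+#     result = tokenizer.close()
+#
+# Note that the trailing "2" is only emitted during close(), since until then
+# more digits could still arrive and extend the token.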
+
+
+class FlatStringLexer:
+    def __init__(self, parser, filename=None):
+        self.parser = parser
+        self.src = ''
+        self.previous_token_end = 0
+        self.current_token_start = 0
+        self.start_lineno = 1
+        self.start_column = 0
+        self.point = 0
+        self.filename = filename
+        self.closed = False
+
+    def write(self, text):
+        assert not self.closed
+        self.src += text
+        self._drain()
+
+    def close(self):
+        assert not self.closed
+        self.closed = True
+        self._drain()
+        assert self.src == ''
+        return self.parser.close(self)
+
+    def _drain(self):
+        assert self.previous_token_end == 0
+        assert self.current_token_start == 0
+        assert self.point == 0
+        closing = self.closed
+
+        terminal_id = self._match(closing)
+        while terminal_id is not None:
+            self.parser.write_terminal(self, terminal_id)
+            terminal_id = self._match(closing)
+
+        # Update position info.
+        discarded_text = self.src[:self.point]
+        newline_count = discarded_text.count('\n')
+        self.start_lineno += newline_count
+        if newline_count > 0:
+            self.start_column = self.point - discarded_text.rindex('\n') - 1
+        else:
+            self.start_column += self.point
+
+        # Drop the parsed text and reset counters. Note that setting
+        # self.previous_token_end to 0 really is correct. Setting
+        # self.current_token_start to 0 is as good as anything else, because
+        # there is no current token.
+        self.src = self.src[self.point:]
+        self.point = 0
+        self.previous_token_end = 0
+        self.current_token_start = 0
+
+    def current_token_position(self):
+        src_pre = self.src[:self.current_token_start]
+        lineno = self.start_lineno + src_pre.count("\n")
+        if '\n' in src_pre:
+            line_start_index = src_pre.rfind("\n") + 1
+            column = self.current_token_start - line_start_index  # can be zero
+        else:
+            column = self.start_column + self.current_token_start
+        return lineno, column
+
+    def current_line(self):
+        # OK, this is gruesome, but we return the current line if we have the
+        # whole thing and otherwise we ... try loading it from disk.
+        if '\n' in self.src[:self.current_token_start]:
+            line_start = self.src.rindex('\n', 0, self.current_token_start) + 1
+        elif self.start_column == 0:
+            line_start = 0
+        else:
+            line_start = -1
+
+        if line_start != -1:
+            line_end = self.src.find('\n', line_start)
+            if line_end == -1:
+                if self.closed:
+                    return self.src[line_start:] + '\n'
+            else:
+                return self.src[line_start:line_end] + '\n'
+
+        # Fallback case. Python's linecache.getline() deliberately silences
+        # all errors.
+        lineno = self.current_token_position()[0]
+        return linecache.getline(self.filename, lineno)
+
+    def throw(self, msg_or_exception):
+        lineno, column = self.current_token_position()
+        if isinstance(msg_or_exception, Exception):
+            e = msg_or_exception
+            e.filename = self.filename
+            e.lineno = lineno
+            e.offset = column + 1
+        else:
+            # Apparently this is the secret handshake to create a Python
+            # SyntaxError and get a good error message when Python prints it:
+            # pass a (filename, lineno, offset, text) tuple as the second
+            # argument.
+            line = self.current_line()
+            args = (self.filename, lineno, column + 1, line)
+            e = SyntaxError(msg_or_exception, args)
+        raise e
+
+    def throw_unexpected_end(self):
+        self.throw(UnexpectedEndError("unexpected end of input"))
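+
+# Note: the `parser` object can be anything with the two methods this class
+# calls: write_terminal(lexer, terminal_id) and close(lexer). A throwaway
+# stub for experimenting (hypothetical, not part of jsparagus) might be:
+#
+#     class EchoParser:
+#         def write_terminal(self, lexer, terminal_id):
+#             print(terminal_id, repr(lexer.take()))
+#
+#         def close(self, lexer):
+#             return "done"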
+
+
+class Tokenizer(FlatStringLexer):
+    def __init__(self, lexical_grammar, parser, filename=None):
+        super().__init__(parser, filename)
+        self.ignore_re = lexical_grammar.ignore_re
+        self.token_re = lexical_grammar.token_re
+        self.parser_pairs = lexical_grammar.parser_pairs
+        self.src = ''
+        self.filename = filename
+        self.last_point = 0
+        self.point = 0
+        self._current_match = None
+
+    def take(self):
+        return self._current_match.group()
+
+    def saw_line_terminator(self):
+        """True if there's a LineTerminator before the current token."""
+        i = self.previous_token_end
+        j = self.current_token_start
+        ws_between = self.src[i:j]
+        return any(c in ws_between for c in '\r\n\u2028\u2029')
+
+    def _match(self, closing):
+        # Advance over text matching ignore_re.
+        ignore_match = self.ignore_re.match(self.src, self.point)
+        if ignore_match is None:
+            raise ValueError("ignore_re should always match")
+        point = ignore_match.end()
+        if point == len(self.src):
+            if closing:
+                self.point = point
+            self._current_match = None
+            return None
+
+        # Try the token_re.
+        token_match = self.token_re.match(self.src, point)
+
+        # Try all the parser_pairs.
+        for name, pattern in self.parser_pairs:
+            match = pattern.match(self.src, point)
+            if match is not None:
+                break
+        else:
+            name = match = None
+
+        if (match is not None and token_match is not None
+                and match.end() > token_match.end()):
+            pass
+        elif token_match is not None:
+            name, match = token_match.group(0), token_match
+        elif match is not None:
+            pass
+        else:
+            self.throw("unexpected characters {!r}"
+                       .format(self.src[point:point + 12]))
+
+        # But how do we know subsequent .write() calls won't provide more
+        # text, extending this token? Here we take advantage of the odd
+        # requirement LexicalGrammar imposes on its users. Every prefix of a
+        # match is a match. So if this hypothetical "extended" token would
+        # match, then the entire remainder of self.src is a match.
+        if not closing and match.end() == len(self.src):
+            # This token might be extensible. Refuse to match.
+            self._current_match = None
+            return None
+
+        # This token definitely is not extensible.
+        self.previous_token_end = self.point
+        self.current_token_start = match.start()
+        self.point = match.end()
+        self._current_match = match
+        return name
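+
+# Chunked input sketch (illustrative, using the EchoParser stub sketched
+# above): a match that ends flush against the end of the buffer might still
+# be extended by a later write(), so _match() withholds it until more text
+# arrives or the tokenizer is closed.
+#
+#     t = LexicalGrammar("+", NUM=r'[0-9]+')(EchoParser())
+#     t.write("12")     # emits nothing: "12" could still grow into "123..."
+#     t.write("3 + 4")  # emits NUM "123" and "+"; "4" is withheld
+#     t.close()         # emits NUM "4", then returns EchoParser.close()'s result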