Diffstat (limited to 'third_party/rust/jsparagus/js_parser/lexer.py')
-rw-r--r-- | third_party/rust/jsparagus/js_parser/lexer.py | 315
1 files changed, 315 insertions, 0 deletions
diff --git a/third_party/rust/jsparagus/js_parser/lexer.py b/third_party/rust/jsparagus/js_parser/lexer.py
new file mode 100644
index 0000000000..2d8ed530ed
--- /dev/null
+++ b/third_party/rust/jsparagus/js_parser/lexer.py
@@ -0,0 +1,315 @@
+"""Vague approximation of an ECMAScript lexer.
+
+A parser has two levels: the *lexer* scans bytes to produce tokens. The
+*parser* consumes tokens and produces ASTs.
+
+In a traditional design, the parser drives the process. It *pulls* one token at
+a time from the lexer. However, for a parser that can accept arbitrary slabs of
+data, scan them, then keep going, it makes more sense for the user to feed
+those slabs to the lexer, which then *pushes* tokens to the parser. So that's
+what we do.
+
+Usage:
+
+    from js_parser.lexer import JSLexer
+    from js_parser.parser import JSParser
+
+    lexer = JSLexer(JSParser())
+    lexer.write(some_source_text)
+    lexer.write(some_more_source_text)
+    ast = lexer.close()
+"""
+
+import re
+import jsparagus.lexer
+
+
+def _get_punctuators():
+    punctuators = '''
+        &&= ||= ??=
+        { ( ) [ ] . ... ; , < > <= >= == != === !== + - * % ** ++ --
+        << >> >>> & | ^ ! ~ && || ? : = += -= *= %=
+        **= <<= >>= >>>= &= |= ^= =>
+    '''.split()
+
+    return '|'.join(
+        re.escape(token)
+        for token in sorted(punctuators, key=len, reverse=True))
+
+
+TOKEN_RE = re.compile(r'''(?x)
+  (?:
+      # WhiteSpace
+      [\ \t\v\r\n\u00a0\u2028\u2029\ufeff]
+      # SingleLineComment
+    | // [^\r\n\u2028\u2029]* (?= [\r\n\u2028\u2029] | \Z )
+      # MultiLineComment
+    | /\* (?: [^*] | \*+[^/] )* \*+/
+  )*
+  (
+      # Incomplete MultiLineComment
+      /\* (?: [^*] | \*+[^/] )* \**
+    | # Incomplete SingleLineComment
+      // [^\r\n\u2028\u2029]*
+    | # IdentifierName
+      (?: [$_A-Za-z] | \\ u [0-9A-Fa-f]{4} | \\ u \{ [0-9A-Fa-f]+ \})
+      (?: [$_0-9A-Za-z] | \\ u [0-9A-Fa-f]{4} | \\ u \{ [0-9A-Fa-f]+ \})*
+    | # NumericLiteral
+      [0-9][0-9A-Za-z]*(?:\.[0-9A-Za-z]*)?
+    | \.[0-9][0-9A-Za-z]*
+    | # Punctuator
+      <INSERT_PUNCTUATORS>
+    | # The slash special case
+      /
+    | # The curly brace special case
+      }
+    | # StringLiteral
+      '
+        # SingleStringCharacters
+        (?:
+            # SourceCharacter but not one of ' or \\ or LineTerminator
+            # but also allow LINE SEPARATOR or PARAGRAPH SEPARATOR
+            [^'\\\r\n]
+          | \\ [^0-9xu\r\n\u2028\u2029]  # CharacterEscapeSequence
+          | \\ x [0-9A-Fa-f]{2}          # HexEscapeSequence
+          | \\ u [0-9A-Fa-f]{4}          # UnicodeEscapeSequence
+          | \\ u \{ [0-9A-Fa-f]+ \}
+          | \\\r\n?                      # LineContinuation
+          | \\[\n\u2028\u2029]
+        )*
+      '
+    | "
+        # DoubleStringCharacters
+        (?:
+            # SourceCharacter but not one of " or \\ or LineTerminator
+            # but also allow LINE SEPARATOR or PARAGRAPH SEPARATOR
+            [^"\\\r\n]
+          | \\ [^0-9xu\r\n\u2028\u2029]  # CharacterEscapeSequence
+          | \\ x [0-9A-Fa-f]{2}          # HexEscapeSequence
+          | \\ u [0-9A-Fa-f]{4}          # UnicodeEscapeSequence
+          | \\ u \{ [0-9A-Fa-f]+ \}
+          | \\\r\n?                      # LineContinuation
+          | \\[\n\u2028\u2029]
+        )*
+      "
+    | # Template
+      ` (?: [^`\\$] | \\. )* (?: \${ | ` )
+    | # illegal character or end of input (this branch matches no characters)
+  )
+'''.replace("<INSERT_PUNCTUATORS>", _get_punctuators()))
+
+DIV_RE = re.compile(r'(/=?)')
+
+REGEXP_RE = re.compile(r'''(?x)
+(
+  /
+  (?:
+    # RegularExpressionFirstChar - implemented using
+    # RegularExpressionChars on the theory that we have already
+    # ruled out the possibility of a comment.
+    # RegularExpressionChars
+    (?:
+      # RegularExpressionNonTerminator but not one of \\ or / or [
+      [^/\\\[\r\n\u2028\u2029]
+    | # RegularExpressionBackslashSequence
+      \\ [^\r\n\u2028\u2029]
+    | # RegularExpressionClass
+      \[
+        # RegularExpressionClassChars
+        (?:
+          # RegularExpressionNonTerminator but not one of ] or \\
+          [^]\\\r\n\u2028\u2029]
+        | # RegularExpressionBackslashSequence
+          \\ [^\r\n\u2028\u2029]
+        )*
+      \]
+    )+
+  )
+  /
+  (?: \w* )
+)
+''')
+
+# Words that never match Identifier. (`await` and `yield` nonetheless
+# conditionally match IdentifierReference, BindingIdentifier, and
+# LabelIdentifier.)
+#
+# Technically the term for these is "reserved word", not "keyword", but
+# whatever.
+ECMASCRIPT_FULL_KEYWORDS = [
+    'await',
+    'break',
+    'case',
+    'catch',
+    'class',
+    'const',
+    'continue',
+    'debugger',
+    'default',
+    'delete',
+    'do',
+    'else',
+    'enum',
+    'export',
+    'extends',
+    'finally',
+    'for',
+    'function',
+    'if',
+    'import',
+    'in',
+    'instanceof',
+    'new',
+    'null',
+    'return',
+    'super',
+    'switch',
+    'this',
+    'throw',
+    'true',
+    'false',
+    'try',
+    'typeof',
+    'var',
+    'void',
+    'while',
+    'with',
+    'yield',
+]
+
+ECMASCRIPT_CONDITIONAL_KEYWORDS = [
+    # Words that are identifiers except in strict mode
+    'let',  # this one is also banned at the beginning of an ExpressionStatement
+    'static',
+    'implements',
+    'interface',
+    'package',
+    'private',
+    'protected',
+    'public',
+
+    # Words that are always allowed as identifiers, but are also keywords in
+    # other contexts.
+    'as',
+    'async',
+    'from',
+    'get',
+    'of',
+    'set',
+    'target',
+]
+
+# Technically this set includes a reserved word that isn't currently being used
+# as a keyword in the grammar: `enum`.
+ALL_KEYWORDS = set(ECMASCRIPT_FULL_KEYWORDS + ECMASCRIPT_CONDITIONAL_KEYWORDS)
+
+
+class JSLexer(jsparagus.lexer.FlatStringLexer):
+    """Vague approximation of an ECMAScript lexer. """
+    def __init__(self, parser, filename=None):
+        super().__init__(parser, filename)
+
+    def _match(self, closing):
+        match = TOKEN_RE.match(self.src, self.point)
+        assert match is not None
+
+        if match.end() == len(self.src) and not closing:
+            # The current token runs right up against the end of the current
+            # chunk of source and thus might continue in the next chunk. Do not
+            # move self.point.
+            return None
+
+        token = match.group(1)
+        if token == '':
+            # Whitespace followed by end of input or illegal character.
+            if match.end() == len(self.src):
+                # End of input. Success!
+                assert closing
+                self.point = match.end()
+                return None
+            else:
+                c = self.src[match.end()]
+                self.throw("unexpected character: {!r}".format(c))
+
+        c = token[0]
+        t = None
+        if c.isdigit() or c == '.' and token != '.':
+            t = 'NumericLiteral'
+        elif c.isalpha() or c in '$_':
+            if token in ALL_KEYWORDS:  # TODO support strict mode
+                if token == 'null':
+                    t = 'NullLiteral'
+                elif token in ('true', 'false'):
+                    t = 'BooleanLiteral'
+                else:
+                    t = token
+            else:
+                t = 'Name'
+        elif c == '/':
+            if token.startswith(('/*', '//')):
+                # Incomplete comment. (In non-closing mode, this is handled
+                # above, immediately after the match.)
+                assert match.end() == len(self.src)
+                assert closing
+                self.point = len(self.src)
+                self.throw("incomplete comment at end of source")
+
+            # We choose RegExp vs. division based on what the parser can
+            # accept, a literal implementation of the spec.
+            #
+            # To make this correct in combination with end-of-line ASI, make
+            # the parser rewind the lexer one token and ask for it again in
+            # that case, so that the lexer asks the can-accept question again.
+            point = match.start(1)
+            if self.parser.can_accept_terminal(self, 'RegularExpressionLiteral'):
+                match = REGEXP_RE.match(self.src, point)
+                if match is None:
+                    if closing:
+                        self.throw("unterminated regexp literal")
+                    else:
+                        return None
+                token = 'RegularExpressionLiteral'
+            else:
+                match = DIV_RE.match(self.src, point)
+                token = match.group(1)
+
+            if not closing and match.end() == len(self.src):
+                # At the end of a chunk, `/a*b/` could be the start of
+                # `/a*b/g`, and `/` could be the start of `/=`.
+                return None
+
+            t = token
+        elif c == '`':
+            if token.endswith('`'):
+                t = 'NoSubstitutionTemplate'
+            else:
+                t = 'TemplateHead'
+        elif c == '"' or c == "'":
+            t = 'StringLiteral'
+        elif c == '}':
+            # TODO: TemplateTail
+            t = token
+        elif c in '{()[];,~?:.<>=!+-*%&|^':
+            t = token
+        else:
+            assert False
+
+        self._current_match = match
+        self.previous_token_end = self.point
+        self.current_token_start = match.start(1)
+        self.point = match.end()
+        return t
+
+    def take(self):
+        return self._current_match.group(1)
+
+    def saw_line_terminator(self):
+        """True if there's a LineTerminator before the current token."""
+        i = self.previous_token_end
+        j = self.current_token_start
+        ws_between = self.src[i:j]
+        return any(c in ws_between for c in '\r\n\u2028\u2029')
+
+    def can_close(self):
+        match = TOKEN_RE.match(self.src)
+        return match.group(1) == '' and self.parser.can_close()
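Illustrative only, not part of the patch: the scanning itself is done by TOKEN_RE
above, which skips leading WhiteSpace and comments and captures the token text as
group 1; group 1 is empty at end of input or in front of an illegal character. A
minimal sketch of that scanning loop, assuming the vendored file is importable as
js_parser.lexer:

    from js_parser.lexer import TOKEN_RE

    src = "let x = 3 / y;  // division, not a RegExp"
    point = 0
    while True:
        m = TOKEN_RE.match(src, point)
        token = m.group(1)
        if token == '':
            break              # end of input (or an illegal character)
        print(repr(token))     # prints 'let', 'x', '=', '3', '/', 'y', ';'
        point = m.end()

JSLexer._match wraps this same step with the bookkeeping shown in the patch:
returning None when a token might continue into the next chunk, and consulting
parser.can_accept_terminal to choose between RegularExpressionLiteral and
division for the '/' special case.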