"""Vague approximation of an ECMAScript lexer. A parser has two levels: the *lexer* scans bytes to produce tokens. The *parser* consumes tokens and produces ASTs. In a traditional design, the parser drives the process. It *pulls* one token at a time from the lexer. However, for a parser that can accept arbitrary slabs of data, scan them, then keep going, it makes more sense for the user to feed those slabs to the lexer, which then *pushes* tokens to the parser. So that's what we do. Usage: from js_parser.lexer import JSLexer from js_parser.parser import JSParser lexer = JSLexer(JSParser()) lexer.write(some_source_text) lexer.write(some_more_source_text) ast = lexer.close() """ import re import jsparagus.lexer def _get_punctuators(): punctuators = ''' &&= ||= ??= { ( ) [ ] . ... ; , < > <= >= == != === !== + - * % ** ++ -- << >> >>> & | ^ ! ~ && || ? : = += -= *= %= **= ><<= >>= >>>= &= |= ^= => '''.split() return '|'.join( re.escape(token) for token in sorted(punctuators, key=len, reverse=True)) TOKEN_RE = re.compile(r'''(?x) (?: # WhiteSpace [\ \t\v\r\n\u00a0\u2028\u2029\ufeff] # SingleLineComment | // [^\r\n\u2028\u2029]* (?= [\r\n\u2028\u2029] | \Z ) # MultiLineComment | /\* (?: [^*] | \*+[^/] )* \*+/ )* ( # Incomplete MultiLineComment /\* (?: [^*] | \*+[^/] )* \** | # Incomplete SingleLineComment // [^\r\n\u2028\u2029]* | # IdentifierName (?: [$_A-Za-z] | \\ u [0-9A-Fa-f]{4} | \\ u \{ [0-9A-Fa-f]+ \}) (?: [$_0-9A-Za-z] | \\ u [0-9A-Fa-f]{4} | \\ u \{ [0-9A-Fa-f]+ \})* | # NumericLiteral [0-9][0-9A-Za-z]*(?:\.[0-9A-Za-z]*)? | \.[0-9][0-9A-Za-z]* | # Punctuator | # The slash special case / | # The curly brace special case } | # StringLiteral ' # SingleStringCharacters (?: # SourceCharacter but not one of ' or \\ or LineTerminator # but also allow LINE SEPARATOR or PARAGRAPH SEPARATOR [^'\\\r\n] | \\ [^0-9xu\r\n\u2028\u2029] # CharacterEscapeSequence | \\ x [0-9A-Fa-f]{2} # HexEscapeSequence | \\ u [0-9A-Fa-f]{4} # UnicodeEscapeSequence | \\ u \{ [0-9A-Fa-f]+ \} | \\\r\n? # LineContinuation | \\[\n\u2028\u2029] )* ' | " # DoubleStringCharacters (?: # SourceCharacter but not one of " or \\ or LineTerminator # but also allow LINE SEPARATOR or PARAGRAPH SEPARATOR [^"\\\r\n] | \\ [^0-9xu\r\n\u2028\u2029] # CharacterEscapeSequence | \\ x [0-9A-Fa-f]{2} # HexEscapeSequence | \\ u [0-9A-Fa-f]{4} # UnicodeEscapeSequence | \\ u \{ [0-9A-Fa-f]+ \} | \\\r\n? # LineContinuation | \\[\n\u2028\u2029] )* " | # Template ` (?: [^`\\$] | \\. )* (?: \${ | ` ) | # illegal character or end of input (this branch matches no characters) ) '''.replace("", _get_punctuators())) DIV_RE = re.compile(r'(/=?)') REGEXP_RE = re.compile(r'''(?x) ( / (?: # RegularExpressionFirstChar - implemented using # RegularExpressionChars on the theory that we have already # ruled out the possibility of a comment. # RegularExpressionChars (?: # RegularExpressionNonTerminator but not one of \\ or / or [ [^/\\\[\r\n\u2028\u2029] | # RegularExpressionBackslashSequence \\ [^\r\n\u2028\u2029] | # RegularExpressionClass \[ # RegularExpressionClassChars (?: # RegularExpressionNonTerminator but not one of ] or \\ [^]\\\r\n\u2028\u2029] | # RegularExpressionBackslashSequence \\ [^\r\n\u2028\u2029] )* \] )+ ) / (?: \w* ) ) ''') # Words that never match Identifier. (`await` and `yield` nonetheless # conditionally match IdentifierReference, BindingIdentifier, and # LabelIdentifier.) # # Technically the term for these is "reserved word", not "keyword", but # whatever. 

# Words that never match Identifier. (`await` and `yield` nonetheless
# conditionally match IdentifierReference, BindingIdentifier, and
# LabelIdentifier.)
#
# Technically the term for these is "reserved word", not "keyword", but
# whatever.
ECMASCRIPT_FULL_KEYWORDS = [
    'await', 'break', 'case', 'catch', 'class', 'const', 'continue',
    'debugger', 'default', 'delete', 'do', 'else', 'enum', 'export',
    'extends', 'finally', 'for', 'function', 'if', 'import', 'in',
    'instanceof', 'new', 'null', 'return', 'super', 'switch', 'this',
    'throw', 'true', 'false', 'try', 'typeof', 'var', 'void', 'while',
    'with', 'yield',
]

ECMASCRIPT_CONDITIONAL_KEYWORDS = [
    # Words that are identifiers except in strict mode
    'let',  # this one is also banned at the beginning of an ExpressionStatement
    'static',
    'implements',
    'interface',
    'package',
    'private',
    'protected',
    'public',

    # Words that are always allowed as identifiers, but are also keywords in
    # other contexts.
    'as',
    'async',
    'from',
    'get',
    'of',
    'set',
    'target',
]

# Technically this set includes a reserved word that isn't currently being
# used as a keyword in the grammar: `enum`.
ALL_KEYWORDS = set(ECMASCRIPT_FULL_KEYWORDS + ECMASCRIPT_CONDITIONAL_KEYWORDS)


class JSLexer(jsparagus.lexer.FlatStringLexer):
    """Vague approximation of an ECMAScript lexer."""

    def __init__(self, parser, filename=None):
        super().__init__(parser, filename)

    def _match(self, closing):
        match = TOKEN_RE.match(self.src, self.point)
        assert match is not None

        if match.end() == len(self.src) and not closing:
            # The current token runs right up against the end of the current
            # chunk of source and thus might continue in the next chunk.
            # Do not move self.point.
            return None

        token = match.group(1)
        if token == '':
            # Whitespace followed by end of input or illegal character.
            if match.end() == len(self.src):
                # End of input. Success!
                assert closing
                self.point = match.end()
                return None
            else:
                c = self.src[match.end()]
                self.throw("unexpected character: {!r}".format(c))

        c = token[0]
        t = None
        if c.isdigit() or c == '.' and token != '.':
            t = 'NumericLiteral'
        elif c.isalpha() or c in '$_':
            if token in ALL_KEYWORDS:  # TODO support strict mode
                if token == 'null':
                    t = 'NullLiteral'
                elif token in ('true', 'false'):
                    t = 'BooleanLiteral'
                else:
                    t = token
            else:
                t = 'Name'
        elif c == '/':
            if token.startswith(('/*', '//')):
                # Incomplete comment. (In non-closing mode, this is handled
                # above, immediately after the match.)
                assert match.end() == len(self.src)
                assert closing
                self.point = len(self.src)
                self.throw("incomplete comment at end of source")

            # We choose RegExp vs. division based on what the parser can
            # accept, a literal implementation of the spec.
            #
            # To make this correct in combination with end-of-line ASI, make
            # the parser rewind the lexer one token and ask for it again in
            # that case, so that the lexer asks the can-accept question again.
            point = match.start(1)
            if self.parser.can_accept_terminal(self, 'RegularExpressionLiteral'):
                match = REGEXP_RE.match(self.src, point)
                if match is None:
                    if closing:
                        self.throw("unterminated regexp literal")
                    else:
                        return None
                token = 'RegularExpressionLiteral'
            else:
                match = DIV_RE.match(self.src, point)
                token = match.group(1)
            if not closing and match.end() == len(self.src):
                # At the end of a chunk, `/a*b/` could be the start of
                # `/a*b/g`, and `/` could be the start of `/=`.
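                # For example, if one chunk of source ends with "x = a /" and
                # the next begins with "= 2;", the division token is really
                # `/=`; returning None here (without moving self.point) makes
                # the caller retry once more text has been written.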
                return None
            t = token
        elif c == '`':
            if token.endswith('`'):
                t = 'NoSubstitutionTemplate'
            else:
                t = 'TemplateHead'
        elif c == '"' or c == "'":
            t = 'StringLiteral'
        elif c == '}':
            # TODO: TemplateTail
            t = token
        elif c in '{()[];,~?:.<>=!+-*%&|^':
            t = token
        else:
            assert False

        self._current_match = match
        self.previous_token_end = self.point
        self.current_token_start = match.start(1)
        self.point = match.end()
        return t

    def take(self):
        return self._current_match.group(1)

    def saw_line_terminator(self):
        """True if there's a LineTerminator before the current token."""
        i = self.previous_token_end
        j = self.current_token_start
        ws_between = self.src[i:j]
        return any(c in ws_between for c in '\r\n\u2028\u2029')

    def can_close(self):
        match = TOKEN_RE.match(self.src)
        return match.group(1) == '' and self.parser.can_close()
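

# A minimal, illustrative driver, mirroring the usage shown in the module
# docstring. It is only a sketch for manual experimentation, not part of the
# library API; the source below is chosen so that one token (`answer`) spans
# two write() calls.
if __name__ == "__main__":
    from js_parser.parser import JSParser

    lexer = JSLexer(JSParser())
    lexer.write("var ans")          # ends mid-identifier; the lexer waits
    lexer.write("wer = 6 * 7;\n")   # now `answer` can be emitted
    ast = lexer.close()
    print(ast)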