Diffstat (limited to 'third_party/rust/jsparagus/js_parser/lexer.py')
-rw-r--r-- | third_party/rust/jsparagus/js_parser/lexer.py | 315
1 files changed, 315 insertions, 0 deletions
diff --git a/third_party/rust/jsparagus/js_parser/lexer.py b/third_party/rust/jsparagus/js_parser/lexer.py
new file mode 100644
index 0000000000..2d8ed530ed
--- /dev/null
+++ b/third_party/rust/jsparagus/js_parser/lexer.py
@@ -0,0 +1,315 @@
+"""Vague approximation of an ECMAScript lexer.
+
+A parser has two levels: the *lexer* scans bytes to produce tokens. The
+*parser* consumes tokens and produces ASTs.
+
+In a traditional design, the parser drives the process. It *pulls* one token at
+a time from the lexer. However, for a parser that can accept arbitrary slabs of
+data, scan them, then keep going, it makes more sense for the user to feed
+those slabs to the lexer, which then *pushes* tokens to the parser. So that's
+what we do.
+
+Usage:
+
+    from js_parser.lexer import JSLexer
+    from js_parser.parser import JSParser
+
+    lexer = JSLexer(JSParser())
+    lexer.write(some_source_text)
+    lexer.write(some_more_source_text)
+    ast = lexer.close()
+"""
+
+import re
+import jsparagus.lexer
+
+
+def _get_punctuators():
+    punctuators = '''
+        &&= ||= ??=
+        { ( ) [ ] . ... ; , < > <= >= == != === !== + - * % ** ++ --
+        << >> >>> & | ^ ! ~ && || ? : = += -= *= %=
+        **= <<= >>= >>>= &= |= ^= =>
+    '''.split()
+
+    return '|'.join(
+        re.escape(token)
+        for token in sorted(punctuators, key=len, reverse=True))
+
+
+TOKEN_RE = re.compile(r'''(?x)
+  (?:
+      # WhiteSpace
+      [\ \t\v\r\n\u00a0\u2028\u2029\ufeff]
+      # SingleLineComment
+    | // [^\r\n\u2028\u2029]* (?= [\r\n\u2028\u2029] | \Z )
+      # MultiLineComment
+    | /\* (?: [^*] | \*+[^/] )* \*+/
+  )*
+  (
+      # Incomplete MultiLineComment
+      /\* (?: [^*] | \*+[^/] )* \**
+    | # Incomplete SingleLineComment
+      // [^\r\n\u2028\u2029]*
+    | # IdentifierName
+      (?: [$_A-Za-z] | \\ u [0-9A-Fa-f]{4} | \\ u \{ [0-9A-Fa-f]+ \})
+      (?: [$_0-9A-Za-z] | \\ u [0-9A-Fa-f]{4} | \\ u \{ [0-9A-Fa-f]+ \})*
+    | # NumericLiteral
+      [0-9][0-9A-Za-z]*(?:\.[0-9A-Za-z]*)?
+    | \.[0-9][0-9A-Za-z]*
+    | # Punctuator
+      <INSERT_PUNCTUATORS>
+    | # The slash special case
+      /
+    | # The curly brace special case
+      }
+    | # StringLiteral
+      '
+        # SingleStringCharacters
+        (?:
+            # SourceCharacter but not one of ' or \\ or LineTerminator
+            # but also allow LINE SEPARATOR or PARAGRAPH SEPARATOR
+            [^'\\\r\n]
+          | \\ [^0-9xu\r\n\u2028\u2029]  # CharacterEscapeSequence
+          | \\ x [0-9A-Fa-f]{2}          # HexEscapeSequence
+          | \\ u [0-9A-Fa-f]{4}          # UnicodeEscapeSequence
+          | \\ u \{ [0-9A-Fa-f]+ \}
+          | \\\r\n?                      # LineContinuation
+          | \\[\n\u2028\u2029]
+        )*
+      '
+    | "
+        # DoubleStringCharacters
+        (?:
+            # SourceCharacter but not one of " or \\ or LineTerminator
+            # but also allow LINE SEPARATOR or PARAGRAPH SEPARATOR
+            [^"\\\r\n]
+          | \\ [^0-9xu\r\n\u2028\u2029]  # CharacterEscapeSequence
+          | \\ x [0-9A-Fa-f]{2}          # HexEscapeSequence
+          | \\ u [0-9A-Fa-f]{4}          # UnicodeEscapeSequence
+          | \\ u \{ [0-9A-Fa-f]+ \}
+          | \\\r\n?                      # LineContinuation
+          | \\[\n\u2028\u2029]
+        )*
+      "
+    | # Template
+      ` (?: [^`\\$] | \\. )* (?: \${ | ` )
+    | # illegal character or end of input (this branch matches no characters)
+  )
+'''.replace("<INSERT_PUNCTUATORS>", _get_punctuators()))
+
+DIV_RE = re.compile(r'(/=?)')
+
+REGEXP_RE = re.compile(r'''(?x)
+(
+  /
+  (?:
+    # RegularExpressionFirstChar - implemented using
+    # RegularExpressionChars on the theory that we have already
+    # ruled out the possibility of a comment.
+    # RegularExpressionChars
+    (?:
+      # RegularExpressionNonTerminator but not one of \\ or / or [
+      [^/\\\[\r\n\u2028\u2029]
+    | # RegularExpressionBackslashSequence
+      \\ [^\r\n\u2028\u2029]
+    | # RegularExpressionClass
+      \[
+        # RegularExpressionClassChars
+        (?:
+          # RegularExpressionNonTerminator but not one of ] or \\
+          [^]\\\r\n\u2028\u2029]
+        | # RegularExpressionBackslashSequence
+          \\ [^\r\n\u2028\u2029]
+        )*
+      \]
+    )+
+  )
+  /
+  (?: \w* )
+)
+''')
+
+# Words that never match Identifier. (`await` and `yield` nonetheless
+# conditionally match IdentifierReference, BindingIdentifier, and
+# LabelIdentifier.)
+#
+# Technically the term for these is "reserved word", not "keyword", but
+# whatever.
+ECMASCRIPT_FULL_KEYWORDS = [
+    'await',
+    'break',
+    'case',
+    'catch',
+    'class',
+    'const',
+    'continue',
+    'debugger',
+    'default',
+    'delete',
+    'do',
+    'else',
+    'enum',
+    'export',
+    'extends',
+    'finally',
+    'for',
+    'function',
+    'if',
+    'import',
+    'in',
+    'instanceof',
+    'new',
+    'null',
+    'return',
+    'super',
+    'switch',
+    'this',
+    'throw',
+    'true',
+    'false',
+    'try',
+    'typeof',
+    'var',
+    'void',
+    'while',
+    'with',
+    'yield',
+]
+
+ECMASCRIPT_CONDITIONAL_KEYWORDS = [
+    # Words that are identifiers except in strict mode
+    'let',  # this one is also banned at the beginning of an ExpressionStatement
+    'static',
+    'implements',
+    'interface',
+    'package',
+    'private',
+    'protected',
+    'public',
+
+    # Words that are always allowed as identifiers, but are also keywords in
+    # other contexts.
+    'as',
+    'async',
+    'from',
+    'get',
+    'of',
+    'set',
+    'target',
+]
+
+# Technically this set includes a reserved word that isn't currently being used
+# as a keyword in the grammar: `enum`.
+ALL_KEYWORDS = set(ECMASCRIPT_FULL_KEYWORDS + ECMASCRIPT_CONDITIONAL_KEYWORDS)
+
+
+class JSLexer(jsparagus.lexer.FlatStringLexer):
+    """Vague approximation of an ECMAScript lexer. """
+    def __init__(self, parser, filename=None):
+        super().__init__(parser, filename)
+
+    def _match(self, closing):
+        match = TOKEN_RE.match(self.src, self.point)
+        assert match is not None
+
+        if match.end() == len(self.src) and not closing:
+            # The current token runs right up against the end of the current
+            # chunk of source and thus might continue in the next chunk. Do not
+            # move self.point.
+            return None
+
+        token = match.group(1)
+        if token == '':
+            # Whitespace followed by end of input or illegal character.
+            if match.end() == len(self.src):
+                # End of input. Success!
+                assert closing
+                self.point = match.end()
+                return None
+            else:
+                c = self.src[match.end()]
+                self.throw("unexpected character: {!r}".format(c))
+
+        c = token[0]
+        t = None
+        if c.isdigit() or c == '.' and token != '.':
+            t = 'NumericLiteral'
+        elif c.isalpha() or c in '$_':
+            if token in ALL_KEYWORDS:  # TODO support strict mode
+                if token == 'null':
+                    t = 'NullLiteral'
+                elif token in ('true', 'false'):
+                    t = 'BooleanLiteral'
+                else:
+                    t = token
+            else:
+                t = 'Name'
+        elif c == '/':
+            if token.startswith(('/*', '//')):
+                # Incomplete comment. (In non-closing mode, this is handled
+                # above, immediately after the match.)
+                assert match.end() == len(self.src)
+                assert closing
+                self.point = len(self.src)
+                self.throw("incomplete comment at end of source")
+
+            # We choose RegExp vs. division based on what the parser can
+            # accept, a literal implementation of the spec.
+            #
+            # To make this correct in combination with end-of-line ASI, make
+            # the parser rewind the lexer one token and ask for it again in
+            # that case, so that the lexer asks the can-accept question again.
+            point = match.start(1)
+            if self.parser.can_accept_terminal(self, 'RegularExpressionLiteral'):
+                match = REGEXP_RE.match(self.src, point)
+                if match is None:
+                    if closing:
+                        self.throw("unterminated regexp literal")
+                    else:
+                        return None
+                token = 'RegularExpressionLiteral'
+            else:
+                match = DIV_RE.match(self.src, point)
+                token = match.group(1)
+
+            if not closing and match.end() == len(self.src):
+                # At the end of a chunk, `/a*b/` could be the start of
+                # `/a*b/g`, and `/` could be the start of `/=`.
+                return None
+
+            t = token
+        elif c == '`':
+            if token.endswith('`'):
+                t = 'NoSubstitutionTemplate'
+            else:
+                t = 'TemplateHead'
+        elif c == '"' or c == "'":
+            t = 'StringLiteral'
+        elif c == '}':
+            # TODO: TemplateTail
+            t = token
+        elif c in '{()[];,~?:.<>=!+-*%&|^':
+            t = token
+        else:
+            assert False
+
+        self._current_match = match
+        self.previous_token_end = self.point
+        self.current_token_start = match.start(1)
+        self.point = match.end()
+        return t
+
+    def take(self):
+        return self._current_match.group(1)
+
+    def saw_line_terminator(self):
+        """True if there's a LineTerminator before the current token."""
+        i = self.previous_token_end
+        j = self.current_token_start
+        ws_between = self.src[i:j]
+        return any(c in ws_between for c in '\r\n\u2028\u2029')
+
+    def can_close(self):
+        match = TOKEN_RE.match(self.src)
+        return match.group(1) == '' and self.parser.can_close()
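Illustrative only, not part of the patch: the scanning itself is done by TOKEN_RE
above, which skips leading WhiteSpace and comments and captures the token text as
group 1; group 1 is empty at end of input or in front of an illegal character. A
minimal sketch of that scanning loop, assuming the vendored file is importable as
js_parser.lexer:

    from js_parser.lexer import TOKEN_RE

    src = "let x = 3 / y;  // division, not a RegExp"
    point = 0
    while True:
        m = TOKEN_RE.match(src, point)
        token = m.group(1)
        if token == '':
            break              # end of input (or an illegal character)
        print(repr(token))     # prints 'let', 'x', '=', '3', '/', 'y', ';'
        point = m.end()

JSLexer._match wraps this same step with the bookkeeping shown in the patch:
returning None when a token might continue into the next chunk, and consulting
parser.can_accept_terminal to choose between RegularExpressionLiteral and
division for the '/' special case.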