summaryrefslogtreecommitdiffstats
path: root/third_party/python/esprima/esprima/scanner.py
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/python/esprima/esprima/scanner.py')
-rw-r--r--third_party/python/esprima/esprima/scanner.py1189
1 files changed, 1189 insertions, 0 deletions
diff --git a/third_party/python/esprima/esprima/scanner.py b/third_party/python/esprima/esprima/scanner.py
new file mode 100644
index 0000000000..53502a51d3
--- /dev/null
+++ b/third_party/python/esprima/esprima/scanner.py
@@ -0,0 +1,1189 @@
+# -*- coding: utf-8 -*-
+# Copyright JS Foundation and other contributors, https://js.foundation/
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from __future__ import absolute_import, unicode_literals
+
+import re
+
+from .objects import Object
+from .compat import xrange, unicode, uchr, uord
+from .character import Character, HEX_CONV, OCTAL_CONV
+from .messages import Messages
+from .token import Token
+
+
+def hexValue(ch):
+ return HEX_CONV[ch]
+
+
+def octalValue(ch):
+ return OCTAL_CONV[ch]
+
+
+class RegExp(Object):
+ def __init__(self, pattern=None, flags=None):
+ self.pattern = pattern
+ self.flags = flags
+
+
+class Position(Object):
+ def __init__(self, line=None, column=None, offset=None):
+ self.line = line
+ self.column = column
+ self.offset = offset
+
+
+class SourceLocation(Object):
+ def __init__(self, start=None, end=None, source=None):
+ self.start = start
+ self.end = end
+ self.source = source
+
+
+class Comment(Object):
+ def __init__(self, multiLine=None, slice=None, range=None, loc=None):
+ self.multiLine = multiLine
+ self.slice = slice
+ self.range = range
+ self.loc = loc
+
+
+class RawToken(Object):
+ def __init__(self, type=None, value=None, pattern=None, flags=None, regex=None, octal=None, cooked=None, head=None, tail=None, lineNumber=None, lineStart=None, start=None, end=None):
+ self.type = type
+ self.value = value
+ self.pattern = pattern
+ self.flags = flags
+ self.regex = regex
+ self.octal = octal
+ self.cooked = cooked
+ self.head = head
+ self.tail = tail
+ self.lineNumber = lineNumber
+ self.lineStart = lineStart
+ self.start = start
+ self.end = end
+
+
+class ScannerState(Object):
+ def __init__(self, index=None, lineNumber=None, lineStart=None):
+ self.index = index
+ self.lineNumber = lineNumber
+ self.lineStart = lineStart
+
+
+class Octal(object):
+ def __init__(self, octal, code):
+ self.octal = octal
+ self.code = code
+
+
+class Scanner(object):
+ def __init__(self, code, handler):
+ self.source = unicode(code) + '\x00'
+ self.errorHandler = handler
+ self.trackComment = False
+ self.isModule = False
+
+ self.length = len(code)
+ self.index = 0
+ self.lineNumber = 1 if self.length > 0 else 0
+ self.lineStart = 0
+ self.curlyStack = []
+
+ def saveState(self):
+ return ScannerState(
+ index=self.index,
+ lineNumber=self.lineNumber,
+ lineStart=self.lineStart
+ )
+
+ def restoreState(self, state):
+ self.index = state.index
+ self.lineNumber = state.lineNumber
+ self.lineStart = state.lineStart
+
+ def eof(self):
+ return self.index >= self.length
+
+ def throwUnexpectedToken(self, message=Messages.UnexpectedTokenIllegal):
+ return self.errorHandler.throwError(self.index, self.lineNumber,
+ self.index - self.lineStart + 1, message)
+
+ def tolerateUnexpectedToken(self, message=Messages.UnexpectedTokenIllegal):
+ self.errorHandler.tolerateError(self.index, self.lineNumber,
+ self.index - self.lineStart + 1, message)
+
+ # https://tc39.github.io/ecma262/#sec-comments
+
+ def skipSingleLineComment(self, offset):
+ comments = []
+
+ if self.trackComment:
+ start = self.index - offset
+ loc = SourceLocation(
+ start=Position(
+ line=self.lineNumber,
+ column=self.index - self.lineStart - offset
+ ),
+ end=Position()
+ )
+
+ while not self.eof():
+ ch = self.source[self.index]
+ self.index += 1
+ if Character.isLineTerminator(ch):
+ if self.trackComment:
+ loc.end = Position(
+ line=self.lineNumber,
+ column=self.index - self.lineStart - 1
+ )
+ entry = Comment(
+ multiLine=False,
+ slice=[start + offset, self.index - 1],
+ range=[start, self.index - 1],
+ loc=loc
+ )
+ comments.append(entry)
+
+ if ch == '\r' and self.source[self.index] == '\n':
+ self.index += 1
+
+ self.lineNumber += 1
+ self.lineStart = self.index
+ return comments
+
+ if self.trackComment:
+ loc.end = Position(
+ line=self.lineNumber,
+ column=self.index - self.lineStart
+ )
+ entry = Comment(
+ multiLine=False,
+ slice=[start + offset, self.index],
+ range=[start, self.index],
+ loc=loc
+ )
+ comments.append(entry)
+
+ return comments
+
+ def skipMultiLineComment(self):
+ comments = []
+
+ if self.trackComment:
+ comments = []
+ start = self.index - 2
+ loc = SourceLocation(
+ start=Position(
+ line=self.lineNumber,
+ column=self.index - self.lineStart - 2
+ ),
+ end=Position()
+ )
+
+ while not self.eof():
+ ch = self.source[self.index]
+ if Character.isLineTerminator(ch):
+ if ch == '\r' and self.source[self.index + 1] == '\n':
+ self.index += 1
+
+ self.lineNumber += 1
+ self.index += 1
+ self.lineStart = self.index
+ elif ch == '*':
+ # Block comment ends with '*/'.
+ if self.source[self.index + 1] == '/':
+ self.index += 2
+ if self.trackComment:
+ loc.end = Position(
+ line=self.lineNumber,
+ column=self.index - self.lineStart
+ )
+ entry = Comment(
+ multiLine=True,
+ slice=[start + 2, self.index - 2],
+ range=[start, self.index],
+ loc=loc
+ )
+ comments.append(entry)
+
+ return comments
+
+ self.index += 1
+ else:
+ self.index += 1
+
+ # Ran off the end of the file - the whole thing is a comment
+ if self.trackComment:
+ loc.end = Position(
+ line=self.lineNumber,
+ column=self.index - self.lineStart
+ )
+ entry = Comment(
+ multiLine=True,
+ slice=[start + 2, self.index],
+ range=[start, self.index],
+ loc=loc
+ )
+ comments.append(entry)
+
+ self.tolerateUnexpectedToken()
+ return comments
+
+ def scanComments(self):
+ comments = []
+
+ start = self.index == 0
+ while not self.eof():
+ ch = self.source[self.index]
+
+ if Character.isWhiteSpace(ch):
+ self.index += 1
+ elif Character.isLineTerminator(ch):
+ self.index += 1
+ if ch == '\r' and self.source[self.index] == '\n':
+ self.index += 1
+
+ self.lineNumber += 1
+ self.lineStart = self.index
+ start = True
+ elif ch == '/': # U+002F is '/'
+ ch = self.source[self.index + 1]
+ if ch == '/':
+ self.index += 2
+ comment = self.skipSingleLineComment(2)
+ if self.trackComment:
+ comments.extend(comment)
+
+ start = True
+ elif ch == '*': # U+002A is '*'
+ self.index += 2
+ comment = self.skipMultiLineComment()
+ if self.trackComment:
+ comments.extend(comment)
+
+ else:
+ break
+
+ elif start and ch == '-': # U+002D is '-'
+ # U+003E is '>'
+ if self.source[self.index + 1:self.index + 3] == '->':
+ # '-->' is a single-line comment
+ self.index += 3
+ comment = self.skipSingleLineComment(3)
+ if self.trackComment:
+ comments.extend(comment)
+
+ else:
+ break
+
+ elif ch == '<' and not self.isModule: # U+003C is '<'
+ if self.source[self.index + 1:self.index + 4] == '!--':
+ self.index += 4 # `<!--`
+ comment = self.skipSingleLineComment(4)
+ if self.trackComment:
+ comments.extend(comment)
+
+ else:
+ break
+
+ else:
+ break
+
+ return comments
+
+ # https://tc39.github.io/ecma262/#sec-future-reserved-words
+
+ def isFutureReservedWord(self, id):
+ return id in self.isFutureReservedWord.set
+ isFutureReservedWord.set = set((
+ 'enum',
+ 'export',
+ 'import',
+ 'super',
+ ))
+
+ def isStrictModeReservedWord(self, id):
+ return id in self.isStrictModeReservedWord.set
+ isStrictModeReservedWord.set = set((
+ 'implements',
+ 'interface',
+ 'package',
+ 'private',
+ 'protected',
+ 'public',
+ 'static',
+ 'yield',
+ 'let',
+ ))
+
+ def isRestrictedWord(self, id):
+ return id in self.isRestrictedWord.set
+ isRestrictedWord.set = set((
+ 'eval', 'arguments',
+ ))
+
+ # https://tc39.github.io/ecma262/#sec-keywords
+
+ def isKeyword(self, id):
+ return id in self.isKeyword.set
+ isKeyword.set = set((
+ 'if', 'in', 'do',
+
+ 'var', 'for', 'new',
+ 'try', 'let',
+
+ 'this', 'else', 'case',
+ 'void', 'with', 'enum',
+
+ 'while', 'break', 'catch',
+ 'throw', 'const', 'yield',
+ 'class', 'super',
+
+ 'return', 'typeof', 'delete',
+ 'switch', 'export', 'import',
+
+ 'default', 'finally', 'extends',
+
+ 'function', 'continue', 'debugger',
+
+ 'instanceof',
+ ))
+
+ def codePointAt(self, i):
+ return uord(self.source[i:i + 2])
+
+ def scanHexEscape(self, prefix):
+ length = 4 if prefix == 'u' else 2
+ code = 0
+
+ for i in xrange(length):
+ if not self.eof() and Character.isHexDigit(self.source[self.index]):
+ ch = self.source[self.index]
+ self.index += 1
+ code = code * 16 + hexValue(ch)
+ else:
+ return None
+
+ return uchr(code)
+
+ def scanUnicodeCodePointEscape(self):
+ ch = self.source[self.index]
+ code = 0
+
+ # At least, one hex digit is required.
+ if ch == '}':
+ self.throwUnexpectedToken()
+
+ while not self.eof():
+ ch = self.source[self.index]
+ self.index += 1
+ if not Character.isHexDigit(ch):
+ break
+
+ code = code * 16 + hexValue(ch)
+
+ if code > 0x10FFFF or ch != '}':
+ self.throwUnexpectedToken()
+
+ return Character.fromCodePoint(code)
+
+ def getIdentifier(self):
+ start = self.index
+ self.index += 1
+ while not self.eof():
+ ch = self.source[self.index]
+ if ch == '\\':
+ # Blackslash (U+005C) marks Unicode escape sequence.
+ self.index = start
+ return self.getComplexIdentifier()
+ else:
+ cp = ord(ch)
+ if cp >= 0xD800 and cp < 0xDFFF:
+ # Need to handle surrogate pairs.
+ self.index = start
+ return self.getComplexIdentifier()
+
+ if Character.isIdentifierPart(ch):
+ self.index += 1
+ else:
+ break
+
+ return self.source[start:self.index]
+
+ def getComplexIdentifier(self):
+ cp = self.codePointAt(self.index)
+ id = Character.fromCodePoint(cp)
+ self.index += len(id)
+
+ # '\u' (U+005C, U+0075) denotes an escaped character.
+ if cp == 0x5C:
+ if self.source[self.index] != 'u':
+ self.throwUnexpectedToken()
+
+ self.index += 1
+ if self.source[self.index] == '{':
+ self.index += 1
+ ch = self.scanUnicodeCodePointEscape()
+ else:
+ ch = self.scanHexEscape('u')
+ if not ch or ch == '\\' or not Character.isIdentifierStart(ch[0]):
+ self.throwUnexpectedToken()
+
+ id = ch
+
+ while not self.eof():
+ cp = self.codePointAt(self.index)
+ ch = Character.fromCodePoint(cp)
+ if not Character.isIdentifierPart(ch):
+ break
+
+ id += ch
+ self.index += len(ch)
+
+ # '\u' (U+005C, U+0075) denotes an escaped character.
+ if cp == 0x5C:
+ id = id[:-1]
+ if self.source[self.index] != 'u':
+ self.throwUnexpectedToken()
+
+ self.index += 1
+ if self.source[self.index] == '{':
+ self.index += 1
+ ch = self.scanUnicodeCodePointEscape()
+ else:
+ ch = self.scanHexEscape('u')
+ if not ch or ch == '\\' or not Character.isIdentifierPart(ch[0]):
+ self.throwUnexpectedToken()
+
+ id += ch
+
+ return id
+
+ def octalToDecimal(self, ch):
+ # \0 is not octal escape sequence
+ octal = ch != '0'
+ code = octalValue(ch)
+
+ if not self.eof() and Character.isOctalDigit(self.source[self.index]):
+ octal = True
+ code = code * 8 + octalValue(self.source[self.index])
+ self.index += 1
+
+ # 3 digits are only allowed when string starts
+ # with 0, 1, 2, 3
+ if ch in '0123' and not self.eof() and Character.isOctalDigit(self.source[self.index]):
+ code = code * 8 + octalValue(self.source[self.index])
+ self.index += 1
+
+ return Octal(octal, code)
+
+ # https://tc39.github.io/ecma262/#sec-names-and-keywords
+
+ def scanIdentifier(self):
+ start = self.index
+
+ # Backslash (U+005C) starts an escaped character.
+ id = self.getComplexIdentifier() if self.source[start] == '\\' else self.getIdentifier()
+
+ # There is no keyword or literal with only one character.
+ # Thus, it must be an identifier.
+ if len(id) == 1:
+ type = Token.Identifier
+ elif self.isKeyword(id):
+ type = Token.Keyword
+ elif id == 'null':
+ type = Token.NullLiteral
+ elif id == 'true' or id == 'false':
+ type = Token.BooleanLiteral
+ else:
+ type = Token.Identifier
+
+ if type is not Token.Identifier and start + len(id) != self.index:
+ restore = self.index
+ self.index = start
+ self.tolerateUnexpectedToken(Messages.InvalidEscapedReservedWord)
+ self.index = restore
+
+ return RawToken(
+ type=type,
+ value=id,
+ lineNumber=self.lineNumber,
+ lineStart=self.lineStart,
+ start=start,
+ end=self.index
+ )
+
+ # https://tc39.github.io/ecma262/#sec-punctuators
+
+ def scanPunctuator(self):
+ start = self.index
+
+ # Check for most common single-character punctuators.
+ str = self.source[self.index]
+ if str in (
+ '(',
+ '{',
+ ):
+ if str == '{':
+ self.curlyStack.append('{')
+
+ self.index += 1
+
+ elif str == '.':
+ self.index += 1
+ if self.source[self.index] == '.' and self.source[self.index + 1] == '.':
+ # Spread operator: ...
+ self.index += 2
+ str = '...'
+
+ elif str == '}':
+ self.index += 1
+ if self.curlyStack:
+ self.curlyStack.pop()
+
+ elif str in (
+ ')',
+ ';',
+ ',',
+ '[',
+ ']',
+ ':',
+ '?',
+ '~',
+ ):
+ self.index += 1
+
+ else:
+ # 4-character punctuator.
+ str = self.source[self.index:self.index + 4]
+ if str == '>>>=':
+ self.index += 4
+ else:
+
+ # 3-character punctuators.
+ str = str[:3]
+ if str in (
+ '===', '!==', '>>>',
+ '<<=', '>>=', '**='
+ ):
+ self.index += 3
+ else:
+
+ # 2-character punctuators.
+ str = str[:2]
+ if str in (
+ '&&', '||', '==', '!=',
+ '+=', '-=', '*=', '/=',
+ '++', '--', '<<', '>>',
+ '&=', '|=', '^=', '%=',
+ '<=', '>=', '=>', '**',
+ ):
+ self.index += 2
+ else:
+
+ # 1-character punctuators.
+ str = self.source[self.index]
+ if str in '<>=!+-*%&|^/':
+ self.index += 1
+
+ if self.index == start:
+ self.throwUnexpectedToken()
+
+ return RawToken(
+ type=Token.Punctuator,
+ value=str,
+ lineNumber=self.lineNumber,
+ lineStart=self.lineStart,
+ start=start,
+ end=self.index
+ )
+
+ # https://tc39.github.io/ecma262/#sec-literals-numeric-literals
+
+ def scanHexLiteral(self, start):
+ num = ''
+
+ while not self.eof():
+ if not Character.isHexDigit(self.source[self.index]):
+ break
+
+ num += self.source[self.index]
+ self.index += 1
+
+ if len(num) == 0:
+ self.throwUnexpectedToken()
+
+ if Character.isIdentifierStart(self.source[self.index]):
+ self.throwUnexpectedToken()
+
+ return RawToken(
+ type=Token.NumericLiteral,
+ value=int(num, 16),
+ lineNumber=self.lineNumber,
+ lineStart=self.lineStart,
+ start=start,
+ end=self.index
+ )
+
+ def scanBinaryLiteral(self, start):
+ num = ''
+
+ while not self.eof():
+ ch = self.source[self.index]
+ if ch != '0' and ch != '1':
+ break
+
+ num += self.source[self.index]
+ self.index += 1
+
+ if len(num) == 0:
+ # only 0b or 0B
+ self.throwUnexpectedToken()
+
+ if not self.eof():
+ ch = self.source[self.index]
+ if Character.isIdentifierStart(ch) or Character.isDecimalDigit(ch):
+ self.throwUnexpectedToken()
+
+ return RawToken(
+ type=Token.NumericLiteral,
+ value=int(num, 2),
+ lineNumber=self.lineNumber,
+ lineStart=self.lineStart,
+ start=start,
+ end=self.index
+ )
+
+ def scanOctalLiteral(self, prefix, start):
+ num = ''
+ octal = False
+
+ if Character.isOctalDigit(prefix[0]):
+ octal = True
+ num = '0' + self.source[self.index]
+ self.index += 1
+
+ while not self.eof():
+ if not Character.isOctalDigit(self.source[self.index]):
+ break
+
+ num += self.source[self.index]
+ self.index += 1
+
+ if not octal and len(num) == 0:
+ # only 0o or 0O
+ self.throwUnexpectedToken()
+
+ if Character.isIdentifierStart(self.source[self.index]) or Character.isDecimalDigit(self.source[self.index]):
+ self.throwUnexpectedToken()
+
+ return RawToken(
+ type=Token.NumericLiteral,
+ value=int(num, 8),
+ octal=octal,
+ lineNumber=self.lineNumber,
+ lineStart=self.lineStart,
+ start=start,
+ end=self.index
+ )
+
+ def isImplicitOctalLiteral(self):
+ # Implicit octal, unless there is a non-octal digit.
+ # (Annex B.1.1 on Numeric Literals)
+ for i in xrange(self.index + 1, self.length):
+ ch = self.source[i]
+ if ch in '89':
+ return False
+ if not Character.isOctalDigit(ch):
+ return True
+ return True
+
+ def scanNumericLiteral(self):
+ start = self.index
+ ch = self.source[start]
+ assert Character.isDecimalDigit(ch) or ch == '.', 'Numeric literal must start with a decimal digit or a decimal point'
+
+ num = ''
+ if ch != '.':
+ num = self.source[self.index]
+ self.index += 1
+ ch = self.source[self.index]
+
+ # Hex number starts with '0x'.
+ # Octal number starts with '0'.
+ # Octal number in ES6 starts with '0o'.
+ # Binary number in ES6 starts with '0b'.
+ if num == '0':
+ if ch in ('x', 'X'):
+ self.index += 1
+ return self.scanHexLiteral(start)
+
+ if ch in ('b', 'B'):
+ self.index += 1
+ return self.scanBinaryLiteral(start)
+
+ if ch in ('o', 'O'):
+ return self.scanOctalLiteral(ch, start)
+
+ if ch and Character.isOctalDigit(ch):
+ if self.isImplicitOctalLiteral():
+ return self.scanOctalLiteral(ch, start)
+
+ while Character.isDecimalDigit(self.source[self.index]):
+ num += self.source[self.index]
+ self.index += 1
+
+ ch = self.source[self.index]
+
+ if ch == '.':
+ num += self.source[self.index]
+ self.index += 1
+ while Character.isDecimalDigit(self.source[self.index]):
+ num += self.source[self.index]
+ self.index += 1
+
+ ch = self.source[self.index]
+
+ if ch in ('e', 'E'):
+ num += self.source[self.index]
+ self.index += 1
+
+ ch = self.source[self.index]
+ if ch in ('+', '-'):
+ num += self.source[self.index]
+ self.index += 1
+
+ if Character.isDecimalDigit(self.source[self.index]):
+ while Character.isDecimalDigit(self.source[self.index]):
+ num += self.source[self.index]
+ self.index += 1
+
+ else:
+ self.throwUnexpectedToken()
+
+ if Character.isIdentifierStart(self.source[self.index]):
+ self.throwUnexpectedToken()
+
+ value = float(num)
+ return RawToken(
+ type=Token.NumericLiteral,
+ value=int(value) if value.is_integer() else value,
+ lineNumber=self.lineNumber,
+ lineStart=self.lineStart,
+ start=start,
+ end=self.index
+ )
+
+ # https://tc39.github.io/ecma262/#sec-literals-string-literals
+
+ def scanStringLiteral(self):
+ start = self.index
+ quote = self.source[start]
+ assert quote in ('\'', '"'), 'String literal must starts with a quote'
+
+ self.index += 1
+ octal = False
+ str = ''
+
+ while not self.eof():
+ ch = self.source[self.index]
+ self.index += 1
+
+ if ch == quote:
+ quote = ''
+ break
+ elif ch == '\\':
+ ch = self.source[self.index]
+ self.index += 1
+ if not ch or not Character.isLineTerminator(ch):
+ if ch == 'u':
+ if self.source[self.index] == '{':
+ self.index += 1
+ str += self.scanUnicodeCodePointEscape()
+ else:
+ unescapedChar = self.scanHexEscape(ch)
+ if not unescapedChar:
+ self.throwUnexpectedToken()
+
+ str += unescapedChar
+
+ elif ch == 'x':
+ unescaped = self.scanHexEscape(ch)
+ if not unescaped:
+ self.throwUnexpectedToken(Messages.InvalidHexEscapeSequence)
+
+ str += unescaped
+ elif ch == 'n':
+ str += '\n'
+ elif ch == 'r':
+ str += '\r'
+ elif ch == 't':
+ str += '\t'
+ elif ch == 'b':
+ str += '\b'
+ elif ch == 'f':
+ str += '\f'
+ elif ch == 'v':
+ str += '\x0B'
+ elif ch in (
+ '8',
+ '9',
+ ):
+ str += ch
+ self.tolerateUnexpectedToken()
+
+ else:
+ if ch and Character.isOctalDigit(ch):
+ octToDec = self.octalToDecimal(ch)
+
+ octal = octToDec.octal or octal
+ str += uchr(octToDec.code)
+ else:
+ str += ch
+
+ else:
+ self.lineNumber += 1
+ if ch == '\r' and self.source[self.index] == '\n':
+ self.index += 1
+
+ self.lineStart = self.index
+
+ elif Character.isLineTerminator(ch):
+ break
+ else:
+ str += ch
+
+ if quote != '':
+ self.index = start
+ self.throwUnexpectedToken()
+
+ return RawToken(
+ type=Token.StringLiteral,
+ value=str,
+ octal=octal,
+ lineNumber=self.lineNumber,
+ lineStart=self.lineStart,
+ start=start,
+ end=self.index
+ )
+
+ # https://tc39.github.io/ecma262/#sec-template-literal-lexical-components
+
+ def scanTemplate(self):
+ cooked = ''
+ terminated = False
+ start = self.index
+
+ head = self.source[start] == '`'
+ tail = False
+ rawOffset = 2
+
+ self.index += 1
+
+ while not self.eof():
+ ch = self.source[self.index]
+ self.index += 1
+ if ch == '`':
+ rawOffset = 1
+ tail = True
+ terminated = True
+ break
+ elif ch == '$':
+ if self.source[self.index] == '{':
+ self.curlyStack.append('${')
+ self.index += 1
+ terminated = True
+ break
+
+ cooked += ch
+ elif ch == '\\':
+ ch = self.source[self.index]
+ self.index += 1
+ if not Character.isLineTerminator(ch):
+ if ch == 'n':
+ cooked += '\n'
+ elif ch == 'r':
+ cooked += '\r'
+ elif ch == 't':
+ cooked += '\t'
+ elif ch == 'u':
+ if self.source[self.index] == '{':
+ self.index += 1
+ cooked += self.scanUnicodeCodePointEscape()
+ else:
+ restore = self.index
+ unescapedChar = self.scanHexEscape(ch)
+ if unescapedChar:
+ cooked += unescapedChar
+ else:
+ self.index = restore
+ cooked += ch
+
+ elif ch == 'x':
+ unescaped = self.scanHexEscape(ch)
+ if not unescaped:
+ self.throwUnexpectedToken(Messages.InvalidHexEscapeSequence)
+
+ cooked += unescaped
+ elif ch == 'b':
+ cooked += '\b'
+ elif ch == 'f':
+ cooked += '\f'
+ elif ch == 'v':
+ cooked += '\v'
+
+ else:
+ if ch == '0':
+ if Character.isDecimalDigit(self.source[self.index]):
+ # Illegal: \01 \02 and so on
+ self.throwUnexpectedToken(Messages.TemplateOctalLiteral)
+
+ cooked += '\0'
+ elif Character.isOctalDigit(ch):
+ # Illegal: \1 \2
+ self.throwUnexpectedToken(Messages.TemplateOctalLiteral)
+ else:
+ cooked += ch
+
+ else:
+ self.lineNumber += 1
+ if ch == '\r' and self.source[self.index] == '\n':
+ self.index += 1
+
+ self.lineStart = self.index
+
+ elif Character.isLineTerminator(ch):
+ self.lineNumber += 1
+ if ch == '\r' and self.source[self.index] == '\n':
+ self.index += 1
+
+ self.lineStart = self.index
+ cooked += '\n'
+ else:
+ cooked += ch
+
+ if not terminated:
+ self.throwUnexpectedToken()
+
+ if not head:
+ if self.curlyStack:
+ self.curlyStack.pop()
+
+ return RawToken(
+ type=Token.Template,
+ value=self.source[start + 1:self.index - rawOffset],
+ cooked=cooked,
+ head=head,
+ tail=tail,
+ lineNumber=self.lineNumber,
+ lineStart=self.lineStart,
+ start=start,
+ end=self.index
+ )
+
+ # https://tc39.github.io/ecma262/#sec-literals-regular-expression-literals
+
+ def testRegExp(self, pattern, flags):
+ # The BMP character to use as a replacement for astral symbols when
+ # translating an ES6 "u"-flagged pattern to an ES5-compatible
+ # approximation.
+ # Note: replacing with '\uFFFF' enables false positives in unlikely
+ # scenarios. For example, `[\u{1044f}-\u{10440}]` is an invalid
+ # pattern that would not be detected by this substitution.
+ astralSubstitute = '\uFFFF'
+
+ # Replace every Unicode escape sequence with the equivalent
+ # BMP character or a constant ASCII code point in the case of
+ # astral symbols. (See the above note on `astralSubstitute`
+ # for more information.)
+ def astralSub(m):
+ codePoint = int(m.group(1) or m.group(2), 16)
+ if codePoint > 0x10FFFF:
+ self.tolerateUnexpectedToken(Messages.InvalidRegExp)
+ elif codePoint <= 0xFFFF:
+ return uchr(codePoint)
+ return astralSubstitute
+ pattern = re.sub(r'\\u\{([0-9a-fA-F]+)\}|\\u([a-fA-F0-9]{4})', astralSub, pattern)
+
+ # Replace each paired surrogate with a single ASCII symbol to
+ # avoid throwing on regular expressions that are only valid in
+ # combination with the "u" flag.
+ pattern = re.sub(r'[\uD800-\uDBFF][\uDC00-\uDFFF]', astralSubstitute, pattern)
+
+ # Return a regular expression object for this pattern-flag pair, or
+ # `null` in case the current environment doesn't support the flags it
+ # uses.
+ pyflags = 0 | re.M if 'm' in flags else 0 | re.I if 'i' in flags else 0
+ try:
+ return re.compile(pattern, pyflags)
+ except Exception:
+ self.tolerateUnexpectedToken(Messages.InvalidRegExp)
+
+ def scanRegExpBody(self):
+ ch = self.source[self.index]
+ assert ch == '/', 'Regular expression literal must start with a slash'
+
+ str = self.source[self.index]
+ self.index += 1
+ classMarker = False
+ terminated = False
+
+ while not self.eof():
+ ch = self.source[self.index]
+ self.index += 1
+ str += ch
+ if ch == '\\':
+ ch = self.source[self.index]
+ self.index += 1
+ # https://tc39.github.io/ecma262/#sec-literals-regular-expression-literals
+ if Character.isLineTerminator(ch):
+ self.throwUnexpectedToken(Messages.UnterminatedRegExp)
+
+ str += ch
+ elif Character.isLineTerminator(ch):
+ self.throwUnexpectedToken(Messages.UnterminatedRegExp)
+ elif classMarker:
+ if ch == ']':
+ classMarker = False
+
+ else:
+ if ch == '/':
+ terminated = True
+ break
+ elif ch == '[':
+ classMarker = True
+
+ if not terminated:
+ self.throwUnexpectedToken(Messages.UnterminatedRegExp)
+
+ # Exclude leading and trailing slash.
+ return str[1:-1]
+
+ def scanRegExpFlags(self):
+ str = ''
+ flags = ''
+ while not self.eof():
+ ch = self.source[self.index]
+ if not Character.isIdentifierPart(ch):
+ break
+
+ self.index += 1
+ if ch == '\\' and not self.eof():
+ ch = self.source[self.index]
+ if ch == 'u':
+ self.index += 1
+ restore = self.index
+ char = self.scanHexEscape('u')
+ if char:
+ flags += char
+ str += '\\u'
+ while restore < self.index:
+ str += self.source[restore]
+ restore += 1
+
+ else:
+ self.index = restore
+ flags += 'u'
+ str += '\\u'
+
+ self.tolerateUnexpectedToken()
+ else:
+ str += '\\'
+ self.tolerateUnexpectedToken()
+
+ else:
+ flags += ch
+ str += ch
+
+ return flags
+
+ def scanRegExp(self):
+ start = self.index
+
+ pattern = self.scanRegExpBody()
+ flags = self.scanRegExpFlags()
+ value = self.testRegExp(pattern, flags)
+
+ return RawToken(
+ type=Token.RegularExpression,
+ value='',
+ pattern=pattern,
+ flags=flags,
+ regex=value,
+ lineNumber=self.lineNumber,
+ lineStart=self.lineStart,
+ start=start,
+ end=self.index
+ )
+
+ def lex(self):
+ if self.eof():
+ return RawToken(
+ type=Token.EOF,
+ value='',
+ lineNumber=self.lineNumber,
+ lineStart=self.lineStart,
+ start=self.index,
+ end=self.index
+ )
+
+ ch = self.source[self.index]
+
+ if Character.isIdentifierStart(ch):
+ return self.scanIdentifier()
+
+ # Very common: ( and ) and ;
+ if ch in ('(', ')', ';'):
+ return self.scanPunctuator()
+
+ # String literal starts with single quote (U+0027) or double quote (U+0022).
+ if ch in ('\'', '"'):
+ return self.scanStringLiteral()
+
+ # Dot (.) U+002E can also start a floating-point number, hence the need
+ # to check the next character.
+ if ch == '.':
+ if Character.isDecimalDigit(self.source[self.index + 1]):
+ return self.scanNumericLiteral()
+
+ return self.scanPunctuator()
+
+ if Character.isDecimalDigit(ch):
+ return self.scanNumericLiteral()
+
+ # Template literals start with ` (U+0060) for template head
+ # or } (U+007D) for template middle or template tail.
+ if ch == '`' or (ch == '}' and self.curlyStack and self.curlyStack[-1] == '${'):
+ return self.scanTemplate()
+
+ # Possible identifier start in a surrogate pair.
+ cp = ord(ch)
+ if cp >= 0xD800 and cp < 0xDFFF:
+ cp = self.codePointAt(self.index)
+ ch = Character.fromCodePoint(cp)
+ if Character.isIdentifierStart(ch):
+ return self.scanIdentifier()
+
+ return self.scanPunctuator()