diff options
Diffstat (limited to 'third_party/rust/jsparagus-parser/src/lexer.rs')
-rw-r--r-- | third_party/rust/jsparagus-parser/src/lexer.rs | 2325 |
1 files changed, 2325 insertions, 0 deletions
diff --git a/third_party/rust/jsparagus-parser/src/lexer.rs b/third_party/rust/jsparagus-parser/src/lexer.rs new file mode 100644 index 0000000000..109558c5c2 --- /dev/null +++ b/third_party/rust/jsparagus-parser/src/lexer.rs @@ -0,0 +1,2325 @@ +//! JavaScript lexer. + +use crate::numeric_value::{parse_float, parse_int, NumericLiteralBase}; +use crate::parser::Parser; +use crate::unicode::{is_id_continue, is_id_start}; +use ast::arena; +use ast::source_atom_set::{CommonSourceAtomSetIndices, SourceAtomSet}; +use ast::source_slice_list::SourceSliceList; +use ast::SourceLocation; +use bumpalo::{collections::String, Bump}; +use generated_parser::{ParseError, Result, TerminalId, Token, TokenValue}; +use std::cell::RefCell; +use std::convert::TryFrom; +use std::rc::Rc; +use std::str::Chars; + +pub struct Lexer<'alloc> { + allocator: &'alloc Bump, + + /// Next token to be returned. + token: arena::Box<'alloc, Token>, + + /// Length of the input text, in UTF-8 bytes. + source_length: usize, + + /// Iterator over the remaining not-yet-parsed input. + chars: Chars<'alloc>, + + atoms: Rc<RefCell<SourceAtomSet<'alloc>>>, + + slices: Rc<RefCell<SourceSliceList<'alloc>>>, +} + +enum NumericResult { + Int { + base: NumericLiteralBase, + }, + Float, + BigInt { + #[allow(dead_code)] + base: NumericLiteralBase, + }, +} + +impl<'alloc> Lexer<'alloc> { + pub fn new( + allocator: &'alloc Bump, + chars: Chars<'alloc>, + atoms: Rc<RefCell<SourceAtomSet<'alloc>>>, + slices: Rc<RefCell<SourceSliceList<'alloc>>>, + ) -> Lexer<'alloc> { + Self::with_offset(allocator, chars, 0, atoms, slices) + } + + /// Create a lexer for a part of a JS script or module. `offset` is the + /// total length of all previous parts, in bytes; source locations for + /// tokens created by the new lexer start counting from this number. 
+ pub fn with_offset( + allocator: &'alloc Bump, + chars: Chars<'alloc>, + offset: usize, + atoms: Rc<RefCell<SourceAtomSet<'alloc>>>, + slices: Rc<RefCell<SourceSliceList<'alloc>>>, + ) -> Lexer<'alloc> { + let source_length = offset + chars.as_str().len(); + let mut token = arena::alloc(allocator, new_token()); + token.is_on_new_line = true; + Lexer { + allocator, + token, + source_length, + chars, + atoms, + slices, + } + } + + fn is_looking_at(&self, s: &str) -> bool { + self.chars.as_str().starts_with(s) + } + + pub fn offset(&self) -> usize { + self.source_length - self.chars.as_str().len() + } + + fn peek(&self) -> Option<char> { + self.chars.as_str().chars().next() + } + + fn double_peek(&self) -> Option<char> { + let mut chars = self.chars.as_str().chars(); + chars.next(); + chars.next() + } + + fn set_result( + &mut self, + terminal_id: TerminalId, + loc: SourceLocation, + value: TokenValue, + ) -> Result<'alloc, ()> { + self.token.terminal_id = terminal_id; + self.token.loc = loc; + self.token.value = value; + Ok(()) + } + + #[inline] + pub fn next<'parser>( + &mut self, + parser: &Parser<'parser>, + ) -> Result<'alloc, arena::Box<'alloc, Token>> { + let mut next_token = arena::alloc_with(self.allocator, || new_token()); + self.advance_impl(parser)?; + std::mem::swap(&mut self.token, &mut next_token); + Ok(next_token) + } + + fn unexpected_err(&mut self) -> ParseError<'alloc> { + if let Some(ch) = self.peek() { + ParseError::IllegalCharacter(ch) + } else { + ParseError::UnexpectedEnd + } + } +} + +/// Returns an empty token which is meant as a place holder to be mutated later. +fn new_token() -> Token { + Token::basic_token(TerminalId::End, SourceLocation::default()) +} + +// ---------------------------------------------------------------------------- +// 11.1 Unicode Format-Control Characters + +/// U+200C ZERO WIDTH NON-JOINER, abbreviated in the spec as <ZWNJ>. +/// Specially permitted in identifiers. 
+const ZWNJ: char = '\u{200c}'; + +/// U+200D ZERO WIDTH JOINER, abbreviated as <ZWJ>. +/// Specially permitted in identifiers. +const ZWJ: char = '\u{200d}'; + +/// U+FEFF ZERO WIDTH NO-BREAK SPACE, abbreviated <ZWNBSP>. +/// Considered a whitespace character in JS. +const ZWNBSP: char = '\u{feff}'; + +// ---------------------------------------------------------------------------- +// 11.2 White Space + +/// U+0009 CHARACTER TABULATION, abbreviated <TAB>. +const TAB: char = '\u{9}'; + +/// U+000B VERTICAL TAB, abbreviated <VT>. +const VT: char = '\u{b}'; + +/// U+000C FORM FEED, abbreviated <FF>. +const FF: char = '\u{c}'; + +/// U+0020 SPACE, abbreviated <SP>. +const SP: char = '\u{20}'; + +/// U+00A0 NON-BREAKING SPACE, abbreviated <NBSP>. +const NBSP: char = '\u{a0}'; + +// ---------------------------------------------------------------------------- +// 11.3 Line Terminators + +/// U+000A LINE FEED, abbreviated in the spec as <LF>. +const LF: char = '\u{a}'; + +/// U+000D CARRIAGE RETURN, abbreviated in the spec as <CR>. +const CR: char = '\u{d}'; + +/// U+2028 LINE SEPARATOR, abbreviated <LS>. +const LS: char = '\u{2028}'; + +/// U+2029 PARAGRAPH SEPARATOR, abbreviated <PS>. +const PS: char = '\u{2029}'; + +// ---------------------------------------------------------------------------- +// 11.4 Comments +// +// Comment:: +// MultiLineComment +// SingleLineComment + +impl<'alloc> Lexer<'alloc> { + /// Skip a *MultiLineComment*. + /// + /// ```text + /// MultiLineComment :: + /// `/*` MultiLineCommentChars? `*/` + /// + /// MultiLineCommentChars :: + /// MultiLineNotAsteriskChar MultiLineCommentChars? + /// `*` PostAsteriskCommentChars? + /// + /// PostAsteriskCommentChars :: + /// MultiLineNotForwardSlashOrAsteriskChar MultiLineCommentChars? + /// `*` PostAsteriskCommentChars? 
+ /// + /// MultiLineNotAsteriskChar :: + /// SourceCharacter but not `*` + /// + /// MultiLineNotForwardSlashOrAsteriskChar :: + /// SourceCharacter but not one of `/` or `*` + /// ``` + /// + /// (B.1.3 splits MultiLineComment into two nonterminals: MultiLineComment + /// and SingleLineDelimitedComment. The point of that is to help specify + /// that a SingleLineHTMLCloseComment must occur at the start of a line. We + /// use `is_on_new_line` for that.) + /// + fn skip_multi_line_comment(&mut self, builder: &mut AutoCow<'alloc>) -> Result<'alloc, ()> { + while let Some(ch) = self.chars.next() { + match ch { + '*' if self.peek() == Some('/') => { + self.chars.next(); + *builder = AutoCow::new(&self); + return Ok(()); + } + CR | LF | PS | LS => { + self.token.is_on_new_line = true; + } + _ => {} + } + } + Err(ParseError::UnterminatedMultiLineComment.into()) + } + + /// Skip a *SingleLineComment* and the following *LineTerminatorSequence*, + /// if any. + /// + /// ```text + /// SingleLineComment :: + /// `//` SingleLineCommentChars? + /// + /// SingleLineCommentChars :: + /// SingleLineCommentChar SingleLineCommentChars? + /// + /// SingleLineCommentChar :: + /// SourceCharacter but not LineTerminator + /// ``` + fn skip_single_line_comment(&mut self, builder: &mut AutoCow<'alloc>) { + while let Some(ch) = self.chars.next() { + match ch { + CR | LF | LS | PS => break, + _ => continue, + } + } + *builder = AutoCow::new(&self); + self.token.is_on_new_line = true; + } +} + +// ---------------------------------------------------------------------------- +// 11.6 Names and Keywords + +/// True if `c` is a one-character *IdentifierStart*. +/// +/// ```text +/// IdentifierStart :: +/// UnicodeIDStart +/// `$` +/// `_` +/// `\` UnicodeEscapeSequence +/// +/// UnicodeIDStart :: +/// > any Unicode code point with the Unicode property "ID_Start" +/// ``` +fn is_identifier_start(c: char) -> bool { + // Escaped case is handled separately. 
+ if c.is_ascii() { + c == '$' || c == '_' || c.is_ascii_alphabetic() + } else { + is_id_start(c) + } +} + +/// True if `c` is a one-character *IdentifierPart*. +/// +/// ```text +/// IdentifierPart :: +/// UnicodeIDContinue +/// `$` +/// `\` UnicodeEscapeSequence +/// <ZWNJ> +/// <ZWJ> +/// +/// UnicodeIDContinue :: +/// > any Unicode code point with the Unicode property "ID_Continue" +/// ``` +fn is_identifier_part(c: char) -> bool { + // Escaped case is handled separately. + if c.is_ascii() { + c == '$' || c == '_' || c.is_ascii_alphanumeric() + } else { + is_id_continue(c) || c == ZWNJ || c == ZWJ + } +} + +impl<'alloc> Lexer<'alloc> { + /// Scan the rest of an IdentifierName, having already parsed the initial + /// IdentifierStart and stored it in `builder`. + /// + /// On success, this returns `Ok((has_escapes, str))`, where `has_escapes` + /// is true if the identifier contained any UnicodeEscapeSequences, and + /// `str` is the un-escaped IdentifierName, including the IdentifierStart, + /// on success. 
+ /// + /// ```text + /// IdentifierName :: + /// IdentifierStart + /// IdentifierName IdentifierPart + /// ``` + fn identifier_name_tail( + &mut self, + mut builder: AutoCow<'alloc>, + ) -> Result<'alloc, (bool, &'alloc str)> { + while let Some(ch) = self.peek() { + if !is_identifier_part(ch) { + if ch == '\\' { + self.chars.next(); + builder.force_allocation_without_current_ascii_char(&self); + + let value = self.unicode_escape_sequence_after_backslash()?; + if !is_identifier_part(value) { + return Err(ParseError::InvalidEscapeSequence.into()); + } + + builder.push_different(value); + continue; + } + + break; + } + self.chars.next(); + builder.push_matching(ch); + } + let has_different = builder.has_different(); + Ok((has_different, builder.finish(&self))) + } + + fn identifier_name(&mut self, mut builder: AutoCow<'alloc>) -> Result<'alloc, &'alloc str> { + match self.chars.next() { + None => { + return Err(ParseError::UnexpectedEnd.into()); + } + Some(c) => { + match c { + '$' | '_' | 'a'..='z' | 'A'..='Z' => { + builder.push_matching(c); + } + + '\\' => { + builder.force_allocation_without_current_ascii_char(&self); + + let value = self.unicode_escape_sequence_after_backslash()?; + if !is_identifier_start(value) { + return Err(ParseError::IllegalCharacter(value).into()); + } + builder.push_different(value); + } + + other if is_identifier_start(other) => { + builder.push_matching(other); + } + + other => { + return Err(ParseError::IllegalCharacter(other).into()); + } + } + self.identifier_name_tail(builder) + .map(|(_has_escapes, name)| name) + } + } + } + + /// Finish scanning an *IdentifierName* or keyword, having already scanned + /// the *IdentifierStart* and pushed it to `builder`. + /// + /// `start` is the offset of the *IdentifierStart*. + /// + /// The lexer doesn't know the syntactic context, so it always identifies + /// possible keywords. 
It's up to the parser to understand that, for + /// example, `TerminalId::If` is not a keyword when it's used as a property + /// or method name. + /// + /// If the source string contains no escape and it matches to possible + /// keywords (including contextual keywords), the result is corresponding + /// `TerminalId`. For example, if the source string is "yield", the result + /// is `TerminalId::Yield`. + /// + /// If the source string contains no escape sequence and also it doesn't + /// match to any possible keywords, the result is `TerminalId::Name`. + /// + /// If the source string contains at least one escape sequence, + /// the result is always `TerminalId::NameWithEscape`, regardless of the + /// StringValue of it. For example, if the source string is "\u{79}ield", + /// the result is `TerminalId::NameWithEscape`, and the StringValue is + /// "yield". + fn identifier_tail(&mut self, start: usize, builder: AutoCow<'alloc>) -> Result<'alloc, ()> { + let (has_different, text) = self.identifier_name_tail(builder)?; + + // https://tc39.es/ecma262/#sec-keywords-and-reserved-words + // + // keywords in the grammar match literal sequences of specific + // SourceCharacter elements. A code point in a keyword cannot be + // expressed by a `\` UnicodeEscapeSequence. + let (id, value) = if has_different { + // Always return `NameWithEscape`. + // + // Error check against reserved word should be handled in the + // consumer. 
+ (TerminalId::NameWithEscape, self.string_to_token_value(text)) + } else { + match &text as &str { + "as" => ( + TerminalId::As, + TokenValue::Atom(CommonSourceAtomSetIndices::as_()), + ), + "async" => { + /* + ( + TerminalId::Async, + TokenValue::Atom(CommonSourceAtomSetIndices::async_()), + ), + */ + return Err(ParseError::NotImplemented( + "async cannot be handled in parser due to multiple lookahead", + ) + .into()); + } + "await" => { + /* + ( + TerminalId::Await, + TokenValue::Atom(CommonSourceAtomSetIndices::await_()), + ), + */ + return Err( + ParseError::NotImplemented("await cannot be handled in parser").into(), + ); + } + "break" => ( + TerminalId::Break, + TokenValue::Atom(CommonSourceAtomSetIndices::break_()), + ), + "case" => ( + TerminalId::Case, + TokenValue::Atom(CommonSourceAtomSetIndices::case()), + ), + "catch" => ( + TerminalId::Catch, + TokenValue::Atom(CommonSourceAtomSetIndices::catch()), + ), + "class" => ( + TerminalId::Class, + TokenValue::Atom(CommonSourceAtomSetIndices::class()), + ), + "const" => ( + TerminalId::Const, + TokenValue::Atom(CommonSourceAtomSetIndices::const_()), + ), + "continue" => ( + TerminalId::Continue, + TokenValue::Atom(CommonSourceAtomSetIndices::continue_()), + ), + "debugger" => ( + TerminalId::Debugger, + TokenValue::Atom(CommonSourceAtomSetIndices::debugger()), + ), + "default" => ( + TerminalId::Default, + TokenValue::Atom(CommonSourceAtomSetIndices::default()), + ), + "delete" => ( + TerminalId::Delete, + TokenValue::Atom(CommonSourceAtomSetIndices::delete()), + ), + "do" => ( + TerminalId::Do, + TokenValue::Atom(CommonSourceAtomSetIndices::do_()), + ), + "else" => ( + TerminalId::Else, + TokenValue::Atom(CommonSourceAtomSetIndices::else_()), + ), + "enum" => ( + TerminalId::Enum, + TokenValue::Atom(CommonSourceAtomSetIndices::enum_()), + ), + "export" => ( + TerminalId::Export, + TokenValue::Atom(CommonSourceAtomSetIndices::export()), + ), + "extends" => ( + TerminalId::Extends, + 
TokenValue::Atom(CommonSourceAtomSetIndices::extends()), + ), + "finally" => ( + TerminalId::Finally, + TokenValue::Atom(CommonSourceAtomSetIndices::finally()), + ), + "for" => ( + TerminalId::For, + TokenValue::Atom(CommonSourceAtomSetIndices::for_()), + ), + "from" => ( + TerminalId::From, + TokenValue::Atom(CommonSourceAtomSetIndices::from()), + ), + "function" => ( + TerminalId::Function, + TokenValue::Atom(CommonSourceAtomSetIndices::function()), + ), + "get" => ( + TerminalId::Get, + TokenValue::Atom(CommonSourceAtomSetIndices::get()), + ), + "if" => ( + TerminalId::If, + TokenValue::Atom(CommonSourceAtomSetIndices::if_()), + ), + "implements" => ( + TerminalId::Implements, + TokenValue::Atom(CommonSourceAtomSetIndices::implements()), + ), + "import" => ( + TerminalId::Import, + TokenValue::Atom(CommonSourceAtomSetIndices::import()), + ), + "in" => ( + TerminalId::In, + TokenValue::Atom(CommonSourceAtomSetIndices::in_()), + ), + "instanceof" => ( + TerminalId::Instanceof, + TokenValue::Atom(CommonSourceAtomSetIndices::instanceof()), + ), + "interface" => ( + TerminalId::Interface, + TokenValue::Atom(CommonSourceAtomSetIndices::interface()), + ), + "let" => { + /* + ( + TerminalId::Let, + TokenValue::Atom(CommonSourceAtomSetIndices::let_()), + ), + */ + return Err(ParseError::NotImplemented( + "let cannot be handled in parser due to multiple lookahead", + ) + .into()); + } + "new" => ( + TerminalId::New, + TokenValue::Atom(CommonSourceAtomSetIndices::new_()), + ), + "of" => ( + TerminalId::Of, + TokenValue::Atom(CommonSourceAtomSetIndices::of()), + ), + "package" => ( + TerminalId::Package, + TokenValue::Atom(CommonSourceAtomSetIndices::package()), + ), + "private" => ( + TerminalId::Private, + TokenValue::Atom(CommonSourceAtomSetIndices::private()), + ), + "protected" => ( + TerminalId::Protected, + TokenValue::Atom(CommonSourceAtomSetIndices::protected()), + ), + "public" => ( + TerminalId::Public, + TokenValue::Atom(CommonSourceAtomSetIndices::public()), + 
), + "return" => ( + TerminalId::Return, + TokenValue::Atom(CommonSourceAtomSetIndices::return_()), + ), + "set" => ( + TerminalId::Set, + TokenValue::Atom(CommonSourceAtomSetIndices::set()), + ), + "static" => ( + TerminalId::Static, + TokenValue::Atom(CommonSourceAtomSetIndices::static_()), + ), + "super" => ( + TerminalId::Super, + TokenValue::Atom(CommonSourceAtomSetIndices::super_()), + ), + "switch" => ( + TerminalId::Switch, + TokenValue::Atom(CommonSourceAtomSetIndices::switch()), + ), + "target" => ( + TerminalId::Target, + TokenValue::Atom(CommonSourceAtomSetIndices::target()), + ), + "this" => ( + TerminalId::This, + TokenValue::Atom(CommonSourceAtomSetIndices::this()), + ), + "throw" => ( + TerminalId::Throw, + TokenValue::Atom(CommonSourceAtomSetIndices::throw()), + ), + "try" => ( + TerminalId::Try, + TokenValue::Atom(CommonSourceAtomSetIndices::try_()), + ), + "typeof" => ( + TerminalId::Typeof, + TokenValue::Atom(CommonSourceAtomSetIndices::typeof_()), + ), + "var" => ( + TerminalId::Var, + TokenValue::Atom(CommonSourceAtomSetIndices::var()), + ), + "void" => ( + TerminalId::Void, + TokenValue::Atom(CommonSourceAtomSetIndices::void()), + ), + "while" => ( + TerminalId::While, + TokenValue::Atom(CommonSourceAtomSetIndices::while_()), + ), + "with" => ( + TerminalId::With, + TokenValue::Atom(CommonSourceAtomSetIndices::with()), + ), + "yield" => { + /* + ( + TerminalId::Yield, + TokenValue::Atom(CommonSourceAtomSetIndices::yield_()), + ), + */ + return Err( + ParseError::NotImplemented("yield cannot be handled in parser").into(), + ); + } + "null" => ( + TerminalId::NullLiteral, + TokenValue::Atom(CommonSourceAtomSetIndices::null()), + ), + "true" => ( + TerminalId::BooleanLiteral, + TokenValue::Atom(CommonSourceAtomSetIndices::true_()), + ), + "false" => ( + TerminalId::BooleanLiteral, + TokenValue::Atom(CommonSourceAtomSetIndices::false_()), + ), + _ => (TerminalId::Name, self.string_to_token_value(text)), + } + }; + + self.set_result(id, 
SourceLocation::new(start, self.offset()), value) + } + + /// ```text + /// PrivateIdentifier:: + /// `#` IdentifierName + /// ``` + fn private_identifier(&mut self, start: usize, builder: AutoCow<'alloc>) -> Result<'alloc, ()> { + let name = self.identifier_name(builder)?; + let value = self.string_to_token_value(name); + self.set_result( + TerminalId::PrivateIdentifier, + SourceLocation::new(start, self.offset()), + value, + ) + } + + /// ```text + /// UnicodeEscapeSequence:: + /// `u` Hex4Digits + /// `u{` CodePoint `}` + /// ``` + fn unicode_escape_sequence_after_backslash(&mut self) -> Result<'alloc, char> { + match self.chars.next() { + Some('u') => {} + _ => { + return Err(ParseError::InvalidEscapeSequence.into()); + } + } + self.unicode_escape_sequence_after_backslash_and_u() + } + + fn unicode_escape_sequence_after_backslash_and_u(&mut self) -> Result<'alloc, char> { + let value = match self.peek() { + Some('{') => { + self.chars.next(); + + let value = self.code_point()?; + match self.chars.next() { + Some('}') => {} + _ => { + return Err(ParseError::InvalidEscapeSequence.into()); + } + } + value + } + _ => self.hex_4_digits()?, + }; + + Ok(value) + } +} + +impl<'alloc> Lexer<'alloc> { + // ------------------------------------------------------------------------ + // 11.8.3 Numeric Literals + + /// Advance over decimal digits in the input. + /// + /// ```text + /// NumericLiteralSeparator:: + /// `_` + /// + /// DecimalDigits :: + /// DecimalDigit + /// DecimalDigits NumericLiteralSeparator? 
DecimalDigit + /// + /// DecimalDigit :: one of + /// `0` `1` `2` `3` `4` `5` `6` `7` `8` `9` + /// ``` + fn decimal_digits(&mut self) -> Result<'alloc, ()> { + if let Some('0'..='9') = self.peek() { + self.chars.next(); + } else { + return Err(self.unexpected_err().into()); + } + + self.decimal_digits_after_first_digit()?; + Ok(()) + } + + fn optional_decimal_digits(&mut self) -> Result<'alloc, ()> { + if let Some('0'..='9') = self.peek() { + self.chars.next(); + } else { + return Ok(()); + } + + self.decimal_digits_after_first_digit()?; + Ok(()) + } + + fn decimal_digits_after_first_digit(&mut self) -> Result<'alloc, ()> { + while let Some(next) = self.peek() { + match next { + '_' => { + self.chars.next(); + + if let Some('0'..='9') = self.peek() { + self.chars.next(); + } else { + return Err(self.unexpected_err().into()); + } + } + '0'..='9' => { + self.chars.next(); + } + _ => break, + } + } + Ok(()) + } + + /// Skip an ExponentPart, if present. + /// + /// ```text + /// ExponentPart :: + /// ExponentIndicator SignedInteger + /// + /// ExponentIndicator :: one of + /// `e` `E` + /// + /// SignedInteger :: + /// DecimalDigits + /// `+` DecimalDigits + /// `-` DecimalDigits + /// ``` + fn optional_exponent(&mut self) -> Result<'alloc, bool> { + if let Some('e') | Some('E') = self.peek() { + self.chars.next(); + self.decimal_exponent()?; + return Ok(true); + } + + Ok(false) + } + + fn decimal_exponent(&mut self) -> Result<'alloc, ()> { + if let Some('+') | Some('-') = self.peek() { + self.chars.next(); + } + + self.decimal_digits()?; + + Ok(()) + } + + /// ```text + /// HexDigit :: one of + /// `0` `1` `2` `3` `4` `5` `6` `7` `8` `9` `a` `b` `c` `d` `e` `f` `A` `B` `C` `D` `E` `F` + /// ``` + fn hex_digit(&mut self) -> Result<'alloc, u32> { + match self.chars.next() { + None => Err(ParseError::InvalidEscapeSequence.into()), + Some(c @ '0'..='9') => Ok(c as u32 - '0' as u32), + Some(c @ 'a'..='f') => Ok(10 + (c as u32 - 'a' as u32)), + Some(c @ 'A'..='F') => Ok(10 
+ (c as u32 - 'A' as u32)), + Some(other) => Err(ParseError::IllegalCharacter(other).into()), + } + } + + fn code_point_to_char(value: u32) -> Result<'alloc, char> { + if 0xd800 <= value && value <= 0xdfff { + Err(ParseError::NotImplemented("unicode escape sequences (surrogates)").into()) + } else { + char::try_from(value).map_err(|_| ParseError::InvalidEscapeSequence.into()) + } + } + + /// ```text + /// Hex4Digits :: + /// HexDigit HexDigit HexDigit HexDigit + /// ``` + fn hex_4_digits(&mut self) -> Result<'alloc, char> { + let mut value = 0; + for _ in 0..4 { + value = (value << 4) | self.hex_digit()?; + } + Self::code_point_to_char(value) + } + + /// ```text + /// CodePoint :: + /// HexDigits but only if MV of HexDigits ≤ 0x10FFFF + /// + /// HexDigits :: + /// HexDigit + /// HexDigits HexDigit + /// ``` + fn code_point(&mut self) -> Result<'alloc, char> { + let mut value = self.hex_digit()?; + + loop { + let next = match self.peek() { + None => { + return Err(ParseError::InvalidEscapeSequence.into()); + } + Some(c @ '0'..='9') => c as u32 - '0' as u32, + Some(c @ 'a'..='f') => 10 + (c as u32 - 'a' as u32), + Some(c @ 'A'..='F') => 10 + (c as u32 - 'A' as u32), + Some(_) => break, + }; + self.chars.next(); + value = (value << 4) | next; + if value > 0x10FFFF { + return Err(ParseError::InvalidEscapeSequence.into()); + } + } + + Self::code_point_to_char(value) + } + + /// Scan a NumericLiteral (defined in 11.8.3, extended by B.1.1) after + /// having already consumed the first character, which was `0`. + /// + /// ```text + /// NumericLiteral :: + /// DecimalLiteral + /// DecimalBigIntegerLiteral + /// NonDecimalIntegerLiteral + /// NonDecimalIntegerLiteral BigIntLiteralSuffix + /// + /// DecimalBigIntegerLiteral :: + /// `0` BigIntLiteralSuffix + /// NonZeroDigit DecimalDigits? 
BigIntLiteralSuffix + /// + /// NonDecimalIntegerLiteral :: + /// BinaryIntegerLiteral + /// OctalIntegerLiteral + /// HexIntegerLiteral + /// + /// BigIntLiteralSuffix :: + /// `n` + /// ``` + fn numeric_literal_starting_with_zero(&mut self) -> Result<'alloc, NumericResult> { + let mut base = NumericLiteralBase::Decimal; + match self.peek() { + // BinaryIntegerLiteral :: + // `0b` BinaryDigits + // `0B` BinaryDigits + // + // BinaryDigits :: + // BinaryDigit + // BinaryDigits NumericLiteralSeparator? BinaryDigit + // + // BinaryDigit :: one of + // `0` `1` + Some('b') | Some('B') => { + self.chars.next(); + + base = NumericLiteralBase::Binary; + + if let Some('0'..='1') = self.peek() { + self.chars.next(); + } else { + return Err(self.unexpected_err().into()); + } + + while let Some(next) = self.peek() { + match next { + '_' => { + self.chars.next(); + + if let Some('0'..='1') = self.peek() { + self.chars.next(); + } else { + return Err(self.unexpected_err().into()); + } + } + '0'..='1' => { + self.chars.next(); + } + _ => break, + } + } + + if let Some('n') = self.peek() { + self.chars.next(); + self.check_after_numeric_literal()?; + return Ok(NumericResult::BigInt { base }); + } + } + + // OctalIntegerLiteral :: + // `0o` OctalDigits + // `0O` OctalDigits + // + // OctalDigits :: + // OctalDigit + // OctalDigits NumericLiteralSeparator? 
OctalDigit + // + // OctalDigit :: one of + // `0` `1` `2` `3` `4` `5` `6` `7` + // + Some('o') | Some('O') => { + self.chars.next(); + + base = NumericLiteralBase::Octal; + + if let Some('0'..='7') = self.peek() { + self.chars.next(); + } else { + return Err(self.unexpected_err().into()); + } + + while let Some(next) = self.peek() { + match next { + '_' => { + self.chars.next(); + + if let Some('0'..='7') = self.peek() { + self.chars.next(); + } else { + return Err(self.unexpected_err().into()); + } + } + '0'..='7' => { + self.chars.next(); + } + _ => break, + } + } + + if let Some('n') = self.peek() { + self.chars.next(); + self.check_after_numeric_literal()?; + return Ok(NumericResult::BigInt { base }); + } + } + + // HexIntegerLiteral :: + // `0x` HexDigits + // `0X` HexDigits + // + // HexDigits :: + // HexDigit + // HexDigits NumericLiteralSeparator? HexDigit + // + // HexDigit :: one of + // `0` `1` `2` `3` `4` `5` `6` `7` `8` `9` `a` `b` `c` `d` `e` `f` `A` `B` `C` `D` `E` `F` + Some('x') | Some('X') => { + self.chars.next(); + + base = NumericLiteralBase::Hex; + + if let Some('0'..='9') | Some('a'..='f') | Some('A'..='F') = self.peek() { + self.chars.next(); + } else { + return Err(self.unexpected_err().into()); + } + + while let Some(next) = self.peek() { + match next { + '_' => { + self.chars.next(); + + if let Some('0'..='9') | Some('a'..='f') | Some('A'..='F') = self.peek() + { + self.chars.next(); + } else { + return Err(self.unexpected_err().into()); + } + } + '0'..='9' | 'a'..='f' | 'A'..='F' => { + self.chars.next(); + } + _ => break, + } + } + + if let Some('n') = self.peek() { + self.chars.next(); + self.check_after_numeric_literal()?; + return Ok(NumericResult::BigInt { base }); + } + } + + Some('.') => { + self.chars.next(); + return self.decimal_literal_after_decimal_point_after_digits(); + } + + Some('e') | Some('E') => { + self.chars.next(); + self.decimal_exponent()?; + return Ok(NumericResult::Float); + } + + Some('n') => { + 
self.chars.next(); + self.check_after_numeric_literal()?; + return Ok(NumericResult::BigInt { base }); + } + + Some('0'..='9') => { + // This is almost always the token `0` in practice. + // + // In nonstrict code, as a legacy feature, other numbers + // starting with `0` are allowed. If /0[0-7]+/ matches, it's a + // LegacyOctalIntegerLiteral; but if we see an `8` or `9` in + // the number, it's decimal. Decimal numbers can have a decimal + // point and/or ExponentPart; octals can't. + // + // Neither is allowed with a BigIntLiteralSuffix `n`. + // + // LegacyOctalIntegerLiteral :: + // `0` OctalDigit + // LegacyOctalIntegerLiteral OctalDigit + // + // NonOctalDecimalIntegerLiteral :: + // `0` NonOctalDigit + // LegacyOctalLikeDecimalIntegerLiteral NonOctalDigit + // NonOctalDecimalIntegerLiteral DecimalDigit + // + // LegacyOctalLikeDecimalIntegerLiteral :: + // `0` OctalDigit + // LegacyOctalLikeDecimalIntegerLiteral OctalDigit + // + // NonOctalDigit :: one of + // `8` `9` + // + + // TODO: implement `strict_mode` check + // let strict_mode = true; + // if !strict_mode { + // // TODO: Distinguish between Octal and NonOctalDecimal. + // // TODO: Support NonOctalDecimal followed by a decimal + // // point and/or ExponentPart. + // self.decimal_digits()?; + // } + return Err(ParseError::NotImplemented("LegacyOctalIntegerLiteral").into()); + } + + _ => {} + } + + self.check_after_numeric_literal()?; + Ok(NumericResult::Int { base }) + } + + /// Scan a NumericLiteral (defined in 11.8.3, extended by B.1.1) after + /// having already consumed the first character, which is a decimal digit. + fn decimal_literal_after_first_digit(&mut self) -> Result<'alloc, NumericResult> { + // DecimalLiteral :: + // DecimalIntegerLiteral `.` DecimalDigits? ExponentPart? + // `.` DecimalDigits ExponentPart? + // DecimalIntegerLiteral ExponentPart? 
//
        // DecimalIntegerLiteral ::
        //     `0`   #see `numeric_literal_starting_with_zero`
        //     NonZeroDigit
        //     NonZeroDigit NumericLiteralSeparator? DecimalDigits
        //     NonOctalDecimalIntegerLiteral  #see `numeric_literal_
        //                                    #     starting_with_zero`
        //
        // NonZeroDigit :: one of
        //     `1` `2` `3` `4` `5` `6` `7` `8` `9`

        self.decimal_digits_after_first_digit()?;

        let next = self.peek();
        if next == Some('.') {
            self.chars.next();
            return self.decimal_literal_after_decimal_point_after_digits();
        }
        if next == Some('n') {
            self.chars.next();
            self.check_after_numeric_literal()?;
            return Ok(NumericResult::BigInt {
                base: NumericLiteralBase::Decimal,
            });
        }

        let has_exponent = self.optional_exponent()?;
        self.check_after_numeric_literal()?;

        // An exponent makes the literal a float; otherwise it is an integer.
        if has_exponent {
            Ok(NumericResult::Float)
        } else {
            Ok(NumericResult::Int {
                base: NumericLiteralBase::Decimal,
            })
        }
    }

    /// Scan the rest of a DecimalLiteral that started with `.`, having already
    /// consumed the decimal point. At least one digit is required.
    fn decimal_literal_after_decimal_point(&mut self) -> Result<'alloc, NumericResult> {
        // The parts after `.` in
        //
        //     `.` DecimalDigits ExponentPart?
        self.decimal_digits()?;
        self.optional_exponent()?;
        self.check_after_numeric_literal()?;

        Ok(NumericResult::Float)
    }

    /// Scan the rest of a DecimalLiteral after its integer part and decimal
    /// point have been consumed. The fractional digits are optional here.
    fn decimal_literal_after_decimal_point_after_digits(
        &mut self,
    ) -> Result<'alloc, NumericResult> {
        // The parts after `.` in
        //
        // DecimalLiteral ::
        //     DecimalIntegerLiteral `.` DecimalDigits? ExponentPart?
        self.optional_decimal_digits()?;
        self.optional_exponent()?;
        self.check_after_numeric_literal()?;

        Ok(NumericResult::Float)
    }

    /// Reject a numeric literal that is directly followed by a character that
    /// may not follow it. Nothing is consumed.
    fn check_after_numeric_literal(&self) -> Result<'alloc, ()> {
        // The SourceCharacter immediately following a
        // NumericLiteral must not be an IdentifierStart or
        // DecimalDigit. (11.8.3)
        match self.peek() {
            Some(ch) if is_identifier_start(ch) || ch.is_digit(10) => {
                Err(ParseError::IllegalCharacter(ch).into())
            }
            _ => Ok(()),
        }
    }

    // ------------------------------------------------------------------------
    // 11.8.4 String Literals (as extended by B.1.2)

    /// Scan a LineContinuation or EscapeSequence in a string literal, having
    /// already consumed the initial backslash character. The character the
    /// escape denotes, if any, is appended to `text`.
    ///
    /// ```text
    /// LineContinuation ::
    ///     `\` LineTerminatorSequence
    ///
    /// EscapeSequence ::
    ///     CharacterEscapeSequence
    ///     (in strict mode code) `0` [lookahead ∉ DecimalDigit]
    ///     (in non-strict code) LegacyOctalEscapeSequence
    ///     HexEscapeSequence
    ///     UnicodeEscapeSequence
    ///
    /// CharacterEscapeSequence ::
    ///     SingleEscapeCharacter
    ///     NonEscapeCharacter
    ///
    /// SingleEscapeCharacter :: one of
    ///     `'` `"` `\` `b` `f` `n` `r` `t` `v`
    ///
    /// LegacyOctalEscapeSequence ::
    ///     OctalDigit [lookahead ∉ OctalDigit]
    ///     ZeroToThree OctalDigit [lookahead ∉ OctalDigit]
    ///     FourToSeven OctalDigit
    ///     ZeroToThree OctalDigit OctalDigit
    ///
    /// ZeroToThree :: one of
    ///     `0` `1` `2` `3`
    ///
    /// FourToSeven :: one of
    ///     `4` `5` `6` `7`
    /// ```
    fn escape_sequence(&mut self, text: &mut String<'alloc>) -> Result<'alloc, ()> {
        match self.chars.next() {
            None => {
                return Err(ParseError::UnterminatedString.into());
            }

            Some(LF) | Some(LS) | Some(PS) => {
                // LineContinuation. Ignore it.
                //
                // Don't set is_on_new_line because this LineContinuation
                // has no bearing on whether the current string literal was
                // the first token on the line where it started.
            }

            Some(CR) => {
                // LineContinuation. Check for the sequence \r\n; otherwise
                // ignore it.
                if self.peek() == Some(LF) {
                    self.chars.next();
                }
            }

            Some(c @ '\'') | Some(c @ '"') | Some(c @ '\\') => text.push(c),

            Some('b') => text.push('\u{8}'),
            Some('f') => text.push(FF),
            Some('n') => text.push(LF),
            Some('r') => text.push(CR),
            Some('t') => text.push(TAB),
            Some('v') => text.push(VT),

            Some('x') => {
                // HexEscapeSequence ::
                //     `x` HexDigit HexDigit
                let high = self.hex_digit()?;
                let low = self.hex_digit()?;
                match char::try_from((high << 4) | low) {
                    Ok(c) => text.push(c),
                    Err(_) => return Err(ParseError::InvalidEscapeSequence.into()),
                }
            }

            Some('u') => {
                let c = self.unicode_escape_sequence_after_backslash_and_u()?;
                text.push(c);
            }

            Some('0') => {
                // In strict mode code and in template literals, the
                // relevant production is
                //
                //     EscapeSequence ::
                //         `0` [lookahead <! DecimalDigit]
                //
                // In non-strict StringLiterals, `\0` begins a
                // LegacyOctalEscapeSequence which may contain more digits.
                match self.peek() {
                    Some('0'..='7') => {
                        return Err(ParseError::NotImplemented(
                            "legacy octal escape sequence in string",
                        )
                        .into());
                    }
                    Some('8'..='9') => {
                        return Err(ParseError::NotImplemented(
                            "digit immediately following \\0 escape sequence",
                        )
                        .into());
                    }
                    _ => text.push('\0'),
                }
            }

            Some('1'..='7') => {
                return Err(
                    ParseError::NotImplemented("legacy octal escape sequence in string").into(),
                );
            }

            Some(other) => {
                // "\8" and "\9" are invalid per spec, but SpiderMonkey and
                // V8 accept them, and JSC accepts them in non-strict mode.
                // "\8" is "8" and "\9" is "9".
                text.push(other);
            }
        }
        Ok(())
    }

    /// Scan a string literal, having already consumed the starting quote
    /// character `delimiter`.
    ///
    /// ```text
    /// StringLiteral ::
    ///     `"` DoubleStringCharacters? `"`
    ///     `'` SingleStringCharacters? `'`
    ///
    /// DoubleStringCharacters ::
    ///     DoubleStringCharacter DoubleStringCharacters?
    ///
    /// SingleStringCharacters ::
    ///     SingleStringCharacter SingleStringCharacters?
    ///
    /// DoubleStringCharacter ::
    ///     SourceCharacter but not one of `"` or `\` or LineTerminator
    ///     <LS>
    ///     <PS>
    ///     `\` EscapeSequence
    ///     LineContinuation
    ///
    /// SingleStringCharacter ::
    ///     SourceCharacter but not one of `'` or `\` or LineTerminator
    ///     <LS>
    ///     <PS>
    ///     `\` EscapeSequence
    ///     LineContinuation
    /// ```
    ///
    /// End of input, `\r` or `\n` before the closing quote yields
    /// `ParseError::UnterminatedString`.
    fn string_literal(&mut self, delimiter: char) -> Result<'alloc, ()> {
        // The opening quote has already been consumed; include it in the span.
        let start = self.offset() - 1;
        let mut builder = AutoCow::new(&self);
        loop {
            match self.chars.next() {
                None | Some('\r') | Some('\n') => {
                    return Err(ParseError::UnterminatedString.into());
                }

                // Closing quote: finish the token.
                Some(c @ '"') | Some(c @ '\'') if c == delimiter => {
                    let text = builder.finish_without_push(&self);
                    let value = self.string_to_token_value(text);
                    let loc = SourceLocation::new(start, self.offset());
                    return self.set_result(TerminalId::StringLiteral, loc, value);
                }

                Some('\\') => {
                    let text = builder.get_mut_string_without_current_ascii_char(&self);
                    self.escape_sequence(text)?;
                }

                Some(other) => {
                    // Any other character, including the quote character that
                    // is not the delimiter, is part of the string as written.
                    //
                    // NonEscapeCharacter ::
                    //     SourceCharacter but not one of EscapeCharacter or LineTerminator
                    //
                    // EscapeCharacter ::
                    //     SingleEscapeCharacter
                    //     DecimalDigit
                    //     `x`
                    //     `u`
                    builder.push_matching(other);
                }
            }
        }
    }

    // ------------------------------------------------------------------------
    // 11.8.5 Regular Expression Literals

    /// Consume the character following a backslash inside a regular
    /// expression literal. End of input or a line terminator yields
    /// `ParseError::UnterminatedRegExp`.
    fn regular_expression_backslash_sequence(&mut self) -> Result<'alloc, ()> {
        match self.chars.next() {
            Some(c) if c != CR && c != LF && c != LS && c != PS => Ok(()),
            _ => Err(ParseError::UnterminatedRegExp.into()),
        }
    }

    // See 12.2.8 and 11.8.5 sections.
+ fn regular_expression_literal(&mut self, builder: &mut AutoCow<'alloc>) -> Result<'alloc, ()> { + let offset = self.offset(); + + loop { + match self.chars.next() { + None | Some(CR) | Some(LF) | Some(LS) | Some(PS) => { + return Err(ParseError::UnterminatedRegExp.into()); + } + Some('/') => { + break; + } + Some('[') => { + // RegularExpressionClass. + loop { + match self.chars.next() { + None | Some(CR) | Some(LF) | Some(LS) | Some(PS) => { + return Err(ParseError::UnterminatedRegExp.into()); + } + Some(']') => { + break; + } + Some('\\') => { + self.regular_expression_backslash_sequence()?; + } + Some(_) => {} + } + } + } + Some('\\') => { + self.regular_expression_backslash_sequence()?; + } + Some(_) => {} + } + } + let mut flag_text = AutoCow::new(&self); + while let Some(ch) = self.peek() { + match ch { + '$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9' => { + self.chars.next(); + flag_text.push_matching(ch); + } + _ => break, + } + } + + // 12.2.8.2.1 Assert literal is a RegularExpressionLiteral. + let literal = builder.finish(&self); + + // 12.2.8.2.2 Check that only gimsuy flags are mentioned at most once. + let gimsuy_mask: u32 = ['g', 'i', 'm', 's', 'u', 'y'] + .iter() + .map(|x| 1 << ((*x as u8) - ('a' as u8))) + .sum(); + let mut flag_text_set: u32 = 0; + for ch in flag_text.finish(&self).chars() { + if !ch.is_ascii_lowercase() { + return Err(ParseError::NotImplemented( + "Unexpected flag in regular expression literal", + ) + .into()); + } + let ch_mask = 1 << ((ch as u8) - ('a' as u8)); + if ch_mask & gimsuy_mask == 0 { + return Err(ParseError::NotImplemented( + "Unexpected flag in regular expression literal", + ) + .into()); + } + if flag_text_set & ch_mask != 0 { + return Err(ParseError::NotImplemented( + "Flag is mentioned twice in regular expression literal", + ) + .into()); + } + flag_text_set |= ch_mask; + } + + // TODO: 12.2.8.2.4 and 12.2.8.2.5 Check that the body matches the + // grammar defined in 21.2.1. 
+ + let value = self.slice_to_token_value(literal); + self.set_result( + TerminalId::RegularExpressionLiteral, + SourceLocation::new(offset, self.offset()), + value, + ) + } + + // ------------------------------------------------------------------------ + // 11.8.6 Template Literal Lexical Components + + /// Parse a template literal component token, having already consumed the + /// starting `` ` `` or `}` character. On success, the `id` of the returned + /// `Token` is `subst` (if the token ends with `${`) or `tail` (if the + /// token ends with `` ` ``). + /// + /// ```text + /// NoSubstitutionTemplate :: + /// ``` TemplateCharacters? ``` + /// + /// TemplateHead :: + /// ``` TemplateCharacters? `${` + /// + /// TemplateMiddle :: + /// `}` TemplateCharacters? `${` + /// + /// TemplateTail :: + /// `}` TemplateCharacters? ``` + /// + /// TemplateCharacters :: + /// TemplateCharacter TemplateCharacters? + /// ``` + fn template_part( + &mut self, + start: usize, + subst: TerminalId, + tail: TerminalId, + ) -> Result<'alloc, ()> { + let mut builder = AutoCow::new(&self); + while let Some(ch) = self.chars.next() { + // TemplateCharacter :: + // `$` [lookahead != `{` ] + // `\` EscapeSequence + // `\` NotEscapeSequence + // LineContinuation + // LineTerminatorSequence + // SourceCharacter but not one of ``` or `\` or `$` or LineTerminator + // + // NotEscapeSequence :: + // `0` DecimalDigit + // DecimalDigit but not `0` + // `x` [lookahead <! HexDigit] + // `x` HexDigit [lookahead <! HexDigit] + // `u` [lookahead <! HexDigit] [lookahead != `{`] + // `u` HexDigit [lookahead <! HexDigit] + // `u` HexDigit HexDigit [lookahead <! HexDigit] + // `u` HexDigit HexDigit HexDigit [lookahead <! HexDigit] + // `u` `{` [lookahead <! HexDigit] + // `u` `{` NotCodePoint [lookahead <! HexDigit] + // `u` `{` CodePoint [lookahead <! 
HexDigit] [lookahead != `}`] + // + // NotCodePoint :: + // HexDigits [> but only if MV of |HexDigits| > 0x10FFFF ] + // + // CodePoint :: + // HexDigits [> but only if MV of |HexDigits| ≤ 0x10FFFF ] + if ch == '$' && self.peek() == Some('{') { + self.chars.next(); + let value = self.string_to_token_value(builder.finish_without_push(&self)); + return self.set_result(subst, SourceLocation::new(start, self.offset()), value); + } + if ch == '`' { + let value = self.string_to_token_value(builder.finish_without_push(&self)); + return self.set_result(tail, SourceLocation::new(start, self.offset()), value); + } + // TODO: Support escape sequences. + if ch == '\\' { + let text = builder.get_mut_string_without_current_ascii_char(&self); + self.escape_sequence(text)?; + } else { + builder.push_matching(ch); + } + } + Err(ParseError::UnterminatedString.into()) + } + + fn advance_impl<'parser>(&mut self, parser: &Parser<'parser>) -> Result<'alloc, ()> { + let mut builder = AutoCow::new(&self); + let mut start = self.offset(); + while let Some(c) = self.chars.next() { + match c { + // 11.2 White Space + // + // WhiteSpace :: + // <TAB> + // <VT> + // <FF> + // <SP> + // <NBSP> + // <ZWNBSP> + // <USP> + TAB | + VT | + FF | + SP | + NBSP | + ZWNBSP | + '\u{1680}' | // Ogham space mark (in <USP>) + '\u{2000}' ..= '\u{200a}' | // typesetting spaces (in <USP>) + '\u{202f}' | // Narrow no-break space (in <USP>) + '\u{205f}' | // Medium mathematical space (in <USP>) + '\u{3000}' // Ideographic space (in <USP>) + => { + // TODO - The spec uses <USP> to stand for any character + // with category "Space_Separator" (Zs). New Unicode + // standards may add characters to this set. This should therefore be + // implemented using the Unicode database somehow. 
+ builder = AutoCow::new(&self); + start = self.offset(); + continue; + } + + // 11.3 Line Terminators + // + // LineTerminator :: + // <LF> + // <CR> + // <LS> + // <PS> + LF | CR | LS | PS => { + self.token.is_on_new_line = true; + builder = AutoCow::new(&self); + start = self.offset(); + continue; + } + + '0' => { + let result = self.numeric_literal_starting_with_zero()?; + return Ok(self.numeric_result_to_advance_result(builder.finish(&self), start, result)?); + } + + '1'..='9' => { + let result = self.decimal_literal_after_first_digit()?; + return Ok(self.numeric_result_to_advance_result(builder.finish(&self), start, result)?); + } + + '"' | '\'' => { + return self.string_literal(c); + } + + '`' => { + return self.template_part(start, TerminalId::TemplateHead, TerminalId::NoSubstitutionTemplate); + } + + '!' => match self.peek() { + Some('=') => { + self.chars.next(); + match self.peek() { + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::StrictNotEqual, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::LaxNotEqual, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + } + } + _ => return self.set_result( + TerminalId::LogicalNot, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + }, + + '%' => match self.peek() { + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::RemainderAssign, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::Remainder, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + }, + + '&' => match self.peek() { + Some('&') => { + self.chars.next(); + match self.peek() { + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::LogicalAndAssign, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::LogicalAnd, + 
SourceLocation::new(start, self.offset()), + TokenValue::None, + ) + } + } + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::BitwiseAndAssign, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::BitwiseAnd, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + }, + + '*' => match self.peek() { + Some('*') => { + self.chars.next(); + match self.peek() { + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::ExponentiateAssign, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::Exponentiate, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + } + } + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::MultiplyAssign, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::Star, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + }, + + '+' => match self.peek() { + Some('+') => { + self.chars.next(); + return self.set_result( + TerminalId::Increment, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::AddAssign, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::Plus, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + }, + + '-' => match self.peek() { + Some('-') => { + self.chars.next(); + match self.peek() { + Some('>') if self.token.is_on_new_line => { + // B.1.3 SingleLineHTMLCloseComment + // TODO: Limit this to Script (not Module). 
+ self.skip_single_line_comment(&mut builder); + continue; + } + _ => return self.set_result( + TerminalId::Decrement, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + } + } + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::SubtractAssign, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::Minus, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + }, + + '.' => match self.peek() { + Some('.') => { + self.chars.next(); + match self.peek() { + Some('.') => { + self.chars.next(); + return self.set_result( + TerminalId::Ellipsis, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return Err(ParseError::IllegalCharacter('.').into()), + } + } + Some('0'..='9') => { + let result = self.decimal_literal_after_decimal_point()?; + return Ok(self.numeric_result_to_advance_result(builder.finish(&self), start, result)?); + } + _ => return self.set_result( + TerminalId::Dot, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + }, + + '/' => match self.peek() { + Some('/') => { + // SingleLineComment :: `//` SingleLineCommentChars? 
+ self.chars.next(); + self.skip_single_line_comment(&mut builder); + start = self.offset(); + continue; + } + Some('*') => { + self.chars.next(); + self.skip_multi_line_comment(&mut builder)?; + start = self.offset(); + continue; + } + _ => { + if parser.can_accept_terminal(TerminalId::Divide) { + match self.peek() { + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::DivideAssign, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::Divide, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + } + } + return self.regular_expression_literal(&mut builder); + } + }, + + '}' => { + if parser.can_accept_terminal(TerminalId::TemplateMiddle) { + return self.template_part(start, TerminalId::TemplateMiddle, TerminalId::TemplateTail); + } + return self.set_result( + TerminalId::CloseBrace, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + + '<' => match self.peek() { + Some('<') => { + self.chars.next(); + match self.peek() { + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::LeftShiftAssign, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::LeftShift, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + } + } + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::LessThanOrEqualTo, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + Some('!') if self.is_looking_at("!--") => { + // B.1.3 SingleLineHTMLOpenComment. Note that the above + // `is_looking_at` test peeked ahead at the next three + // characters of input. This lookahead is necessary + // because `x<!--` has a comment but `x<!-y` does not. + // + // TODO: Limit this to Script (not Module). 
+ self.skip_single_line_comment(&mut builder); + start = self.offset(); + continue; + } + _ => return self.set_result( + TerminalId::LessThan, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + }, + + '=' => match self.peek() { + Some('=') => { + self.chars.next(); + match self.peek() { + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::StrictEqual, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::LaxEqual, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + } + } + Some('>') => { + self.chars.next(); + return self.set_result( + TerminalId::Arrow, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::EqualSign, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + }, + + '>' => match self.peek() { + Some('>') => { + self.chars.next(); + match self.peek() { + Some('>') => { + self.chars.next(); + match self.peek() { + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::UnsignedRightShiftAssign, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::UnsignedRightShift, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + } + } + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::SignedRightShiftAssign, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::SignedRightShift, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + } + } + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::GreaterThanOrEqualTo, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::GreaterThan, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + }, + + 
'^' => match self.peek() { + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::BitwiseXorAssign, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::BitwiseXor, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + }, + + '|' => match self.peek() { + Some('|') => { + self.chars.next(); + match self.peek() { + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::LogicalOrAssign, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::LogicalOr, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ) + } + } + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::BitwiseOrAssign, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::BitwiseOr, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + }, + + '?' 
=> match self.peek() { + Some('?') => { + self.chars.next(); + match self.peek() { + Some('=') => { + self.chars.next(); + return self.set_result( + TerminalId::CoalesceAssign, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::Coalesce, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ) + } + } + Some('.') => { + if let Some('0'..='9') = self.double_peek() { + return self.set_result( + TerminalId::QuestionMark, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ) + } + self.chars.next(); + return self.set_result( + TerminalId::OptionalChain, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ); + } + _ => return self.set_result( + TerminalId::QuestionMark, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + } + + '(' => return self.set_result( + TerminalId::OpenParenthesis, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + ')' => return self.set_result( + TerminalId::CloseParenthesis, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + ',' => return self.set_result( + TerminalId::Comma, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + ':' => return self.set_result( + TerminalId::Colon, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + ';' => return self.set_result( + TerminalId::Semicolon, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + '[' => return self.set_result( + TerminalId::OpenBracket, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + ']' => return self.set_result( + TerminalId::CloseBracket, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + '{' => return self.set_result( + TerminalId::OpenBrace, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ), + '~' => return self.set_result( + TerminalId::BitwiseNot, + SourceLocation::new(start, self.offset()), + 
TokenValue::None, + ), + + // Idents + '$' | '_' | 'a'..='z' | 'A'..='Z' => { + builder.push_matching(c); + return self.identifier_tail(start, builder); + } + + '\\' => { + builder.force_allocation_without_current_ascii_char(&self); + + let value = self.unicode_escape_sequence_after_backslash()?; + if !is_identifier_start(value) { + return Err(ParseError::IllegalCharacter(value).into()); + } + builder.push_different(value); + + return self.identifier_tail(start, builder); + } + + '#' => { + if start == 0 { + // https://tc39.es/proposal-hashbang/out.html + // HashbangComment :: + // `#!` SingleLineCommentChars? + if let Some('!') = self.peek() { + self.skip_single_line_comment(&mut builder); + start = self.offset(); + continue; + } + } + + builder.push_matching(c); + return self.private_identifier(start, builder); + } + + other if is_identifier_start(other) => { + builder.push_matching(other); + return self.identifier_tail(start, builder); + } + + other => { + return Err(ParseError::IllegalCharacter(other).into()); + } + } + } + self.set_result( + TerminalId::End, + SourceLocation::new(start, self.offset()), + TokenValue::None, + ) + } + + fn string_to_token_value(&mut self, s: &'alloc str) -> TokenValue { + let index = self.atoms.borrow_mut().insert(s); + TokenValue::Atom(index) + } + + fn slice_to_token_value(&mut self, s: &'alloc str) -> TokenValue { + let index = self.slices.borrow_mut().push(s); + TokenValue::Slice(index) + } + + fn numeric_result_to_advance_result( + &mut self, + s: &'alloc str, + start: usize, + result: NumericResult, + ) -> Result<'alloc, ()> { + let (terminal_id, value) = match result { + NumericResult::Int { base } => { + let n = parse_int(s, base).map_err(|s| ParseError::NotImplemented(s))?; + (TerminalId::NumericLiteral, TokenValue::Number(n)) + } + NumericResult::Float => { + let n = parse_float(s).map_err(|s| ParseError::NotImplemented(s))?; + (TerminalId::NumericLiteral, TokenValue::Number(n)) + } + NumericResult::BigInt { .. 
} => { + // FIXME + (TerminalId::BigIntLiteral, self.string_to_token_value(s)) + } + }; + + self.set_result( + terminal_id, + SourceLocation::new(start, self.offset()), + value, + ) + } +} + +struct AutoCow<'alloc> { + start: &'alloc str, + value: Option<String<'alloc>>, +} + +impl<'alloc> AutoCow<'alloc> { + fn new(lexer: &Lexer<'alloc>) -> Self { + AutoCow { + start: lexer.chars.as_str(), + value: None, + } + } + + // Push a char that matches lexer.chars.next() + fn push_matching(&mut self, c: char) { + if let Some(text) = &mut self.value { + text.push(c); + } + } + + // Push a different character than lexer.chars.next(). + // force_allocation_without_current_ascii_char must be called before this. + fn push_different(&mut self, c: char) { + debug_assert!(self.value.is_some()); + self.value.as_mut().unwrap().push(c) + } + + // Force allocation of a String, excluding the current ASCII character, + // and return the reference to it + fn get_mut_string_without_current_ascii_char<'b>( + &'b mut self, + lexer: &'_ Lexer<'alloc>, + ) -> &'b mut String<'alloc> { + self.force_allocation_without_current_ascii_char(lexer); + self.value.as_mut().unwrap() + } + + // Force allocation of a String, excluding the current ASCII character. + fn force_allocation_without_current_ascii_char(&mut self, lexer: &'_ Lexer<'alloc>) { + if self.value.is_some() { + return; + } + + self.value = Some(String::from_str_in( + &self.start[..self.start.len() - lexer.chars.as_str().len() - 1], + lexer.allocator, + )); + } + + // Check if the string contains a different character, such as an escape + // sequence + fn has_different(&self) -> bool { + self.value.is_some() + } + + fn finish(&mut self, lexer: &Lexer<'alloc>) -> &'alloc str { + match self.value.take() { + Some(arena_string) => arena_string.into_bump_str(), + None => &self.start[..self.start.len() - lexer.chars.as_str().len()], + } + } + + // Just like finish, but without pushing current char. 
+ fn finish_without_push(&mut self, lexer: &Lexer<'alloc>) -> &'alloc str { + match self.value.take() { + Some(arena_string) => arena_string.into_bump_str(), + None => &self.start[..self.start.len() - lexer.chars.as_str().len() - 1], + } + } +} |