//! JavaScript lexer. use crate::numeric_value::{parse_float, parse_int, NumericLiteralBase}; use crate::parser::Parser; use crate::unicode::{is_id_continue, is_id_start}; use ast::arena; use ast::source_atom_set::{CommonSourceAtomSetIndices, SourceAtomSet}; use ast::source_slice_list::SourceSliceList; use ast::SourceLocation; use bumpalo::{collections::String, Bump}; use generated_parser::{ParseError, Result, TerminalId, Token, TokenValue}; use std::cell::RefCell; use std::convert::TryFrom; use std::rc::Rc; use std::str::Chars; pub struct Lexer<'alloc> { allocator: &'alloc Bump, /// Next token to be returned. token: arena::Box<'alloc, Token>, /// Length of the input text, in UTF-8 bytes. source_length: usize, /// Iterator over the remaining not-yet-parsed input. chars: Chars<'alloc>, atoms: Rc>>, slices: Rc>>, } enum NumericResult { Int { base: NumericLiteralBase, }, Float, BigInt { #[allow(dead_code)] base: NumericLiteralBase, }, } impl<'alloc> Lexer<'alloc> { pub fn new( allocator: &'alloc Bump, chars: Chars<'alloc>, atoms: Rc>>, slices: Rc>>, ) -> Lexer<'alloc> { Self::with_offset(allocator, chars, 0, atoms, slices) } /// Create a lexer for a part of a JS script or module. `offset` is the /// total length of all previous parts, in bytes; source locations for /// tokens created by the new lexer start counting from this number. 
pub fn with_offset( allocator: &'alloc Bump, chars: Chars<'alloc>, offset: usize, atoms: Rc>>, slices: Rc>>, ) -> Lexer<'alloc> { let source_length = offset + chars.as_str().len(); let mut token = arena::alloc(allocator, new_token()); token.is_on_new_line = true; Lexer { allocator, token, source_length, chars, atoms, slices, } } fn is_looking_at(&self, s: &str) -> bool { self.chars.as_str().starts_with(s) } pub fn offset(&self) -> usize { self.source_length - self.chars.as_str().len() } fn peek(&self) -> Option { self.chars.as_str().chars().next() } fn double_peek(&self) -> Option { let mut chars = self.chars.as_str().chars(); chars.next(); chars.next() } fn set_result( &mut self, terminal_id: TerminalId, loc: SourceLocation, value: TokenValue, ) -> Result<'alloc, ()> { self.token.terminal_id = terminal_id; self.token.loc = loc; self.token.value = value; Ok(()) } #[inline] pub fn next<'parser>( &mut self, parser: &Parser<'parser>, ) -> Result<'alloc, arena::Box<'alloc, Token>> { let mut next_token = arena::alloc_with(self.allocator, || new_token()); self.advance_impl(parser)?; std::mem::swap(&mut self.token, &mut next_token); Ok(next_token) } fn unexpected_err(&mut self) -> ParseError<'alloc> { if let Some(ch) = self.peek() { ParseError::IllegalCharacter(ch) } else { ParseError::UnexpectedEnd } } } /// Returns an empty token which is meant as a place holder to be mutated later. fn new_token() -> Token { Token::basic_token(TerminalId::End, SourceLocation::default()) } // ---------------------------------------------------------------------------- // 11.1 Unicode Format-Control Characters /// U+200C ZERO WIDTH NON-JOINER, abbreviated in the spec as . /// Specially permitted in identifiers. const ZWNJ: char = '\u{200c}'; /// U+200D ZERO WIDTH JOINER, abbreviated as . /// Specially permitted in identifiers. const ZWJ: char = '\u{200d}'; /// U+FEFF ZERO WIDTH NO-BREAK SPACE, abbreviated . /// Considered a whitespace character in JS. 
const ZWNBSP: char = '\u{feff}'; // ---------------------------------------------------------------------------- // 11.2 White Space /// U+0009 CHARACTER TABULATION, abbreviated . const TAB: char = '\u{9}'; /// U+000B VERTICAL TAB, abbreviated . const VT: char = '\u{b}'; /// U+000C FORM FEED, abbreviated . const FF: char = '\u{c}'; /// U+0020 SPACE, abbreviated . const SP: char = '\u{20}'; /// U+00A0 NON-BREAKING SPACE, abbreviated . const NBSP: char = '\u{a0}'; // ---------------------------------------------------------------------------- // 11.3 Line Terminators /// U+000A LINE FEED, abbreviated in the spec as . const LF: char = '\u{a}'; /// U+000D CARRIAGE RETURN, abbreviated in the spec as . const CR: char = '\u{d}'; /// U+2028 LINE SEPARATOR, abbreviated . const LS: char = '\u{2028}'; /// U+2029 PARAGRAPH SEPARATOR, abbreviated . const PS: char = '\u{2029}'; // ---------------------------------------------------------------------------- // 11.4 Comments // // Comment:: // MultiLineComment // SingleLineComment impl<'alloc> Lexer<'alloc> { /// Skip a *MultiLineComment*. /// /// ```text /// MultiLineComment :: /// `/*` MultiLineCommentChars? `*/` /// /// MultiLineCommentChars :: /// MultiLineNotAsteriskChar MultiLineCommentChars? /// `*` PostAsteriskCommentChars? /// /// PostAsteriskCommentChars :: /// MultiLineNotForwardSlashOrAsteriskChar MultiLineCommentChars? /// `*` PostAsteriskCommentChars? /// /// MultiLineNotAsteriskChar :: /// SourceCharacter but not `*` /// /// MultiLineNotForwardSlashOrAsteriskChar :: /// SourceCharacter but not one of `/` or `*` /// ``` /// /// (B.1.3 splits MultiLineComment into two nonterminals: MultiLineComment /// and SingleLineDelimitedComment. The point of that is to help specify /// that a SingleLineHTMLCloseComment must occur at the start of a line. We /// use `is_on_new_line` for that.) 
///
    /// The opening `/*` is not consumed here: scanning runs from the current
    /// position up to and including the closing `*/`, after which `builder`
    /// is restarted at the new position.
    fn skip_multi_line_comment(&mut self, builder: &mut AutoCow<'alloc>) -> Result<'alloc, ()> {
        while let Some(ch) = self.chars.next() {
            if ch == '*' && self.peek() == Some('/') {
                self.chars.next();
                *builder = AutoCow::new(&self);
                return Ok(());
            }
            // A line terminator inside the comment still puts the following
            // token on a new line.
            if ch == CR || ch == LF || ch == PS || ch == LS {
                self.token.is_on_new_line = true;
            }
        }
        Err(ParseError::UnterminatedMultiLineComment.into())
    }

    /// Skip a *SingleLineComment* and the following *LineTerminatorSequence*,
    /// if any.
    ///
    /// ```text
    /// SingleLineComment ::
    ///     `//` SingleLineCommentChars?
    ///
    /// SingleLineCommentChars ::
    ///     SingleLineCommentChar SingleLineCommentChars?
    ///
    /// SingleLineCommentChar ::
    ///     SourceCharacter but not LineTerminator
    /// ```
    fn skip_single_line_comment(&mut self, builder: &mut AutoCow<'alloc>) {
        // Consume up to and including the first line terminator character.
        for ch in self.chars.by_ref() {
            if ch == CR || ch == LF || ch == LS || ch == PS {
                break;
            }
        }
        *builder = AutoCow::new(&self);
        self.token.is_on_new_line = true;
    }
}

// ----------------------------------------------------------------------------
// 11.6 Names and Keywords

/// True if `c` is a one-character *IdentifierStart*.
///
/// ```text
/// IdentifierStart ::
///     UnicodeIDStart
///     `$`
///     `_`
///     `\` UnicodeEscapeSequence
///
/// UnicodeIDStart ::
///     > any Unicode code point with the Unicode property "ID_Start"
/// ```
fn is_identifier_start(c: char) -> bool {
    // The escaped form is handled by the callers.
    match c {
        '$' | '_' | 'a'..='z' | 'A'..='Z' => true,
        _ if c.is_ascii() => false,
        _ => is_id_start(c),
    }
}

/// True if `c` is a one-character *IdentifierPart*.
///
/// ```text
/// IdentifierPart ::
///     UnicodeIDContinue
///     `$`
///     `\` UnicodeEscapeSequence
///     <ZWNJ>
///     <ZWJ>
///
/// UnicodeIDContinue ::
///     > any Unicode code point with the Unicode property "ID_Continue"
/// ```
fn is_identifier_part(c: char) -> bool {
    // The escaped form is handled by the callers.
    match c {
        '$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9' => true,
        ZWNJ | ZWJ => true,
        _ if c.is_ascii() => false,
        _ => is_id_continue(c),
    }
}

impl<'alloc> Lexer<'alloc> {
    /// Scan the rest of an IdentifierName, having already parsed the initial
    /// IdentifierStart and stored it in `builder`.
    ///
    /// On success, this returns `Ok((has_escapes, str))`, where `has_escapes`
    /// is true if the identifier contained any UnicodeEscapeSequences, and
    /// `str` is the un-escaped IdentifierName, including the IdentifierStart.
    ///
    /// ```text
    /// IdentifierName ::
    ///     IdentifierStart
    ///     IdentifierName IdentifierPart
    /// ```
    fn identifier_name_tail(
        &mut self,
        mut builder: AutoCow<'alloc>,
    ) -> Result<'alloc, (bool, &'alloc str)> {
        while let Some(ch) = self.peek() {
            if is_identifier_part(ch) {
                self.chars.next();
                builder.push_matching(ch);
                continue;
            }
            if ch != '\\' {
                break;
            }

            // `\` UnicodeEscapeSequence: the decoded character differs from
            // the source text, so the builder has to own its buffer.
            self.chars.next();
            builder.force_allocation_without_current_ascii_char(&self);
            let value = self.unicode_escape_sequence_after_backslash()?;
            if !is_identifier_part(value) {
                return Err(ParseError::InvalidEscapeSequence.into());
            }
            builder.push_different(value);
        }
        let has_different = builder.has_different();
        Ok((has_different, builder.finish(&self)))
    }

    /// Scan a whole IdentifierName (start and tail) and return its un-escaped
    /// text.
    fn identifier_name(&mut self, mut builder: AutoCow<'alloc>) -> Result<'alloc, &'alloc str> {
        let first = match self.chars.next() {
            Some(c) => c,
            None => return Err(ParseError::UnexpectedEnd.into()),
        };

        if first == '\\' {
            builder.force_allocation_without_current_ascii_char(&self);
            let value = self.unicode_escape_sequence_after_backslash()?;
            if !is_identifier_start(value) {
                return Err(ParseError::IllegalCharacter(value).into());
            }
            builder.push_different(value);
        } else if is_identifier_start(first) {
            builder.push_matching(first);
        } else {
            return Err(ParseError::IllegalCharacter(first).into());
        }

        let (_has_escapes, name) = self.identifier_name_tail(builder)?;
        Ok(name)
    }

    /// Finish scanning an
/// *IdentifierName* or keyword, having already scanned
    /// the *IdentifierStart* and pushed it to `builder`.
    ///
    /// `start` is the offset of the *IdentifierStart*.
    ///
    /// The lexer doesn't know the syntactic context, so it always identifies
    /// possible keywords. It's up to the parser to understand that, for
    /// example, `TerminalId::If` is not a keyword when it's used as a property
    /// or method name.
    ///
    /// If the source string contains no escape and it matches to possible
    /// keywords (including contextual keywords), the result is corresponding
    /// `TerminalId`. For example, if the source string is "yield", the result
    /// is `TerminalId::Yield`.
    ///
    /// If the source string contains no escape sequence and also it doesn't
    /// match to any possible keywords, the result is `TerminalId::Name`.
    ///
    /// If the source string contains at least one escape sequence,
    /// the result is always `TerminalId::NameWithEscape`, regardless of the
    /// StringValue of it. For example, if the source string is "\u{79}ield",
    /// the result is `TerminalId::NameWithEscape`, and the StringValue is
    /// "yield".
    fn identifier_tail(&mut self, start: usize, builder: AutoCow<'alloc>) -> Result<'alloc, ()> {
        let (has_different, text) = self.identifier_name_tail(builder)?;

        // Pairs a keyword terminal with its pre-interned atom.
        let keyword = |id: TerminalId, atom| (id, TokenValue::Atom(atom));

        // https://tc39.es/ecma262/#sec-keywords-and-reserved-words
        //
        // keywords in the grammar match literal sequences of specific
        // SourceCharacter elements. A code point in a keyword cannot be
        // expressed by a `\` UnicodeEscapeSequence.
        let (id, value) = match text {
            // Always return `NameWithEscape`.
            //
            // Error check against reserved word should be handled in the
            // consumer.
            _ if has_different => (TerminalId::NameWithEscape, self.string_to_token_value(text)),

            "as" => keyword(TerminalId::As, CommonSourceAtomSetIndices::as_()),
            "async" => {
                // Intended mapping, once supported:
                // `TerminalId::Async` / `CommonSourceAtomSetIndices::async_()`.
                return Err(ParseError::NotImplemented(
                    "async cannot be handled in parser due to multiple lookahead",
                )
                .into());
            }
            "await" => {
                // Intended mapping, once supported:
                // `TerminalId::Await` / `CommonSourceAtomSetIndices::await_()`.
                return Err(
                    ParseError::NotImplemented("await cannot be handled in parser").into(),
                );
            }
            "break" => keyword(TerminalId::Break, CommonSourceAtomSetIndices::break_()),
            "case" => keyword(TerminalId::Case, CommonSourceAtomSetIndices::case()),
            "catch" => keyword(TerminalId::Catch, CommonSourceAtomSetIndices::catch()),
            "class" => keyword(TerminalId::Class, CommonSourceAtomSetIndices::class()),
            "const" => keyword(TerminalId::Const, CommonSourceAtomSetIndices::const_()),
            "continue" => keyword(
                TerminalId::Continue,
                CommonSourceAtomSetIndices::continue_(),
            ),
            "debugger" => keyword(
                TerminalId::Debugger,
                CommonSourceAtomSetIndices::debugger(),
            ),
            "default" => keyword(TerminalId::Default, CommonSourceAtomSetIndices::default()),
            "delete" => keyword(TerminalId::Delete, CommonSourceAtomSetIndices::delete()),
            "do" => keyword(TerminalId::Do, CommonSourceAtomSetIndices::do_()),
            "else" => keyword(TerminalId::Else, CommonSourceAtomSetIndices::else_()),
            "enum" => keyword(TerminalId::Enum, CommonSourceAtomSetIndices::enum_()),
            "export" => keyword(TerminalId::Export, CommonSourceAtomSetIndices::export()),
            "extends" => keyword(TerminalId::Extends, CommonSourceAtomSetIndices::extends()),
            "finally" => keyword(TerminalId::Finally, CommonSourceAtomSetIndices::finally()),
            "for" => keyword(TerminalId::For, CommonSourceAtomSetIndices::for_()),
            "from" => keyword(TerminalId::From, CommonSourceAtomSetIndices::from()),
            "function" => keyword(
                TerminalId::Function,
                CommonSourceAtomSetIndices::function(),
            ),
            "get" => keyword(TerminalId::Get, CommonSourceAtomSetIndices::get()),
            "if" => keyword(TerminalId::If, CommonSourceAtomSetIndices::if_()),
            "implements" => keyword(
                TerminalId::Implements,
                CommonSourceAtomSetIndices::implements(),
            ),
            "import" => keyword(TerminalId::Import, CommonSourceAtomSetIndices::import()),
            "in" => keyword(TerminalId::In, CommonSourceAtomSetIndices::in_()),
            "instanceof" => keyword(
                TerminalId::Instanceof,
                CommonSourceAtomSetIndices::instanceof(),
            ),
            "interface" => keyword(
                TerminalId::Interface,
                CommonSourceAtomSetIndices::interface(),
            ),
            "let" => {
                // Intended mapping, once supported:
                // `TerminalId::Let` / `CommonSourceAtomSetIndices::let_()`.
                return Err(ParseError::NotImplemented(
                    "let cannot be handled in parser due to multiple lookahead",
                )
                .into());
            }
            "new" => keyword(TerminalId::New, CommonSourceAtomSetIndices::new_()),
            "of" => keyword(TerminalId::Of, CommonSourceAtomSetIndices::of()),
            "package" => keyword(TerminalId::Package, CommonSourceAtomSetIndices::package()),
            "private" => keyword(TerminalId::Private, CommonSourceAtomSetIndices::private()),
            "protected" => keyword(
                TerminalId::Protected,
                CommonSourceAtomSetIndices::protected(),
            ),
            "public" => keyword(TerminalId::Public, CommonSourceAtomSetIndices::public()),
            "return" => keyword(TerminalId::Return, CommonSourceAtomSetIndices::return_()),
            "set" => keyword(TerminalId::Set, CommonSourceAtomSetIndices::set()),
            "static" => keyword(TerminalId::Static, CommonSourceAtomSetIndices::static_()),
            "super" => keyword(TerminalId::Super, CommonSourceAtomSetIndices::super_()),
            "switch" => keyword(TerminalId::Switch, CommonSourceAtomSetIndices::switch()),
            "target" => keyword(TerminalId::Target, CommonSourceAtomSetIndices::target()),
            "this" => keyword(TerminalId::This, CommonSourceAtomSetIndices::this()),
            "throw" => keyword(TerminalId::Throw, CommonSourceAtomSetIndices::throw()),
            "try" => keyword(TerminalId::Try, CommonSourceAtomSetIndices::try_()),
            "typeof" => keyword(TerminalId::Typeof, CommonSourceAtomSetIndices::typeof_()),
            "var" => keyword(TerminalId::Var, CommonSourceAtomSetIndices::var()),
            "void" => keyword(TerminalId::Void, CommonSourceAtomSetIndices::void()),
            "while" => keyword(TerminalId::While, CommonSourceAtomSetIndices::while_()),
            "with" => keyword(TerminalId::With, CommonSourceAtomSetIndices::with()),
            "yield" => {
                // Intended mapping, once supported:
                // `TerminalId::Yield` / `CommonSourceAtomSetIndices::yield_()`.
                return Err(
                    ParseError::NotImplemented("yield cannot be handled in parser").into(),
                );
            }
            "null" => keyword(
                TerminalId::NullLiteral,
                CommonSourceAtomSetIndices::null(),
            ),
            "true" => keyword(
                TerminalId::BooleanLiteral,
                CommonSourceAtomSetIndices::true_(),
            ),
            "false" => keyword(
                TerminalId::BooleanLiteral,
                CommonSourceAtomSetIndices::false_(),
            ),

            _ => (TerminalId::Name, self.string_to_token_value(text)),
        };

        self.set_result(id, SourceLocation::new(start, self.offset()), value)
    }

    /// Scan the IdentifierName of a PrivateIdentifier and record the token.
    /// `start` is used as the start of the token's source location.
    ///
    /// ```text
    /// PrivateIdentifier::
    ///     `#` IdentifierName
    /// ```
    fn private_identifier(&mut self, start: usize, builder: AutoCow<'alloc>) -> Result<'alloc, ()> {
        let name = self.identifier_name(builder)?;
        let value = self.string_to_token_value(name);
        let loc = SourceLocation::new(start, self.offset());
        self.set_result(TerminalId::PrivateIdentifier, loc, value)
    }

    /// Decode a UnicodeEscapeSequence, having already consumed the backslash.
    ///
    /// ```text
    /// UnicodeEscapeSequence::
    ///     `u` Hex4Digits
    ///     `u{` CodePoint `}`
    /// ```
    fn unicode_escape_sequence_after_backslash(&mut self) -> Result<'alloc, char> {
        match self.chars.next() {
            Some('u') => self.unicode_escape_sequence_after_backslash_and_u(),
            _ => Err(ParseError::InvalidEscapeSequence.into()),
        }
    }

    /// Decode the part of a UnicodeEscapeSequence that follows `\u`.
    fn unicode_escape_sequence_after_backslash_and_u(&mut self) -> Result<'alloc, char> {
        if self.peek() != Some('{') {
            return self.hex_4_digits();
        }

        // `u{` CodePoint `}`
        self.chars.next();
        let value = self.code_point()?;
        match self.chars.next() {
            Some('}') => Ok(value),
            _ => Err(ParseError::InvalidEscapeSequence.into()),
        }
    }
}

impl<'alloc> Lexer<'alloc> {
    // ------------------------------------------------------------------------
    // 11.8.3 Numeric Literals

    /// Advance over decimal digits in the input.
    ///
    /// ```text
    /// NumericLiteralSeparator::
    ///     `_`
    ///
    /// DecimalDigits ::
    ///     DecimalDigit
    ///     DecimalDigits NumericLiteralSeparator? DecimalDigit
    ///
    /// DecimalDigit :: one of
    ///     `0` `1` `2` `3` `4` `5` `6` `7` `8` `9`
    /// ```
    fn decimal_digits(&mut self) -> Result<'alloc, ()> {
        match self.peek() {
            Some('0'..='9') => {
                self.chars.next();
            }
            _ => return Err(self.unexpected_err().into()),
        }
        self.decimal_digits_after_first_digit()
    }

    /// Like `decimal_digits`, but succeeds without consuming anything when
    /// the next character is not a decimal digit.
    fn optional_decimal_digits(&mut self) -> Result<'alloc, ()> {
        if let Some('0'..='9') = self.peek() {
            self.chars.next();
            self.decimal_digits_after_first_digit()?;
        }
        Ok(())
    }

    /// Advance over the rest of a DecimalDigits run. Every `_` separator must
    /// be immediately followed by a digit.
    fn decimal_digits_after_first_digit(&mut self) -> Result<'alloc, ()> {
        loop {
            match self.peek() {
                Some('_') => {
                    self.chars.next();
                    match self.peek() {
                        Some('0'..='9') => {
                            self.chars.next();
                        }
                        _ => return Err(self.unexpected_err().into()),
                    }
                }
                Some('0'..='9') => {
                    self.chars.next();
                }
                _ => return Ok(()),
            }
        }
    }

    /// Skip an ExponentPart, if present.
/// /// ```text /// ExponentPart :: /// ExponentIndicator SignedInteger /// /// ExponentIndicator :: one of /// `e` `E` /// /// SignedInteger :: /// DecimalDigits /// `+` DecimalDigits /// `-` DecimalDigits /// ``` fn optional_exponent(&mut self) -> Result<'alloc, bool> { if let Some('e') | Some('E') = self.peek() { self.chars.next(); self.decimal_exponent()?; return Ok(true); } Ok(false) } fn decimal_exponent(&mut self) -> Result<'alloc, ()> { if let Some('+') | Some('-') = self.peek() { self.chars.next(); } self.decimal_digits()?; Ok(()) } /// ```text /// HexDigit :: one of /// `0` `1` `2` `3` `4` `5` `6` `7` `8` `9` `a` `b` `c` `d` `e` `f` `A` `B` `C` `D` `E` `F` /// ``` fn hex_digit(&mut self) -> Result<'alloc, u32> { match self.chars.next() { None => Err(ParseError::InvalidEscapeSequence.into()), Some(c @ '0'..='9') => Ok(c as u32 - '0' as u32), Some(c @ 'a'..='f') => Ok(10 + (c as u32 - 'a' as u32)), Some(c @ 'A'..='F') => Ok(10 + (c as u32 - 'A' as u32)), Some(other) => Err(ParseError::IllegalCharacter(other).into()), } } fn code_point_to_char(value: u32) -> Result<'alloc, char> { if 0xd800 <= value && value <= 0xdfff { Err(ParseError::NotImplemented("unicode escape sequences (surrogates)").into()) } else { char::try_from(value).map_err(|_| ParseError::InvalidEscapeSequence.into()) } } /// ```text /// Hex4Digits :: /// HexDigit HexDigit HexDigit HexDigit /// ``` fn hex_4_digits(&mut self) -> Result<'alloc, char> { let mut value = 0; for _ in 0..4 { value = (value << 4) | self.hex_digit()?; } Self::code_point_to_char(value) } /// ```text /// CodePoint :: /// HexDigits but only if MV of HexDigits ≤ 0x10FFFF /// /// HexDigits :: /// HexDigit /// HexDigits HexDigit /// ``` fn code_point(&mut self) -> Result<'alloc, char> { let mut value = self.hex_digit()?; loop { let next = match self.peek() { None => { return Err(ParseError::InvalidEscapeSequence.into()); } Some(c @ '0'..='9') => c as u32 - '0' as u32, Some(c @ 'a'..='f') => 10 + (c as u32 - 'a' as u32), 
Some(c @ 'A'..='F') => 10 + (c as u32 - 'A' as u32), Some(_) => break, }; self.chars.next(); value = (value << 4) | next; if value > 0x10FFFF { return Err(ParseError::InvalidEscapeSequence.into()); } } Self::code_point_to_char(value) } /// Scan a NumericLiteral (defined in 11.8.3, extended by B.1.1) after /// having already consumed the first character, which was `0`. /// /// ```text /// NumericLiteral :: /// DecimalLiteral /// DecimalBigIntegerLiteral /// NonDecimalIntegerLiteral /// NonDecimalIntegerLiteral BigIntLiteralSuffix /// /// DecimalBigIntegerLiteral :: /// `0` BigIntLiteralSuffix /// NonZeroDigit DecimalDigits? BigIntLiteralSuffix /// /// NonDecimalIntegerLiteral :: /// BinaryIntegerLiteral /// OctalIntegerLiteral /// HexIntegerLiteral /// /// BigIntLiteralSuffix :: /// `n` /// ``` fn numeric_literal_starting_with_zero(&mut self) -> Result<'alloc, NumericResult> { let mut base = NumericLiteralBase::Decimal; match self.peek() { // BinaryIntegerLiteral :: // `0b` BinaryDigits // `0B` BinaryDigits // // BinaryDigits :: // BinaryDigit // BinaryDigits NumericLiteralSeparator? BinaryDigit // // BinaryDigit :: one of // `0` `1` Some('b') | Some('B') => { self.chars.next(); base = NumericLiteralBase::Binary; if let Some('0'..='1') = self.peek() { self.chars.next(); } else { return Err(self.unexpected_err().into()); } while let Some(next) = self.peek() { match next { '_' => { self.chars.next(); if let Some('0'..='1') = self.peek() { self.chars.next(); } else { return Err(self.unexpected_err().into()); } } '0'..='1' => { self.chars.next(); } _ => break, } } if let Some('n') = self.peek() { self.chars.next(); self.check_after_numeric_literal()?; return Ok(NumericResult::BigInt { base }); } } // OctalIntegerLiteral :: // `0o` OctalDigits // `0O` OctalDigits // // OctalDigits :: // OctalDigit // OctalDigits NumericLiteralSeparator? 
OctalDigit // // OctalDigit :: one of // `0` `1` `2` `3` `4` `5` `6` `7` // Some('o') | Some('O') => { self.chars.next(); base = NumericLiteralBase::Octal; if let Some('0'..='7') = self.peek() { self.chars.next(); } else { return Err(self.unexpected_err().into()); } while let Some(next) = self.peek() { match next { '_' => { self.chars.next(); if let Some('0'..='7') = self.peek() { self.chars.next(); } else { return Err(self.unexpected_err().into()); } } '0'..='7' => { self.chars.next(); } _ => break, } } if let Some('n') = self.peek() { self.chars.next(); self.check_after_numeric_literal()?; return Ok(NumericResult::BigInt { base }); } } // HexIntegerLiteral :: // `0x` HexDigits // `0X` HexDigits // // HexDigits :: // HexDigit // HexDigits NumericLiteralSeparator? HexDigit // // HexDigit :: one of // `0` `1` `2` `3` `4` `5` `6` `7` `8` `9` `a` `b` `c` `d` `e` `f` `A` `B` `C` `D` `E` `F` Some('x') | Some('X') => { self.chars.next(); base = NumericLiteralBase::Hex; if let Some('0'..='9') | Some('a'..='f') | Some('A'..='F') = self.peek() { self.chars.next(); } else { return Err(self.unexpected_err().into()); } while let Some(next) = self.peek() { match next { '_' => { self.chars.next(); if let Some('0'..='9') | Some('a'..='f') | Some('A'..='F') = self.peek() { self.chars.next(); } else { return Err(self.unexpected_err().into()); } } '0'..='9' | 'a'..='f' | 'A'..='F' => { self.chars.next(); } _ => break, } } if let Some('n') = self.peek() { self.chars.next(); self.check_after_numeric_literal()?; return Ok(NumericResult::BigInt { base }); } } Some('.') => { self.chars.next(); return self.decimal_literal_after_decimal_point_after_digits(); } Some('e') | Some('E') => { self.chars.next(); self.decimal_exponent()?; return Ok(NumericResult::Float); } Some('n') => { self.chars.next(); self.check_after_numeric_literal()?; return Ok(NumericResult::BigInt { base }); } Some('0'..='9') => { // This is almost always the token `0` in practice. 
// // In nonstrict code, as a legacy feature, other numbers // starting with `0` are allowed. If /0[0-7]+/ matches, it's a // LegacyOctalIntegerLiteral; but if we see an `8` or `9` in // the number, it's decimal. Decimal numbers can have a decimal // point and/or ExponentPart; octals can't. // // Neither is allowed with a BigIntLiteralSuffix `n`. // // LegacyOctalIntegerLiteral :: // `0` OctalDigit // LegacyOctalIntegerLiteral OctalDigit // // NonOctalDecimalIntegerLiteral :: // `0` NonOctalDigit // LegacyOctalLikeDecimalIntegerLiteral NonOctalDigit // NonOctalDecimalIntegerLiteral DecimalDigit // // LegacyOctalLikeDecimalIntegerLiteral :: // `0` OctalDigit // LegacyOctalLikeDecimalIntegerLiteral OctalDigit // // NonOctalDigit :: one of // `8` `9` // // TODO: implement `strict_mode` check // let strict_mode = true; // if !strict_mode { // // TODO: Distinguish between Octal and NonOctalDecimal. // // TODO: Support NonOctalDecimal followed by a decimal // // point and/or ExponentPart. // self.decimal_digits()?; // } return Err(ParseError::NotImplemented("LegacyOctalIntegerLiteral").into()); } _ => {} } self.check_after_numeric_literal()?; Ok(NumericResult::Int { base }) } /// Scan a NumericLiteral (defined in 11.8.3, extended by B.1.1) after /// having already consumed the first character, which is a decimal digit. fn decimal_literal_after_first_digit(&mut self) -> Result<'alloc, NumericResult> { // DecimalLiteral :: // DecimalIntegerLiteral `.` DecimalDigits? ExponentPart? // `.` DecimalDigits ExponentPart? // DecimalIntegerLiteral ExponentPart? // // DecimalIntegerLiteral :: // `0` #see `numeric_literal_starting_with_zero` // NonZeroDigit // NonZeroDigit NumericLiteralSeparator? 
DecimalDigits // NonOctalDecimalIntegerLiteral #see `numeric_literal_ // # starting_with_zero` // // NonZeroDigit :: one of // `1` `2` `3` `4` `5` `6` `7` `8` `9` self.decimal_digits_after_first_digit()?; match self.peek() { Some('.') => { self.chars.next(); return self.decimal_literal_after_decimal_point_after_digits(); } Some('n') => { self.chars.next(); self.check_after_numeric_literal()?; return Ok(NumericResult::BigInt { base: NumericLiteralBase::Decimal, }); } _ => {} } let has_exponent = self.optional_exponent()?; self.check_after_numeric_literal()?; let result = if has_exponent { NumericResult::Float } else { NumericResult::Int { base: NumericLiteralBase::Decimal, } }; Ok(result) } fn decimal_literal_after_decimal_point(&mut self) -> Result<'alloc, NumericResult> { // The parts after `.` in // // `.` DecimalDigits ExponentPart? self.decimal_digits()?; self.optional_exponent()?; self.check_after_numeric_literal()?; Ok(NumericResult::Float) } fn decimal_literal_after_decimal_point_after_digits( &mut self, ) -> Result<'alloc, NumericResult> { // The parts after `.` in // // DecimalLiteral :: // DecimalIntegerLiteral `.` DecimalDigits? ExponentPart? self.optional_decimal_digits()?; self.optional_exponent()?; self.check_after_numeric_literal()?; Ok(NumericResult::Float) } fn check_after_numeric_literal(&self) -> Result<'alloc, ()> { // The SourceCharacter immediately following a // NumericLiteral must not be an IdentifierStart or // DecimalDigit. (11.8.3) if let Some(ch) = self.peek() { if is_identifier_start(ch) || ch.is_digit(10) { return Err(ParseError::IllegalCharacter(ch).into()); } } Ok(()) } // ------------------------------------------------------------------------ // 11.8.4 String Literals (as extended by B.1.2) /// Scan an LineContinuation or EscapeSequence in a string literal, having /// already consumed the initial backslash character. 
/// /// ```text /// LineContinuation :: /// `\` LineTerminatorSequence /// /// EscapeSequence :: /// CharacterEscapeSequence /// (in strict mode code) `0` [lookahead ∉ DecimalDigit] /// (in non-strict code) LegacyOctalEscapeSequence /// HexEscapeSequence /// UnicodeEscapeSequence /// /// CharacterEscapeSequence :: /// SingleEscapeCharacter /// NonEscapeCharacter /// /// SingleEscapeCharacter :: one of /// `'` `"` `\` `b` `f` `n` `r` `t` `v` /// /// LegacyOctalEscapeSequence :: /// OctalDigit [lookahead ∉ OctalDigit] /// ZeroToThree OctalDigit [lookahead ∉ OctalDigit] /// FourToSeven OctalDigit /// ZeroToThree OctalDigit OctalDigit /// /// ZeroToThree :: one of /// `0` `1` `2` `3` /// /// FourToSeven :: one of /// `4` `5` `6` `7` /// ``` fn escape_sequence(&mut self, text: &mut String<'alloc>) -> Result<'alloc, ()> { match self.chars.next() { None => { return Err(ParseError::UnterminatedString.into()); } Some(c) => match c { LF | LS | PS => { // LineContinuation. Ignore it. // // Don't set is_on_new_line because this LineContinuation // has no bearing on whether the current string literal was // the first token on the line where it started. } CR => { // LineContinuation. Check for the sequence \r\n; otherwise // ignore it. 
if self.peek() == Some(LF) { self.chars.next(); } } '\'' | '"' | '\\' => { text.push(c); } 'b' => { text.push('\u{8}'); } 'f' => { text.push(FF); } 'n' => { text.push(LF); } 'r' => { text.push(CR); } 't' => { text.push(TAB); } 'v' => { text.push(VT); } 'x' => { // HexEscapeSequence :: // `x` HexDigit HexDigit let mut value = self.hex_digit()?; value = (value << 4) | self.hex_digit()?; match char::try_from(value) { Err(_) => { return Err(ParseError::InvalidEscapeSequence.into()); } Ok(c) => { text.push(c); } } } 'u' => { let c = self.unicode_escape_sequence_after_backslash_and_u()?; text.push(c); } '0' => { // In strict mode code and in template literals, the // relevant production is // // EscapeSequence :: // `0` [lookahead { return Err(ParseError::NotImplemented( "legacy octal escape sequence in string", ) .into()); } Some('8'..='9') => { return Err(ParseError::NotImplemented( "digit immediately following \\0 escape sequence", ) .into()); } _ => {} } text.push('\0'); } '1'..='7' => { return Err(ParseError::NotImplemented( "legacy octal escape sequence in string", ) .into()); } other => { // "\8" and "\9" are invalid per spec, but SpiderMonkey and // V8 accept them, and JSC accepts them in non-strict mode. // "\8" is "8" and "\9" is "9". text.push(other); } }, } Ok(()) } /// Scan a string literal, having already consumed the starting quote /// character `delimiter`. /// /// ```text /// StringLiteral :: /// `"` DoubleStringCharacters? `"` /// `'` SingleStringCharacters? `'` /// /// DoubleStringCharacters :: /// DoubleStringCharacter DoubleStringCharacters? /// /// SingleStringCharacters :: /// SingleStringCharacter SingleStringCharacters? 
/// /// DoubleStringCharacter :: /// SourceCharacter but not one of `"` or `\` or LineTerminator /// /// /// `\` EscapeSequence /// LineContinuation /// /// SingleStringCharacter :: /// SourceCharacter but not one of `'` or `\` or LineTerminator /// /// /// `\` EscapeSequence /// LineContinuation /// ``` fn string_literal(&mut self, delimiter: char) -> Result<'alloc, ()> { let offset = self.offset() - 1; let mut builder = AutoCow::new(&self); loop { match self.chars.next() { None | Some('\r') | Some('\n') => { return Err(ParseError::UnterminatedString.into()); } Some(c @ '"') | Some(c @ '\'') => { if c == delimiter { let value = self.string_to_token_value(builder.finish_without_push(&self)); return self.set_result( TerminalId::StringLiteral, SourceLocation::new(offset, self.offset()), value, ); } else { builder.push_matching(c); } } Some('\\') => { let text = builder.get_mut_string_without_current_ascii_char(&self); self.escape_sequence(text)?; } Some(other) => { // NonEscapeCharacter :: // SourceCharacter but not one of EscapeCharacter or LineTerminator // // EscapeCharacter :: // SingleEscapeCharacter // DecimalDigit // `x` // `u` builder.push_matching(other); } } } } // ------------------------------------------------------------------------ // 11.8.5 Regular Expression Literals fn regular_expression_backslash_sequence(&mut self) -> Result<'alloc, ()> { match self.chars.next() { None | Some(CR) | Some(LF) | Some(LS) | Some(PS) => { Err(ParseError::UnterminatedRegExp.into()) } Some(_) => Ok(()), } } // See 12.2.8 and 11.8.5 sections. fn regular_expression_literal(&mut self, builder: &mut AutoCow<'alloc>) -> Result<'alloc, ()> { let offset = self.offset(); loop { match self.chars.next() { None | Some(CR) | Some(LF) | Some(LS) | Some(PS) => { return Err(ParseError::UnterminatedRegExp.into()); } Some('/') => { break; } Some('[') => { // RegularExpressionClass. 
loop { match self.chars.next() { None | Some(CR) | Some(LF) | Some(LS) | Some(PS) => { return Err(ParseError::UnterminatedRegExp.into()); } Some(']') => { break; } Some('\\') => { self.regular_expression_backslash_sequence()?; } Some(_) => {} } } } Some('\\') => { self.regular_expression_backslash_sequence()?; } Some(_) => {} } } let mut flag_text = AutoCow::new(&self); while let Some(ch) = self.peek() { match ch { '$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9' => { self.chars.next(); flag_text.push_matching(ch); } _ => break, } } // 12.2.8.2.1 Assert literal is a RegularExpressionLiteral. let literal = builder.finish(&self); // 12.2.8.2.2 Check that only gimsuy flags are mentioned at most once. let gimsuy_mask: u32 = ['g', 'i', 'm', 's', 'u', 'y'] .iter() .map(|x| 1 << ((*x as u8) - ('a' as u8))) .sum(); let mut flag_text_set: u32 = 0; for ch in flag_text.finish(&self).chars() { if !ch.is_ascii_lowercase() { return Err(ParseError::NotImplemented( "Unexpected flag in regular expression literal", ) .into()); } let ch_mask = 1 << ((ch as u8) - ('a' as u8)); if ch_mask & gimsuy_mask == 0 { return Err(ParseError::NotImplemented( "Unexpected flag in regular expression literal", ) .into()); } if flag_text_set & ch_mask != 0 { return Err(ParseError::NotImplemented( "Flag is mentioned twice in regular expression literal", ) .into()); } flag_text_set |= ch_mask; } // TODO: 12.2.8.2.4 and 12.2.8.2.5 Check that the body matches the // grammar defined in 21.2.1. let value = self.slice_to_token_value(literal); self.set_result( TerminalId::RegularExpressionLiteral, SourceLocation::new(offset, self.offset()), value, ) } // ------------------------------------------------------------------------ // 11.8.6 Template Literal Lexical Components /// Parse a template literal component token, having already consumed the /// starting `` ` `` or `}` character. 
On success, the `id` of the returned /// `Token` is `subst` (if the token ends with `${`) or `tail` (if the /// token ends with `` ` ``). /// /// ```text /// NoSubstitutionTemplate :: /// ``` TemplateCharacters? ``` /// /// TemplateHead :: /// ``` TemplateCharacters? `${` /// /// TemplateMiddle :: /// `}` TemplateCharacters? `${` /// /// TemplateTail :: /// `}` TemplateCharacters? ``` /// /// TemplateCharacters :: /// TemplateCharacter TemplateCharacters? /// ``` fn template_part( &mut self, start: usize, subst: TerminalId, tail: TerminalId, ) -> Result<'alloc, ()> { let mut builder = AutoCow::new(&self); while let Some(ch) = self.chars.next() { // TemplateCharacter :: // `$` [lookahead != `{` ] // `\` EscapeSequence // `\` NotEscapeSequence // LineContinuation // LineTerminatorSequence // SourceCharacter but not one of ``` or `\` or `$` or LineTerminator // // NotEscapeSequence :: // `0` DecimalDigit // DecimalDigit but not `0` // `x` [lookahead but only if MV of |HexDigits| > 0x10FFFF ] // // CodePoint :: // HexDigits [> but only if MV of |HexDigits| ≤ 0x10FFFF ] if ch == '$' && self.peek() == Some('{') { self.chars.next(); let value = self.string_to_token_value(builder.finish_without_push(&self)); return self.set_result(subst, SourceLocation::new(start, self.offset()), value); } if ch == '`' { let value = self.string_to_token_value(builder.finish_without_push(&self)); return self.set_result(tail, SourceLocation::new(start, self.offset()), value); } // TODO: Support escape sequences. 
if ch == '\\' { let text = builder.get_mut_string_without_current_ascii_char(&self); self.escape_sequence(text)?; } else { builder.push_matching(ch); } } Err(ParseError::UnterminatedString.into()) } fn advance_impl<'parser>(&mut self, parser: &Parser<'parser>) -> Result<'alloc, ()> { let mut builder = AutoCow::new(&self); let mut start = self.offset(); while let Some(c) = self.chars.next() { match c { // 11.2 White Space // // WhiteSpace :: // // // // // // // TAB | VT | FF | SP | NBSP | ZWNBSP | '\u{1680}' | // Ogham space mark (in ) '\u{2000}' ..= '\u{200a}' | // typesetting spaces (in ) '\u{202f}' | // Narrow no-break space (in ) '\u{205f}' | // Medium mathematical space (in ) '\u{3000}' // Ideographic space (in ) => { // TODO - The spec uses to stand for any character // with category "Space_Separator" (Zs). New Unicode // standards may add characters to this set. This should therefore be // implemented using the Unicode database somehow. builder = AutoCow::new(&self); start = self.offset(); continue; } // 11.3 Line Terminators // // LineTerminator :: // // // // LF | CR | LS | PS => { self.token.is_on_new_line = true; builder = AutoCow::new(&self); start = self.offset(); continue; } '0' => { let result = self.numeric_literal_starting_with_zero()?; return Ok(self.numeric_result_to_advance_result(builder.finish(&self), start, result)?); } '1'..='9' => { let result = self.decimal_literal_after_first_digit()?; return Ok(self.numeric_result_to_advance_result(builder.finish(&self), start, result)?); } '"' | '\'' => { return self.string_literal(c); } '`' => { return self.template_part(start, TerminalId::TemplateHead, TerminalId::NoSubstitutionTemplate); } '!' 
=> match self.peek() { Some('=') => { self.chars.next(); match self.peek() { Some('=') => { self.chars.next(); return self.set_result( TerminalId::StrictNotEqual, SourceLocation::new(start, self.offset()), TokenValue::None, ); } _ => return self.set_result( TerminalId::LaxNotEqual, SourceLocation::new(start, self.offset()), TokenValue::None, ), } } _ => return self.set_result( TerminalId::LogicalNot, SourceLocation::new(start, self.offset()), TokenValue::None, ), }, '%' => match self.peek() { Some('=') => { self.chars.next(); return self.set_result( TerminalId::RemainderAssign, SourceLocation::new(start, self.offset()), TokenValue::None, ); } _ => return self.set_result( TerminalId::Remainder, SourceLocation::new(start, self.offset()), TokenValue::None, ), }, '&' => match self.peek() { Some('&') => { self.chars.next(); match self.peek() { Some('=') => { self.chars.next(); return self.set_result( TerminalId::LogicalAndAssign, SourceLocation::new(start, self.offset()), TokenValue::None, ); } _ => return self.set_result( TerminalId::LogicalAnd, SourceLocation::new(start, self.offset()), TokenValue::None, ) } } Some('=') => { self.chars.next(); return self.set_result( TerminalId::BitwiseAndAssign, SourceLocation::new(start, self.offset()), TokenValue::None, ); } _ => return self.set_result( TerminalId::BitwiseAnd, SourceLocation::new(start, self.offset()), TokenValue::None, ), }, '*' => match self.peek() { Some('*') => { self.chars.next(); match self.peek() { Some('=') => { self.chars.next(); return self.set_result( TerminalId::ExponentiateAssign, SourceLocation::new(start, self.offset()), TokenValue::None, ); } _ => return self.set_result( TerminalId::Exponentiate, SourceLocation::new(start, self.offset()), TokenValue::None, ), } } Some('=') => { self.chars.next(); return self.set_result( TerminalId::MultiplyAssign, SourceLocation::new(start, self.offset()), TokenValue::None, ); } _ => return self.set_result( TerminalId::Star, SourceLocation::new(start, 
self.offset()), TokenValue::None, ), }, '+' => match self.peek() { Some('+') => { self.chars.next(); return self.set_result( TerminalId::Increment, SourceLocation::new(start, self.offset()), TokenValue::None, ); } Some('=') => { self.chars.next(); return self.set_result( TerminalId::AddAssign, SourceLocation::new(start, self.offset()), TokenValue::None, ); } _ => return self.set_result( TerminalId::Plus, SourceLocation::new(start, self.offset()), TokenValue::None, ), }, '-' => match self.peek() { Some('-') => { self.chars.next(); match self.peek() { Some('>') if self.token.is_on_new_line => { // B.1.3 SingleLineHTMLCloseComment // TODO: Limit this to Script (not Module). self.skip_single_line_comment(&mut builder); continue; } _ => return self.set_result( TerminalId::Decrement, SourceLocation::new(start, self.offset()), TokenValue::None, ), } } Some('=') => { self.chars.next(); return self.set_result( TerminalId::SubtractAssign, SourceLocation::new(start, self.offset()), TokenValue::None, ); } _ => return self.set_result( TerminalId::Minus, SourceLocation::new(start, self.offset()), TokenValue::None, ), }, '.' => match self.peek() { Some('.') => { self.chars.next(); match self.peek() { Some('.') => { self.chars.next(); return self.set_result( TerminalId::Ellipsis, SourceLocation::new(start, self.offset()), TokenValue::None, ); } _ => return Err(ParseError::IllegalCharacter('.').into()), } } Some('0'..='9') => { let result = self.decimal_literal_after_decimal_point()?; return Ok(self.numeric_result_to_advance_result(builder.finish(&self), start, result)?); } _ => return self.set_result( TerminalId::Dot, SourceLocation::new(start, self.offset()), TokenValue::None, ), }, '/' => match self.peek() { Some('/') => { // SingleLineComment :: `//` SingleLineCommentChars? 
self.chars.next(); self.skip_single_line_comment(&mut builder); start = self.offset(); continue; } Some('*') => { self.chars.next(); self.skip_multi_line_comment(&mut builder)?; start = self.offset(); continue; } _ => { if parser.can_accept_terminal(TerminalId::Divide) { match self.peek() { Some('=') => { self.chars.next(); return self.set_result( TerminalId::DivideAssign, SourceLocation::new(start, self.offset()), TokenValue::None, ); } _ => return self.set_result( TerminalId::Divide, SourceLocation::new(start, self.offset()), TokenValue::None, ), } } return self.regular_expression_literal(&mut builder); } }, '}' => { if parser.can_accept_terminal(TerminalId::TemplateMiddle) { return self.template_part(start, TerminalId::TemplateMiddle, TerminalId::TemplateTail); } return self.set_result( TerminalId::CloseBrace, SourceLocation::new(start, self.offset()), TokenValue::None, ); } '<' => match self.peek() { Some('<') => { self.chars.next(); match self.peek() { Some('=') => { self.chars.next(); return self.set_result( TerminalId::LeftShiftAssign, SourceLocation::new(start, self.offset()), TokenValue::None, ); } _ => return self.set_result( TerminalId::LeftShift, SourceLocation::new(start, self.offset()), TokenValue::None, ), } } Some('=') => { self.chars.next(); return self.set_result( TerminalId::LessThanOrEqualTo, SourceLocation::new(start, self.offset()), TokenValue::None, ); } Some('!') if self.is_looking_at("!--") => { // B.1.3 SingleLineHTMLOpenComment. Note that the above // `is_looking_at` test peeked ahead at the next three // characters of input. This lookahead is necessary // because `x