diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 17:32:43 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 17:32:43 +0000 |
commit | 6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch) | |
tree | a68f146d7fa01f0134297619fbe7e33db084e0aa /modules/libpref/parser/src | |
parent | Initial commit. (diff) | |
download | thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.tar.xz thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.zip |
Adding upstream version 1:115.7.0.upstream/1%115.7.0upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'modules/libpref/parser/src')
-rw-r--r-- | modules/libpref/parser/src/lib.rs | 991 |
1 files changed, 991 insertions, 0 deletions
diff --git a/modules/libpref/parser/src/lib.rs b/modules/libpref/parser/src/lib.rs new file mode 100644 index 0000000000..3a0ad2c777 --- /dev/null +++ b/modules/libpref/parser/src/lib.rs @@ -0,0 +1,991 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +//! This crate implements a prefs file parser. +//! +//! Pref files have the following grammar. Note that there are slight +//! differences between the grammar for a default prefs files and a user prefs +//! file. +//! +//! ```text +//! <pref-file> = <pref>* +//! <pref> = <pref-spec> "(" <pref-name> "," <pref-value> <pref-attrs> ")" ";" +//! <pref-spec> = "user_pref" | "pref" | "sticky_pref" // in default pref files +//! <pref-spec> = "user_pref" // in user pref files +//! <pref-name> = <string-literal> +//! <pref-value> = <string-literal> | "true" | "false" | <int-value> +//! <int-value> = <sign>? <int-literal> +//! <sign> = "+" | "-" +//! <int-literal> = [0-9]+ (and cannot be followed by [A-Za-z_]) +//! <string-literal> = +//! A single or double-quoted string, with the following escape sequences +//! allowed: \", \', \\, \n, \r, \xNN, \uNNNN, where \xNN gives a raw byte +//! value that is copied directly into an 8-bit string value, and \uNNNN +//! gives a UTF-16 code unit that is converted to UTF-8 before being copied +//! into an 8-bit string value. \x00 and \u0000 are disallowed because they +//! would cause C++ code handling such strings to misbehave. +//! <pref-attrs> = ("," <pref-attr>)* // in default pref files +//! = <empty> // in user pref files +//! <pref-attr> = "sticky" | "locked" // default pref files only +//! ``` +//! +//! Comments can take three forms: +//! - `# Python-style comments` +//! - `// C++ style comments` +//! - `/* C style comments (non-nested) */` +//! +//! Non-end-of-line whitespace chars are `\t`, `\v`, `\f`, and space. +//! 
+//! End-of-line sequences can take three forms, each of which is considered as +//! a single EOL: +//! - `\n` +//! - `\r` (without subsequent `\n`) +//! - `\r\n` +//! +//! The valid range for `<int-value>` is -2,147,483,648..2,147,483,647. Values +//! outside that range will result in a parse error. +//! +//! A `\0` char is interpreted as the end of the file. The use of this character +//! in a prefs file is not recommended. Within string literals `\x00` or +//! `\u0000` can be used instead. +//! +//! The parser performs error recovery. On a syntax error, it will scan forward +//! to the next `;` token and then continue parsing. If the syntax error occurs +//! in the middle of a token, it will first finish obtaining the current token +//! in an appropriate fashion. + +// This parser uses several important optimizations. +// +// - Because "`\0` means EOF" is part of the grammar (see above), EOF is +// representable by a u8. If EOF was represented by an out-of-band value such +// as -1 or 256, we'd have to return a larger type such as `u16` or `i16` +// from `get_char()`. +// +// - When starting a new token, it uses a lookup table with the first char, +// which quickly identifies what kind of token it will be. Furthermore, if +// that token is an unambiguous single-char token (e.g. `(`, `)`, `+`, `,`, +// `-`, `;`), the parser will return the appropriate token kind value at +// minimal cost because the single-char tokens have a uniform representation. +// +// - It has a lookup table that identifies chars in string literals that need +// special handling. This means non-special chars (the common case) can be +// handled with a single test, rather than testing for the multiple special +// cases. +// +// - It pre-scans string literals for special chars. If none are present, it +// bulk copies the string literal into a Vec, which is faster than doing a +// char-by-char copy. +// +// - It reuses Vecs to avoid creating a new one for each string literal. 
use std::os::raw::{c_char, c_uchar};

//---------------------------------------------------------------------------
// The public interface
//---------------------------------------------------------------------------

/// The type of a parsed pref value; it says which `PrefValue` field is valid.
///
/// Keep this in sync with PrefType in Preferences.cpp.
#[derive(Clone, Copy, Debug, PartialEq)]
#[repr(u8)]
pub enum PrefType {
    None,
    String,
    Int,
    Bool,
}

/// Whether a pref definition sets a default value or a user value.
///
/// Keep this in sync with PrefValueKind in Preferences.h.
#[derive(Clone, Copy, Debug, PartialEq)]
#[repr(u8)]
pub enum PrefValueKind {
    Default,
    User,
}

/// A parsed pref value. The accompanying `PrefType` passed to the pref
/// callback says which field was written.
///
/// Keep this in sync with PrefValue in Preferences.cpp.
#[repr(C)]
pub union PrefValue {
    pub string_val: *const c_char,
    pub int_val: i32,
    pub bool_val: bool,
}

/// Callback invoked once for each successfully parsed pref.
///
/// Keep this in sync with PrefsParserPrefFn in Preferences.cpp.
type PrefFn = unsafe extern "C" fn(
    pref_name: *const c_char,
    pref_type: PrefType,
    pref_value_kind: PrefValueKind,
    pref_value: PrefValue,
    is_sticky: bool,
    is_locked: bool,
);

/// Callback invoked once for each parse error, with a null-terminated message.
///
/// Keep this in sync with PrefsParserErrorFn in Preferences.cpp.
type ErrorFn = unsafe extern "C" fn(msg: *const c_char);

/// Parse the contents of a prefs file.
///
/// `path` is the null-terminated path of the file; it is only used as a
/// prefix in error messages.
///
/// `kind` says whether this is a default prefs file or a user prefs file.
///
/// `buf` is a null-terminated string. `len` is its length, excluding the
/// null terminator.
///
/// `pref_fn` is called once for each successfully parsed pref.
///
/// `error_fn` is called once for each parse error detected.
///
/// Returns `true` if no parse errors were detected.
///
/// # Safety
///
/// - `path` must point to a valid null-terminated string.
/// - `buf` must point to at least `len + 1` readable bytes.
/// - `pref_fn` and `error_fn` must be safe to call with the pointers the
///   parser hands them; those pointers are only valid during the call.
///
/// # Panics
///
/// Panics if the byte at `buf[len]` is not `\0`.
///
/// Keep this in sync with the prefs_parser_parse() declaration in
/// Preferences.cpp.
#[no_mangle]
pub unsafe extern "C" fn prefs_parser_parse(
    path: *const c_char,
    kind: PrefValueKind,
    buf: *const c_char,
    len: usize,
    pref_fn: PrefFn,
    error_fn: ErrorFn,
) -> bool {
    // SAFETY: the caller guarantees `path` is a valid null-terminated string.
    let path = unsafe { std::ffi::CStr::from_ptr(path) }
        .to_string_lossy()
        .into_owned();

    // Make sure `buf` ends in a '\0', and include that in the length, because
    // it represents EOF.
    //
    // SAFETY: the caller guarantees `buf` points to `len + 1` readable bytes.
    let buf = unsafe { std::slice::from_raw_parts(buf as *const c_uchar, len + 1) };
    assert!(buf.last() == Some(&EOF));

    let mut parser = Parser::new(&path, kind, buf, pref_fn, error_fn);
    parser.parse()
}

//---------------------------------------------------------------------------
// The implementation
//---------------------------------------------------------------------------

#[derive(Clone, Copy, Debug, PartialEq)]
enum Token {
    // Unambiguous single-char tokens.
    SingleChar(u8),

    // Keywords
    Pref,       // pref
    StickyPref, // sticky_pref
    UserPref,   // user_pref
    True,       // true
    False,      // false
    Sticky,     // sticky
    Locked,     // locked

    // String literal, e.g. '"string"'. The value is stored elsewhere.
    String,

    // Unsigned integer literal, e.g. '123'. Although libpref uses i32 values,
    // any '-' and '+' before an integer literal are treated as separate
    // tokens, so these token values are always positive. Furthermore, we
    // tokenize int literals as u32 so that 2147483648 (which doesn't fit into
    // an i32) can be subsequently negated to -2147483648 (which does fit into
    // an i32) if a '-' token precedes it.
    Int(u32),

    // Malformed token.
    Error(&'static str),

    // Malformed token at a particular line number. For use when
    // Parser::line_num might not be the right line number when the error is
    // reported. E.g. if a multi-line string has a bad escape sequence on the
    // first line, we don't report the error until the string's end has been
    // reached.
    ErrorAtLine(&'static str, u32),
}

// We categorize every char by what action should be taken when it appears at
// the start of a new token.
#[derive(Clone, Copy, PartialEq)]
enum CharKind {
    // These are ordered by frequency. See the comment in get_token().
    SingleChar, // Unambiguous single-char tokens: [()+,-;] or EOF
    SpaceNL,    // [\t\v\f \n]
    Keyword,    // [A-Za-z_]
    Quote,      // ["']
    Slash,      // /
    Digit,      // [0-9]
    Hash,       // #
    CR,         // \r
    Other,      // Everything else; invalid except within strings and comments.
}

const C_SINGL: CharKind = CharKind::SingleChar;
const C_SPCNL: CharKind = CharKind::SpaceNL;
const C_KEYWD: CharKind = CharKind::Keyword;
const C_QUOTE: CharKind = CharKind::Quote;
const C_SLASH: CharKind = CharKind::Slash;
const C_DIGIT: CharKind = CharKind::Digit;
const C_HASH_: CharKind = CharKind::Hash;
const C_CR___: CharKind = CharKind::CR;
const C______: CharKind = CharKind::Other;

// Maps each byte value to the action taken when it starts a new token.
#[rustfmt::skip]
const CHAR_KINDS: [CharKind; 256] = [
/*         0        1        2        3        4        5        6        7        8        9    */
/*   0+ */ C_SINGL, C______, C______, C______, C______, C______, C______, C______, C______, C_SPCNL,
/*  10+ */ C_SPCNL, C_SPCNL, C_SPCNL, C_CR___, C______, C______, C______, C______, C______, C______,
/*  20+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/*  30+ */ C______, C______, C_SPCNL, C______, C_QUOTE, C_HASH_, C______, C______, C______, C_QUOTE,
/*  40+ */ C_SINGL, C_SINGL, C______, C_SINGL, C_SINGL, C_SINGL, C______, C_SLASH, C_DIGIT, C_DIGIT,
/*  50+ */ C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C______, C_SINGL,
/*  60+ */ C______, C______, C______, C______, C______, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/*  70+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/*  80+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/*  90+ */ C_KEYWD, C______, C______, C______, C______, C_KEYWD, C______, C_KEYWD, C_KEYWD, C_KEYWD,
/* 100+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/* 110+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/* 120+ */ C_KEYWD, C_KEYWD, C_KEYWD, C______, C______, C______, C______, C______, C______, C______,
/* 130+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 140+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 150+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 160+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 170+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 180+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 190+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 200+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 210+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 220+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 230+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 240+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 250+ */ C______, C______, C______, C______, C______, C______
];

const _______: bool = false;

// Marks the bytes that need special handling inside a string literal:
// '\0' (EOF), '\n', '\r', '"', '\'' and '\\'.
#[rustfmt::skip]
const SPECIAL_STRING_CHARS: [bool; 256] = [
/*         0        1        2        3        4        5        6        7        8        9    */
/*   0+ */    true, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  10+ */    true, _______, _______,    true, _______, _______, _______, _______, _______, _______,
/*  20+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  30+ */ _______, _______, _______, _______,    true, _______, _______, _______, _______,    true,
/*  40+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  50+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  60+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  70+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  80+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  90+ */ _______, _______,    true, _______, _______, _______, _______, _______, _______, _______,
/* 100+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 110+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 120+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 130+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 140+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 150+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 160+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 170+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 180+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 190+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 200+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 210+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 220+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 230+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 240+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 250+ */ _______, _______, _______, _______, _______, _______
];

// Associates a keyword's spelling with the token it produces.
struct KeywordInfo {
    string: &'static [u8],
    token: Token,
}

const KEYWORD_INFOS: [KeywordInfo; 7] = [
    // These are ordered by frequency.
    KeywordInfo {
        string: b"pref",
        token: Token::Pref,
    },
    KeywordInfo {
        string: b"true",
        token: Token::True,
    },
    KeywordInfo {
        string: b"false",
        token: Token::False,
    },
    KeywordInfo {
        string: b"user_pref",
        token: Token::UserPref,
    },
    KeywordInfo {
        string: b"sticky",
        token: Token::Sticky,
    },
    KeywordInfo {
        string: b"locked",
        token: Token::Locked,
    },
    KeywordInfo {
        string: b"sticky_pref",
        token: Token::StickyPref,
    },
];

struct Parser<'t> {
    path: &'t str,       // Path to the file being parsed. Used in error messages.
    kind: PrefValueKind, // Default prefs file or user prefs file?
    buf: &'t [u8],       // Text being parsed.
    i: usize,            // Index of next char to be read.
    line_num: u32,       // Current line number within the text.
    pref_fn: PrefFn,     // Callback for processing each pref.
    error_fn: ErrorFn,   // Callback for parse errors.
    has_errors: bool,    // Have we encountered errors?
}

// As described above, we use 0 to represent EOF.
const EOF: u8 = b'\0';

impl<'t> Parser<'t> {
    // Creates a parser positioned at the start of `buf`, on line 1.
    fn new(
        path: &'t str,
        kind: PrefValueKind,
        buf: &'t [u8],
        pref_fn: PrefFn,
        error_fn: ErrorFn,
    ) -> Parser<'t> {
        // Make sure these tables take up 1 byte per entry.
        assert!(std::mem::size_of_val(&CHAR_KINDS) == 256);
        assert!(std::mem::size_of_val(&SPECIAL_STRING_CHARS) == 256);

        Parser {
            path,
            kind,
            buf,
            i: 0,
            line_num: 1,
            pref_fn,
            error_fn,
            has_errors: false,
        }
    }

    // Combines an unsigned integer literal with its sign. Returns None if the
    // result does not fit in an i32. Note that 2147483648 is only valid when
    // negated.
    fn int_value(magnitude: u32, is_negative: bool) -> Option<i32> {
        // Every u32 and its negation fits in an i64, so this cannot overflow.
        let value = if is_negative {
            -(magnitude as i64)
        } else {
            magnitude as i64
        };
        if value >= i32::MIN as i64 && value <= i32::MAX as i64 {
            Some(value as i32)
        } else {
            None
        }
    }

    // Parses the whole buffer, calling `pref_fn` for every well-formed pref
    // and `error_fn` for every error. Returns true if there were no errors.
    fn parse(&mut self) -> bool {
        // These are reused, because allocating a new Vec for every string is slow.
        let mut name_str = Vec::with_capacity(128); // For pref names.
        let mut value_str = Vec::with_capacity(512); // For string pref values.
        let mut none_str = Vec::new(); // For tokens that shouldn't be strings.

        let mut token = self.get_token(&mut none_str);

        // At the top of the loop we already have a token. In a valid input
        // this will be either the first token of a new pref, or EOF.
        loop {
            // <pref-spec>
            let (pref_value_kind, mut is_sticky) = match token {
                Token::Pref if self.kind == PrefValueKind::Default => {
                    (PrefValueKind::Default, false)
                }
                Token::StickyPref if self.kind == PrefValueKind::Default => {
                    (PrefValueKind::Default, true)
                }
                Token::UserPref => (PrefValueKind::User, false),
                Token::SingleChar(EOF) => return !self.has_errors,
                _ => {
                    token = self.error_and_recover(
                        token,
                        if self.kind == PrefValueKind::Default {
                            "expected pref specifier at start of pref definition"
                        } else {
                            "expected 'user_pref' at start of pref definition"
                        },
                    );
                    continue;
                }
            };

            // "("
            token = self.get_token(&mut none_str);
            if token != Token::SingleChar(b'(') {
                token = self.error_and_recover(token, "expected '(' after pref specifier");
                continue;
            }

            // <pref-name>
            token = self.get_token(&mut name_str);
            let pref_name = if token == Token::String {
                &name_str
            } else {
                token = self.error_and_recover(token, "expected pref name after '('");
                continue;
            };

            // ","
            token = self.get_token(&mut none_str);
            if token != Token::SingleChar(b',') {
                token = self.error_and_recover(token, "expected ',' after pref name");
                continue;
            }

            // <pref-value>
            token = self.get_token(&mut value_str);
            let (pref_type, pref_value) = match token {
                Token::True => (PrefType::Bool, PrefValue { bool_val: true }),
                Token::False => (PrefType::Bool, PrefValue { bool_val: false }),
                Token::String => (
                    PrefType::String,
                    PrefValue {
                        string_val: value_str.as_ptr() as *const c_char,
                    },
                ),
                Token::Int(u) => match Parser::int_value(u, false) {
                    Some(v) => (PrefType::Int, PrefValue { int_val: v }),
                    None => {
                        token =
                            self.error_and_recover(Token::Error("integer literal overflowed"), "");
                        continue;
                    }
                },
                Token::SingleChar(sign) if sign == b'-' || sign == b'+' => {
                    let is_negative = sign == b'-';
                    token = self.get_token(&mut none_str);
                    if let Token::Int(u) = token {
                        match Parser::int_value(u, is_negative) {
                            Some(v) => (PrefType::Int, PrefValue { int_val: v }),
                            None => {
                                token = self.error_and_recover(
                                    Token::Error("integer literal overflowed"),
                                    "",
                                );
                                continue;
                            }
                        }
                    } else {
                        let expected_msg = if is_negative {
                            "expected integer literal after '-'"
                        } else {
                            "expected integer literal after '+'"
                        };
                        token = self.error_and_recover(token, expected_msg);
                        continue;
                    }
                }
                _ => {
                    token = self.error_and_recover(token, "expected pref value after ','");
                    continue;
                }
            };

            // ("," <pref-attr>)* // default pref files only
            let mut is_locked = false;
            let mut has_attrs = false;
            if self.kind == PrefValueKind::Default {
                let ok = loop {
                    // ","
                    token = self.get_token(&mut none_str);
                    if token != Token::SingleChar(b',') {
                        break true;
                    }

                    // <pref-attr>
                    token = self.get_token(&mut none_str);
                    match token {
                        Token::Sticky => is_sticky = true,
                        Token::Locked => is_locked = true,
                        _ => {
                            token =
                                self.error_and_recover(token, "expected pref attribute after ','");
                            break false;
                        }
                    }
                    has_attrs = true;
                };
                if !ok {
                    continue;
                }
            } else {
                token = self.get_token(&mut none_str);
            }

            // ")"
            if token != Token::SingleChar(b')') {
                let expected_msg = if self.kind == PrefValueKind::Default {
                    if has_attrs {
                        "expected ',' or ')' after pref attribute"
                    } else {
                        "expected ',' or ')' after pref value"
                    }
                } else {
                    "expected ')' after pref value"
                };
                token = self.error_and_recover(token, expected_msg);
                continue;
            }

            // ";"
            token = self.get_token(&mut none_str);
            if token != Token::SingleChar(b';') {
                token = self.error_and_recover(token, "expected ';' after ')'");
                continue;
            }

            // SAFETY: `pref_name` and any string value point into `name_str`
            // and `value_str`, which are null-terminated by get_string_token()
            // and are not modified until after the callback returns.
            unsafe {
                (self.pref_fn)(
                    pref_name.as_ptr() as *const c_char,
                    pref_type,
                    pref_value_kind,
                    pref_value,
                    is_sticky,
                    is_locked,
                )
            };

            token = self.get_token(&mut none_str);
        }
    }

    // Reports an error via `error_fn`, then skips tokens until just past the
    // next ';' (or until EOF). Returns the token at which parsing resumes.
    fn error_and_recover(&mut self, token: Token, msg: &str) -> Token {
        self.has_errors = true;

        // If `token` is a Token::{Error,ErrorAtLine}, it's a lexing error and
        // the error message is within `token`. Otherwise, it's a parsing error
        // and the error message is in `msg`.
        let (msg, line_num) = match token {
            Token::Error(token_msg) => (token_msg, self.line_num),
            Token::ErrorAtLine(token_msg, line_num) => (token_msg, line_num),
            _ => (msg, self.line_num),
        };
        let msg = format!("{}:{}: prefs parse error: {}", self.path, line_num, msg);
        // Every message in this file is a NUL-free literal, and the only
        // caller of Parser::new() builds `path` from a C string.
        let msg = std::ffi::CString::new(msg).expect("error message contains a NUL byte");
        // SAFETY: `msg` is a valid null-terminated string that outlives the call.
        unsafe { (self.error_fn)(msg.as_ptr()) };

        // "Panic-mode" recovery: consume tokens until one of the following
        // occurs.
        // - We hit a semicolon, whereupon we return the following token.
        // - We hit EOF, whereupon we return EOF.
        //
        // For this to work, if the lexing functions hit EOF in an error case
        // they must unget it so we can safely reget it here.
        //
        // If the starting token (passed in above) is EOF we must not get
        // another token otherwise we will read past the end of `self.buf`.
        let mut dummy_str = Vec::with_capacity(128);
        let mut token = token;
        loop {
            match token {
                Token::SingleChar(b';') => return self.get_token(&mut dummy_str),
                Token::SingleChar(EOF) => return token,
                _ => {}
            }
            token = self.get_token(&mut dummy_str);
        }
    }

    #[inline(always)]
    fn get_char(&mut self) -> u8 {
        // We do the bounds check ourselves so we can return EOF on failure.
        // (Although the buffer is guaranteed to end in an EOF char, we might
        // go one char past that, whereupon we must return EOF again.)
        if self.i < self.buf.len() {
            // SAFETY: `self.i` was bounds-checked on the line above.
            let c = unsafe { *self.buf.get_unchecked(self.i) };
            self.i += 1;
            c
        } else {
            debug_assert!(self.i == self.buf.len());
            EOF
        }
    }

    // This function skips the bounds check in optimized builds. Using it at
    // the hottest two call sites gives a ~15% parsing speed boost.
    //
    // The caller must guarantee that `self.i < self.buf.len()`.
    #[inline(always)]
    unsafe fn get_char_unchecked(&mut self) -> u8 {
        debug_assert!(self.i < self.buf.len());
        // SAFETY: the caller guarantees `self.i` is in bounds.
        let c = unsafe { *self.buf.get_unchecked(self.i) };
        self.i += 1;
        c
    }

    // Steps back one char so that the next read returns it again.
    #[inline(always)]
    fn unget_char(&mut self) {
        debug_assert!(self.i > 0);
        self.i -= 1;
    }

    // Consumes the next char if it equals `c`. Returns whether it did.
    #[inline(always)]
    fn match_char(&mut self, c: u8) -> bool {
        let matched = self.buf[self.i] == c;
        if matched {
            self.i += 1;
        }
        matched
    }

    // Skips the rest of a `//` or `#` comment, including its end-of-line
    // sequence. A terminating EOF char is consumed as well.
    #[inline(always)]
    fn match_single_line_comment(&mut self) {
        loop {
            // To reach here, the previous char must have been '/' (if this is
            // the first loop iteration) or non-special (if this is the second
            // or subsequent iteration), and assertions elsewhere ensure that
            // there must be at least one subsequent char after those chars
            // (the '\0' for EOF).
            //
            // SAFETY: see above; the loop stops at the first '\0'.
            let c = unsafe { self.get_char_unchecked() };

            // All the special chars have value <= b'\r'.
            if c > b'\r' {
                continue;
            }
            match c {
                b'\n' => {
                    self.line_num += 1;
                    break;
                }
                b'\r' => {
                    self.line_num += 1;
                    self.match_char(b'\n');
                    break;
                }
                EOF => {
                    break;
                }
                _ => continue,
            }
        }
    }

    // Returns false if we hit EOF without closing the comment.
    fn match_multi_line_comment(&mut self) -> bool {
        loop {
            match self.get_char() {
                b'*' => {
                    if self.match_char(b'/') {
                        return true;
                    }
                }
                b'\n' => {
                    self.line_num += 1;
                }
                b'\r' => {
                    self.line_num += 1;
                    self.match_char(b'\n');
                }
                EOF => return false,
                _ => continue,
            }
        }
    }

    // Reads exactly `ndigits` hex digits and returns their value. On the
    // first non-hex char, ungets that char and returns None.
    fn match_hex_digits(&mut self, ndigits: i32) -> Option<u16> {
        debug_assert!(ndigits == 2 || ndigits == 4);
        let mut value: u16 = 0;
        for _ in 0..ndigits {
            value <<= 4;
            match self.get_char() {
                c @ b'0'..=b'9' => value += (c - b'0') as u16,
                c @ b'A'..=b'F' => value += (c - b'A') as u16 + 10,
                c @ b'a'..=b'f' => value += (c - b'a') as u16 + 10,
                _ => {
                    self.unget_char();
                    return None;
                }
            }
        }
        Some(value)
    }

    #[inline(always)]
    fn char_kind(c: u8) -> CharKind {
        // SAFETY: a u8 index cannot exceed this 256-entry table's bounds.
        unsafe { *CHAR_KINDS.get_unchecked(c as usize) }
    }

    #[inline(always)]
    fn is_special_string_char(c: u8) -> bool {
        // SAFETY: a u8 index cannot exceed this 256-entry table's bounds.
        unsafe { *SPECIAL_STRING_CHARS.get_unchecked(c as usize) }
    }

    // If the obtained Token has a value, it is put within the Token, unless
    // it's a string, in which case it's put in `str_buf`. This avoids
    // allocating a new Vec for every string, which is slow.
    fn get_token(&mut self, str_buf: &mut Vec<u8>) -> Token {
        loop {
            // Note: the following tests are ordered by frequency when parsing
            // greprefs.js:
            // - SingleChar      36.7%
            // - SpaceNL         27.7% (14.9% for spaces, 12.8% for NL)
            // - Keyword         13.4%
            // - Quote           11.4%
            // - Slash            8.1%
            // - Digit            2.7%
            // - Hash, CR, Other  0.0%

            let c = self.get_char();
            match Parser::char_kind(c) {
                CharKind::SingleChar => {
                    return Token::SingleChar(c);
                }
                CharKind::SpaceNL => {
                    // It's slightly faster to combine the handling of the
                    // space chars with NL than to handle them separately; we
                    // have an extra test for this case, but one fewer test for
                    // all the subsequent CharKinds.
                    if c == b'\n' {
                        self.line_num += 1;
                    }
                    continue;
                }
                CharKind::Keyword => {
                    let start = self.i - 1;
                    loop {
                        let c = self.get_char();
                        if Parser::char_kind(c) != CharKind::Keyword {
                            self.unget_char();
                            break;
                        }
                    }
                    let word = &self.buf[start..self.i];
                    return KEYWORD_INFOS
                        .iter()
                        .find(|info| info.string == word)
                        .map_or(Token::Error("unknown keyword"), |info| info.token);
                }
                CharKind::Quote => {
                    return self.get_string_token(c, str_buf);
                }
                CharKind::Slash => {
                    match self.get_char() {
                        b'/' => {
                            self.match_single_line_comment();
                        }
                        b'*' => {
                            if !self.match_multi_line_comment() {
                                return Token::Error("unterminated /* comment");
                            }
                        }
                        c => {
                            if c == b'\n' || c == b'\r' {
                                // Unget the newline char; the outer loop will
                                // reget it and adjust self.line_num
                                // appropriately.
                                self.unget_char();
                            }
                            return Token::Error("expected '/' or '*' after '/'");
                        }
                    }
                    continue;
                }
                CharKind::Digit => {
                    // `value` becomes None once the literal overflows a u32,
                    // but we keep consuming digits to finish the token.
                    let mut value = Some((c - b'0') as u32);
                    loop {
                        let c = self.get_char();
                        match Parser::char_kind(c) {
                            CharKind::Digit => {
                                fn add_digit(value: Option<u32>, c: u8) -> Option<u32> {
                                    value?.checked_mul(10)?.checked_add((c - b'0') as u32)
                                }
                                value = add_digit(value, c);
                            }
                            CharKind::Keyword => {
                                // Reject things like "123foo". Error recovery
                                // will retokenize from "foo" onward.
                                self.unget_char();
                                return Token::Error("unexpected character in integer literal");
                            }
                            _ => {
                                self.unget_char();
                                break;
                            }
                        }
                    }
                    return match value {
                        Some(v) => Token::Int(v),
                        None => Token::Error("integer literal overflowed"),
                    };
                }
                CharKind::Hash => {
                    self.match_single_line_comment();
                    continue;
                }
                CharKind::CR => {
                    self.match_char(b'\n');
                    self.line_num += 1;
                    continue;
                }
                // Error recovery will retokenize from the next character.
                _ => return Token::Error("unexpected character"),
            }
        }
    }

    fn string_error_token(&self, token: &mut Token, msg: &'static str) {
        // We only want to capture the first tokenization error within a string.
        if *token == Token::String {
            *token = Token::ErrorAtLine(msg, self.line_num);
        }
    }

    // Lexes a string literal whose opening quote (`quote_char`) has already
    // been consumed. On success the null-terminated, unescaped contents are
    // left in `str_buf` and Token::String is returned.
    //
    // Always inline this because it has a single call site.
    #[inline(always)]
    fn get_string_token(&mut self, quote_char: u8, str_buf: &mut Vec<u8>) -> Token {
        // First scan through the string to see if it contains any chars that
        // need special handling.
        let start = self.i;
        let has_special_chars = loop {
            // To reach here, the previous char must have been a quote
            // (quote_char), and assertions elsewhere ensure that there must be
            // at least one subsequent char (the '\0' for EOF).
            //
            // SAFETY: see above; '\0' is a special char, so the loop stops at
            // or before the buffer's final byte.
            let c = unsafe { self.get_char_unchecked() };
            if Parser::is_special_string_char(c) {
                break c != quote_char;
            }
        };

        // Clear str_buf's contents without changing its capacity.
        str_buf.clear();

        // If there are no special chars (the common case), we can bulk copy it
        // to str_buf. This is a lot faster than the char-by-char loop below.
        if !has_special_chars {
            str_buf.extend(&self.buf[start..self.i - 1]);
            str_buf.push(b'\0');
            return Token::String;
        }

        // There were special chars. Re-scan the string, filling in str_buf one
        // char at a time.
        //
        // On error, we change `token` to an error token and then keep going to
        // the end of the string literal. `str_buf` won't be used in that case.
        self.i = start;
        let mut token = Token::String;

        loop {
            let c = self.get_char();
            let c2 = if !Parser::is_special_string_char(c) {
                c
            } else if c == quote_char {
                break;
            } else if c == b'\\' {
                match self.get_char() {
                    b'\"' => b'\"',
                    b'\'' => b'\'',
                    b'\\' => b'\\',
                    b'n' => b'\n',
                    b'r' => b'\r',
                    b'x' => {
                        if let Some(value) = self.match_hex_digits(2) {
                            debug_assert!(value <= 0xff);
                            if value != 0 {
                                value as u8
                            } else {
                                self.string_error_token(&mut token, "\\x00 is not allowed");
                                continue;
                            }
                        } else {
                            self.string_error_token(&mut token, "malformed \\x escape sequence");
                            continue;
                        }
                    }
                    b'u' => {
                        if let Some(value) = self.match_hex_digits(4) {
                            // The low surrogate paired with `value`, if
                            // `value` is a high surrogate.
                            let mut low_surrogate = None;
                            if 0xd800 == (0xfc00 & value) {
                                // High surrogate value. Look for the low surrogate value.
                                if self.match_char(b'\\') && self.match_char(b'u') {
                                    if let Some(lo) = self.match_hex_digits(4) {
                                        if 0xdc00 == (0xfc00 & lo) {
                                            // Found a valid low surrogate.
                                            low_surrogate = Some(lo);
                                        } else {
                                            self.string_error_token(
                                                &mut token,
                                                "invalid low surrogate after high surrogate",
                                            );
                                            continue;
                                        }
                                    }
                                }
                                if low_surrogate.is_none() {
                                    self.string_error_token(
                                        &mut token,
                                        "expected low surrogate after high surrogate",
                                    );
                                    continue;
                                }
                            } else if 0xdc00 == (0xfc00 & value) {
                                // Unaccompanied low surrogate value.
                                self.string_error_token(
                                    &mut token,
                                    "expected high surrogate before low surrogate",
                                );
                                continue;
                            } else if value == 0 {
                                self.string_error_token(&mut token, "\\u0000 is not allowed");
                                continue;
                            }

                            // Insert the UTF-16 sequence as UTF-8, encoding
                            // into a stack buffer to avoid heap allocations.
                            let code_point = match low_surrogate {
                                Some(lo) => {
                                    0x10000
                                        + (((value as u32) & 0x3ff) << 10)
                                        + ((lo as u32) & 0x3ff)
                                }
                                None => value as u32,
                            };
                            // Unpaired surrogates were rejected above, so this
                            // is always a valid Unicode scalar value.
                            let ch = std::char::from_u32(code_point)
                                .expect("validated UTF-16 must decode to a scalar value");
                            let mut utf8 = [0u8; 4];
                            str_buf.extend_from_slice(ch.encode_utf8(&mut utf8).as_bytes());
                        } else {
                            self.string_error_token(&mut token, "malformed \\u escape sequence");
                            continue;
                        }
                        continue; // We don't want to str_buf.push(c2) below.
                    }
                    c => {
                        if c == b'\n' || c == b'\r' {
                            // Unget the newline char; the outer loop will
                            // reget it and adjust self.line_num appropriately.
                            self.unget_char();
                        }
                        self.string_error_token(
                            &mut token,
                            "unexpected escape sequence character after '\\'",
                        );
                        continue;
                    }
                }
            } else if c == b'\n' {
                self.line_num += 1;
                c
            } else if c == b'\r' {
                self.line_num += 1;
                if self.match_char(b'\n') {
                    str_buf.push(b'\r');
                    b'\n'
                } else {
                    c
                }
            } else if c == EOF {
                self.string_error_token(&mut token, "unterminated string literal");
                break;
            } else {
                // This case is only hit for the non-closing quote char.
                debug_assert!((c == b'\'' || c == b'\"') && c != quote_char);
                c
            };
            str_buf.push(c2);
        }
        str_buf.push(b'\0');

        token
    }
}