/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ use crate::preferences::{Pref, PrefValue, Preferences}; use std::borrow::Borrow; use std::borrow::Cow; use std::char; use std::error::Error; use std::io::{self, Write}; use std::str; use thiserror::Error; impl PrefReaderError { fn new(message: String, position: Position, parent: Option>) -> PrefReaderError { PrefReaderError { message, position, parent, } } } impl From for PrefReaderError { fn from(err: io::Error) -> PrefReaderError { PrefReaderError::new("IOError".into(), Position::new(), Some(err.into())) } } #[derive(Copy, Clone, Debug, PartialEq)] enum TokenizerState { Junk, CommentStart, CommentLine, CommentBlock, FunctionName, AfterFunctionName, FunctionArgs, FunctionArg, DoubleQuotedString, SingleQuotedString, Number, Bool, AfterFunctionArg, AfterFunction, Error, } #[derive(Copy, Clone, Debug, Default, PartialEq)] pub struct Position { line: u32, column: u32, } impl Position { pub fn new() -> Position { Position { line: 1, column: 0 } } } #[derive(Copy, Clone, Debug, PartialEq)] pub enum TokenType { None, PrefFunction, UserPrefFunction, StickyPrefFunction, CommentBlock, CommentLine, CommentBashLine, Paren, Semicolon, Comma, String, Int, Bool, Error, } #[derive(Debug, PartialEq)] pub enum PrefToken<'a> { PrefFunction(Position), UserPrefFunction(Position), StickyPrefFunction(Position), CommentBlock(Cow<'a, str>, Position), CommentLine(Cow<'a, str>, Position), CommentBashLine(Cow<'a, str>, Position), Paren(char, Position), Semicolon(Position), Comma(Position), String(Cow<'a, str>, Position), Int(i64, Position), Bool(bool, Position), Error(String, Position), } impl<'a> PrefToken<'a> { fn position(&self) -> Position { match *self { PrefToken::PrefFunction(position) => position, PrefToken::UserPrefFunction(position) => position, PrefToken::StickyPrefFunction(position) => position, PrefToken::CommentBlock(_, position) => position, PrefToken::CommentLine(_, position) => position, PrefToken::CommentBashLine(_, position) => position, PrefToken::Paren(_, position) => position, PrefToken::Semicolon(position) => position, PrefToken::Comma(position) => position, PrefToken::String(_, position) => position, PrefToken::Int(_, position) => position, PrefToken::Bool(_, position) => position, PrefToken::Error(_, position) => position, } } } #[derive(Debug, Error)] #[error("{message} at line {}, column {}", .position.line, .position.column)] pub struct PrefReaderError { message: String, position: Position, #[source] parent: Option>, } struct TokenData<'a> { token_type: TokenType, complete: bool, position: Position, data: Cow<'a, str>, start_pos: usize, } impl<'a> TokenData<'a> { fn new(token_type: TokenType, position: Position, start_pos: usize) -> TokenData<'a> { TokenData { token_type, complete: false, position, data: Cow::Borrowed(""), start_pos, } } fn start(&mut self, tokenizer: &PrefTokenizer, token_type: TokenType) { self.token_type = token_type; self.position = tokenizer.position; self.start_pos = tokenizer.pos; } fn end(&mut self, buf: &'a [u8], end_pos: usize) -> Result<(), PrefReaderError> { self.complete = true; self.add_slice_to_token(buf, end_pos) } fn add_slice_to_token(&mut self, buf: &'a [u8], end_pos: usize) -> Result<(), PrefReaderError> { let data = match str::from_utf8(&buf[self.start_pos..end_pos]) { Ok(x) => x, Err(_) => { return Err(PrefReaderError::new( "Could not convert string to utf8".into(), self.position, None, )); } }; if self.data != "" { self.data.to_mut().push_str(data) } else { self.data = Cow::Borrowed(data) }; Ok(()) } fn push_char(&mut self, tokenizer: &PrefTokenizer, data: char) { self.data.to_mut().push(data); self.start_pos = tokenizer.pos + 1; } } pub struct PrefTokenizer<'a> { data: &'a [u8], pos: usize, cur: Option, position: Position, state: TokenizerState, next_state: Option, } impl<'a> PrefTokenizer<'a> { pub fn new(data: &'a [u8]) -> PrefTokenizer<'a> { PrefTokenizer { data, pos: 0, cur: None, position: Position::new(), state: TokenizerState::Junk, next_state: Some(TokenizerState::FunctionName), } } fn make_token(&mut self, token_data: TokenData<'a>) -> PrefToken<'a> { let buf = token_data.data; let position = token_data.position; // Note: the panic! here are for cases where the invalid input is regarded as // a bug in the caller. In cases where `make_token` can legitimately be called // with invalid data we must instead return a PrefToken::Error match token_data.token_type { TokenType::None => panic!("Got a token without a type"), TokenType::PrefFunction => PrefToken::PrefFunction(position), TokenType::UserPrefFunction => PrefToken::UserPrefFunction(position), TokenType::StickyPrefFunction => PrefToken::StickyPrefFunction(position), TokenType::CommentBlock => PrefToken::CommentBlock(buf, position), TokenType::CommentLine => PrefToken::CommentLine(buf, position), TokenType::CommentBashLine => PrefToken::CommentBashLine(buf, position), TokenType::Paren => { if buf.len() != 1 { panic!("Expected a buffer of length one"); } PrefToken::Paren(buf.chars().next().unwrap(), position) } TokenType::Semicolon => PrefToken::Semicolon(position), TokenType::Comma => PrefToken::Comma(position), TokenType::String => PrefToken::String(buf, position), TokenType::Int => { return match buf.parse::() { Ok(value) => PrefToken::Int(value, position), Err(_) => PrefToken::Error(format!("Expected integer, got {}", buf), position), } } TokenType::Bool => { let value = match buf.borrow() { "true" => true, "false" => false, x => panic!("Boolean wasn't 'true' or 'false' (was {})", x), }; PrefToken::Bool(value, position) } TokenType::Error => panic!("make_token can't construct errors"), } } fn get_char(&mut self) -> Option { if self.pos + 1 >= self.data.len() { self.cur = None; return None; }; if self.cur.is_some() { self.pos += 1; } let c = self.data[self.pos] as char; if self.cur == Some('\n') { self.position.line += 1; self.position.column = 0; } else if self.cur.is_some() { self.position.column += 1; }; self.cur = Some(c); self.cur } fn unget_char(&mut self) -> Option { if self.pos == 0 { self.position.column = 0; self.cur = None } else { self.pos -= 1; let c = self.data[self.pos] as char; if c == '\n' { self.position.line -= 1; let mut col_pos = self.pos; while col_pos > 0 { col_pos -= 1; if self.data[col_pos] as char == '\n' { break; } } self.position.column = (self.pos - col_pos) as u32; } else { self.position.column -= 1; } self.cur = Some(c); } self.cur } fn is_space(c: char) -> bool { matches!(c, ' ' | '\t' | '\r' | '\n') } fn skip_whitespace(&mut self) -> Option { while let Some(c) = self.cur { if PrefTokenizer::is_space(c) { self.get_char(); } else { break; }; } self.cur } fn consume_escape(&mut self, token_data: &mut TokenData<'a>) -> Result<(), PrefReaderError> { let pos = self.pos; let escaped = self.read_escape()?; if let Some(escape_char) = escaped { token_data.add_slice_to_token(self.data, pos)?; token_data.push_char(self, escape_char); }; Ok(()) } fn read_escape(&mut self) -> Result, PrefReaderError> { let escape_char = match self.get_char() { Some('u') => self.read_hex_escape(4, true)?, Some('x') => self.read_hex_escape(2, true)?, Some('\\') => '\\' as u32, Some('"') => '"' as u32, Some('\'') => '\'' as u32, Some('r') => '\r' as u32, Some('n') => '\n' as u32, Some(_) => return Ok(None), None => { return Err(PrefReaderError::new( "EOF in character escape".into(), self.position, None, )) } }; Ok(Some(char::from_u32(escape_char).ok_or_else(|| { PrefReaderError::new( "Invalid codepoint decoded from escape".into(), self.position, None, ) })?)) } fn read_hex_escape(&mut self, hex_chars: isize, first: bool) -> Result { let mut value = 0; for _ in 0..hex_chars { match self.get_char() { Some(x) => { value <<= 4; match x { '0'..='9' => value += x as u32 - '0' as u32, 'a'..='f' => value += x as u32 - 'a' as u32, 'A'..='F' => value += x as u32 - 'A' as u32, _ => { return Err(PrefReaderError::new( "Unexpected character in escape".into(), self.position, None, )) } } } None => { return Err(PrefReaderError::new( "Unexpected EOF in escape".into(), self.position, None, )) } } } if first && (0xD800..=0xDBFF).contains(&value) { // First part of a surrogate pair if self.get_char() != Some('\\') || self.get_char() != Some('u') { return Err(PrefReaderError::new( "Lone high surrogate in surrogate pair".into(), self.position, None, )); } self.unget_char(); let high_surrogate = value; let low_surrogate = self.read_hex_escape(4, false)?; let high_value = (high_surrogate - 0xD800) << 10; let low_value = low_surrogate - 0xDC00; value = high_value + low_value + 0x10000; } else if first && (0xDC00..=0xDFFF).contains(&value) { return Err(PrefReaderError::new( "Lone low surrogate".into(), self.position, None, )); } else if !first && !(0xDC00..=0xDFFF).contains(&value) { return Err(PrefReaderError::new( "Invalid low surrogate in surrogate pair".into(), self.position, None, )); } Ok(value) } fn get_match(&mut self, target: &str, separators: &str) -> bool { let initial_pos = self.pos; let mut matched = true; for c in target.chars() { if self.cur == Some(c) { self.get_char(); } else { matched = false; break; } } if !matched { for _ in 0..(self.pos - initial_pos) { self.unget_char(); } } else { // Check that the next character is whitespace or a separator if let Some(c) = self.cur { if !(PrefTokenizer::is_space(c) || separators.contains(c) || c == '/') { matched = false; } self.unget_char(); } // Otherwise the token was followed by EOF. That's a valid match, but // will presumably cause a parse error later. } matched } fn next_token(&mut self) -> Result>, PrefReaderError> { let mut token_data = TokenData::new(TokenType::None, Position::new(), 0); loop { let mut c = match self.get_char() { Some(x) => x, None => return Ok(None), }; self.state = match self.state { TokenizerState::Junk => { c = match self.skip_whitespace() { Some(x) => x, None => return Ok(None), }; match c { '/' => TokenizerState::CommentStart, '#' => { token_data.start(self, TokenType::CommentBashLine); token_data.start_pos = self.pos + 1; TokenizerState::CommentLine } _ => { self.unget_char(); let next = match self.next_state { Some(x) => x, None => { return Err(PrefReaderError::new( "In Junk state without a next state defined".into(), self.position, None, )) } }; self.next_state = None; next } } } TokenizerState::CommentStart => match c { '*' => { token_data.start(self, TokenType::CommentBlock); token_data.start_pos = self.pos + 1; TokenizerState::CommentBlock } '/' => { token_data.start(self, TokenType::CommentLine); token_data.start_pos = self.pos + 1; TokenizerState::CommentLine } _ => { return Err(PrefReaderError::new( "Invalid character after /".into(), self.position, None, )) } }, TokenizerState::CommentLine => match c { '\n' => { token_data.end(self.data, self.pos)?; TokenizerState::Junk } _ => TokenizerState::CommentLine, }, TokenizerState::CommentBlock => match c { '*' => { if self.get_char() == Some('/') { token_data.end(self.data, self.pos - 1)?; TokenizerState::Junk } else { TokenizerState::CommentBlock } } _ => TokenizerState::CommentBlock, }, TokenizerState::FunctionName => { let position = self.position; let start_pos = self.pos; match c { 'u' => { if self.get_match("user_pref", "(") { token_data.start(self, TokenType::UserPrefFunction); } } 's' => { if self.get_match("sticky_pref", "(") { token_data.start(self, TokenType::StickyPrefFunction); } } 'p' => { if self.get_match("pref", "(") { token_data.start(self, TokenType::PrefFunction); } } _ => {} }; if token_data.token_type == TokenType::None { // We didn't match anything return Err(PrefReaderError::new( "Expected a pref function name".into(), position, None, )); } else { token_data.start_pos = start_pos; token_data.position = position; token_data.end(self.data, self.pos + 1)?; self.next_state = Some(TokenizerState::AfterFunctionName); TokenizerState::Junk } } TokenizerState::AfterFunctionName => match c { '(' => { self.next_state = Some(TokenizerState::FunctionArgs); token_data.start(self, TokenType::Paren); token_data.end(self.data, self.pos + 1)?; self.next_state = Some(TokenizerState::FunctionArgs); TokenizerState::Junk } _ => { return Err(PrefReaderError::new( "Expected an opening paren".into(), self.position, None, )) } }, TokenizerState::FunctionArgs => match c { ')' => { token_data.start(self, TokenType::Paren); token_data.end(self.data, self.pos + 1)?; self.next_state = Some(TokenizerState::AfterFunction); TokenizerState::Junk } _ => { self.unget_char(); TokenizerState::FunctionArg } }, TokenizerState::FunctionArg => match c { '"' => { token_data.start(self, TokenType::String); token_data.start_pos = self.pos + 1; TokenizerState::DoubleQuotedString } '\'' => { token_data.start(self, TokenType::String); token_data.start_pos = self.pos + 1; TokenizerState::SingleQuotedString } 't' | 'f' => { self.unget_char(); TokenizerState::Bool } '0'..='9' | '-' | '+' => { token_data.start(self, TokenType::Int); TokenizerState::Number } _ => { return Err(PrefReaderError::new( "Invalid character at start of function argument".into(), self.position, None, )) } }, TokenizerState::DoubleQuotedString => match c { '"' => { token_data.end(self.data, self.pos)?; self.next_state = Some(TokenizerState::AfterFunctionArg); TokenizerState::Junk } '\n' => { return Err(PrefReaderError::new( "EOL in double quoted string".into(), self.position, None, )) } '\\' => { self.consume_escape(&mut token_data)?; TokenizerState::DoubleQuotedString } _ => TokenizerState::DoubleQuotedString, }, TokenizerState::SingleQuotedString => match c { '\'' => { token_data.end(self.data, self.pos)?; self.next_state = Some(TokenizerState::AfterFunctionArg); TokenizerState::Junk } '\n' => { return Err(PrefReaderError::new( "EOL in single quoted string".into(), self.position, None, )) } '\\' => { self.consume_escape(&mut token_data)?; TokenizerState::SingleQuotedString } _ => TokenizerState::SingleQuotedString, }, TokenizerState::Number => match c { '0'..='9' => TokenizerState::Number, ')' | ',' => { token_data.end(self.data, self.pos)?; self.unget_char(); self.next_state = Some(TokenizerState::AfterFunctionArg); TokenizerState::Junk } x if PrefTokenizer::is_space(x) => { token_data.end(self.data, self.pos)?; self.next_state = Some(TokenizerState::AfterFunctionArg); TokenizerState::Junk } _ => { return Err(PrefReaderError::new( "Invalid character in number literal".into(), self.position, None, )) } }, TokenizerState::Bool => { let start_pos = self.pos; let position = self.position; match c { 't' => { if self.get_match("true", ",)") { token_data.start(self, TokenType::Bool) } } 'f' => { if self.get_match("false", ",)") { token_data.start(self, TokenType::Bool) } } _ => {} }; if token_data.token_type == TokenType::None { return Err(PrefReaderError::new( "Unexpected characters in function argument".into(), position, None, )); } else { token_data.start_pos = start_pos; token_data.position = position; token_data.end(self.data, self.pos + 1)?; self.next_state = Some(TokenizerState::AfterFunctionArg); TokenizerState::Junk } } TokenizerState::AfterFunctionArg => match c { ',' => { token_data.start(self, TokenType::Comma); token_data.end(self.data, self.pos + 1)?; self.next_state = Some(TokenizerState::FunctionArg); TokenizerState::Junk } ')' => { token_data.start(self, TokenType::Paren); token_data.end(self.data, self.pos + 1)?; self.next_state = Some(TokenizerState::AfterFunction); TokenizerState::Junk } _ => { return Err(PrefReaderError::new( "Unexpected character after function argument".into(), self.position, None, )) } }, TokenizerState::AfterFunction => match c { ';' => { token_data.start(self, TokenType::Semicolon); token_data.end(self.data, self.pos)?; self.next_state = Some(TokenizerState::FunctionName); TokenizerState::Junk } _ => { return Err(PrefReaderError::new( "Unexpected character after function".into(), self.position, None, )) } }, TokenizerState::Error => TokenizerState::Error, }; if token_data.complete { return Ok(Some(token_data)); } } } } impl<'a> Iterator for PrefTokenizer<'a> { type Item = PrefToken<'a>; fn next(&mut self) -> Option> { if let TokenizerState::Error = self.state { return None; } let token_data = match self.next_token() { Err(e) => { self.state = TokenizerState::Error; return Some(PrefToken::Error(e.message.clone(), e.position)); } Ok(Some(token_data)) => token_data, Ok(None) => return None, }; let token = self.make_token(token_data); Some(token) } } pub fn tokenize(data: &[u8]) -> PrefTokenizer { PrefTokenizer::new(data) } pub fn serialize_token(token: &PrefToken, output: &mut T) -> Result<(), PrefReaderError> { let mut data_buf = String::new(); let data = match *token { PrefToken::PrefFunction(_) => "pref", PrefToken::UserPrefFunction(_) => "user_pref", PrefToken::StickyPrefFunction(_) => "sticky_pref", PrefToken::CommentBlock(ref data, _) => { data_buf.reserve(data.len() + 4); data_buf.push_str("/*"); data_buf.push_str(data.borrow()); data_buf.push('*'); &*data_buf } PrefToken::CommentLine(ref data, _) => { data_buf.reserve(data.len() + 2); data_buf.push_str("//"); data_buf.push_str(data.borrow()); &*data_buf } PrefToken::CommentBashLine(ref data, _) => { data_buf.reserve(data.len() + 1); data_buf.push('#'); data_buf.push_str(data.borrow()); &*data_buf } PrefToken::Paren(data, _) => { data_buf.push(data); &*data_buf } PrefToken::Comma(_) => ",", PrefToken::Semicolon(_) => ";\n", PrefToken::String(ref data, _) => { data_buf.reserve(data.len() + 2); data_buf.push('"'); data_buf.push_str(escape_quote(data.borrow()).borrow()); data_buf.push('"'); &*data_buf } PrefToken::Int(data, _) => { data_buf.push_str(&data.to_string()); &*data_buf } PrefToken::Bool(data, _) => { if data { "true" } else { "false" } } PrefToken::Error(ref data, pos) => { return Err(PrefReaderError::new(data.clone(), pos, None)) } }; output.write_all(data.as_bytes())?; Ok(()) } pub fn serialize_tokens<'a, I, W>(tokens: I, output: &mut W) -> Result<(), PrefReaderError> where I: Iterator>, W: Write, { for token in tokens { serialize_token(token, output)?; } Ok(()) } fn escape_quote(data: &str) -> Cow { // Not very efficient… if data.contains('"') || data.contains('\\') { Cow::Owned(data.replace('\\', r"\\").replace('"', r#"\""#)) } else { Cow::Borrowed(data) } } #[derive(Debug, PartialEq)] enum ParserState { Function, Key, Value, } struct PrefBuilder { key: Option, value: Option, sticky: bool, } impl PrefBuilder { fn new() -> PrefBuilder { PrefBuilder { key: None, value: None, sticky: false, } } } fn skip_comments<'a>(tokenizer: &mut PrefTokenizer<'a>) -> Option> { loop { match tokenizer.next() { Some(PrefToken::CommentBashLine(_, _)) | Some(PrefToken::CommentBlock(_, _)) | Some(PrefToken::CommentLine(_, _)) => {} Some(x) => return Some(x), None => return None, } } } pub fn parse_tokens(tokenizer: &mut PrefTokenizer<'_>) -> Result { let mut state = ParserState::Function; let mut current_pref = PrefBuilder::new(); let mut rv = Preferences::new(); loop { // Not just using a for loop here seems strange, but this restricts the // scope of the borrow let token = { match tokenizer.next() { Some(x) => x, None => break, } }; // First deal with comments and errors match token { PrefToken::Error(msg, position) => { return Err(PrefReaderError::new(msg, position, None)); } PrefToken::CommentBashLine(_, _) | PrefToken::CommentLine(_, _) | PrefToken::CommentBlock(_, _) => continue, _ => {} } state = match state { ParserState::Function => { match token { PrefToken::PrefFunction(_) => { current_pref.sticky = false; } PrefToken::UserPrefFunction(_) => { current_pref.sticky = false; } PrefToken::StickyPrefFunction(_) => { current_pref.sticky = true; } _ => { return Err(PrefReaderError::new( "Expected pref function".into(), token.position(), None, )); } } let next = skip_comments(tokenizer); match next { Some(PrefToken::Paren('(', _)) => ParserState::Key, _ => { return Err(PrefReaderError::new( "Expected open paren".into(), next.map(|x| x.position()).unwrap_or(tokenizer.position), None, )) } } } ParserState::Key => { match token { PrefToken::String(data, _) => current_pref.key = Some(data.into_owned()), _ => { return Err(PrefReaderError::new( "Expected string".into(), token.position(), None, )); } } let next = skip_comments(tokenizer); match next { Some(PrefToken::Comma(_)) => ParserState::Value, _ => { return Err(PrefReaderError::new( "Expected comma".into(), next.map(|x| x.position()).unwrap_or(tokenizer.position), None, )) } } } ParserState::Value => { match token { PrefToken::String(data, _) => { current_pref.value = Some(PrefValue::String(data.into_owned())) } PrefToken::Int(data, _) => current_pref.value = Some(PrefValue::Int(data)), PrefToken::Bool(data, _) => current_pref.value = Some(PrefValue::Bool(data)), _ => { return Err(PrefReaderError::new( "Expected value".into(), token.position(), None, )) } } let next = skip_comments(tokenizer); match next { Some(PrefToken::Paren(')', _)) => {} _ => { return Err(PrefReaderError::new( "Expected close paren".into(), next.map(|x| x.position()).unwrap_or(tokenizer.position), None, )) } } let next = skip_comments(tokenizer); match next { Some(PrefToken::Semicolon(_)) | None => {} _ => { return Err(PrefReaderError::new( "Expected semicolon".into(), next.map(|x| x.position()).unwrap_or(tokenizer.position), None, )) } } let key = current_pref.key.take(); let value = current_pref.value.take(); let pref = if current_pref.sticky { Pref::new_sticky(value.unwrap()) } else { Pref::new(value.unwrap()) }; rv.insert(key.unwrap(), pref); current_pref.sticky = false; ParserState::Function } } } match state { ParserState::Key | ParserState::Value => { return Err(PrefReaderError::new( "EOF in middle of function".into(), tokenizer.position, None, )); } _ => {} } Ok(rv) } pub fn serialize(prefs: &Preferences, output: &mut W) -> io::Result<()> { let mut p: Vec<_> = prefs.iter().collect(); p.sort_by(|a, b| a.0.cmp(b.0)); for &(key, pref) in &p { let func = if pref.sticky { "sticky_pref(" } else { "user_pref(" } .as_bytes(); output.write_all(func)?; output.write_all(b"\"")?; output.write_all(escape_quote(key).as_bytes())?; output.write_all(b"\"")?; output.write_all(b", ")?; match pref.value { PrefValue::Bool(x) => { output.write_all(if x { b"true" } else { b"false" })?; } PrefValue::Int(x) => { output.write_all(x.to_string().as_bytes())?; } PrefValue::String(ref x) => { output.write_all(b"\"")?; output.write_all(escape_quote(x).as_bytes())?; output.write_all(b"\"")?; } }; output.write_all(b");\n")?; } Ok(()) } pub fn parse(data: &[u8]) -> Result { let mut tokenizer = tokenize(data); parse_tokens(&mut tokenizer) }