From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 21:33:14 +0200 Subject: Adding upstream version 115.7.0esr. Signed-off-by: Daniel Baumann --- .../rust/fluent-syntax/src/parser/comment.rs | 89 ++++++ third_party/rust/fluent-syntax/src/parser/core.rs | 307 +++++++++++++++++++++ .../rust/fluent-syntax/src/parser/errors.rs | 169 ++++++++++++ .../rust/fluent-syntax/src/parser/expression.rs | 224 +++++++++++++++ .../rust/fluent-syntax/src/parser/helper.rs | 169 ++++++++++++ .../rust/fluent-syntax/src/parser/macros.rs | 11 + third_party/rust/fluent-syntax/src/parser/mod.rs | 278 +++++++++++++++++++ .../rust/fluent-syntax/src/parser/pattern.rs | 207 ++++++++++++++ .../rust/fluent-syntax/src/parser/runtime.rs | 61 ++++ third_party/rust/fluent-syntax/src/parser/slice.rs | 25 ++ 10 files changed, 1540 insertions(+) create mode 100644 third_party/rust/fluent-syntax/src/parser/comment.rs create mode 100644 third_party/rust/fluent-syntax/src/parser/core.rs create mode 100644 third_party/rust/fluent-syntax/src/parser/errors.rs create mode 100644 third_party/rust/fluent-syntax/src/parser/expression.rs create mode 100644 third_party/rust/fluent-syntax/src/parser/helper.rs create mode 100644 third_party/rust/fluent-syntax/src/parser/macros.rs create mode 100644 third_party/rust/fluent-syntax/src/parser/mod.rs create mode 100644 third_party/rust/fluent-syntax/src/parser/pattern.rs create mode 100644 third_party/rust/fluent-syntax/src/parser/runtime.rs create mode 100644 third_party/rust/fluent-syntax/src/parser/slice.rs (limited to 'third_party/rust/fluent-syntax/src/parser') diff --git a/third_party/rust/fluent-syntax/src/parser/comment.rs b/third_party/rust/fluent-syntax/src/parser/comment.rs new file mode 100644 index 0000000000..a63483c1d3 --- /dev/null +++ b/third_party/rust/fluent-syntax/src/parser/comment.rs @@ -0,0 +1,89 @@ +use super::{core::Parser, core::Result, Slice}; +use crate::ast; + +#[derive(Debug, PartialEq, Clone, Copy)] +pub(super) enum Level { + None = 0, + Regular = 1, + Group = 2, + Resource = 3, +} + +impl<'s, S> Parser +where + S: Slice<'s>, +{ + pub(super) fn get_comment(&mut self) -> Result<(ast::Comment, Level)> { + let mut level = Level::None; + let mut content = vec![]; + + while self.ptr < self.length { + let line_level = self.get_comment_level(); + if line_level == Level::None { + self.ptr -= 1; + break; + } else if level != Level::None && line_level != level { + self.ptr -= line_level as usize; + break; + } + + level = line_level; + + if self.ptr == self.length { + break; + } else if self.is_current_byte(b'\n') { + content.push(self.get_comment_line()); + } else { + if let Err(e) = self.expect_byte(b' ') { + if content.is_empty() { + return Err(e); + } else { + self.ptr -= line_level as usize; + break; + } + } + content.push(self.get_comment_line()); + } + self.skip_eol(); + } + + Ok((ast::Comment { content }, level)) + } + + pub(super) fn skip_comment(&mut self) { + loop { + while self.ptr < self.length && !self.is_current_byte(b'\n') { + self.ptr += 1; + } + self.ptr += 1; + if self.is_current_byte(b'#') { + self.ptr += 1; + } else { + break; + } + } + } + + fn get_comment_level(&mut self) -> Level { + if self.take_byte_if(b'#') { + if self.take_byte_if(b'#') { + if self.take_byte_if(b'#') { + return Level::Resource; + } + return Level::Group; + } + return Level::Regular; + } + Level::None + } + + fn get_comment_line(&mut self) -> S { + let start_pos = self.ptr; + + while !self.is_eol() { + self.ptr += 1; + } + + self.source.slice(start_pos..self.ptr) + } +} diff --git a/third_party/rust/fluent-syntax/src/parser/core.rs b/third_party/rust/fluent-syntax/src/parser/core.rs new file mode 100644 index 0000000000..68ad8dc0b6 --- /dev/null +++ b/third_party/rust/fluent-syntax/src/parser/core.rs @@ -0,0 +1,307 @@ +use super::{ + comment, + errors::{ErrorKind, ParserError}, + slice::Slice, +}; +use crate::ast; + +pub type Result = std::result::Result; + +pub struct Parser { + pub(super) source: S, + pub(super) ptr: usize, + pub(super) length: usize, +} + +impl<'s, S> Parser +where + S: Slice<'s>, +{ + pub fn new(source: S) -> Self { + let length = source.as_ref().as_bytes().len(); + Self { + source, + ptr: 0, + length, + } + } + + pub fn parse( + mut self, + ) -> std::result::Result, (ast::Resource, Vec)> { + let mut errors = vec![]; + + let mut body = vec![]; + + self.skip_blank_block(); + let mut last_comment = None; + let mut last_blank_count = 0; + + while self.ptr < self.length { + let entry_start = self.ptr; + let mut entry = self.get_entry(entry_start); + + if let Some(comment) = last_comment.take() { + match entry { + Ok(ast::Entry::Message(ref mut msg)) if last_blank_count < 2 => { + msg.comment = Some(comment); + } + Ok(ast::Entry::Term(ref mut term)) if last_blank_count < 2 => { + term.comment = Some(comment); + } + _ => { + body.push(ast::Entry::Comment(comment)); + } + } + } + + match entry { + Ok(ast::Entry::Comment(comment)) => { + last_comment = Some(comment); + } + Ok(entry) => { + body.push(entry); + } + Err(mut err) => { + self.skip_to_next_entry_start(); + err.slice = Some(entry_start..self.ptr); + errors.push(err); + let content = self.source.slice(entry_start..self.ptr); + body.push(ast::Entry::Junk { content }); + } + } + last_blank_count = self.skip_blank_block(); + } + + if let Some(last_comment) = last_comment.take() { + body.push(ast::Entry::Comment(last_comment)); + } + if errors.is_empty() { + Ok(ast::Resource { body }) + } else { + Err((ast::Resource { body }, errors)) + } + } + + fn get_entry(&mut self, entry_start: usize) -> Result> { + let entry = match get_current_byte!(self) { + Some(b'#') => { + let (comment, level) = self.get_comment()?; + match level { + comment::Level::Regular => ast::Entry::Comment(comment), + comment::Level::Group => ast::Entry::GroupComment(comment), + comment::Level::Resource => ast::Entry::ResourceComment(comment), + comment::Level::None => unreachable!(), + } + } + Some(b'-') => ast::Entry::Term(self.get_term(entry_start)?), + _ => ast::Entry::Message(self.get_message(entry_start)?), + }; + Ok(entry) + } + + pub fn get_message(&mut self, entry_start: usize) -> Result> { + let id = self.get_identifier()?; + self.skip_blank_inline(); + self.expect_byte(b'=')?; + let pattern = self.get_pattern()?; + + self.skip_blank_block(); + + let attributes = self.get_attributes(); + + if pattern.is_none() && attributes.is_empty() { + let entry_id = id.name.as_ref().to_owned(); + return error!( + ErrorKind::ExpectedMessageField { entry_id }, + entry_start, self.ptr + ); + } + + Ok(ast::Message { + id, + value: pattern, + attributes, + comment: None, + }) + } + + pub fn get_term(&mut self, entry_start: usize) -> Result> { + self.expect_byte(b'-')?; + let id = self.get_identifier()?; + self.skip_blank_inline(); + self.expect_byte(b'=')?; + self.skip_blank_inline(); + + let value = self.get_pattern()?; + + self.skip_blank_block(); + + let attributes = self.get_attributes(); + + if let Some(value) = value { + Ok(ast::Term { + id, + value, + attributes, + comment: None, + }) + } else { + error!( + ErrorKind::ExpectedTermField { + entry_id: id.name.as_ref().to_owned() + }, + entry_start, self.ptr + ) + } + } + + fn get_attributes(&mut self) -> Vec> { + let mut attributes = vec![]; + + loop { + let line_start = self.ptr; + self.skip_blank_inline(); + if !self.take_byte_if(b'.') { + self.ptr = line_start; + break; + } + + if let Ok(attr) = self.get_attribute() { + attributes.push(attr); + } else { + self.ptr = line_start; + break; + } + } + attributes + } + + fn get_attribute(&mut self) -> Result> { + let id = self.get_identifier()?; + self.skip_blank_inline(); + self.expect_byte(b'=')?; + let pattern = self.get_pattern()?; + + match pattern { + Some(pattern) => Ok(ast::Attribute { id, value: pattern }), + None => error!(ErrorKind::MissingValue, self.ptr), + } + } + + pub(super) fn get_identifier_unchecked(&mut self) -> ast::Identifier { + let mut ptr = self.ptr; + + while matches!(get_byte!(self, ptr), Some(b) if b.is_ascii_alphanumeric() || *b == b'-' || *b == b'_') + { + ptr += 1; + } + + let name = self.source.slice(self.ptr - 1..ptr); + self.ptr = ptr; + + ast::Identifier { name } + } + + pub(super) fn get_identifier(&mut self) -> Result> { + if !self.is_identifier_start() { + return error!( + ErrorKind::ExpectedCharRange { + range: "a-zA-Z".to_string() + }, + self.ptr + ); + } + self.ptr += 1; + Ok(self.get_identifier_unchecked()) + } + + pub(super) fn get_attribute_accessor(&mut self) -> Result>> { + if self.take_byte_if(b'.') { + let ident = self.get_identifier()?; + Ok(Some(ident)) + } else { + Ok(None) + } + } + + fn get_variant_key(&mut self) -> Result> { + self.skip_blank(); + + let key = if self.is_number_start() { + ast::VariantKey::NumberLiteral { + value: self.get_number_literal()?, + } + } else { + ast::VariantKey::Identifier { + name: self.get_identifier()?.name, + } + }; + + self.skip_blank(); + + self.expect_byte(b']')?; + + Ok(key) + } + + pub(super) fn get_variants(&mut self) -> Result>> { + let mut variants = Vec::with_capacity(2); + let mut has_default = false; + + loop { + let default = self.take_byte_if(b'*'); + if default { + if has_default { + return error!(ErrorKind::MultipleDefaultVariants, self.ptr); + } else { + has_default = true; + } + } + + if !self.take_byte_if(b'[') { + break; + } + + let key = self.get_variant_key()?; + + let value = self.get_pattern()?; + + if let Some(value) = value { + variants.push(ast::Variant { + key, + value, + default, + }); + self.skip_blank(); + } else { + return error!(ErrorKind::MissingValue, self.ptr); + } + } + + if has_default { + Ok(variants) + } else { + error!(ErrorKind::MissingDefaultVariant, self.ptr) + } + } + + pub(super) fn get_placeable(&mut self) -> Result> { + self.skip_blank(); + let exp = self.get_expression()?; + self.skip_blank_inline(); + self.expect_byte(b'}')?; + + let invalid_expression_found = match &exp { + ast::Expression::Inline(ast::InlineExpression::TermReference { + ref attribute, .. + }) => attribute.is_some(), + _ => false, + }; + if invalid_expression_found { + return error!(ErrorKind::TermAttributeAsPlaceable, self.ptr); + } + + Ok(exp) + } +} diff --git a/third_party/rust/fluent-syntax/src/parser/errors.rs b/third_party/rust/fluent-syntax/src/parser/errors.rs new file mode 100644 index 0000000000..2c29f97bbf --- /dev/null +++ b/third_party/rust/fluent-syntax/src/parser/errors.rs @@ -0,0 +1,169 @@ +use std::ops::Range; +use thiserror::Error; + +/// Error containing information about an error encountered by the Fluent Parser. +/// +/// Errors in Fluent Parser are non-fatal, and the syntax has been +/// designed to allow for strong recovery. +/// +/// In result [`ParserError`] is designed to point at the slice of +/// the input that is most likely to be a complete fragment from after +/// the end of a valid entry, to the start of the next valid entry, with +/// the invalid syntax in the middle. +/// +/// +/// # Example +/// +/// ``` +/// use fluent_syntax::parser; +/// use fluent_syntax::ast; +/// +/// let ftl = r#" +/// key1 = Value 1 +/// +/// g@Rb@ge = #2y ds +/// +/// key2 = Value 2 +/// +/// "#; +/// +/// let (resource, errors) = parser::parse_runtime(ftl) +/// .expect_err("Resource should contain errors."); +/// +/// assert_eq!( +/// errors, +/// vec![ +/// parser::ParserError { +/// pos: 18..19, +/// slice: Some(17..35), +/// kind: parser::ErrorKind::ExpectedToken('=') +/// } +/// ] +/// ); +/// +/// assert_eq!( +/// resource.body[0], +/// ast::Entry::Message( +/// ast::Message { +/// id: ast::Identifier { +/// name: "key1" +/// }, +/// value: Some(ast::Pattern { +/// elements: vec![ +/// ast::PatternElement::TextElement { +/// value: "Value 1" +/// }, +/// ] +/// }), +/// attributes: vec![], +/// comment: None, +/// } +/// ), +/// ); +/// +/// assert_eq!( +/// resource.body[1], +/// ast::Entry::Junk { +/// content: "g@Rb@ge = #2y ds\n\n" +/// } +/// ); +/// +/// assert_eq!( +/// resource.body[2], +/// ast::Entry::Message( +/// ast::Message { +/// id: ast::Identifier { +/// name: "key2" +/// }, +/// value: Some(ast::Pattern { +/// elements: vec![ +/// ast::PatternElement::TextElement { +/// value: "Value 2" +/// }, +/// ] +/// }), +/// attributes: vec![], +/// comment: None, +/// } +/// ), +/// ); +/// ``` +/// +/// The information contained in the `ParserError` should allow the tooling +/// to display rich contextual annotations of the error slice, using +/// crates such as `annotate-snippers`. +#[derive(Error, Debug, PartialEq, Clone)] +#[error("{}", self.kind)] +pub struct ParserError { + /// Precise location of where the parser encountered the error. + pub pos: Range, + /// Slice of the input from the end of the last valid entry to the beginning + /// of the next valid entry with the invalid syntax in the middle. + pub slice: Option>, + /// The type of the error that the parser encountered. + pub kind: ErrorKind, +} + +macro_rules! error { + ($kind:expr, $start:expr) => {{ + Err(ParserError { + pos: $start..$start + 1, + slice: None, + kind: $kind, + }) + }}; + ($kind:expr, $start:expr, $end:expr) => {{ + Err(ParserError { + pos: $start..$end, + slice: None, + kind: $kind, + }) + }}; +} + +/// Kind of an error associated with the [`ParserError`]. +#[derive(Error, Debug, PartialEq, Clone)] +pub enum ErrorKind { + #[error("Expected a token starting with \"{0}\"")] + ExpectedToken(char), + #[error("Expected one of \"{range}\"")] + ExpectedCharRange { range: String }, + #[error("Expected a message field for \"{entry_id}\"")] + ExpectedMessageField { entry_id: String }, + #[error("Expected a term field for \"{entry_id}\"")] + ExpectedTermField { entry_id: String }, + #[error("Callee is not allowed here")] + ForbiddenCallee, + #[error("The select expression must have a default variant")] + MissingDefaultVariant, + #[error("Expected a value")] + MissingValue, + #[error("A select expression can only have one default variant")] + MultipleDefaultVariants, + #[error("Message references can't be used as a selector")] + MessageReferenceAsSelector, + #[error("Term references can't be used as a selector")] + TermReferenceAsSelector, + #[error("Message attributes can't be used as a selector")] + MessageAttributeAsSelector, + #[error("Term attributes can't be used as a selector")] + TermAttributeAsPlaceable, + #[error("Unterminated string literal")] + UnterminatedStringLiteral, + #[error("Positional arguments must come before named arguments")] + PositionalArgumentFollowsNamed, + #[error("The \"{0}\" argument appears twice")] + DuplicatedNamedArgument(String), + #[error("Unknown escape sequence")] + UnknownEscapeSequence(String), + #[error("Invalid unicode escape sequence, \"{0}\"")] + InvalidUnicodeEscapeSequence(String), + #[error("Unbalanced closing brace")] + UnbalancedClosingBrace, + #[error("Expected an inline expression")] + ExpectedInlineExpression, + #[error("Expected a simple expression as selector")] + ExpectedSimpleExpressionAsSelector, + #[error("Expected a string or number literal")] + ExpectedLiteral, +} diff --git a/third_party/rust/fluent-syntax/src/parser/expression.rs b/third_party/rust/fluent-syntax/src/parser/expression.rs new file mode 100644 index 0000000000..c5ccb32bf4 --- /dev/null +++ b/third_party/rust/fluent-syntax/src/parser/expression.rs @@ -0,0 +1,224 @@ +use super::errors::{ErrorKind, ParserError}; +use super::{core::Parser, core::Result, slice::Slice}; +use crate::ast; + +impl<'s, S> Parser +where + S: Slice<'s>, +{ + pub(super) fn get_expression(&mut self) -> Result> { + let exp = self.get_inline_expression(false)?; + + self.skip_blank(); + + if !self.is_current_byte(b'-') || !self.is_byte_at(b'>', self.ptr + 1) { + if let ast::InlineExpression::TermReference { ref attribute, .. } = exp { + if attribute.is_some() { + return error!(ErrorKind::TermAttributeAsPlaceable, self.ptr); + } + } + return Ok(ast::Expression::Inline(exp)); + } + + match exp { + ast::InlineExpression::MessageReference { ref attribute, .. } => { + if attribute.is_none() { + return error!(ErrorKind::MessageReferenceAsSelector, self.ptr); + } else { + return error!(ErrorKind::MessageAttributeAsSelector, self.ptr); + } + } + ast::InlineExpression::TermReference { ref attribute, .. } => { + if attribute.is_none() { + return error!(ErrorKind::TermReferenceAsSelector, self.ptr); + } + } + ast::InlineExpression::StringLiteral { .. } + | ast::InlineExpression::NumberLiteral { .. } + | ast::InlineExpression::VariableReference { .. } + | ast::InlineExpression::FunctionReference { .. } => {} + _ => { + return error!(ErrorKind::ExpectedSimpleExpressionAsSelector, self.ptr); + } + }; + + self.ptr += 2; // -> + + self.skip_blank_inline(); + if !self.skip_eol() { + return error!( + ErrorKind::ExpectedCharRange { + range: "\n | \r\n".to_string() + }, + self.ptr + ); + } + self.skip_blank(); + + let variants = self.get_variants()?; + + Ok(ast::Expression::Select { + selector: exp, + variants, + }) + } + + pub(super) fn get_inline_expression( + &mut self, + only_literal: bool, + ) -> Result> { + match get_current_byte!(self) { + Some(b'"') => { + self.ptr += 1; // " + let start = self.ptr; + while let Some(b) = get_current_byte!(self) { + match b { + b'\\' => match get_byte!(self, self.ptr + 1) { + Some(b'\\') | Some(b'{') | Some(b'"') => self.ptr += 2, + Some(b'u') => { + self.ptr += 2; + self.skip_unicode_escape_sequence(4)?; + } + Some(b'U') => { + self.ptr += 2; + self.skip_unicode_escape_sequence(6)?; + } + b => { + let seq = b.unwrap_or(&b' ').to_string(); + return error!(ErrorKind::UnknownEscapeSequence(seq), self.ptr); + } + }, + b'"' => { + break; + } + b'\n' => { + return error!(ErrorKind::UnterminatedStringLiteral, self.ptr); + } + _ => self.ptr += 1, + } + } + + self.expect_byte(b'"')?; + let slice = self.source.slice(start..self.ptr - 1); + Ok(ast::InlineExpression::StringLiteral { value: slice }) + } + Some(b) if b.is_ascii_digit() => { + let num = self.get_number_literal()?; + Ok(ast::InlineExpression::NumberLiteral { value: num }) + } + Some(b'-') if !only_literal => { + self.ptr += 1; // - + if self.is_identifier_start() { + self.ptr += 1; + let id = self.get_identifier_unchecked(); + let attribute = self.get_attribute_accessor()?; + let arguments = self.get_call_arguments()?; + Ok(ast::InlineExpression::TermReference { + id, + attribute, + arguments, + }) + } else { + self.ptr -= 1; + let num = self.get_number_literal()?; + Ok(ast::InlineExpression::NumberLiteral { value: num }) + } + } + Some(b'$') if !only_literal => { + self.ptr += 1; // $ + let id = self.get_identifier()?; + Ok(ast::InlineExpression::VariableReference { id }) + } + Some(b) if b.is_ascii_alphabetic() => { + self.ptr += 1; + let id = self.get_identifier_unchecked(); + let arguments = self.get_call_arguments()?; + if let Some(arguments) = arguments { + if !Self::is_callee(&id.name) { + return error!(ErrorKind::ForbiddenCallee, self.ptr); + } + + Ok(ast::InlineExpression::FunctionReference { id, arguments }) + } else { + let attribute = self.get_attribute_accessor()?; + Ok(ast::InlineExpression::MessageReference { id, attribute }) + } + } + Some(b'{') if !only_literal => { + self.ptr += 1; // { + let exp = self.get_placeable()?; + Ok(ast::InlineExpression::Placeable { + expression: Box::new(exp), + }) + } + _ if only_literal => error!(ErrorKind::ExpectedLiteral, self.ptr), + _ => error!(ErrorKind::ExpectedInlineExpression, self.ptr), + } + } + + pub fn get_call_arguments(&mut self) -> Result>> { + self.skip_blank(); + if !self.take_byte_if(b'(') { + return Ok(None); + } + + let mut positional = vec![]; + let mut named = vec![]; + let mut argument_names = vec![]; + + self.skip_blank(); + + while self.ptr < self.length { + if self.is_current_byte(b')') { + break; + } + + let expr = self.get_inline_expression(false)?; + + if let ast::InlineExpression::MessageReference { + ref id, + attribute: None, + } = expr + { + self.skip_blank(); + if self.is_current_byte(b':') { + if argument_names.contains(&id.name) { + return error!( + ErrorKind::DuplicatedNamedArgument(id.name.as_ref().to_owned()), + self.ptr + ); + } + self.ptr += 1; + self.skip_blank(); + let val = self.get_inline_expression(true)?; + + argument_names.push(id.name.clone()); + named.push(ast::NamedArgument { + name: ast::Identifier { + name: id.name.clone(), + }, + value: val, + }); + } else { + if !argument_names.is_empty() { + return error!(ErrorKind::PositionalArgumentFollowsNamed, self.ptr); + } + positional.push(expr); + } + } else { + if !argument_names.is_empty() { + return error!(ErrorKind::PositionalArgumentFollowsNamed, self.ptr); + } + positional.push(expr); + } + + self.skip_blank(); + self.take_byte_if(b','); + self.skip_blank(); + } + + self.expect_byte(b')')?; + + Ok(Some(ast::CallArguments { positional, named })) + } +} diff --git a/third_party/rust/fluent-syntax/src/parser/helper.rs b/third_party/rust/fluent-syntax/src/parser/helper.rs new file mode 100644 index 0000000000..11544d6855 --- /dev/null +++ b/third_party/rust/fluent-syntax/src/parser/helper.rs @@ -0,0 +1,169 @@ +use super::errors::{ErrorKind, ParserError}; +use super::{core::Parser, core::Result, slice::Slice}; + +impl<'s, S> Parser +where + S: Slice<'s>, +{ + pub(super) fn is_current_byte(&self, b: u8) -> bool { + get_current_byte!(self) == Some(&b) + } + + pub(super) fn is_byte_at(&self, b: u8, pos: usize) -> bool { + get_byte!(self, pos) == Some(&b) + } + + pub(super) fn skip_to_next_entry_start(&mut self) { + while let Some(b) = get_current_byte!(self) { + let new_line = self.ptr == 0 || get_byte!(self, self.ptr - 1) == Some(&b'\n'); + + if new_line && (b.is_ascii_alphabetic() || [b'-', b'#'].contains(b)) { + break; + } + + self.ptr += 1; + } + } + + pub(super) fn skip_eol(&mut self) -> bool { + match get_current_byte!(self) { + Some(b'\n') => { + self.ptr += 1; + true + } + Some(b'\r') if self.is_byte_at(b'\n', self.ptr + 1) => { + self.ptr += 2; + true + } + _ => false, + } + } + + pub(super) fn skip_unicode_escape_sequence(&mut self, length: usize) -> Result<()> { + let start = self.ptr; + for _ in 0..length { + match get_current_byte!(self) { + Some(b) if b.is_ascii_hexdigit() => self.ptr += 1, + _ => break, + } + } + if self.ptr - start != length { + let end = if self.ptr >= self.length { + self.ptr + } else { + self.ptr + 1 + }; + let seq = self.source.slice(start..end).as_ref().to_owned(); + return error!(ErrorKind::InvalidUnicodeEscapeSequence(seq), self.ptr); + } + Ok(()) + } + + pub(super) fn is_identifier_start(&self) -> bool { + matches!(get_current_byte!(self), Some(b) if b.is_ascii_alphabetic()) + } + + pub(super) fn take_byte_if(&mut self, b: u8) -> bool { + if self.is_current_byte(b) { + self.ptr += 1; + true + } else { + false + } + } + + pub(super) fn skip_blank_block(&mut self) -> usize { + let mut count = 0; + loop { + let start = self.ptr; + self.skip_blank_inline(); + if !self.skip_eol() { + self.ptr = start; + break; + } + count += 1; + } + count + } + + pub(super) fn skip_blank(&mut self) { + loop { + match get_current_byte!(self) { + Some(b' ') | Some(b'\n') => self.ptr += 1, + Some(b'\r') if get_byte!(self, self.ptr + 1) == Some(&b'\n') => self.ptr += 2, + _ => break, + } + } + } + + pub(super) fn skip_blank_inline(&mut self) -> usize { + let start = self.ptr; + while let Some(b' ') = get_current_byte!(self) { + self.ptr += 1; + } + self.ptr - start + } + + pub(super) fn is_byte_pattern_continuation(b: u8) -> bool { + !matches!(b, b'.' | b'}' | b'[' | b'*') + } + + pub(super) fn is_callee(name: &S) -> bool { + name.as_ref() + .as_bytes() + .iter() + .all(|c| c.is_ascii_uppercase() || c.is_ascii_digit() || *c == b'_' || *c == b'-') + } + + pub(super) fn expect_byte(&mut self, b: u8) -> Result<()> { + if !self.is_current_byte(b) { + return error!(ErrorKind::ExpectedToken(b as char), self.ptr); + } + self.ptr += 1; + Ok(()) + } + + pub(super) fn is_number_start(&self) -> bool { + matches!(get_current_byte!(self), Some(b) if b.is_ascii_digit() || b == &b'-') + } + + pub(super) fn is_eol(&self) -> bool { + match get_current_byte!(self) { + Some(b'\n') => true, + Some(b'\r') if self.is_byte_at(b'\n', self.ptr + 1) => true, + None => true, + _ => false, + } + } + + pub(super) fn skip_digits(&mut self) -> Result<()> { + let start = self.ptr; + loop { + match get_current_byte!(self) { + Some(b) if b.is_ascii_digit() => self.ptr += 1, + _ => break, + } + } + if start == self.ptr { + error!( + ErrorKind::ExpectedCharRange { + range: "0-9".to_string() + }, + self.ptr + ) + } else { + Ok(()) + } + } + + pub(super) fn get_number_literal(&mut self) -> Result { + let start = self.ptr; + self.take_byte_if(b'-'); + self.skip_digits()?; + if self.take_byte_if(b'.') { + self.skip_digits()?; + } + + Ok(self.source.slice(start..self.ptr)) + } +} diff --git a/third_party/rust/fluent-syntax/src/parser/macros.rs b/third_party/rust/fluent-syntax/src/parser/macros.rs new file mode 100644 index 0000000000..671d543285 --- /dev/null +++ b/third_party/rust/fluent-syntax/src/parser/macros.rs @@ -0,0 +1,11 @@ +macro_rules! get_byte { + ($s:expr, $idx:expr) => { + $s.source.as_ref().as_bytes().get($idx) + }; +} + +macro_rules! get_current_byte { + ($s:expr) => { + $s.source.as_ref().as_bytes().get($s.ptr) + }; +} diff --git a/third_party/rust/fluent-syntax/src/parser/mod.rs b/third_party/rust/fluent-syntax/src/parser/mod.rs new file mode 100644 index 0000000000..52edfdc37a --- /dev/null +++ b/third_party/rust/fluent-syntax/src/parser/mod.rs @@ -0,0 +1,278 @@ +//! Fluent Translation List parsing utilities +//! +//! FTL resources can be parsed using one of two methods: +//! * [`parse`] - parses an input into a complete Abstract Syntax Tree representation with all source information preserved. +//! * [`parse_runtime`] - parses an input into a runtime optimized Abstract Syntax Tree +//! representation with comments stripped. +//! +//! # Example +//! +//! ``` +//! use fluent_syntax::parser; +//! use fluent_syntax::ast; +//! +//! let ftl = r#" +//! #### Resource Level Comment +//! +//! ## This is a message comment +//! hello-world = Hello World! +//! +//! "#; +//! +//! let resource = parser::parse(ftl) +//! .expect("Failed to parse an FTL resource."); +//! +//! assert_eq!( +//! resource.body[0], +//! ast::Entry::ResourceComment( +//! ast::Comment { +//! content: vec![ +//! "Resource Level Comment" +//! ] +//! } +//! ) +//! ); +//! assert_eq!( +//! resource.body[1], +//! ast::Entry::Message( +//! ast::Message { +//! id: ast::Identifier { +//! name: "hello-world" +//! }, +//! value: Some(ast::Pattern { +//! elements: vec![ +//! ast::PatternElement::TextElement { +//! value: "Hello World!" +//! }, +//! ] +//! }), +//! attributes: vec![], +//! comment: Some( +//! ast::Comment { +//! content: vec!["This is a message comment"] +//! } +//! ) +//! } +//! ), +//! ); +//! ``` +//! +//! # Error Recovery +//! +//! In both modes the parser is lenient, attempting to recover from errors. +//! +//! The [`Result`] return the resulting AST in both scenarios, and in the +//! error scenario a vector of [`ParserError`] elements is returned as well. +//! +//! Any unparsed parts of the input are returned as [`ast::Entry::Junk`] elements. +#[macro_use] +mod errors; +#[macro_use] +mod macros; +mod comment; +mod core; +mod expression; +mod helper; +mod pattern; +mod runtime; +mod slice; + +use crate::ast; +pub use errors::{ErrorKind, ParserError}; +pub use slice::Slice; + +/// Parser result always returns an AST representation of the input, +/// and if parsing errors were encountered, a list of [`ParserError`] elements +/// is also returned. +/// +/// # Example +/// +/// ``` +/// use fluent_syntax::parser; +/// use fluent_syntax::ast; +/// +/// let ftl = r#" +/// key1 = Value 1 +/// +/// g@Rb@ge = #2y ds +/// +/// key2 = Value 2 +/// +/// "#; +/// +/// let (resource, errors) = parser::parse_runtime(ftl) +/// .expect_err("Resource should contain errors."); +/// +/// assert_eq!( +/// errors, +/// vec![ +/// parser::ParserError { +/// pos: 18..19, +/// slice: Some(17..35), +/// kind: parser::ErrorKind::ExpectedToken('=') +/// } +/// ] +/// ); +/// +/// assert_eq!( +/// resource.body[0], +/// ast::Entry::Message( +/// ast::Message { +/// id: ast::Identifier { +/// name: "key1" +/// }, +/// value: Some(ast::Pattern { +/// elements: vec![ +/// ast::PatternElement::TextElement { +/// value: "Value 1" +/// }, +/// ] +/// }), +/// attributes: vec![], +/// comment: None, +/// } +/// ), +/// ); +/// +/// assert_eq!( +/// resource.body[1], +/// ast::Entry::Junk { +/// content: "g@Rb@ge = #2y ds\n\n" +/// } +/// ); +/// +/// assert_eq!( +/// resource.body[2], +/// ast::Entry::Message( +/// ast::Message { +/// id: ast::Identifier { +/// name: "key2" +/// }, +/// value: Some(ast::Pattern { +/// elements: vec![ +/// ast::PatternElement::TextElement { +/// value: "Value 2" +/// }, +/// ] +/// }), +/// attributes: vec![], +/// comment: None, +/// } +/// ), +/// ); +/// ``` +pub type Result = std::result::Result, (ast::Resource, Vec)>; + +/// Parses an input into a complete Abstract Syntax Tree representation with +/// all source information preserved. +/// +/// This mode is intended for tooling, linters and other scenarios where +/// complete representation, with comments, is preferred over speed and memory +/// utilization. +/// +/// # Example +/// +/// ``` +/// use fluent_syntax::parser; +/// use fluent_syntax::ast; +/// +/// let ftl = r#" +/// #### Resource Level Comment +/// +/// ## This is a message comment +/// hello-world = Hello World! +/// +/// "#; +/// +/// let resource = parser::parse(ftl) +/// .expect("Failed to parse an FTL resource."); +/// +/// assert_eq!( +/// resource.body[0], +/// ast::Entry::ResourceComment( +/// ast::Comment { +/// content: vec![ +/// "Resource Level Comment" +/// ] +/// } +/// ) +/// ); +/// assert_eq!( +/// resource.body[1], +/// ast::Entry::Message( +/// ast::Message { +/// id: ast::Identifier { +/// name: "hello-world" +/// }, +/// value: Some(ast::Pattern { +/// elements: vec![ +/// ast::PatternElement::TextElement { +/// value: "Hello World!" +/// }, +/// ] +/// }), +/// attributes: vec![], +/// comment: Some( +/// ast::Comment { +/// content: vec!["This is a message comment"] +/// } +/// ) +/// } +/// ), +/// ); +/// ``` +pub fn parse<'s, S>(input: S) -> Result +where + S: Slice<'s>, +{ + core::Parser::new(input).parse() +} + +/// Parses an input into an Abstract Syntax Tree representation with comments stripped. +/// +/// This mode is intended for runtime use of Fluent. It currently strips all +/// comments improving parsing performance and reducing the size of the AST tree. +/// +/// # Example +/// +/// ``` +/// use fluent_syntax::parser; +/// use fluent_syntax::ast; +/// +/// let ftl = r#" +/// #### Resource Level Comment +/// +/// ## This is a message comment +/// hello-world = Hello World! +/// +/// "#; +/// +/// let resource = parser::parse_runtime(ftl) +/// .expect("Failed to parse an FTL resource."); +/// +/// assert_eq!( +/// resource.body[0], +/// ast::Entry::Message( +/// ast::Message { +/// id: ast::Identifier { +/// name: "hello-world" +/// }, +/// value: Some(ast::Pattern { +/// elements: vec![ +/// ast::PatternElement::TextElement { +/// value: "Hello World!" +/// }, +/// ] +/// }), +/// attributes: vec![], +/// comment: None, +/// } +/// ), +/// ); +/// ``` +pub fn parse_runtime<'s, S>(input: S) -> Result +where + S: Slice<'s>, +{ + core::Parser::new(input).parse_runtime() +} diff --git a/third_party/rust/fluent-syntax/src/parser/pattern.rs b/third_party/rust/fluent-syntax/src/parser/pattern.rs new file mode 100644 index 0000000000..516326d761 --- /dev/null +++ b/third_party/rust/fluent-syntax/src/parser/pattern.rs @@ -0,0 +1,207 @@ +use super::errors::{ErrorKind, ParserError}; +use super::{core::Parser, core::Result, slice::Slice}; +use crate::ast; + +#[derive(Debug, PartialEq)] +enum TextElementTermination { + LineFeed, + CRLF, + PlaceableStart, + EOF, +} + +// This enum tracks the placement of the text element in the pattern, which is needed for +// dedentation logic. +#[derive(Debug, PartialEq)] +enum TextElementPosition { + InitialLineStart, + LineStart, + Continuation, +} + +// This enum allows us to mark pointers in the source which will later become text elements +// but without slicing them out of the source string. This makes the indentation adjustments +// cheaper since they'll happen on the pointers, rather than extracted slices. +#[derive(Debug)] +enum PatternElementPlaceholders { + Placeable(ast::Expression), + // (start, end, indent, position) + TextElement(usize, usize, usize, TextElementPosition), +} + +// This enum tracks whether the text element is blank or not. +// This is important to identify text elements which should not be taken into account +// when calculating common indent. +#[derive(Debug, PartialEq)] +enum TextElementType { + Blank, + NonBlank, +} + +impl<'s, S> Parser +where + S: Slice<'s>, +{ + pub(super) fn get_pattern(&mut self) -> Result>> { + let mut elements = vec![]; + let mut last_non_blank = None; + let mut common_indent = None; + + self.skip_blank_inline(); + + let mut text_element_role = if self.skip_eol() { + self.skip_blank_block(); + TextElementPosition::LineStart + } else { + TextElementPosition::InitialLineStart + }; + + while self.ptr < self.length { + if self.take_byte_if(b'{') { + if text_element_role == TextElementPosition::LineStart { + common_indent = Some(0); + } + let exp = self.get_placeable()?; + last_non_blank = Some(elements.len()); + elements.push(PatternElementPlaceholders::Placeable(exp)); + text_element_role = TextElementPosition::Continuation; + } else { + let slice_start = self.ptr; + let mut indent = 0; + if text_element_role == TextElementPosition::LineStart { + indent = self.skip_blank_inline(); + if let Some(b) = get_current_byte!(self) { + if indent == 0 { + if b != &b'\r' && b != &b'\n' { + break; + } + } else if !Self::is_byte_pattern_continuation(*b) { + self.ptr = slice_start; + break; + } + } else { + break; + } + } + let (start, end, text_element_type, termination_reason) = self.get_text_slice()?; + if start != end { + if text_element_role == TextElementPosition::LineStart + && text_element_type == TextElementType::NonBlank + { + if let Some(common) = common_indent { + if indent < common { + common_indent = Some(indent); + } + } else { + common_indent = Some(indent); + } + } + if text_element_role != TextElementPosition::LineStart + || text_element_type == TextElementType::NonBlank + || termination_reason == TextElementTermination::LineFeed + { + if text_element_type == TextElementType::NonBlank { + last_non_blank = Some(elements.len()); + } + elements.push(PatternElementPlaceholders::TextElement( + slice_start, + end, + indent, + text_element_role, + )); + } + } + + text_element_role = match termination_reason { + TextElementTermination::LineFeed => TextElementPosition::LineStart, + TextElementTermination::CRLF => TextElementPosition::LineStart, + TextElementTermination::PlaceableStart => TextElementPosition::Continuation, + TextElementTermination::EOF => TextElementPosition::Continuation, + }; + } + } + + if let Some(last_non_blank) = last_non_blank { + let elements = elements + .into_iter() + .take(last_non_blank + 1) + .enumerate() + .map(|(i, elem)| match elem { + PatternElementPlaceholders::Placeable(expression) => { + ast::PatternElement::Placeable { expression } + } + PatternElementPlaceholders::TextElement(start, end, indent, role) => { + let start = if role == TextElementPosition::LineStart { + common_indent.map_or_else( + || start + indent, + |common_indent| start + std::cmp::min(indent, common_indent), + ) + } else { + start + }; + let mut value = self.source.slice(start..end); + if last_non_blank == i { + value.trim(); + } + ast::PatternElement::TextElement { value } + } + }) + .collect(); + return Ok(Some(ast::Pattern { elements })); + } + + Ok(None) + } + + fn get_text_slice( + &mut self, + ) -> Result<(usize, usize, TextElementType, TextElementTermination)> { + let start_pos = self.ptr; + let mut text_element_type = TextElementType::Blank; + + while let Some(b) = get_current_byte!(self) { + match b { + b' ' => self.ptr += 1, + b'\n' => { + self.ptr += 1; + return Ok(( + start_pos, + self.ptr, + text_element_type, + TextElementTermination::LineFeed, + )); + } + b'\r' if self.is_byte_at(b'\n', self.ptr + 1) => { + self.ptr += 1; + return Ok(( + start_pos, + self.ptr - 1, + text_element_type, + TextElementTermination::CRLF, + )); + } + b'{' => { + return Ok(( + start_pos, + self.ptr, + text_element_type, + TextElementTermination::PlaceableStart, + )); + } + b'}' => { + return error!(ErrorKind::UnbalancedClosingBrace, self.ptr); + } + _ => { + text_element_type = TextElementType::NonBlank; + self.ptr += 1 + } + } + } + Ok(( + start_pos, + self.ptr, + text_element_type, + TextElementTermination::EOF, + )) + } +} diff --git a/third_party/rust/fluent-syntax/src/parser/runtime.rs b/third_party/rust/fluent-syntax/src/parser/runtime.rs new file mode 100644 index 0000000000..e116ceaeed --- /dev/null +++ b/third_party/rust/fluent-syntax/src/parser/runtime.rs @@ -0,0 +1,61 @@ +use super::{ + core::{Parser, Result}, + errors::ParserError, + slice::Slice, +}; +use crate::ast; + +impl<'s, S> Parser +where + S: Slice<'s>, +{ + pub fn parse_runtime( + mut self, + ) -> std::result::Result, (ast::Resource, Vec)> { + let mut errors = vec![]; + + // That default allocation gives the lowest + // number of instructions and cycles in ioi. + let mut body = Vec::with_capacity(6); + + self.skip_blank_block(); + + while self.ptr < self.length { + let entry_start = self.ptr; + let entry = self.get_entry_runtime(entry_start); + + match entry { + Ok(Some(entry)) => { + body.push(entry); + } + Ok(None) => {} + Err(mut err) => { + self.skip_to_next_entry_start(); + err.slice = Some(entry_start..self.ptr); + errors.push(err); + let content = self.source.slice(entry_start..self.ptr); + body.push(ast::Entry::Junk { content }); + } + } + self.skip_blank_block(); + } + + if errors.is_empty() { + Ok(ast::Resource { body }) + } else { + Err((ast::Resource { body }, errors)) + } + } + + fn get_entry_runtime(&mut self, entry_start: usize) -> Result>> { + let entry = match get_current_byte!(self) { + Some(b'#') => { + self.skip_comment(); + None + } + Some(b'-') => Some(ast::Entry::Term(self.get_term(entry_start)?)), + _ => Some(ast::Entry::Message(self.get_message(entry_start)?)), + }; + Ok(entry) + } +} diff --git a/third_party/rust/fluent-syntax/src/parser/slice.rs b/third_party/rust/fluent-syntax/src/parser/slice.rs new file mode 100644 index 0000000000..d44f8251fe --- /dev/null +++ b/third_party/rust/fluent-syntax/src/parser/slice.rs @@ -0,0 +1,25 @@ +use std::ops::Range; +pub trait Slice<'s>: AsRef + Clone + PartialEq { + fn slice(&self, range: Range) -> Self; + fn trim(&mut self); +} + +impl<'s> Slice<'s> for String { + fn slice(&self, range: Range) -> Self { + self[range].to_string() + } + + fn trim(&mut self) { + *self = self.trim_end().to_string(); + } +} + +impl<'s> Slice<'s> for &'s str { + fn slice(&self, range: Range) -> Self { + &self[range] + } + + fn trim(&mut self) { + *self = self.trim_end(); + } +} -- cgit v1.2.3