Diffstat (limited to 'third_party/rust/cssparser/src/tokenizer.rs')
 -rw-r--r--  third_party/rust/cssparser/src/tokenizer.rs | 1397 ++++++++++++++
 1 file changed, 1397 insertions(+), 0 deletions(-)
diff --git a/third_party/rust/cssparser/src/tokenizer.rs b/third_party/rust/cssparser/src/tokenizer.rs
new file mode 100644
index 0000000000..62f3868362
--- /dev/null
+++ b/third_party/rust/cssparser/src/tokenizer.rs
@@ -0,0 +1,1397 @@

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// https://drafts.csswg.org/css-syntax/#tokenization

use self::Token::*;
use crate::cow_rc_str::CowRcStr;
use crate::parser::ParserState;
use matches::matches;
use std::char;
use std::i32;
use std::ops::Range;

/// One of the pieces the CSS input is broken into.
///
/// Some components use `Cow` in order to borrow from the original input string
/// and avoid allocating/copying when possible.
#[derive(PartialEq, Debug, Clone)]
pub enum Token<'a> {
    /// An [`<ident-token>`](https://drafts.csswg.org/css-syntax/#ident-token-diagram)
    Ident(CowRcStr<'a>),

    /// An [`<at-keyword-token>`](https://drafts.csswg.org/css-syntax/#at-keyword-token-diagram)
    ///
    /// The value does not include the `@` marker.
    AtKeyword(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "unrestricted"
    ///
    /// The value does not include the `#` marker.
    Hash(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "id"
    ///
    /// The value does not include the `#` marker.
    IDHash(CowRcStr<'a>), // Hash that is a valid ID selector.

    /// A [`<string-token>`](https://drafts.csswg.org/css-syntax/#string-token-diagram)
    ///
    /// The value does not include the quotes.
    QuotedString(CowRcStr<'a>),

    /// A [`<url-token>`](https://drafts.csswg.org/css-syntax/#url-token-diagram)
    ///
    /// The value does not include the `url(` `)` markers. Note that `url( <string-token> )`
    /// is represented by a `Function` token.
    UnquotedUrl(CowRcStr<'a>),

    /// A `<delim-token>`
    Delim(char),

    /// A [`<number-token>`](https://drafts.csswg.org/css-syntax/#number-token-diagram)
    Number {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases like the <An+B> micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float.
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,
    },

    /// A [`<percentage-token>`](https://drafts.csswg.org/css-syntax/#percentage-token-diagram)
    Percentage {
        /// Whether the number had a `+` or `-` sign.
        has_sign: bool,

        /// The value as a float, divided by 100 so that the nominal range is 0.0 to 1.0.
        unit_value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        /// It is **not** divided by 100.
        int_value: Option<i32>,
    },

    /// A [`<dimension-token>`](https://drafts.csswg.org/css-syntax/#dimension-token-diagram)
    Dimension {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases like the <An+B> micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float.
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,

        /// The unit, e.g. "px" in `12px`.
        unit: CowRcStr<'a>,
    },

    /// A [`<whitespace-token>`](https://drafts.csswg.org/css-syntax/#whitespace-token-diagram)
    WhiteSpace(&'a str),

    /// A comment.
    ///
    /// The CSS Syntax spec does not generate tokens for comments,
    /// but we do, because we can (a borrowed `&str` makes it cheap).
    ///
    /// The value does not include the `/*` `*/` markers.
    Comment(&'a str),

    /// A `:` `<colon-token>`
    Colon, // :

    /// A `;` `<semicolon-token>`
    Semicolon, // ;

    /// A `,` `<comma-token>`
    Comma, // ,

    /// A `~=` [`<include-match-token>`](https://drafts.csswg.org/css-syntax/#include-match-token-diagram)
    IncludeMatch,

    /// A `|=` [`<dash-match-token>`](https://drafts.csswg.org/css-syntax/#dash-match-token-diagram)
    DashMatch,

    /// A `^=` [`<prefix-match-token>`](https://drafts.csswg.org/css-syntax/#prefix-match-token-diagram)
    PrefixMatch,

    /// A `$=` [`<suffix-match-token>`](https://drafts.csswg.org/css-syntax/#suffix-match-token-diagram)
    SuffixMatch,

    /// A `*=` [`<substring-match-token>`](https://drafts.csswg.org/css-syntax/#substring-match-token-diagram)
    SubstringMatch,

    /// A `<!--` [`<CDO-token>`](https://drafts.csswg.org/css-syntax/#CDO-token-diagram)
    CDO,

    /// A `-->` [`<CDC-token>`](https://drafts.csswg.org/css-syntax/#CDC-token-diagram)
    CDC,

    /// A [`<function-token>`](https://drafts.csswg.org/css-syntax/#function-token-diagram)
    ///
    /// The value (name) does not include the `(` marker.
    Function(CowRcStr<'a>),

    /// A `<(-token>`
    ParenthesisBlock,

    /// A `<[-token>`
    SquareBracketBlock,

    /// A `<{-token>`
    CurlyBracketBlock,

    /// A `<bad-url-token>`
    ///
    /// This token always indicates a parse error.
    BadUrl(CowRcStr<'a>),

    /// A `<bad-string-token>`
    ///
    /// This token always indicates a parse error.
    BadString(CowRcStr<'a>),

    /// A `<)-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseParenthesis,

    /// A `<]-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseSquareBracket,

    /// A `<}-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseCurlyBracket,
}
"px" in `12px` + unit: CowRcStr<'a>, + }, + + /// A [`<whitespace-token>`](https://drafts.csswg.org/css-syntax/#whitespace-token-diagram) + WhiteSpace(&'a str), + + /// A comment. + /// + /// The CSS Syntax spec does not generate tokens for comments, + /// But we do, because we can (borrowed &str makes it cheap). + /// + /// The value does not include the `/*` `*/` markers. + Comment(&'a str), + + /// A `:` `<colon-token>` + Colon, // : + + /// A `;` `<semicolon-token>` + Semicolon, // ; + + /// A `,` `<comma-token>` + Comma, // , + + /// A `~=` [`<include-match-token>`](https://drafts.csswg.org/css-syntax/#include-match-token-diagram) + IncludeMatch, + + /// A `|=` [`<dash-match-token>`](https://drafts.csswg.org/css-syntax/#dash-match-token-diagram) + DashMatch, + + /// A `^=` [`<prefix-match-token>`](https://drafts.csswg.org/css-syntax/#prefix-match-token-diagram) + PrefixMatch, + + /// A `$=` [`<suffix-match-token>`](https://drafts.csswg.org/css-syntax/#suffix-match-token-diagram) + SuffixMatch, + + /// A `*=` [`<substring-match-token>`](https://drafts.csswg.org/css-syntax/#substring-match-token-diagram) + SubstringMatch, + + /// A `<!--` [`<CDO-token>`](https://drafts.csswg.org/css-syntax/#CDO-token-diagram) + CDO, + + /// A `-->` [`<CDC-token>`](https://drafts.csswg.org/css-syntax/#CDC-token-diagram) + CDC, + + /// A [`<function-token>`](https://drafts.csswg.org/css-syntax/#function-token-diagram) + /// + /// The value (name) does not include the `(` marker. + Function(CowRcStr<'a>), + + /// A `<(-token>` + ParenthesisBlock, + + /// A `<[-token>` + SquareBracketBlock, + + /// A `<{-token>` + CurlyBracketBlock, + + /// A `<bad-url-token>` + /// + /// This token always indicates a parse error. + BadUrl(CowRcStr<'a>), + + /// A `<bad-string-token>` + /// + /// This token always indicates a parse error. + BadString(CowRcStr<'a>), + + /// A `<)-token>` + /// + /// When obtained from one of the `Parser::next*` methods, + /// this token is always unmatched and indicates a parse error. + CloseParenthesis, + + /// A `<]-token>` + /// + /// When obtained from one of the `Parser::next*` methods, + /// this token is always unmatched and indicates a parse error. + CloseSquareBracket, + + /// A `<}-token>` + /// + /// When obtained from one of the `Parser::next*` methods, + /// this token is always unmatched and indicates a parse error. + CloseCurlyBracket, +} + +impl<'a> Token<'a> { + /// Return whether this token represents a parse error. + /// + /// `BadUrl` and `BadString` are tokenizer-level parse errors. + /// + /// `CloseParenthesis`, `CloseSquareBracket`, and `CloseCurlyBracket` are *unmatched* + /// and therefore parse errors when returned by one of the `Parser::next*` methods. + pub fn is_parse_error(&self) -> bool { + matches!( + *self, + BadUrl(_) | BadString(_) | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket + ) + } +} + +#[derive(Clone)] +pub struct Tokenizer<'a> { + input: &'a str, + /// Counted in bytes, not code points. From 0. + position: usize, + /// The position at the start of the current line; but adjusted to + /// ensure that computing the column will give the result in units + /// of UTF-16 characters. 
#[derive(Clone)]
pub struct Tokenizer<'a> {
    input: &'a str,
    /// Counted in bytes, not code points. From 0.
    position: usize,
    /// The position at the start of the current line; but adjusted to
    /// ensure that computing the column will give the result in units
    /// of UTF-16 characters.
    current_line_start_position: usize,
    current_line_number: u32,
    var_or_env_functions: SeenStatus,
    source_map_url: Option<&'a str>,
    source_url: Option<&'a str>,
}

#[derive(Copy, Clone, PartialEq, Eq)]
enum SeenStatus {
    DontCare,
    LookingForThem,
    SeenAtLeastOne,
}

impl<'a> Tokenizer<'a> {
    #[inline]
    pub fn new(input: &str) -> Tokenizer {
        Tokenizer::with_first_line_number(input, 0)
    }

    #[inline]
    pub fn with_first_line_number(input: &str, first_line_number: u32) -> Tokenizer {
        Tokenizer {
            input: input,
            position: 0,
            current_line_start_position: 0,
            current_line_number: first_line_number,
            var_or_env_functions: SeenStatus::DontCare,
            source_map_url: None,
            source_url: None,
        }
    }

    #[inline]
    pub fn look_for_var_or_env_functions(&mut self) {
        self.var_or_env_functions = SeenStatus::LookingForThem;
    }

    #[inline]
    pub fn seen_var_or_env_functions(&mut self) -> bool {
        let seen = self.var_or_env_functions == SeenStatus::SeenAtLeastOne;
        self.var_or_env_functions = SeenStatus::DontCare;
        seen
    }

    #[inline]
    pub fn see_function(&mut self, name: &str) {
        if self.var_or_env_functions == SeenStatus::LookingForThem {
            if name.eq_ignore_ascii_case("var") || name.eq_ignore_ascii_case("env") {
                self.var_or_env_functions = SeenStatus::SeenAtLeastOne;
            }
        }
    }

    #[inline]
    pub fn next(&mut self) -> Result<Token<'a>, ()> {
        next_token(self)
    }

    #[inline]
    pub fn position(&self) -> SourcePosition {
        SourcePosition(self.position)
    }

    #[inline]
    pub fn current_source_location(&self) -> SourceLocation {
        SourceLocation {
            line: self.current_line_number,
            column: (self.position - self.current_line_start_position + 1) as u32,
        }
    }

    #[inline]
    pub fn current_source_map_url(&self) -> Option<&'a str> {
        self.source_map_url
    }

    #[inline]
    pub fn current_source_url(&self) -> Option<&'a str> {
        self.source_url
    }

    #[inline]
    pub fn state(&self) -> ParserState {
        ParserState {
            position: self.position,
            current_line_start_position: self.current_line_start_position,
            current_line_number: self.current_line_number,
            at_start_of: None,
        }
    }

    #[inline]
    pub fn reset(&mut self, state: &ParserState) {
        self.position = state.position;
        self.current_line_start_position = state.current_line_start_position;
        self.current_line_number = state.current_line_number;
    }

    #[inline]
    pub fn slice_from(&self, start_pos: SourcePosition) -> &'a str {
        &self.input[start_pos.0..self.position]
    }

    #[inline]
    pub fn slice(&self, range: Range<SourcePosition>) -> &'a str {
        &self.input[range.start.0..range.end.0]
    }

    pub fn current_source_line(&self) -> &'a str {
        let current = self.position;
        let start = self.input[0..current]
            .rfind(|c| matches!(c, '\r' | '\n' | '\x0C'))
            .map_or(0, |start| start + 1);
        let end = self.input[current..]
            .find(|c| matches!(c, '\r' | '\n' | '\x0C'))
            .map_or(self.input.len(), |end| current + end);
        &self.input[start..end]
    }
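The var()/env() bookkeeping above exists so a caller can make one lookahead pass over a declaration value, ask whether a var() or env() function was seen, and then rewind with state()/reset(). A minimal sketch of that flow, assuming the items of this module are in scope (the helper name and input are illustrative assumptions, not part of this file):

// Hypothetical sketch (not part of tokenizer.rs): detect var()/env()
// in a value, then rewind the way Parser does after a lookahead pass.
fn value_uses_var(css: &str) -> bool {
    let mut tokenizer = Tokenizer::new(css);
    tokenizer.look_for_var_or_env_functions();
    let start = tokenizer.state();
    // Consume everything; Function tokens call see_function() internally.
    while tokenizer.next().is_ok() {}
    let seen = tokenizer.seen_var_or_env_functions();
    tokenizer.reset(&start); // rewind to where the pass started
    seen
}
// value_uses_var("var(--x, 1px)") == true; value_uses_var("1px") == false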
    #[inline]
    pub fn next_byte(&self) -> Option<u8> {
        if self.is_eof() {
            None
        } else {
            Some(self.input.as_bytes()[self.position])
        }
    }

    // If false, `tokenizer.next_char()` will not panic.
    #[inline]
    fn is_eof(&self) -> bool {
        !self.has_at_least(0)
    }

    // If true, the input has at least `n` bytes left *after* the current one.
    // That is, `tokenizer.char_at(n)` will not panic.
    #[inline]
    fn has_at_least(&self, n: usize) -> bool {
        self.position + n < self.input.len()
    }

    // Advance over N bytes in the input. This function can advance
    // over ASCII bytes (excluding newlines), or UTF-8 sequence
    // leaders (excluding leaders for 4-byte sequences).
    #[inline]
    pub fn advance(&mut self, n: usize) {
        if cfg!(debug_assertions) {
            // Each byte must either be an ASCII byte or a sequence
            // leader, but not a 4-byte leader; also newlines are
            // rejected.
            for i in 0..n {
                let b = self.byte_at(i);
                debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
                debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
            }
        }
        self.position += n
    }

    // Assumes non-EOF
    #[inline]
    fn next_byte_unchecked(&self) -> u8 {
        self.byte_at(0)
    }

    #[inline]
    fn byte_at(&self, offset: usize) -> u8 {
        self.input.as_bytes()[self.position + offset]
    }

    // Advance over a single byte; the byte must be a UTF-8 sequence
    // leader for a 4-byte sequence.
    #[inline]
    fn consume_4byte_intro(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
        // This takes two UTF-16 characters to represent, so we
        // actually have an undercount.
        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        self.position += 1;
    }

    // Advance over a single byte; the byte must be a UTF-8
    // continuation byte.
    #[inline]
    fn consume_continuation_byte(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
        // Continuation bytes contribute to column overcount. Note
        // that due to the special case for the 4-byte sequence intro,
        // we must use wrapping add here.
        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        self.position += 1;
    }

    // Advance over any kind of byte, excluding newlines.
    #[inline(never)]
    fn consume_known_byte(&mut self, byte: u8) {
        debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
        self.position += 1;
        // Continuation bytes contribute to column overcount.
        if byte & 0xF0 == 0xF0 {
            // This takes two UTF-16 characters to represent, so we
            // actually have an undercount.
            self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        } else if byte & 0xC0 == 0x80 {
            // Note that due to the special case for the 4-byte
            // sequence intro, we must use wrapping add here.
            self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        }
    }
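The wrapping adjustments above implement UTF-16 column counting over a UTF-8 buffer: each char moves `position` forward by its UTF-8 length, and `current_line_start_position` absorbs the difference between that length and the char's UTF-16 length (+1 per continuation byte, -1 per 4-byte intro, net bytes minus UTF-16 units). A worked check as a hypothetical test, not part of this file: "é" is 2 bytes but 1 UTF-16 unit, "😀" is 4 bytes but 2 units.

// Hypothetical check (not part of this file): columns are reported in
// UTF-16 code units, starting at 1.
fn utf16_columns() {
    let mut t = Tokenizer::new("é😀|");
    while t.next().is_ok() {}
    // Position is 7 bytes; the start-of-line offset was bumped by
    // (2-1) + (4-2) = 3, so the column is 7 - 3 + 1 = 5.
    assert_eq!(t.current_source_location().column, 5);
}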
    #[inline]
    fn next_char(&self) -> char {
        self.input[self.position..].chars().next().unwrap()
    }

    // Given that a newline has been seen, advance over the newline
    // and update the state.
    #[inline]
    fn consume_newline(&mut self) {
        let byte = self.next_byte_unchecked();
        debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
        self.position += 1;
        if byte == b'\r' && self.next_byte() == Some(b'\n') {
            self.position += 1;
        }
        self.current_line_start_position = self.position;
        self.current_line_number += 1;
    }

    #[inline]
    fn has_newline_at(&self, offset: usize) -> bool {
        self.position + offset < self.input.len()
            && matches!(self.byte_at(offset), b'\n' | b'\r' | b'\x0C')
    }

    #[inline]
    fn consume_char(&mut self) -> char {
        let c = self.next_char();
        let len_utf8 = c.len_utf8();
        self.position += len_utf8;
        // Note that due to the special case for the 4-byte sequence
        // intro, we must use wrapping add here.
        self.current_line_start_position = self
            .current_line_start_position
            .wrapping_add(len_utf8 - c.len_utf16());
        c
    }

    #[inline]
    fn starts_with(&self, needle: &[u8]) -> bool {
        self.input.as_bytes()[self.position..].starts_with(needle)
    }

    pub fn skip_whitespace(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                _ => {
                    return
                }
            }
        }
    }

    pub fn skip_cdc_and_cdo(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                b'<' => {
                    if self.starts_with(b"<!--") {
                        self.advance(4)
                    } else {
                        return
                    }
                }
                b'-' => {
                    if self.starts_with(b"-->") {
                        self.advance(3)
                    } else {
                        return
                    }
                }
                _ => {
                    return
                }
            }
        }
    }
}

/// A position from the start of the input, counted in UTF-8 bytes.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
pub struct SourcePosition(pub(crate) usize);

impl SourcePosition {
    /// Returns the current byte index in the original input.
    #[inline]
    pub fn byte_index(&self) -> usize {
        self.0
    }
}

/// The line and column number for a given position within the input.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct SourceLocation {
    /// The line number, starting at 0 for the first line, unless `with_first_line_number` was used.
    pub line: u32,

    /// The column number within a line, starting at 1 for the first character of the line.
    /// Column numbers are counted in UTF-16 code units.
    pub column: u32,
}
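skip_cdc_and_cdo is what lets a top-level stylesheet tolerate HTML comment markers around rules: it eats `<!--` and `-->` like whitespace and comments. A small illustrative sketch (the helper name and input are assumptions, not part of this file):

// Hypothetical sketch (not part of this file):
fn first_token_after_cdo(css: &str) -> Result<Token, ()> {
    let mut t = Tokenizer::new(css);
    t.skip_cdc_and_cdo(); // eats "<!--", "-->", /* comments */, whitespace
    t.next()
}
// first_token_after_cdo("<!-- /* hi */ --> body") == Ok(Ident("body".into()))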
fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    if tokenizer.is_eof() {
        return Err(());
    }
    let b = tokenizer.next_byte_unchecked();
    let token = match_byte! { b,
        b' ' | b'\t' => {
            consume_whitespace(tokenizer, false)
        },
        b'\n' | b'\x0C' | b'\r' => {
            consume_whitespace(tokenizer, true)
        },
        b'"' => { consume_string(tokenizer, false) },
        b'#' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
            else if !tokenizer.is_eof() && match tokenizer.next_byte_unchecked() {
                // Any other valid case here already resulted in IDHash.
                b'0'..=b'9' | b'-' => true,
                _ => false,
            } { Hash(consume_name(tokenizer)) }
            else { Delim('#') }
        },
        b'$' => {
            if tokenizer.starts_with(b"$=") { tokenizer.advance(2); SuffixMatch }
            else { tokenizer.advance(1); Delim('$') }
        },
        b'\'' => { consume_string(tokenizer, true) },
        b'(' => { tokenizer.advance(1); ParenthesisBlock },
        b')' => { tokenizer.advance(1); CloseParenthesis },
        b'*' => {
            if tokenizer.starts_with(b"*=") { tokenizer.advance(2); SubstringMatch }
            else { tokenizer.advance(1); Delim('*') }
        },
        b'+' => {
            if (
                tokenizer.has_at_least(1)
                && matches!(tokenizer.byte_at(1), b'0'..=b'9')
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && matches!(tokenizer.byte_at(2), b'0'..=b'9')
            ) {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('+')
            }
        },
        b',' => { tokenizer.advance(1); Comma },
        b'-' => {
            if (
                tokenizer.has_at_least(1)
                && matches!(tokenizer.byte_at(1), b'0'..=b'9')
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && matches!(tokenizer.byte_at(2), b'0'..=b'9')
            ) {
                consume_numeric(tokenizer)
            } else if tokenizer.starts_with(b"-->") {
                tokenizer.advance(3);
                CDC
            } else if is_ident_start(tokenizer) {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('-')
            }
        },
        b'.' => {
            if tokenizer.has_at_least(1)
                && matches!(tokenizer.byte_at(1), b'0'..=b'9')
            {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('.')
            }
        }
        b'/' => {
            if tokenizer.starts_with(b"/*") {
                Comment(consume_comment(tokenizer))
            } else {
                tokenizer.advance(1);
                Delim('/')
            }
        }
        b'0'..=b'9' => { consume_numeric(tokenizer) },
        b':' => { tokenizer.advance(1); Colon },
        b';' => { tokenizer.advance(1); Semicolon },
        b'<' => {
            if tokenizer.starts_with(b"<!--") {
                tokenizer.advance(4);
                CDO
            } else {
                tokenizer.advance(1);
                Delim('<')
            }
        },
        b'@' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) { AtKeyword(consume_name(tokenizer)) }
            else { Delim('@') }
        },
        b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => { consume_ident_like(tokenizer) },
        b'[' => { tokenizer.advance(1); SquareBracketBlock },
        b'\\' => {
            if !tokenizer.has_newline_at(1) { consume_ident_like(tokenizer) }
            else { tokenizer.advance(1); Delim('\\') }
        },
        b']' => { tokenizer.advance(1); CloseSquareBracket },
        b'^' => {
            if tokenizer.starts_with(b"^=") { tokenizer.advance(2); PrefixMatch }
            else { tokenizer.advance(1); Delim('^') }
        },
        b'{' => { tokenizer.advance(1); CurlyBracketBlock },
        b'|' => {
            if tokenizer.starts_with(b"|=") { tokenizer.advance(2); DashMatch }
            else { tokenizer.advance(1); Delim('|') }
        },
        b'}' => { tokenizer.advance(1); CloseCurlyBracket },
        b'~' => {
            if tokenizer.starts_with(b"~=") { tokenizer.advance(2); IncludeMatch }
            else { tokenizer.advance(1); Delim('~') }
        },
        _ => {
            if !b.is_ascii() {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim(b as char)
            }
        },
    };
    Ok(token)
}

fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool) -> Token<'a> {
    let start_position = tokenizer.position();
    if newline {
        tokenizer.consume_newline();
    } else {
        tokenizer.advance(1);
    }
    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b' ' | b'\t' => {
                tokenizer.advance(1);
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            _ => {
                break
            }
        }
    }
    WhiteSpace(tokenizer.slice_from(start_position))
}
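The `#` arm above distinguishes three outcomes: a name that starts like an identifier becomes IDHash (a valid ID selector), a name that starts with a digit or lone `-` becomes an "unrestricted" Hash, and anything else falls back to a delimiter. A hypothetical check, with assumed inputs (not part of this file):

// Hypothetical expectations (not part of this file):
fn hash_kinds() {
    assert_eq!(Tokenizer::new("#foo").next(), Ok(IDHash("foo".into())));
    assert_eq!(Tokenizer::new("#0a1").next(), Ok(Hash("0a1".into())));
    assert_eq!(Tokenizer::new("# ").next(), Ok(Delim('#')));
}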
// Check for sourceMappingURL or sourceURL comments and update the
// tokenizer appropriately.
fn check_for_source_map<'a>(tokenizer: &mut Tokenizer<'a>, contents: &'a str) {
    let directive = "# sourceMappingURL=";
    let directive_old = "@ sourceMappingURL=";

    // If there is a source map directive, extract the URL.
    if contents.starts_with(directive) || contents.starts_with(directive_old) {
        let contents = &contents[directive.len()..];
        tokenizer.source_map_url = contents
            .split(|c| c == ' ' || c == '\t' || c == '\x0C' || c == '\r' || c == '\n')
            .next()
    }

    let directive = "# sourceURL=";
    let directive_old = "@ sourceURL=";

    // If there is a source URL directive, extract the URL.
    if contents.starts_with(directive) || contents.starts_with(directive_old) {
        let contents = &contents[directive.len()..];
        tokenizer.source_url = contents
            .split(|c| c == ' ' || c == '\t' || c == '\x0C' || c == '\r' || c == '\n')
            .next()
    }
}

fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
    tokenizer.advance(2); // consume "/*"
    let start_position = tokenizer.position();
    while !tokenizer.is_eof() {
        match_byte! { tokenizer.next_byte_unchecked(),
            b'*' => {
                let end_position = tokenizer.position();
                tokenizer.advance(1);
                if tokenizer.next_byte() == Some(b'/') {
                    tokenizer.advance(1);
                    let contents = tokenizer.slice(start_position..end_position);
                    check_for_source_map(tokenizer, contents);
                    return contents
                }
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            }
        }
    }
    let contents = tokenizer.slice_from(start_position);
    check_for_source_map(tokenizer, contents);
    contents
}
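Since every comment's contents pass through check_for_source_map, a stylesheet can advertise its source map in either the modern `#` or the legacy `@` form and it is picked up as a side effect of tokenization. An illustrative sketch (the helper name and input are assumptions, not part of this file):

// Hypothetical sketch (not part of this file):
fn source_map_of(css: &str) -> Option<&str> {
    let mut t = Tokenizer::new(css);
    while t.next().is_ok() {}
    t.current_source_map_url()
}
// source_map_of("/*# sourceMappingURL=style.css.map */") == Some("style.css.map")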
fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
    match consume_quoted_string(tokenizer, single_quote) {
        Ok(value) => QuotedString(value),
        Err(value) => BadString(value),
    }
}

/// Return `Err(..)` on syntax error (i.e. an unescaped newline);
/// the error value is the string contents consumed so far.
fn consume_quoted_string<'a>(
    tokenizer: &mut Tokenizer<'a>,
    single_quote: bool,
) -> Result<CowRcStr<'a>, CowRcStr<'a>> {
    tokenizer.advance(1); // Skip the initial quote
    // start_pos is at a code point boundary, after " or '
    let start_pos = tokenizer.position();
    let mut string_bytes;
    loop {
        if tokenizer.is_eof() {
            return Ok(tokenizer.slice_from(start_pos).into());
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'"' => {
                if !single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\'' => {
                if single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\\' | b'\0' => {
                // * The tokenizer’s input is UTF-8 since it’s `&str`.
                // * start_pos is at a code point boundary.
                // * So is the current position (which is before '\\' or '\0').
                //
                // So `string_bytes` is well-formed UTF-8.
                string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            b'\n' | b'\r' | b'\x0C' => {
                return Err(tokenizer.slice_from(start_pos).into())
            },
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            }
        }
    }

    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'\n' | b'\r' | b'\x0C' => {
                return Err(
                    // string_bytes is well-formed UTF-8, see other comments.
                    unsafe {
                        from_utf8_release_unchecked(string_bytes)
                    }.into()
                );
            }
            b'"' => {
                tokenizer.advance(1);
                if !single_quote {
                    break;
                }
            }
            b'\'' => {
                tokenizer.advance(1);
                if single_quote {
                    break;
                }
            }
            b'\\' => {
                tokenizer.advance(1);
                if !tokenizer.is_eof() {
                    match tokenizer.next_byte_unchecked() {
                        // Escaped newline
                        b'\n' | b'\x0C' | b'\r' => {
                            tokenizer.consume_newline();
                        }
                        // This pushes one well-formed code point
                        _ => consume_escape_and_write(tokenizer, &mut string_bytes)
                    }
                }
                // else: escaped EOF, do nothing.
                continue;
            }
            b'\0' => {
                tokenizer.advance(1);
                string_bytes.extend("\u{FFFD}".as_bytes());
                continue;
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            },
        }

        // If this byte is part of a multi-byte code point,
        // we’ll end up copying the whole code point before this loop does something else.
        string_bytes.push(b);
    }

    Ok(
        // string_bytes is well-formed UTF-8, see other comments.
        unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
    )
}
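The two-phase structure above (borrow from the input until the first escape or NUL, then switch to an owned byte buffer) is invisible from the outside except through the token values. A hypothetical check with assumed inputs (not part of this file):

// Hypothetical expectations (not part of this file):
fn string_tokens() {
    // No escapes: the value borrows from the input.
    assert_eq!(Tokenizer::new("'abc'").next(), Ok(QuotedString("abc".into())));
    // "\61" is a hex escape for 'a'; the value is built in an owned buffer.
    assert_eq!(Tokenizer::new("\"\\61 bc\"").next(), Ok(QuotedString("abc".into())));
    // An unescaped newline inside a string is a parse error -> BadString.
    assert_eq!(Tokenizer::new("\"ab\nc\"").next(), Ok(BadString("ab".into())));
}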
#[inline]
fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
    !tokenizer.is_eof()
        && match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => { true },
            b'-' => {
                tokenizer.has_at_least(1) && match_byte! { tokenizer.byte_at(1),
                    b'a'..=b'z' | b'A'..=b'Z' | b'-' | b'_' | b'\0' => {
                        true
                    }
                    b'\\' => { !tokenizer.has_newline_at(1) }
                    b => { !b.is_ascii() },
                }
            },
            b'\\' => { !tokenizer.has_newline_at(1) },
            b => { !b.is_ascii() },
        }
}

fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    let value = consume_name(tokenizer);
    if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'(' {
        tokenizer.advance(1);
        if value.eq_ignore_ascii_case("url") {
            consume_unquoted_url(tokenizer).unwrap_or(Function(value))
        } else {
            tokenizer.see_function(&value);
            Function(value)
        }
    } else {
        Ident(value)
    }
}

fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
    // start_pos is the end of the previous token, therefore at a code point boundary
    let start_pos = tokenizer.position();
    let mut value_bytes;
    loop {
        if tokenizer.is_eof() {
            return tokenizer.slice_from(start_pos).into();
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => { tokenizer.advance(1) },
            b'\\' | b'\0' => {
                // * The tokenizer’s input is UTF-8 since it’s `&str`.
                // * start_pos is at a code point boundary.
                // * So is the current position (which is before '\\' or '\0').
                //
                // So `value_bytes` is well-formed UTF-8.
                value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xC0'..=b'\xEF' => { tokenizer.advance(1); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _b => {
                return tokenizer.slice_from(start_pos).into();
            }
        }
    }

    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => {
                tokenizer.advance(1);
                value_bytes.push(b) // ASCII
            }
            b'\\' => {
                if tokenizer.has_newline_at(1) { break }
                tokenizer.advance(1);
                // This pushes one well-formed code point
                consume_escape_and_write(tokenizer, &mut value_bytes)
            }
            b'\0' => {
                tokenizer.advance(1);
                value_bytes.extend("\u{FFFD}".as_bytes());
            },
            b'\x80'..=b'\xBF' => {
                // This byte *is* part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                tokenizer.consume_continuation_byte();
                value_bytes.push(b)
            }
            b'\xC0'..=b'\xEF' => {
                // This byte *is* part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                tokenizer.advance(1);
                value_bytes.push(b)
            }
            b'\xF0'..=b'\xFF' => {
                tokenizer.consume_4byte_intro();
                value_bytes.push(b)
            }
            _ => {
                // ASCII
                break;
            }
        }
    }
    // value_bytes is well-formed UTF-8, see other comments.
    unsafe { from_utf8_release_unchecked(value_bytes) }.into()
}

fn byte_to_hex_digit(b: u8) -> Option<u32> {
    Some(match_byte! { b,
        b'0'..=b'9' => { b - b'0' },
        b'a'..=b'f' => { b - b'a' + 10 },
        b'A'..=b'F' => { b - b'A' + 10 },
        _ => {
            return None
        }
    } as u32)
}

fn byte_to_decimal_digit(b: u8) -> Option<u32> {
    if b >= b'0' && b <= b'9' {
        Some((b - b'0') as u32)
    } else {
        None
    }
}
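consume_ident_like is also where `url(` gets its special treatment: only an unquoted argument yields UnquotedUrl, while `url(` followed by a quoted string stays a plain Function token, matching the Token docs above. A hypothetical check with assumed inputs (not part of this file):

// Hypothetical expectations (not part of this file):
fn ident_like_tokens() {
    assert_eq!(Tokenizer::new("calc(").next(), Ok(Function("calc".into())));
    assert_eq!(Tokenizer::new("url(a)").next(), Ok(UnquotedUrl("a".into())));
    // With a quoted argument, url( is a plain Function; the inner
    // <string-token> comes from a later next() call.
    assert_eq!(Tokenizer::new("url('a')").next(), Ok(Function("url".into())));
}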
fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    // Parse [+-]?\d*(\.\d+)?([eE][+-]?\d+)?
    // But this is always called so that there is at least one digit in \d*(\.\d+)?

    // Do all the math in f64 so that large numbers overflow to +/-inf
    // and i32::{MIN, MAX} are within range.

    let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
        b'-' => (true, -1.),
        b'+' => (true, 1.),
        _ => (false, 1.),
    };
    if has_sign {
        tokenizer.advance(1);
    }

    let mut integral_part: f64 = 0.;
    while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
        integral_part = integral_part * 10. + digit as f64;
        tokenizer.advance(1);
        if tokenizer.is_eof() {
            break;
        }
    }

    let mut is_integer = true;

    let mut fractional_part: f64 = 0.;
    if tokenizer.has_at_least(1)
        && tokenizer.next_byte_unchecked() == b'.'
        && matches!(tokenizer.byte_at(1), b'0'..=b'9')
    {
        is_integer = false;
        tokenizer.advance(1); // Consume '.'
        let mut factor = 0.1;
        while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
            fractional_part += digit as f64 * factor;
            factor *= 0.1;
            tokenizer.advance(1);
            if tokenizer.is_eof() {
                break;
            }
        }
    }

    let mut value = sign * (integral_part + fractional_part);

    if tokenizer.has_at_least(1) && matches!(tokenizer.next_byte_unchecked(), b'e' | b'E') {
        if matches!(tokenizer.byte_at(1), b'0'..=b'9')
            || (tokenizer.has_at_least(2)
                && matches!(tokenizer.byte_at(1), b'+' | b'-')
                && matches!(tokenizer.byte_at(2), b'0'..=b'9'))
        {
            is_integer = false;
            tokenizer.advance(1);
            let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
                b'-' => (true, -1.),
                b'+' => (true, 1.),
                _ => (false, 1.),
            };
            if has_sign {
                tokenizer.advance(1);
            }
            let mut exponent: f64 = 0.;
            while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
                exponent = exponent * 10. + digit as f64;
                tokenizer.advance(1);
                if tokenizer.is_eof() {
                    break;
                }
            }
            value *= f64::powf(10., sign * exponent);
        }
    }

    let int_value = if is_integer {
        Some(if value >= i32::MAX as f64 {
            i32::MAX
        } else if value <= i32::MIN as f64 {
            i32::MIN
        } else {
            value as i32
        })
    } else {
        None
    };

    if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'%' {
        tokenizer.advance(1);
        return Percentage {
            unit_value: (value / 100.) as f32,
            int_value: int_value,
            has_sign: has_sign,
        };
    }
    let value = value as f32;
    if is_ident_start(tokenizer) {
        let unit = consume_name(tokenizer);
        Dimension {
            value: value,
            int_value: int_value,
            has_sign: has_sign,
            unit: unit,
        }
    } else {
        Number {
            value: value,
            int_value: int_value,
            has_sign: has_sign,
        }
    }
}

#[inline]
unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {
    if cfg!(debug_assertions) {
        String::from_utf8(string_bytes).unwrap()
    } else {
        String::from_utf8_unchecked(string_bytes)
    }
}
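A worked example of the arithmetic above: `+12.5e2` has a sign, integral part 12, fractional part .5, and exponent 2, so value = +(12 + 0.5) x 10^2 = 1250.0, and int_value is None because both the fraction and the exponent clear is_integer; `16px` keeps its int_value and picks up a unit; `50%` divides by 100. As a hypothetical check with assumed inputs (not part of this file):

// Hypothetical expectations (not part of this file):
fn numeric_tokens() {
    assert_eq!(
        Tokenizer::new("+12.5e2").next(),
        Ok(Number { has_sign: true, value: 1250.0, int_value: None })
    );
    assert_eq!(
        Tokenizer::new("16px").next(),
        Ok(Dimension { has_sign: false, value: 16.0, int_value: Some(16), unit: "px".into() })
    );
    assert_eq!(
        Tokenizer::new("50%").next(),
        Ok(Percentage { has_sign: false, unit_value: 0.5, int_value: Some(50) })
    );
}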
fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    // This is only called after "url(", so the current position is a code point boundary.
    let start_position = tokenizer.position;
    let from_start = &tokenizer.input[tokenizer.position..];
    let mut newlines = 0;
    let mut last_newline = 0;
    let mut found_printable_char = false;
    let mut iter = from_start.bytes().enumerate();
    loop {
        let (offset, b) = match iter.next() {
            Some(item) => item,
            None => {
                tokenizer.position = tokenizer.input.len();
                break;
            }
        };
        match_byte! { b,
            b' ' | b'\t' => {},
            b'\n' | b'\x0C' => {
                newlines += 1;
                last_newline = offset;
            }
            b'\r' => {
                if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
                    newlines += 1;
                    last_newline = offset;
                }
            }
            b'"' | b'\'' => { return Err(()) }, // Do not advance
            b')' => {
                // Don't use advance, because we may be skipping
                // newlines here, and we want to avoid the assert.
                tokenizer.position += offset + 1;
                break
            }
            _ => {
                // Don't use advance, because we may be skipping
                // newlines here, and we want to avoid the assert.
                tokenizer.position += offset;
                found_printable_char = true;
                break
            }
        }
    }

    if newlines > 0 {
        tokenizer.current_line_number += newlines;
        // No need for wrapping_add here, because there's no possible
        // way to wrap.
        tokenizer.current_line_start_position = start_position + last_newline + 1;
    }

    if found_printable_char {
        // This function only consumed ASCII (whitespace) bytes,
        // so the current position is a code point boundary.
        return Ok(consume_unquoted_url_internal(tokenizer));
    } else {
        return Ok(UnquotedUrl("".into()));
    }

    fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
        // This function is only called with start_pos at a code point boundary.
        let start_pos = tokenizer.position();
        let mut string_bytes: Vec<u8>;
        loop {
            if tokenizer.is_eof() {
                return UnquotedUrl(tokenizer.slice_from(start_pos).into());
            }
            match_byte! { tokenizer.next_byte_unchecked(),
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    let value = tokenizer.slice_from(start_pos);
                    return consume_url_end(tokenizer, start_pos, value.into())
                }
                b')' => {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return UnquotedUrl(value.into())
                }
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F' // non-printable
                | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos)
                },
                b'\\' | b'\0' => {
                    // * The tokenizer’s input is UTF-8 since it’s `&str`.
                    // * start_pos is at a code point boundary.
                    // * So is the current position (which is before '\\' or '\0').
                    //
                    // So `string_bytes` is well-formed UTF-8.
                    string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                    break
                }
                b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
                b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
                _ => {
                    // ASCII or other leading byte.
                    tokenizer.advance(1);
                }
            }
        }
        while !tokenizer.is_eof() {
            let b = tokenizer.next_byte_unchecked();
            match_byte! { b,
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    // string_bytes is well-formed UTF-8, see other comments.
                    let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
                    return consume_url_end(tokenizer, start_pos, string)
                }
                b')' => {
                    tokenizer.advance(1);
                    break;
                }
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F' // non-printable
                | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos);
                }
                b'\\' => {
                    tokenizer.advance(1);
                    if tokenizer.has_newline_at(0) {
                        return consume_bad_url(tokenizer, start_pos)
                    }

                    // This pushes one well-formed code point to string_bytes
                    consume_escape_and_write(tokenizer, &mut string_bytes)
                },
                b'\0' => {
                    tokenizer.advance(1);
                    string_bytes.extend("\u{FFFD}".as_bytes());
                }
                b'\x80'..=b'\xBF' => {
                    // We’ll end up copying the whole code point
                    // before this loop does something else.
                    tokenizer.consume_continuation_byte();
                    string_bytes.push(b);
                }
                b'\xF0'..=b'\xFF' => {
                    // We’ll end up copying the whole code point
                    // before this loop does something else.
                    tokenizer.consume_4byte_intro();
                    string_bytes.push(b);
                }
                // If this byte is part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                b => {
                    // ASCII or other leading byte.
                    tokenizer.advance(1);
                    string_bytes.push(b)
                }
            }
        }
        UnquotedUrl(
            // string_bytes is well-formed UTF-8, see other comments.
            unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
        )
    }

    fn consume_url_end<'a>(
        tokenizer: &mut Tokenizer<'a>,
        start_pos: SourcePosition,
        string: CowRcStr<'a>,
    ) -> Token<'a> {
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    tokenizer.advance(1);
                    break
                }
                b' ' | b'\t' => { tokenizer.advance(1); }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    tokenizer.consume_known_byte(b);
                    return consume_bad_url(tokenizer, start_pos);
                }
            }
        }
        UnquotedUrl(string)
    }
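consume_url_end enforces that whitespace inside an unquoted URL is only legal immediately before the closing parenthesis; anything else after the whitespace demotes the token to BadUrl. A hypothetical check with assumed inputs (not part of this file):

// Hypothetical expectations (not part of this file):
fn url_tokens() {
    assert_eq!(
        Tokenizer::new("url( https://example.com/a.png )").next(),
        Ok(UnquotedUrl("https://example.com/a.png".into()))
    );
    // Whitespace followed by anything but ')' turns the token into BadUrl.
    assert_eq!(Tokenizer::new("url(a b)").next(), Ok(BadUrl("a b".into())));
}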
    fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>, start_pos: SourcePosition) -> Token<'a> {
        // Consume up to the closing )
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    let contents = tokenizer.slice_from(start_pos).into();
                    tokenizer.advance(1);
                    return BadUrl(contents)
                }
                b'\\' => {
                    tokenizer.advance(1);
                    if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
                        tokenizer.advance(1); // Skip an escaped ')' or '\'
                    }
                }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    tokenizer.consume_known_byte(b);
                }
            }
        }
        BadUrl(tokenizer.slice_from(start_pos).into())
    }
}

// (value, number of digits up to 6)
fn consume_hex_digits<'a>(tokenizer: &mut Tokenizer<'a>) -> (u32, u32) {
    let mut value = 0;
    let mut digits = 0;
    while digits < 6 && !tokenizer.is_eof() {
        match byte_to_hex_digit(tokenizer.next_byte_unchecked()) {
            Some(digit) => {
                value = value * 16 + digit;
                digits += 1;
                tokenizer.advance(1);
            }
            None => break,
        }
    }
    (value, digits)
}

// Same constraints as consume_escape except it writes into `bytes` the result
// instead of returning it.
fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) {
    bytes.extend(
        consume_escape(tokenizer)
            .encode_utf8(&mut [0; 4])
            .as_bytes(),
    )
}

// Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
// and that the next input character has already been verified
// to not be a newline.
fn consume_escape(tokenizer: &mut Tokenizer) -> char {
    if tokenizer.is_eof() {
        return '\u{FFFD}'; // Escaped EOF
    }
    match_byte! { tokenizer.next_byte_unchecked(),
        b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => {
            let (c, _) = consume_hex_digits(tokenizer);
            if !tokenizer.is_eof() {
                match_byte! { tokenizer.next_byte_unchecked(),
                    b' ' | b'\t' => {
                        tokenizer.advance(1)
                    }
                    b'\n' | b'\x0C' | b'\r' => {
                        tokenizer.consume_newline();
                    }
                    _ => {}
                }
            }
            static REPLACEMENT_CHAR: char = '\u{FFFD}';
            if c != 0 {
                let c = char::from_u32(c);
                c.unwrap_or(REPLACEMENT_CHAR)
            } else {
                REPLACEMENT_CHAR
            }
        },
        b'\0' => {
            tokenizer.advance(1);
            '\u{FFFD}'
        }
        _ => { tokenizer.consume_char() }
    }
}
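Finally, the escape machinery: up to six hex digits followed by one optional whitespace character that terminates the escape, with U+0000 and out-of-range values mapped to U+FFFD. A hypothetical check with assumed inputs (not part of this file):

// Hypothetical expectations (not part of this file):
fn escape_handling() {
    // "\41 B" escapes to 'A'; the space terminates the escape, 'B' follows.
    assert_eq!(Tokenizer::new("\\41 B").next(), Ok(Ident("AB".into())));
    // Code points above U+10FFFF collapse to U+FFFD.
    assert_eq!(Tokenizer::new("\\110000").next(), Ok(Ident("\u{FFFD}".into())));
}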