/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// https://drafts.csswg.org/css-syntax/#tokenization

use self::Token::*;
use crate::cow_rc_str::CowRcStr;
use crate::parser::ParserState;
use std::char;
use std::ops::Range;

#[cfg(not(feature = "dummy_match_byte"))]
use cssparser_macros::match_byte;

#[cfg(feature = "dummy_match_byte")]
macro_rules! match_byte {
    ($value:expr, $($rest:tt)* ) => {
        match $value {
            $( $rest )+
        }
    };
}
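// Illustrative note: under the `dummy_match_byte` feature, the macro above
// expands to a plain `match` on the byte value, e.g.
// `match_byte! { b, b' ' => true, _ => false }` becomes
// `match b { b' ' => true, _ => false }`. The default build uses the
// `match_byte!` proc macro from `cssparser_macros` instead, which keeps the
// same match semantics (the dummy expansion is the reference behavior) but
// is intended to compile to faster dispatch code.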
/// One of the pieces the CSS input is broken into.
///
/// Some components use `Cow` in order to borrow from the original input string
/// and avoid allocating/copying when possible.
#[derive(PartialEq, Debug, Clone)]
pub enum Token<'a> {
    /// An [`<ident-token>`](https://drafts.csswg.org/css-syntax/#ident-token-diagram)
    Ident(CowRcStr<'a>),

    /// An [`<at-keyword-token>`](https://drafts.csswg.org/css-syntax/#at-keyword-token-diagram)
    ///
    /// The value does not include the `@` marker.
    AtKeyword(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "unrestricted"
    ///
    /// The value does not include the `#` marker.
    Hash(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "id"
    ///
    /// The value does not include the `#` marker.
    IDHash(CowRcStr<'a>), // Hash that is a valid ID selector.

    /// A [`<string-token>`](https://drafts.csswg.org/css-syntax/#string-token-diagram)
    ///
    /// The value does not include the quotes.
    QuotedString(CowRcStr<'a>),

    /// A [`<url-token>`](https://drafts.csswg.org/css-syntax/#url-token-diagram)
    ///
    /// The value does not include the `url(` `)` markers. Note that `url( <string-token> )`
    /// is represented by a `Function` token.
    UnquotedUrl(CowRcStr<'a>),

    /// A `<delim-token>`
    Delim(char),

    /// A [`<number-token>`](https://drafts.csswg.org/css-syntax/#number-token-diagram)
    Number {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases like the `<An+B>` micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,
    },

    /// A [`<percentage-token>`](https://drafts.csswg.org/css-syntax/#percentage-token-diagram)
    Percentage {
        /// Whether the number had a `+` or `-` sign.
        has_sign: bool,

        /// The value as a float, divided by 100 so that the nominal range is 0.0 to 1.0.
        unit_value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        /// It is **not** divided by 100.
        int_value: Option<i32>,
    },

    /// A [`<dimension-token>`](https://drafts.csswg.org/css-syntax/#dimension-token-diagram)
    Dimension {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases like the `<An+B>` micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,

        /// The unit, e.g. "px" in `12px`
        unit: CowRcStr<'a>,
    },

    /// A [`<whitespace-token>`](https://drafts.csswg.org/css-syntax/#whitespace-token-diagram)
    WhiteSpace(&'a str),

    /// A comment.
    ///
    /// The CSS Syntax spec does not generate tokens for comments,
    /// but we do, because we can (borrowed `&str` makes it cheap).
    ///
    /// The value does not include the `/*` `*/` markers.
    Comment(&'a str),

    /// A `:` `<colon-token>`
    Colon, // :

    /// A `;` `<semicolon-token>`
    Semicolon, // ;

    /// A `,` `<comma-token>`
    Comma, // ,

    /// A `~=` [`<include-match-token>`](https://drafts.csswg.org/css-syntax/#include-match-token-diagram)
    IncludeMatch,

    /// A `|=` [`<dash-match-token>`](https://drafts.csswg.org/css-syntax/#dash-match-token-diagram)
    DashMatch,

    /// A `^=` [`<prefix-match-token>`](https://drafts.csswg.org/css-syntax/#prefix-match-token-diagram)
    PrefixMatch,

    /// A `$=` [`<suffix-match-token>`](https://drafts.csswg.org/css-syntax/#suffix-match-token-diagram)
    SuffixMatch,

    /// A `*=` [`<substring-match-token>`](https://drafts.csswg.org/css-syntax/#substring-match-token-diagram)
    SubstringMatch,

    /// A `<!--` [`<CDO-token>`](https://drafts.csswg.org/css-syntax/#CDO-token-diagram)
    CDO,

    /// A `-->` [`<CDC-token>`](https://drafts.csswg.org/css-syntax/#CDC-token-diagram)
    CDC,

    /// A [`<function-token>`](https://drafts.csswg.org/css-syntax/#function-token-diagram)
    ///
    /// The value (name) does not include the `(` marker.
    Function(CowRcStr<'a>),

    /// A `<(-token>`
    ParenthesisBlock,

    /// A `<[-token>`
    SquareBracketBlock,

    /// A `<{-token>`
    CurlyBracketBlock,

    /// A `<bad-url-token>`
    ///
    /// This token always indicates a parse error.
    BadUrl(CowRcStr<'a>),

    /// A `<bad-string-token>`
    ///
    /// This token always indicates a parse error.
    BadString(CowRcStr<'a>),

    /// A `<)-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseParenthesis,

    /// A `<]-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseSquareBracket,

    /// A `<}-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseCurlyBracket,
}

impl<'a> Token<'a> {
    /// Return whether this token represents a parse error.
    ///
    /// `BadUrl` and `BadString` are tokenizer-level parse errors.
    ///
    /// `CloseParenthesis`, `CloseSquareBracket`, and `CloseCurlyBracket` are *unmatched*
    /// and therefore parse errors when returned by one of the `Parser::next*` methods.
    pub fn is_parse_error(&self) -> bool {
        matches!(
            *self,
            BadUrl(_) | BadString(_) | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket
        )
    }
}
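// Illustrative tests: a minimal sketch of the token shapes documented above.
// The module and test names are hypothetical, and the tests assume that
// `Tokenizer::next` and the `consume_*` helpers defined later in this file
// follow the CSS Syntax spec for these inputs.
#[cfg(test)]
mod token_shape_examples {
    use super::{Token, Tokenizer};

    #[test]
    fn percentage_unit_value_is_divided_by_100() {
        // "50%" keeps the written integer in `int_value`, while `unit_value`
        // is pre-divided by 100, as documented on `Token::Percentage`.
        let mut tokenizer = Tokenizer::new("50%");
        match tokenizer.next() {
            Ok(Token::Percentage {
                unit_value,
                int_value,
                ..
            }) => {
                assert!((unit_value - 0.5).abs() < std::f32::EPSILON);
                assert_eq!(int_value, Some(50));
            }
            other => panic!("expected a percentage token, got {:?}", other),
        }
    }

    #[test]
    fn unmatched_close_paren_is_a_parse_error() {
        // A lone `)` tokenizes to `CloseParenthesis`, which reports itself
        // as a parse error via `is_parse_error`.
        let mut tokenizer = Tokenizer::new(")");
        let token = tokenizer.next().unwrap();
        assert_eq!(token, Token::CloseParenthesis);
        assert!(token.is_parse_error());
    }
}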
#[derive(Clone)]
pub struct Tokenizer<'a> {
    input: &'a str,
    /// Counted in bytes, not code points. From 0.
    position: usize,
    /// The position at the start of the current line; but adjusted to
    /// ensure that computing the column will give the result in units
    /// of UTF-16 characters.
    current_line_start_position: usize,
    current_line_number: u32,
    var_or_env_functions: SeenStatus,
    source_map_url: Option<&'a str>,
    source_url: Option<&'a str>,
}

#[derive(Copy, Clone, PartialEq, Eq)]
enum SeenStatus {
    DontCare,
    LookingForThem,
    SeenAtLeastOne,
}

impl<'a> Tokenizer<'a> {
    #[inline]
    pub fn new(input: &str) -> Tokenizer {
        Tokenizer::with_first_line_number(input, 0)
    }

    #[inline]
    pub fn with_first_line_number(input: &str, first_line_number: u32) -> Tokenizer {
        Tokenizer {
            input,
            position: 0,
            current_line_start_position: 0,
            current_line_number: first_line_number,
            var_or_env_functions: SeenStatus::DontCare,
            source_map_url: None,
            source_url: None,
        }
    }

    #[inline]
    pub fn look_for_var_or_env_functions(&mut self) {
        self.var_or_env_functions = SeenStatus::LookingForThem;
    }

    #[inline]
    pub fn seen_var_or_env_functions(&mut self) -> bool {
        let seen = self.var_or_env_functions == SeenStatus::SeenAtLeastOne;
        self.var_or_env_functions = SeenStatus::DontCare;
        seen
    }

    #[inline]
    pub fn see_function(&mut self, name: &str) {
        if self.var_or_env_functions == SeenStatus::LookingForThem {
            if name.eq_ignore_ascii_case("var") || name.eq_ignore_ascii_case("env") {
                self.var_or_env_functions = SeenStatus::SeenAtLeastOne;
            }
        }
    }

    #[inline]
    pub fn next(&mut self) -> Result<Token<'a>, ()> {
        next_token(self)
    }

    #[inline]
    pub fn position(&self) -> SourcePosition {
        SourcePosition(self.position)
    }

    #[inline]
    pub fn current_source_location(&self) -> SourceLocation {
        SourceLocation {
            line: self.current_line_number,
            column: (self.position - self.current_line_start_position + 1) as u32,
        }
    }

    #[inline]
    pub fn current_source_map_url(&self) -> Option<&'a str> {
        self.source_map_url
    }

    #[inline]
    pub fn current_source_url(&self) -> Option<&'a str> {
        self.source_url
    }

    #[inline]
    pub fn state(&self) -> ParserState {
        ParserState {
            position: self.position,
            current_line_start_position: self.current_line_start_position,
            current_line_number: self.current_line_number,
            at_start_of: None,
        }
    }

    #[inline]
    pub fn reset(&mut self, state: &ParserState) {
        self.position = state.position;
        self.current_line_start_position = state.current_line_start_position;
        self.current_line_number = state.current_line_number;
    }

    #[inline]
    pub fn slice_from(&self, start_pos: SourcePosition) -> &'a str {
        &self.input[start_pos.0..self.position]
    }

    #[inline]
    pub fn slice(&self, range: Range<SourcePosition>) -> &'a str {
        &self.input[range.start.0..range.end.0]
    }

    pub fn current_source_line(&self) -> &'a str {
        let current = self.position;
        let start = self.input[0..current]
            .rfind(|c| matches!(c, '\r' | '\n' | '\x0C'))
            .map_or(0, |start| start + 1);
        let end = self.input[current..]
            .find(|c| matches!(c, '\r' | '\n' | '\x0C'))
            .map_or(self.input.len(), |end| current + end);
        &self.input[start..end]
    }

    #[inline]
    pub fn next_byte(&self) -> Option<u8> {
        if self.is_eof() {
            None
        } else {
            Some(self.input.as_bytes()[self.position])
        }
    }

    // If false, `tokenizer.next_char()` will not panic.
    #[inline]
    fn is_eof(&self) -> bool {
        !self.has_at_least(0)
    }

    // If true, the input has at least `n` bytes left *after* the current one.
    // That is, `tokenizer.byte_at(n)` will not panic.
    #[inline]
    fn has_at_least(&self, n: usize) -> bool {
        self.position + n < self.input.len()
    }
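    // Worked example (illustrative): with `input.len() == 5` and
    // `position == 4`, `has_at_least(0)` is true (`byte_at(0)` reads the
    // last byte) while `has_at_least(1)` is false, so `is_eof()` only
    // becomes true once `position == 5`, i.e. once the final byte has
    // been consumed.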
    // Advance over N bytes in the input. This function can advance
    // over ASCII bytes (excluding newlines), or UTF-8 sequence
    // leaders (excluding leaders for 4-byte sequences).
    #[inline]
    pub fn advance(&mut self, n: usize) {
        if cfg!(debug_assertions) {
            // Each byte must either be an ASCII byte or a sequence
            // leader, but not a 4-byte leader; also newlines are
            // rejected.
            for i in 0..n {
                let b = self.byte_at(i);
                debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
                debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
            }
        }
        self.position += n
    }

    // Assumes non-EOF
    #[inline]
    fn next_byte_unchecked(&self) -> u8 {
        self.byte_at(0)
    }

    #[inline]
    fn byte_at(&self, offset: usize) -> u8 {
        self.input.as_bytes()[self.position + offset]
    }

    // Advance over a single byte; the byte must be a UTF-8 sequence
    // leader for a 4-byte sequence.
    #[inline]
    fn consume_4byte_intro(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
        // This takes two UTF-16 characters to represent, so we
        // actually have an undercount.
        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        self.position += 1;
    }

    // Advance over a single byte; the byte must be a UTF-8
    // continuation byte.
    #[inline]
    fn consume_continuation_byte(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
        // Continuation bytes contribute to column overcount. Note
        // that due to the special case for the 4-byte sequence intro,
        // we must use wrapping add here.
        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        self.position += 1;
    }

    // Advance over any kind of byte, excluding newlines.
    #[inline(never)]
    fn consume_known_byte(&mut self, byte: u8) {
        debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
        self.position += 1;
        // Continuation bytes contribute to column overcount.
        if byte & 0xF0 == 0xF0 {
            // This takes two UTF-16 characters to represent, so we
            // actually have an undercount.
            self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        } else if byte & 0xC0 == 0x80 {
            // Note that due to the special case for the 4-byte
            // sequence intro, we must use wrapping add here.
            self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        }
    }
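    // Worked example (illustrative) of the column bookkeeping above: '🌍'
    // is four UTF-8 bytes but two UTF-16 code units. Consuming it
    // byte-by-byte runs `consume_4byte_intro` once (start position -1) and
    // `consume_continuation_byte` three times (start position +3), a net
    // +2, while `position` advances by 4. The column, computed as
    // `position - current_line_start_position`, therefore grows by
    // 4 - 2 = 2 UTF-16 units, matching the doc comment on
    // `current_line_start_position`.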
    #[inline]
    fn next_char(&self) -> char {
        self.input[self.position..].chars().next().unwrap()
    }

    // Given that a newline has been seen, advance over the newline
    // and update the state.
    #[inline]
    fn consume_newline(&mut self) {
        let byte = self.next_byte_unchecked();
        debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
        self.position += 1;
        if byte == b'\r' && self.next_byte() == Some(b'\n') {
            self.position += 1;
        }
        self.current_line_start_position = self.position;
        self.current_line_number += 1;
    }

    #[inline]
    fn has_newline_at(&self, offset: usize) -> bool {
        self.position + offset < self.input.len()
            && matches!(self.byte_at(offset), b'\n' | b'\r' | b'\x0C')
    }

    #[inline]
    fn consume_char(&mut self) -> char {
        let c = self.next_char();
        let len_utf8 = c.len_utf8();
        self.position += len_utf8;
        // Note that due to the special case for the 4-byte sequence
        // intro, we must use wrapping add here.
        self.current_line_start_position = self
            .current_line_start_position
            .wrapping_add(len_utf8 - c.len_utf16());
        c
    }

    #[inline]
    fn starts_with(&self, needle: &[u8]) -> bool {
        self.input.as_bytes()[self.position..].starts_with(needle)
    }

    pub fn skip_whitespace(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                _ => return,
            }
        }
    }

    pub fn skip_cdc_and_cdo(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                b'<' => {
                    if self.starts_with(b"<!--") {
                        self.advance(4)
                    } else {
                        return
                    }
                }
                b'-' => {
                    if self.starts_with(b"-->") {
                        self.advance(3)
                    } else {
                        return
                    }
                }
                _ => {
                    return
                }
            }
        }
    }
}
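// Illustrative tests: a minimal sketch of driving the tokenizer and reading
// source locations. The module and test names are hypothetical, and the
// tests assume the `consume_*` helpers referenced by `next_token` below
// (defined later in this file) implement the CSS Syntax spec behavior for
// these inputs.
#[cfg(test)]
mod tokenizer_usage_examples {
    use super::{Token, Tokenizer};

    #[test]
    fn hash_dispatch() {
        // `#foo` starts an identifier, so it becomes `IDHash`; `#0` does
        // not, but a digit may still follow `#`, so it becomes a plain
        // `Hash`.
        let mut tokenizer = Tokenizer::new("#foo #0");
        assert!(matches!(tokenizer.next(), Ok(Token::IDHash(_))));
        assert!(matches!(tokenizer.next(), Ok(Token::WhiteSpace(_))));
        assert!(matches!(tokenizer.next(), Ok(Token::Hash(_))));
        assert!(tokenizer.next().is_err()); // EOF is reported as `Err(())`.
    }

    #[test]
    fn columns_are_counted_in_utf16_units() {
        // "é" is 2 UTF-8 bytes but 1 UTF-16 unit; after consuming the
        // ident, the 1-based column should be 2, and consuming the newline
        // bumps the line count.
        let mut tokenizer = Tokenizer::new("é\n");
        assert!(matches!(tokenizer.next(), Ok(Token::Ident(_))));
        assert_eq!(tokenizer.current_source_location().column, 2);
        assert!(matches!(tokenizer.next(), Ok(Token::WhiteSpace(_))));
        assert_eq!(tokenizer.current_source_location().line, 1);
    }
}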
/// A position from the start of the input, counted in UTF-8 bytes.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
pub struct SourcePosition(pub(crate) usize);

impl SourcePosition {
    /// Returns the current byte index in the original input.
    #[inline]
    pub fn byte_index(&self) -> usize {
        self.0
    }
}

/// The line and column number for a given position within the input.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct SourceLocation {
    /// The line number, starting at 0 for the first line, unless `with_first_line_number` was used.
    pub line: u32,

    /// The column number within a line, starting at 1 for the first character of the line.
    /// Column numbers are counted in UTF-16 code units.
    pub column: u32,
}

fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    if tokenizer.is_eof() {
        return Err(());
    }
    let b = tokenizer.next_byte_unchecked();
    let token = match_byte! { b,
        b' ' | b'\t' => {
            consume_whitespace(tokenizer, false)
        },
        b'\n' | b'\x0C' | b'\r' => consume_whitespace(tokenizer, true),
        b'"' => consume_string(tokenizer, false),
        b'#' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) {
                IDHash(consume_name(tokenizer))
            } else if !tokenizer.is_eof()
                && match tokenizer.next_byte_unchecked() {
                    // Any other valid case here already resulted in IDHash.
                    b'0'..=b'9' | b'-' => true,
                    _ => false,
                }
            {
                Hash(consume_name(tokenizer))
            } else {
                Delim('#')
            }
        },
        b'$' => {
            if tokenizer.starts_with(b"$=") {
                tokenizer.advance(2);
                SuffixMatch
            } else {
                tokenizer.advance(1);
                Delim('$')
            }
        },
        b'\'' => consume_string(tokenizer, true),
        b'(' => {
            tokenizer.advance(1);
            ParenthesisBlock
        },
        b')' => {
            tokenizer.advance(1);
            CloseParenthesis
        },
        b'*' => {
            if tokenizer.starts_with(b"*=") {
                tokenizer.advance(2);
                SubstringMatch
            } else {
                tokenizer.advance(1);
                Delim('*')
            }
        },
        b'+' => {
            if (tokenizer.has_at_least(1) && matches!(tokenizer.byte_at(1), b'0'..=b'9'))
                || (tokenizer.has_at_least(2)
                    && tokenizer.byte_at(1) == b'.'
                    && matches!(tokenizer.byte_at(2), b'0'..=b'9'))
            {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('+')
            }
        },
        b',' => {
            tokenizer.advance(1);
            Comma
        },
        b'-' => {
            if (tokenizer.has_at_least(1) && matches!(tokenizer.byte_at(1), b'0'..=b'9'))
                || (tokenizer.has_at_least(2)
                    && tokenizer.byte_at(1) == b'.'
                    && matches!(tokenizer.byte_at(2), b'0'..=b'9'))
            {
                consume_numeric(tokenizer)
            } else if tokenizer.starts_with(b"-->") {
                tokenizer.advance(3);
                CDC
            } else if is_ident_start(tokenizer) {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('-')
            }
        },
        b'.' => {
            if tokenizer.has_at_least(1) && matches!(tokenizer.byte_at(1), b'0'..=b'9') {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('.')
            }
        }
        b'/' => {
            if tokenizer.starts_with(b"/*") {
                Comment(consume_comment(tokenizer))
            } else {
                tokenizer.advance(1);
                Delim('/')
            }
        }
        b'0'..=b'9' => consume_numeric(tokenizer),
        b':' => {
            tokenizer.advance(1);
            Colon
        },
        b';' => {
            tokenizer.advance(1);
            Semicolon
        },
        b'<' => {
            if tokenizer.starts_with(b"<!--") {
                tokenizer.advance(4);
                CDO
            } else {
                tokenizer.advance(1);
                Delim('<')
            }
        },