Diffstat (limited to 'third_party/rust/cssparser/src/tokenizer.rs')
-rw-r--r-- third_party/rust/cssparser/src/tokenizer.rs 1397
1 file changed, 1397 insertions(+), 0 deletions(-)
diff --git a/third_party/rust/cssparser/src/tokenizer.rs b/third_party/rust/cssparser/src/tokenizer.rs
new file mode 100644
index 0000000000..62f3868362
--- /dev/null
+++ b/third_party/rust/cssparser/src/tokenizer.rs
@@ -0,0 +1,1397 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// https://drafts.csswg.org/css-syntax/#tokenization
+
+use self::Token::*;
+use crate::cow_rc_str::CowRcStr;
+use crate::parser::ParserState;
+use matches::matches;
+use std::char;
+use std::i32;
+use std::ops::Range;
+
+/// One of the pieces the CSS input is broken into.
+///
+/// Some components use `Cow` in order to borrow from the original input string
+/// and avoid allocating/copying when possible.
+#[derive(PartialEq, Debug, Clone)]
+pub enum Token<'a> {
+ /// A [`<ident-token>`](https://drafts.csswg.org/css-syntax/#ident-token-diagram)
+ Ident(CowRcStr<'a>),
+
+ /// A [`<at-keyword-token>`](https://drafts.csswg.org/css-syntax/#at-keyword-token-diagram)
+ ///
+ /// The value does not include the `@` marker.
+ AtKeyword(CowRcStr<'a>),
+
+ /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "unrestricted"
+ ///
+ /// The value does not include the `#` marker.
+ Hash(CowRcStr<'a>),
+
+ /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "id"
+ ///
+ /// The value does not include the `#` marker.
+ IDHash(CowRcStr<'a>), // Hash that is a valid ID selector.
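+ // For example, `#foo` produces `IDHash("foo")` (a valid ID selector),
+ // while `#0f0` starts with a digit and so produces `Hash("0f0")`.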
+
+ /// A [`<string-token>`](https://drafts.csswg.org/css-syntax/#string-token-diagram)
+ ///
+ /// The value does not include the quotes.
+ QuotedString(CowRcStr<'a>),
+
+ /// A [`<url-token>`](https://drafts.csswg.org/css-syntax/#url-token-diagram)
+ ///
+ /// The value does not include the `url(` `)` markers. Note that `url( <string-token> )` is represented by a
+ /// `Function` token.
+ UnquotedUrl(CowRcStr<'a>),
+
+ /// A `<delim-token>`
+ Delim(char),
+
+ /// A [`<number-token>`](https://drafts.csswg.org/css-syntax/#number-token-diagram)
+ Number {
+ /// Whether the number had a `+` or `-` sign.
+ ///
+ /// This is used in some cases like the <An+B> micro syntax. (See the `parse_nth` function.)
+ has_sign: bool,
+
+ /// The value as a float
+ value: f32,
+
+ /// If the original source did not include a fractional part, the value as an integer.
+ int_value: Option<i32>,
+ },
+
+ /// A [`<percentage-token>`](https://drafts.csswg.org/css-syntax/#percentage-token-diagram)
+ Percentage {
+ /// Whether the number had a `+` or `-` sign.
+ has_sign: bool,
+
+ /// The value as a float, divided by 100 so that the nominal range is 0.0 to 1.0.
+ unit_value: f32,
+
+ /// If the original source did not include a fractional part, the value as an integer.
+ /// It is **not** divided by 100.
+ int_value: Option<i32>,
+ },
+
+ /// A [`<dimension-token>`](https://drafts.csswg.org/css-syntax/#dimension-token-diagram)
+ Dimension {
+ /// Whether the number had a `+` or `-` sign.
+ ///
+ /// This is used in some cases like the <An+B> micro syntax. (See the `parse_nth` function.)
+ has_sign: bool,
+
+ /// The value as a float
+ value: f32,
+
+ /// If the original source did not include a fractional part, the value as an integer.
+ int_value: Option<i32>,
+
+ /// The unit, e.g. "px" in `12px`
+ unit: CowRcStr<'a>,
+ },
+
+ /// A [`<whitespace-token>`](https://drafts.csswg.org/css-syntax/#whitespace-token-diagram)
+ WhiteSpace(&'a str),
+
+ /// A comment.
+ ///
+ /// The CSS Syntax spec does not generate tokens for comments,
+ /// but we do, because we can (borrowed &str makes it cheap).
+ ///
+ /// The value does not include the `/*` `*/` markers.
+ Comment(&'a str),
+
+ /// A `:` `<colon-token>`
+ Colon, // :
+
+ /// A `;` `<semicolon-token>`
+ Semicolon, // ;
+
+ /// A `,` `<comma-token>`
+ Comma, // ,
+
+ /// A `~=` [`<include-match-token>`](https://drafts.csswg.org/css-syntax/#include-match-token-diagram)
+ IncludeMatch,
+
+ /// A `|=` [`<dash-match-token>`](https://drafts.csswg.org/css-syntax/#dash-match-token-diagram)
+ DashMatch,
+
+ /// A `^=` [`<prefix-match-token>`](https://drafts.csswg.org/css-syntax/#prefix-match-token-diagram)
+ PrefixMatch,
+
+ /// A `$=` [`<suffix-match-token>`](https://drafts.csswg.org/css-syntax/#suffix-match-token-diagram)
+ SuffixMatch,
+
+ /// A `*=` [`<substring-match-token>`](https://drafts.csswg.org/css-syntax/#substring-match-token-diagram)
+ SubstringMatch,
+
+ /// A `<!--` [`<CDO-token>`](https://drafts.csswg.org/css-syntax/#CDO-token-diagram)
+ CDO,
+
+ /// A `-->` [`<CDC-token>`](https://drafts.csswg.org/css-syntax/#CDC-token-diagram)
+ CDC,
+
+ /// A [`<function-token>`](https://drafts.csswg.org/css-syntax/#function-token-diagram)
+ ///
+ /// The value (name) does not include the `(` marker.
+ Function(CowRcStr<'a>),
+
+ /// A `<(-token>`
+ ParenthesisBlock,
+
+ /// A `<[-token>`
+ SquareBracketBlock,
+
+ /// A `<{-token>`
+ CurlyBracketBlock,
+
+ /// A `<bad-url-token>`
+ ///
+ /// This token always indicates a parse error.
+ BadUrl(CowRcStr<'a>),
+
+ /// A `<bad-string-token>`
+ ///
+ /// This token always indicates a parse error.
+ BadString(CowRcStr<'a>),
+
+ /// A `<)-token>`
+ ///
+ /// When obtained from one of the `Parser::next*` methods,
+ /// this token is always unmatched and indicates a parse error.
+ CloseParenthesis,
+
+ /// A `<]-token>`
+ ///
+ /// When obtained from one of the `Parser::next*` methods,
+ /// this token is always unmatched and indicates a parse error.
+ CloseSquareBracket,
+
+ /// A `<}-token>`
+ ///
+ /// When obtained from one of the `Parser::next*` methods,
+ /// this token is always unmatched and indicates a parse error.
+ CloseCurlyBracket,
+}
+
+impl<'a> Token<'a> {
+ /// Return whether this token represents a parse error.
+ ///
+ /// `BadUrl` and `BadString` are tokenizer-level parse errors.
+ ///
+ /// `CloseParenthesis`, `CloseSquareBracket`, and `CloseCurlyBracket` are *unmatched*
+ /// and therefore parse errors when returned by one of the `Parser::next*` methods.
+ pub fn is_parse_error(&self) -> bool {
+ matches!(
+ *self,
+ BadUrl(_) | BadString(_) | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket
+ )
+ }
+}
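+// A minimal usage sketch (illustrative only; external callers normally drive
+// tokenization through the crate's `Parser` type rather than constructing a
+// `Tokenizer` directly):
+//
+//     let mut tokenizer = Tokenizer::new("color: red; width: 12px");
+//     while let Ok(token) = tokenizer.next() {
+//         assert!(!token.is_parse_error());
+//     }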
+
+#[derive(Clone)]
+pub struct Tokenizer<'a> {
+ input: &'a str,
+ /// Counted in bytes, not code points. From 0.
+ position: usize,
+ /// The position at the start of the current line; but adjusted to
+ /// ensure that computing the column will give the result in units
+ /// of UTF-16 characters.
+ current_line_start_position: usize,
+ current_line_number: u32,
+ var_or_env_functions: SeenStatus,
+ source_map_url: Option<&'a str>,
+ source_url: Option<&'a str>,
+}
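+// Example of the column adjustment above: consuming "é" (2 UTF-8 bytes, one
+// UTF-16 unit) advances `position` by 2 but bumps
+// `current_line_start_position` by 1, so their difference counts one column;
+// a 4-byte scalar such as "𝄞" (2 UTF-16 units) advances `position` by 4 and
+// the start position by 2.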
+
+#[derive(Copy, Clone, PartialEq, Eq)]
+enum SeenStatus {
+ DontCare,
+ LookingForThem,
+ SeenAtLeastOne,
+}
+
+impl<'a> Tokenizer<'a> {
+ #[inline]
+ pub fn new(input: &str) -> Tokenizer {
+ Tokenizer::with_first_line_number(input, 0)
+ }
+
+ #[inline]
+ pub fn with_first_line_number(input: &str, first_line_number: u32) -> Tokenizer {
+ Tokenizer {
+ input: input,
+ position: 0,
+ current_line_start_position: 0,
+ current_line_number: first_line_number,
+ var_or_env_functions: SeenStatus::DontCare,
+ source_map_url: None,
+ source_url: None,
+ }
+ }
+
+ #[inline]
+ pub fn look_for_var_or_env_functions(&mut self) {
+ self.var_or_env_functions = SeenStatus::LookingForThem;
+ }
+
+ #[inline]
+ pub fn seen_var_or_env_functions(&mut self) -> bool {
+ let seen = self.var_or_env_functions == SeenStatus::SeenAtLeastOne;
+ self.var_or_env_functions = SeenStatus::DontCare;
+ seen
+ }
+
+ #[inline]
+ pub fn see_function(&mut self, name: &str) {
+ if self.var_or_env_functions == SeenStatus::LookingForThem {
+ if name.eq_ignore_ascii_case("var") || name.eq_ignore_ascii_case("env") {
+ self.var_or_env_functions = SeenStatus::SeenAtLeastOne;
+ }
+ }
+ }
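+ // Typical flow, as a sketch: call `look_for_var_or_env_functions()` before
+ // tokenizing a value, then `seen_var_or_env_functions()` afterwards to learn
+ // whether a `var(` or `env(` function was seen; that check also resets the
+ // status to `DontCare`.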
+
+ #[inline]
+ pub fn next(&mut self) -> Result<Token<'a>, ()> {
+ next_token(self)
+ }
+
+ #[inline]
+ pub fn position(&self) -> SourcePosition {
+ SourcePosition(self.position)
+ }
+
+ #[inline]
+ pub fn current_source_location(&self) -> SourceLocation {
+ SourceLocation {
+ line: self.current_line_number,
+ column: (self.position - self.current_line_start_position + 1) as u32,
+ }
+ }
+
+ #[inline]
+ pub fn current_source_map_url(&self) -> Option<&'a str> {
+ self.source_map_url
+ }
+
+ #[inline]
+ pub fn current_source_url(&self) -> Option<&'a str> {
+ self.source_url
+ }
+
+ #[inline]
+ pub fn state(&self) -> ParserState {
+ ParserState {
+ position: self.position,
+ current_line_start_position: self.current_line_start_position,
+ current_line_number: self.current_line_number,
+ at_start_of: None,
+ }
+ }
+
+ #[inline]
+ pub fn reset(&mut self, state: &ParserState) {
+ self.position = state.position;
+ self.current_line_start_position = state.current_line_start_position;
+ self.current_line_number = state.current_line_number;
+ }
+
+ #[inline]
+ pub fn slice_from(&self, start_pos: SourcePosition) -> &'a str {
+ &self.input[start_pos.0..self.position]
+ }
+
+ #[inline]
+ pub fn slice(&self, range: Range<SourcePosition>) -> &'a str {
+ &self.input[range.start.0..range.end.0]
+ }
+
+ pub fn current_source_line(&self) -> &'a str {
+ let current = self.position;
+ let start = self.input[0..current]
+ .rfind(|c| matches!(c, '\r' | '\n' | '\x0C'))
+ .map_or(0, |start| start + 1);
+ let end = self.input[current..]
+ .find(|c| matches!(c, '\r' | '\n' | '\x0C'))
+ .map_or(self.input.len(), |end| current + end);
+ &self.input[start..end]
+ }
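+ // e.g. for the input "a{\nb{" with the current position anywhere on the
+ // second line, `current_source_line` returns "b{".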
+
+ #[inline]
+ pub fn next_byte(&self) -> Option<u8> {
+ if self.is_eof() {
+ None
+ } else {
+ Some(self.input.as_bytes()[self.position])
+ }
+ }
+
+ // If this returns false, `tokenizer.next_char()` will not panic.
+ #[inline]
+ fn is_eof(&self) -> bool {
+ !self.has_at_least(0)
+ }
+
+ // If true, the input has at least `n` bytes left *after* the current one.
+ // That is, `tokenizer.byte_at(n)` will not panic.
+ #[inline]
+ fn has_at_least(&self, n: usize) -> bool {
+ self.position + n < self.input.len()
+ }
+
+ // Advance over N bytes in the input. This function can advance
+ // over ASCII bytes (excluding newlines), or UTF-8 sequence
+ // leaders (excluding leaders for 4-byte sequences).
+ #[inline]
+ pub fn advance(&mut self, n: usize) {
+ if cfg!(debug_assertions) {
+ // Each byte must either be an ASCII byte or a sequence
+ // leader, but not a 4-byte leader; also newlines are
+ // rejected.
+ for i in 0..n {
+ let b = self.byte_at(i);
+ debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
+ debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
+ }
+ }
+ self.position += n
+ }
+
+ // Assumes non-EOF
+ #[inline]
+ fn next_byte_unchecked(&self) -> u8 {
+ self.byte_at(0)
+ }
+
+ #[inline]
+ fn byte_at(&self, offset: usize) -> u8 {
+ self.input.as_bytes()[self.position + offset]
+ }
+
+ // Advance over a single byte; the byte must be a UTF-8 sequence
+ // leader for a 4-byte sequence.
+ #[inline]
+ fn consume_4byte_intro(&mut self) {
+ debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
+ // This takes two UTF-16 characters to represent, so we
+ // actually have an undercount.
+ self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
+ self.position += 1;
+ }
+
+ // Advance over a single byte; the byte must be a UTF-8
+ // continuation byte.
+ #[inline]
+ fn consume_continuation_byte(&mut self) {
+ debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
+ // Continuation bytes contribute to column overcount. Note
+ // that due to the special case for the 4-byte sequence intro,
+ // we must use wrapping add here.
+ self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
+ self.position += 1;
+ }
+
+ // Advance over any kind of byte, excluding newlines.
+ #[inline(never)]
+ fn consume_known_byte(&mut self, byte: u8) {
+ debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
+ self.position += 1;
+ // Continuation bytes contribute to column overcount.
+ if byte & 0xF0 == 0xF0 {
+ // This takes two UTF-16 characters to represent, so we
+ // actually have an undercount.
+ self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
+ } else if byte & 0xC0 == 0x80 {
+ // Note that due to the special case for the 4-byte
+ // sequence intro, we must use wrapping add here.
+ self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
+ }
+ }
+
+ #[inline]
+ fn next_char(&self) -> char {
+ self.input[self.position..].chars().next().unwrap()
+ }
+
+ // Given that a newline has been seen, advance over the newline
+ // and update the state.
+ #[inline]
+ fn consume_newline(&mut self) {
+ let byte = self.next_byte_unchecked();
+ debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
+ self.position += 1;
+ if byte == b'\r' && self.next_byte() == Some(b'\n') {
+ self.position += 1;
+ }
+ self.current_line_start_position = self.position;
+ self.current_line_number += 1;
+ }
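+ // Note that `consume_newline` treats a CRLF pair ("\r\n") as a single
+ // newline, so the line number is bumped once, not twice.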
+
+ #[inline]
+ fn has_newline_at(&self, offset: usize) -> bool {
+ self.position + offset < self.input.len()
+ && matches!(self.byte_at(offset), b'\n' | b'\r' | b'\x0C')
+ }
+
+ #[inline]
+ fn consume_char(&mut self) -> char {
+ let c = self.next_char();
+ let len_utf8 = c.len_utf8();
+ self.position += len_utf8;
+ // Note that due to the special case for the 4-byte sequence
+ // intro, we must use wrapping add here.
+ self.current_line_start_position = self
+ .current_line_start_position
+ .wrapping_add(len_utf8 - c.len_utf16());
+ c
+ }
+
+ #[inline]
+ fn starts_with(&self, needle: &[u8]) -> bool {
+ self.input.as_bytes()[self.position..].starts_with(needle)
+ }
+
+ pub fn skip_whitespace(&mut self) {
+ while !self.is_eof() {
+ match_byte! { self.next_byte_unchecked(),
+ b' ' | b'\t' => {
+ self.advance(1)
+ },
+ b'\n' | b'\x0C' | b'\r' => {
+ self.consume_newline();
+ },
+ b'/' => {
+ if self.starts_with(b"/*") {
+ consume_comment(self);
+ } else {
+ return
+ }
+ }
+ _ => {
+ return
+ }
+ }
+ }
+ }
+
+ pub fn skip_cdc_and_cdo(&mut self) {
+ while !self.is_eof() {
+ match_byte! { self.next_byte_unchecked(),
+ b' ' | b'\t' => {
+ self.advance(1)
+ },
+ b'\n' | b'\x0C' | b'\r' => {
+ self.consume_newline();
+ },
+ b'/' => {
+ if self.starts_with(b"/*") {
+ consume_comment(self);
+ } else {
+ return
+ }
+ }
+ b'<' => {
+ if self.starts_with(b"<!--") {
+ self.advance(4)
+ } else {
+ return
+ }
+ }
+ b'-' => {
+ if self.starts_with(b"-->") {
+ self.advance(3)
+ } else {
+ return
+ }
+ }
+ _ => {
+ return
+ }
+ }
+ }
+ }
+}
+
+/// A position from the start of the input, counted in UTF-8 bytes.
+#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
+pub struct SourcePosition(pub(crate) usize);
+
+impl SourcePosition {
+ /// Returns the current byte index in the original input.
+ #[inline]
+ pub fn byte_index(&self) -> usize {
+ self.0
+ }
+}
+
+/// The line and column number for a given position within the input.
+#[derive(PartialEq, Eq, Debug, Clone, Copy)]
+pub struct SourceLocation {
+ /// The line number, starting at 0 for the first line, unless `with_first_line_number` was used.
+ pub line: u32,
+
+ /// The column number within a line, starting at 1 for the first character of the line.
+ /// Column numbers are counted in UTF-16 code units.
+ pub column: u32,
+}
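+// For example, at the very start of the input the location is
+// `SourceLocation { line: 0, column: 1 }`, unless `with_first_line_number`
+// supplied a different first line number.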
+
+fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
+ if tokenizer.is_eof() {
+ return Err(());
+ }
+ let b = tokenizer.next_byte_unchecked();
+ let token = match_byte! { b,
+ b' ' | b'\t' => {
+ consume_whitespace(tokenizer, false)
+ },
+ b'\n' | b'\x0C' | b'\r' => {
+ consume_whitespace(tokenizer, true)
+ },
+ b'"' => { consume_string(tokenizer, false) },
+ b'#' => {
+ tokenizer.advance(1);
+ if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
+ else if !tokenizer.is_eof() && match tokenizer.next_byte_unchecked() {
+ // Any other valid case here already resulted in IDHash.
+ b'0'..=b'9' | b'-' => true,
+ _ => false,
+ } { Hash(consume_name(tokenizer)) }
+ else { Delim('#') }
+ },
+ b'$' => {
+ if tokenizer.starts_with(b"$=") { tokenizer.advance(2); SuffixMatch }
+ else { tokenizer.advance(1); Delim('$') }
+ },
+ b'\'' => { consume_string(tokenizer, true) },
+ b'(' => { tokenizer.advance(1); ParenthesisBlock },
+ b')' => { tokenizer.advance(1); CloseParenthesis },
+ b'*' => {
+ if tokenizer.starts_with(b"*=") { tokenizer.advance(2); SubstringMatch }
+ else { tokenizer.advance(1); Delim('*') }
+ },
+ b'+' => {
+ if (
+ tokenizer.has_at_least(1)
+ && matches!(tokenizer.byte_at(1), b'0'..=b'9')
+ ) || (
+ tokenizer.has_at_least(2)
+ && tokenizer.byte_at(1) == b'.'
+ && matches!(tokenizer.byte_at(2), b'0'..=b'9')
+ ) {
+ consume_numeric(tokenizer)
+ } else {
+ tokenizer.advance(1);
+ Delim('+')
+ }
+ },
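+ // e.g. `+3` and `+.5` start a <number-token>, while a lone `+` or `+x`
+ // yields `Delim('+')`. The `-` arm below does the same lookahead, and
+ // additionally checks for `-->` (CDC) and idents such as `-moz-box`.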
+ b',' => { tokenizer.advance(1); Comma },
+ b'-' => {
+ if (
+ tokenizer.has_at_least(1)
+ && matches!(tokenizer.byte_at(1), b'0'..=b'9')
+ ) || (
+ tokenizer.has_at_least(2)
+ && tokenizer.byte_at(1) == b'.'
+ && matches!(tokenizer.byte_at(2), b'0'..=b'9')
+ ) {
+ consume_numeric(tokenizer)
+ } else if tokenizer.starts_with(b"-->") {
+ tokenizer.advance(3);
+ CDC
+ } else if is_ident_start(tokenizer) {
+ consume_ident_like(tokenizer)
+ } else {
+ tokenizer.advance(1);
+ Delim('-')
+ }
+ },
+ b'.' => {
+ if tokenizer.has_at_least(1)
+ && matches!(tokenizer.byte_at(1), b'0'..=b'9'
+ ) {
+ consume_numeric(tokenizer)
+ } else {
+ tokenizer.advance(1);
+ Delim('.')
+ }
+ }
+ b'/' => {
+ if tokenizer.starts_with(b"/*") {
+ Comment(consume_comment(tokenizer))
+ } else {
+ tokenizer.advance(1);
+ Delim('/')
+ }
+ }
+ b'0'..=b'9' => { consume_numeric(tokenizer) },
+ b':' => { tokenizer.advance(1); Colon },
+ b';' => { tokenizer.advance(1); Semicolon },
+ b'<' => {
+ if tokenizer.starts_with(b"<!--") {
+ tokenizer.advance(4);
+ CDO
+ } else {
+ tokenizer.advance(1);
+ Delim('<')
+ }
+ },
+ b'@' => {
+ tokenizer.advance(1);
+ if is_ident_start(tokenizer) { AtKeyword(consume_name(tokenizer)) }
+ else { Delim('@') }
+ },
+ b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => { consume_ident_like(tokenizer) },
+ b'[' => { tokenizer.advance(1); SquareBracketBlock },
+ b'\\' => {
+ if !tokenizer.has_newline_at(1) { consume_ident_like(tokenizer) }
+ else { tokenizer.advance(1); Delim('\\') }
+ },
+ b']' => { tokenizer.advance(1); CloseSquareBracket },
+ b'^' => {
+ if tokenizer.starts_with(b"^=") { tokenizer.advance(2); PrefixMatch }
+ else { tokenizer.advance(1); Delim('^') }
+ },
+ b'{' => { tokenizer.advance(1); CurlyBracketBlock },
+ b'|' => {
+ if tokenizer.starts_with(b"|=") { tokenizer.advance(2); DashMatch }
+ else { tokenizer.advance(1); Delim('|') }
+ },
+ b'}' => { tokenizer.advance(1); CloseCurlyBracket },
+ b'~' => {
+ if tokenizer.starts_with(b"~=") { tokenizer.advance(2); IncludeMatch }
+ else { tokenizer.advance(1); Delim('~') }
+ },
+ _ => {
+ if !b.is_ascii() {
+ consume_ident_like(tokenizer)
+ } else {
+ tokenizer.advance(1);
+ Delim(b as char)
+ }
+ },
+ };
+ Ok(token)
+}
+
+fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool) -> Token<'a> {
+ let start_position = tokenizer.position();
+ if newline {
+ tokenizer.consume_newline();
+ } else {
+ tokenizer.advance(1);
+ }
+ while !tokenizer.is_eof() {
+ let b = tokenizer.next_byte_unchecked();
+ match_byte! { b,
+ b' ' | b'\t' => {
+ tokenizer.advance(1);
+ }
+ b'\n' | b'\x0C' | b'\r' => {
+ tokenizer.consume_newline();
+ }
+ _ => {
+ break
+ }
+ }
+ }
+ WhiteSpace(tokenizer.slice_from(start_position))
+}
+
+// Check for sourceMappingURL or sourceURL comments and update the
+// tokenizer appropriately.
+fn check_for_source_map<'a>(tokenizer: &mut Tokenizer<'a>, contents: &'a str) {
+ let directive = "# sourceMappingURL=";
+ let directive_old = "@ sourceMappingURL=";
+
+ // If there is a source map directive, extract the URL.
+ if contents.starts_with(directive) || contents.starts_with(directive_old) {
+ let contents = &contents[directive.len()..];
+ tokenizer.source_map_url = contents
+ .split(|c| c == ' ' || c == '\t' || c == '\x0C' || c == '\r' || c == '\n')
+ .next()
+ }
+
+ let directive = "# sourceURL=";
+ let directive_old = "@ sourceURL=";
+
+ // If there is a source URL directive, extract the URL.
+ if contents.starts_with(directive) || contents.starts_with(directive_old) {
+ let contents = &contents[directive.len()..];
+ tokenizer.source_url = contents
+ .split(|c| c == ' ' || c == '\t' || c == '\x0C' || c == '\r' || c == '\n')
+ .next()
+ }
+}
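+// For example, a comment whose contents are `# sourceMappingURL=foo.map`
+// sets `source_map_url` to `Some("foo.map")`; the `@`-prefixed spelling is
+// the legacy form of the same directive.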
+
+fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
+ tokenizer.advance(2); // consume "/*"
+ let start_position = tokenizer.position();
+ while !tokenizer.is_eof() {
+ match_byte! { tokenizer.next_byte_unchecked(),
+ b'*' => {
+ let end_position = tokenizer.position();
+ tokenizer.advance(1);
+ if tokenizer.next_byte() == Some(b'/') {
+ tokenizer.advance(1);
+ let contents = tokenizer.slice(start_position..end_position);
+ check_for_source_map(tokenizer, contents);
+ return contents
+ }
+ }
+ b'\n' | b'\x0C' | b'\r' => {
+ tokenizer.consume_newline();
+ }
+ b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
+ b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
+ _ => {
+ // ASCII or other leading byte.
+ tokenizer.advance(1);
+ }
+ }
+ }
+ let contents = tokenizer.slice_from(start_position);
+ check_for_source_map(tokenizer, contents);
+ contents
+}
+
+fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
+ match consume_quoted_string(tokenizer, single_quote) {
+ Ok(value) => QuotedString(value),
+ Err(value) => BadString(value),
+ }
+}
+
+/// Return `Err(..)` carrying the partial string on syntax error (i.e. an unescaped newline).
+fn consume_quoted_string<'a>(
+ tokenizer: &mut Tokenizer<'a>,
+ single_quote: bool,
+) -> Result<CowRcStr<'a>, CowRcStr<'a>> {
+ tokenizer.advance(1); // Skip the initial quote
+ // start_pos is at code point boundary, after " or '
+ let start_pos = tokenizer.position();
+ let mut string_bytes;
+ loop {
+ if tokenizer.is_eof() {
+ return Ok(tokenizer.slice_from(start_pos).into());
+ }
+ match_byte! { tokenizer.next_byte_unchecked(),
+ b'"' => {
+ if !single_quote {
+ let value = tokenizer.slice_from(start_pos);
+ tokenizer.advance(1);
+ return Ok(value.into())
+ }
+ tokenizer.advance(1);
+ }
+ b'\'' => {
+ if single_quote {
+ let value = tokenizer.slice_from(start_pos);
+ tokenizer.advance(1);
+ return Ok(value.into())
+ }
+ tokenizer.advance(1);
+ }
+ b'\\' | b'\0' => {
+ // * The tokenizer’s input is UTF-8 since it’s `&str`.
+ // * start_pos is at a code point boundary
+ // * so is the current position (which is before '\\' or '\0')
+ //
+ // So `string_bytes` is well-formed UTF-8.
+ string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
+ break
+ }
+ b'\n' | b'\r' | b'\x0C' => {
+ return Err(tokenizer.slice_from(start_pos).into())
+ },
+ b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
+ b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
+ _ => {
+ // ASCII or other leading byte.
+ tokenizer.advance(1);
+ }
+ }
+ }
+
+ while !tokenizer.is_eof() {
+ let b = tokenizer.next_byte_unchecked();
+ match_byte! { b,
+ b'\n' | b'\r' | b'\x0C' => {
+ return Err(
+ // string_bytes is well-formed UTF-8, see other comments.
+ unsafe {
+ from_utf8_release_unchecked(string_bytes)
+ }.into()
+ );
+ }
+ b'"' => {
+ tokenizer.advance(1);
+ if !single_quote {
+ break;
+ }
+ }
+ b'\'' => {
+ tokenizer.advance(1);
+ if single_quote {
+ break;
+ }
+ }
+ b'\\' => {
+ tokenizer.advance(1);
+ if !tokenizer.is_eof() {
+ match tokenizer.next_byte_unchecked() {
+ // Escaped newline
+ b'\n' | b'\x0C' | b'\r' => {
+ tokenizer.consume_newline();
+ }
+ // This pushes one well-formed code point
+ _ => consume_escape_and_write(tokenizer, &mut string_bytes)
+ }
+ }
+ // else: escaped EOF, do nothing.
+ continue;
+ }
+ b'\0' => {
+ tokenizer.advance(1);
+ string_bytes.extend("\u{FFFD}".as_bytes());
+ continue;
+ }
+ b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
+ b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
+ _ => {
+ // ASCII or other leading byte.
+ tokenizer.advance(1);
+ },
+ }
+
+ // If this byte is part of a multi-byte code point,
+ // we’ll end up copying the whole code point before this loop does something else.
+ string_bytes.push(b);
+ }
+
+ Ok(
+ // string_bytes is well-formed UTF-8, see other comments.
+ unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
+ )
+}
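+// An example of the copy-on-escape strategy above: for the input `"a\62 c"`,
+// the first loop borrows "a" until the backslash is seen, then switches to
+// the owned `string_bytes` buffer; the second loop decodes the escape and
+// appends "b" and then "c", yielding `QuotedString("abc")`.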
+
+#[inline]
+fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
+ !tokenizer.is_eof()
+ && match_byte! { tokenizer.next_byte_unchecked(),
+ b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => { true },
+ b'-' => {
+ tokenizer.has_at_least(1) && match_byte! { tokenizer.byte_at(1),
+ b'a'..=b'z' | b'A'..=b'Z' | b'-' | b'_' | b'\0' => {
+ true
+ }
+ b'\\' => { !tokenizer.has_newline_at(1) }
+ b => { !b.is_ascii() },
+ }
+ },
+ b'\\' => { !tokenizer.has_newline_at(1) },
+ b => { !b.is_ascii() },
+ }
+}
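+// e.g. `foo`, `-foo`, `--bar`, `_x`, `\66 oo`, and names starting with a
+// non-ASCII code point all count as ident starts; `7up` and `-2` do not.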
+
+fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
+ let value = consume_name(tokenizer);
+ if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'(' {
+ tokenizer.advance(1);
+ if value.eq_ignore_ascii_case("url") {
+ consume_unquoted_url(tokenizer).unwrap_or(Function(value))
+ } else {
+ tokenizer.see_function(&value);
+ Function(value)
+ }
+ } else {
+ Ident(value)
+ }
+}
+
+fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
+ // start_pos is the end of the previous token, therefore at a code point boundary
+ let start_pos = tokenizer.position();
+ let mut value_bytes;
+ loop {
+ if tokenizer.is_eof() {
+ return tokenizer.slice_from(start_pos).into();
+ }
+ match_byte! { tokenizer.next_byte_unchecked(),
+ b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => { tokenizer.advance(1) },
+ b'\\' | b'\0' => {
+ // * The tokenizer’s input is UTF-8 since it’s `&str`.
+ // * start_pos is at a code point boundary
+ // * so is the current position (which is before '\\' or '\0')
+ //
+ // So `value_bytes` is well-formed UTF-8.
+ value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
+ break
+ }
+ b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
+ b'\xC0'..=b'\xEF' => { tokenizer.advance(1); }
+ b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
+ _b => {
+ return tokenizer.slice_from(start_pos).into();
+ }
+ }
+ }
+
+ while !tokenizer.is_eof() {
+ let b = tokenizer.next_byte_unchecked();
+ match_byte! { b,
+ b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => {
+ tokenizer.advance(1);
+ value_bytes.push(b) // ASCII
+ }
+ b'\\' => {
+ if tokenizer.has_newline_at(1) { break }
+ tokenizer.advance(1);
+ // This pushes one well-formed code point
+ consume_escape_and_write(tokenizer, &mut value_bytes)
+ }
+ b'\0' => {
+ tokenizer.advance(1);
+ value_bytes.extend("\u{FFFD}".as_bytes());
+ },
+ b'\x80'..=b'\xBF' => {
+ // This byte *is* part of a multi-byte code point,
+ // we’ll end up copying the whole code point before this loop does something else.
+ tokenizer.consume_continuation_byte();
+ value_bytes.push(b)
+ }
+ b'\xC0'..=b'\xEF' => {
+ // This byte *is* part of a multi-byte code point,
+ // we’ll end up copying the whole code point before this loop does something else.
+ tokenizer.advance(1);
+ value_bytes.push(b)
+ }
+ b'\xF0'..=b'\xFF' => {
+ tokenizer.consume_4byte_intro();
+ value_bytes.push(b)
+ }
+ _ => {
+ // ASCII
+ break;
+ }
+ }
+ }
+ // value_bytes is well-formed UTF-8, see other comments.
+ unsafe { from_utf8_release_unchecked(value_bytes) }.into()
+}
+
+fn byte_to_hex_digit(b: u8) -> Option<u32> {
+ Some(match_byte! { b,
+ b'0' ..= b'9' => { b - b'0' },
+ b'a' ..= b'f' => { b - b'a' + 10 },
+ b'A' ..= b'F' => { b - b'A' + 10 },
+ _ => {
+ return None
+ }
+ } as u32)
+}
+
+fn byte_to_decimal_digit(b: u8) -> Option<u32> {
+ if b >= b'0' && b <= b'9' {
+ Some((b - b'0') as u32)
+ } else {
+ None
+ }
+}
+
+fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
+ // Parse [+-]?\d*(\.\d+)?([eE][+-]?\d+)?
+ // This is only ever called when there is at least one digit in \d*(\.\d+)?
+
+ // Do all the math in f64 so that large numbers overflow to +/-inf
+ // and i32::{MIN, MAX} are within range.
+
+ let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
+ b'-' => (true, -1.),
+ b'+' => (true, 1.),
+ _ => (false, 1.),
+ };
+ if has_sign {
+ tokenizer.advance(1);
+ }
+
+ let mut integral_part: f64 = 0.;
+ while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
+ integral_part = integral_part * 10. + digit as f64;
+ tokenizer.advance(1);
+ if tokenizer.is_eof() {
+ break;
+ }
+ }
+
+ let mut is_integer = true;
+
+ let mut fractional_part: f64 = 0.;
+ if tokenizer.has_at_least(1)
+ && tokenizer.next_byte_unchecked() == b'.'
+ && matches!(tokenizer.byte_at(1), b'0'..=b'9')
+ {
+ is_integer = false;
+ tokenizer.advance(1); // Consume '.'
+ let mut factor = 0.1;
+ while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
+ fractional_part += digit as f64 * factor;
+ factor *= 0.1;
+ tokenizer.advance(1);
+ if tokenizer.is_eof() {
+ break;
+ }
+ }
+ }
+
+ let mut value = sign * (integral_part + fractional_part);
+
+ if tokenizer.has_at_least(1) && matches!(tokenizer.next_byte_unchecked(), b'e' | b'E') {
+ if matches!(tokenizer.byte_at(1), b'0'..=b'9')
+ || (tokenizer.has_at_least(2)
+ && matches!(tokenizer.byte_at(1), b'+' | b'-')
+ && matches!(tokenizer.byte_at(2), b'0'..=b'9'))
+ {
+ is_integer = false;
+ tokenizer.advance(1);
+ let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
+ b'-' => (true, -1.),
+ b'+' => (true, 1.),
+ _ => (false, 1.),
+ };
+ if has_sign {
+ tokenizer.advance(1);
+ }
+ let mut exponent: f64 = 0.;
+ while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
+ exponent = exponent * 10. + digit as f64;
+ tokenizer.advance(1);
+ if tokenizer.is_eof() {
+ break;
+ }
+ }
+ value *= f64::powf(10., sign * exponent);
+ }
+ }
+
+ let int_value = if is_integer {
+ Some(if value >= i32::MAX as f64 {
+ i32::MAX
+ } else if value <= i32::MIN as f64 {
+ i32::MIN
+ } else {
+ value as i32
+ })
+ } else {
+ None
+ };
+
+ if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'%' {
+ tokenizer.advance(1);
+ return Percentage {
+ unit_value: (value / 100.) as f32,
+ int_value: int_value,
+ has_sign: has_sign,
+ };
+ }
+ let value = value as f32;
+ if is_ident_start(tokenizer) {
+ let unit = consume_name(tokenizer);
+ Dimension {
+ value: value,
+ int_value: int_value,
+ has_sign: has_sign,
+ unit: unit,
+ }
+ } else {
+ Number {
+ value: value,
+ int_value: int_value,
+ has_sign: has_sign,
+ }
+ }
+}
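+// Worked example: for `-12.5e2px`, `sign` is -1.0, the integral part is 12,
+// the fractional part is 0.5, and the exponent is 2, so `value` is -1250.0;
+// `is_integer` is false, and the trailing ident start produces
+// `Dimension { has_sign: true, value: -1250.0, int_value: None, unit: "px" }`.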
+
+#[inline]
+unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {
+ if cfg!(debug_assertions) {
+ String::from_utf8(string_bytes).unwrap()
+ } else {
+ String::from_utf8_unchecked(string_bytes)
+ }
+}
+
+fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
+ // This is only called after "url(", so the current position is a code point boundary.
+ let start_position = tokenizer.position;
+ let from_start = &tokenizer.input[tokenizer.position..];
+ let mut newlines = 0;
+ let mut last_newline = 0;
+ let mut found_printable_char = false;
+ let mut iter = from_start.bytes().enumerate();
+ loop {
+ let (offset, b) = match iter.next() {
+ Some(item) => item,
+ None => {
+ tokenizer.position = tokenizer.input.len();
+ break;
+ }
+ };
+ match_byte! { b,
+ b' ' | b'\t' => {},
+ b'\n' | b'\x0C' => {
+ newlines += 1;
+ last_newline = offset;
+ }
+ b'\r' => {
+ if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
+ newlines += 1;
+ last_newline = offset;
+ }
+ }
+ b'"' | b'\'' => { return Err(()) }, // Do not advance
+ b')' => {
+ // Don't use advance, because we may be skipping
+ // newlines here, and we want to avoid the assert.
+ tokenizer.position += offset + 1;
+ break
+ }
+ _ => {
+ // Don't use advance, because we may be skipping
+ // newlines here, and we want to avoid the assert.
+ tokenizer.position += offset;
+ found_printable_char = true;
+ break
+ }
+ }
+ }
+
+ if newlines > 0 {
+ tokenizer.current_line_number += newlines;
+ // No need for wrapping_add here, because there's no possible
+ // way to wrap.
+ tokenizer.current_line_start_position = start_position + last_newline + 1;
+ }
+
+ if found_printable_char {
+ // This function only consumed ASCII (whitespace) bytes,
+ // so the current position is a code point boundary.
+ return Ok(consume_unquoted_url_internal(tokenizer));
+ } else {
+ return Ok(UnquotedUrl("".into()));
+ }
+
+ fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
+ // This function is only called with start_pos at a code point boundary.
+ let start_pos = tokenizer.position();
+ let mut string_bytes: Vec<u8>;
+ loop {
+ if tokenizer.is_eof() {
+ return UnquotedUrl(tokenizer.slice_from(start_pos).into());
+ }
+ match_byte! { tokenizer.next_byte_unchecked(),
+ b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
+ let value = tokenizer.slice_from(start_pos);
+ return consume_url_end(tokenizer, start_pos, value.into())
+ }
+ b')' => {
+ let value = tokenizer.slice_from(start_pos);
+ tokenizer.advance(1);
+ return UnquotedUrl(value.into())
+ }
+ b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F' // non-printable
+ | b'"' | b'\'' | b'(' => {
+ tokenizer.advance(1);
+ return consume_bad_url(tokenizer, start_pos)
+ },
+ b'\\' | b'\0' => {
+ // * The tokenizer’s input is UTF-8 since it’s `&str`.
+ // * start_pos is at a code point boundary
+ // * so is the current position (which is before '\\' or '\0')
+ //
+ // So `string_bytes` is well-formed UTF-8.
+ string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
+ break
+ }
+ b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
+ b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
+ _ => {
+ // ASCII or other leading byte.
+ tokenizer.advance(1);
+ }
+ }
+ }
+ while !tokenizer.is_eof() {
+ let b = tokenizer.next_byte_unchecked();
+ match_byte! { b,
+ b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
+ // string_bytes is well-formed UTF-8, see other comments.
+ let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
+ return consume_url_end(tokenizer, start_pos, string)
+ }
+ b')' => {
+ tokenizer.advance(1);
+ break;
+ }
+ b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F' // non-printable
+ | b'"' | b'\'' | b'(' => {
+ tokenizer.advance(1);
+ return consume_bad_url(tokenizer, start_pos);
+ }
+ b'\\' => {
+ tokenizer.advance(1);
+ if tokenizer.has_newline_at(0) {
+ return consume_bad_url(tokenizer, start_pos)
+ }
+
+ // This pushes one well-formed code point to string_bytes
+ consume_escape_and_write(tokenizer, &mut string_bytes)
+ },
+ b'\0' => {
+ tokenizer.advance(1);
+ string_bytes.extend("\u{FFFD}".as_bytes());
+ }
+ b'\x80'..=b'\xBF' => {
+ // We’ll end up copying the whole code point
+ // before this loop does something else.
+ tokenizer.consume_continuation_byte();
+ string_bytes.push(b);
+ }
+ b'\xF0'..=b'\xFF' => {
+ // We’ll end up copying the whole code point
+ // before this loop does something else.
+ tokenizer.consume_4byte_intro();
+ string_bytes.push(b);
+ }
+ // If this byte is part of a multi-byte code point,
+ // we’ll end up copying the whole code point before this loop does something else.
+ b => {
+ // ASCII or other leading byte.
+ tokenizer.advance(1);
+ string_bytes.push(b)
+ }
+ }
+ }
+ UnquotedUrl(
+ // string_bytes is well-formed UTF-8, see other comments.
+ unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
+ )
+ }
+
+ fn consume_url_end<'a>(
+ tokenizer: &mut Tokenizer<'a>,
+ start_pos: SourcePosition,
+ string: CowRcStr<'a>,
+ ) -> Token<'a> {
+ while !tokenizer.is_eof() {
+ match_byte! { tokenizer.next_byte_unchecked(),
+ b')' => {
+ tokenizer.advance(1);
+ break
+ }
+ b' ' | b'\t' => { tokenizer.advance(1); }
+ b'\n' | b'\x0C' | b'\r' => {
+ tokenizer.consume_newline();
+ }
+ b => {
+ tokenizer.consume_known_byte(b);
+ return consume_bad_url(tokenizer, start_pos);
+ }
+ }
+ }
+ UnquotedUrl(string)
+ }
+
+ fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>, start_pos: SourcePosition) -> Token<'a> {
+ // Consume up to the closing )
+ while !tokenizer.is_eof() {
+ match_byte! { tokenizer.next_byte_unchecked(),
+ b')' => {
+ let contents = tokenizer.slice_from(start_pos).into();
+ tokenizer.advance(1);
+ return BadUrl(contents)
+ }
+ b'\\' => {
+ tokenizer.advance(1);
+ if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
+ tokenizer.advance(1); // Skip an escaped ')' or '\'
+ }
+ }
+ b'\n' | b'\x0C' | b'\r' => {
+ tokenizer.consume_newline();
+ }
+ b => {
+ tokenizer.consume_known_byte(b);
+ }
+ }
+ }
+ BadUrl(tokenizer.slice_from(start_pos).into())
+ }
+}
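+// e.g. `url(foo.png)` yields `UnquotedUrl("foo.png")`. For `url("foo.png")`
+// the quote makes this return `Err(())`, and the caller then emits
+// `Function("url")` instead; `url(foo bar)` yields `BadUrl("foo bar")`.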
+
+// Returns (value, number of hex digits consumed); at most 6 digits are consumed.
+fn consume_hex_digits<'a>(tokenizer: &mut Tokenizer<'a>) -> (u32, u32) {
+ let mut value = 0;
+ let mut digits = 0;
+ while digits < 6 && !tokenizer.is_eof() {
+ match byte_to_hex_digit(tokenizer.next_byte_unchecked()) {
+ Some(digit) => {
+ value = value * 16 + digit;
+ digits += 1;
+ tokenizer.advance(1);
+ }
+ None => break,
+ }
+ }
+ (value, digits)
+}
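+// e.g. on the input `fFx`, `consume_hex_digits` returns (255, 2) and leaves
+// the tokenizer positioned at `x`.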
+
+// Same constraints as consume_escape except it writes into `bytes` the result
+// instead of returning it.
+fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) {
+ bytes.extend(
+ consume_escape(tokenizer)
+ .encode_utf8(&mut [0; 4])
+ .as_bytes(),
+ )
+}
+
+// Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
+// and that the next input character has already been verified
+// to not be a newline.
+fn consume_escape(tokenizer: &mut Tokenizer) -> char {
+ if tokenizer.is_eof() {
+ return '\u{FFFD}'; // Escaped EOF
+ }
+ match_byte! { tokenizer.next_byte_unchecked(),
+ b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => {
+ let (c, _) = consume_hex_digits(tokenizer);
+ if !tokenizer.is_eof() {
+ match_byte! { tokenizer.next_byte_unchecked(),
+ b' ' | b'\t' => {
+ tokenizer.advance(1)
+ }
+ b'\n' | b'\x0C' | b'\r' => {
+ tokenizer.consume_newline();
+ }
+ _ => {}
+ }
+ }
+ static REPLACEMENT_CHAR: char = '\u{FFFD}';
+ if c != 0 {
+ let c = char::from_u32(c);
+ c.unwrap_or(REPLACEMENT_CHAR)
+ } else {
+ REPLACEMENT_CHAR
+ }
+ },
+ b'\0' => {
+ tokenizer.advance(1);
+ '\u{FFFD}'
+ }
+ _ => { tokenizer.consume_char() }
+ }
+}
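+// e.g. for the escape in `\26 B`, this reads the hex digits "26", consumes
+// the single whitespace terminator, and returns '&'; zero, surrogate, and
+// out-of-range values (such as `\0` or `\110000`) all yield U+FFFD.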