// Copyright 2014-2017 The html5ever Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! The HTML5 tokenizer. pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError}; pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token}; pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind}; pub use self::interface::{TokenSink, TokenSinkResult}; use self::states::{DoctypeIdKind, Public, System}; use self::states::{DoubleEscaped, Escaped}; use self::states::{DoubleQuoted, SingleQuoted, Unquoted}; use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped}; use self::char_ref::{CharRef, CharRefTokenizer}; use crate::util::str::lower_ascii_letter; use log::{debug, trace}; use mac::{_tt_as_expr_hack, format_if, matches}; use markup5ever::{namespace_url, ns, small_char_set}; use std::borrow::Cow::{self, Borrowed}; use std::collections::BTreeMap; use std::default::Default; use std::mem::replace; pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult}; use crate::tendril::StrTendril; use crate::{Attribute, LocalName, QualName, SmallCharSet}; mod char_ref; mod interface; pub mod states; pub enum ProcessResult { Continue, Suspend, Script(Handle), } #[must_use] pub enum TokenizerResult { Done, Script(Handle), } fn option_push(opt_str: &mut Option, c: char) { match *opt_str { Some(ref mut s) => s.push_char(c), None => *opt_str = Some(StrTendril::from_char(c)), } } /// Tokenizer options, with an impl for `Default`. #[derive(Clone)] pub struct TokenizerOpts { /// Report all parse errors described in the spec, at some /// performance penalty? Default: false pub exact_errors: bool, /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning /// of the stream? 
Default: true pub discard_bom: bool, /// Keep a record of how long we spent in each state? Printed /// when `end()` is called. Default: false pub profile: bool, /// Initial state override. Only the test runner should use /// a non-`None` value! pub initial_state: Option, /// Last start tag. Only the test runner should use a /// non-`None` value! /// /// FIXME: Can't use Tendril because we want TokenizerOpts /// to be Send. pub last_start_tag_name: Option, } impl Default for TokenizerOpts { fn default() -> TokenizerOpts { TokenizerOpts { exact_errors: false, discard_bom: true, profile: false, initial_state: None, last_start_tag_name: None, } } } /// The HTML tokenizer. pub struct Tokenizer { /// Options controlling the behavior of the tokenizer. opts: TokenizerOpts, /// Destination for tokens we emit. pub sink: Sink, /// The abstract machine state as described in the spec. state: states::State, /// Are we at the end of the file, once buffers have been processed /// completely? This affects whether we will wait for lookahead or not. at_eof: bool, /// Tokenizer for character references, if we're tokenizing /// one at the moment. char_ref_tokenizer: Option>, /// Current input character. Just consumed, may reconsume. current_char: char, /// Should we reconsume the current input character? reconsume: bool, /// Did we just consume \r, translating it to \n? In that case we need /// to ignore the next character if it's \n. ignore_lf: bool, /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the /// beginning of the stream. discard_bom: bool, /// Current tag kind. current_tag_kind: TagKind, /// Current tag name. current_tag_name: StrTendril, /// Current tag is self-closing? current_tag_self_closing: bool, /// Current tag attributes. current_tag_attrs: Vec, /// Current attribute name. current_attr_name: StrTendril, /// Current attribute value. current_attr_value: StrTendril, /// Current comment. current_comment: StrTendril, /// Current doctype token. 
current_doctype: Doctype, /// Last start tag name, for use in checking "appropriate end tag". last_start_tag_name: Option, /// The "temporary buffer" mentioned in the spec. temp_buf: StrTendril, /// Record of how many ns we spent in each state, if profiling is enabled. state_profile: BTreeMap, /// Record of how many ns we spent in the token sink. time_in_sink: u64, /// Track current line current_line: u64, } impl Tokenizer { /// Create a new tokenizer which feeds tokens to a particular `TokenSink`. pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer { let start_tag_name = opts .last_start_tag_name .take() .map(|s| LocalName::from(&*s)); let state = opts.initial_state.unwrap_or(states::Data); let discard_bom = opts.discard_bom; Tokenizer { opts, sink, state, char_ref_tokenizer: None, at_eof: false, current_char: '\0', reconsume: false, ignore_lf: false, discard_bom, current_tag_kind: StartTag, current_tag_name: StrTendril::new(), current_tag_self_closing: false, current_tag_attrs: vec![], current_attr_name: StrTendril::new(), current_attr_value: StrTendril::new(), current_comment: StrTendril::new(), current_doctype: Doctype::new(), last_start_tag_name: start_tag_name, temp_buf: StrTendril::new(), state_profile: BTreeMap::new(), time_in_sink: 0, current_line: 1, } } /// Feed an input string into the tokenizer. 
pub fn feed(&mut self, input: &mut BufferQueue) -> TokenizerResult { if input.is_empty() { return TokenizerResult::Done; } if self.discard_bom { if let Some(c) = input.peek() { if c == '\u{feff}' { input.next(); } } else { return TokenizerResult::Done; } }; self.run(input) } pub fn set_plaintext_state(&mut self) { self.state = states::Plaintext; } fn process_token(&mut self, token: Token) -> TokenSinkResult { if self.opts.profile { let (ret, dt) = time!(self.sink.process_token(token, self.current_line)); self.time_in_sink += dt; ret } else { self.sink.process_token(token, self.current_line) } } fn process_token_and_continue(&mut self, token: Token) { assert!(matches!( self.process_token(token), TokenSinkResult::Continue )); } //§ preprocessing-the-input-stream // Get the next input character, which might be the character // 'c' that we already consumed from the buffers. fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option { if self.ignore_lf { self.ignore_lf = false; if c == '\n' { c = unwrap_or_return!(input.next(), None); } } if c == '\r' { self.ignore_lf = true; c = '\n'; } if c == '\n' { self.current_line += 1; } if self.opts.exact_errors && match c as u32 { 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true, n if (n & 0xFFFE) == 0xFFFE => true, _ => false, } { let msg = format!("Bad character {}", c); self.emit_error(Cow::Owned(msg)); } trace!("got character {}", c); self.current_char = c; Some(c) } //§ tokenization // Get the next input character, if one is available. fn get_char(&mut self, input: &mut BufferQueue) -> Option { if self.reconsume { self.reconsume = false; Some(self.current_char) } else { input .next() .and_then(|c| self.get_preprocessed_char(c, input)) } } fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option { // Bail to the slow path for various corner cases. // This means that `FromSet` can contain characters not in the set! 
// It shouldn't matter because the fallback `FromSet` case should // always do the same thing as the `NotFromSet` case. if self.opts.exact_errors || self.reconsume || self.ignore_lf { return self.get_char(input).map(FromSet); } let d = input.pop_except_from(set); trace!("got characters {:?}", d); match d { Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet), // NB: We don't set self.current_char for a run of characters not // in the set. It shouldn't matter for the codepaths that use // this. _ => d, } } // Check if the next characters are an ASCII case-insensitive match. See // BufferQueue::eat. // // NB: this doesn't do input stream preprocessing or set the current input // character. fn eat( &mut self, input: &mut BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool, ) -> Option { input.push_front(replace(&mut self.temp_buf, StrTendril::new())); match input.eat(pat, eq) { None if self.at_eof => Some(false), None => { while let Some(c) = input.next() { self.temp_buf.push_char(c); } None }, Some(matched) => Some(matched), } } /// Run the state machine for as long as we can. 
fn run(&mut self, input: &mut BufferQueue) -> TokenizerResult { if self.opts.profile { loop { let state = self.state; let old_sink = self.time_in_sink; let (run, mut dt) = time!(self.step(input)); dt -= (self.time_in_sink - old_sink); let new = match self.state_profile.get_mut(&state) { Some(x) => { *x += dt; false }, None => true, }; if new { // do this here because of borrow shenanigans self.state_profile.insert(state, dt); } match run { ProcessResult::Continue => (), ProcessResult::Suspend => break, ProcessResult::Script(node) => return TokenizerResult::Script(node), } } } else { loop { match self.step(input) { ProcessResult::Continue => (), ProcessResult::Suspend => break, ProcessResult::Script(node) => return TokenizerResult::Script(node), } } } TokenizerResult::Done } fn bad_char_error(&mut self) { let msg = format_if!( self.opts.exact_errors, "Bad character", "Saw {} in state {:?}", self.current_char, self.state ); self.emit_error(msg); } fn bad_eof_error(&mut self) { let msg = format_if!( self.opts.exact_errors, "Unexpected EOF", "Saw EOF in state {:?}", self.state ); self.emit_error(msg); } fn emit_char(&mut self, c: char) { self.process_token_and_continue(match c { '\0' => NullCharacterToken, _ => CharacterTokens(StrTendril::from_char(c)), }); } // The string must not contain '\0'! 
fn emit_chars(&mut self, b: StrTendril) { self.process_token_and_continue(CharacterTokens(b)); } fn emit_current_tag(&mut self) -> ProcessResult { self.finish_attribute(); let name = LocalName::from(&*self.current_tag_name); self.current_tag_name.clear(); match self.current_tag_kind { StartTag => { self.last_start_tag_name = Some(name.clone()); }, EndTag => { if !self.current_tag_attrs.is_empty() { self.emit_error(Borrowed("Attributes on an end tag")); } if self.current_tag_self_closing { self.emit_error(Borrowed("Self-closing end tag")); } }, } let token = TagToken(Tag { kind: self.current_tag_kind, name, self_closing: self.current_tag_self_closing, attrs: replace(&mut self.current_tag_attrs, vec![]), }); match self.process_token(token) { TokenSinkResult::Continue => ProcessResult::Continue, TokenSinkResult::Plaintext => { self.state = states::Plaintext; ProcessResult::Continue }, TokenSinkResult::Script(node) => { self.state = states::Data; ProcessResult::Script(node) }, TokenSinkResult::RawData(kind) => { self.state = states::RawData(kind); ProcessResult::Continue }, } } fn emit_temp_buf(&mut self) { // FIXME: Make sure that clearing on emit is spec-compatible. let buf = replace(&mut self.temp_buf, StrTendril::new()); self.emit_chars(buf); } fn clear_temp_buf(&mut self) { // Do this without a new allocation. 
self.temp_buf.clear(); } fn emit_current_comment(&mut self) { let comment = replace(&mut self.current_comment, StrTendril::new()); self.process_token_and_continue(CommentToken(comment)); } fn discard_tag(&mut self) { self.current_tag_name.clear(); self.current_tag_self_closing = false; self.current_tag_attrs = vec![]; } fn create_tag(&mut self, kind: TagKind, c: char) { self.discard_tag(); self.current_tag_name.push_char(c); self.current_tag_kind = kind; } fn have_appropriate_end_tag(&self) -> bool { match self.last_start_tag_name.as_ref() { Some(last) => (self.current_tag_kind == EndTag) && (*self.current_tag_name == **last), None => false, } } fn create_attribute(&mut self, c: char) { self.finish_attribute(); self.current_attr_name.push_char(c); } fn finish_attribute(&mut self) { if self.current_attr_name.is_empty() { return; } // Check for a duplicate attribute. // FIXME: the spec says we should error as soon as the name is finished. // FIXME: linear time search, do we care? let dup = { let name = &*self.current_attr_name; self.current_tag_attrs .iter() .any(|a| &*a.name.local == name) }; if dup { self.emit_error(Borrowed("Duplicate attribute")); self.current_attr_name.clear(); self.current_attr_value.clear(); } else { let name = LocalName::from(&*self.current_attr_name); self.current_attr_name.clear(); self.current_tag_attrs.push(Attribute { // The tree builder will adjust the namespace if necessary. // This only happens in foreign elements. 
name: QualName::new(None, ns!(), name), value: replace(&mut self.current_attr_value, StrTendril::new()), }); } } fn emit_current_doctype(&mut self) { let doctype = replace(&mut self.current_doctype, Doctype::new()); self.process_token_and_continue(DoctypeToken(doctype)); } fn doctype_id(&mut self, kind: DoctypeIdKind) -> &mut Option { match kind { Public => &mut self.current_doctype.public_id, System => &mut self.current_doctype.system_id, } } fn clear_doctype_id(&mut self, kind: DoctypeIdKind) { let id = self.doctype_id(kind); match *id { Some(ref mut s) => s.clear(), None => *id = Some(StrTendril::new()), } } fn consume_char_ref(&mut self, addnl_allowed: Option) { // NB: The char ref tokenizer assumes we have an additional allowed // character iff we're tokenizing in an attribute value. self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed))); } fn emit_eof(&mut self) { self.process_token_and_continue(EOFToken); } fn peek(&mut self, input: &BufferQueue) -> Option { if self.reconsume { Some(self.current_char) } else { input.peek() } } fn discard_char(&mut self, input: &mut BufferQueue) { self.get_char(input); } fn emit_error(&mut self, error: Cow<'static, str>) { self.process_token_and_continue(ParseError(error)); } } //§ END // Shorthand for common state machine behaviors. macro_rules! 
shorthand ( ( $me:ident : emit $c:expr ) => ( $me.emit_char($c) ); ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c) ); ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push_char($c) ); ( $me:ident : discard_tag ) => ( $me.discard_tag() ); ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input) ); ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push_char($c) ); ( $me:ident : emit_temp ) => ( $me.emit_temp_buf() ); ( $me:ident : clear_temp ) => ( $me.clear_temp_buf() ); ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c) ); ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.push_char($c) ); ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.push_char($c) ); ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.push_tendril($c) ); ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.push_char($c) ); ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.push_slice($c) ); ( $me:ident : emit_comment ) => ( $me.emit_current_comment() ); ( $me:ident : clear_comment ) => ( $me.current_comment.clear() ); ( $me:ident : create_doctype ) => ( $me.current_doctype = Doctype::new() ); ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.name, $c) ); ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c) ); ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k) ); ( $me:ident : force_quirks ) => ( $me.current_doctype.force_quirks = true ); ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype() ); ( $me:ident : error ) => ( $me.bad_char_error() ); ( $me:ident : error_eof ) => ( $me.bad_eof_error() ); ); // Tracing of tokenizer actions. This adds significant bloat and compile time, // so it's behind a cfg flag. #[cfg(trace_tokenizer)] macro_rules! 
sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({ trace!(" {:s}", stringify!($($cmds)*)); shorthand!($me:expr : $($cmds)*); })); #[cfg(not(trace_tokenizer))] macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) ); // A little DSL for sequencing shorthand actions. macro_rules! go ( // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity. // We have to tell the parser how much lookahead we need. ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); }); ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); }); ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); }); ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); }); // These can only come at the end. ( $me:ident : to $s:ident ) => ({ $me.state = states::$s; return ProcessResult::Continue; }); ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state = states::$s($k1); return ProcessResult::Continue; }); ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return ProcessResult::Continue; }); ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume = true; go!($me: to $s); }); ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1); }); ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); }); ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue; }); ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; }); // We have a default next state after emitting a tag, but the sink can override. 
( $me:ident : emit_tag $s:ident ) => ({ $me.state = states::$s; return $me.emit_current_tag(); }); ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; }); // If nothing else matched, it's a single command ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) ); // or nothing. ( $me:ident : ) => (()); ); macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => ( match $x { $($pats)|+ => go!($me: $($cmds)*), _ => (), } )); // This is a macro because it can cause early return // from the function where it is used. macro_rules! get_char ( ($me:expr, $input:expr) => ( unwrap_or_return!($me.get_char($input), ProcessResult::Suspend) )); macro_rules! peek ( ($me:expr, $input:expr) => ( unwrap_or_return!($me.peek($input), ProcessResult::Suspend) )); macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => ( unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend) )); macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => ( unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend) )); macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => ( unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend) )); impl Tokenizer { // Run the state machine for a while. // Return true if we should be immediately re-invoked // (this just simplifies control flow vs. break / continue). 
#[allow(clippy::never_loop)] fn step(&mut self, input: &mut BufferQueue) -> ProcessResult { if self.char_ref_tokenizer.is_some() { return self.step_char_ref_tokenizer(input); } trace!("processing in state {:?}", self.state); match self.state { //§ data-state states::Data => loop { match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) { FromSet('\0') => go!(self: error; emit '\0'), FromSet('&') => go!(self: consume_char_ref), FromSet('<') => go!(self: to TagOpen), FromSet(c) => go!(self: emit c), NotFromSet(b) => self.emit_chars(b), } }, //§ rcdata-state states::RawData(Rcdata) => loop { match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet('&') => go!(self: consume_char_ref), FromSet('<') => go!(self: to RawLessThanSign Rcdata), FromSet(c) => go!(self: emit c), NotFromSet(b) => self.emit_chars(b), } }, //§ rawtext-state states::RawData(Rawtext) => loop { match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet('<') => go!(self: to RawLessThanSign Rawtext), FromSet(c) => go!(self: emit c), NotFromSet(b) => self.emit_chars(b), } }, //§ script-data-state states::RawData(ScriptData) => loop { match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet('<') => go!(self: to RawLessThanSign ScriptData), FromSet(c) => go!(self: emit c), NotFromSet(b) => self.emit_chars(b), } }, //§ script-data-escaped-state states::RawData(ScriptDataEscaped(Escaped)) => loop { match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped), FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped), FromSet(c) => go!(self: emit c), NotFromSet(b) => self.emit_chars(b), } }, //§ 
script-data-double-escaped-state states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop { match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped), FromSet('<') => { go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped) }, FromSet(c) => go!(self: emit c), NotFromSet(b) => self.emit_chars(b), } }, //§ plaintext-state states::Plaintext => loop { match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet(c) => go!(self: emit c), NotFromSet(b) => self.emit_chars(b), } }, //§ tag-open-state states::TagOpen => loop { match get_char!(self, input) { '!' => go!(self: clear_temp; to MarkupDeclarationOpen), '/' => go!(self: to EndTagOpen), '?' => go!(self: error; clear_comment; push_comment '?'; to BogusComment), c => match lower_ascii_letter(c) { Some(cl) => go!(self: create_tag StartTag cl; to TagName), None => go!(self: error; emit '<'; reconsume Data), }, } }, //§ end-tag-open-state states::EndTagOpen => loop { match get_char!(self, input) { '>' => go!(self: error; to Data), '\0' => { go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment) }, c => match lower_ascii_letter(c) { Some(cl) => go!(self: create_tag EndTag cl; to TagName), None => go!(self: error; clear_comment; push_comment c; to BogusComment), }, } }, //§ tag-name-state states::TagName => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName), '/' => go!(self: to SelfClosingStartTag), '>' => go!(self: emit_tag Data), '\0' => go!(self: error; push_tag '\u{fffd}'), c => go!(self: push_tag (c.to_ascii_lowercase())), } }, //§ script-data-escaped-less-than-sign-state states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop { match get_char!(self, input) { '/' => go!(self: clear_temp; to RawEndTagOpen 
ScriptDataEscaped Escaped), c => match lower_ascii_letter(c) { Some(cl) => go!(self: clear_temp; push_temp cl; emit '<'; emit c; to ScriptDataEscapeStart DoubleEscaped), None => go!(self: emit '<'; reconsume RawData ScriptDataEscaped Escaped), }, } }, //§ script-data-double-escaped-less-than-sign-state states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop { match get_char!(self, input) { '/' => go!(self: clear_temp; emit '/'; to ScriptDataDoubleEscapeEnd), _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped), } }, //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state // otherwise states::RawLessThanSign(kind) => loop { match get_char!(self, input) { '/' => go!(self: clear_temp; to RawEndTagOpen kind), '!' if kind == ScriptData => { go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped) }, _ => go!(self: emit '<'; reconsume RawData kind), } }, //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state states::RawEndTagOpen(kind) => loop { let c = get_char!(self, input); match lower_ascii_letter(c) { Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind), None => go!(self: emit '<'; emit '/'; reconsume RawData kind), } }, //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state states::RawEndTagName(kind) => loop { let c = get_char!(self, input); if self.have_appropriate_end_tag() { match c { '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName), '/' => go!(self: to SelfClosingStartTag), '>' => go!(self: emit_tag Data), _ => (), } } match lower_ascii_letter(c) { Some(cl) => go!(self: push_tag cl; push_temp c), None => { go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind) }, } }, //§ script-data-double-escape-start-state states::ScriptDataEscapeStart(DoubleEscaped) => loop { let c = get_char!(self, 
input); match c { '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => { let esc = if &*self.temp_buf == "script" { DoubleEscaped } else { Escaped }; go!(self: emit c; to RawData ScriptDataEscaped esc); }, _ => match lower_ascii_letter(c) { Some(cl) => go!(self: push_temp cl; emit c), None => go!(self: reconsume RawData ScriptDataEscaped Escaped), }, } }, //§ script-data-escape-start-state states::ScriptDataEscapeStart(Escaped) => loop { match get_char!(self, input) { '-' => go!(self: emit '-'; to ScriptDataEscapeStartDash), _ => go!(self: reconsume RawData ScriptData), } }, //§ script-data-escape-start-dash-state states::ScriptDataEscapeStartDash => loop { match get_char!(self, input) { '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash Escaped), _ => go!(self: reconsume RawData ScriptData), } }, //§ script-data-escaped-dash-state script-data-double-escaped-dash-state states::ScriptDataEscapedDash(kind) => loop { match get_char!(self, input) { '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash kind), '<' => { if kind == DoubleEscaped { go!(self: emit '<'); } go!(self: to RawLessThanSign ScriptDataEscaped kind); }, '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind), c => go!(self: emit c; to RawData ScriptDataEscaped kind), } }, //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state states::ScriptDataEscapedDashDash(kind) => loop { match get_char!(self, input) { '-' => go!(self: emit '-'), '<' => { if kind == DoubleEscaped { go!(self: emit '<'); } go!(self: to RawLessThanSign ScriptDataEscaped kind); }, '>' => go!(self: emit '>'; to RawData ScriptData), '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind), c => go!(self: emit c; to RawData ScriptDataEscaped kind), } }, //§ script-data-double-escape-end-state states::ScriptDataDoubleEscapeEnd => loop { let c = get_char!(self, input); match c { '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => { let esc = if &*self.temp_buf == "script" { Escaped } 
else { DoubleEscaped }; go!(self: emit c; to RawData ScriptDataEscaped esc); }, _ => match lower_ascii_letter(c) { Some(cl) => go!(self: push_temp cl; emit c), None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped), }, } }, //§ before-attribute-name-state states::BeforeAttributeName => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), '/' => go!(self: to SelfClosingStartTag), '>' => go!(self: emit_tag Data), '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName), c => match lower_ascii_letter(c) { Some(cl) => go!(self: create_attr cl; to AttributeName), None => { go_match!(self: c, '"' , '\'' , '<' , '=' => error); go!(self: create_attr c; to AttributeName); }, }, } }, //§ attribute-name-state states::AttributeName => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName), '/' => go!(self: to SelfClosingStartTag), '=' => go!(self: to BeforeAttributeValue), '>' => go!(self: emit_tag Data), '\0' => go!(self: error; push_name '\u{fffd}'), c => match lower_ascii_letter(c) { Some(cl) => go!(self: push_name cl), None => { go_match!(self: c, '"' , '\'' , '<' => error); go!(self: push_name c); }, }, } }, //§ after-attribute-name-state states::AfterAttributeName => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), '/' => go!(self: to SelfClosingStartTag), '=' => go!(self: to BeforeAttributeValue), '>' => go!(self: emit_tag Data), '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName), c => match lower_ascii_letter(c) { Some(cl) => go!(self: create_attr cl; to AttributeName), None => { go_match!(self: c, '"' , '\'' , '<' => error); go!(self: create_attr c; to AttributeName); }, }, } }, //§ before-attribute-value-state // Use peek so we can handle the first attr character along with the rest, // hopefully in the same zero-copy buffer. 
states::BeforeAttributeValue => loop { match peek!(self, input) { '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input), '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted), '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted), '\0' => { go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted) }, '>' => go!(self: discard_char input; error; emit_tag Data), _ => go!(self: to AttributeValue Unquoted), } }, //§ attribute-value-(double-quoted)-state states::AttributeValue(DoubleQuoted) => loop { match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) { FromSet('"') => go!(self: to AfterAttributeValueQuoted), FromSet('&') => go!(self: consume_char_ref '"'), FromSet('\0') => go!(self: error; push_value '\u{fffd}'), FromSet(c) => go!(self: push_value c), NotFromSet(ref b) => go!(self: append_value b), } }, //§ attribute-value-(single-quoted)-state states::AttributeValue(SingleQuoted) => loop { match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) { FromSet('\'') => go!(self: to AfterAttributeValueQuoted), FromSet('&') => go!(self: consume_char_ref '\''), FromSet('\0') => go!(self: error; push_value '\u{fffd}'), FromSet(c) => go!(self: push_value c), NotFromSet(ref b) => go!(self: append_value b), } }, //§ attribute-value-(unquoted)-state states::AttributeValue(Unquoted) => loop { match pop_except_from!( self, input, small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0') ) { FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => { go!(self: to BeforeAttributeName) }, FromSet('&') => go!(self: consume_char_ref '>'), FromSet('>') => go!(self: emit_tag Data), FromSet('\0') => go!(self: error; push_value '\u{fffd}'), FromSet(c) => { go_match!(self: c, '"' , '\'' , '<' , '=' , '`' => error); go!(self: push_value c); }, NotFromSet(ref b) => go!(self: append_value b), } }, //§ after-attribute-value-(quoted)-state states::AfterAttributeValueQuoted 
=> loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName), '/' => go!(self: to SelfClosingStartTag), '>' => go!(self: emit_tag Data), _ => go!(self: error; reconsume BeforeAttributeName), } }, //§ self-closing-start-tag-state states::SelfClosingStartTag => loop { match get_char!(self, input) { '>' => { self.current_tag_self_closing = true; go!(self: emit_tag Data); }, _ => go!(self: error; reconsume BeforeAttributeName), } }, //§ comment-start-state states::CommentStart => loop { match get_char!(self, input) { '-' => go!(self: to CommentStartDash), '\0' => go!(self: error; push_comment '\u{fffd}'; to Comment), '>' => go!(self: error; emit_comment; to Data), c => go!(self: push_comment c; to Comment), } }, //§ comment-start-dash-state states::CommentStartDash => loop { match get_char!(self, input) { '-' => go!(self: to CommentEnd), '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment), '>' => go!(self: error; emit_comment; to Data), c => go!(self: push_comment '-'; push_comment c; to Comment), } }, //§ comment-state states::Comment => loop { match get_char!(self, input) { '-' => go!(self: to CommentEndDash), '\0' => go!(self: error; push_comment '\u{fffd}'), c => go!(self: push_comment c), } }, //§ comment-end-dash-state states::CommentEndDash => loop { match get_char!(self, input) { '-' => go!(self: to CommentEnd), '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment), c => go!(self: push_comment '-'; push_comment c; to Comment), } }, //§ comment-end-state states::CommentEnd => loop { match get_char!(self, input) { '>' => go!(self: emit_comment; to Data), '\0' => go!(self: error; append_comment "--\u{fffd}"; to Comment), '!' 
=> go!(self: error; to CommentEndBang), '-' => go!(self: error; push_comment '-'), c => go!(self: error; append_comment "--"; push_comment c; to Comment), } }, //§ comment-end-bang-state states::CommentEndBang => loop { match get_char!(self, input) { '-' => go!(self: append_comment "--!"; to CommentEndDash), '>' => go!(self: emit_comment; to Data), '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment), c => go!(self: append_comment "--!"; push_comment c; to Comment), } }, //§ doctype-state states::Doctype => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName), _ => go!(self: error; reconsume BeforeDoctypeName), } }, //§ before-doctype-name-state states::BeforeDoctypeName => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), '\0' => { go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName) }, '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data), c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase()); to DoctypeName), } }, //§ doctype-name-state states::DoctypeName => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName), '>' => go!(self: emit_doctype; to Data), '\0' => go!(self: error; push_doctype_name '\u{fffd}'), c => go!(self: push_doctype_name (c.to_ascii_lowercase())), } }, //§ after-doctype-name-state states::AfterDoctypeName => loop { if eat!(self, input, "public") { go!(self: to AfterDoctypeKeyword Public); } else if eat!(self, input, "system") { go!(self: to AfterDoctypeKeyword System); } else { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), '>' => go!(self: emit_doctype; to Data), _ => go!(self: error; force_quirks; to BogusDoctype), } } }, //§ after-doctype-public-keyword-state after-doctype-system-keyword-state states::AfterDoctypeKeyword(kind) => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => go!(self: to 
BeforeDoctypeIdentifier kind), '"' => { go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind) }, '\'' => { go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind) }, '>' => go!(self: error; force_quirks; emit_doctype; to Data), _ => go!(self: error; force_quirks; to BogusDoctype), } }, //§ before-doctype-public-identifier-state before-doctype-system-identifier-state states::BeforeDoctypeIdentifier(kind) => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind), '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind), '>' => go!(self: error; force_quirks; emit_doctype; to Data), _ => go!(self: error; force_quirks; to BogusDoctype), } }, //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state states::DoctypeIdentifierDoubleQuoted(kind) => loop { match get_char!(self, input) { '"' => go!(self: to AfterDoctypeIdentifier kind), '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'), '>' => go!(self: error; force_quirks; emit_doctype; to Data), c => go!(self: push_doctype_id kind c), } }, //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state states::DoctypeIdentifierSingleQuoted(kind) => loop { match get_char!(self, input) { '\'' => go!(self: to AfterDoctypeIdentifier kind), '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'), '>' => go!(self: error; force_quirks; emit_doctype; to Data), c => go!(self: push_doctype_id kind c), } }, //§ after-doctype-public-identifier-state states::AfterDoctypeIdentifier(Public) => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => { go!(self: to BetweenDoctypePublicAndSystemIdentifiers) }, '>' => go!(self: emit_doctype; to Data), '"' => { go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System) }, '\'' => { go!(self: error; 
clear_doctype_id System; to DoctypeIdentifierSingleQuoted System) }, _ => go!(self: error; force_quirks; to BogusDoctype), } }, //§ after-doctype-system-identifier-state states::AfterDoctypeIdentifier(System) => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), '>' => go!(self: emit_doctype; to Data), _ => go!(self: error; to BogusDoctype), } }, //§ between-doctype-public-and-system-identifiers-state states::BetweenDoctypePublicAndSystemIdentifiers => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), '>' => go!(self: emit_doctype; to Data), '"' => { go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System) }, '\'' => { go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System) }, _ => go!(self: error; force_quirks; to BogusDoctype), } }, //§ bogus-doctype-state states::BogusDoctype => loop { match get_char!(self, input) { '>' => go!(self: emit_doctype; to Data), _ => (), } }, //§ bogus-comment-state states::BogusComment => loop { match get_char!(self, input) { '>' => go!(self: emit_comment; to Data), '\0' => go!(self: push_comment '\u{fffd}'), c => go!(self: push_comment c), } }, //§ markup-declaration-open-state states::MarkupDeclarationOpen => loop { if eat_exact!(self, input, "--") { go!(self: clear_comment; to CommentStart); } else if eat!(self, input, "doctype") { go!(self: to Doctype); } else { if self .sink .adjusted_current_node_present_but_not_in_html_namespace() { if eat_exact!(self, input, "[CDATA[") { go!(self: clear_temp; to CdataSection); } } go!(self: error; to BogusComment); } }, //§ cdata-section-state states::CdataSection => loop { match get_char!(self, input) { ']' => go!(self: to CdataSectionBracket), '\0' => go!(self: emit_temp; emit '\0'), c => go!(self: push_temp c), } }, //§ cdata-section-bracket states::CdataSectionBracket => match get_char!(self, input) { ']' => go!(self: to CdataSectionEnd), _ => go!(self: push_temp ']'; reconsume CdataSection), }, //§ 
//§ cdata-section-end
            states::CdataSectionEnd => loop {
                match get_char!(self, input) {
                    ']' => go!(self: push_temp ']'),
                    '>' => go!(self: emit_temp; to Data),
                    _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
                }
            },
            //§ END
        }
    }

    /// Run one step of the character reference sub-tokenizer.
    ///
    /// The sub-tokenizer is moved out of `self.char_ref_tokenizer` for the
    /// duration of the step. When it reports `Done`, its result is handed to
    /// `process_char_ref` and the slot is left empty; otherwise it is put
    /// back and the outcome is translated (`Stuck` => `Suspend`,
    /// `Progress` => `Continue`).
    ///
    /// # Panics
    ///
    /// Panics if no character reference sub-tokenizer is currently active.
    fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
        // FIXME HACK: Take and replace the tokenizer so we don't
        // double-mut-borrow self.  This is why it's boxed.
        let mut tok = self.char_ref_tokenizer.take().unwrap();

        let progress = match tok.step(self, input) {
            char_ref::Done => {
                // Finished: deliberately do not restore the sub-tokenizer.
                self.process_char_ref(tok.get_result());
                return ProcessResult::Continue;
            },
            char_ref::Stuck => ProcessResult::Suspend,
            char_ref::Progress => ProcessResult::Continue,
        };

        self.char_ref_tokenizer = Some(tok);
        progress
    }

    /// Deliver the characters of a finished character reference to the
    /// current state.
    ///
    /// A result with `num_chars == 0` is replaced by a single literal `'&'`.
    /// In `Data` and RCDATA the characters are emitted; in any
    /// `AttributeValue` state they are pushed onto the current attribute
    /// value.
    ///
    /// # Panics
    ///
    /// Panics if the tokenizer is in any other state.
    fn process_char_ref(&mut self, char_ref: CharRef) {
        let CharRef {
            mut chars,
            mut num_chars,
        } = char_ref;

        // Nothing was matched: the '&' stands for itself.
        if num_chars == 0 {
            chars[0] = '&';
            num_chars = 1;
        }

        for &c in chars.iter().take(num_chars as usize) {
            match self.state {
                states::Data | states::RawData(states::Rcdata) => go!(self: emit c),

                states::AttributeValue(_) => go!(self: push_value c),

                _ => panic!(
                    "state {:?} should not be reachable in process_char_ref",
                    self.state
                ),
            }
        }
    }

    /// Indicate that we have reached the end of the input.
    ///
    /// Finishes any in-progress character reference, runs the tokenizer once
    /// more with `at_eof` set, then calls `eof_step` repeatedly until it
    /// returns `Suspend`. Finally notifies the sink and, if profiling is
    /// enabled in the options, prints the profile.
    ///
    /// # Panics
    ///
    /// Panics if the final `run` does not return `TokenizerResult::Done`, or
    /// if the scratch input queue is not empty afterwards.
    pub fn end(&mut self) {
        // Handle EOF in the char ref sub-tokenizer, if there is one.
        // Do this first because it might un-consume stuff.
        let mut input = BufferQueue::new();
        if let Some(mut tok) = self.char_ref_tokenizer.take() {
            tok.end_of_file(self, &mut input);
            self.process_char_ref(tok.get_result());
        }

        // Process all remaining buffered input.
        // If we're waiting for lookahead, we're not gonna get it.
        self.at_eof = true;
        assert!(matches!(self.run(&mut input), TokenizerResult::Done));
        assert!(input.is_empty());

        loop {
            match self.eof_step() {
                ProcessResult::Continue => (),
                ProcessResult::Suspend => break,
                ProcessResult::Script(_) => unreachable!(),
            }
        }

        self.sink.end();

        if self.opts.profile {
            self.dump_profile();
        }
    }

    /// Print the per-state timing profile to stdout, slowest state first.
    ///
    /// Each row shows the recorded time for a state, its share of the total
    /// across all states, and the state itself.
    fn dump_profile(&self) {
        let mut results: Vec<(states::State, u64)> =
            self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
        // Descending by time.
        results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));

        let total: u64 = results.iter().map(|&(_, t)| t).sum();
        println!("\nTokenizer profile, in nanoseconds");
        println!("\n{:12}         total in token sink", self.time_in_sink);
        println!("\n{:12}         total in tokenizer", total);

        for (k, v) in results {
            // Avoid printing NaN when every recorded time is zero.
            let pct = if total == 0 {
                0.0
            } else {
                100.0 * (v as f64) / (total as f64)
            };
            println!("{:12}  {:4.1}%  {:?}", v, pct, k);
        }
    }

    /// Perform one step of end-of-file processing for the current state.
    ///
    /// `end` calls this in a loop until it returns `Suspend`. Each arm
    /// dispatches on `self.state` and runs the corresponding `go!` actions.
    fn eof_step(&mut self) -> ProcessResult<Sink::Handle> {
        debug!("processing EOF in state {:?}", self.state);
        match self.state {
            states::Data |
            states::RawData(Rcdata) |
            states::RawData(Rawtext) |
            states::RawData(ScriptData) |
            states::Plaintext => go!(self: eof),

            states::TagName |
            states::RawData(ScriptDataEscaped(_)) |
            states::BeforeAttributeName |
            states::AttributeName |
            states::AfterAttributeName |
            states::BeforeAttributeValue |
            states::AttributeValue(_) |
            states::AfterAttributeValueQuoted |
            states::SelfClosingStartTag |
            states::ScriptDataEscapedDash(_) |
            states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),

            states::TagOpen => go!(self: error_eof; emit '<'; to Data),

            states::EndTagOpen => go!(self: error_eof; emit '<'; emit '/'; to Data),

            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind),

            states::RawEndTagOpen(kind) => go!(self: emit '<'; emit '/'; to RawData kind),

            states::RawEndTagName(kind) => {
                go!(self: emit '<'; emit '/'; emit_temp; to RawData kind)
            },

            states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),

            states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),

            states::ScriptDataDoubleEscapeEnd => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            states::CommentStart |
            states::CommentStartDash |
            states::Comment |
            states::CommentEndDash |
            states::CommentEnd |
            states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),

            states::Doctype | states::BeforeDoctypeName => {
                go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data)
            },

            states::DoctypeName |
            states::AfterDoctypeName |
            states::AfterDoctypeKeyword(_) |
            states::BeforeDoctypeIdentifier(_) |
            states::DoctypeIdentifierDoubleQuoted(_) |
            states::DoctypeIdentifierSingleQuoted(_) |
            states::AfterDoctypeIdentifier(_) |
            states::BetweenDoctypePublicAndSystemIdentifiers => {
                go!(self: error_eof; force_quirks; emit_doctype; to Data)
            },

            states::BogusDoctype => go!(self: emit_doctype; to Data),

            states::BogusComment => go!(self: emit_comment; to Data),

            states::MarkupDeclarationOpen => go!(self: error; to BogusComment),

            states::CdataSection => go!(self: emit_temp; error_eof; to Data),

            states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),

            states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
        }
    }
}

#[cfg(test)]
#[allow(non_snake_case)]
mod test {
    use super::option_push; // private items
    use crate::tendril::{SliceExt, StrTendril};

    use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};

    use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
    use super::interface::{EndTag, StartTag, Tag, TagKind};
    use super::interface::{TagToken, Token};

    use markup5ever::buffer_queue::BufferQueue;
    use std::mem::replace;

    use crate::LocalName;

    // LinesMatch implements the TokenSink trait. It is used for testing to see
    // if current_line is being updated when process_token is called. The lines
    // vector is a collection of the line numbers that each token is on.
    struct LinesMatch {
        tokens: Vec<Token>,
        current_str: StrTendril,
        lines: Vec<(Token, u64)>,
    }

    impl LinesMatch {
        fn new() -> LinesMatch {
            LinesMatch {
                tokens: vec![],
                current_str: StrTendril::new(),
                lines: vec![],
            }
        }

        // Record a non-character token together with the line it was seen on,
        // flushing any pending character data first.
        fn push(&mut self, token: Token, line_number: u64) {
            self.finish_str();
            self.lines.push((token, line_number));
        }

        // Move accumulated character data, if any, into `tokens`.
        fn finish_str(&mut self) {
            if !self.current_str.is_empty() {
                let s = replace(&mut self.current_str, StrTendril::new());
                self.tokens.push(CharacterTokens(s));
            }
        }
    }

    impl TokenSink for LinesMatch {
        type Handle = ();

        fn process_token(
            &mut self,
            token: Token,
            line_number: u64,
        ) -> TokenSinkResult<()> {
            match token {
                CharacterTokens(b) => {
                    self.current_str.push_slice(&b);
                },

                NullCharacterToken => {
                    self.current_str.push_char('\0');
                },

                ParseError(_) => {
                    panic!("unexpected parse error");
                },

                TagToken(mut t) => {
                    // The spec seems to indicate that one can emit
                    // erroneous end tags with attrs, but the test
                    // cases don't contain them.
                    match t.kind {
                        EndTag => {
                            t.self_closing = false;
                            t.attrs = vec![];
                        },
                        _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
                    }
                    self.push(TagToken(t), line_number);
                },

                EOFToken => (),

                _ => self.push(token, line_number),
            }
            TokenSinkResult::Continue
        }
    }

    // Take in tokens, process them, and return vector with line
    // numbers that each token is on
    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
        let sink = LinesMatch::new();
        let mut tok = Tokenizer::new(sink, opts);
        let mut buffer = BufferQueue::new();
        for chunk in input {
            buffer.push_back(chunk);
            let _ = tok.feed(&mut buffer);
        }
        tok.end();
        tok.sink.lines
    }

    // Create a tag token
    fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
        let name = LocalName::from(&*token);
        TagToken(Tag {
            kind: tagkind,
            name,
            self_closing: false,
            attrs: vec![],
        })
    }

    #[test]
    fn push_to_None_gives_singleton() {
        let mut s: Option<StrTendril> = None;
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_empty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::new());
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_nonempty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
        option_push(&mut s, 'x');
        assert_eq!(s, Some("yx".to_tendril()));
    }

    #[test]
    fn check_lines() {
        let opts = TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        };
        // One tag per chunk, each terminated by a newline, so every tag
        // lands on its own line.
        let vector = vec![
            StrTendril::from("<a>\n"),
            StrTendril::from("<b>\n"),
            StrTendril::from("</b>\n"),
            StrTendril::from("</a>\n"),
        ];
        let expected = vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ];
        let results = tokenize(vector, opts);
        assert_eq!(results, expected);
    }

    #[test]
    fn check_lines_with_new_line() {
        let opts = TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        };
        // Same as `check_lines`, but with CRLF line endings: each "\r\n"
        // pair must count as a single line break.
        let vector = vec![
            StrTendril::from("<a>\r\n"),
            StrTendril::from("<b>\r\n"),
            StrTendril::from("</b>\r\n"),
            StrTendril::from("</a>\r\n"),
        ];
        let expected = vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ];
        let results = tokenize(vector, opts);
        assert_eq!(results, expected);
    }
}