author      Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
committer   Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
commit      36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree        105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/rust/xml-rs/src/reader/lexer.rs
parent      Initial commit. (diff)
Adding upstream version 115.7.0esr.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/xml-rs/src/reader/lexer.rs')
-rw-r--r--  third_party/rust/xml-rs/src/reader/lexer.rs  867
1 file changed, 867 insertions(+), 0 deletions(-)

diff --git a/third_party/rust/xml-rs/src/reader/lexer.rs b/third_party/rust/xml-rs/src/reader/lexer.rs
new file mode 100644
index 0000000000..c466db9210
--- /dev/null
+++ b/third_party/rust/xml-rs/src/reader/lexer.rs
@@ -0,0 +1,867 @@

//! Contains a simple lexer for XML documents.
//!
//! This module is for internal use. Use the `xml::pull` module to do parsing.

use std::fmt;
use std::collections::VecDeque;
use std::io::Read;
use std::result;
use std::borrow::Cow;

use common::{Position, TextPosition, is_whitespace_char, is_name_char};
use reader::Error;
use util;

/// `Token` represents a single lexeme of an XML document. These lexemes
/// are used to perform actual parsing.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum Token {
    /// `<?`
    ProcessingInstructionStart,
    /// `?>`
    ProcessingInstructionEnd,
    /// `<!DOCTYPE`
    DoctypeStart,
    /// `<`
    OpeningTagStart,
    /// `</`
    ClosingTagStart,
    /// `>`
    TagEnd,
    /// `/>`
    EmptyTagEnd,
    /// `<!--`
    CommentStart,
    /// `-->`
    CommentEnd,
    /// A chunk of characters, used for error recovery.
    Chunk(&'static str),
    /// Any non-special character except whitespace.
    Character(char),
    /// A whitespace character.
    Whitespace(char),
    /// `=`
    EqualsSign,
    /// `'`
    SingleQuote,
    /// `"`
    DoubleQuote,
    /// `<![CDATA[`
    CDataStart,
    /// `]]>`
    CDataEnd,
    /// `&`
    ReferenceStart,
    /// `;`
    ReferenceEnd,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            Token::Chunk(s) => write!(f, "{}", s),
            Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c),
            other => write!(f, "{}", match other {
                Token::OpeningTagStart => "<",
                Token::ProcessingInstructionStart => "<?",
                Token::DoctypeStart => "<!DOCTYPE",
                Token::ClosingTagStart => "</",
                Token::CommentStart => "<!--",
                Token::CDataStart => "<![CDATA[",
                Token::TagEnd => ">",
                Token::EmptyTagEnd => "/>",
                Token::ProcessingInstructionEnd => "?>",
                Token::CommentEnd => "-->",
                Token::CDataEnd => "]]>",
                Token::ReferenceStart => "&",
                Token::ReferenceEnd => ";",
                Token::EqualsSign => "=",
                Token::SingleQuote => "'",
                Token::DoubleQuote => "\"",
                _ => unreachable!()
            })
        }
    }
}

impl Token {
    pub fn as_static_str(&self) -> Option<&'static str> {
        match *self {
            Token::OpeningTagStart => Some("<"),
            Token::ProcessingInstructionStart => Some("<?"),
            Token::DoctypeStart => Some("<!DOCTYPE"),
            Token::ClosingTagStart => Some("</"),
            Token::CommentStart => Some("<!--"),
            Token::CDataStart => Some("<![CDATA["),
            Token::TagEnd => Some(">"),
            Token::EmptyTagEnd => Some("/>"),
            Token::ProcessingInstructionEnd => Some("?>"),
            Token::CommentEnd => Some("-->"),
            Token::CDataEnd => Some("]]>"),
            Token::ReferenceStart => Some("&"),
            Token::ReferenceEnd => Some(";"),
            Token::EqualsSign => Some("="),
            Token::SingleQuote => Some("'"),
            Token::DoubleQuote => Some("\""),
            Token::Chunk(s) => Some(s),
            _ => None
        }
    }

    // Using `String::push_str(&token.to_string())` is simply way too slow.
    pub fn push_to_string(&self, target: &mut String) {
        match self.as_static_str() {
            Some(s) => { target.push_str(s); }
            None => {
                match *self {
                    Token::Character(c) | Token::Whitespace(c) => target.push(c),
                    _ => unreachable!()
                }
            }
        }
    }

    /// Returns `true` if this token contains data that can be interpreted
    /// as part of the text. Surprisingly, this also includes `>`, `=`,
    /// `"`, `'` and `-->`.
    #[inline]
    pub fn contains_char_data(&self) -> bool {
        match *self {
            Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd |
            Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote | Token::CDataEnd |
            Token::ProcessingInstructionEnd | Token::EmptyTagEnd => true,
            _ => false
        }
    }

    /// Returns `true` if this token corresponds to a whitespace character.
    #[inline]
    pub fn is_whitespace(&self) -> bool {
        match *self {
            Token::Whitespace(_) => true,
            _ => false
        }
    }
}
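
// Editor's illustration, not part of the vendored file: a minimal sketch of
// why `push_to_string`/`contains_char_data` exist. Text content can be
// accumulated from a token stream without allocating a `String` per token.
// `collect_text` and the sample token list below are hypothetical names.
#[allow(dead_code)]
fn collect_text(tokens: &[Token]) -> String {
    let mut text = String::new();
    for t in tokens {
        if t.contains_char_data() {
            t.push_to_string(&mut text); // appends the lexeme's text form in place
        }
    }
    text
}
// collect_text(&[Token::Character('a'), Token::Whitespace(' '), Token::Character('b')])
// yields "a b".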

enum State {
    /// Triggered on '<'
    TagStarted,
    /// Triggered on '<!'
    CommentOrCDataOrDoctypeStarted,
    /// Triggered on '<!-'
    CommentStarted,
    /// Triggered on '<!D' up to '<!DOCTYPE'
    DoctypeStarted(DoctypeStartedSubstate),
    /// Triggered after DoctypeStarted to handle sub elements
    DoctypeFinishing(u8),
    /// Triggered on '<![' up to '<![CDATA'
    CDataStarted(CDataStartedSubstate),
    /// Triggered on '?'
    ProcessingInstructionClosing,
    /// Triggered on '/'
    EmptyTagClosing,
    /// Triggered on '-' up to '--'
    CommentClosing(ClosingSubstate),
    /// Triggered on ']' up to ']]'
    CDataClosing(ClosingSubstate),
    /// Default state
    Normal
}

#[derive(Copy, Clone)]
enum ClosingSubstate {
    First, Second
}

#[derive(Copy, Clone)]
enum DoctypeStartedSubstate {
    D, DO, DOC, DOCT, DOCTY, DOCTYP
}

#[derive(Copy, Clone)]
enum CDataStartedSubstate {
    E, C, CD, CDA, CDAT, CDATA
}

/// `Result` represents a lexing result. It is either a token or an error message.
pub type Result = result::Result<Option<Token>, Error>;

/// Helps to set up a dispatch table for lexing large unambiguous tokens like
/// `<![CDATA[` or `<!DOCTYPE `.
macro_rules! dispatch_on_enum_state(
    ($_self:ident, $s:expr, $c:expr, $is:expr,
     $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+;
     $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => (
        match $s {
            $(
                $st => match $c {
                    $stc => $_self.move_to($is($next_st)),
                    _ => $_self.handle_error($chunk, $c)
                },
            )+
            $end_st => match $c {
                $end_c => $e,
                _ => $_self.handle_error($end_chunk, $c)
            }
        }
    )
);
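
// Editor's illustration, not part of the vendored file: a hand-written sketch
// of (approximately) what `dispatch_on_enum_state!` expands to for the first
// and last rows of the CDATA table used by `cdata_started` further below.
//
//     match s {
//         E => match c {
//             'C' => self.move_to(State::CDataStarted(C)),
//             _   => self.handle_error("<![", c),
//         },
//         /* ... analogous arms for C, CD, CDA, CDAT ... */
//         CDATA => match c {
//             '[' => self.move_to_with(State::Normal, Token::CDataStart),
//             _   => self.handle_error("<![CDATA", c),
//         },
//     }
//
// Each matched character advances to the next substate; any other character
// hands the prefix read so far to `handle_error` for recovery or reporting.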

/// `Lexer` is a lexer for XML documents, which implements a pull API.
///
/// The main method is `next_token`, which accepts an `std::io::Read` instance
/// and tries to read the next lexeme from it.
///
/// When the `skip_errors` flag is set, invalid lexemes are returned as `Chunk`s.
/// When it is not set, errors are reported as `Err` objects with a string message.
/// By default this flag is not set. Use the `enable_errors` and `disable_errors`
/// methods to toggle the behavior.
pub struct Lexer {
    pos: TextPosition,
    head_pos: TextPosition,
    char_queue: VecDeque<char>,
    st: State,
    skip_errors: bool,
    inside_comment: bool,
    inside_token: bool,
    eof_handled: bool
}

impl Position for Lexer {
    #[inline]
    /// Returns the position of the last token produced by the lexer.
    fn position(&self) -> TextPosition { self.pos }
}
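
// Editor's illustration, not part of the vendored file: the pull API in use.
// `print_lexemes` is a hypothetical helper that drains tokens one at a time
// until `Ok(None)` signals the end of the stream.
#[allow(dead_code)]
fn print_lexemes(xml: &str) {
    let mut lexer = Lexer::new();
    let mut reader = xml.as_bytes(); // &[u8] implements std::io::Read
    loop {
        match lexer.next_token(&mut reader) {
            Ok(Some(token)) => println!("{:?}", token),
            Ok(None) => break, // end of stream
            Err(e) => { println!("lexer error: {}", e); break; }
        }
    }
}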

impl Lexer {
    /// Returns a new lexer with default state.
    pub fn new() -> Lexer {
        Lexer {
            pos: TextPosition::new(),
            head_pos: TextPosition::new(),
            char_queue: VecDeque::with_capacity(4), // TODO: check size
            st: State::Normal,
            skip_errors: false,
            inside_comment: false,
            inside_token: false,
            eof_handled: false
        }
    }

    /// Enables error handling so `next_token` will return `Some(Err(..))`
    /// upon invalid lexeme.
    #[inline]
    pub fn enable_errors(&mut self) { self.skip_errors = false; }

    /// Disables error handling so `next_token` will return `Some(Chunk(..))`
    /// upon invalid lexeme with this lexeme content.
    #[inline]
    pub fn disable_errors(&mut self) { self.skip_errors = true; }

    /// Enables special handling of some lexemes which should be done when we're parsing comment
    /// internals.
    #[inline]
    pub fn inside_comment(&mut self) { self.inside_comment = true; }

    /// Disables the effect of the `inside_comment()` method.
    #[inline]
    pub fn outside_comment(&mut self) { self.inside_comment = false; }

    /// Resets the "EOF handled" flag of the lexer.
    #[inline]
    pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }

    /// Tries to read the next token from the buffer.
    ///
    /// It is possible to pass different instances of `BufReader` each time
    /// this method is called, but the resulting behavior is undefined in this case.
    ///
    /// Return value:
    /// * `Err(reason) where reason: reader::Error` - when an error occurs;
    /// * `Ok(None)` - when the end of stream is reached;
    /// * `Ok(Some(token)) where token: Token` - when a complete token has been read from the stream.
    pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result {
        // Already reached end of buffer
        if self.eof_handled {
            return Ok(None);
        }

        if !self.inside_token {
            self.pos = self.head_pos;
            self.inside_token = true;
        }

        // Check if we have saved a char or two for ourselves
        while let Some(c) = self.char_queue.pop_front() {
            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {} // continue
            }
        }

        loop {
            // TODO: this should handle multiple encodings
            let c = match try!(util::next_char_from(b)) {
                Some(c) => c,  // got next char
                None => break, // nothing left to read
            };

            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {
                    // continue
                }
            }
        }

        // Handle end of stream
        self.eof_handled = true;
        self.pos = self.head_pos;
        match self.st {
            State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
            State::CommentStarted | State::CDataStarted(_) | State::DoctypeStarted(_) |
            State::CommentClosing(ClosingSubstate::Second) |
            State::DoctypeFinishing(_) =>
                Err(self.error("Unexpected end of stream")),
            State::ProcessingInstructionClosing =>
                Ok(Some(Token::Character('?'))),
            State::EmptyTagClosing =>
                Ok(Some(Token::Character('/'))),
            State::CommentClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character('-'))),
            State::CDataClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character(']'))),
            State::CDataClosing(ClosingSubstate::Second) =>
                Ok(Some(Token::Chunk("]]"))),
            State::Normal =>
                Ok(None)
        }
    }

    #[inline]
    fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error {
        (self, msg).into()
    }

    #[inline]
    fn read_next_token(&mut self, c: char) -> Result {
        let res = self.dispatch_char(c);
        if self.char_queue.is_empty() {
            if c == '\n' {
                self.head_pos.new_line();
            } else {
                self.head_pos.advance(1);
            }
        }
        res
    }

    fn dispatch_char(&mut self, c: char) -> Result {
        match self.st {
            State::Normal => self.normal(c),
            State::TagStarted => self.tag_opened(c),
            State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
            State::CommentStarted => self.comment_started(c),
            State::CDataStarted(s) => self.cdata_started(c, s),
            State::DoctypeStarted(s) => self.doctype_started(c, s),
            State::DoctypeFinishing(d) => self.doctype_finishing(c, d),
            State::ProcessingInstructionClosing => self.processing_instruction_closing(c),
            State::EmptyTagClosing => self.empty_element_closing(c),
            State::CommentClosing(s) => self.comment_closing(c, s),
            State::CDataClosing(s) => self.cdata_closing(c, s)
        }
    }

    #[inline]
    fn move_to(&mut self, st: State) -> Result {
        self.st = st;
        Ok(None)
    }

    #[inline]
    fn move_to_with(&mut self, st: State, token: Token) -> Result {
        self.st = st;
        Ok(Some(token))
    }

    #[inline]
    fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result {
        self.char_queue.extend(cs.iter().cloned());
        self.move_to_with(st, token)
    }

    fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
        self.char_queue.push_back(c);
        if self.skip_errors || (self.inside_comment && chunk != "--") { // FIXME: looks hacky
            self.move_to_with(State::Normal, Token::Chunk(chunk))
        } else {
            Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c)))
        }
    }

    /// Encountered a char in the default state
    fn normal(&mut self, c: char) -> Result {
        match c {
            '<' => self.move_to(State::TagStarted),
            '>' => Ok(Some(Token::TagEnd)),
            '/' => self.move_to(State::EmptyTagClosing),
            '=' => Ok(Some(Token::EqualsSign)),
            '"' => Ok(Some(Token::DoubleQuote)),
            '\'' => Ok(Some(Token::SingleQuote)),
            '?' => self.move_to(State::ProcessingInstructionClosing),
            '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)),
            ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)),
            '&' => Ok(Some(Token::ReferenceStart)),
            ';' => Ok(Some(Token::ReferenceEnd)),
            _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))),
            _ => Ok(Some(Token::Character(c)))
        }
    }

    /// Encountered '<'
    fn tag_opened(&mut self, c: char) -> Result {
        match c {
            '?' => self.move_to_with(State::Normal, Token::ProcessingInstructionStart),
            '/' => self.move_to_with(State::Normal, Token::ClosingTagStart),
            '!' => self.move_to(State::CommentOrCDataOrDoctypeStarted),
            _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _ if is_name_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _ => self.handle_error("<", c)
        }
    }

    /// Encountered '<!'
    fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to(State::CommentStarted),
            '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)),
            'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)),
            _ => self.handle_error("<!", c)
        }
    }

    /// Encountered '<!-'
    fn comment_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to_with(State::Normal, Token::CommentStart),
            _ => self.handle_error("<!-", c)
        }
    }

    /// Encountered '<!['
    fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
        use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA};
        dispatch_on_enum_state!(self, s, c, State::CDataStarted,
            E     ; 'C' ; C     ; "<![",
            C     ; 'D' ; CD    ; "<![C",
            CD    ; 'A' ; CDA   ; "<![CD",
            CDA   ; 'T' ; CDAT  ; "<![CDA",
            CDAT  ; 'A' ; CDATA ; "<![CDAT";
            CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::Normal, Token::CDataStart)
        )
    }

    /// Encountered '<!D'
    fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
        use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
        dispatch_on_enum_state!(self, s, c, State::DoctypeStarted,
            D      ; 'O' ; DO     ; "<!D",
            DO     ; 'C' ; DOC    ; "<!DO",
            DOC    ; 'T' ; DOCT   ; "<!DOC",
            DOCT   ; 'Y' ; DOCTY  ; "<!DOCT",
            DOCTY  ; 'P' ; DOCTYP ; "<!DOCTY";
            DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::DoctypeFinishing(1), Token::DoctypeStart)
        )
    }

    /// State used while awaiting the closing bracket for the <!DOCTYPE tag
    fn doctype_finishing(&mut self, c: char, d: u8) -> Result {
        match c {
            '<' => self.move_to(State::DoctypeFinishing(d + 1)),
            '>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd),
            '>' => self.move_to(State::DoctypeFinishing(d - 1)),
            _ => Ok(None),
        }
    }

    /// Encountered '?'
    fn processing_instruction_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd),
            _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')),
        }
    }

    /// Encountered '/'
    fn empty_element_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd),
            _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')),
        }
    }

    /// Encountered '-'
    fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)),
                _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('-'))
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(State::Normal, Token::CommentEnd),
                // A double dash not followed by a greater-than sign is a hard error inside comments
                _ if self.inside_comment => self.handle_error("--", c),
                // Nothing else except comment closing starts with a double dash, and comment
                // closing can never be after another dash, and also we're outside of a comment,
                // therefore it is safe to push only the last read character to the list of unread
                // characters and pass the double dash directly to the output
                _ => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--"))
            }
        }
    }

    /// Encountered ']'
    fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
                _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']'))
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(State::Normal, Token::CDataEnd),
                _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']'))
            }
        }
    }
}
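
// Editor's illustration, not part of the vendored file: error recovery in
// action. With errors disabled, the invalid prefix "<!x" comes back as a
// Chunk("<!") followed by the offending character instead of an Err, exactly
// as `handle_error` above describes. `recover_example` is a hypothetical name.
#[allow(dead_code)]
fn recover_example() {
    let mut lexer = Lexer::new();
    lexer.disable_errors();
    let mut input = "<!x".as_bytes();
    assert_eq!(lexer.next_token(&mut input), Ok(Some(Token::Chunk("<!"))));
    assert_eq!(lexer.next_token(&mut input), Ok(Some(Token::Character('x'))));
}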

#[cfg(test)]
mod tests {
    use common::Position;
    use std::io::{BufReader, Cursor};

    use super::{Lexer, Token};

    macro_rules! assert_oks(
        (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
            $(
                assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf));
            )+
        })
    );

    macro_rules! assert_err(
        (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
            let err = $lex.next_token(&mut $buf);
            assert!(err.is_err());
            let err = err.unwrap_err();
            assert_eq!($r as u64, err.position().row);
            assert_eq!($c as u64, err.position().column);
            assert_eq!($s, err.msg());
        })
    );

    macro_rules! assert_none(
        (for $lex:ident and $buf:ident) => (
            assert_eq!(Ok(None), $lex.next_token(&mut $buf));
        )
    );

    fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
        (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
    }

    #[test]
    fn simple_lexer_test() {
        // Note: the input contains a literal tab between 'd' and '</b>',
        // and ends with the entity reference '&nbsp;'.
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a p='q'> x<b z="y">d	</b></a><p/> <?nm ?> <!-- a c --> &nbsp;"#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::Whitespace(' ')
            Token::Character('p')
            Token::EqualsSign
            Token::SingleQuote
            Token::Character('q')
            Token::SingleQuote
            Token::TagEnd
            Token::Whitespace(' ')
            Token::Character('x')
            Token::OpeningTagStart
            Token::Character('b')
            Token::Whitespace(' ')
            Token::Character('z')
            Token::EqualsSign
            Token::DoubleQuote
            Token::Character('y')
            Token::DoubleQuote
            Token::TagEnd
            Token::Character('d')
            Token::Whitespace('\t')
            Token::ClosingTagStart
            Token::Character('b')
            Token::TagEnd
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
            Token::OpeningTagStart
            Token::Character('p')
            Token::EmptyTagEnd
            Token::Whitespace(' ')
            Token::ProcessingInstructionStart
            Token::Character('n')
            Token::Character('m')
            Token::Whitespace(' ')
            Token::ProcessingInstructionEnd
            Token::Whitespace(' ')
            Token::CommentStart
            Token::Whitespace(' ')
            Token::Character('a')
            Token::Whitespace(' ')
            Token::Character('c')
            Token::Whitespace(' ')
            Token::CommentEnd
            Token::Whitespace(' ')
            Token::ReferenceStart
            Token::Character('n')
            Token::Character('b')
            Token::Character('s')
            Token::Character('p')
            Token::ReferenceEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn special_chars_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"?x!+ // -| ]z]]"#
        );

        assert_oks!(for lex and buf ;
            Token::Character('?')
            Token::Character('x')
            Token::Character('!')
            Token::Character('+')
            Token::Whitespace(' ')
            Token::Character('/')
            Token::Character('/')
            Token::Whitespace(' ')
            Token::Character('-')
            Token::Character('|')
            Token::Whitespace(' ')
            Token::Character(']')
            Token::Character('z')
            Token::Chunk("]]")
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn cdata_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><![CDATA[x y ?]]> </a>"#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CDataStart
            Token::Character('x')
            Token::Whitespace(' ')
            Token::Character('y')
            Token::Whitespace(' ')
            Token::Character('?')
            Token::CDataEnd
            Token::Whitespace(' ')
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }
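
    // Editor's illustrative test, not part of the vendored file: the lexer
    // emits CDataEnd for "]]>" even outside a CDATA section; rejecting it in
    // character data is left to the parser layer.
    #[test]
    fn cdata_end_outside_cdata_illustration() {
        let (mut lex, mut buf) = make_lex_and_buf("x]]>y");
        assert_oks!(for lex and buf ;
            Token::Character('x')
            Token::CDataEnd
            Token::Character('y')
        );
        assert_none!(for lex and buf);
    }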

    #[test]
    fn doctype_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab xx z> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::TagEnd
            Token::Whitespace(' ')
        );
        assert_none!(for lex and buf)
    }

    #[test]
    fn doctype_with_internal_subset_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab[<!ELEMENT ba> ]> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::TagEnd
            Token::Whitespace(' ')
        );
        assert_none!(for lex and buf)
    }

    #[test]
    fn end_of_stream_handling_ok() {
        macro_rules! eof_check(
            ($data:expr ; $token:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_oks!(for lex and buf ; $token);
                assert_none!(for lex and buf);
            })
        );
        eof_check!("?" ; Token::Character('?'));
        eof_check!("/" ; Token::Character('/'));
        eof_check!("-" ; Token::Character('-'));
        eof_check!("]" ; Token::Character(']'));
        eof_check!("]]" ; Token::Chunk("]]"));
    }
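
    // Editor's illustrative test, not part of the vendored file:
    // `doctype_finishing` counts '<'/'>' nesting, so markup declarations in
    // the internal subset do not terminate the DOCTYPE early.
    #[test]
    fn doctype_nested_internal_subset_illustration() {
        let (mut lex, mut buf) = make_lex_and_buf("<!DOCTYPE a [<!ELEMENT b (#PCDATA)>]>");
        assert_oks!(for lex and buf ;
            Token::DoctypeStart
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }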

    #[test]
    fn end_of_stream_handling_error() {
        macro_rules! eof_check(
            ($data:expr; $r:expr, $c:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream");
                assert_none!(for lex and buf);
            })
        );
        eof_check!("<" ; 0, 1);
        eof_check!("<!" ; 0, 2);
        eof_check!("<!-" ; 0, 3);
        eof_check!("<![" ; 0, 3);
        eof_check!("<![C" ; 0, 4);
        eof_check!("<![CD" ; 0, 5);
        eof_check!("<![CDA" ; 0, 6);
        eof_check!("<![CDAT" ; 0, 7);
        eof_check!("<![CDATA" ; 0, 8);
        eof_check!("--" ; 0, 2);
    }

    #[test]
    fn error_in_comment_or_cdata_prefix() {
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!' before 'x'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Chunk("<!")
            Token::Character('x')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn error_in_comment_started() {
        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!-' before '\t'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Chunk("<!-")
            Token::Whitespace('\t')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn error_in_comment_two_dashes_not_at_end() {
        let (mut lex, mut buf) = make_lex_and_buf("--x");
        lex.inside_comment();
        assert_err!(for lex and buf expect row 0; 0,
            "Unexpected token '--' before 'x'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("--x");
        assert_oks!(for lex and buf ;
            Token::Chunk("--")
            Token::Character('x')
        );
    }
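
    // Editor's illustrative test, not part of the vendored file: outside
    // comments a stray double dash surrounded by other characters is passed
    // through as Chunk("--") rather than raised as an error, per the
    // comment in `comment_closing` above.
    #[test]
    fn double_dash_outside_comment_illustration() {
        let (mut lex, mut buf) = make_lex_and_buf("a--b");
        assert_oks!(for lex and buf ;
            Token::Character('a')
            Token::Chunk("--")
            Token::Character('b')
        );
        assert_none!(for lex and buf);
    }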

    macro_rules! check_case(
        ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({
            let (mut lex, mut buf) = make_lex_and_buf($data);
            assert_err!(for lex and buf expect row $r ; $c, $s);

            let (mut lex, mut buf) = make_lex_and_buf($data);
            lex.disable_errors();
            assert_oks!(for lex and buf ;
                Token::Chunk($chunk)
                Token::Character($app)
            );
            assert_none!(for lex and buf);
        })
    );

    #[test]
    fn error_in_cdata_started() {
        check_case!("<![",      '['; "<![["      ; 0, 0, "Unexpected token '<![' before '['");
        check_case!("<![C",     '['; "<![C["     ; 0, 0, "Unexpected token '<![C' before '['");
        check_case!("<![CD",    '['; "<![CD["    ; 0, 0, "Unexpected token '<![CD' before '['");
        check_case!("<![CDA",   '['; "<![CDA["   ; 0, 0, "Unexpected token '<![CDA' before '['");
        check_case!("<![CDAT",  '['; "<![CDAT["  ; 0, 0, "Unexpected token '<![CDAT' before '['");
        check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'");
    }

    #[test]
    fn error_in_doctype_started() {
        check_case!("<!D",      'a'; "<!Da"      ; 0, 0, "Unexpected token '<!D' before 'a'");
        check_case!("<!DO",     'b'; "<!DOb"     ; 0, 0, "Unexpected token '<!DO' before 'b'");
        check_case!("<!DOC",    'c'; "<!DOCc"    ; 0, 0, "Unexpected token '<!DOC' before 'c'");
        check_case!("<!DOCT",   'd'; "<!DOCTd"   ; 0, 0, "Unexpected token '<!DOCT' before 'd'");
        check_case!("<!DOCTY",  'e'; "<!DOCTYe"  ; 0, 0, "Unexpected token '<!DOCTY' before 'e'");
        check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'");
    }

    #[test]
    fn issue_98_cdata_ending_with_right_bracket() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<![CDATA[Foo [Bar]]]>"#
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character('F')
            Token::Character('o')
            Token::Character('o')
            Token::Whitespace(' ')
            Token::Character('[')
            Token::Character('B')
            Token::Character('a')
            Token::Character('r')
            Token::Character(']')
            Token::CDataEnd
        );
        assert_none!(for lex and buf);
    }
}
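
A closing observation on the design: because every input character ends up in exactly one token, and every token's text form (via `as_static_str` or the pushed character) is exactly the characters it consumed, concatenating the string forms of all tokens reproduces the original input for documents that lex without errors. A minimal sketch of that round-trip property, assuming the module above is in scope (`roundtrip` is a hypothetical helper, not part of the vendored file):

    // Editor's illustration: lex a document and re-serialize its tokens.
    #[allow(dead_code)]
    fn roundtrip(xml: &str) -> String {
        let mut lexer = Lexer::new();
        let mut reader = xml.as_bytes();
        let mut out = String::new();
        while let Ok(Some(token)) = lexer.next_token(&mut reader) {
            token.push_to_string(&mut out); // append the lexeme's exact text form
        }
        out
    }

    // roundtrip(r#"<a p='q'>x</a>"#) == "<a p='q'>x</a>"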