author      Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
committer   Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
commit      36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree        105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/rust/xml-rs/src/reader/lexer.rs
parent      Initial commit. (diff)
Adding upstream version 115.7.0esr.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/xml-rs/src/reader/lexer.rs')
-rw-r--r--  third_party/rust/xml-rs/src/reader/lexer.rs  867
1 file changed, 867 insertions(+), 0 deletions(-)

diff --git a/third_party/rust/xml-rs/src/reader/lexer.rs b/third_party/rust/xml-rs/src/reader/lexer.rs
new file mode 100644
index 0000000000..c466db9210
--- /dev/null
+++ b/third_party/rust/xml-rs/src/reader/lexer.rs
@@ -0,0 +1,867 @@

//! Contains a simple lexer for XML documents.
//!
//! This module is for internal use. Use the `xml::pull` module to do parsing.

use std::fmt;
use std::collections::VecDeque;
use std::io::Read;
use std::result;
use std::borrow::Cow;

use common::{Position, TextPosition, is_whitespace_char, is_name_char};
use reader::Error;
use util;

/// `Token` represents a single lexeme of an XML document. These lexemes
/// are used to perform actual parsing.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum Token {
    /// `<?`
    ProcessingInstructionStart,
    /// `?>`
    ProcessingInstructionEnd,
    /// `<!DOCTYPE`
    DoctypeStart,
    /// `<`
    OpeningTagStart,
    /// `</`
    ClosingTagStart,
    /// `>`
    TagEnd,
    /// `/>`
    EmptyTagEnd,
    /// `<!--`
    CommentStart,
    /// `-->`
    CommentEnd,
    /// A chunk of characters, used for error recovery.
    Chunk(&'static str),
    /// Any non-special character except whitespace.
    Character(char),
    /// A whitespace character.
    Whitespace(char),
    /// `=`
    EqualsSign,
    /// `'`
    SingleQuote,
    /// `"`
    DoubleQuote,
    /// `<![CDATA[`
    CDataStart,
    /// `]]>`
    CDataEnd,
    /// `&`
    ReferenceStart,
    /// `;`
    ReferenceEnd,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            Token::Chunk(s) => write!(f, "{}", s),
            Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c),
            other => write!(f, "{}", match other {
                Token::OpeningTagStart => "<",
                Token::ProcessingInstructionStart => "<?",
                Token::DoctypeStart => "<!DOCTYPE",
                Token::ClosingTagStart => "</",
                Token::CommentStart => "<!--",
                Token::CDataStart => "<![CDATA[",
                Token::TagEnd => ">",
                Token::EmptyTagEnd => "/>",
                Token::ProcessingInstructionEnd => "?>",
                Token::CommentEnd => "-->",
                Token::CDataEnd => "]]>",
                Token::ReferenceStart => "&",
                Token::ReferenceEnd => ";",
                Token::EqualsSign => "=",
                Token::SingleQuote => "'",
                Token::DoubleQuote => "\"",
                _ => unreachable!()
            })
        }
    }
}

impl Token {
    pub fn as_static_str(&self) -> Option<&'static str> {
        match *self {
            Token::OpeningTagStart => Some("<"),
            Token::ProcessingInstructionStart => Some("<?"),
            Token::DoctypeStart => Some("<!DOCTYPE"),
            Token::ClosingTagStart => Some("</"),
            Token::CommentStart => Some("<!--"),
            Token::CDataStart => Some("<![CDATA["),
            Token::TagEnd => Some(">"),
            Token::EmptyTagEnd => Some("/>"),
            Token::ProcessingInstructionEnd => Some("?>"),
            Token::CommentEnd => Some("-->"),
            Token::CDataEnd => Some("]]>"),
            Token::ReferenceStart => Some("&"),
            Token::ReferenceEnd => Some(";"),
            Token::EqualsSign => Some("="),
            Token::SingleQuote => Some("'"),
            Token::DoubleQuote => Some("\""),
            Token::Chunk(s) => Some(s),
            _ => None
        }
    }

    // Using `String::push_str(&token.to_string())` is simply way too slow.
    pub fn push_to_string(&self, target: &mut String) {
        match self.as_static_str() {
            Some(s) => { target.push_str(s); }
            None => {
                match *self {
                    Token::Character(c) | Token::Whitespace(c) => target.push(c),
                    _ => unreachable!()
                }
            }
        }
    }

    /// Returns `true` if this token contains data that can be interpreted
    /// as part of the text. Surprisingly, this also includes `>`, `=`,
    /// `"`, `'` and `-->`.
    #[inline]
    pub fn contains_char_data(&self) -> bool {
        match *self {
            Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd |
            Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote | Token::CDataEnd |
            Token::ProcessingInstructionEnd | Token::EmptyTagEnd => true,
            _ => false
        }
    }

    /// Returns `true` if this token corresponds to a whitespace character.
    #[inline]
    pub fn is_whitespace(&self) -> bool {
        match *self {
            Token::Whitespace(_) => true,
            _ => false
        }
    }
}
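
// Editor's illustration, not part of the vendored file: a minimal sketch of
// why `push_to_string`/`contains_char_data` exist. Text content can be
// accumulated from a token stream without allocating a `String` per token.
// `collect_text` and the sample token list below are hypothetical names.
#[allow(dead_code)]
fn collect_text(tokens: &[Token]) -> String {
    let mut text = String::new();
    for t in tokens {
        if t.contains_char_data() {
            t.push_to_string(&mut text); // appends the lexeme's text form in place
        }
    }
    text
}
// collect_text(&[Token::Character('a'), Token::Whitespace(' '), Token::Character('b')])
// yields "a b".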

enum State {
    /// Triggered on '<'
    TagStarted,
    /// Triggered on '<!'
    CommentOrCDataOrDoctypeStarted,
    /// Triggered on '<!-'
    CommentStarted,
    /// Triggered on '<!D' up to '<!DOCTYPE'
    DoctypeStarted(DoctypeStartedSubstate),
    /// Triggered after DoctypeStarted to handle sub elements
    DoctypeFinishing(u8),
    /// Triggered on '<![' up to '<![CDATA'
    CDataStarted(CDataStartedSubstate),
    /// Triggered on '?'
    ProcessingInstructionClosing,
    /// Triggered on '/'
    EmptyTagClosing,
    /// Triggered on '-' up to '--'
    CommentClosing(ClosingSubstate),
    /// Triggered on ']' up to ']]'
    CDataClosing(ClosingSubstate),
    /// Default state
    Normal
}

#[derive(Copy, Clone)]
enum ClosingSubstate {
    First, Second
}

#[derive(Copy, Clone)]
enum DoctypeStartedSubstate {
    D, DO, DOC, DOCT, DOCTY, DOCTYP
}

#[derive(Copy, Clone)]
enum CDataStartedSubstate {
    E, C, CD, CDA, CDAT, CDATA
}

/// `Result` represents a lexing result. It is either a token or an error message.
pub type Result = result::Result<Option<Token>, Error>;

/// Helps to set up a dispatch table for lexing large unambiguous tokens like
/// `<![CDATA[` or `<!DOCTYPE `.
macro_rules! dispatch_on_enum_state(
    ($_self:ident, $s:expr, $c:expr, $is:expr,
     $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+;
     $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => (
        match $s {
            $(
                $st => match $c {
                    $stc => $_self.move_to($is($next_st)),
                    _ => $_self.handle_error($chunk, $c)
                },
            )+
            $end_st => match $c {
                $end_c => $e,
                _ => $_self.handle_error($end_chunk, $c)
            }
        }
    )
);
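
// Editor's illustration, not part of the vendored file: a hand-written sketch
// of (approximately) what `dispatch_on_enum_state!` expands to for the first
// and last rows of the CDATA table used by `cdata_started` further below.
//
//     match s {
//         E => match c {
//             'C' => self.move_to(State::CDataStarted(C)),
//             _   => self.handle_error("<![", c),
//         },
//         /* ... analogous arms for C, CD, CDA, CDAT ... */
//         CDATA => match c {
//             '[' => self.move_to_with(State::Normal, Token::CDataStart),
//             _   => self.handle_error("<![CDATA", c),
//         },
//     }
//
// Each matched character advances to the next substate; any other character
// hands the prefix read so far to `handle_error` for recovery or reporting.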

/// `Lexer` is a lexer for XML documents, which implements a pull API.
///
/// The main method is `next_token`, which accepts an `std::io::Read` instance
/// and tries to read the next lexeme from it.
///
/// When the `skip_errors` flag is set, invalid lexemes are returned as `Chunk`s.
/// When it is not set, errors are reported as `Err` objects with a string message.
/// By default this flag is not set. Use the `enable_errors` and `disable_errors`
/// methods to toggle the behavior.
pub struct Lexer {
    pos: TextPosition,
    head_pos: TextPosition,
    char_queue: VecDeque<char>,
    st: State,
    skip_errors: bool,
    inside_comment: bool,
    inside_token: bool,
    eof_handled: bool
}

impl Position for Lexer {
    #[inline]
    /// Returns the position of the last token produced by the lexer.
    fn position(&self) -> TextPosition { self.pos }
}
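
// Editor's illustration, not part of the vendored file: the pull API in use.
// `print_lexemes` is a hypothetical helper that drains tokens one at a time
// until `Ok(None)` signals the end of the stream.
#[allow(dead_code)]
fn print_lexemes(xml: &str) {
    let mut lexer = Lexer::new();
    let mut reader = xml.as_bytes(); // &[u8] implements std::io::Read
    loop {
        match lexer.next_token(&mut reader) {
            Ok(Some(token)) => println!("{:?}", token),
            Ok(None) => break, // end of stream
            Err(e) => { println!("lexer error: {}", e); break; }
        }
    }
}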

impl Lexer {
    /// Returns a new lexer with default state.
    pub fn new() -> Lexer {
        Lexer {
            pos: TextPosition::new(),
            head_pos: TextPosition::new(),
            char_queue: VecDeque::with_capacity(4), // TODO: check size
            st: State::Normal,
            skip_errors: false,
            inside_comment: false,
            inside_token: false,
            eof_handled: false
        }
    }

    /// Enables error handling so `next_token` will return `Some(Err(..))`
    /// upon invalid lexeme.
    #[inline]
    pub fn enable_errors(&mut self) { self.skip_errors = false; }

    /// Disables error handling so `next_token` will return `Some(Chunk(..))`
    /// upon invalid lexeme with this lexeme content.
    #[inline]
    pub fn disable_errors(&mut self) { self.skip_errors = true; }

    /// Enables special handling of some lexemes which should be done when we're parsing comment
    /// internals.
    #[inline]
    pub fn inside_comment(&mut self) { self.inside_comment = true; }

    /// Disables the effect of the `inside_comment()` method.
    #[inline]
    pub fn outside_comment(&mut self) { self.inside_comment = false; }

    /// Resets the "EOF handled" flag of the lexer.
    #[inline]
    pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }

    /// Tries to read the next token from the buffer.
    ///
    /// It is possible to pass different instances of `BufReader` each time
    /// this method is called, but the resulting behavior is undefined in this case.
    ///
    /// Return value:
    /// * `Err(reason) where reason: reader::Error` - when an error occurs;
    /// * `Ok(None)` - when the end of stream is reached;
    /// * `Ok(Some(token)) where token: Token` - when a complete token has been read from the stream.
    pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result {
        // Already reached end of buffer
        if self.eof_handled {
            return Ok(None);
        }

        if !self.inside_token {
            self.pos = self.head_pos;
            self.inside_token = true;
        }

        // Check if we have saved a char or two for ourselves
        while let Some(c) = self.char_queue.pop_front() {
            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {} // continue
            }
        }

        loop {
            // TODO: this should handle multiple encodings
            let c = match try!(util::next_char_from(b)) {
                Some(c) => c,  // got next char
                None => break, // nothing left to read
            };

            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {
                    // continue
                }
            }
        }

        // Handle end of stream
        self.eof_handled = true;
        self.pos = self.head_pos;
        match self.st {
            State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
            State::CommentStarted | State::CDataStarted(_) | State::DoctypeStarted(_) |
            State::CommentClosing(ClosingSubstate::Second) |
            State::DoctypeFinishing(_) =>
                Err(self.error("Unexpected end of stream")),
            State::ProcessingInstructionClosing =>
                Ok(Some(Token::Character('?'))),
            State::EmptyTagClosing =>
                Ok(Some(Token::Character('/'))),
            State::CommentClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character('-'))),
            State::CDataClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character(']'))),
            State::CDataClosing(ClosingSubstate::Second) =>
                Ok(Some(Token::Chunk("]]"))),
            State::Normal =>
                Ok(None)
        }
    }

    #[inline]
    fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error {
        (self, msg).into()
    }

    #[inline]
    fn read_next_token(&mut self, c: char) -> Result {
        let res = self.dispatch_char(c);
        if self.char_queue.is_empty() {
            if c == '\n' {
                self.head_pos.new_line();
            } else {
                self.head_pos.advance(1);
            }
        }
        res
    }

    fn dispatch_char(&mut self, c: char) -> Result {
        match self.st {
            State::Normal => self.normal(c),
            State::TagStarted => self.tag_opened(c),
            State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
            State::CommentStarted => self.comment_started(c),
            State::CDataStarted(s) => self.cdata_started(c, s),
            State::DoctypeStarted(s) => self.doctype_started(c, s),
            State::DoctypeFinishing(d) => self.doctype_finishing(c, d),
            State::ProcessingInstructionClosing => self.processing_instruction_closing(c),
            State::EmptyTagClosing => self.empty_element_closing(c),
            State::CommentClosing(s) => self.comment_closing(c, s),
            State::CDataClosing(s) => self.cdata_closing(c, s)
        }
    }

    #[inline]
    fn move_to(&mut self, st: State) -> Result {
        self.st = st;
        Ok(None)
    }

    #[inline]
    fn move_to_with(&mut self, st: State, token: Token) -> Result {
        self.st = st;
        Ok(Some(token))
    }

    #[inline]
    fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result {
        self.char_queue.extend(cs.iter().cloned());
        self.move_to_with(st, token)
    }

    fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
        self.char_queue.push_back(c);
        if self.skip_errors || (self.inside_comment && chunk != "--") { // FIXME: looks hacky
            self.move_to_with(State::Normal, Token::Chunk(chunk))
        } else {
            Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c)))
        }
    }

    /// Encountered a char in the default state
    fn normal(&mut self, c: char) -> Result {
        match c {
            '<' => self.move_to(State::TagStarted),
            '>' => Ok(Some(Token::TagEnd)),
            '/' => self.move_to(State::EmptyTagClosing),
            '=' => Ok(Some(Token::EqualsSign)),
            '"' => Ok(Some(Token::DoubleQuote)),
            '\'' => Ok(Some(Token::SingleQuote)),
            '?' => self.move_to(State::ProcessingInstructionClosing),
            '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)),
            ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)),
            '&' => Ok(Some(Token::ReferenceStart)),
            ';' => Ok(Some(Token::ReferenceEnd)),
            _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))),
            _ => Ok(Some(Token::Character(c)))
        }
    }

    /// Encountered '<'
    fn tag_opened(&mut self, c: char) -> Result {
        match c {
            '?' => self.move_to_with(State::Normal, Token::ProcessingInstructionStart),
            '/' => self.move_to_with(State::Normal, Token::ClosingTagStart),
            '!' => self.move_to(State::CommentOrCDataOrDoctypeStarted),
            _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _ if is_name_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _ => self.handle_error("<", c)
        }
    }

    /// Encountered '<!'
    fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to(State::CommentStarted),
            '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)),
            'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)),
            _ => self.handle_error("<!", c)
        }
    }

    /// Encountered '<!-'
    fn comment_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to_with(State::Normal, Token::CommentStart),
            _ => self.handle_error("<!-", c)
        }
    }

    /// Encountered '<!['
    fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
        use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA};
        dispatch_on_enum_state!(self, s, c, State::CDataStarted,
            E     ; 'C' ; C     ; "<![",
            C     ; 'D' ; CD    ; "<![C",
            CD    ; 'A' ; CDA   ; "<![CD",
            CDA   ; 'T' ; CDAT  ; "<![CDA",
            CDAT  ; 'A' ; CDATA ; "<![CDAT";
            CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::Normal, Token::CDataStart)
        )
    }

    /// Encountered '<!D'
    fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
        use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
        dispatch_on_enum_state!(self, s, c, State::DoctypeStarted,
            D      ; 'O' ; DO     ; "<!D",
            DO     ; 'C' ; DOC    ; "<!DO",
            DOC    ; 'T' ; DOCT   ; "<!DOC",
            DOCT   ; 'Y' ; DOCTY  ; "<!DOCT",
            DOCTY  ; 'P' ; DOCTYP ; "<!DOCTY";
            DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::DoctypeFinishing(1), Token::DoctypeStart)
        )
    }

    /// State used while awaiting the closing bracket for the <!DOCTYPE tag
    fn doctype_finishing(&mut self, c: char, d: u8) -> Result {
        match c {
            '<' => self.move_to(State::DoctypeFinishing(d + 1)),
            '>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd),
            '>' => self.move_to(State::DoctypeFinishing(d - 1)),
            _ => Ok(None),
        }
    }

    /// Encountered '?'
    fn processing_instruction_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd),
            _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')),
        }
    }

    /// Encountered '/'
    fn empty_element_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd),
            _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')),
        }
    }

    /// Encountered '-'
    fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)),
                _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('-'))
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(State::Normal, Token::CommentEnd),
                // A double dash not followed by a greater-than sign is a hard error inside comments
                _ if self.inside_comment => self.handle_error("--", c),
                // Nothing else except comment closing starts with a double dash, and comment
                // closing can never be after another dash, and also we're outside of a comment,
                // therefore it is safe to push only the last read character to the list of unread
                // characters and pass the double dash directly to the output
                _ => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--"))
            }
        }
    }

    /// Encountered ']'
    fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
                _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']'))
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(State::Normal, Token::CDataEnd),
                _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']'))
            }
        }
    }
}
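
// Editor's illustration, not part of the vendored file: error recovery in
// action. With errors disabled, the invalid prefix "<!x" comes back as a
// Chunk("<!") followed by the offending character instead of an Err, exactly
// as `handle_error` above describes. `recover_example` is a hypothetical name.
#[allow(dead_code)]
fn recover_example() {
    let mut lexer = Lexer::new();
    lexer.disable_errors();
    let mut input = "<!x".as_bytes();
    assert_eq!(lexer.next_token(&mut input), Ok(Some(Token::Chunk("<!"))));
    assert_eq!(lexer.next_token(&mut input), Ok(Some(Token::Character('x'))));
}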

#[cfg(test)]
mod tests {
    use common::Position;
    use std::io::{BufReader, Cursor};

    use super::{Lexer, Token};

    macro_rules! assert_oks(
        (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
            $(
                assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf));
            )+
        })
    );

    macro_rules! assert_err(
        (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
            let err = $lex.next_token(&mut $buf);
            assert!(err.is_err());
            let err = err.unwrap_err();
            assert_eq!($r as u64, err.position().row);
            assert_eq!($c as u64, err.position().column);
            assert_eq!($s, err.msg());
        })
    );

    macro_rules! assert_none(
        (for $lex:ident and $buf:ident) => (
            assert_eq!(Ok(None), $lex.next_token(&mut $buf));
        )
    );

    fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
        (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
    }

    #[test]
    fn simple_lexer_test() {
        // Note: the input contains a literal tab between 'd' and '</b>',
        // and ends with the entity reference '&nbsp;'.
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a p='q'> x<b z="y">d	</b></a><p/> <?nm ?> <!-- a c --> &nbsp;"#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::Whitespace(' ')
            Token::Character('p')
            Token::EqualsSign
            Token::SingleQuote
            Token::Character('q')
            Token::SingleQuote
            Token::TagEnd
            Token::Whitespace(' ')
            Token::Character('x')
            Token::OpeningTagStart
            Token::Character('b')
            Token::Whitespace(' ')
            Token::Character('z')
            Token::EqualsSign
            Token::DoubleQuote
            Token::Character('y')
            Token::DoubleQuote
            Token::TagEnd
            Token::Character('d')
            Token::Whitespace('\t')
            Token::ClosingTagStart
            Token::Character('b')
            Token::TagEnd
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
            Token::OpeningTagStart
            Token::Character('p')
            Token::EmptyTagEnd
            Token::Whitespace(' ')
            Token::ProcessingInstructionStart
            Token::Character('n')
            Token::Character('m')
            Token::Whitespace(' ')
            Token::ProcessingInstructionEnd
            Token::Whitespace(' ')
            Token::CommentStart
            Token::Whitespace(' ')
            Token::Character('a')
            Token::Whitespace(' ')
            Token::Character('c')
            Token::Whitespace(' ')
            Token::CommentEnd
            Token::Whitespace(' ')
            Token::ReferenceStart
            Token::Character('n')
            Token::Character('b')
            Token::Character('s')
            Token::Character('p')
            Token::ReferenceEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn special_chars_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"?x!+ // -| ]z]]"#
        );

        assert_oks!(for lex and buf ;
            Token::Character('?')
            Token::Character('x')
            Token::Character('!')
            Token::Character('+')
            Token::Whitespace(' ')
            Token::Character('/')
            Token::Character('/')
            Token::Whitespace(' ')
            Token::Character('-')
            Token::Character('|')
            Token::Whitespace(' ')
            Token::Character(']')
            Token::Character('z')
            Token::Chunk("]]")
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn cdata_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><![CDATA[x y ?]]> </a>"#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CDataStart
            Token::Character('x')
            Token::Whitespace(' ')
            Token::Character('y')
            Token::Whitespace(' ')
            Token::Character('?')
            Token::CDataEnd
            Token::Whitespace(' ')
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }
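
    // Editor's illustrative test, not part of the vendored file: the lexer
    // emits CDataEnd for "]]>" even outside a CDATA section; rejecting it in
    // character data is left to the parser layer.
    #[test]
    fn cdata_end_outside_cdata_illustration() {
        let (mut lex, mut buf) = make_lex_and_buf("x]]>y");
        assert_oks!(for lex and buf ;
            Token::Character('x')
            Token::CDataEnd
            Token::Character('y')
        );
        assert_none!(for lex and buf);
    }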

    #[test]
    fn doctype_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab xx z> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::TagEnd
            Token::Whitespace(' ')
        );
        assert_none!(for lex and buf)
    }

    #[test]
    fn doctype_with_internal_subset_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab[<!ELEMENT ba> ]> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::TagEnd
            Token::Whitespace(' ')
        );
        assert_none!(for lex and buf)
    }

    #[test]
    fn end_of_stream_handling_ok() {
        macro_rules! eof_check(
            ($data:expr ; $token:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_oks!(for lex and buf ; $token);
                assert_none!(for lex and buf);
            })
        );
        eof_check!("?" ; Token::Character('?'));
        eof_check!("/" ; Token::Character('/'));
        eof_check!("-" ; Token::Character('-'));
        eof_check!("]" ; Token::Character(']'));
        eof_check!("]]" ; Token::Chunk("]]"));
    }
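
    // Editor's illustrative test, not part of the vendored file:
    // `doctype_finishing` counts '<'/'>' nesting, so markup declarations in
    // the internal subset do not terminate the DOCTYPE early.
    #[test]
    fn doctype_nested_internal_subset_illustration() {
        let (mut lex, mut buf) = make_lex_and_buf("<!DOCTYPE a [<!ELEMENT b (#PCDATA)>]>");
        assert_oks!(for lex and buf ;
            Token::DoctypeStart
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }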

    #[test]
    fn end_of_stream_handling_error() {
        macro_rules! eof_check(
            ($data:expr; $r:expr, $c:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream");
                assert_none!(for lex and buf);
            })
        );
        eof_check!("<" ; 0, 1);
        eof_check!("<!" ; 0, 2);
        eof_check!("<!-" ; 0, 3);
        eof_check!("<![" ; 0, 3);
        eof_check!("<![C" ; 0, 4);
        eof_check!("<![CD" ; 0, 5);
        eof_check!("<![CDA" ; 0, 6);
        eof_check!("<![CDAT" ; 0, 7);
        eof_check!("<![CDATA" ; 0, 8);
        eof_check!("--" ; 0, 2);
    }

    #[test]
    fn error_in_comment_or_cdata_prefix() {
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!' before 'x'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Chunk("<!")
            Token::Character('x')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn error_in_comment_started() {
        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!-' before '\t'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Chunk("<!-")
            Token::Whitespace('\t')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn error_in_comment_two_dashes_not_at_end() {
        let (mut lex, mut buf) = make_lex_and_buf("--x");
        lex.inside_comment();
        assert_err!(for lex and buf expect row 0; 0,
            "Unexpected token '--' before 'x'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("--x");
        assert_oks!(for lex and buf ;
            Token::Chunk("--")
            Token::Character('x')
        );
    }
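
    // Editor's illustrative test, not part of the vendored file: outside
    // comments a stray double dash surrounded by other characters is passed
    // through as Chunk("--") rather than raised as an error, per the
    // comment in `comment_closing` above.
    #[test]
    fn double_dash_outside_comment_illustration() {
        let (mut lex, mut buf) = make_lex_and_buf("a--b");
        assert_oks!(for lex and buf ;
            Token::Character('a')
            Token::Chunk("--")
            Token::Character('b')
        );
        assert_none!(for lex and buf);
    }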

    macro_rules! check_case(
        ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({
            let (mut lex, mut buf) = make_lex_and_buf($data);
            assert_err!(for lex and buf expect row $r ; $c, $s);

            let (mut lex, mut buf) = make_lex_and_buf($data);
            lex.disable_errors();
            assert_oks!(for lex and buf ;
                Token::Chunk($chunk)
                Token::Character($app)
            );
            assert_none!(for lex and buf);
        })
    );

    #[test]
    fn error_in_cdata_started() {
        check_case!("<![",      '['; "<![["      ; 0, 0, "Unexpected token '<![' before '['");
        check_case!("<![C",     '['; "<![C["     ; 0, 0, "Unexpected token '<![C' before '['");
        check_case!("<![CD",    '['; "<![CD["    ; 0, 0, "Unexpected token '<![CD' before '['");
        check_case!("<![CDA",   '['; "<![CDA["   ; 0, 0, "Unexpected token '<![CDA' before '['");
        check_case!("<![CDAT",  '['; "<![CDAT["  ; 0, 0, "Unexpected token '<![CDAT' before '['");
        check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'");
    }

    #[test]
    fn error_in_doctype_started() {
        check_case!("<!D",      'a'; "<!Da"      ; 0, 0, "Unexpected token '<!D' before 'a'");
        check_case!("<!DO",     'b'; "<!DOb"     ; 0, 0, "Unexpected token '<!DO' before 'b'");
        check_case!("<!DOC",    'c'; "<!DOCc"    ; 0, 0, "Unexpected token '<!DOC' before 'c'");
        check_case!("<!DOCT",   'd'; "<!DOCTd"   ; 0, 0, "Unexpected token '<!DOCT' before 'd'");
        check_case!("<!DOCTY",  'e'; "<!DOCTYe"  ; 0, 0, "Unexpected token '<!DOCTY' before 'e'");
        check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'");
    }

    #[test]
    fn issue_98_cdata_ending_with_right_bracket() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<![CDATA[Foo [Bar]]]>"#
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character('F')
            Token::Character('o')
            Token::Character('o')
            Token::Whitespace(' ')
            Token::Character('[')
            Token::Character('B')
            Token::Character('a')
            Token::Character('r')
            Token::Character(']')
            Token::CDataEnd
        );
        assert_none!(for lex and buf);
    }
}
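
A closing observation on the design: because every input character ends up in exactly one token, and every token's text form (via `as_static_str` or the pushed character) is exactly the characters it consumed, concatenating the string forms of all tokens reproduces the original input for documents that lex without errors. A minimal sketch of that round-trip property, assuming the module above is in scope (`roundtrip` is a hypothetical helper, not part of the vendored file):

    // Editor's illustration: lex a document and re-serialize its tokens.
    #[allow(dead_code)]
    fn roundtrip(xml: &str) -> String {
        let mut lexer = Lexer::new();
        let mut reader = xml.as_bytes();
        let mut out = String::new();
        while let Ok(Some(token)) = lexer.next_token(&mut reader) {
            token.push_to_string(&mut out); // append the lexeme's exact text form
        }
        out
    }

    // roundtrip(r#"<a p='q'>x</a>"#) == "<a p='q'>x</a>"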