diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch) | |
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/rust/xml-rs/src/reader | |
parent | Initial commit. (diff) | |
download | firefox-esr-upstream.tar.xz firefox-esr-upstream.zip |
Adding upstream version 115.7.0esr.upstream/115.7.0esrupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/xml-rs/src/reader')
15 files changed, 2827 insertions, 0 deletions
diff --git a/third_party/rust/xml-rs/src/reader/config.rs b/third_party/rust/xml-rs/src/reader/config.rs new file mode 100644 index 0000000000..0abb165cf4 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/config.rs @@ -0,0 +1,181 @@ +//! Contains parser configuration structure. +use std::io::Read; +use std::collections::HashMap; + +use reader::EventReader; + +/// Parser configuration structure. +/// +/// This structure contains various configuration options which affect +/// behavior of the parser. +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct ParserConfig { + /// Whether or not should whitespace in textual events be removed. Default is false. + /// + /// When true, all standalone whitespace will be removed (this means no + /// `Whitespace` events will be emitted), and leading and trailing whitespace + /// from `Character` events will be deleted. If after trimming `Characters` + /// event will be empty, it will also be omitted from output stream. This is + /// possible, however, only if `whitespace_to_characters` or + /// `cdata_to_characters` options are set. + /// + /// This option does not affect CDATA events, unless `cdata_to_characters` + /// option is also set. In that case CDATA content will also be trimmed. + pub trim_whitespace: bool, + + /// Whether or not should whitespace be converted to characters. + /// Default is false. + /// + /// If true, instead of `Whitespace` events `Characters` events with the + /// same content will be emitted. If `trim_whitespace` is also true, these + /// events will be trimmed to nothing and, consequently, not emitted. + pub whitespace_to_characters: bool, + + /// Whether or not should CDATA be converted to characters. + /// Default is false. + /// + /// If true, instead of `CData` events `Characters` events with the same + /// content will be emitted. If `trim_whitespace` is also true, these events + /// will be trimmed. If corresponding CDATA contained nothing but whitespace, + /// this event will be omitted from the stream. + pub cdata_to_characters: bool, + + /// Whether or not should comments be omitted. Default is true. + /// + /// If true, `Comment` events will not be emitted at all. + pub ignore_comments: bool, + + /// Whether or not should sequential `Characters` events be merged. + /// Default is true. + /// + /// If true, multiple sequential `Characters` events will be merged into + /// a single event, that is, their data will be concatenated. + /// + /// Multiple sequential `Characters` events are only possible if either + /// `cdata_to_characters` or `ignore_comments` are set. Otherwise character + /// events will always be separated by other events. + pub coalesce_characters: bool, + + /// A map of extra entities recognized by the parser. Default is an empty map. + /// + /// By default the XML parser recognizes the entities defined in the XML spec. Sometimes, + /// however, it is convenient to make the parser recognize additional entities which + /// are also not available through the DTD definitions (especially given that at the moment + /// DTD parsing is not supported). + pub extra_entities: HashMap<String, String>, + + /// Whether or not the parser should ignore the end of stream. Default is false. + /// + /// By default the parser will either error out when it encounters a premature end of + /// stream or complete normally if the end of stream was expected. If you want to continue + /// reading from a stream whose input is supplied progressively, you can set this option to true. + /// In this case the parser will allow you to invoke the next() method even if a supposed end + /// of stream has happened. + /// + /// Note that support for this functionality is incomplete; for example, the parser will fail if + /// the premature end of stream happens inside PCDATA. Therefore, use this option at your own risk. + pub ignore_end_of_stream: bool, + + /// Whether or not non-unicode entity references get replaced with the replacement character + /// + /// When true, any decimal or hexadecimal character reference that cannot be converted from a + /// u32 to a char using [std::char::from_u32](https://doc.rust-lang.org/std/char/fn.from_u32.html) + /// will be converted into the unicode REPLACEMENT CHARACTER (U+FFFD). + pub replace_unknown_entity_references: bool, + + /// Whether or not whitespace at the root level of the document is ignored. Default is true. + /// + /// By default any whitespace that is not enclosed within at least one level of elements will be + /// ignored. Setting this value to false will cause root level whitespace events to be emitted. + pub ignore_root_level_whitespace: bool, +} + +impl ParserConfig { + /// Returns a new config with default values. + /// + /// You can tweak default values using builder-like pattern: + /// + /// ```rust + /// use xml::reader::ParserConfig; + /// + /// let config = ParserConfig::new() + /// .trim_whitespace(true) + /// .ignore_comments(true) + /// .coalesce_characters(false); + /// ``` + pub fn new() -> ParserConfig { + ParserConfig { + trim_whitespace: false, + whitespace_to_characters: false, + cdata_to_characters: false, + ignore_comments: true, + coalesce_characters: true, + extra_entities: HashMap::new(), + ignore_end_of_stream: false, + replace_unknown_entity_references: false, + ignore_root_level_whitespace: true, + } + } + + /// Creates an XML reader with this configuration. + /// + /// This is a convenience method for configuring and creating a reader at the same time: + /// + /// ```rust + /// use xml::reader::ParserConfig; + /// + /// let mut source: &[u8] = b"..."; + /// + /// let reader = ParserConfig::new() + /// .trim_whitespace(true) + /// .ignore_comments(true) + /// .coalesce_characters(false) + /// .create_reader(&mut source); + /// ``` + /// + /// This method is exactly equivalent to calling `EventReader::new_with_config()` with + /// this configuration object. + #[inline] + pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> { + EventReader::new_with_config(source, self) + } + + /// Adds a new entity mapping and returns an updated config object. + /// + /// This is a convenience method for adding external entities mappings to the XML parser. + /// An example: + /// + /// ```rust + /// use xml::reader::ParserConfig; + /// + /// let mut source: &[u8] = b"..."; + /// + /// let reader = ParserConfig::new() + /// .add_entity("nbsp", " ") + /// .add_entity("copy", "©") + /// .add_entity("reg", "®") + /// .create_reader(&mut source); + /// ``` + pub fn add_entity<S: Into<String>, T: Into<String>>(mut self, entity: S, value: T) -> ParserConfig { + self.extra_entities.insert(entity.into(), value.into()); + self + } +} + +impl Default for ParserConfig { + #[inline] + fn default() -> ParserConfig { + ParserConfig::new() + } +} + +gen_setters! { ParserConfig, + trim_whitespace: val bool, + whitespace_to_characters: val bool, + cdata_to_characters: val bool, + ignore_comments: val bool, + coalesce_characters: val bool, + ignore_end_of_stream: val bool, + replace_unknown_entity_references: val bool, + ignore_root_level_whitespace: val bool +} diff --git a/third_party/rust/xml-rs/src/reader/error.rs b/third_party/rust/xml-rs/src/reader/error.rs new file mode 100644 index 0000000000..92378e6373 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/error.rs @@ -0,0 +1,121 @@ + +use std::io; +use std::borrow::Cow; +use std::fmt; +use std::error; +use std::str; + +use util; +use common::{Position, TextPosition}; + +#[derive(Debug)] +pub enum ErrorKind { + Syntax(Cow<'static, str>), + Io(io::Error), + Utf8(str::Utf8Error), + UnexpectedEof, +} + +/// An XML parsing error. +/// +/// Consists of a 2D position in a document and a textual message describing the error. +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct Error { + pos: TextPosition, + kind: ErrorKind, +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{} {}", self.pos, self.msg()) + } +} + +impl Position for Error { + #[inline] + fn position(&self) -> TextPosition { self.pos } +} + +impl Error { + /// Returns a reference to a message which is contained inside this error. + #[inline] + pub fn msg(&self) -> &str { + use self::ErrorKind::*; + match self.kind { + UnexpectedEof => &"Unexpected EOF", + Utf8(ref reason) => error_description(reason), + Io(ref io_error) => error_description(io_error), + Syntax(ref msg) => msg.as_ref(), + } + } + + pub fn kind(&self) -> &ErrorKind { &self.kind } +} + +impl error::Error for Error { + #[inline] + fn description(&self) -> &str { self.msg() } +} + +impl<'a, P, M> From<(&'a P, M)> for Error where P: Position, M: Into<Cow<'static, str>> { + fn from(orig: (&'a P, M)) -> Self { + Error{ + pos: orig.0.position(), + kind: ErrorKind::Syntax(orig.1.into()) + } + } +} + +impl From<util::CharReadError> for Error { + fn from(e: util::CharReadError) -> Self { + use util::CharReadError::*; + Error{ + pos: TextPosition::new(), + kind: match e { + UnexpectedEof => ErrorKind::UnexpectedEof, + Utf8(reason) => ErrorKind::Utf8(reason), + Io(io_error) => ErrorKind::Io(io_error), + } + } + } +} + +impl From<io::Error> for Error { + fn from(e: io::Error) -> Self { + Error { + pos: TextPosition::new(), + kind: ErrorKind::Io(e), + } + } +} + +impl Clone for ErrorKind { + fn clone(&self) -> Self { + use self::ErrorKind::*; + match *self { + UnexpectedEof => UnexpectedEof, + Utf8(ref reason) => Utf8(reason.clone()), + Io(ref io_error) => Io(io::Error::new(io_error.kind(), error_description(io_error))), + Syntax(ref msg) => Syntax(msg.clone()), + } + } +} +impl PartialEq for ErrorKind { + fn eq(&self, other: &ErrorKind) -> bool { + use self::ErrorKind::*; + match (self, other) { + (&UnexpectedEof, &UnexpectedEof) => true, + (&Utf8(ref left), &Utf8(ref right)) => left == right, + (&Io(ref left), &Io(ref right)) => + left.kind() == right.kind() && + error_description(left) == error_description(right), + (&Syntax(ref left), &Syntax(ref right)) => + left == right, + + (_, _) => false, + } + } +} +impl Eq for ErrorKind {} + +fn error_description(e: &error::Error) -> &str { e.description() } diff --git a/third_party/rust/xml-rs/src/reader/events.rs b/third_party/rust/xml-rs/src/reader/events.rs new file mode 100644 index 0000000000..46d7621a87 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/events.rs @@ -0,0 +1,219 @@ +//! Contains `XmlEvent` datatype, instances of which are emitted by the parser. + +use std::fmt; +use std::borrow::Cow; + +use name::OwnedName; +use attribute::OwnedAttribute; +use common::XmlVersion; +use namespace::Namespace; + +/// An element of an XML input stream. +/// +/// Items of this enum are emitted by `reader::EventReader`. They correspond to different +/// elements of an XML document. +#[derive(PartialEq, Clone)] +pub enum XmlEvent { + /// Corresponds to XML document declaration. + /// + /// This event is always emitted before any other event. It is emitted + /// even if the actual declaration is not present in the document. + StartDocument { + /// XML version. + /// + /// If XML declaration is not present, defaults to `Version10`. + version: XmlVersion, + + /// XML document encoding. + /// + /// If XML declaration is not present or does not contain `encoding` attribute, + /// defaults to `"UTF-8"`. This field is currently used for no other purpose than + /// informational. + encoding: String, + + /// XML standalone declaration. + /// + /// If XML document is not present or does not contain `standalone` attribute, + /// defaults to `None`. This field is currently used for no other purpose than + /// informational. + standalone: Option<bool> + }, + + /// Denotes to the end of the document stream. + /// + /// This event is always emitted after any other event (except `Error`). After it + /// is emitted for the first time, it will always be emitted on next event pull attempts. + EndDocument, + + /// Denotes an XML processing instruction. + /// + /// This event contains a processing instruction target (`name`) and opaque `data`. It + /// is up to the application to process them. + ProcessingInstruction { + /// Processing instruction target. + name: String, + + /// Processing instruction content. + data: Option<String> + }, + + /// Denotes a beginning of an XML element. + /// + /// This event is emitted after parsing opening tags or after parsing bodiless tags. In the + /// latter case `EndElement` event immediately follows. + StartElement { + /// Qualified name of the element. + name: OwnedName, + + /// A list of attributes associated with the element. + /// + /// Currently attributes are not checked for duplicates (TODO) + attributes: Vec<OwnedAttribute>, + + /// Contents of the namespace mapping at this point of the document. + namespace: Namespace, + }, + + /// Denotes an end of an XML element. + /// + /// This event is emitted after parsing closing tags or after parsing bodiless tags. In the + /// latter case it is emitted immediately after corresponding `StartElement` event. + EndElement { + /// Qualified name of the element. + name: OwnedName + }, + + /// Denotes CDATA content. + /// + /// This event contains unparsed data. No unescaping will be performed. + /// + /// It is possible to configure a parser to emit `Characters` event instead of `CData`. See + /// `pull::ParserConfiguration` structure for more information. + CData(String), + + /// Denotes a comment. + /// + /// It is possible to configure a parser to ignore comments, so this event will never be emitted. + /// See `pull::ParserConfiguration` structure for more information. + Comment(String), + + /// Denotes character data outside of tags. + /// + /// Contents of this event will always be unescaped, so no entities like `<` or `&` or `{` + /// will appear in it. + /// + /// It is possible to configure a parser to trim leading and trailing whitespace for this event. + /// See `pull::ParserConfiguration` structure for more information. + Characters(String), + + /// Denotes a chunk of whitespace outside of tags. + /// + /// It is possible to configure a parser to emit `Characters` event instead of `Whitespace`. + /// See `pull::ParserConfiguration` structure for more information. When combined with whitespace + /// trimming, it will eliminate standalone whitespace from the event stream completely. + Whitespace(String) +} + +impl fmt::Debug for XmlEvent { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + XmlEvent::StartDocument { ref version, ref encoding, ref standalone } => + write!(f, "StartDocument({}, {}, {:?})", version, *encoding, *standalone), + XmlEvent::EndDocument => + write!(f, "EndDocument"), + XmlEvent::ProcessingInstruction { ref name, ref data } => + write!(f, "ProcessingInstruction({}{})", *name, match *data { + Some(ref data) => format!(", {}", data), + None => String::new() + }), + XmlEvent::StartElement { ref name, ref attributes, namespace: Namespace(ref namespace) } => + write!(f, "StartElement({}, {:?}{})", name, namespace, if attributes.is_empty() { + String::new() + } else { + let attributes: Vec<String> = attributes.iter().map( + |a| format!("{} -> {}", a.name, a.value) + ).collect(); + format!(", [{}]", attributes.join(", ")) + }), + XmlEvent::EndElement { ref name } => + write!(f, "EndElement({})", name), + XmlEvent::Comment(ref data) => + write!(f, "Comment({})", data), + XmlEvent::CData(ref data) => + write!(f, "CData({})", data), + XmlEvent::Characters(ref data) => + write!(f, "Characters({})", data), + XmlEvent::Whitespace(ref data) => + write!(f, "Whitespace({})", data) + } + } +} + +impl XmlEvent { + /// Obtains a writer event from this reader event. + /// + /// This method is useful for streaming processing of XML documents where the output + /// is also an XML document. With this method it is possible to process some events + /// while passing other events through to the writer unchanged: + /// + /// ```rust + /// use std::str; + /// + /// use xml::{EventReader, EventWriter}; + /// use xml::reader::XmlEvent as ReaderEvent; + /// use xml::writer::XmlEvent as WriterEvent; + /// + /// let mut input: &[u8] = b"<hello>world</hello>"; + /// let mut output: Vec<u8> = Vec::new(); + /// + /// { + /// let mut reader = EventReader::new(&mut input); + /// let mut writer = EventWriter::new(&mut output); + /// + /// for e in reader { + /// match e.unwrap() { + /// ReaderEvent::Characters(s) => + /// writer.write(WriterEvent::characters(&s.to_uppercase())).unwrap(), + /// e => if let Some(e) = e.as_writer_event() { + /// writer.write(e).unwrap() + /// } + /// } + /// } + /// } + /// + /// assert_eq!( + /// str::from_utf8(&output).unwrap(), + /// r#"<?xml version="1.0" encoding="UTF-8"?><hello>WORLD</hello>"# + /// ); + /// ``` + /// + /// Note that this API may change or get additions in future to improve its ergonomics. + pub fn as_writer_event<'a>(&'a self) -> Option<::writer::events::XmlEvent<'a>> { + match *self { + XmlEvent::StartDocument { version, ref encoding, standalone } => + Some(::writer::events::XmlEvent::StartDocument { + version: version, + encoding: Some(encoding), + standalone: standalone + }), + XmlEvent::ProcessingInstruction { ref name, ref data } => + Some(::writer::events::XmlEvent::ProcessingInstruction { + name: name, + data: data.as_ref().map(|s| &s[..]) + }), + XmlEvent::StartElement { ref name, ref attributes, ref namespace } => + Some(::writer::events::XmlEvent::StartElement { + name: name.borrow(), + attributes: attributes.iter().map(|a| a.borrow()).collect(), + namespace: Cow::Borrowed(namespace) + }), + XmlEvent::EndElement { ref name } => + Some(::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }), + XmlEvent::Comment(ref data) => Some(::writer::events::XmlEvent::Comment(data)), + XmlEvent::CData(ref data) => Some(::writer::events::XmlEvent::CData(data)), + XmlEvent::Characters(ref data) => Some(::writer::events::XmlEvent::Characters(data)), + XmlEvent::Whitespace(ref data) => Some(::writer::events::XmlEvent::Characters(data)), + _ => None + } + } +} diff --git a/third_party/rust/xml-rs/src/reader/lexer.rs b/third_party/rust/xml-rs/src/reader/lexer.rs new file mode 100644 index 0000000000..c466db9210 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/lexer.rs @@ -0,0 +1,867 @@ +//! Contains simple lexer for XML documents. +//! +//! This module is for internal use. Use `xml::pull` module to do parsing. + +use std::fmt; +use std::collections::VecDeque; +use std::io::Read; +use std::result; +use std::borrow::Cow; + +use common::{Position, TextPosition, is_whitespace_char, is_name_char}; +use reader::Error; +use util; + +/// `Token` represents a single lexeme of an XML document. These lexemes +/// are used to perform actual parsing. +#[derive(Copy, Clone, PartialEq, Eq, Debug)] +pub enum Token { + /// `<?` + ProcessingInstructionStart, + /// `?>` + ProcessingInstructionEnd, + /// `<!DOCTYPE + DoctypeStart, + /// `<` + OpeningTagStart, + /// `</` + ClosingTagStart, + /// `>` + TagEnd, + /// `/>` + EmptyTagEnd, + /// `<!--` + CommentStart, + /// `-->` + CommentEnd, + /// A chunk of characters, used for errors recovery. + Chunk(&'static str), + /// Any non-special character except whitespace. + Character(char), + /// Whitespace character. + Whitespace(char), + /// `=` + EqualsSign, + /// `'` + SingleQuote, + /// `"` + DoubleQuote, + /// `<![CDATA[` + CDataStart, + /// `]]>` + CDataEnd, + /// `&` + ReferenceStart, + /// `;` + ReferenceEnd, +} + +impl fmt::Display for Token { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Token::Chunk(s) => write!(f, "{}", s), + Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c), + other => write!(f, "{}", match other { + Token::OpeningTagStart => "<", + Token::ProcessingInstructionStart => "<?", + Token::DoctypeStart => "<!DOCTYPE", + Token::ClosingTagStart => "</", + Token::CommentStart => "<!--", + Token::CDataStart => "<![CDATA[", + Token::TagEnd => ">", + Token::EmptyTagEnd => "/>", + Token::ProcessingInstructionEnd => "?>", + Token::CommentEnd => "-->", + Token::CDataEnd => "]]>", + Token::ReferenceStart => "&", + Token::ReferenceEnd => ";", + Token::EqualsSign => "=", + Token::SingleQuote => "'", + Token::DoubleQuote => "\"", + _ => unreachable!() + }) + } + } +} + +impl Token { + pub fn as_static_str(&self) -> Option<&'static str> { + match *self { + Token::OpeningTagStart => Some("<"), + Token::ProcessingInstructionStart => Some("<?"), + Token::DoctypeStart => Some("<!DOCTYPE"), + Token::ClosingTagStart => Some("</"), + Token::CommentStart => Some("<!--"), + Token::CDataStart => Some("<![CDATA["), + Token::TagEnd => Some(">"), + Token::EmptyTagEnd => Some("/>"), + Token::ProcessingInstructionEnd => Some("?>"), + Token::CommentEnd => Some("-->"), + Token::CDataEnd => Some("]]>"), + Token::ReferenceStart => Some("&"), + Token::ReferenceEnd => Some(";"), + Token::EqualsSign => Some("="), + Token::SingleQuote => Some("'"), + Token::DoubleQuote => Some("\""), + Token::Chunk(s) => Some(s), + _ => None + } + } + + // using String.push_str(token.to_string()) is simply way too slow + pub fn push_to_string(&self, target: &mut String) { + match self.as_static_str() { + Some(s) => { target.push_str(s); } + None => { + match *self { + Token::Character(c) | Token::Whitespace(c) => target.push(c), + _ => unreachable!() + } + } + } + } + + /// Returns `true` if this token contains data that can be interpreted + /// as a part of the text. Surprisingly, this also means '>' and '=' and '"' and "'" and '-->'. + #[inline] + pub fn contains_char_data(&self) -> bool { + match *self { + Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd | + Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote | Token::CDataEnd | + Token::ProcessingInstructionEnd | Token::EmptyTagEnd => true, + _ => false + } + } + + /// Returns `true` if this token corresponds to a white space character. + #[inline] + pub fn is_whitespace(&self) -> bool { + match *self { + Token::Whitespace(_) => true, + _ => false + } + } +} + +enum State { + /// Triggered on '<' + TagStarted, + /// Triggered on '<!' + CommentOrCDataOrDoctypeStarted, + /// Triggered on '<!-' + CommentStarted, + /// Triggered on '<!D' up to '<!DOCTYPE' + DoctypeStarted(DoctypeStartedSubstate), + /// Triggered after DoctypeStarted to handle sub elements + DoctypeFinishing(u8), + /// Triggered on '<![' up to '<![CDATA' + CDataStarted(CDataStartedSubstate), + /// Triggered on '?' + ProcessingInstructionClosing, + /// Triggered on '/' + EmptyTagClosing, + /// Triggered on '-' up to '--' + CommentClosing(ClosingSubstate), + /// Triggered on ']' up to ']]' + CDataClosing(ClosingSubstate), + /// Default state + Normal +} + +#[derive(Copy, Clone)] +enum ClosingSubstate { + First, Second +} + +#[derive(Copy, Clone)] +enum DoctypeStartedSubstate { + D, DO, DOC, DOCT, DOCTY, DOCTYP +} + +#[derive(Copy, Clone)] +enum CDataStartedSubstate { + E, C, CD, CDA, CDAT, CDATA +} + +/// `Result` represents lexing result. It is either a token or an error message. +pub type Result = result::Result<Option<Token>, Error>; + +/// Helps to set up a dispatch table for lexing large unambigous tokens like +/// `<![CDATA[` or `<!DOCTYPE `. +macro_rules! dispatch_on_enum_state( + ($_self:ident, $s:expr, $c:expr, $is:expr, + $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+; + $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => ( + match $s { + $( + $st => match $c { + $stc => $_self.move_to($is($next_st)), + _ => $_self.handle_error($chunk, $c) + }, + )+ + $end_st => match $c { + $end_c => $e, + _ => $_self.handle_error($end_chunk, $c) + } + } + ) +); + +/// `Lexer` is a lexer for XML documents, which implements pull API. +/// +/// Main method is `next_token` which accepts an `std::io::Read` instance and +/// tries to read the next lexeme from it. +/// +/// When `skip_errors` flag is set, invalid lexemes will be returned as `Chunk`s. +/// When it is not set, errors will be reported as `Err` objects with a string message. +/// By default this flag is not set. Use `enable_errors` and `disable_errors` methods +/// to toggle the behavior. +pub struct Lexer { + pos: TextPosition, + head_pos: TextPosition, + char_queue: VecDeque<char>, + st: State, + skip_errors: bool, + inside_comment: bool, + inside_token: bool, + eof_handled: bool +} + +impl Position for Lexer { + #[inline] + /// Returns the position of the last token produced by the lexer + fn position(&self) -> TextPosition { self.pos } +} + +impl Lexer { + /// Returns a new lexer with default state. + pub fn new() -> Lexer { + Lexer { + pos: TextPosition::new(), + head_pos: TextPosition::new(), + char_queue: VecDeque::with_capacity(4), // TODO: check size + st: State::Normal, + skip_errors: false, + inside_comment: false, + inside_token: false, + eof_handled: false + } + } + + /// Enables error handling so `next_token` will return `Some(Err(..))` + /// upon invalid lexeme. + #[inline] + pub fn enable_errors(&mut self) { self.skip_errors = false; } + + /// Disables error handling so `next_token` will return `Some(Chunk(..))` + /// upon invalid lexeme with this lexeme content. + #[inline] + pub fn disable_errors(&mut self) { self.skip_errors = true; } + + /// Enables special handling of some lexemes which should be done when we're parsing comment + /// internals. + #[inline] + pub fn inside_comment(&mut self) { self.inside_comment = true; } + + /// Disables the effect of `inside_comment()` method. + #[inline] + pub fn outside_comment(&mut self) { self.inside_comment = false; } + + /// Reset the eof handled flag of the lexer. + #[inline] + pub fn reset_eof_handled(&mut self) { self.eof_handled = false; } + + /// Tries to read the next token from the buffer. + /// + /// It is possible to pass different instaces of `BufReader` each time + /// this method is called, but the resulting behavior is undefined in this case. + /// + /// Return value: + /// * `Err(reason) where reason: reader::Error` - when an error occurs; + /// * `Ok(None)` - upon end of stream is reached; + /// * `Ok(Some(token)) where token: Token` - in case a complete-token has been read from the stream. + pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result { + // Already reached end of buffer + if self.eof_handled { + return Ok(None); + } + + if !self.inside_token { + self.pos = self.head_pos; + self.inside_token = true; + } + + // Check if we have saved a char or two for ourselves + while let Some(c) = self.char_queue.pop_front() { + match try!(self.read_next_token(c)) { + Some(t) => { + self.inside_token = false; + return Ok(Some(t)); + } + None => {} // continue + } + } + + loop { + // TODO: this should handle multiple encodings + let c = match try!(util::next_char_from(b)) { + Some(c) => c, // got next char + None => break, // nothing to read left + }; + + match try!(self.read_next_token(c)) { + Some(t) => { + self.inside_token = false; + return Ok(Some(t)); + } + None => { + // continue + } + } + } + + // Handle end of stream + self.eof_handled = true; + self.pos = self.head_pos; + match self.st { + State::TagStarted | State::CommentOrCDataOrDoctypeStarted | + State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) | + State::CommentClosing(ClosingSubstate::Second) | + State::DoctypeFinishing(_) => + Err(self.error("Unexpected end of stream")), + State::ProcessingInstructionClosing => + Ok(Some(Token::Character('?'))), + State::EmptyTagClosing => + Ok(Some(Token::Character('/'))), + State::CommentClosing(ClosingSubstate::First) => + Ok(Some(Token::Character('-'))), + State::CDataClosing(ClosingSubstate::First) => + Ok(Some(Token::Character(']'))), + State::CDataClosing(ClosingSubstate::Second) => + Ok(Some(Token::Chunk("]]"))), + State::Normal => + Ok(None) + } + } + + #[inline] + fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error { + (self, msg).into() + } + + #[inline] + fn read_next_token(&mut self, c: char) -> Result { + let res = self.dispatch_char(c); + if self.char_queue.is_empty() { + if c == '\n' { + self.head_pos.new_line(); + } else { + self.head_pos.advance(1); + } + } + res + } + + fn dispatch_char(&mut self, c: char) -> Result { + match self.st { + State::Normal => self.normal(c), + State::TagStarted => self.tag_opened(c), + State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c), + State::CommentStarted => self.comment_started(c), + State::CDataStarted(s) => self.cdata_started(c, s), + State::DoctypeStarted(s) => self.doctype_started(c, s), + State::DoctypeFinishing(d) => self.doctype_finishing(c, d), + State::ProcessingInstructionClosing => self.processing_instruction_closing(c), + State::EmptyTagClosing => self.empty_element_closing(c), + State::CommentClosing(s) => self.comment_closing(c, s), + State::CDataClosing(s) => self.cdata_closing(c, s) + } + } + + #[inline] + fn move_to(&mut self, st: State) -> Result { + self.st = st; + Ok(None) + } + + #[inline] + fn move_to_with(&mut self, st: State, token: Token) -> Result { + self.st = st; + Ok(Some(token)) + } + + #[inline] + fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result { + self.char_queue.extend(cs.iter().cloned()); + self.move_to_with(st, token) + } + + fn handle_error(&mut self, chunk: &'static str, c: char) -> Result { + self.char_queue.push_back(c); + if self.skip_errors || (self.inside_comment && chunk != "--") { // FIXME: looks hacky + self.move_to_with(State::Normal, Token::Chunk(chunk)) + } else { + Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c))) + } + } + + /// Encountered a char + fn normal(&mut self, c: char) -> Result { + match c { + '<' => self.move_to(State::TagStarted), + '>' => Ok(Some(Token::TagEnd)), + '/' => self.move_to(State::EmptyTagClosing), + '=' => Ok(Some(Token::EqualsSign)), + '"' => Ok(Some(Token::DoubleQuote)), + '\'' => Ok(Some(Token::SingleQuote)), + '?' => self.move_to(State::ProcessingInstructionClosing), + '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)), + ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)), + '&' => Ok(Some(Token::ReferenceStart)), + ';' => Ok(Some(Token::ReferenceEnd)), + _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))), + _ => Ok(Some(Token::Character(c))) + } + } + + /// Encountered '<' + fn tag_opened(&mut self, c: char) -> Result { + match c { + '?' => self.move_to_with(State::Normal, Token::ProcessingInstructionStart), + '/' => self.move_to_with(State::Normal, Token::ClosingTagStart), + '!' => self.move_to(State::CommentOrCDataOrDoctypeStarted), + _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart), + _ if is_name_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart), + _ => self.handle_error("<", c) + } + } + + /// Encountered '<!' + fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result { + match c { + '-' => self.move_to(State::CommentStarted), + '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)), + 'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)), + _ => self.handle_error("<!", c) + } + } + + /// Encountered '<!-' + fn comment_started(&mut self, c: char) -> Result { + match c { + '-' => self.move_to_with(State::Normal, Token::CommentStart), + _ => self.handle_error("<!-", c) + } + } + + /// Encountered '<![' + fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result { + use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA}; + dispatch_on_enum_state!(self, s, c, State::CDataStarted, + E ; 'C' ; C ; "<![", + C ; 'D' ; CD ; "<![C", + CD ; 'A' ; CDA ; "<![CD", + CDA ; 'T' ; CDAT ; "<![CDA", + CDAT ; 'A' ; CDATA ; "<![CDAT"; + CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::Normal, Token::CDataStart) + ) + } + + /// Encountered '<!D' + fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result { + use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP}; + dispatch_on_enum_state!(self, s, c, State::DoctypeStarted, + D ; 'O' ; DO ; "<!D", + DO ; 'C' ; DOC ; "<!DO", + DOC ; 'T' ; DOCT ; "<!DOC", + DOCT ; 'Y' ; DOCTY ; "<!DOCT", + DOCTY ; 'P' ; DOCTYP ; "<!DOCTY"; + DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::DoctypeFinishing(1), Token::DoctypeStart) + ) + } + + /// State used while awaiting the closing bracket for the <!DOCTYPE tag + fn doctype_finishing(&mut self, c: char, d: u8) -> Result { + match c { + '<' => self.move_to(State::DoctypeFinishing(d + 1)), + '>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd), + '>' => self.move_to(State::DoctypeFinishing(d - 1)), + _ => Ok(None), + } + } + + /// Encountered '?' + fn processing_instruction_closing(&mut self, c: char) -> Result { + match c { + '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')), + } + } + + /// Encountered '/' + fn empty_element_closing(&mut self, c: char) -> Result { + match c { + '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')), + } + } + + /// Encountered '-' + fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result { + match s { + ClosingSubstate::First => match c { + '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('-')) + }, + ClosingSubstate::Second => match c { + '>' => self.move_to_with(State::Normal, Token::CommentEnd), + // double dash not followed by a greater-than is a hard error inside comment + _ if self.inside_comment => self.handle_error("--", c), + // nothing else except comment closing starts with a double dash, and comment + // closing can never be after another dash, and also we're outside of a comment, + // therefore it is safe to push only the last read character to the list of unread + // characters and pass the double dash directly to the output + _ => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--")) + } + } + } + + /// Encountered ']' + fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result { + match s { + ClosingSubstate::First => match c { + ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']')) + }, + ClosingSubstate::Second => match c { + '>' => self.move_to_with(State::Normal, Token::CDataEnd), + _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']')) + } + } + } +} + +#[cfg(test)] +mod tests { + use common::{Position}; + use std::io::{BufReader, Cursor}; + + use super::{Lexer, Token}; + + macro_rules! assert_oks( + (for $lex:ident and $buf:ident ; $($e:expr)+) => ({ + $( + assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf)); + )+ + }) + ); + + macro_rules! assert_err( + (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({ + let err = $lex.next_token(&mut $buf); + assert!(err.is_err()); + let err = err.unwrap_err(); + assert_eq!($r as u64, err.position().row); + assert_eq!($c as u64, err.position().column); + assert_eq!($s, err.msg()); + }) + ); + + macro_rules! assert_none( + (for $lex:ident and $buf:ident) => ( + assert_eq!(Ok(None), $lex.next_token(&mut $buf)); + ) + ); + + fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) { + (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes()))) + } + + #[test] + fn simple_lexer_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#"<a p='q'> x<b z="y">d </b></a><p/> <?nm ?> <!-- a c --> "# + ); + + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('a') + Token::Whitespace(' ') + Token::Character('p') + Token::EqualsSign + Token::SingleQuote + Token::Character('q') + Token::SingleQuote + Token::TagEnd + Token::Whitespace(' ') + Token::Character('x') + Token::OpeningTagStart + Token::Character('b') + Token::Whitespace(' ') + Token::Character('z') + Token::EqualsSign + Token::DoubleQuote + Token::Character('y') + Token::DoubleQuote + Token::TagEnd + Token::Character('d') + Token::Whitespace('\t') + Token::ClosingTagStart + Token::Character('b') + Token::TagEnd + Token::ClosingTagStart + Token::Character('a') + Token::TagEnd + Token::OpeningTagStart + Token::Character('p') + Token::EmptyTagEnd + Token::Whitespace(' ') + Token::ProcessingInstructionStart + Token::Character('n') + Token::Character('m') + Token::Whitespace(' ') + Token::ProcessingInstructionEnd + Token::Whitespace(' ') + Token::CommentStart + Token::Whitespace(' ') + Token::Character('a') + Token::Whitespace(' ') + Token::Character('c') + Token::Whitespace(' ') + Token::CommentEnd + Token::Whitespace(' ') + Token::ReferenceStart + Token::Character('n') + Token::Character('b') + Token::Character('s') + Token::Character('p') + Token::ReferenceEnd + ); + assert_none!(for lex and buf); + } + + #[test] + fn special_chars_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#"?x!+ // -| ]z]]"# + ); + + assert_oks!(for lex and buf ; + Token::Character('?') + Token::Character('x') + Token::Character('!') + Token::Character('+') + Token::Whitespace(' ') + Token::Character('/') + Token::Character('/') + Token::Whitespace(' ') + Token::Character('-') + Token::Character('|') + Token::Whitespace(' ') + Token::Character(']') + Token::Character('z') + Token::Chunk("]]") + ); + assert_none!(for lex and buf); + } + + #[test] + fn cdata_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#"<a><![CDATA[x y ?]]> </a>"# + ); + + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('a') + Token::TagEnd + Token::CDataStart + Token::Character('x') + Token::Whitespace(' ') + Token::Character('y') + Token::Whitespace(' ') + Token::Character('?') + Token::CDataEnd + Token::Whitespace(' ') + Token::ClosingTagStart + Token::Character('a') + Token::TagEnd + ); + assert_none!(for lex and buf); + } + + #[test] + fn doctype_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#"<a><!DOCTYPE ab xx z> "# + ); + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('a') + Token::TagEnd + Token::DoctypeStart + Token::TagEnd + Token::Whitespace(' ') + ); + assert_none!(for lex and buf) + } + + #[test] + fn doctype_with_internal_subset_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#"<a><!DOCTYPE ab[<!ELEMENT ba> ]> "# + ); + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('a') + Token::TagEnd + Token::DoctypeStart + Token::TagEnd + Token::Whitespace(' ') + ); + assert_none!(for lex and buf) + } + + #[test] + fn end_of_stream_handling_ok() { + macro_rules! eof_check( + ($data:expr ; $token:expr) => ({ + let (mut lex, mut buf) = make_lex_and_buf($data); + assert_oks!(for lex and buf ; $token); + assert_none!(for lex and buf); + }) + ); + eof_check!("?" ; Token::Character('?')); + eof_check!("/" ; Token::Character('/')); + eof_check!("-" ; Token::Character('-')); + eof_check!("]" ; Token::Character(']')); + eof_check!("]]" ; Token::Chunk("]]")); + } + + #[test] + fn end_of_stream_handling_error() { + macro_rules! eof_check( + ($data:expr; $r:expr, $c:expr) => ({ + let (mut lex, mut buf) = make_lex_and_buf($data); + assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream"); + assert_none!(for lex and buf); + }) + ); + eof_check!("<" ; 0, 1); + eof_check!("<!" ; 0, 2); + eof_check!("<!-" ; 0, 3); + eof_check!("<![" ; 0, 3); + eof_check!("<![C" ; 0, 4); + eof_check!("<![CD" ; 0, 5); + eof_check!("<![CDA" ; 0, 6); + eof_check!("<![CDAT" ; 0, 7); + eof_check!("<![CDATA" ; 0, 8); + eof_check!("--" ; 0, 2); + } + + #[test] + fn error_in_comment_or_cdata_prefix() { + let (mut lex, mut buf) = make_lex_and_buf("<!x"); + assert_err!(for lex and buf expect row 0 ; 0, + "Unexpected token '<!' before 'x'" + ); + + let (mut lex, mut buf) = make_lex_and_buf("<!x"); + lex.disable_errors(); + assert_oks!(for lex and buf ; + Token::Chunk("<!") + Token::Character('x') + ); + assert_none!(for lex and buf); + } + + #[test] + fn error_in_comment_started() { + let (mut lex, mut buf) = make_lex_and_buf("<!-\t"); + assert_err!(for lex and buf expect row 0 ; 0, + "Unexpected token '<!-' before '\t'" + ); + + let (mut lex, mut buf) = make_lex_and_buf("<!-\t"); + lex.disable_errors(); + assert_oks!(for lex and buf ; + Token::Chunk("<!-") + Token::Whitespace('\t') + ); + assert_none!(for lex and buf); + } + + #[test] + fn error_in_comment_two_dashes_not_at_end() { + let (mut lex, mut buf) = make_lex_and_buf("--x"); + lex.inside_comment(); + assert_err!(for lex and buf expect row 0; 0, + "Unexpected token '--' before 'x'" + ); + + let (mut lex, mut buf) = make_lex_and_buf("--x"); + assert_oks!(for lex and buf ; + Token::Chunk("--") + Token::Character('x') + ); + } + + macro_rules! check_case( + ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({ + let (mut lex, mut buf) = make_lex_and_buf($data); + assert_err!(for lex and buf expect row $r ; $c, $s); + + let (mut lex, mut buf) = make_lex_and_buf($data); + lex.disable_errors(); + assert_oks!(for lex and buf ; + Token::Chunk($chunk) + Token::Character($app) + ); + assert_none!(for lex and buf); + }) + ); + + #[test] + fn error_in_cdata_started() { + check_case!("<![", '['; "<![[" ; 0, 0, "Unexpected token '<![' before '['"); + check_case!("<![C", '['; "<![C[" ; 0, 0, "Unexpected token '<![C' before '['"); + check_case!("<![CD", '['; "<![CD[" ; 0, 0, "Unexpected token '<![CD' before '['"); + check_case!("<![CDA", '['; "<![CDA[" ; 0, 0, "Unexpected token '<![CDA' before '['"); + check_case!("<![CDAT", '['; "<![CDAT[" ; 0, 0, "Unexpected token '<![CDAT' before '['"); + check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'"); + } + + #[test] + fn error_in_doctype_started() { + check_case!("<!D", 'a'; "<!Da" ; 0, 0, "Unexpected token '<!D' before 'a'"); + check_case!("<!DO", 'b'; "<!DOb" ; 0, 0, "Unexpected token '<!DO' before 'b'"); + check_case!("<!DOC", 'c'; "<!DOCc" ; 0, 0, "Unexpected token '<!DOC' before 'c'"); + check_case!("<!DOCT", 'd'; "<!DOCTd" ; 0, 0, "Unexpected token '<!DOCT' before 'd'"); + check_case!("<!DOCTY", 'e'; "<!DOCTYe" ; 0, 0, "Unexpected token '<!DOCTY' before 'e'"); + check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'"); + } + + + + #[test] + fn issue_98_cdata_ending_with_right_bracket() { + let (mut lex, mut buf) = make_lex_and_buf( + r#"<![CDATA[Foo [Bar]]]>"# + ); + + assert_oks!(for lex and buf ; + Token::CDataStart + Token::Character('F') + Token::Character('o') + Token::Character('o') + Token::Whitespace(' ') + Token::Character('[') + Token::Character('B') + Token::Character('a') + Token::Character('r') + Token::Character(']') + Token::CDataEnd + ); + assert_none!(for lex and buf); + } +} diff --git a/third_party/rust/xml-rs/src/reader/mod.rs b/third_party/rust/xml-rs/src/reader/mod.rs new file mode 100644 index 0000000000..90f5b52c56 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/mod.rs @@ -0,0 +1,129 @@ +//! Contains high-level interface for a pull-based XML parser. +//! +//! The most important type in this module is `EventReader`, which provides an iterator +//! view for events in XML document. + +use std::io::{Read}; +use std::result; + +use common::{Position, TextPosition}; + +pub use self::config::ParserConfig; +pub use self::events::XmlEvent; + +use self::parser::PullParser; + +mod lexer; +mod parser; +mod config; +mod events; + +mod error; +pub use self::error::{Error, ErrorKind}; + +/// A result type yielded by `XmlReader`. +pub type Result<T> = result::Result<T, Error>; + +/// A wrapper around an `std::io::Read` instance which provides pull-based XML parsing. +pub struct EventReader<R: Read> { + source: R, + parser: PullParser +} + +impl<R: Read> EventReader<R> { + /// Creates a new reader, consuming the given stream. + #[inline] + pub fn new(source: R) -> EventReader<R> { + EventReader::new_with_config(source, ParserConfig::new()) + } + + /// Creates a new reader with the provded configuration, consuming the given stream. + #[inline] + pub fn new_with_config(source: R, config: ParserConfig) -> EventReader<R> { + EventReader { source: source, parser: PullParser::new(config) } + } + + /// Pulls and returns next XML event from the stream. + /// + /// If returned event is `XmlEvent::Error` or `XmlEvent::EndDocument`, then + /// further calls to this method will return this event again. + #[inline] + pub fn next(&mut self) -> Result<XmlEvent> { + self.parser.next(&mut self.source) + } + + pub fn source(&self) -> &R { &self.source } + pub fn source_mut(&mut self) -> &mut R { &mut self.source } + + /// Unwraps this `EventReader`, returning the underlying reader. + /// + /// Note that this operation is destructive; unwrapping the reader and wrapping it + /// again with `EventReader::new()` will create a fresh reader which will attempt + /// to parse an XML document from the beginning. + pub fn into_inner(self) -> R { + self.source + } +} + +impl<B: Read> Position for EventReader<B> { + /// Returns the position of the last event produced by the reader. + #[inline] + fn position(&self) -> TextPosition { + self.parser.position() + } +} + +impl<R: Read> IntoIterator for EventReader<R> { + type Item = Result<XmlEvent>; + type IntoIter = Events<R>; + + fn into_iter(self) -> Events<R> { + Events { reader: self, finished: false } + } +} + +/// An iterator over XML events created from some type implementing `Read`. +/// +/// When the next event is `xml::event::Error` or `xml::event::EndDocument`, then +/// it will be returned by the iterator once, and then it will stop producing events. +pub struct Events<R: Read> { + reader: EventReader<R>, + finished: bool +} + +impl<R: Read> Events<R> { + /// Unwraps the iterator, returning the internal `EventReader`. + #[inline] + pub fn into_inner(self) -> EventReader<R> { + self.reader + } + + pub fn source(&self) -> &R { &self.reader.source } + pub fn source_mut(&mut self) -> &mut R { &mut self.reader.source } + +} + +impl<R: Read> Iterator for Events<R> { + type Item = Result<XmlEvent>; + + #[inline] + fn next(&mut self) -> Option<Result<XmlEvent>> { + if self.finished && !self.reader.parser.is_ignoring_end_of_stream() { None } + else { + let ev = self.reader.next(); + match ev { + Ok(XmlEvent::EndDocument) | Err(_) => self.finished = true, + _ => {} + } + Some(ev) + } + } +} + +impl<'r> EventReader<&'r [u8]> { + /// A convenience method to create an `XmlReader` from a string slice. + #[inline] + pub fn from_str(source: &'r str) -> EventReader<&'r [u8]> { + EventReader::new(source.as_bytes()) + } +} diff --git a/third_party/rust/xml-rs/src/reader/parser/inside_cdata.rs b/third_party/rust/xml-rs/src/reader/parser/inside_cdata.rs new file mode 100644 index 0000000000..3269fb4d6b --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/inside_cdata.rs @@ -0,0 +1,32 @@ +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{Result, PullParser, State}; + +impl PullParser { + pub fn inside_cdata(&mut self, t: Token) -> Option<Result> { + match t { + Token::CDataEnd => { + self.lexer.enable_errors(); + let event = if self.config.cdata_to_characters { + None + } else { + let data = self.take_buf(); + Some(Ok(XmlEvent::CData(data))) + }; + self.into_state(State::OutsideTag, event) + } + + Token::Whitespace(_) => { + t.push_to_string(&mut self.buf); + None + } + + _ => { + self.inside_whitespace = false; + t.push_to_string(&mut self.buf); + None + } + } + } +} diff --git a/third_party/rust/xml-rs/src/reader/parser/inside_closing_tag_name.rs b/third_party/rust/xml-rs/src/reader/parser/inside_closing_tag_name.rs new file mode 100644 index 0000000000..1d8074a5a3 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/inside_closing_tag_name.rs @@ -0,0 +1,34 @@ +use namespace; + +use reader::lexer::Token; + +use super::{Result, PullParser, State, QualifiedNameTarget, ClosingTagSubstate}; + +impl PullParser { + pub fn inside_closing_tag_name(&mut self, t: Token, s: ClosingTagSubstate) -> Option<Result> { + match s { + ClosingTagSubstate::CTInsideName => self.read_qualified_name(t, QualifiedNameTarget::ClosingTagNameTarget, |this, token, name| { + match name.prefix_ref() { + Some(prefix) if prefix == namespace::NS_XML_PREFIX || + prefix == namespace::NS_XMLNS_PREFIX => + // TODO: {:?} is bad, need something better + Some(self_error!(this; "'{:?}' cannot be an element name prefix", name.prefix)), + _ => { + this.data.element_name = Some(name.clone()); + match token { + Token::Whitespace(_) => this.into_state_continue(State::InsideClosingTag(ClosingTagSubstate::CTAfterName)), + Token::TagEnd => this.emit_end_element(), + _ => Some(self_error!(this; "Unexpected token inside closing tag: {}", token)) + } + } + } + }), + ClosingTagSubstate::CTAfterName => match t { + Token::Whitespace(_) => None, // Skip whitespace + Token::TagEnd => self.emit_end_element(), + _ => Some(self_error!(self; "Unexpected token inside closing tag: {}", t)) + } + } + } + +} diff --git a/third_party/rust/xml-rs/src/reader/parser/inside_comment.rs b/third_party/rust/xml-rs/src/reader/parser/inside_comment.rs new file mode 100644 index 0000000000..fc983205ac --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/inside_comment.rs @@ -0,0 +1,32 @@ +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{Result, PullParser, State}; + +impl PullParser { + pub fn inside_comment(&mut self, t: Token) -> Option<Result> { + match t { + // Double dash is illegal inside a comment + Token::Chunk(ref s) if &s[..] == "--" => Some(self_error!(self; "Unexpected token inside a comment: --")), + + Token::CommentEnd if self.config.ignore_comments => { + self.lexer.outside_comment(); + self.into_state_continue(State::OutsideTag) + } + + Token::CommentEnd => { + self.lexer.outside_comment(); + let data = self.take_buf(); + self.into_state_emit(State::OutsideTag, Ok(XmlEvent::Comment(data))) + } + + _ if self.config.ignore_comments => None, // Do not modify buffer if ignoring the comment + + _ => { + t.push_to_string(&mut self.buf); + None + } + } + } + +} diff --git a/third_party/rust/xml-rs/src/reader/parser/inside_declaration.rs b/third_party/rust/xml-rs/src/reader/parser/inside_declaration.rs new file mode 100644 index 0000000000..af39d10d86 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/inside_declaration.rs @@ -0,0 +1,151 @@ + +use common::XmlVersion; + +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{ + Result, PullParser, State, DeclarationSubstate, QualifiedNameTarget, + DEFAULT_VERSION, DEFAULT_ENCODING +}; + +impl PullParser { + // TODO: remove redundancy via macros or extra methods + pub fn inside_declaration(&mut self, t: Token, s: DeclarationSubstate) -> Option<Result> { + macro_rules! unexpected_token( + ($this:expr; $t:expr) => (Some($this.error(format!("Unexpected token inside XML declaration: {}", $t)))); + ($t:expr) => (unexpected_token!(self; $t)); + ); + + #[inline] + fn emit_start_document(this: &mut PullParser) -> Option<Result> { + this.parsed_declaration = true; + let version = this.data.take_version(); + let encoding = this.data.take_encoding(); + let standalone = this.data.take_standalone(); + this.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartDocument { + version: version.unwrap_or(DEFAULT_VERSION), + encoding: encoding.unwrap_or(DEFAULT_ENCODING.into()), + standalone: standalone + })) + } + + match s { + DeclarationSubstate::BeforeVersion => match t { + Token::Whitespace(_) => None, // continue + Token::Character('v') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersion)), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideVersion => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { + match &name.local_name[..] { + "ersion" if name.namespace.is_none() => + this.into_state_continue(State::InsideDeclaration( + if token == Token::EqualsSign { + DeclarationSubstate::InsideVersionValue + } else { + DeclarationSubstate::AfterVersion + } + )), + _ => unexpected_token!(this; name) + } + }), + + DeclarationSubstate::AfterVersion => match t { + Token::Whitespace(_) => None, + Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersionValue)), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideVersionValue => self.read_attribute_value(t, |this, value| { + this.data.version = match &value[..] { + "1.0" => Some(XmlVersion::Version10), + "1.1" => Some(XmlVersion::Version11), + _ => None + }; + if this.data.version.is_some() { + this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterVersionValue)) + } else { + Some(self_error!(this; "Unexpected XML version value: {}", value)) + } + }), + + DeclarationSubstate::AfterVersionValue => match t { + Token::Whitespace(_) => None, // skip whitespace + Token::Character('e') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncoding)), + Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)), + Token::ProcessingInstructionEnd => emit_start_document(self), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideEncoding => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { + match &name.local_name[..] { + "ncoding" if name.namespace.is_none() => + this.into_state_continue(State::InsideDeclaration( + if token == Token::EqualsSign { DeclarationSubstate::InsideEncodingValue } else { DeclarationSubstate::AfterEncoding } + )), + _ => unexpected_token!(this; name) + } + }), + + DeclarationSubstate::AfterEncoding => match t { + Token::Whitespace(_) => None, + Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncodingValue)), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideEncodingValue => self.read_attribute_value(t, |this, value| { + this.data.encoding = Some(value); + this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeStandaloneDecl)) + }), + + DeclarationSubstate::BeforeStandaloneDecl => match t { + Token::Whitespace(_) => None, // skip whitespace + Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)), + Token::ProcessingInstructionEnd => emit_start_document(self), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideStandaloneDecl => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { + match &name.local_name[..] { + "tandalone" if name.namespace.is_none() => + this.into_state_continue(State::InsideDeclaration( + if token == Token::EqualsSign { + DeclarationSubstate::InsideStandaloneDeclValue + } else { + DeclarationSubstate::AfterStandaloneDecl + } + )), + _ => unexpected_token!(this; name) + } + }), + + DeclarationSubstate::AfterStandaloneDecl => match t { + Token::Whitespace(_) => None, + Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDeclValue)), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideStandaloneDeclValue => self.read_attribute_value(t, |this, value| { + let standalone = match &value[..] { + "yes" => Some(true), + "no" => Some(false), + _ => None + }; + if standalone.is_some() { + this.data.standalone = standalone; + this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterStandaloneDeclValue)) + } else { + Some(self_error!(this; "Invalid standalone declaration value: {}", value)) + } + }), + + DeclarationSubstate::AfterStandaloneDeclValue => match t { + Token::Whitespace(_) => None, // skip whitespace + Token::ProcessingInstructionEnd => emit_start_document(self), + _ => unexpected_token!(t) + } + } + } + +} diff --git a/third_party/rust/xml-rs/src/reader/parser/inside_doctype.rs b/third_party/rust/xml-rs/src/reader/parser/inside_doctype.rs new file mode 100644 index 0000000000..8dcf367bc6 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/inside_doctype.rs @@ -0,0 +1,16 @@ +use reader::lexer::Token; + +use super::{Result, PullParser, State}; + +impl PullParser { + pub fn inside_doctype(&mut self, t: Token) -> Option<Result> { + match t { + Token::TagEnd => { + self.lexer.enable_errors(); + self.into_state_continue(State::OutsideTag) + } + + _ => None + } + } +} diff --git a/third_party/rust/xml-rs/src/reader/parser/inside_opening_tag.rs b/third_party/rust/xml-rs/src/reader/parser/inside_opening_tag.rs new file mode 100644 index 0000000000..533874fb81 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/inside_opening_tag.rs @@ -0,0 +1,108 @@ +use common::is_name_start_char; +use attribute::OwnedAttribute; +use namespace; + +use reader::lexer::Token; + +use super::{Result, PullParser, State, OpeningTagSubstate, QualifiedNameTarget}; + +impl PullParser { + pub fn inside_opening_tag(&mut self, t: Token, s: OpeningTagSubstate) -> Option<Result> { + macro_rules! unexpected_token(($t:expr) => (Some(self_error!(self; "Unexpected token inside opening tag: {}", $t)))); + match s { + OpeningTagSubstate::InsideName => self.read_qualified_name(t, QualifiedNameTarget::OpeningTagNameTarget, |this, token, name| { + match name.prefix_ref() { + Some(prefix) if prefix == namespace::NS_XML_PREFIX || + prefix == namespace::NS_XMLNS_PREFIX => + Some(self_error!(this; "'{:?}' cannot be an element name prefix", name.prefix)), + _ => { + this.data.element_name = Some(name.clone()); + match token { + Token::TagEnd => this.emit_start_element(false), + Token::EmptyTagEnd => this.emit_start_element(true), + Token::Whitespace(_) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)), + _ => unreachable!() + } + } + } + }), + + OpeningTagSubstate::InsideTag => match t { + Token::Whitespace(_) => None, // skip whitespace + Token::Character(c) if is_name_start_char(c) => { + self.buf.push(c); + self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeName)) + } + Token::TagEnd => self.emit_start_element(false), + Token::EmptyTagEnd => self.emit_start_element(true), + _ => unexpected_token!(t) + }, + + OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { + this.data.attr_name = Some(name); + match token { + Token::Whitespace(_) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)), + Token::EqualsSign => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)), + _ => unreachable!() + } + }), + + OpeningTagSubstate::AfterAttributeName => match t { + Token::Whitespace(_) => None, + Token::EqualsSign => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)), + _ => unexpected_token!(t) + }, + + OpeningTagSubstate::InsideAttributeValue => self.read_attribute_value(t, |this, value| { + let name = this.data.take_attr_name().unwrap(); // unwrap() will always succeed here + + // check that no attribute with such name is already present + // if there is one, XML is not well-formed + if this.data.attributes.iter().find(|a| a.name == name).is_some() { // TODO: looks bad + // TODO: ideally this error should point to the beginning of the attribute, + // TODO: not the end of its value + Some(self_error!(this; "Attribute '{}' is redefined", name)) + } else { + match name.prefix_ref() { + // declaring a new prefix; it is sufficient to check prefix only + // because "xmlns" prefix is reserved + Some(namespace::NS_XMLNS_PREFIX) => { + let ln = &name.local_name[..]; + if ln == namespace::NS_XMLNS_PREFIX { + Some(self_error!(this; "Cannot redefine prefix '{}'", namespace::NS_XMLNS_PREFIX)) + } else if ln == namespace::NS_XML_PREFIX && &value[..] != namespace::NS_XML_URI { + Some(self_error!(this; "Prefix '{}' cannot be rebound to another value", namespace::NS_XML_PREFIX)) + } else if value.is_empty() { + Some(self_error!(this; "Cannot undefine prefix '{}'", ln)) + } else { + this.nst.put(name.local_name.clone(), value); + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + } + } + + // declaring default namespace + None if &name.local_name[..] == namespace::NS_XMLNS_PREFIX => + match &value[..] { + namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX => + Some(self_error!(this; "Namespace '{}' cannot be default", value)), + _ => { + this.nst.put(namespace::NS_NO_PREFIX, value.clone()); + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + } + }, + + // regular attribute + _ => { + this.data.attributes.push(OwnedAttribute { + name: name.clone(), + value: value + }); + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + } + } + } + }) + } + } + +} diff --git a/third_party/rust/xml-rs/src/reader/parser/inside_processing_instruction.rs b/third_party/rust/xml-rs/src/reader/parser/inside_processing_instruction.rs new file mode 100644 index 0000000000..8ddf6b8d51 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/inside_processing_instruction.rs @@ -0,0 +1,96 @@ +use common::{ + is_name_start_char, is_name_char, +}; + +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{Result, PullParser, State, ProcessingInstructionSubstate, DeclarationSubstate}; + +impl PullParser { + pub fn inside_processing_instruction(&mut self, t: Token, s: ProcessingInstructionSubstate) -> Option<Result> { + match s { + ProcessingInstructionSubstate::PIInsideName => match t { + Token::Character(c) if !self.buf_has_data() && is_name_start_char(c) || + self.buf_has_data() && is_name_char(c) => self.append_char_continue(c), + + Token::ProcessingInstructionEnd => { + // self.buf contains PI name + let name = self.take_buf(); + + // Don't need to check for declaration because it has mandatory attributes + // but there is none + match &name[..] { + // Name is empty, it is an error + "" => Some(self_error!(self; "Encountered processing instruction without name")), + + // Found <?xml-like PI not at the beginning of a document, + // it is an error - see section 2.6 of XML 1.1 spec + "xml"|"xmL"|"xMl"|"xML"|"Xml"|"XmL"|"XMl"|"XML" => + Some(self_error!(self; "Invalid processing instruction: <?{}", name)), + + // All is ok, emitting event + _ => { + self.into_state_emit( + State::OutsideTag, + Ok(XmlEvent::ProcessingInstruction { + name: name, + data: None + }) + ) + } + } + } + + Token::Whitespace(_) => { + // self.buf contains PI name + let name = self.take_buf(); + + match &name[..] { + // We have not ever encountered an element and have not parsed XML declaration + "xml" if !self.encountered_element && !self.parsed_declaration => + self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeVersion)), + + // Found <?xml-like PI after the beginning of a document, + // it is an error - see section 2.6 of XML 1.1 spec + "xml"|"xmL"|"xMl"|"xML"|"Xml"|"XmL"|"XMl"|"XML" + if self.encountered_element || self.parsed_declaration => + Some(self_error!(self; "Invalid processing instruction: <?{}", name)), + + // All is ok, starting parsing PI data + _ => { + self.lexer.disable_errors(); // data is arbitrary, so disable errors + self.data.name = name; + self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideData)) + } + + } + } + + _ => Some(self_error!(self; "Unexpected token: <?{}{}", self.buf, t)) + }, + + ProcessingInstructionSubstate::PIInsideData => match t { + Token::ProcessingInstructionEnd => { + self.lexer.enable_errors(); + let name = self.data.take_name(); + let data = self.take_buf(); + self.into_state_emit( + State::OutsideTag, + Ok(XmlEvent::ProcessingInstruction { + name: name, + data: Some(data) + }) + ) + }, + + // Any other token should be treated as plain characters + _ => { + t.push_to_string(&mut self.buf); + None + } + }, + } + } + +} diff --git a/third_party/rust/xml-rs/src/reader/parser/inside_reference.rs b/third_party/rust/xml-rs/src/reader/parser/inside_reference.rs new file mode 100644 index 0000000000..60026d5572 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/inside_reference.rs @@ -0,0 +1,89 @@ +use std::char; + +use common::{is_name_start_char, is_name_char, is_whitespace_str}; + +use reader::lexer::Token; + +use super::{Result, PullParser, State}; + +impl PullParser { + pub fn inside_reference(&mut self, t: Token, prev_st: State) -> Option<Result> { + match t { + Token::Character(c) if !self.data.ref_data.is_empty() && is_name_char(c) || + self.data.ref_data.is_empty() && (is_name_start_char(c) || c == '#') => { + self.data.ref_data.push(c); + None + } + + Token::ReferenceEnd => { + // TODO: check for unicode correctness + let name = self.data.take_ref_data(); + let name_len = name.len(); // compute once + let c = match &name[..] { + "lt" => Ok('<'.to_string()), + "gt" => Ok('>'.to_string()), + "amp" => Ok('&'.to_string()), + "apos" => Ok('\''.to_string()), + "quot" => Ok('"'.to_string()), + "" => Err(self_error!(self; "Encountered empty entity")), + _ if name_len > 2 && name.starts_with("#x") => { + let num_str = &name[2..name_len]; + if num_str == "0" { + Err(self_error!(self; "Null character entity is not allowed")) + } else { + if self.config.replace_unknown_entity_references { + match u32::from_str_radix(num_str, 16).ok().map(|i| char::from_u32(i).unwrap_or('\u{fffd}')) { + Some(c) => Ok(c.to_string()), + None => Err(self_error!(self; "Invalid hexadecimal character number in an entity: {}", name)) + } + } else { + match u32::from_str_radix(num_str, 16).ok().and_then(char::from_u32) { + Some(c) => Ok(c.to_string()), + None => Err(self_error!(self; "Invalid hexadecimal character number in an entity: {}", name)) + } + } + } + } + _ if name_len > 1 && name.starts_with('#') => { + let num_str = &name[1..name_len]; + if num_str == "0" { + Err(self_error!(self; "Null character entity is not allowed")) + } else { + if self.config.replace_unknown_entity_references { + match u32::from_str_radix(num_str, 10).ok().map(|i| char::from_u32(i).unwrap_or('\u{fffd}')) { + Some(c) => Ok(c.to_string()), + None => Err(self_error!(self; "Invalid decimal character number in an entity: {}", name)) + } + } + else { + match u32::from_str_radix(num_str, 10).ok().and_then(char::from_u32) { + Some(c) => Ok(c.to_string()), + None => Err(self_error!(self; "Invalid decimal character number in an entity: {}", name)) + } + } + } + }, + _ => { + if let Some(v) = self.config.extra_entities.get(&name) { + Ok(v.clone()) + } else { + Err(self_error!(self; "Unexpected entity: {}", name)) + } + } + }; + match c { + Ok(c) => { + self.buf.push_str(&c); + if prev_st == State::OutsideTag && !is_whitespace_str(&c) { + self.inside_whitespace = false; + } + self.into_state_continue(prev_st) + } + Err(e) => Some(e) + } + } + + _ => Some(self_error!(self; "Unexpected token inside an entity: {}", t)) + } + } +} diff --git a/third_party/rust/xml-rs/src/reader/parser/mod.rs b/third_party/rust/xml-rs/src/reader/parser/mod.rs new file mode 100644 index 0000000000..58ca3a6b1e --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/mod.rs @@ -0,0 +1,622 @@ +//! Contains an implementation of pull-based XML parser. + +use std::mem; +use std::borrow::Cow; +use std::io::prelude::*; + +use common::{ + self, + XmlVersion, Position, TextPosition, + is_name_start_char, is_name_char, +}; +use name::OwnedName; +use attribute::OwnedAttribute; +use namespace::NamespaceStack; + +use reader::events::XmlEvent; +use reader::config::ParserConfig; +use reader::lexer::{Lexer, Token}; + +macro_rules! gen_takes( + ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => ( + $( + impl MarkupData { + #[inline] + fn $method(&mut self) -> $t { + mem::replace(&mut self.$field, $def) + } + } + )+ + ) +); + +gen_takes!( + name -> take_name, String, String::new(); + ref_data -> take_ref_data, String, String::new(); + + version -> take_version, Option<common::XmlVersion>, None; + encoding -> take_encoding, Option<String>, None; + standalone -> take_standalone, Option<bool>, None; + + element_name -> take_element_name, Option<OwnedName>, None; + + attr_name -> take_attr_name, Option<OwnedName>, None; + attributes -> take_attributes, Vec<OwnedAttribute>, vec!() +); + +macro_rules! self_error( + ($this:ident; $msg:expr) => ($this.error($msg)); + ($this:ident; $fmt:expr, $($arg:expr),+) => ($this.error(format!($fmt, $($arg),+))) +); + +mod outside_tag; +mod inside_processing_instruction; +mod inside_declaration; +mod inside_doctype; +mod inside_opening_tag; +mod inside_closing_tag_name; +mod inside_comment; +mod inside_cdata; +mod inside_reference; + +static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10; +static DEFAULT_ENCODING: &'static str = "UTF-8"; +static DEFAULT_STANDALONE: Option<bool> = None; + +type ElementStack = Vec<OwnedName>; +pub type Result = super::Result<XmlEvent>; + +/// Pull-based XML parser. +pub struct PullParser { + config: ParserConfig, + lexer: Lexer, + st: State, + buf: String, + nst: NamespaceStack, + + data: MarkupData, + final_result: Option<Result>, + next_event: Option<Result>, + est: ElementStack, + pos: Vec<TextPosition>, + + encountered_element: bool, + parsed_declaration: bool, + inside_whitespace: bool, + read_prefix_separator: bool, + pop_namespace: bool +} + +impl PullParser { + /// Returns a new parser using the given config. + pub fn new(config: ParserConfig) -> PullParser { + PullParser { + config: config, + lexer: Lexer::new(), + st: State::OutsideTag, + buf: String::new(), + nst: NamespaceStack::default(), + + data: MarkupData { + name: String::new(), + version: None, + encoding: None, + standalone: None, + ref_data: String::new(), + element_name: None, + quote: None, + attr_name: None, + attributes: Vec::new() + }, + final_result: None, + next_event: None, + est: Vec::new(), + pos: vec![TextPosition::new()], + + encountered_element: false, + parsed_declaration: false, + inside_whitespace: true, + read_prefix_separator: false, + pop_namespace: false + } + } + + /// Checks if this parser ignores the end of stream errors. + pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.ignore_end_of_stream } +} + +impl Position for PullParser { + /// Returns the position of the last event produced by the parser + #[inline] + fn position(&self) -> TextPosition { + self.pos[0] + } +} + +#[derive(Clone, PartialEq)] +pub enum State { + OutsideTag, + InsideOpeningTag(OpeningTagSubstate), + InsideClosingTag(ClosingTagSubstate), + InsideProcessingInstruction(ProcessingInstructionSubstate), + InsideComment, + InsideCData, + InsideDeclaration(DeclarationSubstate), + InsideDoctype, + InsideReference(Box<State>) +} + +#[derive(Clone, PartialEq)] +pub enum OpeningTagSubstate { + InsideName, + + InsideTag, + + InsideAttributeName, + AfterAttributeName, + + InsideAttributeValue, +} + +#[derive(Clone, PartialEq)] +pub enum ClosingTagSubstate { + CTInsideName, + CTAfterName +} + +#[derive(Clone, PartialEq)] +pub enum ProcessingInstructionSubstate { + PIInsideName, + PIInsideData +} + +#[derive(Clone, PartialEq)] +pub enum DeclarationSubstate { + BeforeVersion, + InsideVersion, + AfterVersion, + + InsideVersionValue, + AfterVersionValue, + + InsideEncoding, + AfterEncoding, + + InsideEncodingValue, + + BeforeStandaloneDecl, + InsideStandaloneDecl, + AfterStandaloneDecl, + + InsideStandaloneDeclValue, + AfterStandaloneDeclValue +} + +#[derive(PartialEq)] +enum QualifiedNameTarget { + AttributeNameTarget, + OpeningTagNameTarget, + ClosingTagNameTarget +} + +#[derive(Copy, Clone, PartialEq, Eq)] +enum QuoteToken { + SingleQuoteToken, + DoubleQuoteToken +} + +impl QuoteToken { + fn from_token(t: &Token) -> QuoteToken { + match *t { + Token::SingleQuote => QuoteToken::SingleQuoteToken, + Token::DoubleQuote => QuoteToken::DoubleQuoteToken, + _ => panic!("Unexpected token: {}", t) + } + } + + fn as_token(self) -> Token { + match self { + QuoteToken::SingleQuoteToken => Token::SingleQuote, + QuoteToken::DoubleQuoteToken => Token::DoubleQuote + } + } +} + +struct MarkupData { + name: String, // used for processing instruction name + ref_data: String, // used for reference content + + version: Option<common::XmlVersion>, // used for XML declaration version + encoding: Option<String>, // used for XML declaration encoding + standalone: Option<bool>, // used for XML declaration standalone parameter + + element_name: Option<OwnedName>, // used for element name + + quote: Option<QuoteToken>, // used to hold opening quote for attribute value + attr_name: Option<OwnedName>, // used to hold attribute name + attributes: Vec<OwnedAttribute> // used to hold all accumulated attributes +} + +impl PullParser { + /// Returns next event read from the given buffer. + /// + /// This method should be always called with the same buffer. If you call it + /// providing different buffers each time, the result will be undefined. + pub fn next<R: Read>(&mut self, r: &mut R) -> Result { + if let Some(ref ev) = self.final_result { + return ev.clone(); + } + + if let Some(ev) = self.next_event.take() { + return ev; + } + + if self.pop_namespace { + self.pop_namespace = false; + self.nst.pop(); + } + + loop { + // While lexer gives us Ok(maybe_token) -- we loop. + // Upon having a complete XML-event -- we return from the whole function. + match self.lexer.next_token(r) { + Ok(maybe_token) => + match maybe_token { + None => break, + Some(token) => + match self.dispatch_token(token) { + None => {} // continue + Some(Ok(XmlEvent::EndDocument)) => + return { + self.next_pos(); + self.set_final_result(Ok(XmlEvent::EndDocument)) + }, + Some(Ok(xml_event)) => + return { + self.next_pos(); + Ok(xml_event) + }, + Some(Err(xml_error)) => + return { + self.next_pos(); + self.set_final_result(Err(xml_error)) + }, + } + }, + Err(lexer_error) => + return self.set_final_result(Err(lexer_error)), + } + } + + // Handle end of stream + // Forward pos to the lexer head + self.next_pos(); + let ev = if self.depth() == 0 { + if self.encountered_element && self.st == State::OutsideTag { // all is ok + Ok(XmlEvent::EndDocument) + } else if !self.encountered_element { + self_error!(self; "Unexpected end of stream: no root element found") + } else { // self.st != State::OutsideTag + self_error!(self; "Unexpected end of stream") // TODO: add expected hint? + } + } else { + if self.config.ignore_end_of_stream { + self.final_result = None; + self.lexer.reset_eof_handled(); + return self_error!(self; "Unexpected end of stream: still inside the root element"); + } else { + self_error!(self; "Unexpected end of stream: still inside the root element") + } + }; + self.set_final_result(ev) + } + + // This function is to be called when a terminal event is reached. + // The function sets up the `self.final_result` into `Some(result)` and return `result`. + fn set_final_result(&mut self, result: Result) -> Result { + self.final_result = Some(result.clone()); + result + } + + #[inline] + fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Result { + Err((&self.lexer, msg).into()) + } + + #[inline] + fn next_pos(&mut self) { + if self.pos.len() > 1 { + self.pos.remove(0); + } else { + self.pos[0] = self.lexer.position(); + } + } + + #[inline] + fn push_pos(&mut self) { + self.pos.push(self.lexer.position()); + } + + fn dispatch_token(&mut self, t: Token) -> Option<Result> { + match self.st.clone() { + State::OutsideTag => self.outside_tag(t), + State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s), + State::InsideDeclaration(s) => self.inside_declaration(t, s), + State::InsideDoctype => self.inside_doctype(t), + State::InsideOpeningTag(s) => self.inside_opening_tag(t, s), + State::InsideClosingTag(s) => self.inside_closing_tag_name(t, s), + State::InsideComment => self.inside_comment(t), + State::InsideCData => self.inside_cdata(t), + State::InsideReference(s) => self.inside_reference(t, *s) + } + } + + #[inline] + fn depth(&self) -> usize { + self.est.len() + } + + #[inline] + fn buf_has_data(&self) -> bool { + self.buf.len() > 0 + } + + #[inline] + fn take_buf(&mut self) -> String { + mem::replace(&mut self.buf, String::new()) + } + + #[inline] + fn append_char_continue(&mut self, c: char) -> Option<Result> { + self.buf.push(c); + None + } + + #[inline] + fn into_state(&mut self, st: State, ev: Option<Result>) -> Option<Result> { + self.st = st; + ev + } + + #[inline] + fn into_state_continue(&mut self, st: State) -> Option<Result> { + self.into_state(st, None) + } + + #[inline] + fn into_state_emit(&mut self, st: State, ev: Result) -> Option<Result> { + self.into_state(st, Some(ev)) + } + + /// Dispatches tokens in order to process qualified name. If qualified name cannot be parsed, + /// an error is returned. + /// + /// # Parameters + /// * `t` --- next token; + /// * `on_name` --- a callback which is executed when whitespace is encountered. + fn read_qualified_name<F>(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option<Result> + where F: Fn(&mut PullParser, Token, OwnedName) -> Option<Result> { + // We can get here for the first time only when self.data.name contains zero or one character, + // but first character cannot be a colon anyway + if self.buf.len() <= 1 { + self.read_prefix_separator = false; + } + + let invoke_callback = |this: &mut PullParser, t| { + let name = this.take_buf(); + match name.parse() { + Ok(name) => on_name(this, t, name), + Err(_) => Some(self_error!(this; "Qualified name is invalid: {}", name)) + } + }; + + match t { + // There can be only one colon, and not as the first character + Token::Character(':') if self.buf_has_data() && !self.read_prefix_separator => { + self.buf.push(':'); + self.read_prefix_separator = true; + None + } + + Token::Character(c) if c != ':' && (!self.buf_has_data() && is_name_start_char(c) || + self.buf_has_data() && is_name_char(c)) => + self.append_char_continue(c), + + Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t), + + Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t), + + Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget || + target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t), + + Token::Whitespace(_) => invoke_callback(self, t), + + _ => Some(self_error!(self; "Unexpected token inside qualified name: {}", t)) + } + } + + /// Dispatches tokens in order to process attribute value. + /// + /// # Parameters + /// * `t` --- next token; + /// * `on_value` --- a callback which is called when terminating quote is encountered. + fn read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result> + where F: Fn(&mut PullParser, String) -> Option<Result> { + match t { + Token::Whitespace(_) if self.data.quote.is_none() => None, // skip leading whitespace + + Token::DoubleQuote | Token::SingleQuote => match self.data.quote { + None => { // Entered attribute value + self.data.quote = Some(QuoteToken::from_token(&t)); + None + } + Some(q) if q.as_token() == t => { + self.data.quote = None; + let value = self.take_buf(); + on_value(self, value) + } + _ => { + t.push_to_string(&mut self.buf); + None + } + }, + + Token::ReferenceStart => { + let st = Box::new(self.st.clone()); + self.into_state_continue(State::InsideReference(st)) + } + + Token::OpeningTagStart => + Some(self_error!(self; "Unexpected token inside attribute value: <")), + + // Every character except " and ' and < is okay + _ => { + t.push_to_string(&mut self.buf); + None + } + } + } + + fn emit_start_element(&mut self, emit_end_element: bool) -> Option<Result> { + let mut name = self.data.take_element_name().unwrap(); + let mut attributes = self.data.take_attributes(); + + // check whether the name prefix is bound and fix its namespace + match self.nst.get(name.borrow().prefix_repr()) { + Some("") => name.namespace = None, // default namespace + Some(ns) => name.namespace = Some(ns.into()), + None => return Some(self_error!(self; "Element {} prefix is unbound", name)) + } + + // check and fix accumulated attributes prefixes + for attr in attributes.iter_mut() { + if let Some(ref pfx) = attr.name.prefix { + let new_ns = match self.nst.get(pfx) { + Some("") => None, // default namespace + Some(ns) => Some(ns.into()), + None => return Some(self_error!(self; "Attribute {} prefix is unbound", attr.name)) + }; + attr.name.namespace = new_ns; + } + } + + if emit_end_element { + self.pop_namespace = true; + self.next_event = Some(Ok(XmlEvent::EndElement { + name: name.clone() + })); + } else { + self.est.push(name.clone()); + } + let namespace = self.nst.squash(); + self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement { + name: name, + attributes: attributes, + namespace: namespace + })) + } + + fn emit_end_element(&mut self) -> Option<Result> { + let mut name = self.data.take_element_name().unwrap(); + + // check whether the name prefix is bound and fix its namespace + match self.nst.get(name.borrow().prefix_repr()) { + Some("") => name.namespace = None, // default namespace + Some(ns) => name.namespace = Some(ns.into()), + None => return Some(self_error!(self; "Element {} prefix is unbound", name)) + } + + let op_name = self.est.pop().unwrap(); + + if name == op_name { + self.pop_namespace = true; + self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name: name })) + } else { + Some(self_error!(self; "Unexpected closing tag: {}, expected {}", name, op_name)) + } + } + +} + +#[cfg(test)] +mod tests { + use std::io::BufReader; + + use common::{Position, TextPosition}; + use name::OwnedName; + use attribute::OwnedAttribute; + use reader::parser::PullParser; + use reader::ParserConfig; + use reader::events::XmlEvent; + + fn new_parser() -> PullParser { + PullParser::new(ParserConfig::new()) + } + + macro_rules! expect_event( + ($r:expr, $p:expr, $t:pat) => ( + match $p.next(&mut $r) { + $t => {} + e => panic!("Unexpected event: {:?}", e) + } + ); + ($r:expr, $p:expr, $t:pat => $c:expr ) => ( + match $p.next(&mut $r) { + $t if $c => {} + e => panic!("Unexpected event: {:?}", e) + } + ) + ); + + macro_rules! test_data( + ($d:expr) => ({ + static DATA: &'static str = $d; + let r = BufReader::new(DATA.as_bytes()); + let p = new_parser(); + (r, p) + }) + ); + + #[test] + fn issue_3_semicolon_in_attribute_value() { + let (mut r, mut p) = test_data!(r#" + <a attr="zzz;zzz" /> + "#); + + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) => + *name == OwnedName::local("a") && + attributes.len() == 1 && + attributes[0] == OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz") && + namespace.is_essentially_empty() + ); + expect_event!(r, p, Ok(XmlEvent::EndElement { ref name }) => *name == OwnedName::local("a")); + expect_event!(r, p, Ok(XmlEvent::EndDocument)); + } + + #[test] + fn issue_140_entity_reference_inside_tag() { + let (mut r, mut p) = test_data!(r#" + <bla>♫</bla> + "#); + + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla")); + expect_event!(r, p, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}"); + expect_event!(r, p, Ok(XmlEvent::EndElement { ref name, .. }) => *name == OwnedName::local("bla")); + expect_event!(r, p, Ok(XmlEvent::EndDocument)); + } + + #[test] + fn opening_tag_in_attribute_value() { + let (mut r, mut p) = test_data!(r#" + <a attr="zzz<zzz" /> + "#); + + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Err(ref e) => + e.msg() == "Unexpected token inside attribute value: <" && + e.position() == TextPosition { row: 1, column: 24 } + ); + } +} diff --git a/third_party/rust/xml-rs/src/reader/parser/outside_tag.rs b/third_party/rust/xml-rs/src/reader/parser/outside_tag.rs new file mode 100644 index 0000000000..d3f7598f75 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/outside_tag.rs @@ -0,0 +1,130 @@ +use common::is_whitespace_char; + +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{ + Result, PullParser, State, ClosingTagSubstate, OpeningTagSubstate, + ProcessingInstructionSubstate, DEFAULT_VERSION, DEFAULT_ENCODING, DEFAULT_STANDALONE +}; + +impl PullParser { + pub fn outside_tag(&mut self, t: Token) -> Option<Result> { + match t { + Token::ReferenceStart => + self.into_state_continue(State::InsideReference(Box::new(State::OutsideTag))), + + Token::Whitespace(_) if self.depth() == 0 && self.config.ignore_root_level_whitespace => None, // skip whitespace outside of the root element + + Token::Whitespace(_) if self.config.trim_whitespace && !self.buf_has_data() => None, + + Token::Whitespace(c) => { + if !self.buf_has_data() { + self.push_pos(); + } + self.append_char_continue(c) + } + + _ if t.contains_char_data() && self.depth() == 0 => + Some(self_error!(self; "Unexpected characters outside the root element: {}", t)), + + _ if t.contains_char_data() => { // Non-whitespace char data + if !self.buf_has_data() { + self.push_pos(); + } + self.inside_whitespace = false; + t.push_to_string(&mut self.buf); + None + } + + Token::ReferenceEnd => { // Semi-colon in a text outside an entity + self.inside_whitespace = false; + Token::ReferenceEnd.push_to_string(&mut self.buf); + None + } + + Token::CommentStart if self.config.coalesce_characters && self.config.ignore_comments => { + // We need to switch the lexer into a comment mode inside comments + self.lexer.inside_comment(); + self.into_state_continue(State::InsideComment) + } + + Token::CDataStart if self.config.coalesce_characters && self.config.cdata_to_characters => { + if !self.buf_has_data() { + self.push_pos(); + } + // We need to disable lexing errors inside CDATA + self.lexer.disable_errors(); + self.into_state_continue(State::InsideCData) + } + + _ => { + // Encountered some markup event, flush the buffer as characters + // or a whitespace + let mut next_event = if self.buf_has_data() { + let buf = self.take_buf(); + if self.inside_whitespace && self.config.trim_whitespace { + None + } else if self.inside_whitespace && !self.config.whitespace_to_characters { + Some(Ok(XmlEvent::Whitespace(buf))) + } else if self.config.trim_whitespace { + Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into()))) + } else { + Some(Ok(XmlEvent::Characters(buf))) + } + } else { None }; + self.inside_whitespace = true; // Reset inside_whitespace flag + self.push_pos(); + match t { + Token::ProcessingInstructionStart => + self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event), + + Token::DoctypeStart if !self.encountered_element => { + // We don't have a doctype event so skip this position + // FIXME: update when we have a doctype event + self.next_pos(); + self.lexer.disable_errors(); + self.into_state(State::InsideDoctype, next_event) + } + + Token::OpeningTagStart => { + // If declaration was not parsed and we have encountered an element, + // emit this declaration as the next event. + if !self.parsed_declaration { + self.parsed_declaration = true; + let sd_event = XmlEvent::StartDocument { + version: DEFAULT_VERSION, + encoding: DEFAULT_ENCODING.into(), + standalone: DEFAULT_STANDALONE + }; + // next_event is always none here because we're outside of + // the root element + next_event = Some(Ok(sd_event)); + self.push_pos(); + } + self.encountered_element = true; + self.nst.push_empty(); + self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event) + } + + Token::ClosingTagStart if self.depth() > 0 => + self.into_state(State::InsideClosingTag(ClosingTagSubstate::CTInsideName), next_event), + + Token::CommentStart => { + // We need to switch the lexer into a comment mode inside comments + self.lexer.inside_comment(); + self.into_state(State::InsideComment, next_event) + } + + Token::CDataStart => { + // We need to disable lexing errors inside CDATA + self.lexer.disable_errors(); + self.into_state(State::InsideCData, next_event) + } + + _ => Some(self_error!(self; "Unexpected token: {}", t)) + } + } + } + } +} |