diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch) | |
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/rust/xml-rs/src | |
parent | Initial commit. (diff) | |
download | firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip |
Adding upstream version 115.7.0esr.upstream/115.7.0esr
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/xml-rs/src')
28 files changed, 5183 insertions, 0 deletions
diff --git a/third_party/rust/xml-rs/src/analyze.rs b/third_party/rust/xml-rs/src/analyze.rs new file mode 100644 index 0000000000..d369d2f014 --- /dev/null +++ b/third_party/rust/xml-rs/src/analyze.rs @@ -0,0 +1,99 @@ +#![forbid(unsafe_code)] + +extern crate xml; + +use std::cmp; +use std::env; +use std::io::{self, Read, Write, BufReader}; +use std::fs::File; +use std::collections::HashSet; + +use xml::ParserConfig; +use xml::reader::XmlEvent; + +macro_rules! abort { + ($code:expr) => {::std::process::exit($code)}; + ($code:expr, $($args:tt)+) => {{ + writeln!(&mut ::std::io::stderr(), $($args)+).unwrap(); + ::std::process::exit($code); + }} +} + +fn main() { + let mut file; + let mut stdin; + let source: &mut Read = match env::args().nth(1) { + Some(file_name) => { + file = File::open(file_name) + .unwrap_or_else(|e| abort!(1, "Cannot open input file: {}", e)); + &mut file + } + None => { + stdin = io::stdin(); + &mut stdin + } + }; + + let reader = ParserConfig::new() + .whitespace_to_characters(true) + .ignore_comments(false) + .create_reader(BufReader::new(source)); + + let mut processing_instructions = 0; + let mut elements = 0; + let mut character_blocks = 0; + let mut cdata_blocks = 0; + let mut characters = 0; + let mut comment_blocks = 0; + let mut comment_characters = 0; + let mut namespaces = HashSet::new(); + let mut depth = 0; + let mut max_depth = 0; + + for e in reader { + match e { + Ok(e) => match e { + XmlEvent::StartDocument { version, encoding, standalone } => + println!( + "XML document version {}, encoded in {}, {}standalone", + version, encoding, if standalone.unwrap_or(false) { "" } else { "not " } + ), + XmlEvent::EndDocument => println!("Document finished"), + XmlEvent::ProcessingInstruction { .. 
} => processing_instructions += 1, + XmlEvent::Whitespace(_) => {} // can't happen due to configuration + XmlEvent::Characters(s) => { + character_blocks += 1; + characters += s.len(); + } + XmlEvent::CData(s) => { + cdata_blocks += 1; + characters += s.len(); + } + XmlEvent::Comment(s) => { + comment_blocks += 1; + comment_characters += s.len(); + } + XmlEvent::StartElement { namespace, .. } => { + depth += 1; + max_depth = cmp::max(max_depth, depth); + elements += 1; + namespaces.extend(namespace.0.into_iter().map(|(_, ns_uri)| ns_uri)); + } + XmlEvent::EndElement { .. } => { + depth -= 1; + } + }, + Err(e) => abort!(1, "Error parsing XML document: {}", e) + } + } + namespaces.remove(xml::namespace::NS_EMPTY_URI); + namespaces.remove(xml::namespace::NS_XMLNS_URI); + namespaces.remove(xml::namespace::NS_XML_URI); + + println!("Elements: {}, maximum depth: {}", elements, max_depth); + println!("Namespaces (excluding built-in): {}", namespaces.len()); + println!("Characters: {}, characters blocks: {}, CDATA blocks: {}", + characters, character_blocks, cdata_blocks); + println!("Comment blocks: {}, comment characters: {}", comment_blocks, comment_characters); + println!("Processing instructions (excluding built-in): {}", processing_instructions); +} diff --git a/third_party/rust/xml-rs/src/attribute.rs b/third_party/rust/xml-rs/src/attribute.rs new file mode 100644 index 0000000000..8728f496d7 --- /dev/null +++ b/third_party/rust/xml-rs/src/attribute.rs @@ -0,0 +1,99 @@ +//! Contains XML attributes manipulation types and functions. +//! + +use std::fmt; + +use name::{Name, OwnedName}; +use escape::escape_str_attribute; + +/// A borrowed version of an XML attribute. +/// +/// Consists of a borrowed qualified name and a borrowed string value. +#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)] +pub struct Attribute<'a> { + /// Attribute name. + pub name: Name<'a>, + + /// Attribute value. 
+ pub value: &'a str +} + +impl<'a> fmt::Display for Attribute<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}=\"{}\"", self.name, escape_str_attribute(self.value)) + } +} + +impl<'a> Attribute<'a> { + /// Creates an owned attribute out of this borrowed one. + #[inline] + pub fn to_owned(&self) -> OwnedAttribute { + OwnedAttribute { + name: self.name.into(), + value: self.value.into(), + } + } + + /// Creates a borrowed attribute using the provided borrowed name and a borrowed string value. + #[inline] + pub fn new(name: Name<'a>, value: &'a str) -> Attribute<'a> { + Attribute { name, value, } + } +} + +/// An owned version of an XML attribute. +/// +/// Consists of an owned qualified name and an owned string value. +#[derive(Clone, Eq, PartialEq, Hash, Debug)] +pub struct OwnedAttribute { + /// Attribute name. + pub name: OwnedName, + + /// Attribute value. + pub value: String +} + +impl OwnedAttribute { + /// Returns a borrowed `Attribute` out of this owned one. + pub fn borrow(&self) -> Attribute { + Attribute { + name: self.name.borrow(), + value: &*self.value, + } + } + + /// Creates a new owned attribute using the provided owned name and an owned string value. 
+ #[inline] + pub fn new<S: Into<String>>(name: OwnedName, value: S) -> OwnedAttribute { + OwnedAttribute { + name, + value: value.into(), + } + } +} + +impl fmt::Display for OwnedAttribute { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}=\"{}\"", self.name, escape_str_attribute(&*self.value)) + } +} + +#[cfg(test)] +mod tests { + use super::{Attribute}; + + use name::Name; + + #[test] + fn attribute_display() { + let attr = Attribute::new( + Name::qualified("attribute", "urn:namespace", Some("n")), + "its value with > & \" ' < weird symbols" + ); + + assert_eq!( + &*attr.to_string(), + "{urn:namespace}n:attribute=\"its value with &gt; &amp; &quot; &apos; &lt; weird symbols\"" + ) + } +} diff --git a/third_party/rust/xml-rs/src/common.rs b/third_party/rust/xml-rs/src/common.rs new file mode 100644 index 0000000000..029e8515af --- /dev/null +++ b/third_party/rust/xml-rs/src/common.rs @@ -0,0 +1,142 @@ +//! Contains common types and functions used throughout the library. + +use std::fmt; + +/// Represents a position inside some textual document. 
+#[derive(Copy, Clone, PartialEq, Eq)] +pub struct TextPosition { + /// Row, counting from 0 + pub row: u64, + /// Column, counting from 0 + pub column: u64, +} + +impl TextPosition { + /// Creates a new position initialized to the beginning of the document + #[inline] + pub fn new() -> TextPosition { + TextPosition { row: 0, column: 0 } + } + + /// Advances the position in a line + #[inline] + pub fn advance(&mut self, count: u8) { + self.column += count as u64; + } + + /// Advances the position in a line to the next tab position + #[inline] + pub fn advance_to_tab(&mut self, width: u8) { + let width = width as u64; + self.column += width - self.column % width + } + + /// Advances the position to the beginning of the next line + #[inline] + pub fn new_line(&mut self) { + self.column = 0; + self.row += 1; + } +} + +impl fmt::Debug for TextPosition { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}:{}", self.row + 1, self.column + 1) + } +} + +impl fmt::Display for TextPosition { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}:{}", self.row + 1, self.column + 1) + } +} + +/// Get the position in the document corresponding to the object +/// +/// This trait is implemented by parsers, lexers and errors. +pub trait Position { + /// Returns the current position or a position corresponding to the object. + fn position(&self) -> TextPosition; +} + +impl Position for TextPosition { + #[inline] + fn position(&self) -> TextPosition { + *self + } +} + +/// XML version enumeration. +#[derive(Copy, Clone, PartialEq, Eq)] +pub enum XmlVersion { + /// XML version 1.0. + Version10, + + /// XML version 1.1. 
+ Version11 +} + +impl fmt::Display for XmlVersion { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + XmlVersion::Version10 => write!(f, "1.0"), + XmlVersion::Version11 => write!(f, "1.1") + } + } +} + +impl fmt::Debug for XmlVersion { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(self, f) + } +} + +/// Checks whether the given character is a white space character (`S`) +/// as is defined by XML 1.1 specification, [section 2.3][1]. +/// +/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn +pub fn is_whitespace_char(c: char) -> bool { + match c { + '\x20' | '\x09' | '\x0d' | '\x0a' => true, + _ => false + } +} + +/// Checks whether the given string is compound only by white space +/// characters (`S`) using the previous is_whitespace_char to check +/// all characters of this string +pub fn is_whitespace_str(s: &str) -> bool { + s.chars().all(is_whitespace_char) +} + +/// Checks whether the given character is a name start character (`NameStartChar`) +/// as is defined by XML 1.1 specification, [section 2.3][1]. +/// +/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn +pub fn is_name_start_char(c: char) -> bool { + match c { + ':' | 'A'...'Z' | '_' | 'a'...'z' | + '\u{C0}'...'\u{D6}' | '\u{D8}'...'\u{F6}' | '\u{F8}'...'\u{2FF}' | + '\u{370}'...'\u{37D}' | '\u{37F}'...'\u{1FFF}' | + '\u{200C}'...'\u{200D}' | '\u{2070}'...'\u{218F}' | + '\u{2C00}'...'\u{2FEF}' | '\u{3001}'...'\u{D7FF}' | + '\u{F900}'...'\u{FDCF}' | '\u{FDF0}'...'\u{FFFD}' | + '\u{10000}'...'\u{EFFFF}' => true, + _ => false + } +} + +/// Checks whether the given character is a name character (`NameChar`) +/// as is defined by XML 1.1 specification, [section 2.3][1]. +/// +/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn +pub fn is_name_char(c: char) -> bool { + match c { + _ if is_name_start_char(c) => true, + '-' | '.' 
| '0'...'9' | '\u{B7}' | + '\u{300}'...'\u{36F}' | '\u{203F}'...'\u{2040}' => true, + _ => false + } +} diff --git a/third_party/rust/xml-rs/src/escape.rs b/third_party/rust/xml-rs/src/escape.rs new file mode 100644 index 0000000000..18298b9be1 --- /dev/null +++ b/third_party/rust/xml-rs/src/escape.rs @@ -0,0 +1,126 @@ +//! Contains functions for performing XML special characters escaping. + +use std::borrow::Cow; + +enum Value { + Char(char), + Str(&'static str) +} + +impl Value { + fn dispatch_for_attribute(c: char) -> Value { + match c { + '<' => Value::Str("&lt;"), + '>' => Value::Str("&gt;"), + '"' => Value::Str("&quot;"), + '\'' => Value::Str("&apos;"), + '&' => Value::Str("&amp;"), + '\n' => Value::Str("&#xA;"), + '\r' => Value::Str("&#xD;"), + _ => Value::Char(c) + } + } + + fn dispatch_for_pcdata(c: char) -> Value { + match c { + '<' => Value::Str("&lt;"), + '&' => Value::Str("&amp;"), + _ => Value::Char(c) + } + } +} + +enum Process<'a> { + Borrowed(&'a str), + Owned(String) +} + +impl<'a> Process<'a> { + fn process(&mut self, (i, next): (usize, Value)) { + match next { + Value::Str(s) => match *self { + Process::Owned(ref mut o) => o.push_str(s), + Process::Borrowed(b) => { + let mut r = String::with_capacity(b.len() + s.len()); + r.push_str(&b[..i]); + r.push_str(s); + *self = Process::Owned(r); + } + }, + Value::Char(c) => match *self { + Process::Borrowed(_) => {} + Process::Owned(ref mut o) => o.push(c) + } + } + } + + fn into_result(self) -> Cow<'a, str> { + match self { + Process::Borrowed(b) => Cow::Borrowed(b), + Process::Owned(o) => Cow::Owned(o) + } + } +} + +impl<'a> Extend<(usize, Value)> for Process<'a> { + fn extend<I: IntoIterator<Item=(usize, Value)>>(&mut self, it: I) { + for v in it.into_iter() { + self.process(v); + } + } +} + +fn escape_str(s: &str, dispatch: fn(char) -> Value) -> Cow<str> { + let mut p = Process::Borrowed(s); + p.extend(s.char_indices().map(|(ind, c)| (ind, dispatch(c)))); + p.into_result() +} + +/// Performs escaping of common XML characters inside an attribute value. +/// +/// This function replaces several important markup characters with their +/// entity equivalents: +/// +/// * `<` → `&lt;` +/// * `>` → `&gt;` +/// * `"` → `&quot;` +/// * `'` → `&apos;` +/// * `&` → `&amp;` +/// +/// The resulting string is safe to use inside XML attribute values or in PCDATA sections. +/// +/// Does not perform allocations if the given string does not contain escapable characters. +#[inline] +pub fn escape_str_attribute(s: &str) -> Cow<str> { + escape_str(s, Value::dispatch_for_attribute) +} + +/// Performs escaping of common XML characters inside PCDATA. 
+/// +/// This function replaces several important markup characters with their +/// entity equivalents: +/// +/// * `<` → `&lt;` +/// * `&` → `&amp;` +/// +/// The resulting string is safe to use inside PCDATA sections but NOT inside attribute values. +/// +/// Does not perform allocations if the given string does not contain escapable characters. +#[inline] +pub fn escape_str_pcdata(s: &str) -> Cow<str> { + escape_str(s, Value::dispatch_for_pcdata) +} + +#[cfg(test)] +mod tests { + use super::{escape_str_pcdata, escape_str_attribute}; + + // TODO: add more tests + + #[test] + fn test_escape_multibyte_code_points() { + assert_eq!(escape_str_attribute("☃<"), "☃&lt;"); + assert_eq!(escape_str_pcdata("☃<"), "☃&lt;"); + } +} + diff --git a/third_party/rust/xml-rs/src/lib.rs b/third_party/rust/xml-rs/src/lib.rs new file mode 100644 index 0000000000..fb672efea0 --- /dev/null +++ b/third_party/rust/xml-rs/src/lib.rs @@ -0,0 +1,29 @@ +//#![warn(missing_doc)] +#![allow(dead_code)] +#![allow(unused_variables)] +#![forbid(non_camel_case_types)] +#![forbid(unsafe_code)] + +//! This crate currently provides an almost XML 1.0/1.1-compliant pull parser. + +#[cfg(doctest)] +#[macro_use] +extern crate doc_comment; + +#[cfg(doctest)] +doctest!("../Readme.md"); + +pub use reader::EventReader; +pub use reader::ParserConfig; +pub use writer::EventWriter; +pub use writer::EmitterConfig; + +pub mod macros; +pub mod name; +pub mod attribute; +pub mod common; +pub mod escape; +pub mod namespace; +pub mod reader; +pub mod writer; +mod util; diff --git a/third_party/rust/xml-rs/src/macros.rs b/third_party/rust/xml-rs/src/macros.rs new file mode 100644 index 0000000000..1cce3d6a5e --- /dev/null +++ b/third_party/rust/xml-rs/src/macros.rs @@ -0,0 +1,30 @@ +#![macro_use] + +//! Contains several macros used in this crate. + +macro_rules! gen_setter { + ($target:ty, $field:ident : into $t:ty) => { + impl $target { + /// Sets the field to the provided value and returns updated config object. 
+ pub fn $field<T: Into<$t>>(mut self, value: T) -> $target { + self.$field = value.into(); + self + } + } + }; + ($target:ty, $field:ident : val $t:ty) => { + impl $target { + /// Sets the field to the provided value and returns updated config object. + pub fn $field(mut self, value: $t) -> $target { + self.$field = value; + self + } + } + } +} + +macro_rules! gen_setters { + ($target:ty, $($field:ident : $k:tt $tpe:ty),+) => ($( + gen_setter! { $target, $field : $k $tpe } + )+) +} diff --git a/third_party/rust/xml-rs/src/name.rs b/third_party/rust/xml-rs/src/name.rs new file mode 100644 index 0000000000..a20eae2f10 --- /dev/null +++ b/third_party/rust/xml-rs/src/name.rs @@ -0,0 +1,301 @@ +//! Contains XML qualified names manipulation types and functions. +//! + +use std::fmt; +use std::str::FromStr; + +use namespace::NS_NO_PREFIX; + +/// Represents a qualified XML name. +/// +/// A qualified name always consists at least of a local name. It can optionally contain +/// a prefix; when reading an XML document, if it contains a prefix, it must also contain a +/// namespace URI, but this is not enforced statically; see below. The name can contain a +/// namespace without a prefix; in that case a default, empty prefix is assumed. +/// +/// When writing XML documents, it is possible to omit the namespace URI, leaving only +/// the prefix. In this case the writer will check that the specifed prefix is bound to some +/// URI in the current namespace context. If both prefix and namespace URI are specified, +/// it is checked that the current namespace context contains this exact correspondence +/// between prefix and namespace URI. +/// +/// # Prefixes and URIs +/// +/// A qualified name with a prefix must always contain a proper namespace URI --- names with +/// a prefix but without a namespace associated with that prefix are meaningless. 
However, +/// it is impossible to obtain proper namespace URI by a prefix without a context, and such +/// context is only available when parsing a document (or it can be constructed manually +/// when writing a document). Tying a name to a context statically seems impractical. This +/// may change in future, though. +/// +/// # Conversions +/// +/// `Name` implements some `From` instances for conversion from strings and tuples. For example: +/// +/// ```rust +/// # use xml::name::Name; +/// let n1: Name = "p:some-name".into(); +/// let n2: Name = ("p", "some-name").into(); +/// +/// assert_eq!(n1, n2); +/// assert_eq!(n1.local_name, "some-name"); +/// assert_eq!(n1.prefix, Some("p")); +/// assert!(n1.namespace.is_none()); +/// ``` +/// +/// This is added to support easy specification of XML elements when writing XML documents. +#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] +pub struct Name<'a> { + /// A local name, e.g. `string` in `xsi:string`. + pub local_name: &'a str, + + /// A namespace URI, e.g. `http://www.w3.org/2000/xmlns/`. + pub namespace: Option<&'a str>, + + /// A name prefix, e.g. `xsi` in `xsi:string`. + pub prefix: Option<&'a str> +} + +impl<'a> From<&'a str> for Name<'a> { + fn from(s: &'a str) -> Name<'a> { + let mut parts = s.splitn(2, ":").fuse(); + match (parts.next(), parts.next()) { + (Some(name), None) => Name::local(name), + (Some(prefix), Some(name)) => Name::prefixed(name, prefix), + _ => unreachable!() + } + } +} + +impl<'a> From<(&'a str, &'a str)> for Name<'a> { + fn from((prefix, name): (&'a str, &'a str)) -> Name<'a> { + Name::prefixed(name, prefix) + } +} + +impl<'a> fmt::Display for Name<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if let Some(namespace) = self.namespace { + write!(f, "{{{}}}", namespace)?; + } + + if let Some(prefix) = self.prefix { + write!(f, "{}:", prefix)?; + } + + write!(f, "{}", self.local_name) + } +} + +impl<'a> Name<'a> { + /// Returns an owned variant of the qualified name. 
+ pub fn to_owned(&self) -> OwnedName { + OwnedName { + local_name: self.local_name.into(), + namespace: self.namespace.map(|s| s.into()), + prefix: self.prefix.map(|s| s.into()) + } + } + + /// Returns a new `Name` instance representing plain local name. + #[inline] + pub fn local(local_name: &str) -> Name { + Name { + local_name, + prefix: None, + namespace: None + } + } + + /// Returns a new `Name` instance with the given local name and prefix. + #[inline] + pub fn prefixed(local_name: &'a str, prefix: &'a str) -> Name<'a> { + Name { + local_name, + namespace: None, + prefix: Some(prefix) + } + } + + /// Returns a new `Name` instance representing a qualified name with or without a prefix and + /// with a namespace URI. + #[inline] + pub fn qualified(local_name: &'a str, namespace: &'a str, prefix: Option<&'a str>) -> Name<'a> { + Name { + local_name, + namespace: Some(namespace), + prefix, + } + } + + /// Returns a correct XML representation of this local name and prefix. + /// + /// This method is different from the autoimplemented `to_string()` because it does not + /// include namespace URI in the result. + pub fn to_repr(&self) -> String { + self.repr_display().to_string() + } + + /// Returns a structure which can be displayed with `std::fmt` machinery to obtain this + /// local name and prefix. + /// + /// This method is needed for efficiency purposes in order not to create unnecessary + /// allocations. + #[inline] + pub fn repr_display(&self) -> ReprDisplay { + ReprDisplay(self) + } + + /// Returns either a prefix of this name or `namespace::NS_NO_PREFIX` constant. + #[inline] + pub fn prefix_repr(&self) -> &str { + self.prefix.unwrap_or(NS_NO_PREFIX) + } +} + +/// A wrapper around `Name` whose `Display` implementation prints the wrapped name as it is +/// displayed in an XML document. 
+pub struct ReprDisplay<'a, 'b:'a>(&'a Name<'b>); + +impl<'a, 'b:'a> fmt::Display for ReprDisplay<'a, 'b> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.0.prefix { + Some(prefix) => write!(f, "{}:{}", prefix, self.0.local_name), + None => write!(f, "{}", self.0.local_name) + } + } +} + +/// An owned variant of `Name`. +/// +/// Everything about `Name` applies to this structure as well. +#[derive(Clone, PartialEq, Eq, Hash, Debug)] +pub struct OwnedName { + /// A local name, e.g. `string` in `xsi:string`. + pub local_name: String, + + /// A namespace URI, e.g. `http://www.w3.org/2000/xmlns/`. + pub namespace: Option<String>, + + /// A name prefix, e.g. `xsi` in `xsi:string`. + pub prefix: Option<String>, +} + +impl fmt::Display for OwnedName { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(&self.borrow(), f) + } +} + +impl OwnedName { + /// Constructs a borrowed `Name` based on this owned name. + pub fn borrow(&self) -> Name { + Name { + local_name: &*self.local_name, + namespace: self.namespace.as_ref().map(|s| &**s), + prefix: self.prefix.as_ref().map(|s| &**s), + } + } + + /// Returns a new `OwnedName` instance representing a plain local name. + #[inline] + pub fn local<S>(local_name: S) -> OwnedName where S: Into<String> { + OwnedName { + local_name: local_name.into(), + namespace: None, + prefix: None, + } + } + + /// Returns a new `OwnedName` instance representing a qualified name with or without + /// a prefix and with a namespace URI. + #[inline] + pub fn qualified<S1, S2, S3>(local_name: S1, namespace: S2, prefix: Option<S3>) -> OwnedName + where S1: Into<String>, S2: Into<String>, S3: Into<String> + { + OwnedName { + local_name: local_name.into(), + namespace: Some(namespace.into()), + prefix: prefix.map(|v| v.into()) + } + } + + /// Returns an optional prefix by reference, equivalent to `self.borrow().prefix` + /// but avoids extra work. 
+ #[inline] + pub fn prefix_ref(&self) -> Option<&str> { + self.prefix.as_ref().map(|s| &**s) + } + + /// Returns an optional namespace by reference, equivalen to `self.borrow().namespace` + /// but avoids extra work. + #[inline] + pub fn namespace_ref(&self) -> Option<&str> { + self.namespace.as_ref().map(|s| &**s) + } +} + +impl<'a> From<Name<'a>> for OwnedName { + #[inline] + fn from(n: Name<'a>) -> OwnedName { + n.to_owned() + } +} + +impl FromStr for OwnedName { + type Err = (); + + /// Parses the given string slice into a qualified name. + /// + /// This function, when finishes sucessfully, always return a qualified + /// name without a namespace (`name.namespace == None`). It should be filled later + /// using proper `NamespaceStack`. + /// + /// It is supposed that all characters in the argument string are correct + /// as defined by the XML specification. No additional checks except a check + /// for emptiness are done. + fn from_str(s: &str) -> Result<OwnedName, ()> { + let mut it = s.split(':'); + + let r = match (it.next(), it.next(), it.next()) { + (Some(prefix), Some(local_name), None) if !prefix.is_empty() && + !local_name.is_empty() => + Some((local_name.into(), Some(prefix.into()))), + (Some(local_name), None, None) if !local_name.is_empty() => + Some((local_name.into(), None)), + (_, _, _) => None + }; + r.map(|(local_name, prefix)| OwnedName { + local_name, + namespace: None, + prefix + }).ok_or(()) + } +} + +#[cfg(test)] +mod tests { + use super::OwnedName; + + #[test] + fn test_owned_name_from_str() { + assert_eq!("prefix:name".parse(), Ok(OwnedName { + local_name: "name".into(), + namespace: None, + prefix: Some("prefix".into()) + })); + + assert_eq!("name".parse(), Ok(OwnedName { + local_name: "name".into(), + namespace: None, + prefix: None + })); + + assert_eq!("".parse(), Err::<OwnedName, ()>(())); + assert_eq!(":".parse(), Err::<OwnedName, ()>(())); + assert_eq!(":a".parse(), Err::<OwnedName, ()>(())); + assert_eq!("a:".parse(), 
Err::<OwnedName, ()>(())); + assert_eq!("a:b:c".parse(), Err::<OwnedName, ()>(())); + } +} diff --git a/third_party/rust/xml-rs/src/namespace.rs b/third_party/rust/xml-rs/src/namespace.rs new file mode 100644 index 0000000000..1ab4a5c025 --- /dev/null +++ b/third_party/rust/xml-rs/src/namespace.rs @@ -0,0 +1,485 @@ +//! Contains namespace manipulation types and functions. + +use std::iter::{Map, Rev}; +use std::collections::btree_map::{BTreeMap, Entry}; +use std::collections::btree_map::Iter as Entries; +use std::collections::HashSet; +use std::slice::Iter; + +/// Designates prefix for namespace definitions. +/// +/// See [Namespaces in XML][namespace] spec for more information. +/// +/// [namespace]: http://www.w3.org/TR/xml-names/#ns-decl +pub const NS_XMLNS_PREFIX: &'static str = "xmlns"; + +/// Designates the standard URI for `xmlns` prefix. +/// +/// See [A Namespace Name for xmlns Attributes][1] for more information. +/// +/// [namespace]: http://www.w3.org/2000/xmlns/ +pub const NS_XMLNS_URI: &'static str = "http://www.w3.org/2000/xmlns/"; + +/// Designates prefix for a namespace containing several special predefined attributes. +/// +/// See [2.10 White Space handling][1], [2.1 Language Identification][2], +/// [XML Base specification][3] and [xml:id specification][4] for more information. +/// +/// [1]: http://www.w3.org/TR/REC-xml/#sec-white-space +/// [2]: http://www.w3.org/TR/REC-xml/#sec-lang-tag +/// [3]: http://www.w3.org/TR/xmlbase/ +/// [4]: http://www.w3.org/TR/xml-id/ +pub const NS_XML_PREFIX: &'static str = "xml"; + +/// Designates the standard URI for `xml` prefix. +/// +/// See `NS_XML_PREFIX` documentation for more information. +pub const NS_XML_URI: &'static str = "http://www.w3.org/XML/1998/namespace"; + +/// Designates the absence of prefix in a qualified name. +/// +/// This constant should be used to define or query default namespace which should be used +/// for element or attribute names without prefix. 
For example, if a namespace mapping +/// at a particular point in the document contains correspondence like +/// +/// ```none +/// NS_NO_PREFIX --> urn:some:namespace +/// ``` +/// +/// then all names declared without an explicit prefix `urn:some:namespace` is assumed as +/// a namespace URI. +/// +/// By default empty prefix corresponds to absence of namespace, but this can change either +/// when writing an XML document (manually) or when reading an XML document (based on namespace +/// declarations). +pub const NS_NO_PREFIX: &'static str = ""; + +/// Designates an empty namespace URI, which is equivalent to absence of namespace. +/// +/// This constant should not usually be used directly; it is used to designate that +/// empty prefix corresponds to absent namespace in `NamespaceStack` instances created with +/// `NamespaceStack::default()`. Therefore, it can be used to restore `NS_NO_PREFIX` mapping +/// in a namespace back to its default value. +pub const NS_EMPTY_URI: &'static str = ""; + +/// Namespace is a map from prefixes to namespace URIs. +/// +/// No prefix (i.e. default namespace) is designated by `NS_NO_PREFIX` constant. +#[derive(PartialEq, Eq, Clone, Debug)] +pub struct Namespace(pub BTreeMap<String, String>); + +impl Namespace { + /// Returns an empty namespace. + #[inline] + pub fn empty() -> Namespace { Namespace(BTreeMap::new()) } + + /// Checks whether this namespace is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Checks whether this namespace is essentially empty, that is, it does not contain + /// anything but default mappings. 
+ pub fn is_essentially_empty(&self) -> bool { + // a shortcut for a namespace which is definitely not empty + if self.0.len() > 3 { return false; } + + self.0.iter().all(|(k, v)| match (&**k, &**v) { + (NS_NO_PREFIX, NS_EMPTY_URI) => true, + (NS_XMLNS_PREFIX, NS_XMLNS_URI) => true, + (NS_XML_PREFIX, NS_XML_URI) => true, + _ => false + }) + } + + /// Checks whether this namespace mapping contains the given prefix. + /// + /// # Parameters + /// * `prefix` --- namespace prefix. + /// + /// # Return value + /// `true` if this namespace contains the given prefix, `false` otherwise. + #[inline] + pub fn contains<P: ?Sized+AsRef<str>>(&self, prefix: &P) -> bool { + self.0.contains_key(prefix.as_ref()) + } + + /// Puts a mapping into this namespace. + /// + /// This method does not override any already existing mappings. + /// + /// Returns a boolean flag indicating whether the map already contained + /// the given prefix. + /// + /// # Parameters + /// * `prefix` --- namespace prefix; + /// * `uri` --- namespace URI. + /// + /// # Return value + /// `true` if `prefix` has been inserted successfully; `false` if the `prefix` + /// was already present in the namespace. + pub fn put<P, U>(&mut self, prefix: P, uri: U) -> bool + where P: Into<String>, U: Into<String> + { + match self.0.entry(prefix.into()) { + Entry::Occupied(_) => false, + Entry::Vacant(ve) => { + ve.insert(uri.into()); + true + } + } + } + + /// Puts a mapping into this namespace forcefully. + /// + /// This method, unlike `put()`, does replace an already existing mapping. + /// + /// Returns previous URI which was assigned to the given prefix, if it is present. + /// + /// # Parameters + /// * `prefix` --- namespace prefix; + /// * `uri` --- namespace URI. + /// + /// # Return value + /// `Some(uri)` with `uri` being a previous URI assigned to the `prefix`, or + /// `None` if such prefix was not present in the namespace before. 
+ pub fn force_put<P, U>(&mut self, prefix: P, uri: U) -> Option<String> + where P: Into<String>, U: Into<String> + { + self.0.insert(prefix.into(), uri.into()) + } + + /// Queries the namespace for the given prefix. + /// + /// # Parameters + /// * `prefix` --- namespace prefix. + /// + /// # Return value + /// Namespace URI corresponding to the given prefix, if it is present. + pub fn get<'a, P: ?Sized+AsRef<str>>(&'a self, prefix: &P) -> Option<&'a str> { + self.0.get(prefix.as_ref()).map(|s| &**s) + } +} + +/// An alias for iterator type for namespace mappings contained in a namespace. +pub type NamespaceMappings<'a> = Map< + Entries<'a, String, String>, + for<'b> fn((&'b String, &'b String)) -> UriMapping<'b> +>; + +impl<'a> IntoIterator for &'a Namespace { + type Item = UriMapping<'a>; + type IntoIter = NamespaceMappings<'a>; + + fn into_iter(self) -> Self::IntoIter { + fn mapper<'a>((prefix, uri): (&'a String, &'a String)) -> UriMapping<'a> { + (&*prefix, &*uri) + } + self.0.iter().map(mapper) + } +} + +/// Namespace stack is a sequence of namespaces. +/// +/// Namespace stack is used to represent cumulative namespace consisting of +/// combined namespaces from nested elements. +#[derive(Clone, Eq, PartialEq, Debug)] +pub struct NamespaceStack(pub Vec<Namespace>); + +impl NamespaceStack { + /// Returns an empty namespace stack. + #[inline] + pub fn empty() -> NamespaceStack { NamespaceStack(Vec::with_capacity(2)) } + + /// Returns a namespace stack with default items in it. + /// + /// Default items are the following: + /// + /// * `xml` → `http://www.w3.org/XML/1998/namespace`; + /// * `xmlns` → `http://www.w3.org/2000/xmlns/`. 
+ #[inline] + pub fn default() -> NamespaceStack { + let mut nst = NamespaceStack::empty(); + nst.push_empty(); + // xml namespace + nst.put(NS_XML_PREFIX, NS_XML_URI); + // xmlns namespace + nst.put(NS_XMLNS_PREFIX, NS_XMLNS_URI); + // empty namespace + nst.put(NS_NO_PREFIX, NS_EMPTY_URI); + nst + } + + /// Adds an empty namespace to the top of this stack. + #[inline] + pub fn push_empty(&mut self) -> &mut NamespaceStack { + self.0.push(Namespace::empty()); + self + } + + /// Removes the topmost namespace in this stack. + /// + /// Panics if the stack is empty. + #[inline] + pub fn pop(&mut self) -> Namespace { + self.0.pop().unwrap() + } + + /// Removes the topmost namespace in this stack. + /// + /// Returns `Some(namespace)` if this stack is not empty and `None` otherwise. + #[inline] + pub fn try_pop(&mut self) -> Option<Namespace> { + self.0.pop() + } + + /// Borrows the topmost namespace mutably, leaving the stack intact. + /// + /// Panics if the stack is empty. + #[inline] + pub fn peek_mut(&mut self) -> &mut Namespace { + self.0.last_mut().unwrap() + } + + /// Borrows the topmost namespace immutably, leaving the stack intact. + /// + /// Panics if the stack is empty. + #[inline] + pub fn peek(&self) -> &Namespace { + self.0.last().unwrap() + } + + /// Puts a mapping into the topmost namespace if this stack does not already contain one. + /// + /// Returns a boolean flag indicating whether the insertion has completed successfully. + /// Note that both key and value are matched and the mapping is inserted if either + /// namespace prefix is not already mapped, or if it is mapped, but to a different URI. + /// + /// # Parameters + /// * `prefix` --- namespace prefix; + /// * `uri` --- namespace URI. + /// + /// # Return value + /// `true` if `prefix` has been inserted successfully; `false` if the `prefix` + /// was already present in the namespace stack. 
+ pub fn put_checked<P, U>(&mut self, prefix: P, uri: U) -> bool + where P: Into<String> + AsRef<str>, + U: Into<String> + AsRef<str> + { + if self.0.iter().any(|ns| ns.get(&prefix) == Some(uri.as_ref())) { + false + } else { + self.put(prefix, uri); + true + } + } + + /// Puts a mapping into the topmost namespace in this stack. + /// + /// This method does not override a mapping in the topmost namespace if it is + /// already present, however, it does not depend on other namespaces in the stack, + /// so it is possible to put a mapping which is present in lower namespaces. + /// + /// Returns a boolean flag indicating whether the insertion has completed successfully. + /// + /// # Parameters + /// * `prefix` --- namespace prefix; + /// * `uri` --- namespace URI. + /// + /// # Return value + /// `true` if `prefix` has been inserted successfully; `false` if the `prefix` + /// was already present in the namespace. + #[inline] + pub fn put<P, U>(&mut self, prefix: P, uri: U) -> bool + where P: Into<String>, U: Into<String> + { + self.0.last_mut().unwrap().put(prefix, uri) + } + + /// Performs a search for the given prefix in the whole stack. + /// + /// This method walks the stack from top to bottom, querying each namespace + /// in order for the given prefix. If none of the namespaces contains the prefix, + /// `None` is returned. + /// + /// # Parameters + /// * `prefix` --- namespace prefix. + #[inline] + pub fn get<'a, P: ?Sized+AsRef<str>>(&'a self, prefix: &P) -> Option<&'a str> { + let prefix = prefix.as_ref(); + for ns in self.0.iter().rev() { + match ns.get(prefix) { + None => {}, + r => return r, + } + } + None + } + + /// Combines this stack of namespaces into a single namespace. + /// + /// Namespaces are combined in left-to-right order, that is, rightmost namespace + /// elements take priority over leftmost ones. 
+ pub fn squash(&self) -> Namespace { + let mut result = BTreeMap::new(); + for ns in self.0.iter() { + result.extend(ns.0.iter().map(|(k, v)| (k.clone(), v.clone()))); + } + Namespace(result) + } + + /// Returns an object which implements `Extend` using `put_checked()` instead of `put()`. + /// + /// See `CheckedTarget` for more information. + #[inline] + pub fn checked_target(&mut self) -> CheckedTarget { + CheckedTarget(self) + } + + /// Returns an iterator over all mappings in this namespace stack. + #[inline] + pub fn iter(&self) -> NamespaceStackMappings { + self.into_iter() + } +} + +/// An iterator over mappings from prefixes to URIs in a namespace stack. +/// +/// # Example +/// ``` +/// # use xml::namespace::NamespaceStack; +/// let mut nst = NamespaceStack::empty(); +/// nst.push_empty(); +/// nst.put("a", "urn:A"); +/// nst.put("b", "urn:B"); +/// nst.push_empty(); +/// nst.put("c", "urn:C"); +/// +/// assert_eq!(vec![("c", "urn:C"), ("a", "urn:A"), ("b", "urn:B")], nst.iter().collect::<Vec<_>>()); +/// ``` +pub struct NamespaceStackMappings<'a> { + namespaces: Rev<Iter<'a, Namespace>>, + current_namespace: Option<NamespaceMappings<'a>>, + used_keys: HashSet<&'a str> +} + +impl<'a> NamespaceStackMappings<'a> { + fn go_to_next_namespace(&mut self) -> bool { + self.current_namespace = self.namespaces.next().map(|ns| ns.into_iter()); + self.current_namespace.is_some() + } +} + +impl<'a> Iterator for NamespaceStackMappings<'a> { + type Item = UriMapping<'a>; + + fn next(&mut self) -> Option<UriMapping<'a>> { + // If there is no current namespace and no next namespace, we're finished + if self.current_namespace.is_none() && !self.go_to_next_namespace() { + return None; + } + let next_item = self.current_namespace.as_mut().unwrap().next(); + + match next_item { + // There is an element in the current namespace + Some((k, v)) => if self.used_keys.contains(&k) { + // If the current key is used, go to the next one + self.next() + } else { + // Otherwise insert 
the current key to the set of used keys and + // return the mapping + self.used_keys.insert(k); + Some((k, v)) + }, + // Current namespace is exhausted + None => if self.go_to_next_namespace() { + // If there is next namespace, continue from it + self.next() + } else { + // No next namespace, exiting + None + } + } + } +} + +impl<'a> IntoIterator for &'a NamespaceStack { + type Item = UriMapping<'a>; + type IntoIter = NamespaceStackMappings<'a>; + + fn into_iter(self) -> Self::IntoIter { + NamespaceStackMappings { + namespaces: self.0.iter().rev(), + current_namespace: None, + used_keys: HashSet::new() + } + } +} + +/// A type alias for a pair of `(prefix, uri)` values returned by namespace iterators. +pub type UriMapping<'a> = (&'a str, &'a str); + +impl<'a> Extend<UriMapping<'a>> for Namespace { + fn extend<T>(&mut self, iterable: T) where T: IntoIterator<Item=UriMapping<'a>> { + for (prefix, uri) in iterable { + self.put(prefix, uri); + } + } +} + +impl<'a> Extend<UriMapping<'a>> for NamespaceStack { + fn extend<T>(&mut self, iterable: T) where T: IntoIterator<Item=UriMapping<'a>> { + for (prefix, uri) in iterable { + self.put(prefix, uri); + } + } +} + +/// A wrapper around `NamespaceStack` which implements `Extend` using `put_checked()`. 
+/// +/// # Example +/// +/// ``` +/// # use xml::namespace::NamespaceStack; +/// +/// let mut nst = NamespaceStack::empty(); +/// nst.push_empty(); +/// nst.put("a", "urn:A"); +/// nst.put("b", "urn:B"); +/// nst.push_empty(); +/// nst.put("c", "urn:C"); +/// +/// nst.checked_target().extend(vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:Y"), ("d", "urn:D")]); +/// assert_eq!( +/// vec![("a", "urn:Z"), ("c", "urn:C"), ("d", "urn:D"), ("b", "urn:B")], +/// nst.iter().collect::<Vec<_>>() +/// ); +/// ``` +/// +/// Compare: +/// +/// ``` +/// # use xml::namespace::NamespaceStack; +/// # let mut nst = NamespaceStack::empty(); +/// # nst.push_empty(); +/// # nst.put("a", "urn:A"); +/// # nst.put("b", "urn:B"); +/// # nst.push_empty(); +/// # nst.put("c", "urn:C"); +/// +/// nst.extend(vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:Y"), ("d", "urn:D")]); +/// assert_eq!( +/// vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:C"), ("d", "urn:D")], +/// nst.iter().collect::<Vec<_>>() +/// ); +/// ``` +pub struct CheckedTarget<'a>(&'a mut NamespaceStack); + +impl<'a, 'b> Extend<UriMapping<'b>> for CheckedTarget<'a> { + fn extend<T>(&mut self, iterable: T) where T: IntoIterator<Item=UriMapping<'b>> { + for (prefix, uri) in iterable { + self.0.put_checked(prefix, uri); + } + } +} diff --git a/third_party/rust/xml-rs/src/reader/config.rs b/third_party/rust/xml-rs/src/reader/config.rs new file mode 100644 index 0000000000..0abb165cf4 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/config.rs @@ -0,0 +1,181 @@ +//! Contains parser configuration structure. +use std::io::Read; +use std::collections::HashMap; + +use reader::EventReader; + +/// Parser configuration structure. +/// +/// This structure contains various configuration options which affect +/// behavior of the parser. +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct ParserConfig { + /// Whether or not should whitespace in textual events be removed. Default is false. 
+ /// + /// When true, all standalone whitespace will be removed (this means no + /// `Whitespace` events will be emitted), and leading and trailing whitespace + /// from `Character` events will be deleted. If after trimming `Characters` + /// event will be empty, it will also be omitted from output stream. This is + /// possible, however, only if `whitespace_to_characters` or + /// `cdata_to_characters` options are set. + /// + /// This option does not affect CDATA events, unless `cdata_to_characters` + /// option is also set. In that case CDATA content will also be trimmed. + pub trim_whitespace: bool, + + /// Whether or not should whitespace be converted to characters. + /// Default is false. + /// + /// If true, instead of `Whitespace` events `Characters` events with the + /// same content will be emitted. If `trim_whitespace` is also true, these + /// events will be trimmed to nothing and, consequently, not emitted. + pub whitespace_to_characters: bool, + + /// Whether or not should CDATA be converted to characters. + /// Default is false. + /// + /// If true, instead of `CData` events `Characters` events with the same + /// content will be emitted. If `trim_whitespace` is also true, these events + /// will be trimmed. If corresponding CDATA contained nothing but whitespace, + /// this event will be omitted from the stream. + pub cdata_to_characters: bool, + + /// Whether or not should comments be omitted. Default is true. + /// + /// If true, `Comment` events will not be emitted at all. + pub ignore_comments: bool, + + /// Whether or not should sequential `Characters` events be merged. + /// Default is true. + /// + /// If true, multiple sequential `Characters` events will be merged into + /// a single event, that is, their data will be concatenated. + /// + /// Multiple sequential `Characters` events are only possible if either + /// `cdata_to_characters` or `ignore_comments` are set. Otherwise character + /// events will always be separated by other events. 
+ pub coalesce_characters: bool, + + /// A map of extra entities recognized by the parser. Default is an empty map. + /// + /// By default the XML parser recognizes the entities defined in the XML spec. Sometimes, + /// however, it is convenient to make the parser recognize additional entities which + /// are also not available through the DTD definitions (especially given that at the moment + /// DTD parsing is not supported). + pub extra_entities: HashMap<String, String>, + + /// Whether or not the parser should ignore the end of stream. Default is false. + /// + /// By default the parser will either error out when it encounters a premature end of + /// stream or complete normally if the end of stream was expected. If you want to continue + /// reading from a stream whose input is supplied progressively, you can set this option to true. + /// In this case the parser will allow you to invoke the next() method even if a supposed end + /// of stream has happened. + /// + /// Note that support for this functionality is incomplete; for example, the parser will fail if + /// the premature end of stream happens inside PCDATA. Therefore, use this option at your own risk. + pub ignore_end_of_stream: bool, + + /// Whether or not non-unicode entity references get replaced with the replacement character + /// + /// When true, any decimal or hexadecimal character reference that cannot be converted from a + /// u32 to a char using [std::char::from_u32](https://doc.rust-lang.org/std/char/fn.from_u32.html) + /// will be converted into the unicode REPLACEMENT CHARACTER (U+FFFD). + pub replace_unknown_entity_references: bool, + + /// Whether or not whitespace at the root level of the document is ignored. Default is true. + /// + /// By default any whitespace that is not enclosed within at least one level of elements will be + /// ignored. Setting this value to false will cause root level whitespace events to be emitted. 
+ pub ignore_root_level_whitespace: bool, +} + +impl ParserConfig { + /// Returns a new config with default values. + /// + /// You can tweak default values using builder-like pattern: + /// + /// ```rust + /// use xml::reader::ParserConfig; + /// + /// let config = ParserConfig::new() + /// .trim_whitespace(true) + /// .ignore_comments(true) + /// .coalesce_characters(false); + /// ``` + pub fn new() -> ParserConfig { + ParserConfig { + trim_whitespace: false, + whitespace_to_characters: false, + cdata_to_characters: false, + ignore_comments: true, + coalesce_characters: true, + extra_entities: HashMap::new(), + ignore_end_of_stream: false, + replace_unknown_entity_references: false, + ignore_root_level_whitespace: true, + } + } + + /// Creates an XML reader with this configuration. + /// + /// This is a convenience method for configuring and creating a reader at the same time: + /// + /// ```rust + /// use xml::reader::ParserConfig; + /// + /// let mut source: &[u8] = b"..."; + /// + /// let reader = ParserConfig::new() + /// .trim_whitespace(true) + /// .ignore_comments(true) + /// .coalesce_characters(false) + /// .create_reader(&mut source); + /// ``` + /// + /// This method is exactly equivalent to calling `EventReader::new_with_config()` with + /// this configuration object. + #[inline] + pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> { + EventReader::new_with_config(source, self) + } + + /// Adds a new entity mapping and returns an updated config object. + /// + /// This is a convenience method for adding external entities mappings to the XML parser. 
+ /// An example: + /// + /// ```rust + /// use xml::reader::ParserConfig; + /// + /// let mut source: &[u8] = b"..."; + /// + /// let reader = ParserConfig::new() + /// .add_entity("nbsp", " ") + /// .add_entity("copy", "©") + /// .add_entity("reg", "®") + /// .create_reader(&mut source); + /// ``` + pub fn add_entity<S: Into<String>, T: Into<String>>(mut self, entity: S, value: T) -> ParserConfig { + self.extra_entities.insert(entity.into(), value.into()); + self + } +} + +impl Default for ParserConfig { + #[inline] + fn default() -> ParserConfig { + ParserConfig::new() + } +} + +gen_setters! { ParserConfig, + trim_whitespace: val bool, + whitespace_to_characters: val bool, + cdata_to_characters: val bool, + ignore_comments: val bool, + coalesce_characters: val bool, + ignore_end_of_stream: val bool, + replace_unknown_entity_references: val bool, + ignore_root_level_whitespace: val bool +} diff --git a/third_party/rust/xml-rs/src/reader/error.rs b/third_party/rust/xml-rs/src/reader/error.rs new file mode 100644 index 0000000000..92378e6373 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/error.rs @@ -0,0 +1,121 @@ + +use std::io; +use std::borrow::Cow; +use std::fmt; +use std::error; +use std::str; + +use util; +use common::{Position, TextPosition}; + +#[derive(Debug)] +pub enum ErrorKind { + Syntax(Cow<'static, str>), + Io(io::Error), + Utf8(str::Utf8Error), + UnexpectedEof, +} + +/// An XML parsing error. +/// +/// Consists of a 2D position in a document and a textual message describing the error. +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct Error { + pos: TextPosition, + kind: ErrorKind, +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{} {}", self.pos, self.msg()) + } +} + +impl Position for Error { + #[inline] + fn position(&self) -> TextPosition { self.pos } +} + +impl Error { + /// Returns a reference to a message which is contained inside this error. 
+ #[inline] + pub fn msg(&self) -> &str { + use self::ErrorKind::*; + match self.kind { + UnexpectedEof => &"Unexpected EOF", + Utf8(ref reason) => error_description(reason), + Io(ref io_error) => error_description(io_error), + Syntax(ref msg) => msg.as_ref(), + } + } + + pub fn kind(&self) -> &ErrorKind { &self.kind } +} + +impl error::Error for Error { + #[inline] + fn description(&self) -> &str { self.msg() } +} + +impl<'a, P, M> From<(&'a P, M)> for Error where P: Position, M: Into<Cow<'static, str>> { + fn from(orig: (&'a P, M)) -> Self { + Error{ + pos: orig.0.position(), + kind: ErrorKind::Syntax(orig.1.into()) + } + } +} + +impl From<util::CharReadError> for Error { + fn from(e: util::CharReadError) -> Self { + use util::CharReadError::*; + Error{ + pos: TextPosition::new(), + kind: match e { + UnexpectedEof => ErrorKind::UnexpectedEof, + Utf8(reason) => ErrorKind::Utf8(reason), + Io(io_error) => ErrorKind::Io(io_error), + } + } + } +} + +impl From<io::Error> for Error { + fn from(e: io::Error) -> Self { + Error { + pos: TextPosition::new(), + kind: ErrorKind::Io(e), + } + } +} + +impl Clone for ErrorKind { + fn clone(&self) -> Self { + use self::ErrorKind::*; + match *self { + UnexpectedEof => UnexpectedEof, + Utf8(ref reason) => Utf8(reason.clone()), + Io(ref io_error) => Io(io::Error::new(io_error.kind(), error_description(io_error))), + Syntax(ref msg) => Syntax(msg.clone()), + } + } +} +impl PartialEq for ErrorKind { + fn eq(&self, other: &ErrorKind) -> bool { + use self::ErrorKind::*; + match (self, other) { + (&UnexpectedEof, &UnexpectedEof) => true, + (&Utf8(ref left), &Utf8(ref right)) => left == right, + (&Io(ref left), &Io(ref right)) => + left.kind() == right.kind() && + error_description(left) == error_description(right), + (&Syntax(ref left), &Syntax(ref right)) => + left == right, + + (_, _) => false, + } + } +} +impl Eq for ErrorKind {} + +fn error_description(e: &error::Error) -> &str { e.description() } diff --git 
a/third_party/rust/xml-rs/src/reader/events.rs b/third_party/rust/xml-rs/src/reader/events.rs new file mode 100644 index 0000000000..46d7621a87 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/events.rs @@ -0,0 +1,219 @@ +//! Contains `XmlEvent` datatype, instances of which are emitted by the parser. + +use std::fmt; +use std::borrow::Cow; + +use name::OwnedName; +use attribute::OwnedAttribute; +use common::XmlVersion; +use namespace::Namespace; + +/// An element of an XML input stream. +/// +/// Items of this enum are emitted by `reader::EventReader`. They correspond to different +/// elements of an XML document. +#[derive(PartialEq, Clone)] +pub enum XmlEvent { + /// Corresponds to XML document declaration. + /// + /// This event is always emitted before any other event. It is emitted + /// even if the actual declaration is not present in the document. + StartDocument { + /// XML version. + /// + /// If XML declaration is not present, defaults to `Version10`. + version: XmlVersion, + + /// XML document encoding. + /// + /// If XML declaration is not present or does not contain `encoding` attribute, + /// defaults to `"UTF-8"`. This field is currently used for no other purpose than + /// informational. + encoding: String, + + /// XML standalone declaration. + /// + /// If XML document is not present or does not contain `standalone` attribute, + /// defaults to `None`. This field is currently used for no other purpose than + /// informational. + standalone: Option<bool> + }, + + /// Denotes to the end of the document stream. + /// + /// This event is always emitted after any other event (except `Error`). After it + /// is emitted for the first time, it will always be emitted on next event pull attempts. + EndDocument, + + /// Denotes an XML processing instruction. + /// + /// This event contains a processing instruction target (`name`) and opaque `data`. It + /// is up to the application to process them. 
+ ProcessingInstruction { + /// Processing instruction target. + name: String, + + /// Processing instruction content. + data: Option<String> + }, + + /// Denotes a beginning of an XML element. + /// + /// This event is emitted after parsing opening tags or after parsing bodiless tags. In the + /// latter case `EndElement` event immediately follows. + StartElement { + /// Qualified name of the element. + name: OwnedName, + + /// A list of attributes associated with the element. + /// + /// Currently attributes are not checked for duplicates (TODO) + attributes: Vec<OwnedAttribute>, + + /// Contents of the namespace mapping at this point of the document. + namespace: Namespace, + }, + + /// Denotes an end of an XML element. + /// + /// This event is emitted after parsing closing tags or after parsing bodiless tags. In the + /// latter case it is emitted immediately after corresponding `StartElement` event. + EndElement { + /// Qualified name of the element. + name: OwnedName + }, + + /// Denotes CDATA content. + /// + /// This event contains unparsed data. No unescaping will be performed. + /// + /// It is possible to configure a parser to emit `Characters` event instead of `CData`. See + /// `pull::ParserConfiguration` structure for more information. + CData(String), + + /// Denotes a comment. + /// + /// It is possible to configure a parser to ignore comments, so this event will never be emitted. + /// See `pull::ParserConfiguration` structure for more information. + Comment(String), + + /// Denotes character data outside of tags. + /// + /// Contents of this event will always be unescaped, so no entities like `<` or `&` or `{` + /// will appear in it. + /// + /// It is possible to configure a parser to trim leading and trailing whitespace for this event. + /// See `pull::ParserConfiguration` structure for more information. + Characters(String), + + /// Denotes a chunk of whitespace outside of tags. 
+ /// + /// It is possible to configure a parser to emit `Characters` event instead of `Whitespace`. + /// See `pull::ParserConfiguration` structure for more information. When combined with whitespace + /// trimming, it will eliminate standalone whitespace from the event stream completely. + Whitespace(String) +} + +impl fmt::Debug for XmlEvent { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + XmlEvent::StartDocument { ref version, ref encoding, ref standalone } => + write!(f, "StartDocument({}, {}, {:?})", version, *encoding, *standalone), + XmlEvent::EndDocument => + write!(f, "EndDocument"), + XmlEvent::ProcessingInstruction { ref name, ref data } => + write!(f, "ProcessingInstruction({}{})", *name, match *data { + Some(ref data) => format!(", {}", data), + None => String::new() + }), + XmlEvent::StartElement { ref name, ref attributes, namespace: Namespace(ref namespace) } => + write!(f, "StartElement({}, {:?}{})", name, namespace, if attributes.is_empty() { + String::new() + } else { + let attributes: Vec<String> = attributes.iter().map( + |a| format!("{} -> {}", a.name, a.value) + ).collect(); + format!(", [{}]", attributes.join(", ")) + }), + XmlEvent::EndElement { ref name } => + write!(f, "EndElement({})", name), + XmlEvent::Comment(ref data) => + write!(f, "Comment({})", data), + XmlEvent::CData(ref data) => + write!(f, "CData({})", data), + XmlEvent::Characters(ref data) => + write!(f, "Characters({})", data), + XmlEvent::Whitespace(ref data) => + write!(f, "Whitespace({})", data) + } + } +} + +impl XmlEvent { + /// Obtains a writer event from this reader event. + /// + /// This method is useful for streaming processing of XML documents where the output + /// is also an XML document. 
With this method it is possible to process some events + /// while passing other events through to the writer unchanged: + /// + /// ```rust + /// use std::str; + /// + /// use xml::{EventReader, EventWriter}; + /// use xml::reader::XmlEvent as ReaderEvent; + /// use xml::writer::XmlEvent as WriterEvent; + /// + /// let mut input: &[u8] = b"<hello>world</hello>"; + /// let mut output: Vec<u8> = Vec::new(); + /// + /// { + /// let mut reader = EventReader::new(&mut input); + /// let mut writer = EventWriter::new(&mut output); + /// + /// for e in reader { + /// match e.unwrap() { + /// ReaderEvent::Characters(s) => + /// writer.write(WriterEvent::characters(&s.to_uppercase())).unwrap(), + /// e => if let Some(e) = e.as_writer_event() { + /// writer.write(e).unwrap() + /// } + /// } + /// } + /// } + /// + /// assert_eq!( + /// str::from_utf8(&output).unwrap(), + /// r#"<?xml version="1.0" encoding="UTF-8"?><hello>WORLD</hello>"# + /// ); + /// ``` + /// + /// Note that this API may change or get additions in future to improve its ergonomics. 
+ pub fn as_writer_event<'a>(&'a self) -> Option<::writer::events::XmlEvent<'a>> { + match *self { + XmlEvent::StartDocument { version, ref encoding, standalone } => + Some(::writer::events::XmlEvent::StartDocument { + version: version, + encoding: Some(encoding), + standalone: standalone + }), + XmlEvent::ProcessingInstruction { ref name, ref data } => + Some(::writer::events::XmlEvent::ProcessingInstruction { + name: name, + data: data.as_ref().map(|s| &s[..]) + }), + XmlEvent::StartElement { ref name, ref attributes, ref namespace } => + Some(::writer::events::XmlEvent::StartElement { + name: name.borrow(), + attributes: attributes.iter().map(|a| a.borrow()).collect(), + namespace: Cow::Borrowed(namespace) + }), + XmlEvent::EndElement { ref name } => + Some(::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }), + XmlEvent::Comment(ref data) => Some(::writer::events::XmlEvent::Comment(data)), + XmlEvent::CData(ref data) => Some(::writer::events::XmlEvent::CData(data)), + XmlEvent::Characters(ref data) => Some(::writer::events::XmlEvent::Characters(data)), + XmlEvent::Whitespace(ref data) => Some(::writer::events::XmlEvent::Characters(data)), + _ => None + } + } +} diff --git a/third_party/rust/xml-rs/src/reader/lexer.rs b/third_party/rust/xml-rs/src/reader/lexer.rs new file mode 100644 index 0000000000..c466db9210 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/lexer.rs @@ -0,0 +1,867 @@ +//! Contains simple lexer for XML documents. +//! +//! This module is for internal use. Use `xml::pull` module to do parsing. + +use std::fmt; +use std::collections::VecDeque; +use std::io::Read; +use std::result; +use std::borrow::Cow; + +use common::{Position, TextPosition, is_whitespace_char, is_name_char}; +use reader::Error; +use util; + +/// `Token` represents a single lexeme of an XML document. These lexemes +/// are used to perform actual parsing. 
/// `Token` represents a single lexeme of an XML document. These lexemes
/// are used to perform actual parsing.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum Token {
    /// `<?`
    ProcessingInstructionStart,
    /// `?>`
    ProcessingInstructionEnd,
    /// `<!DOCTYPE`
    DoctypeStart,
    /// `<`
    OpeningTagStart,
    /// `</`
    ClosingTagStart,
    /// `>`
    TagEnd,
    /// `/>`
    EmptyTagEnd,
    /// `<!--`
    CommentStart,
    /// `-->`
    CommentEnd,
    /// A chunk of characters, used for error recovery.
    Chunk(&'static str),
    /// Any non-special character except whitespace.
    Character(char),
    /// Whitespace character.
    Whitespace(char),
    /// `=`
    EqualsSign,
    /// `'`
    SingleQuote,
    /// `"`
    DoubleQuote,
    /// `<![CDATA[`
    CDataStart,
    /// `]]>`
    CDataEnd,
    /// `&`
    ReferenceStart,
    /// `;`
    ReferenceEnd,
}

impl fmt::Display for Token {
    /// Writes the source text this token stands for.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c),
            // All remaining tokens have fixed text. `as_static_str` is the single
            // source of truth for it, so the table is not repeated here.
            _ => match self.as_static_str() {
                Some(text) => f.write_str(text),
                None => unreachable!(),
            },
        }
    }
}

impl Token {
    /// Returns the fixed text of this token, or `None` for the tokens which carry
    /// an arbitrary character (`Character` and `Whitespace`).
    pub fn as_static_str(&self) -> Option<&'static str> {
        // Deliberately written without a catch-all arm: adding a variant to `Token`
        // must force a decision here.
        match *self {
            Token::OpeningTagStart            => Some("<"),
            Token::ProcessingInstructionStart => Some("<?"),
            Token::DoctypeStart               => Some("<!DOCTYPE"),
            Token::ClosingTagStart            => Some("</"),
            Token::CommentStart               => Some("<!--"),
            Token::CDataStart                 => Some("<![CDATA["),
            Token::TagEnd                     => Some(">"),
            Token::EmptyTagEnd                => Some("/>"),
            Token::ProcessingInstructionEnd   => Some("?>"),
            Token::CommentEnd                 => Some("-->"),
            Token::CDataEnd                   => Some("]]>"),
            Token::ReferenceStart             => Some("&"),
            Token::ReferenceEnd               => Some(";"),
            Token::EqualsSign                 => Some("="),
            Token::SingleQuote                => Some("'"),
            Token::DoubleQuote                => Some("\""),
            Token::Chunk(s)                   => Some(s),
            Token::Character(_) | Token::Whitespace(_) => None,
        }
    }

    /// Appends the text of this token to `target`.
    // using String.push_str(token.to_string()) is simply way too slow
    pub fn push_to_string(&self, target: &mut String) {
        match *self {
            Token::Character(c) | Token::Whitespace(c) => target.push(c),
            _ => match self.as_static_str() {
                Some(text) => target.push_str(text),
                None => unreachable!(),
            },
        }
    }

    /// Returns `true` if this token contains data that can be interpreted
    /// as a part of the text. Surprisingly, this also means '>' and '=' and '"' and "'" and '-->'.
    #[inline]
    pub fn contains_char_data(&self) -> bool {
        match *self {
            Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd |
            Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote | Token::CDataEnd |
            Token::ProcessingInstructionEnd | Token::EmptyTagEnd => true,
            _ => false
        }
    }

    /// Returns `true` if this token corresponds to a white space character.
    #[inline]
    pub fn is_whitespace(&self) -> bool {
        match *self {
            Token::Whitespace(_) => true,
            _ => false
        }
    }
}
/// Progress through a two-character closing sequence (`--` of a comment end or
/// `]]` of a CDATA end): `First` after one character, `Second` after two.
#[derive(Copy, Clone)]
enum ClosingSubstate {
    First, Second
}

/// Progress through the `<!DOCTYPE` keyword; each variant names the letters
/// which have been consumed after `<!`.
#[derive(Copy, Clone)]
enum DoctypeStartedSubstate {
    D, DO, DOC, DOCT, DOCTY, DOCTYP
}

/// Progress through the `<![CDATA[` keyword; each variant names the letters
/// which have been consumed after `<![`.
// NOTE(review): `E` presumably stands for "empty", i.e. nothing consumed after `<![` yet --- confirm.
#[derive(Copy, Clone)]
enum CDataStartedSubstate {
    E, C, CD, CDA, CDAT, CDATA
}

/// `Result` represents lexing result. It is either an optional token or an error.
pub type Result = result::Result<Option<Token>, Error>;

/// Helps to set up a dispatch table for lexing large unambiguous tokens like
/// `<![CDATA[` or `<!DOCTYPE `.
///
/// Parameters:
///
/// * `$_self` --- the value on which `move_to` and `handle_error` are invoked;
/// * `$s` --- the substate value being matched;
/// * `$c` --- the character being examined;
/// * `$is` --- wraps a next substate into the value passed to `move_to`;
/// * a comma-separated list of `substate; expected char; next substate; chunk` rows:
///   in `substate`, the expected character moves to the next substate and
///   anything else is passed to `handle_error` together with `chunk`;
/// * a final `substate; expected char; chunk; expression` row: the expected
///   character evaluates to `expression`, anything else is passed to `handle_error`.
macro_rules! dispatch_on_enum_state(
    ($_self:ident, $s:expr, $c:expr, $is:expr,
     $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+;
     $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => (
        match $s {
            $(
            $st => match $c {
                $stc => $_self.move_to($is($next_st)),
                _ => $_self.handle_error($chunk, $c)
            },
            )+
            $end_st => match $c {
                $end_c => $e,
                _ => $_self.handle_error($end_chunk, $c)
            }
        }
    )
);
/// Returns a new lexer with default state.
///
/// The lexer starts in the normal state with error reporting enabled
/// (`skip_errors` is `false`) and all other flags cleared.
pub fn new() -> Lexer {
    Lexer {
        pos: TextPosition::new(),
        head_pos: TextPosition::new(),
        char_queue: VecDeque::with_capacity(4),  // TODO: check size
        st: State::Normal,
        skip_errors: false,
        inside_comment: false,
        inside_token: false,
        eof_handled: false
    }
}

/// Enables error handling so `next_token` will return `Err(..)`
/// upon invalid lexeme.
#[inline]
pub fn enable_errors(&mut self) { self.skip_errors = false; }

/// Disables error handling so `next_token` will return `Ok(Some(Token::Chunk(..)))`
/// upon invalid lexeme with this lexeme content.
#[inline]
pub fn disable_errors(&mut self) { self.skip_errors = true; }

/// Enables special handling of some lexemes which should be done when we're parsing comment
/// internals.
#[inline]
pub fn inside_comment(&mut self) { self.inside_comment = true; }

/// Disables the effect of `inside_comment()` method.
#[inline]
pub fn outside_comment(&mut self) { self.inside_comment = false; }

/// Resets the "end of stream handled" flag of the lexer, so that `next_token`
/// attempts to read from its source again instead of returning `Ok(None)` right away.
#[inline]
pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }
+ /// + /// Return value: + /// * `Err(reason) where reason: reader::Error` - when an error occurs; + /// * `Ok(None)` - upon end of stream is reached; + /// * `Ok(Some(token)) where token: Token` - in case a complete-token has been read from the stream. + pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result { + // Already reached end of buffer + if self.eof_handled { + return Ok(None); + } + + if !self.inside_token { + self.pos = self.head_pos; + self.inside_token = true; + } + + // Check if we have saved a char or two for ourselves + while let Some(c) = self.char_queue.pop_front() { + match try!(self.read_next_token(c)) { + Some(t) => { + self.inside_token = false; + return Ok(Some(t)); + } + None => {} // continue + } + } + + loop { + // TODO: this should handle multiple encodings + let c = match try!(util::next_char_from(b)) { + Some(c) => c, // got next char + None => break, // nothing to read left + }; + + match try!(self.read_next_token(c)) { + Some(t) => { + self.inside_token = false; + return Ok(Some(t)); + } + None => { + // continue + } + } + } + + // Handle end of stream + self.eof_handled = true; + self.pos = self.head_pos; + match self.st { + State::TagStarted | State::CommentOrCDataOrDoctypeStarted | + State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) | + State::CommentClosing(ClosingSubstate::Second) | + State::DoctypeFinishing(_) => + Err(self.error("Unexpected end of stream")), + State::ProcessingInstructionClosing => + Ok(Some(Token::Character('?'))), + State::EmptyTagClosing => + Ok(Some(Token::Character('/'))), + State::CommentClosing(ClosingSubstate::First) => + Ok(Some(Token::Character('-'))), + State::CDataClosing(ClosingSubstate::First) => + Ok(Some(Token::Character(']'))), + State::CDataClosing(ClosingSubstate::Second) => + Ok(Some(Token::Chunk("]]"))), + State::Normal => + Ok(None) + } + } + + #[inline] + fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error { + (self, msg).into() + } + + #[inline] + fn 
read_next_token(&mut self, c: char) -> Result { + let res = self.dispatch_char(c); + if self.char_queue.is_empty() { + if c == '\n' { + self.head_pos.new_line(); + } else { + self.head_pos.advance(1); + } + } + res + } + + fn dispatch_char(&mut self, c: char) -> Result { + match self.st { + State::Normal => self.normal(c), + State::TagStarted => self.tag_opened(c), + State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c), + State::CommentStarted => self.comment_started(c), + State::CDataStarted(s) => self.cdata_started(c, s), + State::DoctypeStarted(s) => self.doctype_started(c, s), + State::DoctypeFinishing(d) => self.doctype_finishing(c, d), + State::ProcessingInstructionClosing => self.processing_instruction_closing(c), + State::EmptyTagClosing => self.empty_element_closing(c), + State::CommentClosing(s) => self.comment_closing(c, s), + State::CDataClosing(s) => self.cdata_closing(c, s) + } + } + + #[inline] + fn move_to(&mut self, st: State) -> Result { + self.st = st; + Ok(None) + } + + #[inline] + fn move_to_with(&mut self, st: State, token: Token) -> Result { + self.st = st; + Ok(Some(token)) + } + + #[inline] + fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result { + self.char_queue.extend(cs.iter().cloned()); + self.move_to_with(st, token) + } + + fn handle_error(&mut self, chunk: &'static str, c: char) -> Result { + self.char_queue.push_back(c); + if self.skip_errors || (self.inside_comment && chunk != "--") { // FIXME: looks hacky + self.move_to_with(State::Normal, Token::Chunk(chunk)) + } else { + Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c))) + } + } + + /// Encountered a char + fn normal(&mut self, c: char) -> Result { + match c { + '<' => self.move_to(State::TagStarted), + '>' => Ok(Some(Token::TagEnd)), + '/' => self.move_to(State::EmptyTagClosing), + '=' => Ok(Some(Token::EqualsSign)), + '"' => Ok(Some(Token::DoubleQuote)), + '\'' => Ok(Some(Token::SingleQuote)), + 
'?' => self.move_to(State::ProcessingInstructionClosing), + '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)), + ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)), + '&' => Ok(Some(Token::ReferenceStart)), + ';' => Ok(Some(Token::ReferenceEnd)), + _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))), + _ => Ok(Some(Token::Character(c))) + } + } + + /// Encountered '<' + fn tag_opened(&mut self, c: char) -> Result { + match c { + '?' => self.move_to_with(State::Normal, Token::ProcessingInstructionStart), + '/' => self.move_to_with(State::Normal, Token::ClosingTagStart), + '!' => self.move_to(State::CommentOrCDataOrDoctypeStarted), + _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart), + _ if is_name_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart), + _ => self.handle_error("<", c) + } + } + + /// Encountered '<!' + fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result { + match c { + '-' => self.move_to(State::CommentStarted), + '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)), + 'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)), + _ => self.handle_error("<!", c) + } + } + + /// Encountered '<!-' + fn comment_started(&mut self, c: char) -> Result { + match c { + '-' => self.move_to_with(State::Normal, Token::CommentStart), + _ => self.handle_error("<!-", c) + } + } + + /// Encountered '<![' + fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result { + use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA}; + dispatch_on_enum_state!(self, s, c, State::CDataStarted, + E ; 'C' ; C ; "<![", + C ; 'D' ; CD ; "<![C", + CD ; 'A' ; CDA ; "<![CD", + CDA ; 'T' ; CDAT ; "<![CDA", + CDAT ; 'A' ; CDATA ; "<![CDAT"; + CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::Normal, Token::CDataStart) + ) + } + + /// Encountered '<!D' + fn doctype_started(&mut self, c: char, s: 
DoctypeStartedSubstate) -> Result { + use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP}; + dispatch_on_enum_state!(self, s, c, State::DoctypeStarted, + D ; 'O' ; DO ; "<!D", + DO ; 'C' ; DOC ; "<!DO", + DOC ; 'T' ; DOCT ; "<!DOC", + DOCT ; 'Y' ; DOCTY ; "<!DOCT", + DOCTY ; 'P' ; DOCTYP ; "<!DOCTY"; + DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::DoctypeFinishing(1), Token::DoctypeStart) + ) + } + + /// State used while awaiting the closing bracket for the <!DOCTYPE tag + fn doctype_finishing(&mut self, c: char, d: u8) -> Result { + match c { + '<' => self.move_to(State::DoctypeFinishing(d + 1)), + '>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd), + '>' => self.move_to(State::DoctypeFinishing(d - 1)), + _ => Ok(None), + } + } + + /// Encountered '?' + fn processing_instruction_closing(&mut self, c: char) -> Result { + match c { + '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')), + } + } + + /// Encountered '/' + fn empty_element_closing(&mut self, c: char) -> Result { + match c { + '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')), + } + } + + /// Encountered '-' + fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result { + match s { + ClosingSubstate::First => match c { + '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('-')) + }, + ClosingSubstate::Second => match c { + '>' => self.move_to_with(State::Normal, Token::CommentEnd), + // double dash not followed by a greater-than is a hard error inside comment + _ if self.inside_comment => self.handle_error("--", c), + // nothing else except comment closing starts with a double dash, and comment + // closing can never be after another dash, and also we're outside of a comment, + // therefore 
it is safe to push only the last read character to the list of unread + // characters and pass the double dash directly to the output + _ => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--")) + } + } + } + + /// Encountered ']' + fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result { + match s { + ClosingSubstate::First => match c { + ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']')) + }, + ClosingSubstate::Second => match c { + '>' => self.move_to_with(State::Normal, Token::CDataEnd), + _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']')) + } + } + } +} + +#[cfg(test)] +mod tests { + use common::{Position}; + use std::io::{BufReader, Cursor}; + + use super::{Lexer, Token}; + + macro_rules! assert_oks( + (for $lex:ident and $buf:ident ; $($e:expr)+) => ({ + $( + assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf)); + )+ + }) + ); + + macro_rules! assert_err( + (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({ + let err = $lex.next_token(&mut $buf); + assert!(err.is_err()); + let err = err.unwrap_err(); + assert_eq!($r as u64, err.position().row); + assert_eq!($c as u64, err.position().column); + assert_eq!($s, err.msg()); + }) + ); + + macro_rules! 
assert_none( + (for $lex:ident and $buf:ident) => ( + assert_eq!(Ok(None), $lex.next_token(&mut $buf)); + ) + ); + + fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) { + (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes()))) + } + + #[test] + fn simple_lexer_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#"<a p='q'> x<b z="y">d </b></a><p/> <?nm ?> <!-- a c --> "# + ); + + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('a') + Token::Whitespace(' ') + Token::Character('p') + Token::EqualsSign + Token::SingleQuote + Token::Character('q') + Token::SingleQuote + Token::TagEnd + Token::Whitespace(' ') + Token::Character('x') + Token::OpeningTagStart + Token::Character('b') + Token::Whitespace(' ') + Token::Character('z') + Token::EqualsSign + Token::DoubleQuote + Token::Character('y') + Token::DoubleQuote + Token::TagEnd + Token::Character('d') + Token::Whitespace('\t') + Token::ClosingTagStart + Token::Character('b') + Token::TagEnd + Token::ClosingTagStart + Token::Character('a') + Token::TagEnd + Token::OpeningTagStart + Token::Character('p') + Token::EmptyTagEnd + Token::Whitespace(' ') + Token::ProcessingInstructionStart + Token::Character('n') + Token::Character('m') + Token::Whitespace(' ') + Token::ProcessingInstructionEnd + Token::Whitespace(' ') + Token::CommentStart + Token::Whitespace(' ') + Token::Character('a') + Token::Whitespace(' ') + Token::Character('c') + Token::Whitespace(' ') + Token::CommentEnd + Token::Whitespace(' ') + Token::ReferenceStart + Token::Character('n') + Token::Character('b') + Token::Character('s') + Token::Character('p') + Token::ReferenceEnd + ); + assert_none!(for lex and buf); + } + + #[test] + fn special_chars_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#"?x!+ // -| ]z]]"# + ); + + assert_oks!(for lex and buf ; + Token::Character('?') + Token::Character('x') + Token::Character('!') + Token::Character('+') + Token::Whitespace(' ') + 
Token::Character('/') + Token::Character('/') + Token::Whitespace(' ') + Token::Character('-') + Token::Character('|') + Token::Whitespace(' ') + Token::Character(']') + Token::Character('z') + Token::Chunk("]]") + ); + assert_none!(for lex and buf); + } + + #[test] + fn cdata_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#"<a><![CDATA[x y ?]]> </a>"# + ); + + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('a') + Token::TagEnd + Token::CDataStart + Token::Character('x') + Token::Whitespace(' ') + Token::Character('y') + Token::Whitespace(' ') + Token::Character('?') + Token::CDataEnd + Token::Whitespace(' ') + Token::ClosingTagStart + Token::Character('a') + Token::TagEnd + ); + assert_none!(for lex and buf); + } + + #[test] + fn doctype_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#"<a><!DOCTYPE ab xx z> "# + ); + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('a') + Token::TagEnd + Token::DoctypeStart + Token::TagEnd + Token::Whitespace(' ') + ); + assert_none!(for lex and buf) + } + + #[test] + fn doctype_with_internal_subset_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#"<a><!DOCTYPE ab[<!ELEMENT ba> ]> "# + ); + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('a') + Token::TagEnd + Token::DoctypeStart + Token::TagEnd + Token::Whitespace(' ') + ); + assert_none!(for lex and buf) + } + + #[test] + fn end_of_stream_handling_ok() { + macro_rules! eof_check( + ($data:expr ; $token:expr) => ({ + let (mut lex, mut buf) = make_lex_and_buf($data); + assert_oks!(for lex and buf ; $token); + assert_none!(for lex and buf); + }) + ); + eof_check!("?" ; Token::Character('?')); + eof_check!("/" ; Token::Character('/')); + eof_check!("-" ; Token::Character('-')); + eof_check!("]" ; Token::Character(']')); + eof_check!("]]" ; Token::Chunk("]]")); + } + + #[test] + fn end_of_stream_handling_error() { + macro_rules! 
eof_check( + ($data:expr; $r:expr, $c:expr) => ({ + let (mut lex, mut buf) = make_lex_and_buf($data); + assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream"); + assert_none!(for lex and buf); + }) + ); + eof_check!("<" ; 0, 1); + eof_check!("<!" ; 0, 2); + eof_check!("<!-" ; 0, 3); + eof_check!("<![" ; 0, 3); + eof_check!("<![C" ; 0, 4); + eof_check!("<![CD" ; 0, 5); + eof_check!("<![CDA" ; 0, 6); + eof_check!("<![CDAT" ; 0, 7); + eof_check!("<![CDATA" ; 0, 8); + eof_check!("--" ; 0, 2); + } + + #[test] + fn error_in_comment_or_cdata_prefix() { + let (mut lex, mut buf) = make_lex_and_buf("<!x"); + assert_err!(for lex and buf expect row 0 ; 0, + "Unexpected token '<!' before 'x'" + ); + + let (mut lex, mut buf) = make_lex_and_buf("<!x"); + lex.disable_errors(); + assert_oks!(for lex and buf ; + Token::Chunk("<!") + Token::Character('x') + ); + assert_none!(for lex and buf); + } + + #[test] + fn error_in_comment_started() { + let (mut lex, mut buf) = make_lex_and_buf("<!-\t"); + assert_err!(for lex and buf expect row 0 ; 0, + "Unexpected token '<!-' before '\t'" + ); + + let (mut lex, mut buf) = make_lex_and_buf("<!-\t"); + lex.disable_errors(); + assert_oks!(for lex and buf ; + Token::Chunk("<!-") + Token::Whitespace('\t') + ); + assert_none!(for lex and buf); + } + + #[test] + fn error_in_comment_two_dashes_not_at_end() { + let (mut lex, mut buf) = make_lex_and_buf("--x"); + lex.inside_comment(); + assert_err!(for lex and buf expect row 0; 0, + "Unexpected token '--' before 'x'" + ); + + let (mut lex, mut buf) = make_lex_and_buf("--x"); + assert_oks!(for lex and buf ; + Token::Chunk("--") + Token::Character('x') + ); + } + + macro_rules! 
check_case( + ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({ + let (mut lex, mut buf) = make_lex_and_buf($data); + assert_err!(for lex and buf expect row $r ; $c, $s); + + let (mut lex, mut buf) = make_lex_and_buf($data); + lex.disable_errors(); + assert_oks!(for lex and buf ; + Token::Chunk($chunk) + Token::Character($app) + ); + assert_none!(for lex and buf); + }) + ); + + #[test] + fn error_in_cdata_started() { + check_case!("<![", '['; "<![[" ; 0, 0, "Unexpected token '<![' before '['"); + check_case!("<![C", '['; "<![C[" ; 0, 0, "Unexpected token '<![C' before '['"); + check_case!("<![CD", '['; "<![CD[" ; 0, 0, "Unexpected token '<![CD' before '['"); + check_case!("<![CDA", '['; "<![CDA[" ; 0, 0, "Unexpected token '<![CDA' before '['"); + check_case!("<![CDAT", '['; "<![CDAT[" ; 0, 0, "Unexpected token '<![CDAT' before '['"); + check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'"); + } + + #[test] + fn error_in_doctype_started() { + check_case!("<!D", 'a'; "<!Da" ; 0, 0, "Unexpected token '<!D' before 'a'"); + check_case!("<!DO", 'b'; "<!DOb" ; 0, 0, "Unexpected token '<!DO' before 'b'"); + check_case!("<!DOC", 'c'; "<!DOCc" ; 0, 0, "Unexpected token '<!DOC' before 'c'"); + check_case!("<!DOCT", 'd'; "<!DOCTd" ; 0, 0, "Unexpected token '<!DOCT' before 'd'"); + check_case!("<!DOCTY", 'e'; "<!DOCTYe" ; 0, 0, "Unexpected token '<!DOCTY' before 'e'"); + check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'"); + } + + + + #[test] + fn issue_98_cdata_ending_with_right_bracket() { + let (mut lex, mut buf) = make_lex_and_buf( + r#"<![CDATA[Foo [Bar]]]>"# + ); + + assert_oks!(for lex and buf ; + Token::CDataStart + Token::Character('F') + Token::Character('o') + Token::Character('o') + Token::Whitespace(' ') + Token::Character('[') + Token::Character('B') + Token::Character('a') + Token::Character('r') + Token::Character(']') + Token::CDataEnd + ); + assert_none!(for lex and 
buf); + } +} diff --git a/third_party/rust/xml-rs/src/reader/mod.rs b/third_party/rust/xml-rs/src/reader/mod.rs new file mode 100644 index 0000000000..90f5b52c56 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/mod.rs @@ -0,0 +1,129 @@ +//! Contains high-level interface for a pull-based XML parser. +//! +//! The most important type in this module is `EventReader`, which provides an iterator +//! view for events in XML document. + +use std::io::{Read}; +use std::result; + +use common::{Position, TextPosition}; + +pub use self::config::ParserConfig; +pub use self::events::XmlEvent; + +use self::parser::PullParser; + +mod lexer; +mod parser; +mod config; +mod events; + +mod error; +pub use self::error::{Error, ErrorKind}; + +/// A result type yielded by `XmlReader`. +pub type Result<T> = result::Result<T, Error>; + +/// A wrapper around an `std::io::Read` instance which provides pull-based XML parsing. +pub struct EventReader<R: Read> { + source: R, + parser: PullParser +} + +impl<R: Read> EventReader<R> { + /// Creates a new reader, consuming the given stream. + #[inline] + pub fn new(source: R) -> EventReader<R> { + EventReader::new_with_config(source, ParserConfig::new()) + } + + /// Creates a new reader with the provded configuration, consuming the given stream. + #[inline] + pub fn new_with_config(source: R, config: ParserConfig) -> EventReader<R> { + EventReader { source: source, parser: PullParser::new(config) } + } + + /// Pulls and returns next XML event from the stream. + /// + /// If returned event is `XmlEvent::Error` or `XmlEvent::EndDocument`, then + /// further calls to this method will return this event again. + #[inline] + pub fn next(&mut self) -> Result<XmlEvent> { + self.parser.next(&mut self.source) + } + + pub fn source(&self) -> &R { &self.source } + pub fn source_mut(&mut self) -> &mut R { &mut self.source } + + /// Unwraps this `EventReader`, returning the underlying reader. 
+ /// + /// Note that this operation is destructive; unwrapping the reader and wrapping it + /// again with `EventReader::new()` will create a fresh reader which will attempt + /// to parse an XML document from the beginning. + pub fn into_inner(self) -> R { + self.source + } +} + +impl<B: Read> Position for EventReader<B> { + /// Returns the position of the last event produced by the reader. + #[inline] + fn position(&self) -> TextPosition { + self.parser.position() + } +} + +impl<R: Read> IntoIterator for EventReader<R> { + type Item = Result<XmlEvent>; + type IntoIter = Events<R>; + + fn into_iter(self) -> Events<R> { + Events { reader: self, finished: false } + } +} + +/// An iterator over XML events created from some type implementing `Read`. +/// +/// When the next event is `xml::event::Error` or `xml::event::EndDocument`, then +/// it will be returned by the iterator once, and then it will stop producing events. +pub struct Events<R: Read> { + reader: EventReader<R>, + finished: bool +} + +impl<R: Read> Events<R> { + /// Unwraps the iterator, returning the internal `EventReader`. + #[inline] + pub fn into_inner(self) -> EventReader<R> { + self.reader + } + + pub fn source(&self) -> &R { &self.reader.source } + pub fn source_mut(&mut self) -> &mut R { &mut self.reader.source } + +} + +impl<R: Read> Iterator for Events<R> { + type Item = Result<XmlEvent>; + + #[inline] + fn next(&mut self) -> Option<Result<XmlEvent>> { + if self.finished && !self.reader.parser.is_ignoring_end_of_stream() { None } + else { + let ev = self.reader.next(); + match ev { + Ok(XmlEvent::EndDocument) | Err(_) => self.finished = true, + _ => {} + } + Some(ev) + } + } +} + +impl<'r> EventReader<&'r [u8]> { + /// A convenience method to create an `XmlReader` from a string slice. 
+ #[inline] + pub fn from_str(source: &'r str) -> EventReader<&'r [u8]> { + EventReader::new(source.as_bytes()) + } +} diff --git a/third_party/rust/xml-rs/src/reader/parser/inside_cdata.rs b/third_party/rust/xml-rs/src/reader/parser/inside_cdata.rs new file mode 100644 index 0000000000..3269fb4d6b --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/inside_cdata.rs @@ -0,0 +1,32 @@ +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{Result, PullParser, State}; + +impl PullParser { + pub fn inside_cdata(&mut self, t: Token) -> Option<Result> { + match t { + Token::CDataEnd => { + self.lexer.enable_errors(); + let event = if self.config.cdata_to_characters { + None + } else { + let data = self.take_buf(); + Some(Ok(XmlEvent::CData(data))) + }; + self.into_state(State::OutsideTag, event) + } + + Token::Whitespace(_) => { + t.push_to_string(&mut self.buf); + None + } + + _ => { + self.inside_whitespace = false; + t.push_to_string(&mut self.buf); + None + } + } + } +} diff --git a/third_party/rust/xml-rs/src/reader/parser/inside_closing_tag_name.rs b/third_party/rust/xml-rs/src/reader/parser/inside_closing_tag_name.rs new file mode 100644 index 0000000000..1d8074a5a3 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/inside_closing_tag_name.rs @@ -0,0 +1,34 @@ +use namespace; + +use reader::lexer::Token; + +use super::{Result, PullParser, State, QualifiedNameTarget, ClosingTagSubstate}; + +impl PullParser { + pub fn inside_closing_tag_name(&mut self, t: Token, s: ClosingTagSubstate) -> Option<Result> { + match s { + ClosingTagSubstate::CTInsideName => self.read_qualified_name(t, QualifiedNameTarget::ClosingTagNameTarget, |this, token, name| { + match name.prefix_ref() { + Some(prefix) if prefix == namespace::NS_XML_PREFIX || + prefix == namespace::NS_XMLNS_PREFIX => + // TODO: {:?} is bad, need something better + Some(self_error!(this; "'{:?}' cannot be an element name prefix", name.prefix)), + _ => { + this.data.element_name = 
Some(name.clone()); + match token { + Token::Whitespace(_) => this.into_state_continue(State::InsideClosingTag(ClosingTagSubstate::CTAfterName)), + Token::TagEnd => this.emit_end_element(), + _ => Some(self_error!(this; "Unexpected token inside closing tag: {}", token)) + } + } + } + }), + ClosingTagSubstate::CTAfterName => match t { + Token::Whitespace(_) => None, // Skip whitespace + Token::TagEnd => self.emit_end_element(), + _ => Some(self_error!(self; "Unexpected token inside closing tag: {}", t)) + } + } + } + +} diff --git a/third_party/rust/xml-rs/src/reader/parser/inside_comment.rs b/third_party/rust/xml-rs/src/reader/parser/inside_comment.rs new file mode 100644 index 0000000000..fc983205ac --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/inside_comment.rs @@ -0,0 +1,32 @@ +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{Result, PullParser, State}; + +impl PullParser { + pub fn inside_comment(&mut self, t: Token) -> Option<Result> { + match t { + // Double dash is illegal inside a comment + Token::Chunk(ref s) if &s[..] 
== "--" => Some(self_error!(self; "Unexpected token inside a comment: --")), + + Token::CommentEnd if self.config.ignore_comments => { + self.lexer.outside_comment(); + self.into_state_continue(State::OutsideTag) + } + + Token::CommentEnd => { + self.lexer.outside_comment(); + let data = self.take_buf(); + self.into_state_emit(State::OutsideTag, Ok(XmlEvent::Comment(data))) + } + + _ if self.config.ignore_comments => None, // Do not modify buffer if ignoring the comment + + _ => { + t.push_to_string(&mut self.buf); + None + } + } + } + +} diff --git a/third_party/rust/xml-rs/src/reader/parser/inside_declaration.rs b/third_party/rust/xml-rs/src/reader/parser/inside_declaration.rs new file mode 100644 index 0000000000..af39d10d86 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/inside_declaration.rs @@ -0,0 +1,151 @@ + +use common::XmlVersion; + +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{ + Result, PullParser, State, DeclarationSubstate, QualifiedNameTarget, + DEFAULT_VERSION, DEFAULT_ENCODING +}; + +impl PullParser { + // TODO: remove redundancy via macros or extra methods + pub fn inside_declaration(&mut self, t: Token, s: DeclarationSubstate) -> Option<Result> { + macro_rules! 
unexpected_token( + ($this:expr; $t:expr) => (Some($this.error(format!("Unexpected token inside XML declaration: {}", $t)))); + ($t:expr) => (unexpected_token!(self; $t)); + ); + + #[inline] + fn emit_start_document(this: &mut PullParser) -> Option<Result> { + this.parsed_declaration = true; + let version = this.data.take_version(); + let encoding = this.data.take_encoding(); + let standalone = this.data.take_standalone(); + this.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartDocument { + version: version.unwrap_or(DEFAULT_VERSION), + encoding: encoding.unwrap_or(DEFAULT_ENCODING.into()), + standalone: standalone + })) + } + + match s { + DeclarationSubstate::BeforeVersion => match t { + Token::Whitespace(_) => None, // continue + Token::Character('v') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersion)), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideVersion => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { + match &name.local_name[..] { + "ersion" if name.namespace.is_none() => + this.into_state_continue(State::InsideDeclaration( + if token == Token::EqualsSign { + DeclarationSubstate::InsideVersionValue + } else { + DeclarationSubstate::AfterVersion + } + )), + _ => unexpected_token!(this; name) + } + }), + + DeclarationSubstate::AfterVersion => match t { + Token::Whitespace(_) => None, + Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersionValue)), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideVersionValue => self.read_attribute_value(t, |this, value| { + this.data.version = match &value[..] 
{ + "1.0" => Some(XmlVersion::Version10), + "1.1" => Some(XmlVersion::Version11), + _ => None + }; + if this.data.version.is_some() { + this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterVersionValue)) + } else { + Some(self_error!(this; "Unexpected XML version value: {}", value)) + } + }), + + DeclarationSubstate::AfterVersionValue => match t { + Token::Whitespace(_) => None, // skip whitespace + Token::Character('e') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncoding)), + Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)), + Token::ProcessingInstructionEnd => emit_start_document(self), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideEncoding => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { + match &name.local_name[..] { + "ncoding" if name.namespace.is_none() => + this.into_state_continue(State::InsideDeclaration( + if token == Token::EqualsSign { DeclarationSubstate::InsideEncodingValue } else { DeclarationSubstate::AfterEncoding } + )), + _ => unexpected_token!(this; name) + } + }), + + DeclarationSubstate::AfterEncoding => match t { + Token::Whitespace(_) => None, + Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncodingValue)), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideEncodingValue => self.read_attribute_value(t, |this, value| { + this.data.encoding = Some(value); + this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeStandaloneDecl)) + }), + + DeclarationSubstate::BeforeStandaloneDecl => match t { + Token::Whitespace(_) => None, // skip whitespace + Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)), + Token::ProcessingInstructionEnd => emit_start_document(self), + _ => unexpected_token!(t) + }, + + 
DeclarationSubstate::InsideStandaloneDecl => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { + match &name.local_name[..] { + "tandalone" if name.namespace.is_none() => + this.into_state_continue(State::InsideDeclaration( + if token == Token::EqualsSign { + DeclarationSubstate::InsideStandaloneDeclValue + } else { + DeclarationSubstate::AfterStandaloneDecl + } + )), + _ => unexpected_token!(this; name) + } + }), + + DeclarationSubstate::AfterStandaloneDecl => match t { + Token::Whitespace(_) => None, + Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDeclValue)), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideStandaloneDeclValue => self.read_attribute_value(t, |this, value| { + let standalone = match &value[..] { + "yes" => Some(true), + "no" => Some(false), + _ => None + }; + if standalone.is_some() { + this.data.standalone = standalone; + this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterStandaloneDeclValue)) + } else { + Some(self_error!(this; "Invalid standalone declaration value: {}", value)) + } + }), + + DeclarationSubstate::AfterStandaloneDeclValue => match t { + Token::Whitespace(_) => None, // skip whitespace + Token::ProcessingInstructionEnd => emit_start_document(self), + _ => unexpected_token!(t) + } + } + } + +} diff --git a/third_party/rust/xml-rs/src/reader/parser/inside_doctype.rs b/third_party/rust/xml-rs/src/reader/parser/inside_doctype.rs new file mode 100644 index 0000000000..8dcf367bc6 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/inside_doctype.rs @@ -0,0 +1,16 @@ +use reader::lexer::Token; + +use super::{Result, PullParser, State}; + +impl PullParser { + pub fn inside_doctype(&mut self, t: Token) -> Option<Result> { + match t { + Token::TagEnd => { + self.lexer.enable_errors(); + self.into_state_continue(State::OutsideTag) + } + + _ => None + } + } +} diff --git 
a/third_party/rust/xml-rs/src/reader/parser/inside_opening_tag.rs b/third_party/rust/xml-rs/src/reader/parser/inside_opening_tag.rs new file mode 100644 index 0000000000..533874fb81 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/inside_opening_tag.rs @@ -0,0 +1,108 @@ +use common::is_name_start_char; +use attribute::OwnedAttribute; +use namespace; + +use reader::lexer::Token; + +use super::{Result, PullParser, State, OpeningTagSubstate, QualifiedNameTarget}; + +impl PullParser { + pub fn inside_opening_tag(&mut self, t: Token, s: OpeningTagSubstate) -> Option<Result> { + macro_rules! unexpected_token(($t:expr) => (Some(self_error!(self; "Unexpected token inside opening tag: {}", $t)))); + match s { + OpeningTagSubstate::InsideName => self.read_qualified_name(t, QualifiedNameTarget::OpeningTagNameTarget, |this, token, name| { + match name.prefix_ref() { + Some(prefix) if prefix == namespace::NS_XML_PREFIX || + prefix == namespace::NS_XMLNS_PREFIX => + Some(self_error!(this; "'{:?}' cannot be an element name prefix", name.prefix)), + _ => { + this.data.element_name = Some(name.clone()); + match token { + Token::TagEnd => this.emit_start_element(false), + Token::EmptyTagEnd => this.emit_start_element(true), + Token::Whitespace(_) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)), + _ => unreachable!() + } + } + } + }), + + OpeningTagSubstate::InsideTag => match t { + Token::Whitespace(_) => None, // skip whitespace + Token::Character(c) if is_name_start_char(c) => { + self.buf.push(c); + self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeName)) + } + Token::TagEnd => self.emit_start_element(false), + Token::EmptyTagEnd => self.emit_start_element(true), + _ => unexpected_token!(t) + }, + + OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { + this.data.attr_name = Some(name); + match token { + 
Token::Whitespace(_) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)), + Token::EqualsSign => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)), + _ => unreachable!() + } + }), + + OpeningTagSubstate::AfterAttributeName => match t { + Token::Whitespace(_) => None, + Token::EqualsSign => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)), + _ => unexpected_token!(t) + }, + + OpeningTagSubstate::InsideAttributeValue => self.read_attribute_value(t, |this, value| { + let name = this.data.take_attr_name().unwrap(); // unwrap() will always succeed here + + // check that no attribute with such name is already present + // if there is one, XML is not well-formed + if this.data.attributes.iter().find(|a| a.name == name).is_some() { // TODO: looks bad + // TODO: ideally this error should point to the beginning of the attribute, + // TODO: not the end of its value + Some(self_error!(this; "Attribute '{}' is redefined", name)) + } else { + match name.prefix_ref() { + // declaring a new prefix; it is sufficient to check prefix only + // because "xmlns" prefix is reserved + Some(namespace::NS_XMLNS_PREFIX) => { + let ln = &name.local_name[..]; + if ln == namespace::NS_XMLNS_PREFIX { + Some(self_error!(this; "Cannot redefine prefix '{}'", namespace::NS_XMLNS_PREFIX)) + } else if ln == namespace::NS_XML_PREFIX && &value[..] != namespace::NS_XML_URI { + Some(self_error!(this; "Prefix '{}' cannot be rebound to another value", namespace::NS_XML_PREFIX)) + } else if value.is_empty() { + Some(self_error!(this; "Cannot undefine prefix '{}'", ln)) + } else { + this.nst.put(name.local_name.clone(), value); + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + } + } + + // declaring default namespace + None if &name.local_name[..] == namespace::NS_XMLNS_PREFIX => + match &value[..] 
{ + namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX => + Some(self_error!(this; "Namespace '{}' cannot be default", value)), + _ => { + this.nst.put(namespace::NS_NO_PREFIX, value.clone()); + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + } + }, + + // regular attribute + _ => { + this.data.attributes.push(OwnedAttribute { + name: name.clone(), + value: value + }); + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + } + } + } + }) + } + } + +} diff --git a/third_party/rust/xml-rs/src/reader/parser/inside_processing_instruction.rs b/third_party/rust/xml-rs/src/reader/parser/inside_processing_instruction.rs new file mode 100644 index 0000000000..8ddf6b8d51 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/inside_processing_instruction.rs @@ -0,0 +1,96 @@ +use common::{ + is_name_start_char, is_name_char, +}; + +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{Result, PullParser, State, ProcessingInstructionSubstate, DeclarationSubstate}; + +impl PullParser { + pub fn inside_processing_instruction(&mut self, t: Token, s: ProcessingInstructionSubstate) -> Option<Result> { + match s { + ProcessingInstructionSubstate::PIInsideName => match t { + Token::Character(c) if !self.buf_has_data() && is_name_start_char(c) || + self.buf_has_data() && is_name_char(c) => self.append_char_continue(c), + + Token::ProcessingInstructionEnd => { + // self.buf contains PI name + let name = self.take_buf(); + + // Don't need to check for declaration because it has mandatory attributes + // but there is none + match &name[..] 
{ + // Name is empty, it is an error + "" => Some(self_error!(self; "Encountered processing instruction without name")), + + // Found <?xml-like PI not at the beginning of a document, + // it is an error - see section 2.6 of XML 1.1 spec + "xml"|"xmL"|"xMl"|"xML"|"Xml"|"XmL"|"XMl"|"XML" => + Some(self_error!(self; "Invalid processing instruction: <?{}", name)), + + // All is ok, emitting event + _ => { + self.into_state_emit( + State::OutsideTag, + Ok(XmlEvent::ProcessingInstruction { + name: name, + data: None + }) + ) + } + } + } + + Token::Whitespace(_) => { + // self.buf contains PI name + let name = self.take_buf(); + + match &name[..] { + // We have not ever encountered an element and have not parsed XML declaration + "xml" if !self.encountered_element && !self.parsed_declaration => + self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeVersion)), + + // Found <?xml-like PI after the beginning of a document, + // it is an error - see section 2.6 of XML 1.1 spec + "xml"|"xmL"|"xMl"|"xML"|"Xml"|"XmL"|"XMl"|"XML" + if self.encountered_element || self.parsed_declaration => + Some(self_error!(self; "Invalid processing instruction: <?{}", name)), + + // All is ok, starting parsing PI data + _ => { + self.lexer.disable_errors(); // data is arbitrary, so disable errors + self.data.name = name; + self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideData)) + } + + } + } + + _ => Some(self_error!(self; "Unexpected token: <?{}{}", self.buf, t)) + }, + + ProcessingInstructionSubstate::PIInsideData => match t { + Token::ProcessingInstructionEnd => { + self.lexer.enable_errors(); + let name = self.data.take_name(); + let data = self.take_buf(); + self.into_state_emit( + State::OutsideTag, + Ok(XmlEvent::ProcessingInstruction { + name: name, + data: Some(data) + }) + ) + }, + + // Any other token should be treated as plain characters + _ => { + t.push_to_string(&mut self.buf); + None + } + }, + } + } + +} 
diff --git a/third_party/rust/xml-rs/src/reader/parser/inside_reference.rs b/third_party/rust/xml-rs/src/reader/parser/inside_reference.rs new file mode 100644 index 0000000000..60026d5572 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/inside_reference.rs @@ -0,0 +1,89 @@ +use std::char; + +use common::{is_name_start_char, is_name_char, is_whitespace_str}; + +use reader::lexer::Token; + +use super::{Result, PullParser, State}; + +impl PullParser { + pub fn inside_reference(&mut self, t: Token, prev_st: State) -> Option<Result> { + match t { + Token::Character(c) if !self.data.ref_data.is_empty() && is_name_char(c) || + self.data.ref_data.is_empty() && (is_name_start_char(c) || c == '#') => { + self.data.ref_data.push(c); + None + } + + Token::ReferenceEnd => { + // TODO: check for unicode correctness + let name = self.data.take_ref_data(); + let name_len = name.len(); // compute once + let c = match &name[..] { + "lt" => Ok('<'.to_string()), + "gt" => Ok('>'.to_string()), + "amp" => Ok('&'.to_string()), + "apos" => Ok('\''.to_string()), + "quot" => Ok('"'.to_string()), + "" => Err(self_error!(self; "Encountered empty entity")), + _ if name_len > 2 && name.starts_with("#x") => { + let num_str = &name[2..name_len]; + if num_str == "0" { + Err(self_error!(self; "Null character entity is not allowed")) + } else { + if self.config.replace_unknown_entity_references { + match u32::from_str_radix(num_str, 16).ok().map(|i| char::from_u32(i).unwrap_or('\u{fffd}')) { + Some(c) => Ok(c.to_string()), + None => Err(self_error!(self; "Invalid hexadecimal character number in an entity: {}", name)) + } + } else { + match u32::from_str_radix(num_str, 16).ok().and_then(char::from_u32) { + Some(c) => Ok(c.to_string()), + None => Err(self_error!(self; "Invalid hexadecimal character number in an entity: {}", name)) + } + } + } + } + _ if name_len > 1 && name.starts_with('#') => { + let num_str = &name[1..name_len]; + if num_str == "0" { + Err(self_error!(self; "Null 
character entity is not allowed")) + } else { + if self.config.replace_unknown_entity_references { + match u32::from_str_radix(num_str, 10).ok().map(|i| char::from_u32(i).unwrap_or('\u{fffd}')) { + Some(c) => Ok(c.to_string()), + None => Err(self_error!(self; "Invalid decimal character number in an entity: {}", name)) + } + } + else { + match u32::from_str_radix(num_str, 10).ok().and_then(char::from_u32) { + Some(c) => Ok(c.to_string()), + None => Err(self_error!(self; "Invalid decimal character number in an entity: {}", name)) + } + } + } + }, + _ => { + if let Some(v) = self.config.extra_entities.get(&name) { + Ok(v.clone()) + } else { + Err(self_error!(self; "Unexpected entity: {}", name)) + } + } + }; + match c { + Ok(c) => { + self.buf.push_str(&c); + if prev_st == State::OutsideTag && !is_whitespace_str(&c) { + self.inside_whitespace = false; + } + self.into_state_continue(prev_st) + } + Err(e) => Some(e) + } + } + + _ => Some(self_error!(self; "Unexpected token inside an entity: {}", t)) + } + } +} diff --git a/third_party/rust/xml-rs/src/reader/parser/mod.rs b/third_party/rust/xml-rs/src/reader/parser/mod.rs new file mode 100644 index 0000000000..58ca3a6b1e --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/mod.rs @@ -0,0 +1,622 @@ +//! Contains an implementation of pull-based XML parser. + +use std::mem; +use std::borrow::Cow; +use std::io::prelude::*; + +use common::{ + self, + XmlVersion, Position, TextPosition, + is_name_start_char, is_name_char, +}; +use name::OwnedName; +use attribute::OwnedAttribute; +use namespace::NamespaceStack; + +use reader::events::XmlEvent; +use reader::config::ParserConfig; +use reader::lexer::{Lexer, Token}; + +macro_rules! 
gen_takes( + ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => ( + $( + impl MarkupData { + #[inline] + fn $method(&mut self) -> $t { + mem::replace(&mut self.$field, $def) + } + } + )+ + ) +); + +gen_takes!( + name -> take_name, String, String::new(); + ref_data -> take_ref_data, String, String::new(); + + version -> take_version, Option<common::XmlVersion>, None; + encoding -> take_encoding, Option<String>, None; + standalone -> take_standalone, Option<bool>, None; + + element_name -> take_element_name, Option<OwnedName>, None; + + attr_name -> take_attr_name, Option<OwnedName>, None; + attributes -> take_attributes, Vec<OwnedAttribute>, vec!() +); + +macro_rules! self_error( + ($this:ident; $msg:expr) => ($this.error($msg)); + ($this:ident; $fmt:expr, $($arg:expr),+) => ($this.error(format!($fmt, $($arg),+))) +); + +mod outside_tag; +mod inside_processing_instruction; +mod inside_declaration; +mod inside_doctype; +mod inside_opening_tag; +mod inside_closing_tag_name; +mod inside_comment; +mod inside_cdata; +mod inside_reference; + +static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10; +static DEFAULT_ENCODING: &'static str = "UTF-8"; +static DEFAULT_STANDALONE: Option<bool> = None; + +type ElementStack = Vec<OwnedName>; +pub type Result = super::Result<XmlEvent>; + +/// Pull-based XML parser. +pub struct PullParser { + config: ParserConfig, + lexer: Lexer, + st: State, + buf: String, + nst: NamespaceStack, + + data: MarkupData, + final_result: Option<Result>, + next_event: Option<Result>, + est: ElementStack, + pos: Vec<TextPosition>, + + encountered_element: bool, + parsed_declaration: bool, + inside_whitespace: bool, + read_prefix_separator: bool, + pop_namespace: bool +} + +impl PullParser { + /// Returns a new parser using the given config. 
+ pub fn new(config: ParserConfig) -> PullParser { + PullParser { + config: config, + lexer: Lexer::new(), + st: State::OutsideTag, + buf: String::new(), + nst: NamespaceStack::default(), + + data: MarkupData { + name: String::new(), + version: None, + encoding: None, + standalone: None, + ref_data: String::new(), + element_name: None, + quote: None, + attr_name: None, + attributes: Vec::new() + }, + final_result: None, + next_event: None, + est: Vec::new(), + pos: vec![TextPosition::new()], + + encountered_element: false, + parsed_declaration: false, + inside_whitespace: true, + read_prefix_separator: false, + pop_namespace: false + } + } + + /// Checks if this parser ignores the end of stream errors. + pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.ignore_end_of_stream } +} + +impl Position for PullParser { + /// Returns the position of the last event produced by the parser + #[inline] + fn position(&self) -> TextPosition { + self.pos[0] + } +} + +#[derive(Clone, PartialEq)] +pub enum State { + OutsideTag, + InsideOpeningTag(OpeningTagSubstate), + InsideClosingTag(ClosingTagSubstate), + InsideProcessingInstruction(ProcessingInstructionSubstate), + InsideComment, + InsideCData, + InsideDeclaration(DeclarationSubstate), + InsideDoctype, + InsideReference(Box<State>) +} + +#[derive(Clone, PartialEq)] +pub enum OpeningTagSubstate { + InsideName, + + InsideTag, + + InsideAttributeName, + AfterAttributeName, + + InsideAttributeValue, +} + +#[derive(Clone, PartialEq)] +pub enum ClosingTagSubstate { + CTInsideName, + CTAfterName +} + +#[derive(Clone, PartialEq)] +pub enum ProcessingInstructionSubstate { + PIInsideName, + PIInsideData +} + +#[derive(Clone, PartialEq)] +pub enum DeclarationSubstate { + BeforeVersion, + InsideVersion, + AfterVersion, + + InsideVersionValue, + AfterVersionValue, + + InsideEncoding, + AfterEncoding, + + InsideEncodingValue, + + BeforeStandaloneDecl, + InsideStandaloneDecl, + AfterStandaloneDecl, + + InsideStandaloneDeclValue, + 
AfterStandaloneDeclValue +} + +#[derive(PartialEq)] +enum QualifiedNameTarget { + AttributeNameTarget, + OpeningTagNameTarget, + ClosingTagNameTarget +} + +#[derive(Copy, Clone, PartialEq, Eq)] +enum QuoteToken { + SingleQuoteToken, + DoubleQuoteToken +} + +impl QuoteToken { + fn from_token(t: &Token) -> QuoteToken { + match *t { + Token::SingleQuote => QuoteToken::SingleQuoteToken, + Token::DoubleQuote => QuoteToken::DoubleQuoteToken, + _ => panic!("Unexpected token: {}", t) + } + } + + fn as_token(self) -> Token { + match self { + QuoteToken::SingleQuoteToken => Token::SingleQuote, + QuoteToken::DoubleQuoteToken => Token::DoubleQuote + } + } +} + +struct MarkupData { + name: String, // used for processing instruction name + ref_data: String, // used for reference content + + version: Option<common::XmlVersion>, // used for XML declaration version + encoding: Option<String>, // used for XML declaration encoding + standalone: Option<bool>, // used for XML declaration standalone parameter + + element_name: Option<OwnedName>, // used for element name + + quote: Option<QuoteToken>, // used to hold opening quote for attribute value + attr_name: Option<OwnedName>, // used to hold attribute name + attributes: Vec<OwnedAttribute> // used to hold all accumulated attributes +} + +impl PullParser { + /// Returns next event read from the given buffer. + /// + /// This method should be always called with the same buffer. If you call it + /// providing different buffers each time, the result will be undefined. + pub fn next<R: Read>(&mut self, r: &mut R) -> Result { + if let Some(ref ev) = self.final_result { + return ev.clone(); + } + + if let Some(ev) = self.next_event.take() { + return ev; + } + + if self.pop_namespace { + self.pop_namespace = false; + self.nst.pop(); + } + + loop { + // While lexer gives us Ok(maybe_token) -- we loop. + // Upon having a complete XML-event -- we return from the whole function. 
+ match self.lexer.next_token(r) { + Ok(maybe_token) => + match maybe_token { + None => break, + Some(token) => + match self.dispatch_token(token) { + None => {} // continue + Some(Ok(XmlEvent::EndDocument)) => + return { + self.next_pos(); + self.set_final_result(Ok(XmlEvent::EndDocument)) + }, + Some(Ok(xml_event)) => + return { + self.next_pos(); + Ok(xml_event) + }, + Some(Err(xml_error)) => + return { + self.next_pos(); + self.set_final_result(Err(xml_error)) + }, + } + }, + Err(lexer_error) => + return self.set_final_result(Err(lexer_error)), + } + } + + // Handle end of stream + // Forward pos to the lexer head + self.next_pos(); + let ev = if self.depth() == 0 { + if self.encountered_element && self.st == State::OutsideTag { // all is ok + Ok(XmlEvent::EndDocument) + } else if !self.encountered_element { + self_error!(self; "Unexpected end of stream: no root element found") + } else { // self.st != State::OutsideTag + self_error!(self; "Unexpected end of stream") // TODO: add expected hint? + } + } else { + if self.config.ignore_end_of_stream { + self.final_result = None; + self.lexer.reset_eof_handled(); + return self_error!(self; "Unexpected end of stream: still inside the root element"); + } else { + self_error!(self; "Unexpected end of stream: still inside the root element") + } + }; + self.set_final_result(ev) + } + + // This function is to be called when a terminal event is reached. + // The function sets up the `self.final_result` into `Some(result)` and return `result`. 
+ fn set_final_result(&mut self, result: Result) -> Result { + self.final_result = Some(result.clone()); + result + } + + #[inline] + fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Result { + Err((&self.lexer, msg).into()) + } + + #[inline] + fn next_pos(&mut self) { + if self.pos.len() > 1 { + self.pos.remove(0); + } else { + self.pos[0] = self.lexer.position(); + } + } + + #[inline] + fn push_pos(&mut self) { + self.pos.push(self.lexer.position()); + } + + fn dispatch_token(&mut self, t: Token) -> Option<Result> { + match self.st.clone() { + State::OutsideTag => self.outside_tag(t), + State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s), + State::InsideDeclaration(s) => self.inside_declaration(t, s), + State::InsideDoctype => self.inside_doctype(t), + State::InsideOpeningTag(s) => self.inside_opening_tag(t, s), + State::InsideClosingTag(s) => self.inside_closing_tag_name(t, s), + State::InsideComment => self.inside_comment(t), + State::InsideCData => self.inside_cdata(t), + State::InsideReference(s) => self.inside_reference(t, *s) + } + } + + #[inline] + fn depth(&self) -> usize { + self.est.len() + } + + #[inline] + fn buf_has_data(&self) -> bool { + self.buf.len() > 0 + } + + #[inline] + fn take_buf(&mut self) -> String { + mem::replace(&mut self.buf, String::new()) + } + + #[inline] + fn append_char_continue(&mut self, c: char) -> Option<Result> { + self.buf.push(c); + None + } + + #[inline] + fn into_state(&mut self, st: State, ev: Option<Result>) -> Option<Result> { + self.st = st; + ev + } + + #[inline] + fn into_state_continue(&mut self, st: State) -> Option<Result> { + self.into_state(st, None) + } + + #[inline] + fn into_state_emit(&mut self, st: State, ev: Result) -> Option<Result> { + self.into_state(st, Some(ev)) + } + + /// Dispatches tokens in order to process qualified name. If qualified name cannot be parsed, + /// an error is returned. 
+ /// + /// # Parameters + /// * `t` --- next token; + /// * `on_name` --- a callback which is executed when whitespace is encountered. + fn read_qualified_name<F>(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option<Result> + where F: Fn(&mut PullParser, Token, OwnedName) -> Option<Result> { + // We can get here for the first time only when self.data.name contains zero or one character, + // but first character cannot be a colon anyway + if self.buf.len() <= 1 { + self.read_prefix_separator = false; + } + + let invoke_callback = |this: &mut PullParser, t| { + let name = this.take_buf(); + match name.parse() { + Ok(name) => on_name(this, t, name), + Err(_) => Some(self_error!(this; "Qualified name is invalid: {}", name)) + } + }; + + match t { + // There can be only one colon, and not as the first character + Token::Character(':') if self.buf_has_data() && !self.read_prefix_separator => { + self.buf.push(':'); + self.read_prefix_separator = true; + None + } + + Token::Character(c) if c != ':' && (!self.buf_has_data() && is_name_start_char(c) || + self.buf_has_data() && is_name_char(c)) => + self.append_char_continue(c), + + Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t), + + Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t), + + Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget || + target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t), + + Token::Whitespace(_) => invoke_callback(self, t), + + _ => Some(self_error!(self; "Unexpected token inside qualified name: {}", t)) + } + } + + /// Dispatches tokens in order to process attribute value. + /// + /// # Parameters + /// * `t` --- next token; + /// * `on_value` --- a callback which is called when terminating quote is encountered. 
+ fn read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result> + where F: Fn(&mut PullParser, String) -> Option<Result> { + match t { + Token::Whitespace(_) if self.data.quote.is_none() => None, // skip leading whitespace + + Token::DoubleQuote | Token::SingleQuote => match self.data.quote { + None => { // Entered attribute value + self.data.quote = Some(QuoteToken::from_token(&t)); + None + } + Some(q) if q.as_token() == t => { + self.data.quote = None; + let value = self.take_buf(); + on_value(self, value) + } + _ => { + t.push_to_string(&mut self.buf); + None + } + }, + + Token::ReferenceStart => { + let st = Box::new(self.st.clone()); + self.into_state_continue(State::InsideReference(st)) + } + + Token::OpeningTagStart => + Some(self_error!(self; "Unexpected token inside attribute value: <")), + + // Every character except " and ' and < is okay + _ => { + t.push_to_string(&mut self.buf); + None + } + } + } + + fn emit_start_element(&mut self, emit_end_element: bool) -> Option<Result> { + let mut name = self.data.take_element_name().unwrap(); + let mut attributes = self.data.take_attributes(); + + // check whether the name prefix is bound and fix its namespace + match self.nst.get(name.borrow().prefix_repr()) { + Some("") => name.namespace = None, // default namespace + Some(ns) => name.namespace = Some(ns.into()), + None => return Some(self_error!(self; "Element {} prefix is unbound", name)) + } + + // check and fix accumulated attributes prefixes + for attr in attributes.iter_mut() { + if let Some(ref pfx) = attr.name.prefix { + let new_ns = match self.nst.get(pfx) { + Some("") => None, // default namespace + Some(ns) => Some(ns.into()), + None => return Some(self_error!(self; "Attribute {} prefix is unbound", attr.name)) + }; + attr.name.namespace = new_ns; + } + } + + if emit_end_element { + self.pop_namespace = true; + self.next_event = Some(Ok(XmlEvent::EndElement { + name: name.clone() + })); + } else { + self.est.push(name.clone()); + } 
+ let namespace = self.nst.squash(); + self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement { + name: name, + attributes: attributes, + namespace: namespace + })) + } + + fn emit_end_element(&mut self) -> Option<Result> { + let mut name = self.data.take_element_name().unwrap(); + + // check whether the name prefix is bound and fix its namespace + match self.nst.get(name.borrow().prefix_repr()) { + Some("") => name.namespace = None, // default namespace + Some(ns) => name.namespace = Some(ns.into()), + None => return Some(self_error!(self; "Element {} prefix is unbound", name)) + } + + let op_name = self.est.pop().unwrap(); + + if name == op_name { + self.pop_namespace = true; + self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name: name })) + } else { + Some(self_error!(self; "Unexpected closing tag: {}, expected {}", name, op_name)) + } + } + +} + +#[cfg(test)] +mod tests { + use std::io::BufReader; + + use common::{Position, TextPosition}; + use name::OwnedName; + use attribute::OwnedAttribute; + use reader::parser::PullParser; + use reader::ParserConfig; + use reader::events::XmlEvent; + + fn new_parser() -> PullParser { + PullParser::new(ParserConfig::new()) + } + + macro_rules! expect_event( + ($r:expr, $p:expr, $t:pat) => ( + match $p.next(&mut $r) { + $t => {} + e => panic!("Unexpected event: {:?}", e) + } + ); + ($r:expr, $p:expr, $t:pat => $c:expr ) => ( + match $p.next(&mut $r) { + $t if $c => {} + e => panic!("Unexpected event: {:?}", e) + } + ) + ); + + macro_rules! test_data( + ($d:expr) => ({ + static DATA: &'static str = $d; + let r = BufReader::new(DATA.as_bytes()); + let p = new_parser(); + (r, p) + }) + ); + + #[test] + fn issue_3_semicolon_in_attribute_value() { + let (mut r, mut p) = test_data!(r#" + <a attr="zzz;zzz" /> + "#); + + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. 
})); + expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) => + *name == OwnedName::local("a") && + attributes.len() == 1 && + attributes[0] == OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz") && + namespace.is_essentially_empty() + ); + expect_event!(r, p, Ok(XmlEvent::EndElement { ref name }) => *name == OwnedName::local("a")); + expect_event!(r, p, Ok(XmlEvent::EndDocument)); + } + + #[test] + fn issue_140_entity_reference_inside_tag() { + let (mut r, mut p) = test_data!(r#" + <bla>♫</bla> + "#); + + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla")); + expect_event!(r, p, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}"); + expect_event!(r, p, Ok(XmlEvent::EndElement { ref name, .. }) => *name == OwnedName::local("bla")); + expect_event!(r, p, Ok(XmlEvent::EndDocument)); + } + + #[test] + fn opening_tag_in_attribute_value() { + let (mut r, mut p) = test_data!(r#" + <a attr="zzz<zzz" /> + "#); + + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. 
})); + expect_event!(r, p, Err(ref e) => + e.msg() == "Unexpected token inside attribute value: <" && + e.position() == TextPosition { row: 1, column: 24 } + ); + } +} diff --git a/third_party/rust/xml-rs/src/reader/parser/outside_tag.rs b/third_party/rust/xml-rs/src/reader/parser/outside_tag.rs new file mode 100644 index 0000000000..d3f7598f75 --- /dev/null +++ b/third_party/rust/xml-rs/src/reader/parser/outside_tag.rs @@ -0,0 +1,130 @@ +use common::is_whitespace_char; + +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{ + Result, PullParser, State, ClosingTagSubstate, OpeningTagSubstate, + ProcessingInstructionSubstate, DEFAULT_VERSION, DEFAULT_ENCODING, DEFAULT_STANDALONE +}; + +impl PullParser { + pub fn outside_tag(&mut self, t: Token) -> Option<Result> { + match t { + Token::ReferenceStart => + self.into_state_continue(State::InsideReference(Box::new(State::OutsideTag))), + + Token::Whitespace(_) if self.depth() == 0 && self.config.ignore_root_level_whitespace => None, // skip whitespace outside of the root element + + Token::Whitespace(_) if self.config.trim_whitespace && !self.buf_has_data() => None, + + Token::Whitespace(c) => { + if !self.buf_has_data() { + self.push_pos(); + } + self.append_char_continue(c) + } + + _ if t.contains_char_data() && self.depth() == 0 => + Some(self_error!(self; "Unexpected characters outside the root element: {}", t)), + + _ if t.contains_char_data() => { // Non-whitespace char data + if !self.buf_has_data() { + self.push_pos(); + } + self.inside_whitespace = false; + t.push_to_string(&mut self.buf); + None + } + + Token::ReferenceEnd => { // Semi-colon in a text outside an entity + self.inside_whitespace = false; + Token::ReferenceEnd.push_to_string(&mut self.buf); + None + } + + Token::CommentStart if self.config.coalesce_characters && self.config.ignore_comments => { + // We need to switch the lexer into a comment mode inside comments + self.lexer.inside_comment(); + 
self.into_state_continue(State::InsideComment) + } + + Token::CDataStart if self.config.coalesce_characters && self.config.cdata_to_characters => { + if !self.buf_has_data() { + self.push_pos(); + } + // We need to disable lexing errors inside CDATA + self.lexer.disable_errors(); + self.into_state_continue(State::InsideCData) + } + + _ => { + // Encountered some markup event, flush the buffer as characters + // or a whitespace + let mut next_event = if self.buf_has_data() { + let buf = self.take_buf(); + if self.inside_whitespace && self.config.trim_whitespace { + None + } else if self.inside_whitespace && !self.config.whitespace_to_characters { + Some(Ok(XmlEvent::Whitespace(buf))) + } else if self.config.trim_whitespace { + Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into()))) + } else { + Some(Ok(XmlEvent::Characters(buf))) + } + } else { None }; + self.inside_whitespace = true; // Reset inside_whitespace flag + self.push_pos(); + match t { + Token::ProcessingInstructionStart => + self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event), + + Token::DoctypeStart if !self.encountered_element => { + // We don't have a doctype event so skip this position + // FIXME: update when we have a doctype event + self.next_pos(); + self.lexer.disable_errors(); + self.into_state(State::InsideDoctype, next_event) + } + + Token::OpeningTagStart => { + // If declaration was not parsed and we have encountered an element, + // emit this declaration as the next event. 
+ if !self.parsed_declaration { + self.parsed_declaration = true; + let sd_event = XmlEvent::StartDocument { + version: DEFAULT_VERSION, + encoding: DEFAULT_ENCODING.into(), + standalone: DEFAULT_STANDALONE + }; + // next_event is always none here because we're outside of + // the root element + next_event = Some(Ok(sd_event)); + self.push_pos(); + } + self.encountered_element = true; + self.nst.push_empty(); + self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event) + } + + Token::ClosingTagStart if self.depth() > 0 => + self.into_state(State::InsideClosingTag(ClosingTagSubstate::CTInsideName), next_event), + + Token::CommentStart => { + // We need to switch the lexer into a comment mode inside comments + self.lexer.inside_comment(); + self.into_state(State::InsideComment, next_event) + } + + Token::CDataStart => { + // We need to disable lexing errors inside CDATA + self.lexer.disable_errors(); + self.into_state(State::InsideCData, next_event) + } + + _ => Some(self_error!(self; "Unexpected token: {}", t)) + } + } + } + } +} diff --git a/third_party/rust/xml-rs/src/util.rs b/third_party/rust/xml-rs/src/util.rs new file mode 100644 index 0000000000..23fee04eed --- /dev/null +++ b/third_party/rust/xml-rs/src/util.rs @@ -0,0 +1,107 @@ +use std::io::{self, Read}; +use std::str; +use std::fmt; + +#[derive(Debug)] +pub enum CharReadError { + UnexpectedEof, + Utf8(str::Utf8Error), + Io(io::Error) +} + +impl From<str::Utf8Error> for CharReadError { + fn from(e: str::Utf8Error) -> CharReadError { + CharReadError::Utf8(e) + } +} + +impl From<io::Error> for CharReadError { + fn from(e: io::Error) -> CharReadError { + CharReadError::Io(e) + } +} + +impl fmt::Display for CharReadError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::CharReadError::*; + match *self { + UnexpectedEof => write!(f, "unexpected end of stream"), + Utf8(ref e) => write!(f, "UTF-8 decoding error: {}", e), + Io(ref e) => write!(f, "I/O error: {}", e) + 
} + } +} + +pub fn next_char_from<R: Read>(source: &mut R) -> Result<Option<char>, CharReadError> { + const MAX_CODEPOINT_LEN: usize = 4; + + let mut bytes = source.bytes(); + let mut buf = [0u8; MAX_CODEPOINT_LEN]; + let mut pos = 0; + + loop { + let next = match bytes.next() { + Some(Ok(b)) => b, + Some(Err(e)) => return Err(e.into()), + None if pos == 0 => return Ok(None), + None => return Err(CharReadError::UnexpectedEof) + }; + buf[pos] = next; + pos += 1; + + match str::from_utf8(&buf[..pos]) { + Ok(s) => return Ok(s.chars().next()), // always Some(..) + Err(_) if pos < MAX_CODEPOINT_LEN => {}, + Err(e) => return Err(e.into()) + } + } +} + +#[cfg(test)] +mod tests { + #[test] + fn test_next_char_from() { + use std::io; + use std::error::Error; + + let mut bytes: &[u8] = "correct".as_bytes(); // correct ASCII + assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('c')); + + let mut bytes: &[u8] = "правильно".as_bytes(); // correct BMP + assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('п')); + + let mut bytes: &[u8] = "😊".as_bytes(); // correct non-BMP + assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('😊')); + + let mut bytes: &[u8] = b""; // empty + assert_eq!(super::next_char_from(&mut bytes).unwrap(), None); + + let mut bytes: &[u8] = b"\xf0\x9f\x98"; // incomplete code point + match super::next_char_from(&mut bytes).unwrap_err() { + super::CharReadError::UnexpectedEof => {}, + e => panic!("Unexpected result: {:?}", e) + }; + + let mut bytes: &[u8] = b"\xff\x9f\x98\x32"; // invalid code point + match super::next_char_from(&mut bytes).unwrap_err() { + super::CharReadError::Utf8(_) => {}, + e => panic!("Unexpected result: {:?}", e) + }; + + + // error during read + struct ErrorReader; + impl io::Read for ErrorReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { + Err(io::Error::new(io::ErrorKind::Other, "test error")) + } + } + + let mut r = ErrorReader; + match super::next_char_from(&mut r).unwrap_err() { + 
super::CharReadError::Io(ref e) if e.kind() == io::ErrorKind::Other && + e.description() == "test error" => {}, + e => panic!("Unexpected result: {:?}", e) + } + } +} diff --git a/third_party/rust/xml-rs/src/writer/config.rs b/third_party/rust/xml-rs/src/writer/config.rs new file mode 100644 index 0000000000..ebabf181f0 --- /dev/null +++ b/third_party/rust/xml-rs/src/writer/config.rs @@ -0,0 +1,157 @@ +//! Contains emitter configuration structure. + +use std::io::Write; +use std::borrow::Cow; + +use writer::EventWriter; + +/// Emitter configuration structure. +/// +/// This structure contains various options which control XML document emitter behavior. +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct EmitterConfig { + /// Line separator used to separate lines in formatted output. Default is `"\n"`. + pub line_separator: Cow<'static, str>, + + /// A string which will be used for a single level of indentation. Default is `" "` + /// (two spaces). + pub indent_string: Cow<'static, str>, + + /// Whether or not the emitted document should be indented. Default is false. + /// + /// The emitter is capable to perform automatic indentation of the emitted XML document. + /// It is done in stream-like fashion and does not require the knowledge of the whole + /// document in advance. + /// + /// Sometimes, however, automatic indentation is undesirable, e.g. when you want to keep + /// existing layout when processing an existing XML document. Also the indentiation algorithm + /// is not thoroughly tested. Hence by default it is disabled. + pub perform_indent: bool, + + /// Whether or not characters in output events will be escaped. Default is true. + /// + /// The emitter can automatically escape characters which can't appear in PCDATA sections + /// or element attributes of an XML document, like `<` or `"` (in attributes). This may + /// introduce some overhead because then every corresponding piece of character data + /// should be scanned for invalid characters. 
+ /// + /// If this option is disabled, the XML writer may produce non-well-formed documents, so + /// use `false` value for this option with care. + pub perform_escaping: bool, + + /// Whether or not to write XML document declaration at the beginning of a document. + /// Default is true. + /// + /// This option controls whether the document declaration should be emitted automatically + /// before a root element is written if it was not emitted explicitly by the user. + pub write_document_declaration: bool, + + /// Whether or not to convert elements with empty content to empty elements. Default is true. + /// + /// This option allows turning elements like `<a></a>` (an element with empty content) + /// into `<a />` (an empty element). + pub normalize_empty_elements: bool, + + /// Whether or not to emit CDATA events as plain characters. Default is false. + /// + /// This option forces the emitter to convert CDATA events into regular character events, + /// performing all the necessary escaping beforehand. This may be occasionally useful + /// for feeding the document into incorrect parsers which do not support CDATA. + pub cdata_to_characters: bool, + + /// Whether or not to keep element names to support `EndElement` events without explicit names. + /// Default is true. + /// + /// This option makes the emitter to keep names of written elements in order to allow + /// omitting names when writing closing element tags. This could incur some memory overhead. + pub keep_element_names_stack: bool, + + /// Whether or not to automatically insert leading and trailing spaces in emitted comments, + /// if necessary. Default is true. + /// + /// This is a convenience option in order for the user not to append spaces before and after + /// comments text in order to get more pretty comments: `<!-- something -->` instead of + /// `<!--something-->`. 
+ pub autopad_comments: bool, + + /// Whether or not to automatically insert spaces before the trailing `/>` in self-closing + /// elements. Default is true. + /// + /// This option is only meaningful if `normalize_empty_elements` is true. For example, the + /// element `<a></a>` would be unaffected. When `normalize_empty_elements` is true, then when + /// this option is also true, the same element would appear `<a />`. If this option is false, + /// then the same element would appear `<a/>`. + pub pad_self_closing: bool, +} + +impl EmitterConfig { + /// Creates an emitter configuration with default values. + /// + /// You can tweak default options with builder-like pattern: + /// + /// ```rust + /// use xml::writer::EmitterConfig; + /// + /// let config = EmitterConfig::new() + /// .line_separator("\r\n") + /// .perform_indent(true) + /// .normalize_empty_elements(false); + /// ``` + #[inline] + pub fn new() -> EmitterConfig { + EmitterConfig { + line_separator: "\n".into(), + indent_string: " ".into(), // two spaces + perform_indent: false, + perform_escaping: true, + write_document_declaration: true, + normalize_empty_elements: true, + cdata_to_characters: false, + keep_element_names_stack: true, + autopad_comments: true, + pad_self_closing: true + } + } + + /// Creates an XML writer with this configuration. + /// + /// This is a convenience method for configuring and creating a writer at the same time: + /// + /// ```rust + /// use xml::writer::EmitterConfig; + /// + /// let mut target: Vec<u8> = Vec::new(); + /// + /// let writer = EmitterConfig::new() + /// .line_separator("\r\n") + /// .perform_indent(true) + /// .normalize_empty_elements(false) + /// .create_writer(&mut target); + /// ``` + /// + /// This method is exactly equivalent to calling `EventWriter::new_with_config()` with + /// this configuration object. 
+ #[inline] + pub fn create_writer<W: Write>(self, sink: W) -> EventWriter<W> { + EventWriter::new_with_config(sink, self) + } +} + +impl Default for EmitterConfig { + #[inline] + fn default() -> EmitterConfig { + EmitterConfig::new() + } +} + +gen_setters!(EmitterConfig, + line_separator: into Cow<'static, str>, + indent_string: into Cow<'static, str>, + perform_indent: val bool, + write_document_declaration: val bool, + normalize_empty_elements: val bool, + cdata_to_characters: val bool, + keep_element_names_stack: val bool, + autopad_comments: val bool, + pad_self_closing: val bool +); diff --git a/third_party/rust/xml-rs/src/writer/emitter.rs b/third_party/rust/xml-rs/src/writer/emitter.rs new file mode 100644 index 0000000000..ba80f66781 --- /dev/null +++ b/third_party/rust/xml-rs/src/writer/emitter.rs @@ -0,0 +1,447 @@ +use std::io; +use std::io::prelude::*; +use std::fmt; +use std::result; +use std::borrow::Cow; +use std::error::Error; + +use common; +use name::{Name, OwnedName}; +use attribute::Attribute; +use escape::{escape_str_attribute, escape_str_pcdata}; +use common::XmlVersion; +use namespace::{NamespaceStack, NS_NO_PREFIX, NS_EMPTY_URI, NS_XMLNS_PREFIX, NS_XML_PREFIX}; + +use writer::config::EmitterConfig; + +/// An error which may be returned by `XmlWriter` when writing XML events. +#[derive(Debug)] +pub enum EmitterError { + /// An I/O error occured in the underlying `Write` instance. + Io(io::Error), + + /// Document declaration has already been written to the output stream. + DocumentStartAlreadyEmitted, + + /// The name of the last opening element is not available. + LastElementNameNotAvailable, + + /// The name of the last opening element is not equal to the name of the provided + /// closing element. + EndElementNameIsNotEqualToLastStartElementName, + + /// End element name is not specified when it is needed, for example, when automatic + /// closing is not enabled in configuration. 
+ EndElementNameIsNotSpecified +} + +impl From<io::Error> for EmitterError { + fn from(err: io::Error) -> EmitterError { + EmitterError::Io(err) + } +} + +impl fmt::Display for EmitterError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + + write!(f, "emitter error: ")?; + match *self { + EmitterError::Io(ref e) => + write!(f, "I/O error: {}", e), + ref other => + write!(f, "{}", other.description()), + } + } +} + +impl Error for EmitterError { + fn description(&self) -> &str { + match *self { + EmitterError::Io(_) => + "I/O error", + EmitterError::DocumentStartAlreadyEmitted => + "document start event has already been emitted", + EmitterError::LastElementNameNotAvailable => + "last element name is not available", + EmitterError::EndElementNameIsNotEqualToLastStartElementName => + "end element name is not equal to last start element name", + EmitterError::EndElementNameIsNotSpecified => + "end element name is not specified and can't be inferred", + } + } +} + +/// A result type yielded by `XmlWriter`. +pub type Result<T> = result::Result<T, EmitterError>; + +// TODO: split into a low-level fast writer without any checks and formatting logic and a +// high-level indenting validating writer +pub struct Emitter { + config: EmitterConfig, + + nst: NamespaceStack, + + indent_level: usize, + indent_stack: Vec<IndentFlags>, + + element_names: Vec<OwnedName>, + + start_document_emitted: bool, + just_wrote_start_element: bool +} + +impl Emitter { + pub fn new(config: EmitterConfig) -> Emitter { + Emitter { + config, + + nst: NamespaceStack::empty(), + + indent_level: 0, + indent_stack: vec![IndentFlags::WroteNothing], + + element_names: Vec::new(), + + start_document_emitted: false, + just_wrote_start_element: false + } + } +} + +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +enum IndentFlags { + WroteNothing, + WroteMarkup, + WroteText, +} + +impl Emitter { + /// Returns the current state of namespaces. 
+ #[inline] + pub fn namespace_stack_mut(&mut self) -> &mut NamespaceStack { + &mut self.nst + } + + #[inline] + fn wrote_text(&self) -> bool { + *self.indent_stack.last().unwrap() == IndentFlags::WroteText + } + + #[inline] + fn wrote_markup(&self) -> bool { + *self.indent_stack.last().unwrap() == IndentFlags::WroteMarkup + } + + #[inline] + fn set_wrote_text(&mut self) { + *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteText; + } + + #[inline] + fn set_wrote_markup(&mut self) { + *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteMarkup; + } + + #[inline] + fn reset_state(&mut self) { + *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteNothing; + } + + fn write_newline<W: Write>(&mut self, target: &mut W, level: usize) -> Result<()> { + target.write_all(self.config.line_separator.as_bytes())?; + for _ in 0..level { + target.write_all(self.config.indent_string.as_bytes())?; + } + Ok(()) + } + + fn before_markup<W: Write>(&mut self, target: &mut W) -> Result<()> { + if self.config.perform_indent && !self.wrote_text() && + (self.indent_level > 0 || self.wrote_markup()) { + let indent_level = self.indent_level; + self.write_newline(target, indent_level)?; + if self.indent_level > 0 && self.config.indent_string.len() > 0 { + self.after_markup(); + } + } + Ok(()) + } + + fn after_markup(&mut self) { + self.set_wrote_markup(); + } + + fn before_start_element<W: Write>(&mut self, target: &mut W) -> Result<()> { + self.before_markup(target)?; + self.indent_stack.push(IndentFlags::WroteNothing); + Ok(()) + } + + fn after_start_element(&mut self) { + self.after_markup(); + self.indent_level += 1; + } + + fn before_end_element<W: Write>(&mut self, target: &mut W) -> Result<()> { + if self.config.perform_indent && self.indent_level > 0 && self.wrote_markup() && + !self.wrote_text() { + let indent_level = self.indent_level; + self.write_newline(target, indent_level - 1) + } else { + Ok(()) + } + } + + fn after_end_element(&mut self) { + if 
self.indent_level > 0 { + self.indent_level -= 1; + self.indent_stack.pop(); + } + self.set_wrote_markup(); + } + + fn after_text(&mut self) { + self.set_wrote_text(); + } + + pub fn emit_start_document<W: Write>(&mut self, target: &mut W, + version: XmlVersion, + encoding: &str, + standalone: Option<bool>) -> Result<()> { + if self.start_document_emitted { + return Err(EmitterError::DocumentStartAlreadyEmitted); + } + self.start_document_emitted = true; + + self.before_markup(target)?; + let result = { + let mut write = move || { + write!(target, "<?xml version=\"{}\" encoding=\"{}\"", version, encoding)?; + + if let Some(standalone) = standalone { + write!(target, " standalone=\"{}\"", if standalone { "yes" } else { "no" })?; + } + + write!(target, "?>")?; + + Ok(()) + }; + write() + }; + self.after_markup(); + + result + } + + fn check_document_started<W: Write>(&mut self, target: &mut W) -> Result<()> { + if !self.start_document_emitted && self.config.write_document_declaration { + self.emit_start_document(target, common::XmlVersion::Version10, "utf-8", None) + } else { + Ok(()) + } + } + + fn fix_non_empty_element<W: Write>(&mut self, target: &mut W) -> Result<()> { + if self.config.normalize_empty_elements && self.just_wrote_start_element { + self.just_wrote_start_element = false; + target.write_all(b">").map_err(From::from) + } else { + Ok(()) + } + } + + pub fn emit_processing_instruction<W: Write>(&mut self, + target: &mut W, + name: &str, + data: Option<&str>) -> Result<()> { + self.check_document_started(target)?; + self.fix_non_empty_element(target)?; + + self.before_markup(target)?; + + let result = { + let mut write = || { + write!(target, "<?{}", name)?; + + if let Some(data) = data { + write!(target, " {}", data)?; + } + + write!(target, "?>")?; + + Ok(()) + }; + write() + }; + + self.after_markup(); + + result + } + + fn emit_start_element_initial<W>(&mut self, target: &mut W, + name: Name, + attributes: &[Attribute]) -> Result<()> + where W: Write 
+ { + self.check_document_started(target)?; + self.fix_non_empty_element(target)?; + self.before_start_element(target)?; + write!(target, "<{}", name.repr_display())?; + self.emit_current_namespace_attributes(target)?; + self.emit_attributes(target, attributes)?; + self.after_start_element(); + Ok(()) + } + + pub fn emit_start_element<W>(&mut self, target: &mut W, + name: Name, + attributes: &[Attribute]) -> Result<()> + where W: Write + { + if self.config.keep_element_names_stack { + self.element_names.push(name.to_owned()); + } + + self.emit_start_element_initial(target, name, attributes)?; + self.just_wrote_start_element = true; + + if !self.config.normalize_empty_elements { + write!(target, ">")?; + } + + Ok(()) + } + + pub fn emit_current_namespace_attributes<W>(&mut self, target: &mut W) -> Result<()> + where W: Write + { + for (prefix, uri) in self.nst.peek() { + match prefix { + // internal namespaces are not emitted + NS_XMLNS_PREFIX | NS_XML_PREFIX => Ok(()), + //// there is already a namespace binding with this prefix in scope + //prefix if self.nst.get(prefix) == Some(uri) => Ok(()), + // emit xmlns only if it is overridden + NS_NO_PREFIX => if uri != NS_EMPTY_URI { + write!(target, " xmlns=\"{}\"", uri) + } else { Ok(()) }, + // everything else + prefix => write!(target, " xmlns:{}=\"{}\"", prefix, uri) + }?; + } + Ok(()) + } + + pub fn emit_attributes<W: Write>(&mut self, target: &mut W, + attributes: &[Attribute]) -> Result<()> { + for attr in attributes.iter() { + write!( + target, " {}=\"{}\"", + attr.name.repr_display(), + if self.config.perform_escaping { escape_str_attribute(attr.value) } else { Cow::Borrowed(attr.value) } + )? + } + Ok(()) + } + + pub fn emit_end_element<W: Write>(&mut self, target: &mut W, + name: Option<Name>) -> Result<()> { + let owned_name = if self.config.keep_element_names_stack { + Some(self.element_names.pop().ok_or(EmitterError::LastElementNameNotAvailable)?) 
+ } else { + None + }; + + // Check that last started element name equals to the provided name, if there are both + if let Some(ref last_name) = owned_name { + if let Some(ref name) = name { + if last_name.borrow() != *name { + return Err(EmitterError::EndElementNameIsNotEqualToLastStartElementName); + } + } + } + + if let Some(name) = owned_name.as_ref().map(|n| n.borrow()).or(name) { + if self.config.normalize_empty_elements && self.just_wrote_start_element { + self.just_wrote_start_element = false; + let termination = if self.config.pad_self_closing { " />" } else { "/>" }; + let result = target.write_all(termination.as_bytes()).map_err(From::from); + self.after_end_element(); + result + } else { + self.just_wrote_start_element = false; + + self.before_end_element(target)?; + let result = write!(target, "</{}>", name.repr_display()).map_err(From::from); + self.after_end_element(); + + result + } + } else { + Err(EmitterError::EndElementNameIsNotSpecified) + } + } + + pub fn emit_cdata<W: Write>(&mut self, target: &mut W, content: &str) -> Result<()> { + self.fix_non_empty_element(target)?; + if self.config.cdata_to_characters { + self.emit_characters(target, content) + } else { + // TODO: escape ']]>' characters in CDATA as two adjacent CDATA blocks + target.write_all(b"<![CDATA[")?; + target.write_all(content.as_bytes())?; + target.write_all(b"]]>")?; + + self.after_text(); + + Ok(()) + } + } + + pub fn emit_characters<W: Write>(&mut self, target: &mut W, + content: &str) -> Result<()> { + self.check_document_started(target)?; + self.fix_non_empty_element(target)?; + target.write_all( + (if self.config.perform_escaping { + escape_str_pcdata(content) + } else { + Cow::Borrowed(content) + }).as_bytes() + )?; + self.after_text(); + Ok(()) + } + + pub fn emit_comment<W: Write>(&mut self, target: &mut W, content: &str) -> Result<()> { + self.fix_non_empty_element(target)?; + + // TODO: add escaping dashes at the end of the comment + + let autopad_comments = 
self.config.autopad_comments; + let write = |target: &mut W| -> Result<()> { + target.write_all(b"<!--")?; + + if autopad_comments && !content.starts_with(char::is_whitespace) { + target.write_all(b" ")?; + } + + target.write_all(content.as_bytes())?; + + if autopad_comments && !content.ends_with(char::is_whitespace) { + target.write_all(b" ")?; + } + + target.write_all(b"-->")?; + + Ok(()) + }; + + self.before_markup(target)?; + let result = write(target); + self.after_markup(); + + result + } +} diff --git a/third_party/rust/xml-rs/src/writer/events.rs b/third_party/rust/xml-rs/src/writer/events.rs new file mode 100644 index 0000000000..1f7040f66a --- /dev/null +++ b/third_party/rust/xml-rs/src/writer/events.rs @@ -0,0 +1,241 @@ +//! Contains `XmlEvent` datatype, instances of which are consumed by the writer. + +use std::borrow::Cow; + +use name::Name; +use attribute::Attribute; +use common::XmlVersion; +use namespace::{Namespace, NS_NO_PREFIX}; + +/// A part of an XML output stream. +/// +/// Objects of this enum are consumed by `EventWriter`. They correspond to different parts of +/// an XML document. +#[derive(Debug)] +pub enum XmlEvent<'a> { + /// Corresponds to XML document declaration. + /// + /// This event should always be written before any other event. If it is not written + /// at all, a default XML declaration will be outputted if the corresponding option + /// is set in the configuration. Otherwise an error will be returned. + StartDocument { + /// XML version. + /// + /// Defaults to `XmlVersion::Version10`. + version: XmlVersion, + + /// XML document encoding. + /// + /// Defaults to `Some("UTF-8")`. + encoding: Option<&'a str>, + + /// XML standalone declaration. + /// + /// Defaults to `None`. + standalone: Option<bool> + }, + + /// Denotes an XML processing instruction. + ProcessingInstruction { + /// Processing instruction target. + name: &'a str, + + /// Processing instruction content. 
+ data: Option<&'a str> + }, + + /// Denotes a beginning of an XML element. + StartElement { + /// Qualified name of the element. + name: Name<'a>, + + /// A list of attributes associated with the element. + /// + /// Currently attributes are not checked for duplicates (TODO). Attribute values + /// will be escaped, and all characters invalid for attribute values like `"` or `<` + /// will be changed into character entities. + attributes: Cow<'a, [Attribute<'a>]>, + + /// Contents of the namespace mapping at this point of the document. + /// + /// This mapping will be inspected for "new" entries, and if at this point of the document + /// a particular pair of prefix and namespace URI is already defined, no namespace + /// attributes will be emitted. + namespace: Cow<'a, Namespace>, + }, + + /// Denotes an end of an XML element. + EndElement { + /// Optional qualified name of the element. + /// + /// If `None`, then it is assumed that the element name should be the last valid one. + /// If `Some` and element names tracking is enabled, then the writer will check it for + /// correctness. + name: Option<Name<'a>> + }, + + /// Denotes CDATA content. + /// + /// This event contains unparsed data, and no escaping will be performed when writing it + /// to the output stream. + CData(&'a str), + + /// Denotes a comment. + /// + /// The string will be checked for invalid sequences and error will be returned by the + /// write operation + Comment(&'a str), + + /// Denotes character data outside of tags. + /// + /// Contents of this event will be escaped if `perform_escaping` option is enabled, + /// that is, every character invalid for PCDATA will appear as a character entity. + Characters(&'a str) +} + +impl<'a> XmlEvent<'a> { + /// Returns an writer event for a processing instruction. 
+ #[inline] + pub fn processing_instruction(name: &'a str, data: Option<&'a str>) -> XmlEvent<'a> { + XmlEvent::ProcessingInstruction { name: name, data: data } + } + + /// Returns a builder for a starting element. + /// + /// This builder can then be used to tweak attributes and namespace starting at + /// this element. + #[inline] + pub fn start_element<S>(name: S) -> StartElementBuilder<'a> where S: Into<Name<'a>> { + StartElementBuilder { + name: name.into(), + attributes: Vec::new(), + namespace: Namespace::empty().into() + } + } + + /// Returns a builder for an closing element. + /// + /// This method, unline `start_element()`, does not accept a name because by default + /// the writer is able to determine it automatically. However, when this functionality + /// is disabled, it is possible to specify the name with `name()` method on the builder. + #[inline] + pub fn end_element() -> EndElementBuilder<'a> { + EndElementBuilder { name: None } + } + + /// Returns a CDATA event. + /// + /// Naturally, the provided string won't be escaped, except for closing CDATA token `]]>` + /// (depending on the configuration). + #[inline] + pub fn cdata(data: &'a str) -> XmlEvent<'a> { XmlEvent::CData(data) } + + /// Returns a regular characters (PCDATA) event. + /// + /// All offending symbols, in particular, `&` and `<`, will be escaped by the writer. + #[inline] + pub fn characters(data: &'a str) -> XmlEvent<'a> { XmlEvent::Characters(data) } + + /// Returns a comment event. + #[inline] + pub fn comment(data: &'a str) -> XmlEvent<'a> { XmlEvent::Comment(data) } +} + +impl<'a> From<&'a str> for XmlEvent<'a> { + #[inline] + fn from(s: &'a str) -> XmlEvent<'a> { XmlEvent::Characters(s) } +} + +pub struct EndElementBuilder<'a> { + name: Option<Name<'a>> +} + +/// A builder for a closing element event. +impl<'a> EndElementBuilder<'a> { + /// Sets the name of this closing element. + /// + /// Usually the writer is able to determine closing element names automatically. 
If + /// this functionality is enabled (by default it is), then this name is checked for correctness. + /// It is possible, however, to disable such behavior; then the user must ensure that + /// closing element name is correct manually. + #[inline] + pub fn name<N>(mut self, name: N) -> EndElementBuilder<'a> where N: Into<Name<'a>> { + self.name = Some(name.into()); + self + } +} + +impl<'a> From<EndElementBuilder<'a>> for XmlEvent<'a> { + fn from(b: EndElementBuilder<'a>) -> XmlEvent<'a> { + XmlEvent::EndElement { name: b.name } + } +} + +/// A builder for a starting element event. +pub struct StartElementBuilder<'a> { + name: Name<'a>, + attributes: Vec<Attribute<'a>>, + namespace: Namespace +} + +impl<'a> StartElementBuilder<'a> { + /// Sets an attribute value of this element to the given string. + /// + /// This method can be used to add attributes to the starting element. Name is a qualified + /// name; its namespace is ignored, but its prefix is checked for correctness, that is, + /// it is checked that the prefix is bound to some namespace in the current context. + /// + /// Currently attributes are not checked for duplicates. Note that duplicate attributes + /// are a violation of XML document well-formedness. + /// + /// The writer checks that you don't specify reserved prefix names, for example `xmlns`. + #[inline] + pub fn attr<N>(mut self, name: N, value: &'a str) -> StartElementBuilder<'a> + where N: Into<Name<'a>> + { + self.attributes.push(Attribute::new(name.into(), value)); + self + } + + /// Adds a namespace to the current namespace context. + /// + /// If no namespace URI was bound to the provided prefix at this point of the document, + /// then the mapping from the prefix to the provided namespace URI will be written as + /// a part of this element attribute set. + /// + /// If the same namespace URI was bound to the provided prefix at this point of the document, + /// then no namespace attributes will be emitted. 
+ /// + /// If some other namespace URI was bound to the provided prefix at this point of the document, + /// then another binding will be added as a part of this element attribute set, shadowing + /// the outer binding. + #[inline] + pub fn ns<S1, S2>(mut self, prefix: S1, uri: S2) -> StartElementBuilder<'a> + where S1: Into<String>, S2: Into<String> + { + self.namespace.put(prefix, uri); + self + } + + /// Adds a default namespace mapping to the current namespace context. + /// + /// Same rules as for `ns()` are also valid for the default namespace mapping. + #[inline] + pub fn default_ns<S>(mut self, uri: S) -> StartElementBuilder<'a> + where S: Into<String> + { + self.namespace.put(NS_NO_PREFIX, uri); + self + } +} + +impl<'a> From<StartElementBuilder<'a>> for XmlEvent<'a> { + #[inline] + fn from(b: StartElementBuilder<'a>) -> XmlEvent<'a> { + XmlEvent::StartElement { + name: b.name, + attributes: Cow::Owned(b.attributes), + namespace: Cow::Owned(b.namespace) + } + } +} diff --git a/third_party/rust/xml-rs/src/writer/mod.rs b/third_party/rust/xml-rs/src/writer/mod.rs new file mode 100644 index 0000000000..ea1b24266f --- /dev/null +++ b/third_party/rust/xml-rs/src/writer/mod.rs @@ -0,0 +1,93 @@ +//! Contains high-level interface for an events-based XML emitter. +//! +//! The most important type in this module is `EventWriter` which allows writing an XML document +//! to some output stream. + +pub use self::emitter::Result; +pub use self::emitter::EmitterError as Error; +pub use self::config::EmitterConfig; +pub use self::events::XmlEvent; + +use self::emitter::Emitter; + +use std::io::prelude::*; + +mod emitter; +mod config; +pub mod events; + +/// A wrapper around an `std::io::Write` instance which emits XML document according to provided +/// events. +pub struct EventWriter<W> { + sink: W, + emitter: Emitter +} + +impl<W: Write> EventWriter<W> { + /// Creates a new `EventWriter` out of an `std::io::Write` instance using the default + /// configuration. 
+ #[inline] + pub fn new(sink: W) -> EventWriter<W> { + EventWriter::new_with_config(sink, EmitterConfig::new()) + } + + /// Creates a new `EventWriter` out of an `std::io::Write` instance using the provided + /// configuration. + #[inline] + pub fn new_with_config(sink: W, config: EmitterConfig) -> EventWriter<W> { + EventWriter { + sink, + emitter: Emitter::new(config) + } + } + + /// Writes the next piece of XML document according to the provided event. + /// + /// Note that output data may not exactly correspond to the written event because + /// of various configuration options. For example, `XmlEvent::EndElement` may + /// correspond to a separate closing element or it may cause writing an empty element. + /// Another example is that `XmlEvent::CData` may be represented as characters in + /// the output stream. + pub fn write<'a, E>(&mut self, event: E) -> Result<()> where E: Into<XmlEvent<'a>> { + match event.into() { + XmlEvent::StartDocument { version, encoding, standalone } => + self.emitter.emit_start_document(&mut self.sink, version, encoding.unwrap_or("UTF-8"), standalone), + XmlEvent::ProcessingInstruction { name, data } => + self.emitter.emit_processing_instruction(&mut self.sink, name, data), + XmlEvent::StartElement { name, attributes, namespace } => { + self.emitter.namespace_stack_mut().push_empty().checked_target().extend(namespace.as_ref()); + self.emitter.emit_start_element(&mut self.sink, name, &attributes) + } + XmlEvent::EndElement { name } => { + let r = self.emitter.emit_end_element(&mut self.sink, name); + self.emitter.namespace_stack_mut().try_pop(); + r + } + XmlEvent::Comment(content) => + self.emitter.emit_comment(&mut self.sink, content), + XmlEvent::CData(content) => + self.emitter.emit_cdata(&mut self.sink, content), + XmlEvent::Characters(content) => + self.emitter.emit_characters(&mut self.sink, content) + } + } + + /// Returns a mutable reference to the underlying `Writer`. 
+ /// + /// Note that having a reference to the underlying sink makes it very easy to emit invalid XML + /// documents. Use this method with care. Valid use cases for this method include accessing + /// methods like `Write::flush`, which do not emit new data but rather change the state + /// of the stream itself. + pub fn inner_mut(&mut self) -> &mut W { + &mut self.sink + } + + /// Unwraps this `EventWriter`, returning the underlying writer. + /// + /// Note that this is a destructive operation: unwrapping a writer and then wrapping + /// it again with `EventWriter::new()` will create a fresh writer whose state will be + /// blank; for example, accumulated namespaces will be reset. + pub fn into_inner(self) -> W { + self.sink + } +} |