diff options
Diffstat (limited to 'third_party/rust/yaml-rust/src')
-rw-r--r-- | third_party/rust/yaml-rust/src/emitter.rs | 635 | ||||
-rw-r--r-- | third_party/rust/yaml-rust/src/lib.rs | 121 | ||||
-rw-r--r-- | third_party/rust/yaml-rust/src/parser.rs | 858 | ||||
-rw-r--r-- | third_party/rust/yaml-rust/src/scanner.rs | 2182 | ||||
-rw-r--r-- | third_party/rust/yaml-rust/src/yaml.rs | 739 |
5 files changed, 4535 insertions, 0 deletions
diff --git a/third_party/rust/yaml-rust/src/emitter.rs b/third_party/rust/yaml-rust/src/emitter.rs new file mode 100644 index 0000000000..f20a3ed679 --- /dev/null +++ b/third_party/rust/yaml-rust/src/emitter.rs @@ -0,0 +1,635 @@ +use std::convert::From; +use std::error::Error; +use std::fmt::{self, Display}; +use crate::yaml::{Hash, Yaml}; + +#[derive(Copy, Clone, Debug)] +pub enum EmitError { + FmtError(fmt::Error), + BadHashmapKey, +} + +impl Error for EmitError { + fn cause(&self) -> Option<&dyn Error> { + None + } +} + +impl Display for EmitError { + fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + match *self { + EmitError::FmtError(ref err) => Display::fmt(err, formatter), + EmitError::BadHashmapKey => formatter.write_str("bad hashmap key"), + } + } +} + +impl From<fmt::Error> for EmitError { + fn from(f: fmt::Error) -> Self { + EmitError::FmtError(f) + } +} + +pub struct YamlEmitter<'a> { + writer: &'a mut dyn fmt::Write, + best_indent: usize, + compact: bool, + + level: isize, +} + +pub type EmitResult = Result<(), EmitError>; + +// from serialize::json +fn escape_str(wr: &mut dyn fmt::Write, v: &str) -> Result<(), fmt::Error> { + wr.write_str("\"")?; + + let mut start = 0; + + for (i, byte) in v.bytes().enumerate() { + let escaped = match byte { + b'"' => "\\\"", + b'\\' => "\\\\", + b'\x00' => "\\u0000", + b'\x01' => "\\u0001", + b'\x02' => "\\u0002", + b'\x03' => "\\u0003", + b'\x04' => "\\u0004", + b'\x05' => "\\u0005", + b'\x06' => "\\u0006", + b'\x07' => "\\u0007", + b'\x08' => "\\b", + b'\t' => "\\t", + b'\n' => "\\n", + b'\x0b' => "\\u000b", + b'\x0c' => "\\f", + b'\r' => "\\r", + b'\x0e' => "\\u000e", + b'\x0f' => "\\u000f", + b'\x10' => "\\u0010", + b'\x11' => "\\u0011", + b'\x12' => "\\u0012", + b'\x13' => "\\u0013", + b'\x14' => "\\u0014", + b'\x15' => "\\u0015", + b'\x16' => "\\u0016", + b'\x17' => "\\u0017", + b'\x18' => "\\u0018", + b'\x19' => "\\u0019", + b'\x1a' => "\\u001a", + b'\x1b' => "\\u001b", + b'\x1c' => "\\u001c", + 
b'\x1d' => "\\u001d", + b'\x1e' => "\\u001e", + b'\x1f' => "\\u001f", + b'\x7f' => "\\u007f", + _ => continue, + }; + + if start < i { + wr.write_str(&v[start..i])?; + } + + wr.write_str(escaped)?; + + start = i + 1; + } + + if start != v.len() { + wr.write_str(&v[start..])?; + } + + wr.write_str("\"")?; + Ok(()) +} + +impl<'a> YamlEmitter<'a> { + pub fn new(writer: &'a mut dyn fmt::Write) -> YamlEmitter { + YamlEmitter { + writer, + best_indent: 2, + compact: true, + level: -1, + } + } + + /// Set 'compact inline notation' on or off, as described for block + /// [sequences](http://www.yaml.org/spec/1.2/spec.html#id2797382) + /// and + /// [mappings](http://www.yaml.org/spec/1.2/spec.html#id2798057). + /// + /// In this form, blocks cannot have any properties (such as anchors + /// or tags), which should be OK, because this emitter doesn't + /// (currently) emit those anyways. + pub fn compact(&mut self, compact: bool) { + self.compact = compact; + } + + /// Determine if this emitter is using 'compact inline notation'. 
+ pub fn is_compact(&self) -> bool { + self.compact + } + + pub fn dump(&mut self, doc: &Yaml) -> EmitResult { + // write DocumentStart + writeln!(self.writer, "---")?; + self.level = -1; + self.emit_node(doc) + } + + fn write_indent(&mut self) -> EmitResult { + if self.level <= 0 { + return Ok(()); + } + for _ in 0..self.level { + for _ in 0..self.best_indent { + write!(self.writer, " ")?; + } + } + Ok(()) + } + + fn emit_node(&mut self, node: &Yaml) -> EmitResult { + match *node { + Yaml::Array(ref v) => self.emit_array(v), + Yaml::Hash(ref h) => self.emit_hash(h), + Yaml::String(ref v) => { + if need_quotes(v) { + escape_str(self.writer, v)?; + } else { + write!(self.writer, "{}", v)?; + } + Ok(()) + } + Yaml::Boolean(v) => { + if v { + self.writer.write_str("true")?; + } else { + self.writer.write_str("false")?; + } + Ok(()) + } + Yaml::Integer(v) => { + write!(self.writer, "{}", v)?; + Ok(()) + } + Yaml::Real(ref v) => { + write!(self.writer, "{}", v)?; + Ok(()) + } + Yaml::Null | Yaml::BadValue => { + write!(self.writer, "~")?; + Ok(()) + } + // XXX(chenyh) Alias + _ => Ok(()), + } + } + + fn emit_array(&mut self, v: &[Yaml]) -> EmitResult { + if v.is_empty() { + write!(self.writer, "[]")?; + } else { + self.level += 1; + for (cnt, x) in v.iter().enumerate() { + if cnt > 0 { + writeln!(self.writer)?; + self.write_indent()?; + } + write!(self.writer, "-")?; + self.emit_val(true, x)?; + } + self.level -= 1; + } + Ok(()) + } + + fn emit_hash(&mut self, h: &Hash) -> EmitResult { + if h.is_empty() { + self.writer.write_str("{}")?; + } else { + self.level += 1; + for (cnt, (k, v)) in h.iter().enumerate() { + let complex_key = match *k { + Yaml::Hash(_) | Yaml::Array(_) => true, + _ => false, + }; + if cnt > 0 { + writeln!(self.writer)?; + self.write_indent()?; + } + if complex_key { + write!(self.writer, "?")?; + self.emit_val(true, k)?; + writeln!(self.writer)?; + self.write_indent()?; + write!(self.writer, ":")?; + self.emit_val(true, v)?; + } else { + 
self.emit_node(k)?; + write!(self.writer, ":")?; + self.emit_val(false, v)?; + } + } + self.level -= 1; + } + Ok(()) + } + + /// Emit a yaml as a hash or array value: i.e., which should appear + /// following a ":" or "-", either after a space, or on a new line. + /// If `inline` is true, then the preceding characters are distinct + /// and short enough to respect the compact flag. + fn emit_val(&mut self, inline: bool, val: &Yaml) -> EmitResult { + match *val { + Yaml::Array(ref v) => { + if (inline && self.compact) || v.is_empty() { + write!(self.writer, " ")?; + } else { + writeln!(self.writer)?; + self.level += 1; + self.write_indent()?; + self.level -= 1; + } + self.emit_array(v) + } + Yaml::Hash(ref h) => { + if (inline && self.compact) || h.is_empty() { + write!(self.writer, " ")?; + } else { + writeln!(self.writer)?; + self.level += 1; + self.write_indent()?; + self.level -= 1; + } + self.emit_hash(h) + } + _ => { + write!(self.writer, " ")?; + self.emit_node(val) + } + } + } +} + +/// Check if the string requires quoting. +/// Strings starting with any of the following characters must be quoted. +/// :, &, *, ?, |, -, <, >, =, !, %, @ +/// Strings containing any of the following characters must be quoted. +/// {, }, [, ], ,, #, ` +/// +/// If the string contains any of the following control characters, it must be escaped with double quotes: +/// \0, \x01, \x02, \x03, \x04, \x05, \x06, \a, \b, \t, \n, \v, \f, \r, \x0e, \x0f, \x10, \x11, \x12, \x13, \x14, \x15, \x16, \x17, \x18, \x19, \x1a, \e, \x1c, \x1d, \x1e, \x1f, \N, \_, \L, \P +/// +/// Finally, there are other cases when the strings must be quoted, no matter if you're using single or double quotes: +/// * When the string is true or false (otherwise, it would be treated as a boolean value); +/// * When the string is null or ~ (otherwise, it would be considered as a null value); +/// * When the string looks like a number, such as integers (e.g. 2, 14, etc.), floats (e.g. 
2.6, 14.9) and exponential numbers (e.g. 12e7, etc.) (otherwise, it would be treated as a numeric value); +/// * When the string looks like a date (e.g. 2014-12-31) (otherwise it would be automatically converted into a Unix timestamp). +fn need_quotes(string: &str) -> bool { + fn need_quotes_spaces(string: &str) -> bool { + string.starts_with(' ') || string.ends_with(' ') + } + + string == "" + || need_quotes_spaces(string) + || string.starts_with(|character: char| match character { + '&' | '*' | '?' | '|' | '-' | '<' | '>' | '=' | '!' | '%' | '@' => true, + _ => false, + }) + || string.contains(|character: char| match character { + ':' + | '{' + | '}' + | '[' + | ']' + | ',' + | '#' + | '`' + | '\"' + | '\'' + | '\\' + | '\0'..='\x06' + | '\t' + | '\n' + | '\r' + | '\x0e'..='\x1a' + | '\x1c'..='\x1f' => true, + _ => false, + }) + || [ + // http://yaml.org/type/bool.html + // Note: 'y', 'Y', 'n', 'N', is not quoted deliberately, as in libyaml. PyYAML also parse + // them as string, not booleans, although it is violating the YAML 1.1 specification. + // See https://github.com/dtolnay/serde-yaml/pull/83#discussion_r152628088. 
+ "yes", "Yes", "YES", "no", "No", "NO", "True", "TRUE", "true", "False", "FALSE", + "false", "on", "On", "ON", "off", "Off", "OFF", + // http://yaml.org/type/null.html + "null", "Null", "NULL", "~", + ] + .contains(&string) + || string.starts_with('.') + || string.starts_with("0x") + || string.parse::<i64>().is_ok() + || string.parse::<f64>().is_ok() +} + +#[cfg(test)] +mod test { + use super::*; + use crate::YamlLoader; + + #[test] + fn test_emit_simple() { + let s = " +# comment +a0 bb: val +a1: + b1: 4 + b2: d +a2: 4 # i'm comment +a3: [1, 2, 3] +a4: + - [a1, a2] + - 2 +"; + + let docs = YamlLoader::load_from_str(&s).unwrap(); + let doc = &docs[0]; + let mut writer = String::new(); + { + let mut emitter = YamlEmitter::new(&mut writer); + emitter.dump(doc).unwrap(); + } + println!("original:\n{}", s); + println!("emitted:\n{}", writer); + let docs_new = match YamlLoader::load_from_str(&writer) { + Ok(y) => y, + Err(e) => panic!(format!("{}", e)), + }; + let doc_new = &docs_new[0]; + + assert_eq!(doc, doc_new); + } + + #[test] + fn test_emit_complex() { + let s = r#" +cataloge: + product: &coffee { name: Coffee, price: 2.5 , unit: 1l } + product: &cookies { name: Cookies!, price: 3.40 , unit: 400g} + +products: + *coffee: + amount: 4 + *cookies: + amount: 4 + [1,2,3,4]: + array key + 2.4: + real key + true: + bool key + {}: + empty hash key + "#; + let docs = YamlLoader::load_from_str(&s).unwrap(); + let doc = &docs[0]; + let mut writer = String::new(); + { + let mut emitter = YamlEmitter::new(&mut writer); + emitter.dump(doc).unwrap(); + } + let docs_new = match YamlLoader::load_from_str(&writer) { + Ok(y) => y, + Err(e) => panic!(format!("{}", e)), + }; + let doc_new = &docs_new[0]; + assert_eq!(doc, doc_new); + } + + #[test] + fn test_emit_avoid_quotes() { + let s = r#"--- +a7: 你好 +boolean: "true" +boolean2: "false" +date: 2014-12-31 +empty_string: "" +empty_string1: " " +empty_string2: " a" +empty_string3: " a " +exp: "12e7" +field: ":" +field2: "{" +field3: 
"\\" +field4: "\n" +field5: "can't avoid quote" +float: "2.6" +int: "4" +nullable: "null" +nullable2: "~" +products: + "*coffee": + amount: 4 + "*cookies": + amount: 4 + ".milk": + amount: 1 + "2.4": real key + "[1,2,3,4]": array key + "true": bool key + "{}": empty hash key +x: test +y: avoid quoting here +z: string with spaces"#; + + let docs = YamlLoader::load_from_str(&s).unwrap(); + let doc = &docs[0]; + let mut writer = String::new(); + { + let mut emitter = YamlEmitter::new(&mut writer); + emitter.dump(doc).unwrap(); + } + + assert_eq!(s, writer, "actual:\n\n{}\n", writer); + } + + #[test] + fn emit_quoted_bools() { + let input = r#"--- +string0: yes +string1: no +string2: "true" +string3: "false" +string4: "~" +null0: ~ +[true, false]: real_bools +[True, TRUE, False, FALSE, y,Y,yes,Yes,YES,n,N,no,No,NO,on,On,ON,off,Off,OFF]: false_bools +bool0: true +bool1: false"#; + let expected = r#"--- +string0: "yes" +string1: "no" +string2: "true" +string3: "false" +string4: "~" +null0: ~ +? - true + - false +: real_bools +? 
- "True" + - "TRUE" + - "False" + - "FALSE" + - y + - Y + - "yes" + - "Yes" + - "YES" + - n + - N + - "no" + - "No" + - "NO" + - "on" + - "On" + - "ON" + - "off" + - "Off" + - "OFF" +: false_bools +bool0: true +bool1: false"#; + + let docs = YamlLoader::load_from_str(&input).unwrap(); + let doc = &docs[0]; + let mut writer = String::new(); + { + let mut emitter = YamlEmitter::new(&mut writer); + emitter.dump(doc).unwrap(); + } + + assert_eq!( + expected, writer, + "expected:\n{}\nactual:\n{}\n", + expected, writer + ); + } + + #[test] + fn test_empty_and_nested() { + test_empty_and_nested_flag(false) + } + + #[test] + fn test_empty_and_nested_compact() { + test_empty_and_nested_flag(true) + } + + fn test_empty_and_nested_flag(compact: bool) { + let s = if compact { + r#"--- +a: + b: + c: hello + d: {} +e: + - f + - g + - h: []"# + } else { + r#"--- +a: + b: + c: hello + d: {} +e: + - f + - g + - + h: []"# + }; + + let docs = YamlLoader::load_from_str(&s).unwrap(); + let doc = &docs[0]; + let mut writer = String::new(); + { + let mut emitter = YamlEmitter::new(&mut writer); + emitter.compact(compact); + emitter.dump(doc).unwrap(); + } + + assert_eq!(s, writer); + } + + #[test] + fn test_nested_arrays() { + let s = r#"--- +a: + - b + - - c + - d + - - e + - f"#; + + let docs = YamlLoader::load_from_str(&s).unwrap(); + let doc = &docs[0]; + let mut writer = String::new(); + { + let mut emitter = YamlEmitter::new(&mut writer); + emitter.dump(doc).unwrap(); + } + println!("original:\n{}", s); + println!("emitted:\n{}", writer); + + assert_eq!(s, writer); + } + + #[test] + fn test_deeply_nested_arrays() { + let s = r#"--- +a: + - b + - - c + - d + - - e + - - f + - - e"#; + + let docs = YamlLoader::load_from_str(&s).unwrap(); + let doc = &docs[0]; + let mut writer = String::new(); + { + let mut emitter = YamlEmitter::new(&mut writer); + emitter.dump(doc).unwrap(); + } + println!("original:\n{}", s); + println!("emitted:\n{}", writer); + + assert_eq!(s, writer); + } + + 
#[test] + fn test_nested_hashes() { + let s = r#"--- +a: + b: + c: + d: + e: f"#; + + let docs = YamlLoader::load_from_str(&s).unwrap(); + let doc = &docs[0]; + let mut writer = String::new(); + { + let mut emitter = YamlEmitter::new(&mut writer); + emitter.dump(doc).unwrap(); + } + println!("original:\n{}", s); + println!("emitted:\n{}", writer); + + assert_eq!(s, writer); + } + +} diff --git a/third_party/rust/yaml-rust/src/lib.rs b/third_party/rust/yaml-rust/src/lib.rs new file mode 100644 index 0000000000..6cf87c7c5e --- /dev/null +++ b/third_party/rust/yaml-rust/src/lib.rs @@ -0,0 +1,121 @@ +// Copyright 2015, Yuheng Chen. See the LICENSE file at the top-level +// directory of this distribution. + +//! YAML 1.2 implementation in pure Rust. +//! +//! # Usage +//! +//! This crate is [on github](https://github.com/chyh1990/yaml-rust) and can be +//! used by adding `yaml-rust` to the dependencies in your project's `Cargo.toml`. +//! +//! ```toml +//! [dependencies.yaml-rust] +//! git = "https://github.com/chyh1990/yaml-rust.git" +//! ``` +//! +//! And this in your crate root: +//! +//! ```rust +//! extern crate yaml_rust; +//! ``` +//! +//! Parse a string into `Vec<Yaml>` and then serialize it as a YAML string. +//! +//! # Examples +//! +//! ``` +//! use yaml_rust::{YamlLoader, YamlEmitter}; +//! +//! let docs = YamlLoader::load_from_str("[1, 2, 3]").unwrap(); +//! let doc = &docs[0]; // select the first document +//! assert_eq!(doc[0].as_i64().unwrap(), 1); // access elements by index +//! +//! let mut out_str = String::new(); +//! let mut emitter = YamlEmitter::new(&mut out_str); +//! emitter.dump(doc).unwrap(); // dump the YAML object to a String +//! +//! 
``` + +#![doc(html_root_url = "https://docs.rs/yaml-rust/0.4.5")] +#![cfg_attr(feature = "cargo-clippy", allow(renamed_and_removed_lints))] +#![cfg_attr(feature = "cargo-clippy", warn(cyclomatic_complexity))] +#![cfg_attr( + feature = "cargo-clippy", + allow(match_same_arms, should_implement_trait) +)] + +extern crate linked_hash_map; + +pub mod emitter; +pub mod parser; +pub mod scanner; +pub mod yaml; + +// reexport key APIs +pub use crate::emitter::{EmitError, YamlEmitter}; +pub use crate::parser::Event; +pub use crate::scanner::ScanError; +pub use crate::yaml::{Yaml, YamlLoader}; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_api() { + let s = " +# from yaml-cpp example +- name: Ogre + position: [0, 5, 0] + powers: + - name: Club + damage: 10 + - name: Fist + damage: 8 +- name: Dragon + position: [1, 0, 10] + powers: + - name: Fire Breath + damage: 25 + - name: Claws + damage: 15 +- name: Wizard + position: [5, -3, 0] + powers: + - name: Acid Rain + damage: 50 + - name: Staff + damage: 3 +"; + let docs = YamlLoader::load_from_str(s).unwrap(); + let doc = &docs[0]; + + assert_eq!(doc[0]["name"].as_str().unwrap(), "Ogre"); + + let mut writer = String::new(); + { + let mut emitter = YamlEmitter::new(&mut writer); + emitter.dump(doc).unwrap(); + } + + assert!(!writer.is_empty()); + } + + fn try_fail(s: &str) -> Result<Vec<Yaml>, ScanError> { + let t = YamlLoader::load_from_str(s)?; + Ok(t) + } + + #[test] + fn test_fail() { + let s = " +# syntax error +scalar +key: [1, 2]] +key1:a2 +"; + assert!(YamlLoader::load_from_str(s).is_err()); + assert!(try_fail(s).is_err()); + } + +} diff --git a/third_party/rust/yaml-rust/src/parser.rs b/third_party/rust/yaml-rust/src/parser.rs new file mode 100644 index 0000000000..4a63146f13 --- /dev/null +++ b/third_party/rust/yaml-rust/src/parser.rs @@ -0,0 +1,858 @@ +use crate::scanner::*; +use std::collections::HashMap; + +#[derive(Clone, Copy, PartialEq, Debug, Eq)] +enum State { + StreamStart, + 
ImplicitDocumentStart, + DocumentStart, + DocumentContent, + DocumentEnd, + BlockNode, + // BlockNodeOrIndentlessSequence, + // FlowNode, + BlockSequenceFirstEntry, + BlockSequenceEntry, + IndentlessSequenceEntry, + BlockMappingFirstKey, + BlockMappingKey, + BlockMappingValue, + FlowSequenceFirstEntry, + FlowSequenceEntry, + FlowSequenceEntryMappingKey, + FlowSequenceEntryMappingValue, + FlowSequenceEntryMappingEnd, + FlowMappingFirstKey, + FlowMappingKey, + FlowMappingValue, + FlowMappingEmptyValue, + End, +} + +/// `Event` is used with the low-level event base parsing API, +/// see `EventReceiver` trait. +#[derive(Clone, PartialEq, Debug, Eq)] +pub enum Event { + /// Reserved for internal use + Nothing, + StreamStart, + StreamEnd, + DocumentStart, + DocumentEnd, + /// Refer to an anchor ID + Alias(usize), + /// Value, style, anchor_id, tag + Scalar(String, TScalarStyle, usize, Option<TokenType>), + /// Anchor ID + SequenceStart(usize), + SequenceEnd, + /// Anchor ID + MappingStart(usize), + MappingEnd, +} + +impl Event { + fn empty_scalar() -> Event { + // a null scalar + Event::Scalar("~".to_owned(), TScalarStyle::Plain, 0, None) + } + + fn empty_scalar_with_anchor(anchor: usize, tag: Option<TokenType>) -> Event { + Event::Scalar("".to_owned(), TScalarStyle::Plain, anchor, tag) + } +} + +#[derive(Debug)] +pub struct Parser<T> { + scanner: Scanner<T>, + states: Vec<State>, + state: State, + marks: Vec<Marker>, + token: Option<Token>, + current: Option<(Event, Marker)>, + anchors: HashMap<String, usize>, + anchor_id: usize, +} + +pub trait EventReceiver { + fn on_event(&mut self, ev: Event); +} + +pub trait MarkedEventReceiver { + fn on_event(&mut self, ev: Event, _mark: Marker); +} + +impl<R: EventReceiver> MarkedEventReceiver for R { + fn on_event(&mut self, ev: Event, _mark: Marker) { + self.on_event(ev) + } +} + +pub type ParseResult = Result<(Event, Marker), ScanError>; + +impl<T: Iterator<Item = char>> Parser<T> { + pub fn new(src: T) -> Parser<T> { + Parser 
{ + scanner: Scanner::new(src), + states: Vec::new(), + state: State::StreamStart, + marks: Vec::new(), + token: None, + current: None, + + anchors: HashMap::new(), + // valid anchor_id starts from 1 + anchor_id: 1, + } + } + + pub fn peek(&mut self) -> Result<&(Event, Marker), ScanError> { + match self.current { + Some(ref x) => Ok(x), + None => { + self.current = Some(self.next()?); + self.peek() + } + } + } + + pub fn next(&mut self) -> ParseResult { + match self.current { + None => self.parse(), + Some(_) => Ok(self.current.take().unwrap()), + } + } + + fn peek_token(&mut self) -> Result<&Token, ScanError> { + match self.token { + None => { + self.token = Some(self.scan_next_token()?); + Ok(self.token.as_ref().unwrap()) + } + Some(ref tok) => Ok(tok), + } + } + + fn scan_next_token(&mut self) -> Result<Token, ScanError> { + let token = self.scanner.next(); + match token { + None => match self.scanner.get_error() { + None => Err(ScanError::new(self.scanner.mark(), "unexpected eof")), + Some(e) => Err(e), + }, + Some(tok) => Ok(tok), + } + } + + fn fetch_token(&mut self) -> Token { + self.token + .take() + .expect("fetch_token needs to be preceded by peek_token") + } + + fn skip(&mut self) { + self.token = None; + //self.peek_token(); + } + fn pop_state(&mut self) { + self.state = self.states.pop().unwrap() + } + fn push_state(&mut self, state: State) { + self.states.push(state); + } + + fn parse(&mut self) -> ParseResult { + if self.state == State::End { + return Ok((Event::StreamEnd, self.scanner.mark())); + } + let (ev, mark) = self.state_machine()?; + // println!("EV {:?}", ev); + Ok((ev, mark)) + } + + pub fn load<R: MarkedEventReceiver>( + &mut self, + recv: &mut R, + multi: bool, + ) -> Result<(), ScanError> { + if !self.scanner.stream_started() { + let (ev, mark) = self.next()?; + assert_eq!(ev, Event::StreamStart); + recv.on_event(ev, mark); + } + + if self.scanner.stream_ended() { + // XXX has parsed? 
+ recv.on_event(Event::StreamEnd, self.scanner.mark()); + return Ok(()); + } + loop { + let (ev, mark) = self.next()?; + if ev == Event::StreamEnd { + recv.on_event(ev, mark); + return Ok(()); + } + // clear anchors before a new document + self.anchors.clear(); + self.load_document(ev, mark, recv)?; + if !multi { + break; + } + } + Ok(()) + } + + fn load_document<R: MarkedEventReceiver>( + &mut self, + first_ev: Event, + mark: Marker, + recv: &mut R, + ) -> Result<(), ScanError> { + assert_eq!(first_ev, Event::DocumentStart); + recv.on_event(first_ev, mark); + + let (ev, mark) = self.next()?; + self.load_node(ev, mark, recv)?; + + // DOCUMENT-END is expected. + let (ev, mark) = self.next()?; + assert_eq!(ev, Event::DocumentEnd); + recv.on_event(ev, mark); + + Ok(()) + } + + fn load_node<R: MarkedEventReceiver>( + &mut self, + first_ev: Event, + mark: Marker, + recv: &mut R, + ) -> Result<(), ScanError> { + match first_ev { + Event::Alias(..) | Event::Scalar(..) => { + recv.on_event(first_ev, mark); + Ok(()) + } + Event::SequenceStart(_) => { + recv.on_event(first_ev, mark); + self.load_sequence(recv) + } + Event::MappingStart(_) => { + recv.on_event(first_ev, mark); + self.load_mapping(recv) + } + _ => { + println!("UNREACHABLE EVENT: {:?}", first_ev); + unreachable!(); + } + } + } + + fn load_mapping<R: MarkedEventReceiver>(&mut self, recv: &mut R) -> Result<(), ScanError> { + let (mut key_ev, mut key_mark) = self.next()?; + while key_ev != Event::MappingEnd { + // key + self.load_node(key_ev, key_mark, recv)?; + + // value + let (ev, mark) = self.next()?; + self.load_node(ev, mark, recv)?; + + // next event + let (ev, mark) = self.next()?; + key_ev = ev; + key_mark = mark; + } + recv.on_event(key_ev, key_mark); + Ok(()) + } + + fn load_sequence<R: MarkedEventReceiver>(&mut self, recv: &mut R) -> Result<(), ScanError> { + let (mut ev, mut mark) = self.next()?; + while ev != Event::SequenceEnd { + self.load_node(ev, mark, recv)?; + + // next event + let (next_ev, 
next_mark) = self.next()?; + ev = next_ev; + mark = next_mark; + } + recv.on_event(ev, mark); + Ok(()) + } + + fn state_machine(&mut self) -> ParseResult { + // let next_tok = self.peek_token()?; + // println!("cur_state {:?}, next tok: {:?}", self.state, next_tok); + match self.state { + State::StreamStart => self.stream_start(), + + State::ImplicitDocumentStart => self.document_start(true), + State::DocumentStart => self.document_start(false), + State::DocumentContent => self.document_content(), + State::DocumentEnd => self.document_end(), + + State::BlockNode => self.parse_node(true, false), + // State::BlockNodeOrIndentlessSequence => self.parse_node(true, true), + // State::FlowNode => self.parse_node(false, false), + State::BlockMappingFirstKey => self.block_mapping_key(true), + State::BlockMappingKey => self.block_mapping_key(false), + State::BlockMappingValue => self.block_mapping_value(), + + State::BlockSequenceFirstEntry => self.block_sequence_entry(true), + State::BlockSequenceEntry => self.block_sequence_entry(false), + + State::FlowSequenceFirstEntry => self.flow_sequence_entry(true), + State::FlowSequenceEntry => self.flow_sequence_entry(false), + + State::FlowMappingFirstKey => self.flow_mapping_key(true), + State::FlowMappingKey => self.flow_mapping_key(false), + State::FlowMappingValue => self.flow_mapping_value(false), + + State::IndentlessSequenceEntry => self.indentless_sequence_entry(), + + State::FlowSequenceEntryMappingKey => self.flow_sequence_entry_mapping_key(), + State::FlowSequenceEntryMappingValue => self.flow_sequence_entry_mapping_value(), + State::FlowSequenceEntryMappingEnd => self.flow_sequence_entry_mapping_end(), + State::FlowMappingEmptyValue => self.flow_mapping_value(true), + + /* impossible */ + State::End => unreachable!(), + } + } + + fn stream_start(&mut self) -> ParseResult { + match *self.peek_token()? 
{ + Token(mark, TokenType::StreamStart(_)) => { + self.state = State::ImplicitDocumentStart; + self.skip(); + Ok((Event::StreamStart, mark)) + } + Token(mark, _) => Err(ScanError::new(mark, "did not find expected <stream-start>")), + } + } + + fn document_start(&mut self, implicit: bool) -> ParseResult { + if !implicit { + while let TokenType::DocumentEnd = self.peek_token()?.1 { + self.skip(); + } + } + + match *self.peek_token()? { + Token(mark, TokenType::StreamEnd) => { + self.state = State::End; + self.skip(); + Ok((Event::StreamEnd, mark)) + } + Token(_, TokenType::VersionDirective(..)) + | Token(_, TokenType::TagDirective(..)) + | Token(_, TokenType::DocumentStart) => { + // explicit document + self._explicit_document_start() + } + Token(mark, _) if implicit => { + self.parser_process_directives()?; + self.push_state(State::DocumentEnd); + self.state = State::BlockNode; + Ok((Event::DocumentStart, mark)) + } + _ => { + // explicit document + self._explicit_document_start() + } + } + } + + fn parser_process_directives(&mut self) -> Result<(), ScanError> { + loop { + match self.peek_token()?.1 { + TokenType::VersionDirective(_, _) => { + // XXX parsing with warning according to spec + //if major != 1 || minor > 2 { + // return Err(ScanError::new(tok.0, + // "found incompatible YAML document")); + //} + } + TokenType::TagDirective(..) => { + // TODO add tag directive + } + _ => break, + } + self.skip(); + } + // TODO tag directive + Ok(()) + } + + fn _explicit_document_start(&mut self) -> ParseResult { + self.parser_process_directives()?; + match *self.peek_token()? { + Token(mark, TokenType::DocumentStart) => { + self.push_state(State::DocumentEnd); + self.state = State::DocumentContent; + self.skip(); + Ok((Event::DocumentStart, mark)) + } + Token(mark, _) => Err(ScanError::new( + mark, + "did not find expected <document start>", + )), + } + } + + fn document_content(&mut self) -> ParseResult { + match *self.peek_token()? 
{ + Token(mark, TokenType::VersionDirective(..)) + | Token(mark, TokenType::TagDirective(..)) + | Token(mark, TokenType::DocumentStart) + | Token(mark, TokenType::DocumentEnd) + | Token(mark, TokenType::StreamEnd) => { + self.pop_state(); + // empty scalar + Ok((Event::empty_scalar(), mark)) + } + _ => self.parse_node(true, false), + } + } + + fn document_end(&mut self) -> ParseResult { + let mut _implicit = true; + let marker: Marker = match *self.peek_token()? { + Token(mark, TokenType::DocumentEnd) => { + self.skip(); + _implicit = false; + mark + } + Token(mark, _) => mark, + }; + + // TODO tag handling + self.state = State::DocumentStart; + Ok((Event::DocumentEnd, marker)) + } + + fn register_anchor(&mut self, name: String, _: &Marker) -> Result<usize, ScanError> { + // anchors can be overridden/reused + // if self.anchors.contains_key(name) { + // return Err(ScanError::new(*mark, + // "while parsing anchor, found duplicated anchor")); + // } + let new_id = self.anchor_id; + self.anchor_id += 1; + self.anchors.insert(name, new_id); + Ok(new_id) + } + + fn parse_node(&mut self, block: bool, indentless_sequence: bool) -> ParseResult { + let mut anchor_id = 0; + let mut tag = None; + match *self.peek_token()? { + Token(_, TokenType::Alias(_)) => { + self.pop_state(); + if let Token(mark, TokenType::Alias(name)) = self.fetch_token() { + match self.anchors.get(&name) { + None => { + return Err(ScanError::new( + mark, + "while parsing node, found unknown anchor", + )) + } + Some(id) => return Ok((Event::Alias(*id), mark)), + } + } else { + unreachable!() + } + } + Token(_, TokenType::Anchor(_)) => { + if let Token(mark, TokenType::Anchor(name)) = self.fetch_token() { + anchor_id = self.register_anchor(name, &mark)?; + if let TokenType::Tag(..) = self.peek_token()?.1 { + if let tg @ TokenType::Tag(..) 
= self.fetch_token().1 { + tag = Some(tg); + } else { + unreachable!() + } + } + } else { + unreachable!() + } + } + Token(_, TokenType::Tag(..)) => { + if let tg @ TokenType::Tag(..) = self.fetch_token().1 { + tag = Some(tg); + if let TokenType::Anchor(_) = self.peek_token()?.1 { + if let Token(mark, TokenType::Anchor(name)) = self.fetch_token() { + anchor_id = self.register_anchor(name, &mark)?; + } else { + unreachable!() + } + } + } else { + unreachable!() + } + } + _ => {} + } + match *self.peek_token()? { + Token(mark, TokenType::BlockEntry) if indentless_sequence => { + self.state = State::IndentlessSequenceEntry; + Ok((Event::SequenceStart(anchor_id), mark)) + } + Token(_, TokenType::Scalar(..)) => { + self.pop_state(); + if let Token(mark, TokenType::Scalar(style, v)) = self.fetch_token() { + Ok((Event::Scalar(v, style, anchor_id, tag), mark)) + } else { + unreachable!() + } + } + Token(mark, TokenType::FlowSequenceStart) => { + self.state = State::FlowSequenceFirstEntry; + Ok((Event::SequenceStart(anchor_id), mark)) + } + Token(mark, TokenType::FlowMappingStart) => { + self.state = State::FlowMappingFirstKey; + Ok((Event::MappingStart(anchor_id), mark)) + } + Token(mark, TokenType::BlockSequenceStart) if block => { + self.state = State::BlockSequenceFirstEntry; + Ok((Event::SequenceStart(anchor_id), mark)) + } + Token(mark, TokenType::BlockMappingStart) if block => { + self.state = State::BlockMappingFirstKey; + Ok((Event::MappingStart(anchor_id), mark)) + } + // ex 7.2, an empty scalar can follow a secondary tag + Token(mark, _) if tag.is_some() || anchor_id > 0 => { + self.pop_state(); + Ok((Event::empty_scalar_with_anchor(anchor_id, tag), mark)) + } + Token(mark, _) => Err(ScanError::new( + mark, + "while parsing a node, did not find expected node content", + )), + } + } + + fn block_mapping_key(&mut self, first: bool) -> ParseResult { + // skip BlockMappingStart + if first { + let _ = self.peek_token()?; + //self.marks.push(tok.0); + self.skip(); + } 
+ match *self.peek_token()? { + Token(_, TokenType::Key) => { + self.skip(); + match *self.peek_token()? { + Token(mark, TokenType::Key) + | Token(mark, TokenType::Value) + | Token(mark, TokenType::BlockEnd) => { + self.state = State::BlockMappingValue; + // empty scalar + Ok((Event::empty_scalar(), mark)) + } + _ => { + self.push_state(State::BlockMappingValue); + self.parse_node(true, true) + } + } + } + // XXX(chenyh): libyaml failed to parse spec 1.2, ex8.18 + Token(mark, TokenType::Value) => { + self.state = State::BlockMappingValue; + Ok((Event::empty_scalar(), mark)) + } + Token(mark, TokenType::BlockEnd) => { + self.pop_state(); + self.skip(); + Ok((Event::MappingEnd, mark)) + } + Token(mark, _) => Err(ScanError::new( + mark, + "while parsing a block mapping, did not find expected key", + )), + } + } + + fn block_mapping_value(&mut self) -> ParseResult { + match *self.peek_token()? { + Token(_, TokenType::Value) => { + self.skip(); + match *self.peek_token()? { + Token(mark, TokenType::Key) + | Token(mark, TokenType::Value) + | Token(mark, TokenType::BlockEnd) => { + self.state = State::BlockMappingKey; + // empty scalar + Ok((Event::empty_scalar(), mark)) + } + _ => { + self.push_state(State::BlockMappingKey); + self.parse_node(true, true) + } + } + } + Token(mark, _) => { + self.state = State::BlockMappingKey; + // empty scalar + Ok((Event::empty_scalar(), mark)) + } + } + } + + fn flow_mapping_key(&mut self, first: bool) -> ParseResult { + if first { + let _ = self.peek_token()?; + self.skip(); + } + let marker: Marker = + { + match *self.peek_token()? { + Token(mark, TokenType::FlowMappingEnd) => mark, + Token(mark, _) => { + if !first { + match *self.peek_token()? { + Token(_, TokenType::FlowEntry) => self.skip(), + Token(mark, _) => return Err(ScanError::new(mark, + "while parsing a flow mapping, did not find expected ',' or '}'")) + } + } + + match *self.peek_token()? { + Token(_, TokenType::Key) => { + self.skip(); + match *self.peek_token()? 
// NOTE(review): this span continues the inner match of `flow_mapping_key`
// begun just above; the opening fragment is reproduced as-is.
{
                                Token(mark, TokenType::Value)
                                | Token(mark, TokenType::FlowEntry)
                                | Token(mark, TokenType::FlowMappingEnd) => {
                                    self.state = State::FlowMappingValue;
                                    return Ok((Event::empty_scalar(), mark));
                                }
                                _ => {
                                    self.push_state(State::FlowMappingValue);
                                    return self.parse_node(false, false);
                                }
                            }
                        }
                        Token(marker, TokenType::Value) => {
                            self.state = State::FlowMappingValue;
                            return Ok((Event::empty_scalar(), marker));
                        }
                        Token(_, TokenType::FlowMappingEnd) => (),
                        _ => {
                            self.push_state(State::FlowMappingEmptyValue);
                            return self.parse_node(false, false);
                        }
                    }

                    mark
                }
            }
        };

    // Only reached when FlowMappingEnd was seen: close the mapping.
    self.pop_state();
    self.skip();
    Ok((Event::MappingEnd, marker))
}

/// State handler: expecting a value inside a flow mapping.
///
/// `empty` forces an empty scalar (the FlowMappingEmptyValue state); a Value
/// token followed by ',' or '}' also yields an empty scalar; otherwise the
/// value is parsed as a node.
fn flow_mapping_value(&mut self, empty: bool) -> ParseResult {
    let mark: Marker = {
        if empty {
            let Token(mark, _) = *self.peek_token()?;
            self.state = State::FlowMappingKey;
            return Ok((Event::empty_scalar(), mark));
        } else {
            match *self.peek_token()? {
                Token(marker, TokenType::Value) => {
                    self.skip();
                    match self.peek_token()?.1 {
                        TokenType::FlowEntry | TokenType::FlowMappingEnd => {}
                        _ => {
                            self.push_state(State::FlowMappingKey);
                            return self.parse_node(false, false);
                        }
                    }
                    marker
                }
                Token(marker, _) => marker,
            }
        }
    };

    self.state = State::FlowMappingKey;
    Ok((Event::empty_scalar(), mark))
}

/// State handler: expecting an entry (or the closing ']') in a flow sequence.
///
/// A Key token inside a flow sequence opens a single-pair mapping
/// (`FlowSequenceEntryMappingKey`).
fn flow_sequence_entry(&mut self, first: bool) -> ParseResult {
    // skip FlowMappingStart
    if first {
        let _ = self.peek_token()?;
        //self.marks.push(tok.0);
        self.skip();
    }
    match *self.peek_token()? {
        Token(mark, TokenType::FlowSequenceEnd) => {
            self.pop_state();
            self.skip();
            return Ok((Event::SequenceEnd, mark));
        }
        Token(_, TokenType::FlowEntry) if !first => {
            self.skip();
        }
        Token(mark, _) if !first => {
            return Err(ScanError::new(
                mark,
                "while parsing a flow sequence, expected ',' or ']'",
            ));
        }
        _ => { /* next */ }
    }
    match *self.peek_token()? {
        Token(mark, TokenType::FlowSequenceEnd) => {
            self.pop_state();
            self.skip();
            Ok((Event::SequenceEnd, mark))
        }
        Token(mark, TokenType::Key) => {
            self.state = State::FlowSequenceEntryMappingKey;
            self.skip();
            Ok((Event::MappingStart(0), mark))
        }
        _ => {
            self.push_state(State::FlowSequenceEntry);
            self.parse_node(false, false)
        }
    }
}

/// State handler: an indentless sequence (`-` entries at the parent's
/// indentation level). Ends as soon as the next token is not a BlockEntry.
fn indentless_sequence_entry(&mut self) -> ParseResult {
    match *self.peek_token()? {
        Token(_, TokenType::BlockEntry) => (),
        Token(mark, _) => {
            self.pop_state();
            return Ok((Event::SequenceEnd, mark));
        }
    }
    self.skip();
    match *self.peek_token()? {
        Token(mark, TokenType::BlockEntry)
        | Token(mark, TokenType::Key)
        | Token(mark, TokenType::Value)
        | Token(mark, TokenType::BlockEnd) => {
            self.state = State::IndentlessSequenceEntry;
            Ok((Event::empty_scalar(), mark))
        }
        _ => {
            self.push_state(State::IndentlessSequenceEntry);
            self.parse_node(true, false)
        }
    }
}

/// State handler: an entry in a block sequence (after BlockSequenceStart).
fn block_sequence_entry(&mut self, first: bool) -> ParseResult {
    // BLOCK-SEQUENCE-START
    if first {
        let _ = self.peek_token()?;
        //self.marks.push(tok.0);
        self.skip();
    }
    match *self.peek_token()? {
        Token(mark, TokenType::BlockEnd) => {
            self.pop_state();
            self.skip();
            Ok((Event::SequenceEnd, mark))
        }
        Token(_, TokenType::BlockEntry) => {
            self.skip();
            match *self.peek_token()? {
                Token(mark, TokenType::BlockEntry) | Token(mark, TokenType::BlockEnd) => {
                    self.state = State::BlockSequenceEntry;
                    Ok((Event::empty_scalar(), mark))
                }
                _ => {
                    self.push_state(State::BlockSequenceEntry);
                    self.parse_node(true, false)
                }
            }
        }
        Token(mark, _) => Err(ScanError::new(
            mark,
            "while parsing a block collection, did not find expected '-' indicator",
        )),
    }
}

/// State handler: the key of a single-pair mapping inside a flow sequence
/// (`[a: b]`). An immediately following ':'/','/']' means an empty key.
fn flow_sequence_entry_mapping_key(&mut self) -> ParseResult {
    match *self.peek_token()? {
        Token(mark, TokenType::Value)
        | Token(mark, TokenType::FlowEntry)
        | Token(mark, TokenType::FlowSequenceEnd) => {
            self.skip();
            self.state = State::FlowSequenceEntryMappingValue;
            Ok((Event::empty_scalar(), mark))
        }
        _ => {
            self.push_state(State::FlowSequenceEntryMappingValue);
            self.parse_node(false, false)
        }
    }
}

/// State handler: the value of a single-pair mapping inside a flow sequence.
fn flow_sequence_entry_mapping_value(&mut self) -> ParseResult {
    match *self.peek_token()? {
        Token(_, TokenType::Value) => {
            self.skip();
            self.state = State::FlowSequenceEntryMappingValue;
            match *self.peek_token()? {
                Token(mark, TokenType::FlowEntry) | Token(mark, TokenType::FlowSequenceEnd) => {
                    self.state = State::FlowSequenceEntryMappingEnd;
                    Ok((Event::empty_scalar(), mark))
                }
                _ => {
                    self.push_state(State::FlowSequenceEntryMappingEnd);
                    self.parse_node(false, false)
                }
            }
        }
        Token(mark, _) => {
            self.state = State::FlowSequenceEntryMappingEnd;
            Ok((Event::empty_scalar(), mark))
        }
    }
}

/// State handler: closes the implicit single-pair mapping opened inside a
/// flow sequence and returns to normal entry parsing.
fn flow_sequence_entry_mapping_end(&mut self) -> ParseResult {
    self.state = State::FlowSequenceEntry;
    Ok((Event::MappingEnd, self.scanner.mark()))
}
}

#[cfg(test)]
mod test {
    use super::{Event, Parser};

    // Verifies that `peek` and `next` observe the same event stream.
    #[test]
    fn test_peek_eq_parse() {
        let s = "
a0 bb: val
a1: &x
    b1: 4
    b2: d
a2: 4
a3: [1, 2, 3]
a4:
    - [a1, a2]
    - 2
a5: *x
";
        let mut p = Parser::new(s.chars());
        while {
            let event_peek = p.peek().unwrap().clone();
            let event = p.next().unwrap();
            assert_eq!(event, event_peek);
            event.0 != Event::StreamEnd
        } {}
    }
}
diff --git a/third_party/rust/yaml-rust/src/scanner.rs b/third_party/rust/yaml-rust/src/scanner.rs
new file mode 100644
index 0000000000..a8659a8522
--- /dev/null
+++ b/third_party/rust/yaml-rust/src/scanner.rs
@@ -0,0 +1,2182 @@
use std::collections::VecDeque;
use std::error::Error;
use std::{char, fmt};

/// Input encoding of the scanned stream (only UTF-8 is supported).
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum TEncoding {
    Utf8,
}

/// The presentation style of a scalar token.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum
TScalarStyle {
    Any,
    Plain,
    SingleQuoted,
    DoubleQuoted,

    Literal,
    // NOTE(review): long-standing upstream typo for "Folded"; it is part of
    // the public enum, so renaming it would break downstream callers.
    Foled,
}

/// A position in the input stream: absolute character index, 1-based line,
/// 0-based column.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub struct Marker {
    index: usize,
    line: usize,
    col: usize,
}

impl Marker {
    fn new(index: usize, line: usize, col: usize) -> Marker {
        Marker { index, line, col }
    }

    pub fn index(&self) -> usize {
        self.index
    }

    pub fn line(&self) -> usize {
        self.line
    }

    pub fn col(&self) -> usize {
        self.col
    }
}

/// A scanning error: a message plus the position it occurred at.
#[derive(Clone, PartialEq, Debug, Eq)]
pub struct ScanError {
    mark: Marker,
    info: String,
}

impl ScanError {
    /// Builds an error at `loc` with message `info` (copied to owned).
    pub fn new(loc: Marker, info: &str) -> ScanError {
        ScanError {
            mark: loc,
            info: info.to_owned(),
        }
    }

    /// The position the error occurred at.
    pub fn marker(&self) -> &Marker {
        &self.mark
    }
}

impl Error for ScanError {
    // NOTE(review): `description`/`cause` are deprecated in favour of
    // `Display`/`source`; kept as-is to match the vendored upstream.
    fn description(&self) -> &str {
        self.info.as_ref()
    }

    fn cause(&self) -> Option<&dyn Error> {
        None
    }
}

impl fmt::Display for ScanError {
    // col starts from 0
    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        write!(
            formatter,
            "{} at line {} column {}",
            self.info,
            self.mark.line,
            self.mark.col + 1
        )
    }
}

/// Every kind of token the scanner can emit.
#[derive(Clone, PartialEq, Debug, Eq)]
pub enum TokenType {
    NoToken,
    StreamStart(TEncoding),
    StreamEnd,
    /// major, minor
    VersionDirective(u32, u32),
    /// handle, prefix
    TagDirective(String, String),
    DocumentStart,
    DocumentEnd,
    BlockSequenceStart,
    BlockMappingStart,
    BlockEnd,
    FlowSequenceStart,
    FlowSequenceEnd,
    FlowMappingStart,
    FlowMappingEnd,
    BlockEntry,
    FlowEntry,
    Key,
    Value,
    Alias(String),
    Anchor(String),
    /// handle, suffix
    Tag(String, String),
    Scalar(TScalarStyle, String),
}

/// A token paired with the position where it starts.
#[derive(Clone, PartialEq, Debug, Eq)]
pub struct Token(pub Marker, pub TokenType);

/// Bookkeeping for a potential simple (implicit, unquestioned) mapping key.
#[derive(Clone, PartialEq, Debug, Eq)]
struct SimpleKey {
    possible: bool,
    required: bool,
    token_number: usize,
    mark: Marker,
}

impl SimpleKey {
    // A fresh, not-yet-possible simple key at `mark`.
    fn new(mark: Marker) -> SimpleKey {
        SimpleKey {
            possible: false,
            required: false,
            token_number: 0,
            mark,
        }
    }
}

/// The YAML tokenizer: turns a `char` iterator into a stream of `Token`s.
#[derive(Debug)]
pub struct Scanner<T> {
    rdr: T,
    mark: Marker,
    tokens: VecDeque<Token>,
    buffer: VecDeque<char>,
    error: Option<ScanError>,

    stream_start_produced: bool,
    stream_end_produced: bool,
    adjacent_value_allowed_at: usize,
    simple_key_allowed: bool,
    simple_keys: Vec<SimpleKey>,
    indent: isize,
    indents: Vec<isize>,
    flow_level: u8,
    tokens_parsed: usize,
    token_available: bool,
}

impl<T: Iterator<Item = char>> Iterator for Scanner<T> {
    type Item = Token;
    // Yields tokens until EOF or the first error; an error is stashed in
    // `self.error` (retrievable via `get_error`) and iteration stops.
    fn next(&mut self) -> Option<Token> {
        if self.error.is_some() {
            return None;
        }
        match self.next_token() {
            Ok(tok) => tok,
            Err(e) => {
                self.error = Some(e);
                None
            }
        }
    }
}

// Character-class helpers. '\0' is used as the EOF sentinel throughout the
// scanner (see `lookahead`), hence the "z" (zero) variants.
#[inline]
fn is_z(c: char) -> bool {
    c == '\0'
}
#[inline]
fn is_break(c: char) -> bool {
    c == '\n' || c == '\r'
}
#[inline]
fn is_breakz(c: char) -> bool {
    is_break(c) || is_z(c)
}
#[inline]
fn is_blank(c: char) -> bool {
    c == ' ' || c == '\t'
}
#[inline]
fn is_blankz(c: char) -> bool {
    is_blank(c) || is_breakz(c)
}
#[inline]
fn is_digit(c: char) -> bool {
    c >= '0' && c <= '9'
}
#[inline]
fn is_alpha(c: char) -> bool {
    match c {
        '0'..='9' | 'a'..='z' | 'A'..='Z' => true,
        '_' | '-' => true,
        _ => false,
    }
}
#[inline]
fn is_hex(c: char) -> bool {
    (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
}
// Value of a hex digit; caller must ensure `is_hex(c)`.
#[inline]
fn as_hex(c: char) -> u32 {
    match c {
        '0'..='9' => (c as u32) - ('0' as u32),
        'a'..='f' => (c as u32) - ('a' as u32) + 10,
        'A'..='F' => (c as u32) - ('A' as u32) + 10,
        _ => unreachable!(),
    }
}
// Flow-context indicator characters.
#[inline]
fn is_flow(c: char) -> bool {
    match c {
        ',' | '[' | ']' | '{' | '}' => true,
        _ => false,
    }
}

pub type ScanResult = Result<(), ScanError>;

impl<T: Iterator<Item = char>> Scanner<T> {
    /// Creates the YAML tokenizer.
    pub fn new(rdr: T) -> Scanner<T> {
        Scanner {
            rdr,
            buffer: VecDeque::new(),
            mark: Marker::new(0, 1, 0),
            tokens: VecDeque::new(),
            error: None,

            stream_start_produced: false,
            stream_end_produced: false,
            adjacent_value_allowed_at: 0,
            simple_key_allowed: true,
            simple_keys: Vec::new(),
            indent: -1,
            indents: Vec::new(),
            flow_level: 0,
            tokens_parsed: 0,
            token_available: false,
        }
    }
    /// Returns a clone of the stored error, if scanning has failed.
    #[inline]
    pub fn get_error(&self) -> Option<ScanError> {
        match self.error {
            None => None,
            Some(ref e) => Some(e.clone()),
        }
    }

    // Ensures at least `count` characters are buffered; past EOF the buffer
    // is padded with the '\0' sentinel.
    #[inline]
    fn lookahead(&mut self, count: usize) {
        if self.buffer.len() >= count {
            return;
        }
        for _ in 0..(count - self.buffer.len()) {
            self.buffer.push_back(self.rdr.next().unwrap_or('\0'));
        }
    }
    // Consumes one buffered character, updating index/line/col.
    #[inline]
    fn skip(&mut self) {
        let c = self.buffer.pop_front().unwrap();

        self.mark.index += 1;
        if c == '\n' {
            self.mark.line += 1;
            self.mark.col = 0;
        } else {
            self.mark.col += 1;
        }
    }
    // Consumes one line break, treating "\r\n" as a single break.
    // Caller must have looked ahead 2 characters.
    #[inline]
    fn skip_line(&mut self) {
        if self.buffer[0] == '\r' && self.buffer[1] == '\n' {
            self.skip();
            self.skip();
        } else if is_break(self.buffer[0]) {
            self.skip();
        }
    }
    // Current (front) character; caller must have looked ahead.
    #[inline]
    fn ch(&self) -> char {
        self.buffer[0]
    }
    #[inline]
    fn ch_is(&self, c: char) -> bool {
        self.buffer[0] == c
    }
    #[allow(dead_code)]
    #[inline]
    fn eof(&self) -> bool {
        self.ch_is('\0')
    }
    #[inline]
    pub fn stream_started(&self) -> bool {
        self.stream_start_produced
    }
    #[inline]
    pub fn stream_ended(&self) -> bool {
        self.stream_end_produced
    }
    /// Current position in the input.
    #[inline]
    pub fn mark(&self) -> Marker {
        self.mark
    }
    // Consumes one line break and normalizes it to '\n' in `s`.
    #[inline]
    fn read_break(&mut self, s: &mut String) {
        if self.buffer[0] == '\r' && self.buffer[1] == '\n' {
            s.push('\n');
            self.skip();
            self.skip();
        } else if self.buffer[0] == '\r' || self.buffer[0] == '\n' {
            s.push('\n');
            self.skip();
        } else {
            unreachable!();
        }
    }
    // Inserts `tok` at position `pos` in the queue by pushing to the back
    // and rotating it into place with swaps.
    fn insert_token(&mut self, pos: usize, tok: Token) {
        let old_len = self.tokens.len();
        assert!(pos <= old_len);
        self.tokens.push_back(tok);
        for i in 0..old_len - pos {
            self.tokens.swap(old_len - i, old_len - i - 1);
        }
    }
    fn allow_simple_key(&mut self) {
        self.simple_key_allowed = true;
    }
    fn disallow_simple_key(&mut self) {
        self.simple_key_allowed = false;
    }

    /// Scans the next token(s) from the input and enqueues them.
    ///
    /// Dispatches on the current character (after skipping whitespace and
    /// comments): stream start/end, directives, document indicators, flow
    /// and block structure indicators, anchors/aliases/tags, and scalars.
    pub fn fetch_next_token(&mut self) -> ScanResult {
        self.lookahead(1);
        // println!("--> fetch_next_token Cur {:?} {:?}", self.mark, self.ch());

        if !self.stream_start_produced {
            self.fetch_stream_start();
            return Ok(());
        }
        self.skip_to_next_token();

        self.stale_simple_keys()?;

        let mark = self.mark;
        self.unroll_indent(mark.col as isize);

        self.lookahead(4);

        if is_z(self.ch()) {
            self.fetch_stream_end()?;
            return Ok(());
        }

        // Is it a directive?
        if self.mark.col == 0 && self.ch_is('%') {
            return self.fetch_directive();
        }

        // "---" at column 0 starts a document.
        if self.mark.col == 0
            && self.buffer[0] == '-'
            && self.buffer[1] == '-'
            && self.buffer[2] == '-'
            && is_blankz(self.buffer[3])
        {
            self.fetch_document_indicator(TokenType::DocumentStart)?;
            return Ok(());
        }

        // "..." at column 0 ends a document.
        if self.mark.col == 0
            && self.buffer[0] == '.'
            && self.buffer[1] == '.'
            && self.buffer[2] == '.'
            && is_blankz(self.buffer[3])
        {
            self.fetch_document_indicator(TokenType::DocumentEnd)?;
            return Ok(());
        }

        let c = self.buffer[0];
        let nc = self.buffer[1];
        match c {
            '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
            '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
            ']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
            '}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
            ',' => self.fetch_flow_entry(),
            '-' if is_blankz(nc) => self.fetch_block_entry(),
            '?' if is_blankz(nc) => self.fetch_key(),
            // ':' is a value indicator when followed by a blank, or in flow
            // context when adjacent to a just-finished quoted scalar.
            ':' if is_blankz(nc)
                || (self.flow_level > 0
                    && (is_flow(nc) || self.mark.index == self.adjacent_value_allowed_at)) =>
            {
                self.fetch_value()
            }
            // Is it an alias?
            '*' => self.fetch_anchor(true),
            // Is it an anchor?
            '&' => self.fetch_anchor(false),
            '!' => self.fetch_tag(),
            // Is it a literal scalar?
            '|' if self.flow_level == 0 => self.fetch_block_scalar(true),
            // Is it a folded scalar?
            '>' if self.flow_level == 0 => self.fetch_block_scalar(false),
            '\'' => self.fetch_flow_scalar(true),
            '"' => self.fetch_flow_scalar(false),
            // plain scalar
            '-' if !is_blankz(nc) => self.fetch_plain_scalar(),
            ':' | '?' if !is_blankz(nc) && self.flow_level == 0 => self.fetch_plain_scalar(),
            '%' | '@' | '`' => Err(ScanError::new(
                self.mark,
                &format!("unexpected character: `{}'", c),
            )),
            _ => self.fetch_plain_scalar(),
        }
    }

    /// Pops and returns the next token, fetching more if needed.
    /// Returns `Ok(None)` once StreamEnd has been produced.
    pub fn next_token(&mut self) -> Result<Option<Token>, ScanError> {
        if self.stream_end_produced {
            return Ok(None);
        }

        if !self.token_available {
            self.fetch_more_tokens()?;
        }
        let t = self.tokens.pop_front().unwrap();
        self.token_available = false;
        self.tokens_parsed += 1;

        if let TokenType::StreamEnd = t.1 {
            self.stream_end_produced = true;
        }
        Ok(Some(t))
    }

    /// Fetches tokens until the head of the queue can be handed out safely,
    /// i.e. no pending simple key could still insert a token before it.
    pub fn fetch_more_tokens(&mut self) -> ScanResult {
        let mut need_more;
        loop {
            need_more = false;
            if self.tokens.is_empty() {
                need_more = true;
            } else {
                self.stale_simple_keys()?;
                for sk in &self.simple_keys {
                    if sk.possible && sk.token_number == self.tokens_parsed {
                        need_more = true;
                        break;
                    }
                }
            }

            if !need_more {
                break;
            }
            self.fetch_next_token()?;
        }
        self.token_available = true;

        Ok(())
    }

    // Invalidates simple keys that can no longer be keys: a simple key must
    // stay on one line and within 1024 characters of its start.
    fn stale_simple_keys(&mut self) -> ScanResult {
        for sk in &mut self.simple_keys {
            if sk.possible
                && (sk.mark.line < self.mark.line || sk.mark.index + 1024 < self.mark.index)
            {
                if sk.required {
                    return Err(ScanError::new(self.mark, "simple key expect ':'"));
                }
                sk.possible = false;
            }
        }
        Ok(())
    }

    // Skips spaces, comments and (outside potential-key positions) tabs;
    // line breaks outside flow context re-enable simple keys.
    fn skip_to_next_token(&mut self) {
        loop {
            self.lookahead(1);
            // TODO(chenyh) BOM
            match self.ch() {
                ' ' => self.skip(),
                '\t' if self.flow_level >
// NOTE(review): this span continues the '\t' match guard of
// `skip_to_next_token` begun just above; the fragment is reproduced as-is.
0 || !self.simple_key_allowed => self.skip(),
                '\n' | '\r' => {
                    self.lookahead(2);
                    self.skip_line();
                    if self.flow_level == 0 {
                        self.allow_simple_key();
                    }
                }
                '#' => {
                    // Comment: skip to end of line.
                    while !is_breakz(self.ch()) {
                        self.skip();
                        self.lookahead(1);
                    }
                }
                _ => break,
            }
        }
    }

    // Emits the StreamStart token and seeds the simple-key stack.
    fn fetch_stream_start(&mut self) {
        let mark = self.mark;
        self.indent = -1;
        self.stream_start_produced = true;
        self.allow_simple_key();
        self.tokens
            .push_back(Token(mark, TokenType::StreamStart(TEncoding::Utf8)));
        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
    }

    // Emits the StreamEnd token after closing all indentation levels.
    fn fetch_stream_end(&mut self) -> ScanResult {
        // force new line
        if self.mark.col != 0 {
            self.mark.col = 0;
            self.mark.line += 1;
        }

        self.unroll_indent(-1);
        self.remove_simple_key()?;
        self.disallow_simple_key();

        self.tokens
            .push_back(Token(self.mark, TokenType::StreamEnd));
        Ok(())
    }

    // Scans a '%' directive line and enqueues the resulting token.
    fn fetch_directive(&mut self) -> ScanResult {
        self.unroll_indent(-1);
        self.remove_simple_key()?;

        self.disallow_simple_key();

        let tok = self.scan_directive()?;

        self.tokens.push_back(tok);

        Ok(())
    }

    // Scans "%YAML ..." or "%TAG ..."; unknown directives are skipped and
    // yield an empty TagDirective token instead of an error.
    fn scan_directive(&mut self) -> Result<Token, ScanError> {
        let start_mark = self.mark;
        self.skip();

        let name = self.scan_directive_name()?;
        let tok = match name.as_ref() {
            "YAML" => self.scan_version_directive_value(&start_mark)?,
            "TAG" => self.scan_tag_directive_value(&start_mark)?,
            // XXX This should be a warning instead of an error
            _ => {
                // skip current line
                self.lookahead(1);
                while !is_breakz(self.ch()) {
                    self.skip();
                    self.lookahead(1);
                }
                // XXX return an empty TagDirective token
                Token(
                    start_mark,
                    TokenType::TagDirective(String::new(), String::new()),
                )
                // return Err(ScanError::new(start_mark,
                //     "while scanning a directive, found unknown directive name"))
            }
        };
        self.lookahead(1);

        while is_blank(self.ch()) {
            self.skip();
            self.lookahead(1);
        }

        // Trailing comment on the directive line.
        if self.ch() == '#' {
            while !is_breakz(self.ch()) {
                self.skip();
                self.lookahead(1);
            }
        }

        if !is_breakz(self.ch()) {
            return Err(ScanError::new(
                start_mark,
                "while scanning a directive, did not find expected comment or line break",
            ));
        }

        // Eat a line break
        if is_break(self.ch()) {
            self.lookahead(2);
            self.skip_line();
        }

        Ok(tok)
    }

    // Scans the "major.minor" value of a %YAML directive.
    fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
        self.lookahead(1);

        while is_blank(self.ch()) {
            self.skip();
            self.lookahead(1);
        }

        let major = self.scan_version_directive_number(mark)?;

        if self.ch() != '.' {
            return Err(ScanError::new(
                *mark,
                "while scanning a YAML directive, did not find expected digit or '.' character",
            ));
        }

        self.skip();

        let minor = self.scan_version_directive_number(mark)?;

        Ok(Token(*mark, TokenType::VersionDirective(major, minor)))
    }

    // Scans the alphanumeric directive name right after '%'.
    fn scan_directive_name(&mut self) -> Result<String, ScanError> {
        let start_mark = self.mark;
        let mut string = String::new();
        self.lookahead(1);
        while is_alpha(self.ch()) {
            string.push(self.ch());
            self.skip();
            self.lookahead(1);
        }

        if string.is_empty() {
            return Err(ScanError::new(
                start_mark,
                "while scanning a directive, could not find expected directive name",
            ));
        }

        if !is_blankz(self.ch()) {
            return Err(ScanError::new(
                start_mark,
                "while scanning a directive, found unexpected non-alphabetical character",
            ));
        }

        Ok(string)
    }

    // Scans one decimal component of a %YAML version (at most 9 digits).
    fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
        let mut val = 0u32;
        let mut length = 0usize;
        self.lookahead(1);
        while is_digit(self.ch()) {
            if length + 1 > 9 {
                return Err(ScanError::new(
                    *mark,
                    "while scanning a YAML directive, found extremely long version number",
                ));
            }
            length += 1;
            val = val * 10 + ((self.ch() as u32) - ('0' as u32));
            self.skip();
            self.lookahead(1);
        }

        if length == 0 {
            return Err(ScanError::new(
                *mark,
                "while scanning a YAML directive, did not find expected version number",
            ));
        }

        Ok(val)
    }

    // Scans the "handle prefix" value of a %TAG directive.
    fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
        self.lookahead(1);
        /* Eat whitespaces. */
        while is_blank(self.ch()) {
            self.skip();
            self.lookahead(1);
        }
        let handle = self.scan_tag_handle(true, mark)?;

        self.lookahead(1);
        /* Eat whitespaces. */
        while is_blank(self.ch()) {
            self.skip();
            self.lookahead(1);
        }

        let is_secondary = handle == "!!";
        let prefix = self.scan_tag_uri(true, is_secondary, &String::new(), mark)?;

        self.lookahead(1);

        if is_blankz(self.ch()) {
            Ok(Token(*mark, TokenType::TagDirective(handle, prefix)))
        } else {
            Err(ScanError::new(
                *mark,
                "while scanning TAG, did not find expected whitespace or line break",
            ))
        }
    }

    // Scans a '!' tag token and enqueues it; a tag cannot itself be a key.
    fn fetch_tag(&mut self) -> ScanResult {
        self.save_simple_key()?;
        self.disallow_simple_key();

        let tok = self.scan_tag()?;
        self.tokens.push_back(tok);
        Ok(())
    }

    // Scans a tag in verbatim ('!<...>'), shorthand ('!handle!suffix') or
    // non-specific ('!') form.
    fn scan_tag(&mut self) -> Result<Token, ScanError> {
        let start_mark = self.mark;
        let mut handle = String::new();
        let mut suffix;
        let mut secondary = false;

        // Check if the tag is in the canonical form (verbatim).
        self.lookahead(2);

        if self.buffer[1] == '<' {
            // Eat '!<'
            self.skip();
            self.skip();
            suffix = self.scan_tag_uri(false, false, &String::new(), &start_mark)?;

            if self.ch() != '>' {
                return Err(ScanError::new(
                    start_mark,
                    "while scanning a tag, did not find the expected '>'",
                ));
            }

            self.skip();
        } else {
            // The tag has either the '!suffix' or the '!handle!suffix'
            handle = self.scan_tag_handle(false, &start_mark)?;
            // Check if it is, indeed, handle.
            if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
                if handle == "!!"
// NOTE(review): this span continues `scan_tag` from the `if handle == "!!"`
// test just above; the opening fragment is reproduced as-is.
{
                    secondary = true;
                }
                suffix = self.scan_tag_uri(false, secondary, &String::new(), &start_mark)?;
            } else {
                suffix = self.scan_tag_uri(false, false, &handle, &start_mark)?;
                handle = "!".to_owned();
                // A special case: the '!' tag. Set the handle to '' and the
                // suffix to '!'.
                if suffix.is_empty() {
                    handle.clear();
                    suffix = "!".to_owned();
                }
            }
        }

        self.lookahead(1);
        if is_blankz(self.ch()) {
            // XXX: ex 7.2, an empty scalar can follow a secondary tag
            Ok(Token(start_mark, TokenType::Tag(handle, suffix)))
        } else {
            Err(ScanError::new(
                start_mark,
                "while scanning a tag, did not find expected whitespace or line break",
            ))
        }
    }

    // Scans a tag handle: '!', '!!' or '!word!'. In %TAG directives a
    // handle that does not close with '!' is an error.
    fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
        let mut string = String::new();
        self.lookahead(1);
        if self.ch() != '!' {
            return Err(ScanError::new(
                *mark,
                "while scanning a tag, did not find expected '!'",
            ));
        }

        string.push(self.ch());
        self.skip();

        self.lookahead(1);
        while is_alpha(self.ch()) {
            string.push(self.ch());
            self.skip();
            self.lookahead(1);
        }

        // Check if the trailing character is '!' and copy it.
        if self.ch() == '!' {
            string.push(self.ch());
            self.skip();
        } else if directive && string != "!" {
            // It's either the '!' tag or not really a tag handle. If it's a %TAG
            // directive, it's an error. If it's a tag token, it must be a part of
            // URI.
            return Err(ScanError::new(
                *mark,
                "while parsing a tag directive, did not find expected '!'",
            ));
        }
        Ok(string)
    }

    // Scans a tag URI, decoding %xx escapes; `head` is a previously scanned
    // prefix whose leading '!' is not copied.
    fn scan_tag_uri(
        &mut self,
        directive: bool,
        _is_secondary: bool,
        head: &str,
        mark: &Marker,
    ) -> Result<String, ScanError> {
        let mut length = head.len();
        let mut string = String::new();

        // Copy the head if needed.
        // Note that we don't copy the leading '!' character.
        if length > 1 {
            string.extend(head.chars().skip(1));
        }

        self.lookahead(1);
        /*
         * The set of characters that may appear in URI is as follows:
         *
         *      '0'-'9', 'A'-'Z', 'a'-'z', '_', '-', ';', '/', '?', ':', '@', '&',
         *      '=', '+', '$', ',', '.', '!', '~', '*', '\'', '(', ')', '[', ']',
         *      '%'.
         */
        while match self.ch() {
            ';' | '/' | '?' | ':' | '@' | '&' => true,
            '=' | '+' | '$' | ',' | '.' | '!' | '~' | '*' | '\'' | '(' | ')' | '[' | ']' => true,
            '%' => true,
            c if is_alpha(c) => true,
            _ => false,
        } {
            // Check if it is a URI-escape sequence.
            if self.ch() == '%' {
                string.push(self.scan_uri_escapes(directive, mark)?);
            } else {
                string.push(self.ch());
                self.skip();
            }

            length += 1;
            self.lookahead(1);
        }

        if length == 0 {
            return Err(ScanError::new(
                *mark,
                "while parsing a tag, did not find expected tag URI",
            ));
        }

        Ok(string)
    }

    // Decodes one %-escaped UTF-8 sequence (1-4 '%xx' octets) into a char.
    fn scan_uri_escapes(&mut self, _directive: bool, mark: &Marker) -> Result<char, ScanError> {
        let mut width = 0usize;
        let mut code = 0u32;
        loop {
            self.lookahead(3);

            if !(self.ch() == '%' && is_hex(self.buffer[1]) && is_hex(self.buffer[2])) {
                return Err(ScanError::new(
                    *mark,
                    "while parsing a tag, did not find URI escaped octet",
                ));
            }

            let octet = (as_hex(self.buffer[1]) << 4) + as_hex(self.buffer[2]);
            if width == 0 {
                // Leading octet determines the sequence length.
                width = match octet {
                    _ if octet & 0x80 == 0x00 => 1,
                    _ if octet & 0xE0 == 0xC0 => 2,
                    _ if octet & 0xF0 == 0xE0 => 3,
                    _ if octet & 0xF8 == 0xF0 => 4,
                    _ => {
                        return Err(ScanError::new(
                            *mark,
                            "while parsing a tag, found an incorrect leading UTF-8 octet",
                        ));
                    }
                };
                code = octet;
            } else {
                // Continuation octets must match 10xxxxxx.
                if octet & 0xc0 != 0x80 {
                    return Err(ScanError::new(
                        *mark,
                        "while parsing a tag, found an incorrect trailing UTF-8 octet",
                    ));
                }
                code = (code << 8) + octet;
            }

            self.skip();
            self.skip();
            self.skip();

            width -= 1;
            if width == 0 {
                break;
            }
        }

        match char::from_u32(code) {
            Some(ch) => Ok(ch),
            None => Err(ScanError::new(
                *mark,
                "while parsing a tag, found an invalid UTF-8 codepoint",
            )),
        }
    }

    // Scans a '*alias' (alias == true) or '&anchor' token and enqueues it.
    fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
        self.save_simple_key()?;
        self.disallow_simple_key();

        let tok = self.scan_anchor(alias)?;

        self.tokens.push_back(tok);

        Ok(())
    }

    // Scans the name after '*'/'&'; it must be a non-empty alphanumeric run
    // terminated by a blank/break or a flow indicator.
    fn scan_anchor(&mut self, alias: bool) -> Result<Token, ScanError> {
        let mut string = String::new();
        let start_mark = self.mark;

        self.skip();
        self.lookahead(1);
        while is_alpha(self.ch()) {
            string.push(self.ch());
            self.skip();
            self.lookahead(1);
        }

        if string.is_empty()
            || match self.ch() {
                c if is_blankz(c) => false,
                '?' | ':' | ',' | ']' | '}' | '%' | '@' | '`' => false,
                _ => true,
            }
        {
            return Err(ScanError::new(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
        }

        if alias {
            Ok(Token(start_mark, TokenType::Alias(string)))
        } else {
            Ok(Token(start_mark, TokenType::Anchor(string)))
        }
    }

    fn fetch_flow_collection_start(&mut self, tok: TokenType) -> ScanResult {
        // The indicators '[' and '{' may start a simple key.
// NOTE(review): this span continues `fetch_flow_collection_start` from just
// above.
        self.save_simple_key()?;

        self.increase_flow_level()?;

        // Inside a new flow collection a simple key is allowed again.
        self.allow_simple_key();

        let start_mark = self.mark;
        self.skip();

        self.tokens.push_back(Token(start_mark, tok));
        Ok(())
    }

    // Emits a ']' or '}' token and leaves the flow level.
    fn fetch_flow_collection_end(&mut self, tok: TokenType) -> ScanResult {
        self.remove_simple_key()?;
        self.decrease_flow_level();

        self.disallow_simple_key();

        let start_mark = self.mark;
        self.skip();

        self.tokens.push_back(Token(start_mark, tok));
        Ok(())
    }

    // Emits a ',' token; a simple key may follow it.
    fn fetch_flow_entry(&mut self) -> ScanResult {
        self.remove_simple_key()?;
        self.allow_simple_key();

        let start_mark = self.mark;
        self.skip();

        self.tokens
            .push_back(Token(start_mark, TokenType::FlowEntry));
        Ok(())
    }

    // Pushes a fresh simple-key slot and bumps the (u8, overflow-checked)
    // flow nesting level.
    fn increase_flow_level(&mut self) -> ScanResult {
        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
        self.flow_level = self
            .flow_level
            .checked_add(1)
            .ok_or_else(|| ScanError::new(self.mark, "recursion limit exceeded"))?;
        Ok(())
    }
    fn decrease_flow_level(&mut self) {
        if self.flow_level > 0 {
            self.flow_level -= 1;
            self.simple_keys.pop().unwrap();
        }
    }

    // Handles a '-' block entry: only valid in block context, and only where
    // a simple key would be allowed; opens a block sequence if indented.
    fn fetch_block_entry(&mut self) -> ScanResult {
        if self.flow_level == 0 {
            // Check if we are allowed to start a new entry.
            if !self.simple_key_allowed {
                return Err(ScanError::new(
                    self.mark,
                    "block sequence entries are not allowed in this context",
                ));
            }

            let mark = self.mark;
            // generate BLOCK-SEQUENCE-START if indented
            self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
        } else {
            // - * only allowed in block
            return Err(ScanError::new(
                self.mark,
                r#""-" is only valid inside a block"#,
            ));
        }
        self.remove_simple_key()?;
        self.allow_simple_key();

        let start_mark = self.mark;
        self.skip();

        self.tokens
            .push_back(Token(start_mark, TokenType::BlockEntry));
        Ok(())
    }

    // Emits a DocumentStart/DocumentEnd token for "---"/"..." (3 chars).
    fn fetch_document_indicator(&mut self, t: TokenType) -> ScanResult {
        self.unroll_indent(-1);
        self.remove_simple_key()?;
        self.disallow_simple_key();

        let mark = self.mark;

        self.skip();
        self.skip();
        self.skip();

        self.tokens.push_back(Token(mark, t));
        Ok(())
    }

    // Scans a '|' (literal == true) or '>' (folded) block scalar.
    fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
        self.save_simple_key()?;
        self.allow_simple_key();
        let tok = self.scan_block_scalar(literal)?;

        self.tokens.push_back(tok);
        Ok(())
    }

    // Scans a block scalar body: header (chomping '+'/'-' and optional
    // 1-9 indentation indicator, in either order), then the indented text.
    // chomping: 1 = keep trailing breaks, -1 = strip, 0 = clip (default).
    fn scan_block_scalar(&mut self, literal: bool) -> Result<Token, ScanError> {
        let start_mark = self.mark;
        let mut chomping: i32 = 0;
        let mut increment: usize = 0;
        let mut indent: usize = 0;
        let mut trailing_blank: bool;
        let mut leading_blank: bool = false;

        let mut string = String::new();
        let mut leading_break = String::new();
        let mut trailing_breaks = String::new();

        // skip '|' or '>'
        self.skip();
        self.lookahead(1);

        if self.ch() == '+' || self.ch() == '-' {
            // Chomping indicator first, then optional indentation digit.
            if self.ch() == '+' {
                chomping = 1;
            } else {
                chomping = -1;
            }
            self.skip();
            self.lookahead(1);
            if is_digit(self.ch()) {
                if self.ch() == '0' {
                    return Err(ScanError::new(
                        start_mark,
                        "while scanning a block scalar, found an indentation indicator equal to 0",
                    ));
                }
                increment = (self.ch() as usize) - ('0' as usize);
                self.skip();
            }
        } else if is_digit(self.ch()) {
            // Indentation digit first, then optional chomping indicator.
            if self.ch() == '0' {
                return Err(ScanError::new(
                    start_mark,
                    "while scanning a block scalar, found an indentation indicator equal to 0",
                ));
            }

            increment = (self.ch() as usize) - ('0' as usize);
            self.skip();
            self.lookahead(1);
            if self.ch() == '+' || self.ch() == '-' {
                if self.ch() == '+' {
                    chomping = 1;
                } else {
                    chomping = -1;
                }
                self.skip();
            }
        }

        // Eat whitespaces and comments to the end of the line.
        self.lookahead(1);

        while is_blank(self.ch()) {
            self.skip();
            self.lookahead(1);
        }

        if self.ch() == '#' {
            while !is_breakz(self.ch()) {
                self.skip();
                self.lookahead(1);
            }
        }

        // Check if we are at the end of the line.
        if !is_breakz(self.ch()) {
            return Err(ScanError::new(
                start_mark,
                "while scanning a block scalar, did not find expected comment or line break",
            ));
        }

        if is_break(self.ch()) {
            self.lookahead(2);
            self.skip_line();
        }

        // An explicit indentation indicator is relative to the parent indent.
        if increment > 0 {
            indent = if self.indent >= 0 {
                (self.indent + increment as isize) as usize
            } else {
                increment
            }
        }
        // Scan the leading line breaks and determine the indentation level if needed.
        self.block_scalar_breaks(&mut indent, &mut trailing_breaks)?;

        self.lookahead(1);

        let start_mark = self.mark;

        while self.mark.col == indent && !is_z(self.ch()) {
            // We are at the beginning of a non-empty line.
// NOTE(review): this span continues the line-consuming loop of
// `scan_block_scalar` from just above.
            trailing_blank = is_blank(self.ch());
            // Folded scalars join non-blank lines with a space unless blank
            // lines intervene; literal scalars keep the break verbatim.
            if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
                if trailing_breaks.is_empty() {
                    string.push(' ');
                }
                leading_break.clear();
            } else {
                string.push_str(&leading_break);
                leading_break.clear();
            }

            string.push_str(&trailing_breaks);
            trailing_breaks.clear();

            leading_blank = is_blank(self.ch());

            while !is_breakz(self.ch()) {
                string.push(self.ch());
                self.skip();
                self.lookahead(1);
            }
            // break on EOF
            if is_z(self.ch()) {
                break;
            }

            self.lookahead(2);
            self.read_break(&mut leading_break);

            // Eat the following indentation spaces and line breaks.
            self.block_scalar_breaks(&mut indent, &mut trailing_breaks)?;
        }

        // Chomp the tail.
        if chomping != -1 {
            string.push_str(&leading_break);
        }

        if chomping == 1 {
            string.push_str(&trailing_breaks);
        }

        if literal {
            Ok(Token(
                start_mark,
                TokenType::Scalar(TScalarStyle::Literal, string),
            ))
        } else {
            Ok(Token(
                start_mark,
                TokenType::Scalar(TScalarStyle::Foled, string),
            ))
        }
    }

    // Consumes indentation and blank lines between block-scalar content
    // lines, collecting the breaks into `breaks`; if `indent` is still 0
    // (auto-detect) it is set from the deepest indentation seen, floored at
    // parent indent + 1 and at 1.
    fn block_scalar_breaks(&mut self, indent: &mut usize, breaks: &mut String) -> ScanResult {
        let mut max_indent = 0;
        loop {
            self.lookahead(1);
            while (*indent == 0 || self.mark.col < *indent) && self.buffer[0] == ' ' {
                self.skip();
                self.lookahead(1);
            }

            if self.mark.col > max_indent {
                max_indent = self.mark.col;
            }

            // Check for a tab character messing the indentation.
            if (*indent == 0 || self.mark.col < *indent) && self.buffer[0] == '\t' {
                return Err(ScanError::new(self.mark,
                    "while scanning a block scalar, found a tab character where an indentation space is expected"));
            }

            if !is_break(self.ch()) {
                break;
            }

            self.lookahead(2);
            // Consume the line break.
            self.read_break(breaks);
        }

        if *indent == 0 {
            *indent = max_indent;
            if *indent < (self.indent + 1) as usize {
                *indent = (self.indent + 1) as usize;
            }
            if *indent < 1 {
                *indent = 1;
            }
        }
        Ok(())
    }

    // Scans a single- ('...') or double-quoted ("...") flow scalar.
    fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
        self.save_simple_key()?;
        self.disallow_simple_key();

        let tok = self.scan_flow_scalar(single)?;

        // From spec: To ensure JSON compatibility, if a key inside a flow mapping is JSON-like,
        // YAML allows the following value to be specified adjacent to the “:”.
        self.adjacent_value_allowed_at = self.mark.index;

        self.tokens.push_back(tok);
        Ok(())
    }

    // Scans the body of a quoted scalar, handling '' escaping in single
    // quotes and backslash escapes in double quotes.
    // NOTE(review): this function continues past the end of this chunk.
    fn scan_flow_scalar(&mut self, single: bool) -> Result<Token, ScanError> {
        let start_mark = self.mark;

        let mut string = String::new();
        let mut leading_break = String::new();
        let mut trailing_breaks = String::new();
        let mut whitespaces = String::new();
        let mut leading_blanks;

        /* Eat the left quote. */
        self.skip();

        loop {
            /* Check for a document indicator. */
            self.lookahead(4);

            if self.mark.col == 0
                && (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-'))
                    || ((self.buffer[0] == '.')
                        && (self.buffer[1] == '.')
                        && (self.buffer[2] == '.')))
                && is_blankz(self.buffer[3])
            {
                return Err(ScanError::new(
                    start_mark,
                    "while scanning a quoted scalar, found unexpected document indicator",
                ));
            }

            if is_z(self.ch()) {
                return Err(ScanError::new(
                    start_mark,
                    "while scanning a quoted scalar, found unexpected end of stream",
                ));
            }

            self.lookahead(2);

            leading_blanks = false;
            // Consume non-blank characters.

            while !is_blankz(self.ch()) {
                match self.ch() {
                    // Check for an escaped single quote.
                    '\'' if self.buffer[1] == '\'' && single => {
                        string.push('\'');
                        self.skip();
                        self.skip();
                    }
                    // Check for the right quote.
                    '\'' if single => break,
                    '"' if !single => break,
                    // Check for an escaped line break.
                    '\\' if !single && is_break(self.buffer[1]) => {
                        self.lookahead(3);
                        self.skip();
                        self.skip_line();
                        leading_blanks = true;
                        break;
                    }
                    // Check for an escape sequence.
                    '\\' if !single => {
                        let mut code_length = 0usize;
                        match self.buffer[1] {
                            '0' => string.push('\0'),
                            'a' => string.push('\x07'),
                            'b' => string.push('\x08'),
                            't' | '\t' => string.push('\t'),
                            'n' => string.push('\n'),
                            'v' => string.push('\x0b'),
                            'f' => string.push('\x0c'),
                            'r' => string.push('\x0d'),
                            'e' => string.push('\x1b'),
                            ' ' => string.push('\x20'),
                            '"' => string.push('"'),
                            '\'' => string.push('\''),
                            '\\' => string.push('\\'),
                            // NEL (#x85)
                            'N' => string.push(char::from_u32(0x85).unwrap()),
                            // #xA0
                            '_' => string.push(char::from_u32(0xA0).unwrap()),
                            // LS (#x2028)
                            'L' => string.push(char::from_u32(0x2028).unwrap()),
                            // PS (#x2029)
                            'P' => string.push(char::from_u32(0x2029).unwrap()),
                            'x' => code_length = 2,
                            'u' => code_length = 4,
                            'U' => code_length = 8,
                            _ => {
                                return Err(ScanError::new(
                                    start_mark,
                                    "while parsing a quoted scalar, found unknown escape character",
                                ))
                            }
                        }
                        self.skip();
                        self.skip();
                        // Consume an arbitrary escape code.
+ if code_length > 0 { + self.lookahead(code_length); + let mut value = 0u32; + for i in 0..code_length { + if !is_hex(self.buffer[i]) { + return Err(ScanError::new(start_mark, + "while parsing a quoted scalar, did not find expected hexadecimal number")); + } + value = (value << 4) + as_hex(self.buffer[i]); + } + + let ch = match char::from_u32(value) { + Some(v) => v, + None => { + return Err(ScanError::new(start_mark, + "while parsing a quoted scalar, found invalid Unicode character escape code")); + } + }; + string.push(ch); + + for _ in 0..code_length { + self.skip(); + } + } + } + c => { + string.push(c); + self.skip(); + } + } + self.lookahead(2); + } + self.lookahead(1); + match self.ch() { + '\'' if single => break, + '"' if !single => break, + _ => {} + } + + // Consume blank characters. + while is_blank(self.ch()) || is_break(self.ch()) { + if is_blank(self.ch()) { + // Consume a space or a tab character. + if leading_blanks { + self.skip(); + } else { + whitespaces.push(self.ch()); + self.skip(); + } + } else { + self.lookahead(2); + // Check if it is a first line break. + if leading_blanks { + self.read_break(&mut trailing_breaks); + } else { + whitespaces.clear(); + self.read_break(&mut leading_break); + leading_blanks = true; + } + } + self.lookahead(1); + } + // Join the whitespaces or fold line breaks. + if leading_blanks { + if leading_break.is_empty() { + string.push_str(&leading_break); + string.push_str(&trailing_breaks); + trailing_breaks.clear(); + leading_break.clear(); + } else { + if trailing_breaks.is_empty() { + string.push(' '); + } else { + string.push_str(&trailing_breaks); + trailing_breaks.clear(); + } + leading_break.clear(); + } + } else { + string.push_str(&whitespaces); + whitespaces.clear(); + } + } // loop + + // Eat the right quote. 
+ self.skip(); + + if single { + Ok(Token( + start_mark, + TokenType::Scalar(TScalarStyle::SingleQuoted, string), + )) + } else { + Ok(Token( + start_mark, + TokenType::Scalar(TScalarStyle::DoubleQuoted, string), + )) + } + } + + fn fetch_plain_scalar(&mut self) -> ScanResult { + self.save_simple_key()?; + self.disallow_simple_key(); + + let tok = self.scan_plain_scalar()?; + + self.tokens.push_back(tok); + Ok(()) + } + + fn scan_plain_scalar(&mut self) -> Result<Token, ScanError> { + let indent = self.indent + 1; + let start_mark = self.mark; + + let mut string = String::new(); + let mut leading_break = String::new(); + let mut trailing_breaks = String::new(); + let mut whitespaces = String::new(); + let mut leading_blanks = false; + + loop { + /* Check for a document indicator. */ + self.lookahead(4); + + if self.mark.col == 0 + && (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-')) + || ((self.buffer[0] == '.') + && (self.buffer[1] == '.') + && (self.buffer[2] == '.'))) + && is_blankz(self.buffer[3]) + { + break; + } + + if self.ch() == '#' { + break; + } + while !is_blankz(self.ch()) { + // indicators can end a plain scalar, see 7.3.3. Plain Style + match self.ch() { + ':' if is_blankz(self.buffer[1]) + || (self.flow_level > 0 && is_flow(self.buffer[1])) => + { + break; + } + ',' | '[' | ']' | '{' | '}' if self.flow_level > 0 => break, + _ => {} + } + + if leading_blanks || !whitespaces.is_empty() { + if leading_blanks { + if leading_break.is_empty() { + string.push_str(&leading_break); + string.push_str(&trailing_breaks); + trailing_breaks.clear(); + leading_break.clear(); + } else { + if trailing_breaks.is_empty() { + string.push(' '); + } else { + string.push_str(&trailing_breaks); + trailing_breaks.clear(); + } + leading_break.clear(); + } + leading_blanks = false; + } else { + string.push_str(&whitespaces); + whitespaces.clear(); + } + } + + string.push(self.ch()); + self.skip(); + self.lookahead(2); + } + // is the end? 
+ if !(is_blank(self.ch()) || is_break(self.ch())) { + break; + } + self.lookahead(1); + + while is_blank(self.ch()) || is_break(self.ch()) { + if is_blank(self.ch()) { + if leading_blanks && (self.mark.col as isize) < indent && self.ch() == '\t' { + return Err(ScanError::new( + start_mark, + "while scanning a plain scalar, found a tab", + )); + } + + if leading_blanks { + self.skip(); + } else { + whitespaces.push(self.ch()); + self.skip(); + } + } else { + self.lookahead(2); + // Check if it is a first line break + if leading_blanks { + self.read_break(&mut trailing_breaks); + } else { + whitespaces.clear(); + self.read_break(&mut leading_break); + leading_blanks = true; + } + } + self.lookahead(1); + } + + // check indentation level + if self.flow_level == 0 && (self.mark.col as isize) < indent { + break; + } + } + + if leading_blanks { + self.allow_simple_key(); + } + + Ok(Token( + start_mark, + TokenType::Scalar(TScalarStyle::Plain, string), + )) + } + + fn fetch_key(&mut self) -> ScanResult { + let start_mark = self.mark; + if self.flow_level == 0 { + // Check if we are allowed to start a new key (not necessarily simple). + if !self.simple_key_allowed { + return Err(ScanError::new( + self.mark, + "mapping keys are not allowed in this context", + )); + } + self.roll_indent( + start_mark.col, + None, + TokenType::BlockMappingStart, + start_mark, + ); + } + + self.remove_simple_key()?; + + if self.flow_level == 0 { + self.allow_simple_key(); + } else { + self.disallow_simple_key(); + } + + self.skip(); + self.tokens.push_back(Token(start_mark, TokenType::Key)); + Ok(()) + } + + fn fetch_value(&mut self) -> ScanResult { + let sk = self.simple_keys.last().unwrap().clone(); + let start_mark = self.mark; + if sk.possible { + // insert simple key + let tok = Token(sk.mark, TokenType::Key); + let tokens_parsed = self.tokens_parsed; + self.insert_token(sk.token_number - tokens_parsed, tok); + + // Add the BLOCK-MAPPING-START token if needed. 
+ self.roll_indent( + sk.mark.col, + Some(sk.token_number), + TokenType::BlockMappingStart, + start_mark, + ); + + self.simple_keys.last_mut().unwrap().possible = false; + self.disallow_simple_key(); + } else { + // The ':' indicator follows a complex key. + if self.flow_level == 0 { + if !self.simple_key_allowed { + return Err(ScanError::new( + start_mark, + "mapping values are not allowed in this context", + )); + } + + self.roll_indent( + start_mark.col, + None, + TokenType::BlockMappingStart, + start_mark, + ); + } + + if self.flow_level == 0 { + self.allow_simple_key(); + } else { + self.disallow_simple_key(); + } + } + self.skip(); + self.tokens.push_back(Token(start_mark, TokenType::Value)); + + Ok(()) + } + + fn roll_indent(&mut self, col: usize, number: Option<usize>, tok: TokenType, mark: Marker) { + if self.flow_level > 0 { + return; + } + + if self.indent < col as isize { + self.indents.push(self.indent); + self.indent = col as isize; + let tokens_parsed = self.tokens_parsed; + match number { + Some(n) => self.insert_token(n - tokens_parsed, Token(mark, tok)), + None => self.tokens.push_back(Token(mark, tok)), + } + } + } + + fn unroll_indent(&mut self, col: isize) { + if self.flow_level > 0 { + return; + } + while self.indent > col { + self.tokens.push_back(Token(self.mark, TokenType::BlockEnd)); + self.indent = self.indents.pop().unwrap(); + } + } + + fn save_simple_key(&mut self) -> Result<(), ScanError> { + let required = self.flow_level > 0 && self.indent == (self.mark.col as isize); + if self.simple_key_allowed { + let mut sk = SimpleKey::new(self.mark); + sk.possible = true; + sk.required = required; + sk.token_number = self.tokens_parsed + self.tokens.len(); + + self.remove_simple_key()?; + + self.simple_keys.pop(); + self.simple_keys.push(sk); + } + Ok(()) + } + + fn remove_simple_key(&mut self) -> ScanResult { + let last = self.simple_keys.last_mut().unwrap(); + if last.possible && last.required { + return Err(ScanError::new(self.mark, "simple 
key expected")); + } + + last.possible = false; + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::TokenType::*; + use super::*; + + macro_rules! next { + ($p:ident, $tk:pat) => {{ + let tok = $p.next().unwrap(); + match tok.1 { + $tk => {} + _ => panic!("unexpected token: {:?}", tok), + } + }}; + } + + macro_rules! next_scalar { + ($p:ident, $tk:expr, $v:expr) => {{ + let tok = $p.next().unwrap(); + match tok.1 { + Scalar(style, ref v) => { + assert_eq!(style, $tk); + assert_eq!(v, $v); + } + _ => panic!("unexpected token: {:?}", tok), + } + }}; + } + + macro_rules! end { + ($p:ident) => {{ + assert_eq!($p.next(), None); + }}; + } + /// test cases in libyaml scanner.c + #[test] + fn test_empty() { + let s = ""; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next!(p, StreamEnd); + end!(p); + } + + #[test] + fn test_scalar() { + let s = "a scalar"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next!(p, Scalar(TScalarStyle::Plain, _)); + next!(p, StreamEnd); + end!(p); + } + + #[test] + fn test_explicit_scalar() { + let s = "--- +'a scalar' +... 
+"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next!(p, DocumentStart); + next!(p, Scalar(TScalarStyle::SingleQuoted, _)); + next!(p, DocumentEnd); + next!(p, StreamEnd); + end!(p); + } + + #[test] + fn test_multiple_documents() { + let s = " +'a scalar' +--- +'a scalar' +--- +'a scalar' +"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next!(p, Scalar(TScalarStyle::SingleQuoted, _)); + next!(p, DocumentStart); + next!(p, Scalar(TScalarStyle::SingleQuoted, _)); + next!(p, DocumentStart); + next!(p, Scalar(TScalarStyle::SingleQuoted, _)); + next!(p, StreamEnd); + end!(p); + } + + #[test] + fn test_a_flow_sequence() { + let s = "[item 1, item 2, item 3]"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next!(p, FlowSequenceStart); + next_scalar!(p, TScalarStyle::Plain, "item 1"); + next!(p, FlowEntry); + next!(p, Scalar(TScalarStyle::Plain, _)); + next!(p, FlowEntry); + next!(p, Scalar(TScalarStyle::Plain, _)); + next!(p, FlowSequenceEnd); + next!(p, StreamEnd); + end!(p); + } + + #[test] + fn test_a_flow_mapping() { + let s = " +{ + a simple key: a value, # Note that the KEY token is produced. + ? 
a complex key: another value, +} +"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next!(p, FlowMappingStart); + next!(p, Key); + next!(p, Scalar(TScalarStyle::Plain, _)); + next!(p, Value); + next!(p, Scalar(TScalarStyle::Plain, _)); + next!(p, FlowEntry); + next!(p, Key); + next_scalar!(p, TScalarStyle::Plain, "a complex key"); + next!(p, Value); + next!(p, Scalar(TScalarStyle::Plain, _)); + next!(p, FlowEntry); + next!(p, FlowMappingEnd); + next!(p, StreamEnd); + end!(p); + } + + #[test] + fn test_block_sequences() { + let s = " +- item 1 +- item 2 +- + - item 3.1 + - item 3.2 +- + key 1: value 1 + key 2: value 2 +"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next!(p, BlockSequenceStart); + next!(p, BlockEntry); + next_scalar!(p, TScalarStyle::Plain, "item 1"); + next!(p, BlockEntry); + next_scalar!(p, TScalarStyle::Plain, "item 2"); + next!(p, BlockEntry); + next!(p, BlockSequenceStart); + next!(p, BlockEntry); + next_scalar!(p, TScalarStyle::Plain, "item 3.1"); + next!(p, BlockEntry); + next_scalar!(p, TScalarStyle::Plain, "item 3.2"); + next!(p, BlockEnd); + next!(p, BlockEntry); + next!(p, BlockMappingStart); + next!(p, Key); + next_scalar!(p, TScalarStyle::Plain, "key 1"); + next!(p, Value); + next_scalar!(p, TScalarStyle::Plain, "value 1"); + next!(p, Key); + next_scalar!(p, TScalarStyle::Plain, "key 2"); + next!(p, Value); + next_scalar!(p, TScalarStyle::Plain, "value 2"); + next!(p, BlockEnd); + next!(p, BlockEnd); + next!(p, StreamEnd); + end!(p); + } + + #[test] + fn test_block_mappings() { + let s = " +a simple key: a value # The KEY token is produced here. +? 
a complex key +: another value +a mapping: + key 1: value 1 + key 2: value 2 +a sequence: + - item 1 + - item 2 +"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next!(p, BlockMappingStart); + next!(p, Key); + next!(p, Scalar(_, _)); + next!(p, Value); + next!(p, Scalar(_, _)); + next!(p, Key); + next!(p, Scalar(_, _)); + next!(p, Value); + next!(p, Scalar(_, _)); + next!(p, Key); + next!(p, Scalar(_, _)); + next!(p, Value); // libyaml comment seems to be wrong + next!(p, BlockMappingStart); + next!(p, Key); + next!(p, Scalar(_, _)); + next!(p, Value); + next!(p, Scalar(_, _)); + next!(p, Key); + next!(p, Scalar(_, _)); + next!(p, Value); + next!(p, Scalar(_, _)); + next!(p, BlockEnd); + next!(p, Key); + next!(p, Scalar(_, _)); + next!(p, Value); + next!(p, BlockSequenceStart); + next!(p, BlockEntry); + next!(p, Scalar(_, _)); + next!(p, BlockEntry); + next!(p, Scalar(_, _)); + next!(p, BlockEnd); + next!(p, BlockEnd); + next!(p, StreamEnd); + end!(p); + } + + #[test] + fn test_no_block_sequence_start() { + let s = " +key: +- item 1 +- item 2 +"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next!(p, BlockMappingStart); + next!(p, Key); + next_scalar!(p, TScalarStyle::Plain, "key"); + next!(p, Value); + next!(p, BlockEntry); + next_scalar!(p, TScalarStyle::Plain, "item 1"); + next!(p, BlockEntry); + next_scalar!(p, TScalarStyle::Plain, "item 2"); + next!(p, BlockEnd); + next!(p, StreamEnd); + end!(p); + } + + #[test] + fn test_collections_in_sequence() { + let s = " +- - item 1 + - item 2 +- key 1: value 1 + key 2: value 2 +- ? 
complex key + : complex value +"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next!(p, BlockSequenceStart); + next!(p, BlockEntry); + next!(p, BlockSequenceStart); + next!(p, BlockEntry); + next_scalar!(p, TScalarStyle::Plain, "item 1"); + next!(p, BlockEntry); + next_scalar!(p, TScalarStyle::Plain, "item 2"); + next!(p, BlockEnd); + next!(p, BlockEntry); + next!(p, BlockMappingStart); + next!(p, Key); + next_scalar!(p, TScalarStyle::Plain, "key 1"); + next!(p, Value); + next_scalar!(p, TScalarStyle::Plain, "value 1"); + next!(p, Key); + next_scalar!(p, TScalarStyle::Plain, "key 2"); + next!(p, Value); + next_scalar!(p, TScalarStyle::Plain, "value 2"); + next!(p, BlockEnd); + next!(p, BlockEntry); + next!(p, BlockMappingStart); + next!(p, Key); + next_scalar!(p, TScalarStyle::Plain, "complex key"); + next!(p, Value); + next_scalar!(p, TScalarStyle::Plain, "complex value"); + next!(p, BlockEnd); + next!(p, BlockEnd); + next!(p, StreamEnd); + end!(p); + } + + #[test] + fn test_collections_in_mapping() { + let s = " +? a sequence +: - item 1 + - item 2 +? 
a mapping +: key 1: value 1 + key 2: value 2 +"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next!(p, BlockMappingStart); + next!(p, Key); + next_scalar!(p, TScalarStyle::Plain, "a sequence"); + next!(p, Value); + next!(p, BlockSequenceStart); + next!(p, BlockEntry); + next_scalar!(p, TScalarStyle::Plain, "item 1"); + next!(p, BlockEntry); + next_scalar!(p, TScalarStyle::Plain, "item 2"); + next!(p, BlockEnd); + next!(p, Key); + next_scalar!(p, TScalarStyle::Plain, "a mapping"); + next!(p, Value); + next!(p, BlockMappingStart); + next!(p, Key); + next_scalar!(p, TScalarStyle::Plain, "key 1"); + next!(p, Value); + next_scalar!(p, TScalarStyle::Plain, "value 1"); + next!(p, Key); + next_scalar!(p, TScalarStyle::Plain, "key 2"); + next!(p, Value); + next_scalar!(p, TScalarStyle::Plain, "value 2"); + next!(p, BlockEnd); + next!(p, BlockEnd); + next!(p, StreamEnd); + end!(p); + } + + #[test] + fn test_spec_ex7_3() { + let s = " +{ + ? foo :, + : bar, +} +"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next!(p, FlowMappingStart); + next!(p, Key); + next_scalar!(p, TScalarStyle::Plain, "foo"); + next!(p, Value); + next!(p, FlowEntry); + next!(p, Value); + next_scalar!(p, TScalarStyle::Plain, "bar"); + next!(p, FlowEntry); + next!(p, FlowMappingEnd); + next!(p, StreamEnd); + end!(p); + } + + #[test] + fn test_plain_scalar_starting_with_indicators_in_flow() { + // "Plain scalars must not begin with most indicators, as this would cause ambiguity with + // other YAML constructs. However, the “:”, “?” and “-” indicators may be used as the first + // character if followed by a non-space “safe” character, as this causes no ambiguity." 
+ + let s = "{a: :b}"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next!(p, FlowMappingStart); + next!(p, Key); + next_scalar!(p, TScalarStyle::Plain, "a"); + next!(p, Value); + next_scalar!(p, TScalarStyle::Plain, ":b"); + next!(p, FlowMappingEnd); + next!(p, StreamEnd); + end!(p); + + let s = "{a: ?b}"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next!(p, FlowMappingStart); + next!(p, Key); + next_scalar!(p, TScalarStyle::Plain, "a"); + next!(p, Value); + next_scalar!(p, TScalarStyle::Plain, "?b"); + next!(p, FlowMappingEnd); + next!(p, StreamEnd); + end!(p); + } + + #[test] + fn test_plain_scalar_starting_with_indicators_in_block() { + let s = ":a"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next_scalar!(p, TScalarStyle::Plain, ":a"); + next!(p, StreamEnd); + end!(p); + + let s = "?a"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next_scalar!(p, TScalarStyle::Plain, "?a"); + next!(p, StreamEnd); + end!(p); + } + + #[test] + fn test_plain_scalar_containing_indicators_in_block() { + let s = "a:,b"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next_scalar!(p, TScalarStyle::Plain, "a:,b"); + next!(p, StreamEnd); + end!(p); + + let s = ":,b"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next_scalar!(p, TScalarStyle::Plain, ":,b"); + next!(p, StreamEnd); + end!(p); + } + + #[test] + fn test_scanner_cr() { + let s = "---\r\n- tok1\r\n- tok2"; + let mut p = Scanner::new(s.chars()); + next!(p, StreamStart(..)); + next!(p, DocumentStart); + next!(p, BlockSequenceStart); + next!(p, BlockEntry); + next_scalar!(p, TScalarStyle::Plain, "tok1"); + next!(p, BlockEntry); + next_scalar!(p, TScalarStyle::Plain, "tok2"); + next!(p, BlockEnd); + next!(p, StreamEnd); + end!(p); + } + + #[test] + fn test_uri() { + // TODO + } + + #[test] + fn test_uri_escapes() { + // TODO + } +} diff --git a/third_party/rust/yaml-rust/src/yaml.rs 
b/third_party/rust/yaml-rust/src/yaml.rs new file mode 100644 index 0000000000..4bb70da531 --- /dev/null +++ b/third_party/rust/yaml-rust/src/yaml.rs @@ -0,0 +1,739 @@ +use linked_hash_map::LinkedHashMap; +use crate::parser::*; +use crate::scanner::{Marker, ScanError, TScalarStyle, TokenType}; +use std::collections::BTreeMap; +use std::f64; +use std::i64; +use std::mem; +use std::ops::Index; +use std::string; +use std::vec; + +/// A YAML node is stored as this `Yaml` enumeration, which provides an easy way to +/// access your YAML document. +/// +/// # Examples +/// +/// ``` +/// use yaml_rust::Yaml; +/// let foo = Yaml::from_str("-123"); // convert the string to the appropriate YAML type +/// assert_eq!(foo.as_i64().unwrap(), -123); +/// +/// // iterate over an Array +/// let vec = Yaml::Array(vec![Yaml::Integer(1), Yaml::Integer(2)]); +/// for v in vec.as_vec().unwrap() { +/// assert!(v.as_i64().is_some()); +/// } +/// ``` +#[derive(Clone, PartialEq, PartialOrd, Debug, Eq, Ord, Hash)] +pub enum Yaml { + /// Float types are stored as String and parsed on demand. + /// Note that f64 does NOT implement Eq trait and can NOT be stored in BTreeMap. + Real(string::String), + /// YAML int is stored as i64. + Integer(i64), + /// YAML scalar. + String(string::String), + /// YAML bool, e.g. `true` or `false`. + Boolean(bool), + /// YAML array, can be accessed as a `Vec`. + Array(self::Array), + /// YAML hash, can be accessed as a `LinkedHashMap`. + /// + /// Insertion order will match the order of insertion into the map. + Hash(self::Hash), + /// Alias, not fully supported yet. + Alias(usize), + /// YAML null, e.g. `null` or `~`. + Null, + /// Accessing a nonexistent node via the Index trait returns `BadValue`. This + /// simplifies error handling in the calling code. Invalid type conversion also + /// returns `BadValue`. 
+ BadValue, +} + +pub type Array = Vec<Yaml>; +pub type Hash = LinkedHashMap<Yaml, Yaml>; + +// parse f64 as Core schema +// See: https://github.com/chyh1990/yaml-rust/issues/51 +fn parse_f64(v: &str) -> Option<f64> { + match v { + ".inf" | ".Inf" | ".INF" | "+.inf" | "+.Inf" | "+.INF" => Some(f64::INFINITY), + "-.inf" | "-.Inf" | "-.INF" => Some(f64::NEG_INFINITY), + ".nan" | "NaN" | ".NAN" => Some(f64::NAN), + _ => v.parse::<f64>().ok(), + } +} + +pub struct YamlLoader { + docs: Vec<Yaml>, + // states + // (current node, anchor_id) tuple + doc_stack: Vec<(Yaml, usize)>, + key_stack: Vec<Yaml>, + anchor_map: BTreeMap<usize, Yaml>, +} + +impl MarkedEventReceiver for YamlLoader { + fn on_event(&mut self, ev: Event, _: Marker) { + // println!("EV {:?}", ev); + match ev { + Event::DocumentStart => { + // do nothing + } + Event::DocumentEnd => { + match self.doc_stack.len() { + // empty document + 0 => self.docs.push(Yaml::BadValue), + 1 => self.docs.push(self.doc_stack.pop().unwrap().0), + _ => unreachable!(), + } + } + Event::SequenceStart(aid) => { + self.doc_stack.push((Yaml::Array(Vec::new()), aid)); + } + Event::SequenceEnd => { + let node = self.doc_stack.pop().unwrap(); + self.insert_new_node(node); + } + Event::MappingStart(aid) => { + self.doc_stack.push((Yaml::Hash(Hash::new()), aid)); + self.key_stack.push(Yaml::BadValue); + } + Event::MappingEnd => { + self.key_stack.pop().unwrap(); + let node = self.doc_stack.pop().unwrap(); + self.insert_new_node(node); + } + Event::Scalar(v, style, aid, tag) => { + let node = if style != TScalarStyle::Plain { + Yaml::String(v) + } else if let Some(TokenType::Tag(ref handle, ref suffix)) = tag { + // XXX tag:yaml.org,2002: + if handle == "!!" 
{ + match suffix.as_ref() { + "bool" => { + // "true" or "false" + match v.parse::<bool>() { + Err(_) => Yaml::BadValue, + Ok(v) => Yaml::Boolean(v), + } + } + "int" => match v.parse::<i64>() { + Err(_) => Yaml::BadValue, + Ok(v) => Yaml::Integer(v), + }, + "float" => match parse_f64(&v) { + Some(_) => Yaml::Real(v), + None => Yaml::BadValue, + }, + "null" => match v.as_ref() { + "~" | "null" => Yaml::Null, + _ => Yaml::BadValue, + }, + _ => Yaml::String(v), + } + } else { + Yaml::String(v) + } + } else { + // Datatype is not specified, or unrecognized + Yaml::from_str(&v) + }; + + self.insert_new_node((node, aid)); + } + Event::Alias(id) => { + let n = match self.anchor_map.get(&id) { + Some(v) => v.clone(), + None => Yaml::BadValue, + }; + self.insert_new_node((n, 0)); + } + _ => { /* ignore */ } + } + // println!("DOC {:?}", self.doc_stack); + } +} + +impl YamlLoader { + fn insert_new_node(&mut self, node: (Yaml, usize)) { + // valid anchor id starts from 1 + if node.1 > 0 { + self.anchor_map.insert(node.1, node.0.clone()); + } + if self.doc_stack.is_empty() { + self.doc_stack.push(node); + } else { + let parent = self.doc_stack.last_mut().unwrap(); + match *parent { + (Yaml::Array(ref mut v), _) => v.push(node.0), + (Yaml::Hash(ref mut h), _) => { + let cur_key = self.key_stack.last_mut().unwrap(); + // current node is a key + if cur_key.is_badvalue() { + *cur_key = node.0; + // current node is a value + } else { + let mut newkey = Yaml::BadValue; + mem::swap(&mut newkey, cur_key); + h.insert(newkey, node.0); + } + } + _ => unreachable!(), + } + } + } + + pub fn load_from_str(source: &str) -> Result<Vec<Yaml>, ScanError> { + let mut loader = YamlLoader { + docs: Vec::new(), + doc_stack: Vec::new(), + key_stack: Vec::new(), + anchor_map: BTreeMap::new(), + }; + let mut parser = Parser::new(source.chars()); + parser.load(&mut loader, true)?; + Ok(loader.docs) + } +} + +macro_rules! 
define_as ( + ($name:ident, $t:ident, $yt:ident) => ( +pub fn $name(&self) -> Option<$t> { + match *self { + Yaml::$yt(v) => Some(v), + _ => None + } +} + ); +); + +macro_rules! define_as_ref ( + ($name:ident, $t:ty, $yt:ident) => ( +pub fn $name(&self) -> Option<$t> { + match *self { + Yaml::$yt(ref v) => Some(v), + _ => None + } +} + ); +); + +macro_rules! define_into ( + ($name:ident, $t:ty, $yt:ident) => ( +pub fn $name(self) -> Option<$t> { + match self { + Yaml::$yt(v) => Some(v), + _ => None + } +} + ); +); + +impl Yaml { + define_as!(as_bool, bool, Boolean); + define_as!(as_i64, i64, Integer); + + define_as_ref!(as_str, &str, String); + define_as_ref!(as_hash, &Hash, Hash); + define_as_ref!(as_vec, &Array, Array); + + define_into!(into_bool, bool, Boolean); + define_into!(into_i64, i64, Integer); + define_into!(into_string, String, String); + define_into!(into_hash, Hash, Hash); + define_into!(into_vec, Array, Array); + + pub fn is_null(&self) -> bool { + match *self { + Yaml::Null => true, + _ => false, + } + } + + pub fn is_badvalue(&self) -> bool { + match *self { + Yaml::BadValue => true, + _ => false, + } + } + + pub fn is_array(&self) -> bool { + match *self { + Yaml::Array(_) => true, + _ => false, + } + } + + pub fn as_f64(&self) -> Option<f64> { + match *self { + Yaml::Real(ref v) => parse_f64(v), + _ => None, + } + } + + pub fn into_f64(self) -> Option<f64> { + match self { + Yaml::Real(ref v) => parse_f64(v), + _ => None, + } + } +} + +#[cfg_attr(feature = "cargo-clippy", allow(should_implement_trait))] +impl Yaml { + // Not implementing FromStr because there is no possibility of Error. + // This function falls back to Yaml::String if nothing else matches. 
    /// Convert a YAML plain scalar (an unquoted string) into the most
    /// specific `Yaml` value it can represent.
    ///
    /// Resolution order, exactly as implemented below:
    ///   1. `0x…` prefix  -> hexadecimal `Integer`; `0o…` prefix -> octal `Integer`;
    ///   2. leading `+` followed by a decimal integer -> `Integer`;
    ///   3. `~` / `null` -> `Null`; `true` / `false` -> `Boolean`;
    ///   4. anything that parses as `i64` -> `Integer`;
    ///   5. anything `parse_f64` accepts -> `Real` (the original text is kept);
    ///   6. everything else -> `String`.
    ///
    /// A prefix that fails to parse (e.g. `0xZZ`) is not an error: it falls
    /// through to the later cases and typically ends up as a `String`.
    pub fn from_str(v: &str) -> Yaml {
        if v.starts_with("0x") {
            if let Ok(i) = i64::from_str_radix(&v[2..], 16) {
                return Yaml::Integer(i);
            }
        }
        if v.starts_with("0o") {
            if let Ok(i) = i64::from_str_radix(&v[2..], 8) {
                return Yaml::Integer(i);
            }
        }
        if v.starts_with('+') {
            if let Ok(i) = v[1..].parse::<i64>() {
                return Yaml::Integer(i);
            }
        }
        match v {
            "~" | "null" => Yaml::Null,
            "true" => Yaml::Boolean(true),
            "false" => Yaml::Boolean(false),
            _ if v.parse::<i64>().is_ok() => Yaml::Integer(v.parse::<i64>().unwrap()),
            // try parsing as f64
            _ if parse_f64(v).is_some() => Yaml::Real(v.to_owned()),
            _ => Yaml::String(v.to_owned()),
        }
    }
}

// Shared sentinel returned by the `Index` impls below whenever a lookup
// cannot succeed; this lets indexing return `&Yaml` without panicking.
static BAD_VALUE: Yaml = Yaml::BadValue;

// String indexing: `doc["key"]` looks `key` up in a hash.
// Yields `Yaml::BadValue` when `self` is not a hash or the key is absent.
impl<'a> Index<&'a str> for Yaml {
    type Output = Yaml;

    fn index(&self, idx: &'a str) -> &Yaml {
        let key = Yaml::String(idx.to_owned());
        match self.as_hash() {
            Some(h) => h.get(&key).unwrap_or(&BAD_VALUE),
            None => &BAD_VALUE,
        }
    }
}

// Integer indexing: `doc[i]` is the i-th element of an array, or — when
// `self` is a hash — the value stored under the integer key `i`.
// Yields `Yaml::BadValue` for every other case (missing index/key, scalar).
impl Index<usize> for Yaml {
    type Output = Yaml;

    fn index(&self, idx: usize) -> &Yaml {
        if let Some(v) = self.as_vec() {
            v.get(idx).unwrap_or(&BAD_VALUE)
        } else if let Some(v) = self.as_hash() {
            let key = Yaml::Integer(idx as i64);
            v.get(&key).unwrap_or(&BAD_VALUE)
        } else {
            &BAD_VALUE
        }
    }
}

// Consuming iteration over the elements of a YAML array.
// A non-array value (`into_vec()` returns `None`) yields an empty iterator.
impl IntoIterator for Yaml {
    type Item = Yaml;
    type IntoIter = YamlIter;

    fn into_iter(self) -> Self::IntoIter {
        YamlIter {
            yaml: self.into_vec().unwrap_or_else(Vec::new).into_iter(),
        }
    }
}

/// Iterator returned by `Yaml::into_iter`; a thin wrapper around the
/// owning iterator of the underlying `Vec<Yaml>`.
pub struct YamlIter {
    yaml: vec::IntoIter<Yaml>,
}

impl Iterator for YamlIter {
    type Item = Yaml;

    fn next(&mut self) -> Option<Yaml> {
        self.yaml.next()
    }
}

// NOTE(review): the indentation inside the multi-line YAML string literals
// below is significant to what the tests assert; it follows the upstream
// yaml-rust test suite — confirm against upstream when updating this vendored
// copy.
#[cfg(test)]
mod test {
    use std::f64;
    use crate::yaml::*;

    // Scalars are coerced to typed accessors; missing keys index to BadValue.
    #[test]
    fn test_coerce() {
        let s = "---
a: 1
b: 2.2
c: [1, 2]
";
        let out = YamlLoader::load_from_str(&s).unwrap();
        let doc = &out[0];
        assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
        assert_eq!(doc["b"].as_f64().unwrap(), 2.2f64);
        assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
        assert!(doc["d"][0].is_badvalue());
    }

    // An empty input parses cleanly; a bare `---` yields a Null document.
    #[test]
    fn test_empty_doc() {
        let s: String = "".to_owned();
        YamlLoader::load_from_str(&s).unwrap();
        let s: String = "---".to_owned();
        assert_eq!(YamlLoader::load_from_str(&s).unwrap()[0], Yaml::Null);
    }

    // General parsing smoke test: comments, nested maps/arrays, quoting
    // styles, and non-ASCII (UTF-8) scalar values.
    #[test]
    fn test_parser() {
        let s: String = "
# comment
a0 bb: val
a1:
    b1: 4
    b2: d
a2: 4 # i'm comment
a3: [1, 2, 3]
a4:
    - - a1
      - a2
    - 2
a5: 'single_quoted'
a6: \"double_quoted\"
a7: 你好
"
        .to_owned();
        let out = YamlLoader::load_from_str(&s).unwrap();
        let doc = &out[0];
        assert_eq!(doc["a7"].as_str().unwrap(), "你好");
    }

    // `---` separators produce one Yaml per document in the returned Vec.
    #[test]
    fn test_multi_doc() {
        let s = "
'a scalar'
---
'a scalar'
---
'a scalar'
";
        let out = YamlLoader::load_from_str(&s).unwrap();
        assert_eq!(out.len(), 3);
    }

    // An alias (`*DEFAULT`) resolves to the value of its anchor (`&DEFAULT`).
    #[test]
    fn test_anchor() {
        let s = "
a1: &DEFAULT
    b1: 4
    b2: d
a2: *DEFAULT
";
        let out = YamlLoader::load_from_str(&s).unwrap();
        let doc = &out[0];
        assert_eq!(doc["a2"]["b1"].as_i64().unwrap(), 4);
    }

    // An alias referenced inside its own (still-open) anchor is a BadValue,
    // not an infinite loop or a panic.
    #[test]
    fn test_bad_anchor() {
        let s = "
a1: &DEFAULT
    b1: 4
    b2: *DEFAULT
";
        let out = YamlLoader::load_from_str(&s).unwrap();
        let doc = &out[0];
        assert_eq!(doc["a1"]["b2"], Yaml::BadValue);
    }

    #[test]
    fn test_github_27() {
        // https://github.com/chyh1990/yaml-rust/issues/27
        // An anchor with no value yields an empty-string scalar.
        let s = "&a";
        let out = YamlLoader::load_from_str(&s).unwrap();
        let doc = &out[0];
        assert_eq!(doc.as_str().unwrap(), "");
    }

    // Plain-scalar typing via borrowing accessors, including explicit `!!`
    // tags, radix prefixes, and mismatched tags that must become BadValue.
    #[test]
    fn test_plain_datatype() {
        let s = "
- 'string'
- \"string\"
- string
- 123
- -321
- 1.23
- -1e4
- ~
- null
- true
- false
- !!str 0
- !!int 100
- !!float 2
- !!null ~
- !!bool true
- !!bool false
- 0xFF
# bad values
- !!int string
- !!float string
- !!bool null
- !!null val
- 0o77
- [ 0xF, 0xF ]
- +12345
- [ true, false ]
";
        let out = YamlLoader::load_from_str(&s).unwrap();
        let doc = &out[0];

        assert_eq!(doc[0].as_str().unwrap(), "string");
        assert_eq!(doc[1].as_str().unwrap(), "string");
        assert_eq!(doc[2].as_str().unwrap(), "string");
        assert_eq!(doc[3].as_i64().unwrap(), 123);
        assert_eq!(doc[4].as_i64().unwrap(), -321);
        assert_eq!(doc[5].as_f64().unwrap(), 1.23);
        assert_eq!(doc[6].as_f64().unwrap(), -1e4);
        assert!(doc[7].is_null());
        assert!(doc[8].is_null());
        assert_eq!(doc[9].as_bool().unwrap(), true);
        assert_eq!(doc[10].as_bool().unwrap(), false);
        assert_eq!(doc[11].as_str().unwrap(), "0");
        assert_eq!(doc[12].as_i64().unwrap(), 100);
        assert_eq!(doc[13].as_f64().unwrap(), 2.0);
        assert!(doc[14].is_null());
        assert_eq!(doc[15].as_bool().unwrap(), true);
        assert_eq!(doc[16].as_bool().unwrap(), false);
        assert_eq!(doc[17].as_i64().unwrap(), 255);
        assert!(doc[18].is_badvalue());
        assert!(doc[19].is_badvalue());
        assert!(doc[20].is_badvalue());
        assert!(doc[21].is_badvalue());
        assert_eq!(doc[22].as_i64().unwrap(), 63);
        assert_eq!(doc[23][0].as_i64().unwrap(), 15);
        assert_eq!(doc[23][1].as_i64().unwrap(), 15);
        assert_eq!(doc[24].as_i64().unwrap(), 12345);
        assert!(doc[25][0].as_bool().unwrap());
        assert!(!doc[25][1].as_bool().unwrap());
    }

    #[test]
    fn test_bad_hyphen() {
        // See: https://github.com/chyh1990/yaml-rust/issues/23
        // An unterminated flow mapping with a stray hyphen must be an error.
        let s = "{-";
        assert!(YamlLoader::load_from_str(&s).is_err());
    }

    #[test]
    fn test_issue_65() {
        // See: https://github.com/chyh1990/yaml-rust/issues/65
        // Fuzzer-found input (bare CRs inside a quoted scalar) must error,
        // not hang or crash.
        let b = "\n\"ll\\\"ll\\\r\n\"ll\\\"ll\\\r\r\r\rU\r\r\rU";
        assert!(YamlLoader::load_from_str(&b).is_err());
    }

    // `---` immediately followed by content (or extra dashes) must not loop
    // forever; comments after a document marker are ignored.
    #[test]
    fn test_bad_docstart() {
        assert!(YamlLoader::load_from_str("---This used to cause an infinite loop").is_ok());
        assert_eq!(
            YamlLoader::load_from_str("----"),
            Ok(vec![Yaml::String(String::from("----"))])
        );
        assert_eq!(
            YamlLoader::load_from_str("--- #here goes a comment"),
            Ok(vec![Yaml::Null])
        );
        assert_eq!(
            YamlLoader::load_from_str("---- #here goes a comment"),
            Ok(vec![Yaml::String(String::from("----"))])
        );
    }

    // Same coverage as test_plain_datatype, but through the consuming
    // `into_*` accessors and the `IntoIterator` impl, plus infinity/NaN.
    #[test]
    fn test_plain_datatype_with_into_methods() {
        let s = "
- 'string'
- \"string\"
- string
- 123
- -321
- 1.23
- -1e4
- true
- false
- !!str 0
- !!int 100
- !!float 2
- !!bool true
- !!bool false
- 0xFF
- 0o77
- +12345
- -.INF
- .NAN
- !!float .INF
";
        let mut out = YamlLoader::load_from_str(&s).unwrap().into_iter();
        let mut doc = out.next().unwrap().into_iter();

        assert_eq!(doc.next().unwrap().into_string().unwrap(), "string");
        assert_eq!(doc.next().unwrap().into_string().unwrap(), "string");
        assert_eq!(doc.next().unwrap().into_string().unwrap(), "string");
        assert_eq!(doc.next().unwrap().into_i64().unwrap(), 123);
        assert_eq!(doc.next().unwrap().into_i64().unwrap(), -321);
        assert_eq!(doc.next().unwrap().into_f64().unwrap(), 1.23);
        assert_eq!(doc.next().unwrap().into_f64().unwrap(), -1e4);
        assert_eq!(doc.next().unwrap().into_bool().unwrap(), true);
        assert_eq!(doc.next().unwrap().into_bool().unwrap(), false);
        assert_eq!(doc.next().unwrap().into_string().unwrap(), "0");
        assert_eq!(doc.next().unwrap().into_i64().unwrap(), 100);
        assert_eq!(doc.next().unwrap().into_f64().unwrap(), 2.0);
        assert_eq!(doc.next().unwrap().into_bool().unwrap(), true);
        assert_eq!(doc.next().unwrap().into_bool().unwrap(), false);
        assert_eq!(doc.next().unwrap().into_i64().unwrap(), 255);
        assert_eq!(doc.next().unwrap().into_i64().unwrap(), 63);
        assert_eq!(doc.next().unwrap().into_i64().unwrap(), 12345);
        assert_eq!(doc.next().unwrap().into_f64().unwrap(), f64::NEG_INFINITY);
        // NaN != NaN, so only check that a float came back at all.
        assert!(doc.next().unwrap().into_f64().is_some());
        assert_eq!(doc.next().unwrap().into_f64().unwrap(), f64::INFINITY);
    }

    // `Hash` is a LinkedHashMap: iteration preserves insertion order (b, a, c).
    #[test]
    fn test_hash_order() {
        let s = "---
b: ~
a: ~
c: ~
";
        let out = YamlLoader::load_from_str(&s).unwrap();
        let first = out.into_iter().next().unwrap();
        let mut iter = first.into_hash().unwrap().into_iter();
        assert_eq!(
            Some((Yaml::String("b".to_owned()), Yaml::Null)),
            iter.next()
        );
        assert_eq!(
            Some((Yaml::String("a".to_owned()), Yaml::Null)),
            iter.next()
        );
        assert_eq!(
            Some((Yaml::String("c".to_owned()), Yaml::Null)),
            iter.next()
        );
        assert_eq!(None, iter.next());
    }

    // `doc[usize]` on a hash falls back to lookup by Yaml::Integer key
    // (see the Index<usize> impl above).
    #[test]
    fn test_integer_key() {
        let s = "
0:
    important: true
1:
    important: false
";
        let out = YamlLoader::load_from_str(&s).unwrap();
        let first = out.into_iter().next().unwrap();
        assert_eq!(first[0]["important"].as_bool().unwrap(), true);
    }

    // The same structure expressed with different indentation widths must
    // compare equal once parsed.
    #[test]
    fn test_indentation_equality() {
        let four_spaces = YamlLoader::load_from_str(
            r#"
hash:
    with:
        indentations
"#,
        )
        .unwrap()
        .into_iter()
        .next()
        .unwrap();

        let two_spaces = YamlLoader::load_from_str(
            r#"
hash:
  with:
    indentations
"#,
        )
        .unwrap()
        .into_iter()
        .next()
        .unwrap();

        let one_space = YamlLoader::load_from_str(
            r#"
hash:
 with:
  indentations
"#,
        )
        .unwrap()
        .into_iter()
        .next()
        .unwrap();

        let mixed_spaces = YamlLoader::load_from_str(
            r#"
hash:
     with:
               indentations
"#,
        )
        .unwrap()
        .into_iter()
        .next()
        .unwrap();

        assert_eq!(four_spaces, two_spaces);
        assert_eq!(two_spaces, one_space);
        assert_eq!(four_spaces, mixed_spaces);
    }

    #[test]
    fn test_two_space_indentations() {
        // https://github.com/kbknapp/clap-rs/issues/965
        // `about` aligned with `server` makes server's value Null; `about`
        // indented deeper makes it a key inside server's mapping.

        let s = r#"
subcommands:
  - server:
    about: server related commands
subcommands2:
  - server:
      about: server related commands
subcommands3:
 - server:
    about: server related commands
    "#;

        let out = YamlLoader::load_from_str(&s).unwrap();
        let doc = &out.into_iter().next().unwrap();

        println!("{:#?}", doc);
        assert_eq!(doc["subcommands"][0]["server"], Yaml::Null);
        assert!(doc["subcommands2"][0]["server"].as_hash().is_some());
        assert!(doc["subcommands3"][0]["server"].as_hash().is_some());
    }

    // Deeply nested input must hit the recursion-depth guard and return an
    // error instead of overflowing the stack.
    #[test]
    fn test_recursion_depth_check_objects() {
        let s = "{a:".repeat(10_000) + &"}".repeat(10_000);
        assert!(YamlLoader::load_from_str(&s).is_err());
    }

    #[test]
    fn test_recursion_depth_check_arrays() {
        let s = "[".repeat(10_000) + &"]".repeat(10_000);
        assert!(YamlLoader::load_from_str(&s).is_err());
    }
}