// Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! Streams of tendrils. use fmt; use tendril::{Atomicity, NonAtomic, Tendril}; use std::borrow::Cow; use std::fs::File; use std::io; use std::marker::PhantomData; use std::path::Path; #[cfg(feature = "encoding")] use encoding; #[cfg(feature = "encoding_rs")] use encoding_rs::{self, DecoderResult}; use utf8; /// Trait for types that can process a tendril. /// /// This is a "push" interface, unlike the "pull" interface of /// `Iterator>`. The push interface matches /// [html5ever][] and other incremental parsers with a similar /// architecture. /// /// [html5ever]: https://github.com/servo/html5ever pub trait TendrilSink where F: fmt::Format, A: Atomicity, { /// Process this tendril. fn process(&mut self, t: Tendril); /// Indicates that an error has occurred. fn error(&mut self, desc: Cow<'static, str>); /// What the overall result of processing is. type Output; /// Indicates the end of the stream. fn finish(self) -> Self::Output; /// Process one tendril and finish. fn one(mut self, t: T) -> Self::Output where Self: Sized, T: Into>, { self.process(t.into()); self.finish() } /// Consume an iterator of tendrils, processing each item, then finish. fn from_iter(mut self, i: I) -> Self::Output where Self: Sized, I: IntoIterator, I::Item: Into>, { for t in i { self.process(t.into()) } self.finish() } /// Read from the given stream of bytes until exhaustion and process incrementally, /// then finish. Return `Err` at the first I/O error. fn read_from(mut self, r: &mut R) -> io::Result where Self: Sized, R: io::Read, F: fmt::SliceFormat, { const BUFFER_SIZE: u32 = 4 * 1024; loop { let mut tendril = Tendril::::new(); // FIXME: this exposes uninitialized bytes to a generic R type // this is fine for R=File which never reads these bytes, // but user-defined types might. 
// The standard library pushes zeros to `Vec` for that reason. unsafe { tendril.push_uninitialized(BUFFER_SIZE); } loop { match r.read(&mut tendril) { Ok(0) => return Ok(self.finish()), Ok(n) => { tendril.pop_back(BUFFER_SIZE - n as u32); self.process(tendril); break; } Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} Err(e) => return Err(e), } } } } /// Read from the file at the given path and process incrementally, /// then finish. Return `Err` at the first I/O error. fn from_file

(self, path: P) -> io::Result where Self: Sized, P: AsRef, F: fmt::SliceFormat, { self.read_from(&mut File::open(path)?) } } /// A `TendrilSink` adaptor that takes bytes, decodes them as UTF-8, /// lossily replace ill-formed byte sequences with U+FFFD replacement characters, /// and emits Unicode (`StrTendril`). /// /// This does not allocate memory: the output is either subtendrils on the input, /// on inline tendrils for a single code point. pub struct Utf8LossyDecoder where Sink: TendrilSink, A: Atomicity, { pub inner_sink: Sink, incomplete: Option, marker: PhantomData, } impl Utf8LossyDecoder where Sink: TendrilSink, A: Atomicity, { /// Create a new incremental UTF-8 decoder. #[inline] pub fn new(inner_sink: Sink) -> Self { Utf8LossyDecoder { inner_sink: inner_sink, incomplete: None, marker: PhantomData, } } } impl TendrilSink for Utf8LossyDecoder where Sink: TendrilSink, A: Atomicity, { #[inline] fn process(&mut self, mut t: Tendril) { // FIXME: remove take() and map() when non-lexical borrows are stable. if let Some(mut incomplete) = self.incomplete.take() { let resume_at = incomplete.try_complete(&t).map(|(result, rest)| { match result { Ok(s) => self.inner_sink.process(Tendril::from_slice(s)), Err(_) => { self.inner_sink.error("invalid byte sequence".into()); self.inner_sink .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); } } t.len() - rest.len() }); match resume_at { None => { self.incomplete = Some(incomplete); return; } Some(resume_at) => t.pop_front(resume_at as u32), } } while !t.is_empty() { let unborrowed_result = match utf8::decode(&t) { Ok(s) => { debug_assert!(s.as_ptr() == t.as_ptr()); debug_assert!(s.len() == t.len()); Ok(()) } Err(utf8::DecodeError::Invalid { valid_prefix, invalid_sequence, .. 
}) => { debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); debug_assert!(valid_prefix.len() <= t.len()); Err(( valid_prefix.len(), Err(valid_prefix.len() + invalid_sequence.len()), )) } Err(utf8::DecodeError::Incomplete { valid_prefix, incomplete_suffix, }) => { debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); debug_assert!(valid_prefix.len() <= t.len()); Err((valid_prefix.len(), Ok(incomplete_suffix))) } }; match unborrowed_result { Ok(()) => { unsafe { self.inner_sink.process(t.reinterpret_without_validating()) } return; } Err((valid_len, and_then)) => { if valid_len > 0 { let subtendril = t.subtendril(0, valid_len as u32); unsafe { self.inner_sink .process(subtendril.reinterpret_without_validating()) } } match and_then { Ok(incomplete) => { self.incomplete = Some(incomplete); return; } Err(offset) => { self.inner_sink.error("invalid byte sequence".into()); self.inner_sink .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); t.pop_front(offset as u32); } } } } } } #[inline] fn error(&mut self, desc: Cow<'static, str>) { self.inner_sink.error(desc); } type Output = Sink::Output; #[inline] fn finish(mut self) -> Sink::Output { if self.incomplete.is_some() { self.inner_sink .error("incomplete byte sequence at end of stream".into()); self.inner_sink .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); } self.inner_sink.finish() } } /// A `TendrilSink` adaptor that takes bytes, decodes them as the given character encoding, /// lossily replace ill-formed byte sequences with U+FFFD replacement characters, /// and emits Unicode (`StrTendril`). /// /// This allocates new tendrils for encodings other than UTF-8. 
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
pub struct LossyDecoder<Sink, A = NonAtomic>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    inner: LossyDecoderInner<Sink, A>,
}

/// The decoding backend selected when a `LossyDecoder` is constructed.
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
enum LossyDecoderInner<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// UTF-8 input, handled by `Utf8LossyDecoder` (which owns the sink).
    Utf8(Utf8LossyDecoder<Sink, A>),
    /// A raw decoder from the `encoding` crate, plus the sink it feeds.
    #[cfg(feature = "encoding")]
    Encoding(Box<dyn encoding::RawDecoder>, Sink),
    /// A decoder from the `encoding_rs` crate, plus the sink it feeds.
    #[cfg(feature = "encoding_rs")]
    EncodingRs(encoding_rs::Decoder, Sink),
}

#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
impl<Sink, A> LossyDecoder<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// Create a new incremental decoder using the encoding crate.
    #[cfg(feature = "encoding")]
    #[inline]
    pub fn new(encoding: encoding::EncodingRef, sink: Sink) -> Self {
        if encoding.name() == "utf-8" {
            LossyDecoder::utf8(sink)
        } else {
            LossyDecoder {
                inner: LossyDecoderInner::Encoding(encoding.raw_decoder(), sink),
            }
        }
    }

    /// Create a new incremental decoder using the encoding_rs crate.
    #[cfg(feature = "encoding_rs")]
    #[inline]
    pub fn new_encoding_rs(encoding: &'static encoding_rs::Encoding, sink: Sink) -> Self {
        if encoding == encoding_rs::UTF_8 {
            return Self::utf8(sink);
        }
        Self {
            inner: LossyDecoderInner::EncodingRs(encoding.new_decoder(), sink),
        }
    }

    /// Create a new incremental decoder for the UTF-8 encoding.
    ///
    /// This is useful for content that is known at run-time to be UTF-8
    /// (whereas `Utf8LossyDecoder` requires knowing at compile-time.)
    #[inline]
    pub fn utf8(sink: Sink) -> LossyDecoder<Sink, A> {
        LossyDecoder {
            inner: LossyDecoderInner::Utf8(Utf8LossyDecoder::new(sink)),
        }
    }

    /// Give a reference to the inner sink.
    pub fn inner_sink(&self) -> &Sink {
        match self.inner {
            LossyDecoderInner::Utf8(ref utf8) => &utf8.inner_sink,
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(_, ref inner_sink) => inner_sink,
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(_, ref inner_sink) => inner_sink,
        }
    }

    /// Give a mutable reference to the inner sink.
    pub fn inner_sink_mut(&mut self) -> &mut Sink {
        match self.inner {
            LossyDecoderInner::Utf8(ref mut utf8) => &mut utf8.inner_sink,
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(_, ref mut inner_sink) => inner_sink,
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(_, ref mut inner_sink) => inner_sink,
        }
    }
}

#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
impl<Sink, A> TendrilSink<fmt::Bytes, A> for LossyDecoder<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// Decode one tendril of bytes with the selected backend and forward the
    /// resulting text to the inner sink.
    #[inline]
    fn process(&mut self, t: Tendril<fmt::Bytes, A>) {
        match self.inner {
            LossyDecoderInner::Utf8(ref mut utf8) => return utf8.process(t),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(ref mut decoder, ref mut sink) => {
                let mut out = Tendril::new();
                let mut t = t;
                loop {
                    match decoder.raw_feed(&*t, &mut out) {
                        (_, Some(err)) => {
                            out.push_char('\u{fffd}');
                            sink.error(err.cause);
                            debug_assert!(err.upto >= 0);
                            t.pop_front(err.upto as u32);
                            // continue loop and process remainder of t
                        }
                        (_, None) => break,
                    }
                }
                // Never hand an empty tendril to the sink.
                if out.len() > 0 {
                    sink.process(out);
                }
            }
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(ref mut decoder, ref mut sink) => {
                if t.is_empty() {
                    return;
                }
                decode_to_sink(t, decoder, sink, false);
            }
        }
    }

    /// Forward an error description to the inner sink unchanged.
    #[inline]
    fn error(&mut self, desc: Cow<'static, str>) {
        match self.inner {
            LossyDecoderInner::Utf8(ref mut utf8) => utf8.error(desc),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(_, ref mut sink) => sink.error(desc),
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(_, ref mut sink) => sink.error(desc),
        }
    }

    type Output = Sink::Output;

    /// Flush whatever the backend still holds, then finish the inner sink.
    #[inline]
    fn finish(self) -> Sink::Output {
        match self.inner {
            LossyDecoderInner::Utf8(utf8) => return utf8.finish(),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(mut decoder, mut sink) => {
                let mut out = Tendril::new();
                if let Some(err) = decoder.raw_finish(&mut out) {
                    out.push_char('\u{fffd}');
                    sink.error(err.cause);
                }
                if out.len() > 0 {
                    sink.process(out);
                }
                sink.finish()
            }
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(mut decoder, mut sink) => {
                // An empty final chunk with `last = true` signals end of input.
                decode_to_sink(Tendril::new(), &mut decoder, &mut sink, true);
                sink.finish()
            }
        }
    }
}

/// Feed `t` through an `encoding_rs` decoder, pushing decoded text to `sink`.
///
/// Output is produced in chunks of at most 8192 bytes. Each malformed sequence
/// is reported via `sink.error` and replaced by U+FFFD. `last` is passed
/// through to the decoder to mark the final chunk of the stream.
#[cfg(feature = "encoding_rs")]
fn decode_to_sink<Sink, A>(
    mut t: Tendril<fmt::Bytes, A>,
    decoder: &mut encoding_rs::Decoder,
    sink: &mut Sink,
    last: bool,
) where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// Upper bound on the size of one output buffer, in bytes.
    const MAX_CHUNK: usize = 8192;

    loop {
        let mut out = <Tendril<fmt::Bytes, A>>::new();
        let max_len = decoder
            .max_utf8_buffer_length_without_replacement(t.len())
            .unwrap_or(MAX_CHUNK);
        // Clamp in `usize` *before* narrowing: casting first could wrap a huge
        // `max_len` to a tiny (or zero) buffer and stall the loop.
        let buffer_len = std::cmp::min(max_len, MAX_CHUNK) as u32;
        // NOTE(review): this relies on the decoder only writing to `out` and on
        // `bytes_written` covering exactly the initialized prefix — confirm
        // against the encoding_rs documentation.
        unsafe {
            out.push_uninitialized(buffer_len);
        }

        let (result, bytes_read, bytes_written) =
            decoder.decode_to_utf8_without_replacement(&t, &mut out, last);
        if bytes_written > 0 {
            // Only the prefix the decoder reported as written is forwarded.
            sink.process(unsafe {
                out.subtendril(0, bytes_written as u32)
                    .reinterpret_without_validating()
            });
        }
        match result {
            DecoderResult::InputEmpty => return,
            // Go around again with a fresh output buffer.
            DecoderResult::OutputFull => {}
            DecoderResult::Malformed(_, _) => {
                sink.error(Cow::Borrowed("invalid sequence"));
                sink.process("\u{FFFD}".into());
            }
        }
        t.pop_front(bytes_read as u32);
        if t.is_empty() {
            return;
        }
    }
}

#[cfg(test)]
mod test {
    use super::{TendrilSink, Utf8LossyDecoder};
    use fmt;
    use std::borrow::Cow;
    use tendril::{Atomicity, NonAtomic, Tendril};

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    use super::LossyDecoder;
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    use tendril::SliceExt;

    #[cfg(feature = "encoding")]
    use encoding::all as enc;
    #[cfg(feature = "encoding_rs")]
    use encoding_rs as enc_rs;

    /// Test sink that records every tendril and every error it receives.
    struct Accumulate<A>
    where
        A: Atomicity,
    {
        tendrils: Vec<Tendril<fmt::UTF8, A>>,
        errors: Vec<String>,
    }

    impl<A> Accumulate<A>
    where
        A: Atomicity,
    {
        fn new() -> Accumulate<A> {
            Accumulate {
                tendrils: vec![],
                errors: vec![],
            }
        }
    }

    impl<A> TendrilSink<fmt::UTF8, A> for Accumulate<A>
    where
        A: Atomicity,
    {
        fn process(&mut self, t: Tendril<fmt::UTF8, A>) {
            self.tendrils.push(t);
        }

        fn error(&mut self, desc: Cow<'static, str>) {
            self.errors.push(desc.into_owned());
        }

        type Output = (Vec<Tendril<fmt::UTF8, A>>, Vec<String>);

        fn finish(self) -> Self::Output {
            (self.tendrils, self.errors)
        }
    }

    /// Decode `input` chunk by chunk and compare the exact sequence of emitted
    /// tendrils and the number of reported errors.
    fn check_utf8(input: &[&[u8]], expected: &[&str], errs: usize) {
        let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new());
        let (tendrils, errors) = decoder.from_iter(input.iter().cloned());
        assert_eq!(
            expected,
            &*tendrils.iter().map(|t| &**t).collect::<Vec<&str>>()
        );
        assert_eq!(errs, errors.len());
    }

    #[test]
    fn utf8() {
        check_utf8(&[], &[], 0);
        check_utf8(&[b""], &[], 0);
        check_utf8(&[b"xyz"], &["xyz"], 0);
        check_utf8(&[b"x", b"y", b"z"], &["x", "y", "z"], 0);

        check_utf8(&[b"xy\xEA\x99\xAEzw"], &["xy\u{a66e}zw"], 0);
        check_utf8(&[b"xy\xEA", b"\x99\xAEzw"], &["xy", "\u{a66e}z", "w"], 0);
        check_utf8(&[b"xy\xEA\x99", b"\xAEzw"], &["xy", "\u{a66e}z", "w"], 0);
        check_utf8(
            &[b"xy\xEA", b"\x99", b"\xAEzw"],
            &["xy", "\u{a66e}z", "w"],
            0,
        );
        check_utf8(&[b"\xEA", b"", b"\x99", b"", b"\xAE"], &["\u{a66e}"], 0);
        check_utf8(
            &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""],
            &["\u{a66e}"],
            0,
        );

        check_utf8(
            &[b"xy\xEA", b"\xFF", b"\x99\xAEz"],
            &["xy", "\u{fffd}", "\u{fffd}", "\u{fffd}", "\u{fffd}", "z"],
            4,
        );
        check_utf8(
            &[b"xy\xEA\x99", b"\xFFz"],
            &["xy", "\u{fffd}", "\u{fffd}", "z"],
            2,
        );

        check_utf8(&[b"\xC5\x91\xC5\x91\xC5\x91"], &["őőő"], 0);
        check_utf8(
            &[b"\xC5\x91", b"\xC5\x91", b"\xC5\x91"],
            &["ő", "ő", "ő"],
            0,
        );
        check_utf8(
            &[b"\xC5", b"\x91\xC5", b"\x91\xC5", b"\x91"],
            &["ő", "ő", "ő"],
            0,
        );
        check_utf8(
            &[b"\xC5", b"\x91\xff", b"\x91\xC5", b"\x91"],
            &["ő", "\u{fffd}", "\u{fffd}", "ő"],
            2,
        );

        // incomplete char at end of input
        check_utf8(&[b"\xC0"], &["\u{fffd}"], 1);
        check_utf8(&[b"\xEA\x99"], &["\u{fffd}"], 1);
    }

    /// Feed `input` chunk by chunk, then compare the concatenated output and
    /// the number of reported errors.
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    fn check_decode(
        mut decoder: LossyDecoder<Accumulate<NonAtomic>>,
        input: &[&[u8]],
        expected: &str,
        errs: usize,
    ) {
        for x in input {
            decoder.process(x.to_tendril());
        }
        let (tendrils, errors) = decoder.finish();
        let mut tendril: Tendril<fmt::UTF8> = Tendril::new();
        for t in tendrils {
            tendril.push_tendril(&t);
        }
        assert_eq!(expected, &*tendril);
        assert_eq!(errs, errors.len());
    }

    /// Table of (input chunks, expected decoded text, expected error count).
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    pub type Tests = &'static [(&'static [&'static [u8]], &'static str, usize)];

    #[cfg(feature = "encoding")]
    const ASCII: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"xyz"], "xyz", 0),
        (&[b"xy", b"", b"", b"z"], "xyz", 0),
        (&[b"x", b"y", b"z"], "xyz", 0),
        (&[b"\xFF"], "\u{fffd}", 1),
        (&[b"x\xC0yz"], "x\u{fffd}yz", 1),
        (&[b"x", b"\xC0y", b"z"], "x\u{fffd}yz", 1),
        (&[b"x\xC0yz\xFF\xFFw"], "x\u{fffd}yz\u{fffd}\u{fffd}w", 3),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_ascii() {
        for &(input, expected, errs) in ASCII {
            let decoder = LossyDecoder::new(enc::ASCII, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const UTF_8: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"xyz"], "xyz", 0),
        (&[b"x", b"y", b"z"], "xyz", 0),
        (&[b"\xEA\x99\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"\x99\xAE"], "\u{a66e}", 0),
        (&[b"\xEA\x99", b"\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"\x99", b"\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"", b"\x99", b"", b"\xAE"], "\u{a66e}", 0),
        (
            &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""],
            "\u{a66e}",
            0,
        ),
        (&[b"xy\xEA", b"\x99\xAEz"], "xy\u{a66e}z", 0),
        (
            &[b"xy\xEA", b"\xFF", b"\x99\xAEz"],
            "xy\u{fffd}\u{fffd}\u{fffd}\u{fffd}z",
            4,
        ),
        (&[b"xy\xEA\x99", b"\xFFz"], "xy\u{fffd}\u{fffd}z", 2),
        // incomplete char at end of input
        (&[b"\xC0"], "\u{fffd}", 1),
        (&[b"\xEA\x99"], "\u{fffd}", 1),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_utf8() {
        for &(input, expected, errs) in UTF_8 {
            let decoder = LossyDecoder::new(enc::UTF_8, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_utf8_encoding_rs() {
        for &(input, expected, errs) in UTF_8 {
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::UTF_8, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const KOI8_U: Tests = &[
        (&[b"\xfc\xce\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0),
        (&[b"\xfc\xce", b"\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0),
        (&[b"\xfc\xce", b"\xc5\xd2\xc7", b"\xc9\xd1"], "Энергия", 0),
        (
            &[b"\xfc\xce", b"", b"\xc5\xd2\xc7", b"\xc9\xd1", b""],
            "Энергия",
            0,
        ),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_koi8_u() {
        for &(input, expected, errs) in KOI8_U {
            let decoder = LossyDecoder::new(enc::KOI8_U, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_koi8_u_encoding_rs() {
        for &(input, expected, errs) in KOI8_U {
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::KOI8_U, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const WINDOWS_949: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"\xbe\xc8\xb3\xe7"], "안녕", 0),
        (&[b"\xbe", b"\xc8\xb3\xe7"], "안녕", 0),
        (&[b"\xbe", b"", b"\xc8\xb3\xe7"], "안녕", 0),
        (
            &[b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"],
            "안녕하세요",
            0,
        ),
        (&[b"\xbe\xc8\xb3\xe7\xc7"], "안녕\u{fffd}", 1),
        (&[b"\xbe", b"", b"\xc8\xb3"], "안\u{fffd}", 1),
        (&[b"\xbe\x28\xb3\xe7"], "\u{fffd}(녕", 1),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_windows_949() {
        for &(input, expected, errs) in WINDOWS_949 {
            let decoder = LossyDecoder::new(enc::WINDOWS_949, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_windows_949_encoding_rs() {
        for &(input, expected, errs) in WINDOWS_949 {
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::EUC_KR, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[test]
    fn read_from() {
        let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new());
        let mut bytes: &[u8] = b"foo\xffbar";
        let (tendrils, errors) = decoder.read_from(&mut bytes).unwrap();
        assert_eq!(
            &*tendrils.iter().map(|t| &**t).collect::<Vec<&str>>(),
            &["foo", "\u{FFFD}", "bar"]
        );
        assert_eq!(errors, &["invalid byte sequence"]);
    }
}