diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
commit | 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch) | |
tree | 173a775858bd501c378080a10dca74132f05bc50 /vendor/tendril/src/stream.rs | |
parent | Initial commit. (diff) | |
download | rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip |
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/tendril/src/stream.rs')
-rw-r--r-- | vendor/tendril/src/stream.rs | 752 |
1 files changed, 752 insertions, 0 deletions
diff --git a/vendor/tendril/src/stream.rs b/vendor/tendril/src/stream.rs new file mode 100644 index 000000000..469d58c9b --- /dev/null +++ b/vendor/tendril/src/stream.rs @@ -0,0 +1,752 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Streams of tendrils. + +use fmt; +use tendril::{Atomicity, NonAtomic, Tendril}; + +use std::borrow::Cow; +use std::fs::File; +use std::io; +use std::marker::PhantomData; +use std::path::Path; + +#[cfg(feature = "encoding")] +use encoding; +#[cfg(feature = "encoding_rs")] +use encoding_rs::{self, DecoderResult}; +use utf8; + +/// Trait for types that can process a tendril. +/// +/// This is a "push" interface, unlike the "pull" interface of +/// `Iterator<Item=Tendril<F>>`. The push interface matches +/// [html5ever][] and other incremental parsers with a similar +/// architecture. +/// +/// [html5ever]: https://github.com/servo/html5ever +pub trait TendrilSink<F, A = NonAtomic> +where + F: fmt::Format, + A: Atomicity, +{ + /// Process this tendril. + fn process(&mut self, t: Tendril<F, A>); + + /// Indicates that an error has occurred. + fn error(&mut self, desc: Cow<'static, str>); + + /// What the overall result of processing is. + type Output; + + /// Indicates the end of the stream. + fn finish(self) -> Self::Output; + + /// Process one tendril and finish. + fn one<T>(mut self, t: T) -> Self::Output + where + Self: Sized, + T: Into<Tendril<F, A>>, + { + self.process(t.into()); + self.finish() + } + + /// Consume an iterator of tendrils, processing each item, then finish. + fn from_iter<I>(mut self, i: I) -> Self::Output + where + Self: Sized, + I: IntoIterator, + I::Item: Into<Tendril<F, A>>, + { + for t in i { + self.process(t.into()) + } + self.finish() + } + + /// Read from the given stream of bytes until exhaustion and process incrementally, + /// then finish. Return `Err` at the first I/O error. + fn read_from<R>(mut self, r: &mut R) -> io::Result<Self::Output> + where + Self: Sized, + R: io::Read, + F: fmt::SliceFormat<Slice = [u8]>, + { + const BUFFER_SIZE: u32 = 4 * 1024; + loop { + let mut tendril = Tendril::<F, A>::new(); + // FIXME: this exposes uninitialized bytes to a generic R type + // this is fine for R=File which never reads these bytes, + // but user-defined types might. + // The standard library pushes zeros to `Vec<u8>` for that reason. + unsafe { + tendril.push_uninitialized(BUFFER_SIZE); + } + loop { + match r.read(&mut tendril) { + Ok(0) => return Ok(self.finish()), + Ok(n) => { + tendril.pop_back(BUFFER_SIZE - n as u32); + self.process(tendril); + break; + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} + Err(e) => return Err(e), + } + } + } + } + + /// Read from the file at the given path and process incrementally, + /// then finish. Return `Err` at the first I/O error. + fn from_file<P>(self, path: P) -> io::Result<Self::Output> + where + Self: Sized, + P: AsRef<Path>, + F: fmt::SliceFormat<Slice = [u8]>, + { + self.read_from(&mut File::open(path)?) + } +} + +/// A `TendrilSink` adaptor that takes bytes, decodes them as UTF-8, +/// lossily replace ill-formed byte sequences with U+FFFD replacement characters, +/// and emits Unicode (`StrTendril`). +/// +/// This does not allocate memory: the output is either subtendrils on the input, +/// on inline tendrils for a single code point. +pub struct Utf8LossyDecoder<Sink, A = NonAtomic> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + pub inner_sink: Sink, + incomplete: Option<utf8::Incomplete>, + marker: PhantomData<A>, +} + +impl<Sink, A> Utf8LossyDecoder<Sink, A> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + /// Create a new incremental UTF-8 decoder. + #[inline] + pub fn new(inner_sink: Sink) -> Self { + Utf8LossyDecoder { + inner_sink: inner_sink, + incomplete: None, + marker: PhantomData, + } + } +} + +impl<Sink, A> TendrilSink<fmt::Bytes, A> for Utf8LossyDecoder<Sink, A> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + #[inline] + fn process(&mut self, mut t: Tendril<fmt::Bytes, A>) { + // FIXME: remove take() and map() when non-lexical borrows are stable. + if let Some(mut incomplete) = self.incomplete.take() { + let resume_at = incomplete.try_complete(&t).map(|(result, rest)| { + match result { + Ok(s) => self.inner_sink.process(Tendril::from_slice(s)), + Err(_) => { + self.inner_sink.error("invalid byte sequence".into()); + self.inner_sink + .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); + } + } + t.len() - rest.len() + }); + match resume_at { + None => { + self.incomplete = Some(incomplete); + return; + } + Some(resume_at) => t.pop_front(resume_at as u32), + } + } + while !t.is_empty() { + let unborrowed_result = match utf8::decode(&t) { + Ok(s) => { + debug_assert!(s.as_ptr() == t.as_ptr()); + debug_assert!(s.len() == t.len()); + Ok(()) + } + Err(utf8::DecodeError::Invalid { + valid_prefix, + invalid_sequence, + .. + }) => { + debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); + debug_assert!(valid_prefix.len() <= t.len()); + Err(( + valid_prefix.len(), + Err(valid_prefix.len() + invalid_sequence.len()), + )) + } + Err(utf8::DecodeError::Incomplete { + valid_prefix, + incomplete_suffix, + }) => { + debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); + debug_assert!(valid_prefix.len() <= t.len()); + Err((valid_prefix.len(), Ok(incomplete_suffix))) + } + }; + match unborrowed_result { + Ok(()) => { + unsafe { self.inner_sink.process(t.reinterpret_without_validating()) } + return; + } + Err((valid_len, and_then)) => { + if valid_len > 0 { + let subtendril = t.subtendril(0, valid_len as u32); + unsafe { + self.inner_sink + .process(subtendril.reinterpret_without_validating()) + } + } + match and_then { + Ok(incomplete) => { + self.incomplete = Some(incomplete); + return; + } + Err(offset) => { + self.inner_sink.error("invalid byte sequence".into()); + self.inner_sink + .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); + t.pop_front(offset as u32); + } + } + } + } + } + } + + #[inline] + fn error(&mut self, desc: Cow<'static, str>) { + self.inner_sink.error(desc); + } + + type Output = Sink::Output; + + #[inline] + fn finish(mut self) -> Sink::Output { + if self.incomplete.is_some() { + self.inner_sink + .error("incomplete byte sequence at end of stream".into()); + self.inner_sink + .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); + } + self.inner_sink.finish() + } +} + +/// A `TendrilSink` adaptor that takes bytes, decodes them as the given character encoding, +/// lossily replace ill-formed byte sequences with U+FFFD replacement characters, +/// and emits Unicode (`StrTendril`). +/// +/// This allocates new tendrils for encodings other than UTF-8. +#[cfg(any(feature = "encoding", feature = "encoding_rs"))] +pub struct LossyDecoder<Sink, A = NonAtomic> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + inner: LossyDecoderInner<Sink, A>, +} + +#[cfg(any(feature = "encoding", feature = "encoding_rs"))] +enum LossyDecoderInner<Sink, A> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + Utf8(Utf8LossyDecoder<Sink, A>), + #[cfg(feature = "encoding")] + Encoding(Box<encoding::RawDecoder>, Sink), + #[cfg(feature = "encoding_rs")] + EncodingRs(encoding_rs::Decoder, Sink), +} + +#[cfg(any(feature = "encoding", feature = "encoding_rs"))] +impl<Sink, A> LossyDecoder<Sink, A> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + /// Create a new incremental decoder using the encoding crate. + #[cfg(feature = "encoding")] + #[inline] + pub fn new(encoding: encoding::EncodingRef, sink: Sink) -> Self { + if encoding.name() == "utf-8" { + LossyDecoder::utf8(sink) + } else { + LossyDecoder { + inner: LossyDecoderInner::Encoding(encoding.raw_decoder(), sink), + } + } + } + + /// Create a new incremental decoder using the encoding_rs crate. + #[cfg(feature = "encoding_rs")] + #[inline] + pub fn new_encoding_rs(encoding: &'static encoding_rs::Encoding, sink: Sink) -> Self { + if encoding == encoding_rs::UTF_8 { + return Self::utf8(sink); + } + Self { + inner: LossyDecoderInner::EncodingRs(encoding.new_decoder(), sink), + } + } + + /// Create a new incremental decoder for the UTF-8 encoding. + /// + /// This is useful for content that is known at run-time to be UTF-8 + /// (whereas `Utf8LossyDecoder` requires knowning at compile-time.) + #[inline] + pub fn utf8(sink: Sink) -> LossyDecoder<Sink, A> { + LossyDecoder { + inner: LossyDecoderInner::Utf8(Utf8LossyDecoder::new(sink)), + } + } + + /// Give a reference to the inner sink. + pub fn inner_sink(&self) -> &Sink { + match self.inner { + LossyDecoderInner::Utf8(ref utf8) => &utf8.inner_sink, + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(_, ref inner_sink) => inner_sink, + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(_, ref inner_sink) => inner_sink, + } + } + + /// Give a mutable reference to the inner sink. + pub fn inner_sink_mut(&mut self) -> &mut Sink { + match self.inner { + LossyDecoderInner::Utf8(ref mut utf8) => &mut utf8.inner_sink, + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(_, ref mut inner_sink) => inner_sink, + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(_, ref mut inner_sink) => inner_sink, + } + } +} + +#[cfg(any(feature = "encoding", feature = "encoding_rs"))] +impl<Sink, A> TendrilSink<fmt::Bytes, A> for LossyDecoder<Sink, A> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + #[inline] + fn process(&mut self, t: Tendril<fmt::Bytes, A>) { + match self.inner { + LossyDecoderInner::Utf8(ref mut utf8) => return utf8.process(t), + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(ref mut decoder, ref mut sink) => { + let mut out = Tendril::new(); + let mut t = t; + loop { + match decoder.raw_feed(&*t, &mut out) { + (_, Some(err)) => { + out.push_char('\u{fffd}'); + sink.error(err.cause); + debug_assert!(err.upto >= 0); + t.pop_front(err.upto as u32); + // continue loop and process remainder of t + } + (_, None) => break, + } + } + if out.len() > 0 { + sink.process(out); + } + } + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(ref mut decoder, ref mut sink) => { + if t.is_empty() { + return; + } + decode_to_sink(t, decoder, sink, false); + } + } + } + + #[inline] + fn error(&mut self, desc: Cow<'static, str>) { + match self.inner { + LossyDecoderInner::Utf8(ref mut utf8) => utf8.error(desc), + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(_, ref mut sink) => sink.error(desc), + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(_, ref mut sink) => sink.error(desc), + } + } + + type Output = Sink::Output; + + #[inline] + fn finish(self) -> Sink::Output { + match self.inner { + LossyDecoderInner::Utf8(utf8) => return utf8.finish(), + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(mut decoder, mut sink) => { + let mut out = Tendril::new(); + if let Some(err) = decoder.raw_finish(&mut out) { + out.push_char('\u{fffd}'); + sink.error(err.cause); + } + if out.len() > 0 { + sink.process(out); + } + sink.finish() + } + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(mut decoder, mut sink) => { + decode_to_sink(Tendril::new(), &mut decoder, &mut sink, true); + sink.finish() + } + } + } +} + +#[cfg(feature = "encoding_rs")] +fn decode_to_sink<Sink, A>( + mut t: Tendril<fmt::Bytes, A>, + decoder: &mut encoding_rs::Decoder, + sink: &mut Sink, + last: bool, +) where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + loop { + let mut out = <Tendril<fmt::Bytes, A>>::new(); + let max_len = decoder + .max_utf8_buffer_length_without_replacement(t.len()) + .unwrap_or(8192); + unsafe { + out.push_uninitialized(std::cmp::min(max_len as u32, 8192)); + } + let (result, bytes_read, bytes_written) = + decoder.decode_to_utf8_without_replacement(&t, &mut out, last); + if bytes_written > 0 { + sink.process(unsafe { + out.subtendril(0, bytes_written as u32) + .reinterpret_without_validating() + }); + } + match result { + DecoderResult::InputEmpty => return, + DecoderResult::OutputFull => {} + DecoderResult::Malformed(_, _) => { + sink.error(Cow::Borrowed("invalid sequence")); + sink.process("\u{FFFD}".into()); + } + } + t.pop_front(bytes_read as u32); + if t.is_empty() { + return; + } + } +} + +#[cfg(test)] +mod test { + use super::{TendrilSink, Utf8LossyDecoder}; + use fmt; + use std::borrow::Cow; + use tendril::{Atomicity, NonAtomic, Tendril}; + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + use super::LossyDecoder; + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + use tendril::SliceExt; + + #[cfg(feature = "encoding")] + use encoding::all as enc; + #[cfg(feature = "encoding_rs")] + use encoding_rs as enc_rs; + + struct Accumulate<A> + where + A: Atomicity, + { + tendrils: Vec<Tendril<fmt::UTF8, A>>, + errors: Vec<String>, + } + + impl<A> Accumulate<A> + where + A: Atomicity, + { + fn new() -> Accumulate<A> { + Accumulate { + tendrils: vec![], + errors: vec![], + } + } + } + + impl<A> TendrilSink<fmt::UTF8, A> for Accumulate<A> + where + A: Atomicity, + { + fn process(&mut self, t: Tendril<fmt::UTF8, A>) { + self.tendrils.push(t); + } + + fn error(&mut self, desc: Cow<'static, str>) { + self.errors.push(desc.into_owned()); + } + + type Output = (Vec<Tendril<fmt::UTF8, A>>, Vec<String>); + + fn finish(self) -> Self::Output { + (self.tendrils, self.errors) + } + } + + fn check_utf8(input: &[&[u8]], expected: &[&str], errs: usize) { + let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new()); + let (tendrils, errors) = decoder.from_iter(input.iter().cloned()); + assert_eq!( + expected, + &*tendrils.iter().map(|t| &**t).collect::<Vec<_>>() + ); + assert_eq!(errs, errors.len()); + } + + #[test] + fn utf8() { + check_utf8(&[], &[], 0); + check_utf8(&[b""], &[], 0); + check_utf8(&[b"xyz"], &["xyz"], 0); + check_utf8(&[b"x", b"y", b"z"], &["x", "y", "z"], 0); + + check_utf8(&[b"xy\xEA\x99\xAEzw"], &["xy\u{a66e}zw"], 0); + check_utf8(&[b"xy\xEA", b"\x99\xAEzw"], &["xy", "\u{a66e}z", "w"], 0); + check_utf8(&[b"xy\xEA\x99", b"\xAEzw"], &["xy", "\u{a66e}z", "w"], 0); + check_utf8( + &[b"xy\xEA", b"\x99", b"\xAEzw"], + &["xy", "\u{a66e}z", "w"], + 0, + ); + check_utf8(&[b"\xEA", b"", b"\x99", b"", b"\xAE"], &["\u{a66e}"], 0); + check_utf8( + &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""], + &["\u{a66e}"], + 0, + ); + + check_utf8( + &[b"xy\xEA", b"\xFF", b"\x99\xAEz"], + &["xy", "\u{fffd}", "\u{fffd}", "\u{fffd}", "\u{fffd}", "z"], + 4, + ); + check_utf8( + &[b"xy\xEA\x99", b"\xFFz"], + &["xy", "\u{fffd}", "\u{fffd}", "z"], + 2, + ); + + check_utf8(&[b"\xC5\x91\xC5\x91\xC5\x91"], &["őőő"], 0); + check_utf8( + &[b"\xC5\x91", b"\xC5\x91", b"\xC5\x91"], + &["ő", "ő", "ő"], + 0, + ); + check_utf8( + &[b"\xC5", b"\x91\xC5", b"\x91\xC5", b"\x91"], + &["ő", "ő", "ő"], + 0, + ); + check_utf8( + &[b"\xC5", b"\x91\xff", b"\x91\xC5", b"\x91"], + &["ő", "\u{fffd}", "\u{fffd}", "ő"], + 2, + ); + + // incomplete char at end of input + check_utf8(&[b"\xC0"], &["\u{fffd}"], 1); + check_utf8(&[b"\xEA\x99"], &["\u{fffd}"], 1); + } + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + fn check_decode( + mut decoder: LossyDecoder<Accumulate<NonAtomic>>, + input: &[&[u8]], + expected: &str, + errs: usize, + ) { + for x in input { + decoder.process(x.to_tendril()); + } + let (tendrils, errors) = decoder.finish(); + let mut tendril: Tendril<fmt::UTF8> = Tendril::new(); + for t in tendrils { + tendril.push_tendril(&t); + } + assert_eq!(expected, &*tendril); + assert_eq!(errs, errors.len()); + } + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + pub type Tests = &'static [(&'static [&'static [u8]], &'static str, usize)]; + + #[cfg(any(feature = "encoding"))] + const ASCII: Tests = &[ + (&[], "", 0), + (&[b""], "", 0), + (&[b"xyz"], "xyz", 0), + (&[b"xy", b"", b"", b"z"], "xyz", 0), + (&[b"x", b"y", b"z"], "xyz", 0), + (&[b"\xFF"], "\u{fffd}", 1), + (&[b"x\xC0yz"], "x\u{fffd}yz", 1), + (&[b"x", b"\xC0y", b"z"], "x\u{fffd}yz", 1), + (&[b"x\xC0yz\xFF\xFFw"], "x\u{fffd}yz\u{fffd}\u{fffd}w", 3), + ]; + + #[cfg(feature = "encoding")] + #[test] + fn decode_ascii() { + for &(input, expected, errs) in ASCII { + let decoder = LossyDecoder::new(enc::ASCII, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + const UTF_8: Tests = &[ + (&[], "", 0), + (&[b""], "", 0), + (&[b"xyz"], "xyz", 0), + (&[b"x", b"y", b"z"], "xyz", 0), + (&[b"\xEA\x99\xAE"], "\u{a66e}", 0), + (&[b"\xEA", b"\x99\xAE"], "\u{a66e}", 0), + (&[b"\xEA\x99", b"\xAE"], "\u{a66e}", 0), + (&[b"\xEA", b"\x99", b"\xAE"], "\u{a66e}", 0), + (&[b"\xEA", b"", b"\x99", b"", b"\xAE"], "\u{a66e}", 0), + ( + &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""], + "\u{a66e}", + 0, + ), + (&[b"xy\xEA", b"\x99\xAEz"], "xy\u{a66e}z", 0), + ( + &[b"xy\xEA", b"\xFF", b"\x99\xAEz"], + "xy\u{fffd}\u{fffd}\u{fffd}\u{fffd}z", + 4, + ), + (&[b"xy\xEA\x99", b"\xFFz"], "xy\u{fffd}\u{fffd}z", 2), + // incomplete char at end of input + (&[b"\xC0"], "\u{fffd}", 1), + (&[b"\xEA\x99"], "\u{fffd}", 1), + ]; + + #[cfg(feature = "encoding")] + #[test] + fn decode_utf8() { + for &(input, expected, errs) in UTF_8 { + let decoder = LossyDecoder::new(enc::UTF_8, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(feature = "encoding_rs")] + #[test] + fn decode_utf8_encoding_rs() { + for &(input, expected, errs) in UTF_8 { + let decoder = LossyDecoder::new_encoding_rs(enc_rs::UTF_8, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + const KOI8_U: Tests = &[ + (&[b"\xfc\xce\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0), + (&[b"\xfc\xce", b"\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0), + (&[b"\xfc\xce", b"\xc5\xd2\xc7", b"\xc9\xd1"], "Энергия", 0), + ( + &[b"\xfc\xce", b"", b"\xc5\xd2\xc7", b"\xc9\xd1", b""], + "Энергия", + 0, + ), + ]; + + #[cfg(feature = "encoding")] + #[test] + fn decode_koi8_u() { + for &(input, expected, errs) in KOI8_U { + let decoder = LossyDecoder::new(enc::KOI8_U, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(feature = "encoding_rs")] + #[test] + fn decode_koi8_u_encoding_rs() { + for &(input, expected, errs) in KOI8_U { + let decoder = LossyDecoder::new_encoding_rs(enc_rs::KOI8_U, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + const WINDOWS_949: Tests = &[ + (&[], "", 0), + (&[b""], "", 0), + (&[b"\xbe\xc8\xb3\xe7"], "안녕", 0), + (&[b"\xbe", b"\xc8\xb3\xe7"], "안녕", 0), + (&[b"\xbe", b"", b"\xc8\xb3\xe7"], "안녕", 0), + ( + &[b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"], + "안녕하세요", + 0, + ), + (&[b"\xbe\xc8\xb3\xe7\xc7"], "안녕\u{fffd}", 1), + (&[b"\xbe", b"", b"\xc8\xb3"], "안\u{fffd}", 1), + (&[b"\xbe\x28\xb3\xe7"], "\u{fffd}(녕", 1), + ]; + + #[cfg(feature = "encoding")] + #[test] + fn decode_windows_949() { + for &(input, expected, errs) in WINDOWS_949 { + let decoder = LossyDecoder::new(enc::WINDOWS_949, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(feature = "encoding_rs")] + #[test] + fn decode_windows_949_encoding_rs() { + for &(input, expected, errs) in WINDOWS_949 { + let decoder = LossyDecoder::new_encoding_rs(enc_rs::EUC_KR, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[test] + fn read_from() { + let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new()); + let mut bytes: &[u8] = b"foo\xffbar"; + let (tendrils, errors) = decoder.read_from(&mut bytes).unwrap(); + assert_eq!( + &*tendrils.iter().map(|t| &**t).collect::<Vec<_>>(), + &["foo", "\u{FFFD}", "bar"] + ); + assert_eq!(errors, &["invalid byte sequence"]); + } +} |