diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
commit | 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch) | |
tree | 173a775858bd501c378080a10dca74132f05bc50 /vendor/utf-8 | |
parent | Initial commit. (diff) | |
download | rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip |
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/utf-8')
-rw-r--r-- | vendor/utf-8/.cargo-checksum.json | 1 | ||||
-rw-r--r-- | vendor/utf-8/Cargo.toml | 29 | ||||
-rw-r--r-- | vendor/utf-8/README.md | 5 | ||||
-rw-r--r-- | vendor/utf-8/benches/from_utf8_lossy.rs | 30 | ||||
-rw-r--r-- | vendor/utf-8/src/lib.rs | 186 | ||||
-rw-r--r-- | vendor/utf-8/src/lossy.rs | 92 | ||||
-rw-r--r-- | vendor/utf-8/src/read.rs | 167 | ||||
-rw-r--r-- | vendor/utf-8/tests/unit.rs | 197 |
8 files changed, 707 insertions, 0 deletions
diff --git a/vendor/utf-8/.cargo-checksum.json b/vendor/utf-8/.cargo-checksum.json new file mode 100644 index 000000000..78a9fbe4f --- /dev/null +++ b/vendor/utf-8/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"Cargo.toml":"fa8028d20c12bd6964ea9578f16763128c661c3b8d1b134d6a8423322e9e4a6d","README.md":"e5cd8b3b67c2962e13b0aa95fc2af9152999e1bd333df8be8a3be5eab53e540a","benches/from_utf8_lossy.rs":"ed57fc9fca84d160a70fa06bcf6658adca9f4518cb6e0be6a52accc291736b0e","src/lib.rs":"32e657c72a7a895b26288f271e3194270002548692368bdb1ef32b5698975395","src/lossy.rs":"c7d3f193fe04b60145a5e32f5e6c55c181664f82309ef59bb15533194d69e345","src/read.rs":"6eae22954e18a5afa8f62c876498a643563c5b68d03329a417aa354a28108046","tests/unit.rs":"9e920a552549009191d61147d60196fcce9cbc7f2065d33b6d9c757e258a9edd"},"package":"05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7"}
\ No newline at end of file diff --git a/vendor/utf-8/Cargo.toml b/vendor/utf-8/Cargo.toml new file mode 100644 index 000000000..8974e5131 --- /dev/null +++ b/vendor/utf-8/Cargo.toml @@ -0,0 +1,29 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g. crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +name = "utf-8" +version = "0.7.5" +authors = ["Simon Sapin <simon.sapin@exyr.org>"] +description = "Incremental, zero-copy UTF-8 decoding with error handling" +license = "MIT OR Apache-2.0" +repository = "https://github.com/SimonSapin/rust-utf8" +[profile.test] + +[profile.bench] + +[lib] +name = "utf8" +test = false +bench = false + +[dependencies] diff --git a/vendor/utf-8/README.md b/vendor/utf-8/README.md new file mode 100644 index 000000000..145889b37 --- /dev/null +++ b/vendor/utf-8/README.md @@ -0,0 +1,5 @@ +# rust-utf8 + +Incremental, zero-copy UTF-8 decoding for Rust + +[Documentation](https://docs.rs/utf-8/) diff --git a/vendor/utf-8/benches/from_utf8_lossy.rs b/vendor/utf-8/benches/from_utf8_lossy.rs new file mode 100644 index 000000000..95d9edf39 --- /dev/null +++ b/vendor/utf-8/benches/from_utf8_lossy.rs @@ -0,0 +1,30 @@ +#![feature(test)] + +extern crate test; +extern crate utf8; + +#[path = "../tests/shared/data.rs"] +mod data; + +#[path = "../tests/shared/string_from_utf8_lossy.rs"] +mod string_from_utf8_lossy; + +#[bench] +fn bench_our_string_from_utf8_lossy(bencher: &mut test::Bencher) { + bencher.bytes = data::DECODED_LOSSY.iter().map(|&(input, _expected)| input.len() as u64).sum(); + bencher.iter(|| { + for &(input, _expected) in data::DECODED_LOSSY { + test::black_box(string_from_utf8_lossy::string_from_utf8_lossy(input)); + } + }) +} + +#[bench] +fn bench_std_string_from_utf8_lossy(bencher: &mut test::Bencher) { + bencher.bytes = data::DECODED_LOSSY.iter().map(|&(input, _expected)| input.len() as u64).sum(); + bencher.iter(|| { + for &(input, _expected) in data::DECODED_LOSSY { + test::black_box(String::from_utf8_lossy(input)); + } + }) +} diff --git a/vendor/utf-8/src/lib.rs b/vendor/utf-8/src/lib.rs new file mode 100644 index 000000000..ec223f209 --- /dev/null +++ b/vendor/utf-8/src/lib.rs @@ -0,0 +1,186 @@ +mod lossy; +mod read; + +pub use lossy::LossyDecoder; +pub use read::{BufReadDecoder, BufReadDecoderError}; + +use std::cmp; +use std::error::Error; +use std::fmt; +use std::str; + +/// The replacement character, U+FFFD. In lossy decoding, insert it for every decoding error. +pub const REPLACEMENT_CHARACTER: &'static str = "\u{FFFD}"; + +#[derive(Debug, Copy, Clone)] +pub enum DecodeError<'a> { + /// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`, + /// then call `decode()` again with `remaining_input`. + Invalid { + valid_prefix: &'a str, + invalid_sequence: &'a [u8], + remaining_input: &'a [u8], + }, + + /// Call the `incomplete_suffix.try_complete` method with more input when available. + /// If no more input is available, this is an invalid byte sequence. + Incomplete { + valid_prefix: &'a str, + incomplete_suffix: Incomplete, + }, +} + +impl<'a> fmt::Display for DecodeError<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + DecodeError::Invalid { + valid_prefix, + invalid_sequence, + remaining_input, + } => write!( + f, + "found invalid byte sequence {invalid_sequence:02x?} after \ + {valid_byte_count} valid bytes, followed by {unprocessed_byte_count} more \ + unprocessed bytes", + invalid_sequence = invalid_sequence, + valid_byte_count = valid_prefix.len(), + unprocessed_byte_count = remaining_input.len() + ), + DecodeError::Incomplete { + valid_prefix, + incomplete_suffix, + } => write!( + f, + "found incomplete byte sequence {incomplete_suffix:02x?} after \ + {valid_byte_count} bytes", + incomplete_suffix = incomplete_suffix, + valid_byte_count = valid_prefix.len() + ), + } + } +} + +impl<'a> Error for DecodeError<'a> {} + +#[derive(Debug, Copy, Clone)] +pub struct Incomplete { + pub buffer: [u8; 4], + pub buffer_len: u8, +} + +pub fn decode(input: &[u8]) -> Result<&str, DecodeError> { + let error = match str::from_utf8(input) { + Ok(valid) => return Ok(valid), + Err(error) => error, + }; + + // FIXME: separate function from here to guide inlining? + let (valid, after_valid) = input.split_at(error.valid_up_to()); + let valid = unsafe { + str::from_utf8_unchecked(valid) + }; + + match error.error_len() { + Some(invalid_sequence_length) => { + let (invalid, rest) = after_valid.split_at(invalid_sequence_length); + Err(DecodeError::Invalid { + valid_prefix: valid, + invalid_sequence: invalid, + remaining_input: rest + }) + } + None => { + Err(DecodeError::Incomplete { + valid_prefix: valid, + incomplete_suffix: Incomplete::new(after_valid), + }) + } + } +} + +impl Incomplete { + pub fn empty() -> Self { + Incomplete { + buffer: [0, 0, 0, 0], + buffer_len: 0, + } + } + + pub fn is_empty(&self) -> bool { + self.buffer_len == 0 + } + + pub fn new(bytes: &[u8]) -> Self { + let mut buffer = [0, 0, 0, 0]; + let len = bytes.len(); + buffer[..len].copy_from_slice(bytes); + Incomplete { + buffer: buffer, + buffer_len: len as u8, + } + } + + /// * `None`: still incomplete, call `try_complete` again with more input. + /// If no more input is available, this is invalid byte sequence. + /// * `Some((result, remaining_input))`: We’re done with this `Incomplete`. + /// To keep decoding, pass `remaining_input` to `decode()`. + pub fn try_complete<'input>(&mut self, input: &'input [u8]) + -> Option<(Result<&str, &[u8]>, &'input [u8])> { + let (consumed, opt_result) = self.try_complete_offsets(input); + let result = opt_result?; + let remaining_input = &input[consumed..]; + let result_bytes = self.take_buffer(); + let result = match result { + Ok(()) => Ok(unsafe { str::from_utf8_unchecked(result_bytes) }), + Err(()) => Err(result_bytes), + }; + Some((result, remaining_input)) + } + + fn take_buffer(&mut self) -> &[u8] { + let len = self.buffer_len as usize; + self.buffer_len = 0; + &self.buffer[..len as usize] + } + + /// (consumed_from_input, None): not enough input + /// (consumed_from_input, Some(Err(()))): error bytes in buffer + /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer + fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) { + let initial_buffer_len = self.buffer_len as usize; + let copied_from_input; + { + let unwritten = &mut self.buffer[initial_buffer_len..]; + copied_from_input = cmp::min(unwritten.len(), input.len()); + unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]); + } + let spliced = &self.buffer[..initial_buffer_len + copied_from_input]; + match str::from_utf8(spliced) { + Ok(_) => { + self.buffer_len = spliced.len() as u8; + (copied_from_input, Some(Ok(()))) + } + Err(error) => { + let valid_up_to = error.valid_up_to(); + if valid_up_to > 0 { + let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap(); + self.buffer_len = valid_up_to as u8; + (consumed, Some(Ok(()))) + } else { + match error.error_len() { + Some(invalid_sequence_length) => { + let consumed = invalid_sequence_length + .checked_sub(initial_buffer_len).unwrap(); + self.buffer_len = invalid_sequence_length as u8; + (consumed, Some(Err(()))) + } + None => { + self.buffer_len = spliced.len() as u8; + (copied_from_input, None) + } + } + } + } + } + } +} diff --git a/vendor/utf-8/src/lossy.rs b/vendor/utf-8/src/lossy.rs new file mode 100644 index 000000000..00bcdecf0 --- /dev/null +++ b/vendor/utf-8/src/lossy.rs @@ -0,0 +1,92 @@ +use super::*; + +/// A push-based, lossy decoder for UTF-8. +/// Errors are replaced with the U+FFFD replacement character. +/// +/// Users “push” bytes into the decoder, which in turn “pushes” `&str` slices into a callback. +/// +/// For example, `String::from_utf8_lossy` (but returning `String` instead of `Cow`) +/// can be rewritten as: +/// +/// ```rust +/// fn string_from_utf8_lossy(input: &[u8]) -> String { +/// let mut string = String::new(); +/// utf8::LossyDecoder::new(|s| string.push_str(s)).feed(input); +/// string +/// } +/// ``` +/// +/// **Note:** Dropping the decoder signals the end of the input: +/// If the last input chunk ended with an incomplete byte sequence for a code point, +/// this is an error and a replacement character is emitted. +/// Use `std::mem::forget` to inhibit this behavior. +pub struct LossyDecoder<F: FnMut(&str)> { + push_str: F, + incomplete: Incomplete, +} + +impl<F: FnMut(&str)> LossyDecoder<F> { + /// Create a new decoder from a callback. + #[inline] + pub fn new(push_str: F) -> Self { + LossyDecoder { + push_str: push_str, + incomplete: Incomplete { + buffer: [0, 0, 0, 0], + buffer_len: 0, + }, + } + } + + /// Feed one chunk of input into the decoder. + /// + /// The input is decoded lossily + /// and the callback called once or more with `&str` string slices. + /// + /// If the UTF-8 byte sequence for one code point was split into this bytes chunk + /// and previous bytes chunks, it will be correctly pieced back together. + pub fn feed(&mut self, mut input: &[u8]) { + if self.incomplete.buffer_len > 0 { + match self.incomplete.try_complete(input) { + Some((Ok(s), remaining)) => { + (self.push_str)(s); + input = remaining + } + Some((Err(_), remaining)) => { + (self.push_str)(REPLACEMENT_CHARACTER); + input = remaining + } + None => { + return + } + } + } + loop { + match decode(input) { + Ok(s) => { + (self.push_str)(s); + return + } + Err(DecodeError::Incomplete { valid_prefix, incomplete_suffix }) => { + (self.push_str)(valid_prefix); + self.incomplete = incomplete_suffix; + return + } + Err(DecodeError::Invalid { valid_prefix, remaining_input, .. }) => { + (self.push_str)(valid_prefix); + (self.push_str)(REPLACEMENT_CHARACTER); + input = remaining_input + } + } + } + } +} + +impl<F: FnMut(&str)> Drop for LossyDecoder<F> { + #[inline] + fn drop(&mut self) { + if self.incomplete.buffer_len > 0 { + (self.push_str)(REPLACEMENT_CHARACTER) + } + } +} diff --git a/vendor/utf-8/src/read.rs b/vendor/utf-8/src/read.rs new file mode 100644 index 000000000..5e38f54a1 --- /dev/null +++ b/vendor/utf-8/src/read.rs @@ -0,0 +1,167 @@ +use std::io::{self, BufRead}; +use std::error::Error; +use std::fmt; +use std::str; +use super::*; + +/// Wraps a `std::io::BufRead` buffered byte stream and decode it as UTF-8. +pub struct BufReadDecoder<B: BufRead> { + buf_read: B, + bytes_consumed: usize, + incomplete: Incomplete, +} + +#[derive(Debug)] +pub enum BufReadDecoderError<'a> { + /// Represents one UTF-8 error in the byte stream. + /// + /// In lossy decoding, each such error should be replaced with U+FFFD. + /// (See `BufReadDecoder::next_lossy` and `BufReadDecoderError::lossy`.) + InvalidByteSequence(&'a [u8]), + + /// An I/O error from the underlying byte stream + Io(io::Error), +} + +impl<'a> BufReadDecoderError<'a> { + /// Replace UTF-8 errors with U+FFFD + pub fn lossy(self) -> Result<&'static str, io::Error> { + match self { + BufReadDecoderError::Io(error) => Err(error), + BufReadDecoderError::InvalidByteSequence(_) => Ok(REPLACEMENT_CHARACTER), + } + } +} + +impl<'a> fmt::Display for BufReadDecoderError<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + BufReadDecoderError::InvalidByteSequence(bytes) => { + write!(f, "invalid byte sequence: {:02x?}", bytes) + } + BufReadDecoderError::Io(ref err) => write!(f, "underlying bytestream error: {}", err), + } + } +} + +impl<'a> Error for BufReadDecoderError<'a> { + fn source(&self) -> Option<&(dyn Error + 'static)> { + match *self { + BufReadDecoderError::InvalidByteSequence(_) => None, + BufReadDecoderError::Io(ref err) => Some(err), + } + } +} + +impl<B: BufRead> BufReadDecoder<B> { + /// This is to `Read::read_to_string` what `String::from_utf8_lossy` is to `String::from_utf8`. + pub fn read_to_string_lossy(buf_read: B) -> io::Result<String> { + let mut decoder = Self::new(buf_read); + let mut string = String::new(); + while let Some(result) = decoder.next_lossy() { + string.push_str(result?) + } + Ok(string) + } + + pub fn new(buf_read: B) -> Self { + Self { + buf_read, + bytes_consumed: 0, + incomplete: Incomplete::empty(), + } + } + + /// Same as `BufReadDecoder::next_strict`, but replace UTF-8 errors with U+FFFD. + pub fn next_lossy(&mut self) -> Option<io::Result<&str>> { + self.next_strict().map(|result| result.or_else(|e| e.lossy())) + } + + /// Decode and consume the next chunk of UTF-8 input. + /// + /// This method is intended to be called repeatedly until it returns `None`, + /// which represents EOF from the underlying byte stream. + /// This is similar to `Iterator::next`, + /// except that decoded chunks borrow the decoder (~iterator) + /// so they need to be handled or copied before the next chunk can start decoding. + pub fn next_strict(&mut self) -> Option<Result<&str, BufReadDecoderError>> { + enum BytesSource { + BufRead(usize), + Incomplete, + } + macro_rules! try_io { + ($io_result: expr) => { + match $io_result { + Ok(value) => value, + Err(error) => return Some(Err(BufReadDecoderError::Io(error))) + } + } + } + let (source, result) = loop { + if self.bytes_consumed > 0 { + self.buf_read.consume(self.bytes_consumed); + self.bytes_consumed = 0; + } + let buf = try_io!(self.buf_read.fill_buf()); + + // Force loop iteration to go through an explicit `continue` + enum Unreachable {} + let _: Unreachable = if self.incomplete.is_empty() { + if buf.is_empty() { + return None // EOF + } + match str::from_utf8(buf) { + Ok(_) => { + break (BytesSource::BufRead(buf.len()), Ok(())) + } + Err(error) => { + let valid_up_to = error.valid_up_to(); + if valid_up_to > 0 { + break (BytesSource::BufRead(valid_up_to), Ok(())) + } + match error.error_len() { + Some(invalid_sequence_length) => { + break (BytesSource::BufRead(invalid_sequence_length), Err(())) + } + None => { + self.bytes_consumed = buf.len(); + self.incomplete = Incomplete::new(buf); + // need more input bytes + continue + } + } + } + } + } else { + if buf.is_empty() { + break (BytesSource::Incomplete, Err(())) // EOF with incomplete code point + } + let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf); + self.bytes_consumed = consumed; + match opt_result { + None => { + // need more input bytes + continue + } + Some(result) => { + break (BytesSource::Incomplete, result) + } + } + }; + }; + let bytes = match source { + BytesSource::BufRead(byte_count) => { + self.bytes_consumed = byte_count; + let buf = try_io!(self.buf_read.fill_buf()); + &buf[..byte_count] + } + BytesSource::Incomplete => { + self.incomplete.take_buffer() + } + }; + match result { + Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })), + Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))), + } + } +} diff --git a/vendor/utf-8/tests/unit.rs b/vendor/utf-8/tests/unit.rs new file mode 100644 index 000000000..6839e84f2 --- /dev/null +++ b/vendor/utf-8/tests/unit.rs @@ -0,0 +1,197 @@ +extern crate utf8; + +use std::borrow::Cow; +use std::collections::VecDeque; +use std::io; +use utf8::*; + +/// A re-implementation of std::str::from_utf8 +pub fn str_from_utf8(input: &[u8]) -> Result<&str, usize> { + match decode(input) { + Ok(s) => return Ok(s), + Err(DecodeError::Invalid { valid_prefix, .. }) | + Err(DecodeError::Incomplete { valid_prefix, .. }) => Err(valid_prefix.len()), + } +} + +#[test] +fn test_str_from_utf8() { + let xs = b"hello"; + assert_eq!(str_from_utf8(xs), Ok("hello")); + + let xs = "ศไทย中华Việt Nam".as_bytes(); + assert_eq!(str_from_utf8(xs), Ok("ศไทย中华Việt Nam")); + + let xs = b"hello\xFF"; + assert!(str_from_utf8(xs).is_err()); +} + +#[test] +fn test_is_utf8() { + // Chars of 1, 2, 3, and 4 bytes + assert!(str_from_utf8("eé€\u{10000}".as_bytes()).is_ok()); + // invalid prefix + assert!(str_from_utf8(&[0x80]).is_err()); + // invalid 2 byte prefix + assert!(str_from_utf8(&[0xc0]).is_err()); + assert!(str_from_utf8(&[0xc0, 0x10]).is_err()); + // invalid 3 byte prefix + assert!(str_from_utf8(&[0xe0]).is_err()); + assert!(str_from_utf8(&[0xe0, 0x10]).is_err()); + assert!(str_from_utf8(&[0xe0, 0xff, 0x10]).is_err()); + // invalid 4 byte prefix + assert!(str_from_utf8(&[0xf0]).is_err()); + assert!(str_from_utf8(&[0xf0, 0x10]).is_err()); + assert!(str_from_utf8(&[0xf0, 0xff, 0x10]).is_err()); + assert!(str_from_utf8(&[0xf0, 0xff, 0xff, 0x10]).is_err()); + + // deny overlong encodings + assert!(str_from_utf8(&[0xc0, 0x80]).is_err()); + assert!(str_from_utf8(&[0xc0, 0xae]).is_err()); + assert!(str_from_utf8(&[0xe0, 0x80, 0x80]).is_err()); + assert!(str_from_utf8(&[0xe0, 0x80, 0xaf]).is_err()); + assert!(str_from_utf8(&[0xe0, 0x81, 0x81]).is_err()); + assert!(str_from_utf8(&[0xf0, 0x82, 0x82, 0xac]).is_err()); + assert!(str_from_utf8(&[0xf4, 0x90, 0x80, 0x80]).is_err()); + + // deny surrogates + assert!(str_from_utf8(&[0xED, 0xA0, 0x80]).is_err()); + assert!(str_from_utf8(&[0xED, 0xBF, 0xBF]).is_err()); + + assert!(str_from_utf8(&[0xC2, 0x80]).is_ok()); + assert!(str_from_utf8(&[0xDF, 0xBF]).is_ok()); + assert!(str_from_utf8(&[0xE0, 0xA0, 0x80]).is_ok()); + assert!(str_from_utf8(&[0xED, 0x9F, 0xBF]).is_ok()); + assert!(str_from_utf8(&[0xEE, 0x80, 0x80]).is_ok()); + assert!(str_from_utf8(&[0xEF, 0xBF, 0xBF]).is_ok()); + assert!(str_from_utf8(&[0xF0, 0x90, 0x80, 0x80]).is_ok()); + assert!(str_from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok()); +} + +/// A re-implementation of String::from_utf8_lossy +pub fn string_from_utf8_lossy(input: &[u8]) -> Cow<str> { + let mut result = decode(input); + if let Ok(s) = result { + return s.into() + } + let mut string = String::with_capacity(input.len() + REPLACEMENT_CHARACTER.len()); + loop { + match result { + Ok(s) => { + string.push_str(s); + return string.into() + } + Err(DecodeError::Incomplete { valid_prefix, .. }) => { + string.push_str(valid_prefix); + string.push_str(REPLACEMENT_CHARACTER); + return string.into() + } + Err(DecodeError::Invalid { valid_prefix, remaining_input, .. }) => { + string.push_str(valid_prefix); + string.push_str(REPLACEMENT_CHARACTER); + result = decode(remaining_input); + } + } + } +} + +pub const DECODED_LOSSY: &'static [(&'static [u8], &'static str)] = &[ + (b"hello", "hello"), + (b"\xe0\xb8\xa8\xe0\xb9\x84\xe0\xb8\x97\xe0\xb8\xa2\xe4\xb8\xad\xe5\x8d\x8e", "ศไทย中华"), + (b"Vi\xe1\xbb\x87t Nam", "Việt Nam"), + (b"Hello\xC2 There\xFF ", "Hello\u{FFFD} There\u{FFFD} "), + (b"Hello\xC0\x80 There", "Hello\u{FFFD}\u{FFFD} There"), + (b"\xE6\x83 Goodbye", "\u{FFFD} Goodbye"), + (b"\xF5foo\xF5\x80bar", "\u{FFFD}foo\u{FFFD}\u{FFFD}bar"), + (b"\xF5foo\xF5\xC2", "\u{FFFD}foo\u{FFFD}\u{FFFD}"), + (b"\xF1foo\xF1\x80bar\xF1\x80\x80baz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz"), + (b"\xF4foo\xF4\x80bar\xF4\xBFbaz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz"), + (b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar", "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar"), + (b"\xF0\x90\x80foo", "\u{FFFD}foo"), + // surrogates + (b"\xED\xA0\x80foo\xED\xBF\xBFbar", "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar"), +]; + +#[test] +fn test_string_from_utf8_lossy() { + for &(input, expected) in DECODED_LOSSY { + assert_eq!(string_from_utf8_lossy(input), expected); + } +} + +pub fn all_partitions<'a, F>(input: &'a [u8], f: F) + where F: Fn(&[&[u8]]) +{ + + fn all_partitions_inner<'a, F>(chunks: &mut Vec<&'a [u8]>, input: &'a [u8], f: &F) + where F: Fn(&[&[u8]]) + { + if input.is_empty() { + f(chunks) + } + for i in 1..(input.len() + 1) { + chunks.push(&input[..i]); + all_partitions_inner(chunks, &input[i..], f); + chunks.pop(); + } + } + + let mut chunks = Vec::new(); + all_partitions_inner(&mut chunks, input, &f); + assert_eq!(chunks.len(), 0); +} + +#[test] +fn test_incremental_decoder() { + for &(input, expected) in DECODED_LOSSY { + all_partitions(input, |chunks| { + let mut string = String::new(); + { + let mut decoder = LossyDecoder::new(|s| string.push_str(s)); + for &chunk in &*chunks { + decoder.feed(chunk); + } + } + assert_eq!(string, expected); + }); + } +} + +#[test] +fn test_bufread_decoder() { + for &(input, expected) in DECODED_LOSSY { + all_partitions(input, |chunks| { + let chunks = Chunks(chunks.to_vec().into()); + let string = BufReadDecoder::read_to_string_lossy(chunks).unwrap(); + assert_eq!(string, expected) + }); + } +} + +struct Chunks<'a>(VecDeque<&'a [u8]>); + +impl<'a> io::Read for Chunks<'a> { + fn read(&mut self, _: &mut [u8]) -> io::Result<usize> { + unimplemented!() + } +} + +impl<'a> io::BufRead for Chunks<'a> { + fn fill_buf(&mut self) -> io::Result<&[u8]> { + Ok(*self.0.front().unwrap()) + } + + fn consume(&mut self, bytes: usize) { + { + let front = self.0.front_mut().unwrap(); + *front = &front[bytes..]; + if !front.is_empty() { + return + } + } + if self.0.len() > 1 { + self.0.pop_front(); + } + } + +} |