summaryrefslogtreecommitdiffstats
path: root/vendor/utf-8
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:02:58 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:02:58 +0000
commit698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch)
tree173a775858bd501c378080a10dca74132f05bc50 /vendor/utf-8
parentInitial commit. (diff)
downloadrustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz
rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/utf-8')
-rw-r--r--vendor/utf-8/.cargo-checksum.json1
-rw-r--r--vendor/utf-8/Cargo.toml29
-rw-r--r--vendor/utf-8/README.md5
-rw-r--r--vendor/utf-8/benches/from_utf8_lossy.rs30
-rw-r--r--vendor/utf-8/src/lib.rs186
-rw-r--r--vendor/utf-8/src/lossy.rs92
-rw-r--r--vendor/utf-8/src/read.rs167
-rw-r--r--vendor/utf-8/tests/unit.rs197
8 files changed, 707 insertions, 0 deletions
diff --git a/vendor/utf-8/.cargo-checksum.json b/vendor/utf-8/.cargo-checksum.json
new file mode 100644
index 000000000..78a9fbe4f
--- /dev/null
+++ b/vendor/utf-8/.cargo-checksum.json
@@ -0,0 +1 @@
+{"files":{"Cargo.toml":"fa8028d20c12bd6964ea9578f16763128c661c3b8d1b134d6a8423322e9e4a6d","README.md":"e5cd8b3b67c2962e13b0aa95fc2af9152999e1bd333df8be8a3be5eab53e540a","benches/from_utf8_lossy.rs":"ed57fc9fca84d160a70fa06bcf6658adca9f4518cb6e0be6a52accc291736b0e","src/lib.rs":"32e657c72a7a895b26288f271e3194270002548692368bdb1ef32b5698975395","src/lossy.rs":"c7d3f193fe04b60145a5e32f5e6c55c181664f82309ef59bb15533194d69e345","src/read.rs":"6eae22954e18a5afa8f62c876498a643563c5b68d03329a417aa354a28108046","tests/unit.rs":"9e920a552549009191d61147d60196fcce9cbc7f2065d33b6d9c757e258a9edd"},"package":"05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7"} \ No newline at end of file
diff --git a/vendor/utf-8/Cargo.toml b/vendor/utf-8/Cargo.toml
new file mode 100644
index 000000000..8974e5131
--- /dev/null
+++ b/vendor/utf-8/Cargo.toml
@@ -0,0 +1,29 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g. crates.io) dependencies
+#
+# If you believe there's an error in this file please file an
+# issue against the rust-lang/cargo repository. If you're
+# editing this file be aware that the upstream Cargo.toml
+# will likely look very different (and much more reasonable)
+
+[package]
+name = "utf-8"
+version = "0.7.5"
+authors = ["Simon Sapin <simon.sapin@exyr.org>"]
+description = "Incremental, zero-copy UTF-8 decoding with error handling"
+license = "MIT OR Apache-2.0"
+repository = "https://github.com/SimonSapin/rust-utf8"
+[profile.test]
+
+[profile.bench]
+
+[lib]
+name = "utf8"
+test = false
+bench = false
+
+[dependencies]
diff --git a/vendor/utf-8/README.md b/vendor/utf-8/README.md
new file mode 100644
index 000000000..145889b37
--- /dev/null
+++ b/vendor/utf-8/README.md
@@ -0,0 +1,5 @@
+# rust-utf8
+
+Incremental, zero-copy UTF-8 decoding for Rust
+
+[Documentation](https://docs.rs/utf-8/)
diff --git a/vendor/utf-8/benches/from_utf8_lossy.rs b/vendor/utf-8/benches/from_utf8_lossy.rs
new file mode 100644
index 000000000..95d9edf39
--- /dev/null
+++ b/vendor/utf-8/benches/from_utf8_lossy.rs
@@ -0,0 +1,30 @@
+#![feature(test)]
+
+extern crate test;
+extern crate utf8;
+
+#[path = "../tests/shared/data.rs"]
+mod data;
+
+#[path = "../tests/shared/string_from_utf8_lossy.rs"]
+mod string_from_utf8_lossy;
+
+/// Benchmarks this crate's lossy decoder over every `DECODED_LOSSY` input.
+#[bench]
+fn bench_our_string_from_utf8_lossy(bencher: &mut test::Bencher) {
+    // Report throughput in terms of the total number of input bytes per iteration.
+    let total_bytes: u64 = data::DECODED_LOSSY
+        .iter()
+        .map(|case| case.0.len() as u64)
+        .sum();
+    bencher.bytes = total_bytes;
+    bencher.iter(|| {
+        data::DECODED_LOSSY.iter().for_each(|case| {
+            test::black_box(string_from_utf8_lossy::string_from_utf8_lossy(case.0));
+        })
+    })
+}
+
+/// Baseline: benchmarks the standard library's `String::from_utf8_lossy`
+/// over the same `DECODED_LOSSY` inputs.
+#[bench]
+fn bench_std_string_from_utf8_lossy(bencher: &mut test::Bencher) {
+    // Report throughput in terms of the total number of input bytes per iteration.
+    let total_bytes: u64 = data::DECODED_LOSSY
+        .iter()
+        .map(|case| case.0.len() as u64)
+        .sum();
+    bencher.bytes = total_bytes;
+    bencher.iter(|| {
+        data::DECODED_LOSSY.iter().for_each(|case| {
+            test::black_box(String::from_utf8_lossy(case.0));
+        })
+    })
+}
diff --git a/vendor/utf-8/src/lib.rs b/vendor/utf-8/src/lib.rs
new file mode 100644
index 000000000..ec223f209
--- /dev/null
+++ b/vendor/utf-8/src/lib.rs
@@ -0,0 +1,186 @@
+mod lossy;
+mod read;
+
+pub use lossy::LossyDecoder;
+pub use read::{BufReadDecoder, BufReadDecoderError};
+
+use std::cmp;
+use std::error::Error;
+use std::fmt;
+use std::str;
+
+/// The replacement character, U+FFFD. In lossy decoding, insert it for every decoding error.
+///
+/// Encoded as UTF-8 this string is three bytes long (`EF BF BD`).
+pub const REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
+
+#[derive(Debug, Copy, Clone)]
+pub enum DecodeError<'a> { // error returned by `decode()`; every slice borrows from the decoded input
+ /// An invalid byte sequence was found. In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`,
+ /// then call `decode()` again with `remaining_input`.
+ Invalid {
+ valid_prefix: &'a str, // the input up to the first error, already validated as UTF-8
+ invalid_sequence: &'a [u8], // the bytes reported as one invalid sequence
+ remaining_input: &'a [u8], // everything after `invalid_sequence`, not yet decoded
+ },
+
+ /// The input ends in the middle of a code point.
+ /// Call the `incomplete_suffix.try_complete` method with more input when available.
+ /// If no more input is available, this is an invalid byte sequence.
+ Incomplete {
+ valid_prefix: &'a str, // the input up to the truncated sequence, already validated as UTF-8
+ incomplete_suffix: Incomplete, // owned copy of the truncated trailing bytes
+ },
+}
+
+impl<'a> fmt::Display for DecodeError<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ match *self {
+ DecodeError::Invalid {
+ valid_prefix,
+ invalid_sequence,
+ remaining_input,
+ } => write!(
+ f,
+ "found invalid byte sequence {invalid_sequence:02x?} after \
+ {valid_byte_count} valid bytes, followed by {unprocessed_byte_count} more \
+ unprocessed bytes",
+ invalid_sequence = invalid_sequence,
+ valid_byte_count = valid_prefix.len(),
+ unprocessed_byte_count = remaining_input.len()
+ ),
+ DecodeError::Incomplete {
+ valid_prefix,
+ incomplete_suffix,
+ } => write!(
+ f,
+ "found incomplete byte sequence {incomplete_suffix:02x?} after \
+ {valid_byte_count} bytes",
+ incomplete_suffix = incomplete_suffix,
+ valid_byte_count = valid_prefix.len()
+ ),
+ }
+ }
+}
+
+impl<'a> Error for DecodeError<'a> {} // empty impl: relies entirely on the `Error` trait's default methods
+
+#[derive(Debug, Copy, Clone)]
+pub struct Incomplete { // the leading bytes of a code point whose remaining bytes have not been seen yet
+ pub buffer: [u8; 4], // storage; only the first `buffer_len` bytes are meaningful
+ pub buffer_len: u8, // number of bytes currently held in `buffer` (0 means empty)
+}
+
+/// Decode `input` as UTF-8 in one step.
+///
+/// Returns the whole input as `&str` when it is entirely valid. Otherwise the error
+/// carries the longest valid prefix, plus either the invalid byte sequence and the
+/// bytes after it (`DecodeError::Invalid`), or a copy of the truncated trailing
+/// sequence (`DecodeError::Incomplete`).
+pub fn decode(input: &[u8]) -> Result<&str, DecodeError> {
+    let utf8_error = match str::from_utf8(input) {
+        Ok(text) => return Ok(text),
+        Err(e) => e,
+    };
+
+    // FIXME: separate function from here to guide inlining?
+    let prefix_len = utf8_error.valid_up_to();
+    // SAFETY: `from_utf8` reported the first `prefix_len` bytes of `input` as valid UTF-8.
+    let valid_prefix = unsafe { str::from_utf8_unchecked(&input[..prefix_len]) };
+    let tail = &input[prefix_len..];
+
+    Err(match utf8_error.error_len() {
+        // A definite error: `bad_len` bytes form one invalid sequence.
+        Some(bad_len) => DecodeError::Invalid {
+            valid_prefix,
+            invalid_sequence: &tail[..bad_len],
+            remaining_input: &tail[bad_len..],
+        },
+        // The input merely stops in the middle of a code point.
+        None => DecodeError::Incomplete {
+            valid_prefix,
+            incomplete_suffix: Incomplete::new(tail),
+        },
+    })
+}
+
+impl Incomplete {
+    /// Create an `Incomplete` that holds no bytes.
+    pub fn empty() -> Self {
+        Incomplete {
+            buffer: [0, 0, 0, 0],
+            buffer_len: 0,
+        }
+    }
+
+    /// Whether no bytes are currently buffered.
+    pub fn is_empty(&self) -> bool {
+        self.buffer_len == 0
+    }
+
+    /// Create an `Incomplete` holding a copy of `bytes`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `bytes` is longer than the 4-byte internal buffer.
+    pub fn new(bytes: &[u8]) -> Self {
+        let mut buffer = [0u8; 4];
+        let len = bytes.len();
+        // Fail with an explicit message rather than an opaque slice-index panic.
+        assert!(
+            len <= buffer.len(),
+            "Incomplete::new: {} bytes do not fit in a {}-byte buffer",
+            len,
+            buffer.len()
+        );
+        buffer[..len].copy_from_slice(bytes);
+        Incomplete {
+            buffer,
+            // Cannot truncate: `len <= 4` was checked above.
+            buffer_len: len as u8,
+        }
+    }
+
+    /// Try to finish the buffered code point using the start of `input`.
+    ///
+    /// * `None`: still incomplete, call `try_complete` again with more input.
+    ///   If no more input is available, this is an invalid byte sequence.
+    /// * `Some((result, remaining_input))`: We’re done with this `Incomplete`.
+    ///   To keep decoding, pass `remaining_input` to `decode()`.
+    ///   `result` is `Ok` with the decoded text, or `Err` with the invalid bytes.
+    pub fn try_complete<'input>(&mut self, input: &'input [u8])
+                                -> Option<(Result<&str, &[u8]>, &'input [u8])> {
+        let (consumed, opt_result) = self.try_complete_offsets(input);
+        let result = opt_result?;
+        let remaining_input = &input[consumed..];
+        let result_bytes = self.take_buffer();
+        let result = match result {
+            // SAFETY: `try_complete_offsets` returns `Ok(())` only after `str::from_utf8`
+            // validated exactly the first `buffer_len` bytes, which `take_buffer` returns.
+            Ok(()) => Ok(unsafe { str::from_utf8_unchecked(result_bytes) }),
+            Err(()) => Err(result_bytes),
+        };
+        Some((result, remaining_input))
+    }
+
+    /// Return the buffered bytes and mark the buffer as empty.
+    fn take_buffer(&mut self) -> &[u8] {
+        let len = self.buffer_len as usize;
+        self.buffer_len = 0;
+        &self.buffer[..len]
+    }
+
+    /// Splice the start of `input` after the buffered bytes and classify the result.
+    ///
+    /// (consumed_from_input, None): not enough input
+    /// (consumed_from_input, Some(Err(()))): error bytes in buffer
+    /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer
+    fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) {
+        let initial_buffer_len = self.buffer_len as usize;
+        let copied_from_input;
+        {
+            // Top up the buffer with as many input bytes as still fit.
+            let unwritten = &mut self.buffer[initial_buffer_len..];
+            copied_from_input = cmp::min(unwritten.len(), input.len());
+            unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]);
+        }
+        let spliced = &self.buffer[..initial_buffer_len + copied_from_input];
+        match str::from_utf8(spliced) {
+            Ok(_) => {
+                self.buffer_len = spliced.len() as u8;
+                (copied_from_input, Some(Ok(())))
+            }
+            Err(error) => {
+                let valid_up_to = error.valid_up_to();
+                if valid_up_to > 0 {
+                    // Keep only the valid part; bytes copied beyond it are handed
+                    // back to the caller by reporting a smaller `consumed`.
+                    // The subtraction panics if fewer bytes are valid than were
+                    // buffered before this call.
+                    let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap();
+                    self.buffer_len = valid_up_to as u8;
+                    (consumed, Some(Ok(())))
+                } else {
+                    match error.error_len() {
+                        Some(invalid_sequence_length) => {
+                            // Keep exactly the invalid sequence in the buffer.
+                            let consumed = invalid_sequence_length
+                                .checked_sub(initial_buffer_len).unwrap();
+                            self.buffer_len = invalid_sequence_length as u8;
+                            (consumed, Some(Err(())))
+                        }
+                        None => {
+                            // Still a truncated code point: remember everything so far.
+                            self.buffer_len = spliced.len() as u8;
+                            (copied_from_input, None)
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/vendor/utf-8/src/lossy.rs b/vendor/utf-8/src/lossy.rs
new file mode 100644
index 000000000..00bcdecf0
--- /dev/null
+++ b/vendor/utf-8/src/lossy.rs
@@ -0,0 +1,92 @@
+use super::*;
+
+/// A push-based, lossy decoder for UTF-8.
+/// Errors are replaced with the U+FFFD replacement character.
+///
+/// Users “push” bytes into the decoder, which in turn “pushes” `&str` slices into a callback.
+///
+/// For example, `String::from_utf8_lossy` (but returning `String` instead of `Cow`)
+/// can be rewritten as:
+///
+/// ```rust
+/// fn string_from_utf8_lossy(input: &[u8]) -> String {
+/// let mut string = String::new();
+/// utf8::LossyDecoder::new(|s| string.push_str(s)).feed(input);
+/// string
+/// }
+/// ```
+///
+/// **Note:** Dropping the decoder signals the end of the input:
+/// If the last input chunk ended with an incomplete byte sequence for a code point,
+/// this is an error and a replacement character is emitted.
+/// Use `std::mem::forget` to inhibit this behavior.
+pub struct LossyDecoder<F: FnMut(&str)> {
+ push_str: F, // callback that receives every decoded (or replacement) string slice
+ incomplete: Incomplete, // bytes of a code point left unfinished by the previous chunk
+}
+
+impl<F: FnMut(&str)> LossyDecoder<F> {
+    /// Build a decoder that forwards every decoded `&str` slice to `push_str`.
+    #[inline]
+    pub fn new(push_str: F) -> Self {
+        LossyDecoder {
+            push_str,
+            incomplete: Incomplete::empty(),
+        }
+    }
+
+    /// Feed one chunk of input into the decoder.
+    ///
+    /// The chunk is decoded lossily and the callback is invoked one or more times
+    /// with `&str` slices, unless the chunk only extends a still-incomplete code point.
+    ///
+    /// A code point whose bytes are split between this chunk and earlier chunks
+    /// is pieced back together before being passed on.
+    pub fn feed(&mut self, mut input: &[u8]) {
+        // First finish whatever the previous chunk left unfinished.
+        if !self.incomplete.is_empty() {
+            let (completed, remaining) = match self.incomplete.try_complete(input) {
+                Some(pair) => pair,
+                // Still not enough bytes: wait for the next chunk.
+                None => return,
+            };
+            (self.push_str)(completed.unwrap_or(REPLACEMENT_CHARACTER));
+            input = remaining;
+        }
+
+        loop {
+            let (valid_prefix, remaining_input) = match decode(input) {
+                Ok(s) => {
+                    (self.push_str)(s);
+                    return;
+                }
+                Err(DecodeError::Incomplete { valid_prefix, incomplete_suffix }) => {
+                    (self.push_str)(valid_prefix);
+                    // Remember the truncated code point for the next `feed` (or for drop).
+                    self.incomplete = incomplete_suffix;
+                    return;
+                }
+                Err(DecodeError::Invalid { valid_prefix, remaining_input, .. }) => {
+                    (valid_prefix, remaining_input)
+                }
+            };
+            // One replacement character per invalid sequence, then keep going.
+            (self.push_str)(valid_prefix);
+            (self.push_str)(REPLACEMENT_CHARACTER);
+            input = remaining_input;
+        }
+    }
+}
+
+/// Dropping the decoder marks the end of the input: a code point that is still
+/// incomplete at that point is reported as one replacement character.
+impl<F: FnMut(&str)> Drop for LossyDecoder<F> {
+    #[inline]
+    fn drop(&mut self) {
+        if self.incomplete.is_empty() {
+            return;
+        }
+        (self.push_str)(REPLACEMENT_CHARACTER)
+    }
+}
diff --git a/vendor/utf-8/src/read.rs b/vendor/utf-8/src/read.rs
new file mode 100644
index 000000000..5e38f54a1
--- /dev/null
+++ b/vendor/utf-8/src/read.rs
@@ -0,0 +1,167 @@
+use std::io::{self, BufRead};
+use std::error::Error;
+use std::fmt;
+use std::str;
+use super::*;
+
+/// Wraps a `std::io::BufRead` buffered byte stream and decodes it as UTF-8.
+pub struct BufReadDecoder<B: BufRead> {
+ buf_read: B, // the underlying buffered byte stream
+ bytes_consumed: usize, // bytes to `consume()` from `buf_read` before the next `fill_buf()`
+ incomplete: Incomplete, // bytes of a code point that straddles two `fill_buf()` buffers
+}
+
+#[derive(Debug)]
+pub enum BufReadDecoderError<'a> { // error type yielded by `BufReadDecoder::next_strict`
+ /// Represents one UTF-8 error in the byte stream. The payload is the invalid bytes.
+ ///
+ /// In lossy decoding, each such error should be replaced with U+FFFD.
+ /// (See `BufReadDecoder::next_lossy` and `BufReadDecoderError::lossy`.)
+ InvalidByteSequence(&'a [u8]),
+
+ /// An I/O error from the underlying byte stream
+ Io(io::Error),
+}
+
+impl<'a> BufReadDecoderError<'a> {
+    /// Replace UTF-8 errors with U+FFFD.
+    ///
+    /// I/O errors cannot be papered over and are passed through as `Err`.
+    pub fn lossy(self) -> Result<&'static str, io::Error> {
+        if let BufReadDecoderError::Io(error) = self {
+            return Err(error);
+        }
+        Ok(REPLACEMENT_CHARACTER)
+    }
+}
+
+/// Human-readable description; invalid bytes are shown as two-digit hexadecimal values.
+impl<'a> fmt::Display for BufReadDecoderError<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            BufReadDecoderError::Io(err) => {
+                write!(f, "underlying bytestream error: {}", err)
+            }
+            BufReadDecoderError::InvalidByteSequence(bytes) => {
+                write!(f, "invalid byte sequence: {:02x?}", bytes)
+            }
+        }
+    }
+}
+
+impl<'a> Error for BufReadDecoderError<'a> {
+    /// The wrapped I/O error, if any; an invalid byte sequence has no underlying cause.
+    fn source(&self) -> Option<&(dyn Error + 'static)> {
+        if let BufReadDecoderError::Io(ref err) = *self {
+            Some(err)
+        } else {
+            None
+        }
+    }
+}
+
+impl<B: BufRead> BufReadDecoder<B> {
+ /// This is to `Read::read_to_string` what `String::from_utf8_lossy` is to `String::from_utf8`.
+ pub fn read_to_string_lossy(buf_read: B) -> io::Result<String> {
+ let mut decoder = Self::new(buf_read);
+ let mut string = String::new();
+ while let Some(result) = decoder.next_lossy() { // `None` means EOF
+ string.push_str(result?) // an I/O error aborts the whole read
+ }
+ Ok(string)
+ }
+
+ pub fn new(buf_read: B) -> Self { // wraps `buf_read`; nothing consumed or buffered yet
+ Self {
+ buf_read,
+ bytes_consumed: 0,
+ incomplete: Incomplete::empty(),
+ }
+ }
+
+ /// Same as `BufReadDecoder::next_strict`, but replace UTF-8 errors with U+FFFD.
+ pub fn next_lossy(&mut self) -> Option<io::Result<&str>> {
+ self.next_strict().map(|result| result.or_else(|e| e.lossy()))
+ }
+
+ /// Decode and consume the next chunk of UTF-8 input.
+ ///
+ /// This method is intended to be called repeatedly until it returns `None`,
+ /// which represents EOF from the underlying byte stream.
+ /// This is similar to `Iterator::next`,
+ /// except that decoded chunks borrow the decoder (~iterator)
+ /// so they need to be handled or copied before the next chunk can start decoding.
+ pub fn next_strict(&mut self) -> Option<Result<&str, BufReadDecoderError>> {
+ enum BytesSource { // where the bytes of the chunk being returned live
+ BufRead(usize), // the first N bytes of the reader's current buffer
+ Incomplete, // the bytes accumulated in `self.incomplete`
+ }
+ macro_rules! try_io { // like `?`, but wraps the I/O error as `Some(Err(Io(..)))`
+ ($io_result: expr) => {
+ match $io_result {
+ Ok(value) => value,
+ Err(error) => return Some(Err(BufReadDecoderError::Io(error)))
+ }
+ }
+ }
+ let (source, result) = loop {
+ if self.bytes_consumed > 0 { // release bytes handed out or buffered by an earlier step
+ self.buf_read.consume(self.bytes_consumed);
+ self.bytes_consumed = 0;
+ }
+ let buf = try_io!(self.buf_read.fill_buf());
+
+ // Force loop iteration to go through an explicit `continue`
+ enum Unreachable {}
+ let _: Unreachable = if self.incomplete.is_empty() {
+ if buf.is_empty() {
+ return None // EOF
+ }
+ match str::from_utf8(buf) {
+ Ok(_) => {
+ break (BytesSource::BufRead(buf.len()), Ok(())) // the whole buffer is valid
+ }
+ Err(error) => {
+ let valid_up_to = error.valid_up_to();
+ if valid_up_to > 0 {
+ break (BytesSource::BufRead(valid_up_to), Ok(())) // return the valid prefix first
+ }
+ match error.error_len() {
+ Some(invalid_sequence_length) => {
+ break (BytesSource::BufRead(invalid_sequence_length), Err(())) // invalid sequence at the start
+ }
+ None => {
+ self.bytes_consumed = buf.len(); // the whole buffer is one truncated code point
+ self.incomplete = Incomplete::new(buf);
+ // need more input bytes
+ continue
+ }
+ }
+ }
+ }
+ } else {
+ if buf.is_empty() {
+ break (BytesSource::Incomplete, Err(())) // EOF with incomplete code point
+ }
+ let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf);
+ self.bytes_consumed = consumed; // only the bytes that were spliced into `incomplete`
+ match opt_result {
+ None => {
+ // need more input bytes
+ continue
+ }
+ Some(result) => {
+ break (BytesSource::Incomplete, result)
+ }
+ }
+ };
+ };
+ let bytes = match source {
+ BytesSource::BufRead(byte_count) => {
+ self.bytes_consumed = byte_count; // deferred: consumed at the top of the loop on the next call
+ let buf = try_io!(self.buf_read.fill_buf()); // NOTE(review): relies on `fill_buf` returning the same bytes again, since nothing was consumed in between
+ &buf[..byte_count]
+ }
+ BytesSource::Incomplete => {
+ self.incomplete.take_buffer()
+ }
+ };
+ match result {
+ Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })), // `Ok(())` is only produced above after a `str::from_utf8` check
+ Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))),
+ }
+ }
+}
diff --git a/vendor/utf-8/tests/unit.rs b/vendor/utf-8/tests/unit.rs
new file mode 100644
index 000000000..6839e84f2
--- /dev/null
+++ b/vendor/utf-8/tests/unit.rs
@@ -0,0 +1,197 @@
+extern crate utf8;
+
+use std::borrow::Cow;
+use std::collections::VecDeque;
+use std::io;
+use utf8::*;
+
+/// A re-implementation of std::str::from_utf8
+///
+/// On failure the error value is the length in bytes of the valid prefix.
+pub fn str_from_utf8(input: &[u8]) -> Result<&str, usize> {
+    decode(input).map_err(|error| match error {
+        DecodeError::Invalid { valid_prefix, .. } => valid_prefix.len(),
+        DecodeError::Incomplete { valid_prefix, .. } => valid_prefix.len(),
+    })
+}
+
+/// ASCII and multi-byte text round-trips; a stray 0xFF byte is rejected.
+#[test]
+fn test_str_from_utf8() {
+    assert_eq!(str_from_utf8(b"hello"), Ok("hello"));
+
+    let multilingual = "ศไทย中华Việt Nam";
+    assert_eq!(str_from_utf8(multilingual.as_bytes()), Ok("ศไทย中华Việt Nam"));
+
+    assert!(str_from_utf8(b"hello\xFF").is_err());
+}
+
+/// Table-driven check of which byte sequences `str_from_utf8` accepts and rejects.
+#[test]
+fn test_is_utf8() {
+    // Chars of 1, 2, 3, and 4 bytes
+    assert!(str_from_utf8("eé€\u{10000}".as_bytes()).is_ok());
+
+    let rejected: &[&[u8]] = &[
+        // invalid prefix
+        &[0x80],
+        // invalid 2 byte prefix
+        &[0xc0],
+        &[0xc0, 0x10],
+        // invalid 3 byte prefix
+        &[0xe0],
+        &[0xe0, 0x10],
+        &[0xe0, 0xff, 0x10],
+        // invalid 4 byte prefix
+        &[0xf0],
+        &[0xf0, 0x10],
+        &[0xf0, 0xff, 0x10],
+        &[0xf0, 0xff, 0xff, 0x10],
+        // deny overlong encodings
+        &[0xc0, 0x80],
+        &[0xc0, 0xae],
+        &[0xe0, 0x80, 0x80],
+        &[0xe0, 0x80, 0xaf],
+        &[0xe0, 0x81, 0x81],
+        &[0xf0, 0x82, 0x82, 0xac],
+        &[0xf4, 0x90, 0x80, 0x80],
+        // deny surrogates
+        &[0xED, 0xA0, 0x80],
+        &[0xED, 0xBF, 0xBF],
+    ];
+    for bytes in rejected {
+        assert!(str_from_utf8(bytes).is_err());
+    }
+
+    // Boundary code points of each encoded length.
+    let accepted: &[&[u8]] = &[
+        &[0xC2, 0x80],
+        &[0xDF, 0xBF],
+        &[0xE0, 0xA0, 0x80],
+        &[0xED, 0x9F, 0xBF],
+        &[0xEE, 0x80, 0x80],
+        &[0xEF, 0xBF, 0xBF],
+        &[0xF0, 0x90, 0x80, 0x80],
+        &[0xF4, 0x8F, 0xBF, 0xBF],
+    ];
+    for bytes in accepted {
+        assert!(str_from_utf8(bytes).is_ok());
+    }
+}
+
+/// A re-implementation of String::from_utf8_lossy
+///
+/// Borrows the input when it is entirely valid; otherwise builds an owned string in
+/// which every decoding error is replaced by `REPLACEMENT_CHARACTER`.
+pub fn string_from_utf8_lossy(input: &[u8]) -> Cow<str> {
+    let mut pending = match decode(input) {
+        Ok(s) => return Cow::Borrowed(s),
+        Err(error) => error,
+    };
+    let mut string = String::with_capacity(input.len() + REPLACEMENT_CHARACTER.len());
+    loop {
+        let rest = match pending {
+            // Truncated input at the very end counts as one final error.
+            DecodeError::Incomplete { valid_prefix, .. } => {
+                string.push_str(valid_prefix);
+                string.push_str(REPLACEMENT_CHARACTER);
+                return Cow::Owned(string);
+            }
+            DecodeError::Invalid { valid_prefix, remaining_input, .. } => {
+                string.push_str(valid_prefix);
+                string.push_str(REPLACEMENT_CHARACTER);
+                remaining_input
+            }
+        };
+        match decode(rest) {
+            Ok(s) => {
+                string.push_str(s);
+                return Cow::Owned(string);
+            }
+            Err(error) => pending = error,
+        }
+    }
+}
+
+pub const DECODED_LOSSY: &'static [(&'static [u8], &'static str)] = &[ // (input bytes, expected lossily decoded text) pairs used by the tests below
+ (b"hello", "hello"),
+ (b"\xe0\xb8\xa8\xe0\xb9\x84\xe0\xb8\x97\xe0\xb8\xa2\xe4\xb8\xad\xe5\x8d\x8e", "ศไทย中华"),
+ (b"Vi\xe1\xbb\x87t Nam", "Việt Nam"),
+ (b"Hello\xC2 There\xFF ", "Hello\u{FFFD} There\u{FFFD} "),
+ (b"Hello\xC0\x80 There", "Hello\u{FFFD}\u{FFFD} There"),
+ (b"\xE6\x83 Goodbye", "\u{FFFD} Goodbye"),
+ (b"\xF5foo\xF5\x80bar", "\u{FFFD}foo\u{FFFD}\u{FFFD}bar"),
+ (b"\xF5foo\xF5\xC2", "\u{FFFD}foo\u{FFFD}\u{FFFD}"),
+ (b"\xF1foo\xF1\x80bar\xF1\x80\x80baz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz"),
+ (b"\xF4foo\xF4\x80bar\xF4\xBFbaz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz"),
+ (b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar", "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar"),
+ (b"\xF0\x90\x80foo", "\u{FFFD}foo"),
+ // surrogates: each of the three bytes is expected to yield its own replacement character
+ (b"\xED\xA0\x80foo\xED\xBF\xBFbar", "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar"),
+];
+
+/// One-shot lossy decoding must match the expected text for every table entry.
+#[test]
+fn test_string_from_utf8_lossy() {
+    DECODED_LOSSY.iter().for_each(|&(input, expected)| {
+        assert_eq!(string_from_utf8_lossy(input), expected);
+    });
+}
+
+/// Call `f` once for every way of cutting `input` into consecutive non-empty chunks.
+///
+/// For an empty `input`, `f` is called exactly once with an empty chunk list.
+pub fn all_partitions<'a, F>(input: &'a [u8], f: F)
+    where F: Fn(&[&[u8]])
+{
+    /// Extend the chunks chosen so far (`prefix`) with every partition of `rest`.
+    fn recurse<'a, F>(prefix: &mut Vec<&'a [u8]>, rest: &'a [u8], visit: &F)
+        where F: Fn(&[&[u8]])
+    {
+        if rest.is_empty() {
+            visit(prefix);
+            return;
+        }
+        for cut in 1..=rest.len() {
+            let (head, tail) = rest.split_at(cut);
+            prefix.push(head);
+            recurse(prefix, tail, visit);
+            prefix.pop();
+        }
+    }
+
+    let mut stack = Vec::new();
+    recurse(&mut stack, input, &f);
+    // Every push is paired with a pop, so nothing may be left over.
+    assert_eq!(stack.len(), 0);
+}
+
+/// Feeding the input to `LossyDecoder` in any chunking must give the same text
+/// as decoding it in one piece.
+#[test]
+fn test_incremental_decoder() {
+    for &(input, expected) in DECODED_LOSSY {
+        all_partitions(input, |chunks| {
+            let mut string = String::new();
+            {
+                let mut decoder = LossyDecoder::new(|s| string.push_str(s));
+                chunks.iter().for_each(|chunk| decoder.feed(chunk));
+                // `decoder` is dropped at the end of this scope, which flushes a
+                // trailing incomplete sequence as a replacement character.
+            }
+            assert_eq!(string, expected);
+        });
+    }
+}
+
+/// Reading the input through `BufReadDecoder` with any buffer boundaries must give
+/// the same text as decoding it in one piece.
+#[test]
+fn test_bufread_decoder() {
+    for &(input, expected) in DECODED_LOSSY {
+        all_partitions(input, |chunks| {
+            let queue: VecDeque<&[u8]> = chunks.iter().cloned().collect();
+            let string = BufReadDecoder::read_to_string_lossy(Chunks(queue)).unwrap();
+            assert_eq!(string, expected)
+        });
+    }
+}
+
+/// A `BufRead` test double that hands out a fixed sequence of byte chunks,
+/// one chunk per internal "buffer".
+///
+/// The last chunk is never popped: once it has been fully consumed it stays in the
+/// queue as an empty slice, which is how EOF is reported. An empty queue also reads as EOF.
+struct Chunks<'a>(VecDeque<&'a [u8]>);
+
+impl<'a> io::Read for Chunks<'a> {
+    /// Copy as much of the current chunk as fits into `out`; returns 0 at EOF.
+    fn read(&mut self, out: &mut [u8]) -> io::Result<usize> {
+        let copied = {
+            let available = io::BufRead::fill_buf(self)?;
+            let copied = available.len().min(out.len());
+            out[..copied].copy_from_slice(&available[..copied]);
+            copied
+        };
+        io::BufRead::consume(self, copied);
+        Ok(copied)
+    }
+}
+
+impl<'a> io::BufRead for Chunks<'a> {
+    /// Return the unconsumed part of the current chunk (empty at EOF).
+    fn fill_buf(&mut self) -> io::Result<&[u8]> {
+        match self.0.front() {
+            Some(chunk) => Ok(*chunk),
+            // No chunks at all: report EOF instead of panicking.
+            None => Ok(&[]),
+        }
+    }
+
+    /// Drop `bytes` bytes from the front of the current chunk and advance to the
+    /// next chunk once the current one is exhausted.
+    fn consume(&mut self, bytes: usize) {
+        let exhausted = match self.0.front_mut() {
+            Some(front) => {
+                *front = &front[bytes..];
+                front.is_empty()
+            }
+            None => return,
+        };
+        // Keep the final (now empty) chunk in place so `fill_buf` signals EOF.
+        if exhausted && self.0.len() > 1 {
+            self.0.pop_front();
+        }
+    }
+}