From 246f239d9f40f633160f0c18f87a20922d4e77bb Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:06:37 +0200 Subject: Merging debian version 1.65.0+dfsg1-2. Signed-off-by: Daniel Baumann --- library/core/src/str/lossy.rs | 244 +++++++++++++++++++++++++++--------------- 1 file changed, 157 insertions(+), 87 deletions(-) (limited to 'library/core/src/str/lossy.rs') diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs index 6ec1c9390..59f873d12 100644 --- a/library/core/src/str/lossy.rs +++ b/library/core/src/str/lossy.rs @@ -1,51 +1,170 @@ -use crate::char; -use crate::fmt::{self, Write}; -use crate::mem; +use crate::fmt; +use crate::fmt::Formatter; +use crate::fmt::Write; +use crate::iter::FusedIterator; use super::from_utf8_unchecked; use super::validations::utf8_char_width; -/// Lossy UTF-8 string. -#[unstable(feature = "str_internals", issue = "none")] -pub struct Utf8Lossy { - bytes: [u8], +/// An item returned by the [`Utf8Chunks`] iterator. +/// +/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character +/// when decoding a UTF-8 string. +/// +/// # Examples +/// +/// ``` +/// #![feature(utf8_chunks)] +/// +/// use std::str::Utf8Chunks; +/// +/// // An invalid UTF-8 string +/// let bytes = b"foo\xF1\x80bar"; +/// +/// // Decode the first `Utf8Chunk` +/// let chunk = Utf8Chunks::new(bytes).next().unwrap(); +/// +/// // The first three characters are valid UTF-8 +/// assert_eq!("foo", chunk.valid()); +/// +/// // The fourth character is broken +/// assert_eq!(b"\xF1\x80", chunk.invalid()); +/// ``` +#[unstable(feature = "utf8_chunks", issue = "99543")] +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct Utf8Chunk<'a> { + valid: &'a str, + invalid: &'a [u8], } -impl Utf8Lossy { +impl<'a> Utf8Chunk<'a> { + /// Returns the next validated UTF-8 substring. + /// + /// This substring can be empty at the start of the string or between + /// broken UTF-8 characters. #[must_use] - pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy { - // SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required. - unsafe { mem::transmute(bytes) } + #[unstable(feature = "utf8_chunks", issue = "99543")] + pub fn valid(&self) -> &'a str { + self.valid } - pub fn chunks(&self) -> Utf8LossyChunksIter<'_> { - Utf8LossyChunksIter { source: &self.bytes } + /// Returns the invalid sequence that caused a failure. + /// + /// The returned slice will have a maximum length of 3 and starts after the + /// substring given by [`valid`]. Decoding will resume after this sequence. + /// + /// If empty, this is the last chunk in the string. If non-empty, an + /// unexpected byte was encountered or the end of the input was reached + /// unexpectedly. + /// + /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT + /// CHARACTER`]. + /// + /// [`valid`]: Self::valid + /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER + #[must_use] + #[unstable(feature = "utf8_chunks", issue = "99543")] + pub fn invalid(&self) -> &'a [u8] { + self.invalid } } -/// Iterator over lossy UTF-8 string -#[must_use = "iterators are lazy and do nothing unless consumed"] +#[must_use] +#[unstable(feature = "str_internals", issue = "none")] +pub struct Debug<'a>(&'a [u8]); + #[unstable(feature = "str_internals", issue = "none")] -#[allow(missing_debug_implementations)] -pub struct Utf8LossyChunksIter<'a> { +impl fmt::Debug for Debug<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.write_char('"')?; + + for chunk in Utf8Chunks::new(self.0) { + // Valid part. + // Here we partially parse UTF-8 again which is suboptimal. + { + let valid = chunk.valid(); + let mut from = 0; + for (i, c) in valid.char_indices() { + let esc = c.escape_debug(); + // If char needs escaping, flush backlog so far and write, else skip + if esc.len() != 1 { + f.write_str(&valid[from..i])?; + for c in esc { + f.write_char(c)?; + } + from = i + c.len_utf8(); + } + } + f.write_str(&valid[from..])?; + } + + // Broken parts of string as hex escape. + for &b in chunk.invalid() { + write!(f, "\\x{:02X}", b)?; + } + } + + f.write_char('"') + } +} + +/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices +/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]). +/// +/// If you want a simple conversion from UTF-8 byte slices to string slices, +/// [`from_utf8`] is easier to use. +/// +/// [byteslice]: slice +/// [`from_utf8`]: super::from_utf8 +/// +/// # Examples +/// +/// This can be used to create functionality similar to +/// [`String::from_utf8_lossy`] without allocating heap memory: +/// +/// ``` +/// #![feature(utf8_chunks)] +/// +/// use std::str::Utf8Chunks; +/// +/// fn from_utf8_lossy(input: &[u8], mut push: F) where F: FnMut(&str) { +/// for chunk in Utf8Chunks::new(input) { +/// push(chunk.valid()); +/// +/// if !chunk.invalid().is_empty() { +/// push("\u{FFFD}"); +/// } +/// } +/// } +/// ``` +/// +/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy +#[must_use = "iterators are lazy and do nothing unless consumed"] +#[unstable(feature = "utf8_chunks", issue = "99543")] +#[derive(Clone)] +pub struct Utf8Chunks<'a> { source: &'a [u8], } -#[unstable(feature = "str_internals", issue = "none")] -#[derive(PartialEq, Eq, Debug)] -pub struct Utf8LossyChunk<'a> { - /// Sequence of valid chars. - /// Can be empty between broken UTF-8 chars. - pub valid: &'a str, - /// Single broken char, empty if none. - /// Empty iff iterator item is last. - pub broken: &'a [u8], +impl<'a> Utf8Chunks<'a> { + /// Creates a new iterator to decode the bytes. + #[unstable(feature = "utf8_chunks", issue = "99543")] + pub fn new(bytes: &'a [u8]) -> Self { + Self { source: bytes } + } + + #[doc(hidden)] + #[unstable(feature = "str_internals", issue = "none")] + pub fn debug(&self) -> Debug<'_> { + Debug(self.source) + } } -impl<'a> Iterator for Utf8LossyChunksIter<'a> { - type Item = Utf8LossyChunk<'a>; +#[unstable(feature = "utf8_chunks", issue = "99543")] +impl<'a> Iterator for Utf8Chunks<'a> { + type Item = Utf8Chunk<'a>; - fn next(&mut self) -> Option> { + fn next(&mut self) -> Option> { if self.source.is_empty() { return None; } @@ -130,71 +249,22 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> { // SAFETY: `valid_up_to <= i` because it is only ever assigned via // `valid_up_to = i` and `i` only increases. - let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) }; + let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) }; - Some(Utf8LossyChunk { + Some(Utf8Chunk { // SAFETY: All bytes up to `valid_up_to` are valid UTF-8. valid: unsafe { from_utf8_unchecked(valid) }, - broken, + invalid, }) } } -impl fmt::Display for Utf8Lossy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - // If we're the empty string then our iterator won't actually yield - // anything, so perform the formatting manually - if self.bytes.is_empty() { - return "".fmt(f); - } - - for Utf8LossyChunk { valid, broken } in self.chunks() { - // If we successfully decoded the whole chunk as a valid string then - // we can return a direct formatting of the string which will also - // respect various formatting flags if possible. - if valid.len() == self.bytes.len() { - assert!(broken.is_empty()); - return valid.fmt(f); - } - - f.write_str(valid)?; - if !broken.is_empty() { - f.write_char(char::REPLACEMENT_CHARACTER)?; - } - } - Ok(()) - } -} - -impl fmt::Debug for Utf8Lossy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_char('"')?; +#[unstable(feature = "utf8_chunks", issue = "99543")] +impl FusedIterator for Utf8Chunks<'_> {} - for Utf8LossyChunk { valid, broken } in self.chunks() { - // Valid part. - // Here we partially parse UTF-8 again which is suboptimal. - { - let mut from = 0; - for (i, c) in valid.char_indices() { - let esc = c.escape_debug(); - // If char needs escaping, flush backlog so far and write, else skip - if esc.len() != 1 { - f.write_str(&valid[from..i])?; - for c in esc { - f.write_char(c)?; - } - from = i + c.len_utf8(); - } - } - f.write_str(&valid[from..])?; - } - - // Broken parts of string as hex escape. - for &b in broken { - write!(f, "\\x{:02x}", b)?; - } - } - - f.write_char('"') +#[unstable(feature = "utf8_chunks", issue = "99543")] +impl fmt::Debug for Utf8Chunks<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish() } } -- cgit v1.2.3