summaryrefslogtreecommitdiffstats
path: root/library/core/src/str/lossy.rs
diff options
context:
space:
mode:
Diffstat (limited to 'library/core/src/str/lossy.rs')
-rw-r--r--library/core/src/str/lossy.rs244
1 files changed, 157 insertions, 87 deletions
diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs
index 6ec1c9390..59f873d12 100644
--- a/library/core/src/str/lossy.rs
+++ b/library/core/src/str/lossy.rs
@@ -1,51 +1,170 @@
-use crate::char;
-use crate::fmt::{self, Write};
-use crate::mem;
+use crate::fmt;
+use crate::fmt::Formatter;
+use crate::fmt::Write;
+use crate::iter::FusedIterator;
use super::from_utf8_unchecked;
use super::validations::utf8_char_width;
-/// Lossy UTF-8 string.
-#[unstable(feature = "str_internals", issue = "none")]
-pub struct Utf8Lossy {
- bytes: [u8],
+/// An item returned by the [`Utf8Chunks`] iterator.
+///
+/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
+/// when decoding a UTF-8 string.
+///
+/// # Examples
+///
+/// ```
+/// #![feature(utf8_chunks)]
+///
+/// use std::str::Utf8Chunks;
+///
+/// // An invalid UTF-8 string
+/// let bytes = b"foo\xF1\x80bar";
+///
+/// // Decode the first `Utf8Chunk`
+/// let chunk = Utf8Chunks::new(bytes).next().unwrap();
+///
+/// // The first three characters are valid UTF-8
+/// assert_eq!("foo", chunk.valid());
+///
+/// // The fourth character is broken
+/// assert_eq!(b"\xF1\x80", chunk.invalid());
+/// ```
+#[unstable(feature = "utf8_chunks", issue = "99543")]
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct Utf8Chunk<'a> {
+ valid: &'a str,
+ invalid: &'a [u8],
}
-impl Utf8Lossy {
+impl<'a> Utf8Chunk<'a> {
+ /// Returns the next validated UTF-8 substring.
+ ///
+ /// This substring can be empty at the start of the string or between
+ /// broken UTF-8 characters.
#[must_use]
- pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
- // SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required.
- unsafe { mem::transmute(bytes) }
+ #[unstable(feature = "utf8_chunks", issue = "99543")]
+ pub fn valid(&self) -> &'a str {
+ self.valid
}
- pub fn chunks(&self) -> Utf8LossyChunksIter<'_> {
- Utf8LossyChunksIter { source: &self.bytes }
+ /// Returns the invalid sequence that caused a failure.
+ ///
+ /// The returned slice will have a maximum length of 3 and starts after the
+ /// substring given by [`valid`]. Decoding will resume after this sequence.
+ ///
+ /// If empty, this is the last chunk in the string. If non-empty, an
+ /// unexpected byte was encountered or the end of the input was reached
+ /// unexpectedly.
+ ///
+ /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT
+ /// CHARACTER`].
+ ///
+ /// [`valid`]: Self::valid
+ /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
+ #[must_use]
+ #[unstable(feature = "utf8_chunks", issue = "99543")]
+ pub fn invalid(&self) -> &'a [u8] {
+ self.invalid
}
}
-/// Iterator over lossy UTF-8 string
-#[must_use = "iterators are lazy and do nothing unless consumed"]
+#[must_use]
+#[unstable(feature = "str_internals", issue = "none")]
+pub struct Debug<'a>(&'a [u8]);
+
#[unstable(feature = "str_internals", issue = "none")]
-#[allow(missing_debug_implementations)]
-pub struct Utf8LossyChunksIter<'a> {
+impl fmt::Debug for Debug<'_> {
+ fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+ f.write_char('"')?;
+
+ for chunk in Utf8Chunks::new(self.0) {
+ // Valid part.
+ // Here we partially parse UTF-8 again which is suboptimal.
+ {
+ let valid = chunk.valid();
+ let mut from = 0;
+ for (i, c) in valid.char_indices() {
+ let esc = c.escape_debug();
+ // If char needs escaping, flush backlog so far and write, else skip
+ if esc.len() != 1 {
+ f.write_str(&valid[from..i])?;
+ for c in esc {
+ f.write_char(c)?;
+ }
+ from = i + c.len_utf8();
+ }
+ }
+ f.write_str(&valid[from..])?;
+ }
+
+ // Broken parts of string as hex escape.
+ for &b in chunk.invalid() {
+ write!(f, "\\x{:02X}", b)?;
+ }
+ }
+
+ f.write_char('"')
+ }
+}
+
+/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices
+/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]).
+///
+/// If you want a simple conversion from UTF-8 byte slices to string slices,
+/// [`from_utf8`] is easier to use.
+///
+/// [byteslice]: slice
+/// [`from_utf8`]: super::from_utf8
+///
+/// # Examples
+///
+/// This can be used to create functionality similar to
+/// [`String::from_utf8_lossy`] without allocating heap memory:
+///
+/// ```
+/// #![feature(utf8_chunks)]
+///
+/// use std::str::Utf8Chunks;
+///
+/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
+/// for chunk in Utf8Chunks::new(input) {
+/// push(chunk.valid());
+///
+/// if !chunk.invalid().is_empty() {
+/// push("\u{FFFD}");
+/// }
+/// }
+/// }
+/// ```
+///
+/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
+#[must_use = "iterators are lazy and do nothing unless consumed"]
+#[unstable(feature = "utf8_chunks", issue = "99543")]
+#[derive(Clone)]
+pub struct Utf8Chunks<'a> {
source: &'a [u8],
}
-#[unstable(feature = "str_internals", issue = "none")]
-#[derive(PartialEq, Eq, Debug)]
-pub struct Utf8LossyChunk<'a> {
- /// Sequence of valid chars.
- /// Can be empty between broken UTF-8 chars.
- pub valid: &'a str,
- /// Single broken char, empty if none.
- /// Empty iff iterator item is last.
- pub broken: &'a [u8],
+impl<'a> Utf8Chunks<'a> {
+ /// Creates a new iterator to decode the bytes.
+ #[unstable(feature = "utf8_chunks", issue = "99543")]
+ pub fn new(bytes: &'a [u8]) -> Self {
+ Self { source: bytes }
+ }
+
+ #[doc(hidden)]
+ #[unstable(feature = "str_internals", issue = "none")]
+ pub fn debug(&self) -> Debug<'_> {
+ Debug(self.source)
+ }
}
-impl<'a> Iterator for Utf8LossyChunksIter<'a> {
- type Item = Utf8LossyChunk<'a>;
+#[unstable(feature = "utf8_chunks", issue = "99543")]
+impl<'a> Iterator for Utf8Chunks<'a> {
+ type Item = Utf8Chunk<'a>;
- fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
+ fn next(&mut self) -> Option<Utf8Chunk<'a>> {
if self.source.is_empty() {
return None;
}
@@ -130,71 +249,22 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
// SAFETY: `valid_up_to <= i` because it is only ever assigned via
// `valid_up_to = i` and `i` only increases.
- let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) };
+ let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) };
- Some(Utf8LossyChunk {
+ Some(Utf8Chunk {
// SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
valid: unsafe { from_utf8_unchecked(valid) },
- broken,
+ invalid,
})
}
}
-impl fmt::Display for Utf8Lossy {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- // If we're the empty string then our iterator won't actually yield
- // anything, so perform the formatting manually
- if self.bytes.is_empty() {
- return "".fmt(f);
- }
-
- for Utf8LossyChunk { valid, broken } in self.chunks() {
- // If we successfully decoded the whole chunk as a valid string then
- // we can return a direct formatting of the string which will also
- // respect various formatting flags if possible.
- if valid.len() == self.bytes.len() {
- assert!(broken.is_empty());
- return valid.fmt(f);
- }
-
- f.write_str(valid)?;
- if !broken.is_empty() {
- f.write_char(char::REPLACEMENT_CHARACTER)?;
- }
- }
- Ok(())
- }
-}
-
-impl fmt::Debug for Utf8Lossy {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- f.write_char('"')?;
+#[unstable(feature = "utf8_chunks", issue = "99543")]
+impl FusedIterator for Utf8Chunks<'_> {}
- for Utf8LossyChunk { valid, broken } in self.chunks() {
- // Valid part.
- // Here we partially parse UTF-8 again which is suboptimal.
- {
- let mut from = 0;
- for (i, c) in valid.char_indices() {
- let esc = c.escape_debug();
- // If char needs escaping, flush backlog so far and write, else skip
- if esc.len() != 1 {
- f.write_str(&valid[from..i])?;
- for c in esc {
- f.write_char(c)?;
- }
- from = i + c.len_utf8();
- }
- }
- f.write_str(&valid[from..])?;
- }
-
- // Broken parts of string as hex escape.
- for &b in broken {
- write!(f, "\\x{:02x}", b)?;
- }
- }
-
- f.write_char('"')
+#[unstable(feature = "utf8_chunks", issue = "99543")]
+impl fmt::Debug for Utf8Chunks<'_> {
+ fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+ f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()
}
}