4 files changed, 189 insertions, 93 deletions
diff --git a/library/core/src/str/error.rs b/library/core/src/str/error.rs
index 4e569fcc8..343889b69 100644
--- a/library/core/src/str/error.rs
+++ b/library/core/src/str/error.rs
@@ -1,5 +1,7 @@
 //! Defines utf8 error type.
 
+#[cfg(not(bootstrap))]
+use crate::error::Error;
 use crate::fmt;
 
 /// Errors which can occur when attempting to interpret a sequence of [`u8`]
@@ -122,6 +124,15 @@ impl fmt::Display for Utf8Error {
     }
 }
 
+#[cfg(not(bootstrap))]
+#[stable(feature = "rust1", since = "1.0.0")]
+impl Error for Utf8Error {
+    #[allow(deprecated)]
+    fn description(&self) -> &str {
+        "invalid utf-8: corrupt contents"
+    }
+}
+
 /// An error returned when parsing a `bool` using [`from_str`] fails
 ///
 /// [`from_str`]: super::FromStr::from_str
@@ -136,3 +147,12 @@ impl fmt::Display for ParseBoolError {
         "provided string was not `true` or `false`".fmt(f)
     }
 }
+
+#[cfg(not(bootstrap))]
+#[stable(feature = "rust1", since = "1.0.0")]
+impl Error for ParseBoolError {
+    #[allow(deprecated)]
+    fn description(&self) -> &str {
+        "failed to parse bool"
+    }
+}
diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs
index 6ec1c9390..59f873d12 100644
--- a/library/core/src/str/lossy.rs
+++ b/library/core/src/str/lossy.rs
@@ -1,51 +1,170 @@
-use crate::char;
-use crate::fmt::{self, Write};
-use crate::mem;
+use crate::fmt;
+use crate::fmt::Formatter;
+use crate::fmt::Write;
+use crate::iter::FusedIterator;
 
 use super::from_utf8_unchecked;
 use super::validations::utf8_char_width;
 
-/// Lossy UTF-8 string.
-#[unstable(feature = "str_internals", issue = "none")]
-pub struct Utf8Lossy {
-    bytes: [u8],
+/// An item returned by the [`Utf8Chunks`] iterator.
+///
+/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
+/// when decoding a UTF-8 string.
+///
+/// # Examples
+///
+/// ```
+/// #![feature(utf8_chunks)]
+///
+/// use std::str::Utf8Chunks;
+///
+/// // An invalid UTF-8 string
+/// let bytes = b"foo\xF1\x80bar";
+///
+/// // Decode the first `Utf8Chunk`
+/// let chunk = Utf8Chunks::new(bytes).next().unwrap();
+///
+/// // The first three characters are valid UTF-8
+/// assert_eq!("foo", chunk.valid());
+///
+/// // The fourth character is broken
+/// assert_eq!(b"\xF1\x80", chunk.invalid());
+/// ```
+#[unstable(feature = "utf8_chunks", issue = "99543")]
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct Utf8Chunk<'a> {
+    valid: &'a str,
+    invalid: &'a [u8],
 }
 
-impl Utf8Lossy {
+impl<'a> Utf8Chunk<'a> {
+    /// Returns the next validated UTF-8 substring.
+    ///
+    /// This substring can be empty at the start of the string or between
+    /// broken UTF-8 characters.
     #[must_use]
-    pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
-        // SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required.
-        unsafe { mem::transmute(bytes) }
+    #[unstable(feature = "utf8_chunks", issue = "99543")]
+    pub fn valid(&self) -> &'a str {
+        self.valid
     }
 
-    pub fn chunks(&self) -> Utf8LossyChunksIter<'_> {
-        Utf8LossyChunksIter { source: &self.bytes }
+    /// Returns the invalid sequence that caused a failure.
+    ///
+    /// The returned slice will have a maximum length of 3 and starts after the
+    /// substring given by [`valid`]. Decoding will resume after this sequence.
+    ///
+    /// If empty, this is the last chunk in the string. If non-empty, an
+    /// unexpected byte was encountered or the end of the input was reached
+    /// unexpectedly.
+    ///
+    /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT
+    /// CHARACTER`].
+    ///
+    /// [`valid`]: Self::valid
+    /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
+    #[must_use]
+    #[unstable(feature = "utf8_chunks", issue = "99543")]
+    pub fn invalid(&self) -> &'a [u8] {
+        self.invalid
     }
 }
 
-/// Iterator over lossy UTF-8 string
-#[must_use = "iterators are lazy and do nothing unless consumed"]
+#[must_use]
+#[unstable(feature = "str_internals", issue = "none")]
+pub struct Debug<'a>(&'a [u8]);
+
 #[unstable(feature = "str_internals", issue = "none")]
-#[allow(missing_debug_implementations)]
-pub struct Utf8LossyChunksIter<'a> {
+impl fmt::Debug for Debug<'_> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        f.write_char('"')?;
+
+        for chunk in Utf8Chunks::new(self.0) {
+            // Valid part.
+            // Here we partially parse UTF-8 again which is suboptimal.
+            {
+                let valid = chunk.valid();
+                let mut from = 0;
+                for (i, c) in valid.char_indices() {
+                    let esc = c.escape_debug();
+                    // If char needs escaping, flush backlog so far and write, else skip
+                    if esc.len() != 1 {
+                        f.write_str(&valid[from..i])?;
+                        for c in esc {
+                            f.write_char(c)?;
+                        }
+                        from = i + c.len_utf8();
+                    }
+                }
+                f.write_str(&valid[from..])?;
+            }
+
+            // Broken parts of string as hex escape.
+            for &b in chunk.invalid() {
+                write!(f, "\\x{:02X}", b)?;
+            }
+        }
+
+        f.write_char('"')
+    }
+}
+
+/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices
+/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]).
+///
+/// If you want a simple conversion from UTF-8 byte slices to string slices,
+/// [`from_utf8`] is easier to use.
+///
+/// [byteslice]: slice
+/// [`from_utf8`]: super::from_utf8
+///
+/// # Examples
+///
+/// This can be used to create functionality similar to
+/// [`String::from_utf8_lossy`] without allocating heap memory:
+///
+/// ```
+/// #![feature(utf8_chunks)]
+///
+/// use std::str::Utf8Chunks;
+///
+/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
+///     for chunk in Utf8Chunks::new(input) {
+///         push(chunk.valid());
+///
+///         if !chunk.invalid().is_empty() {
+///             push("\u{FFFD}");
+///         }
+///     }
+/// }
+/// ```
+///
+/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
+#[must_use = "iterators are lazy and do nothing unless consumed"]
+#[unstable(feature = "utf8_chunks", issue = "99543")]
+#[derive(Clone)]
+pub struct Utf8Chunks<'a> {
     source: &'a [u8],
 }
 
-#[unstable(feature = "str_internals", issue = "none")]
-#[derive(PartialEq, Eq, Debug)]
-pub struct Utf8LossyChunk<'a> {
-    /// Sequence of valid chars.
-    /// Can be empty between broken UTF-8 chars.
-    pub valid: &'a str,
-    /// Single broken char, empty if none.
-    /// Empty iff iterator item is last.
-    pub broken: &'a [u8],
+impl<'a> Utf8Chunks<'a> {
+    /// Creates a new iterator to decode the bytes.
+    #[unstable(feature = "utf8_chunks", issue = "99543")]
+    pub fn new(bytes: &'a [u8]) -> Self {
+        Self { source: bytes }
+    }
+
+    #[doc(hidden)]
+    #[unstable(feature = "str_internals", issue = "none")]
+    pub fn debug(&self) -> Debug<'_> {
+        Debug(self.source)
+    }
 }
 
-impl<'a> Iterator for Utf8LossyChunksIter<'a> {
-    type Item = Utf8LossyChunk<'a>;
+#[unstable(feature = "utf8_chunks", issue = "99543")]
+impl<'a> Iterator for Utf8Chunks<'a> {
+    type Item = Utf8Chunk<'a>;
 
-    fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
+    fn next(&mut self) -> Option<Utf8Chunk<'a>> {
         if self.source.is_empty() {
             return None;
         }
@@ -130,71 +249,22 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
 
         // SAFETY: `valid_up_to <= i` because it is only ever assigned via
         // `valid_up_to = i` and `i` only increases.
-        let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) };
+        let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) };
 
-        Some(Utf8LossyChunk {
+        Some(Utf8Chunk {
             // SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
             valid: unsafe { from_utf8_unchecked(valid) },
-            broken,
+            invalid,
         })
     }
 }
 
-impl fmt::Display for Utf8Lossy {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        // If we're the empty string then our iterator won't actually yield
-        // anything, so perform the formatting manually
-        if self.bytes.is_empty() {
-            return "".fmt(f);
-        }
-
-        for Utf8LossyChunk { valid, broken } in self.chunks() {
-            // If we successfully decoded the whole chunk as a valid string then
-            // we can return a direct formatting of the string which will also
-            // respect various formatting flags if possible.
-            if valid.len() == self.bytes.len() {
-                assert!(broken.is_empty());
-                return valid.fmt(f);
-            }
-
-            f.write_str(valid)?;
-            if !broken.is_empty() {
-                f.write_char(char::REPLACEMENT_CHARACTER)?;
-            }
-        }
-        Ok(())
-    }
-}
-
-impl fmt::Debug for Utf8Lossy {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.write_char('"')?;
+#[unstable(feature = "utf8_chunks", issue = "99543")]
+impl FusedIterator for Utf8Chunks<'_> {}
 
-        for Utf8LossyChunk { valid, broken } in self.chunks() {
-            // Valid part.
-            // Here we partially parse UTF-8 again which is suboptimal.
-            {
-                let mut from = 0;
-                for (i, c) in valid.char_indices() {
-                    let esc = c.escape_debug();
-                    // If char needs escaping, flush backlog so far and write, else skip
-                    if esc.len() != 1 {
-                        f.write_str(&valid[from..i])?;
-                        for c in esc {
-                            f.write_char(c)?;
-                        }
-                        from = i + c.len_utf8();
-                    }
-                }
-                f.write_str(&valid[from..])?;
-            }
-
-            // Broken parts of string as hex escape.
-            for &b in broken {
-                write!(f, "\\x{:02x}", b)?;
-            }
-        }
-
-        f.write_char('"')
+#[unstable(feature = "utf8_chunks", issue = "99543")]
+impl fmt::Debug for Utf8Chunks<'_> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()
     }
 }
diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs
index c4f2e283e..f673aa2a4 100644
--- a/library/core/src/str/mod.rs
+++ b/library/core/src/str/mod.rs
@@ -22,9 +22,9 @@ use crate::slice::{self, SliceIndex};
 
 pub mod pattern;
 
-#[unstable(feature = "str_internals", issue = "none")]
-#[allow(missing_docs)]
-pub mod lossy;
+mod lossy;
+#[unstable(feature = "utf8_chunks", issue = "99543")]
+pub use lossy::{Utf8Chunk, Utf8Chunks};
 
 #[stable(feature = "rust1", since = "1.0.0")]
 pub use converts::{from_utf8, from_utf8_unchecked};
@@ -91,10 +91,12 @@ const fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! {
     }
 }
 
+#[track_caller]
 const fn slice_error_fail_ct(_: &str, _: usize, _: usize) -> ! {
     panic!("failed to slice string");
 }
 
+#[track_caller]
 fn slice_error_fail_rt(s: &str, begin: usize, end: usize) -> ! {
     const MAX_DISPLAY_LENGTH: usize = 256;
     let trunc_len = s.floor_char_boundary(MAX_DISPLAY_LENGTH);
@@ -2353,7 +2355,7 @@ impl str {
     #[inline]
     pub fn is_ascii(&self) -> bool {
         // We can treat each byte as character here: all multibyte characters
-        // start with a byte that is not in the ascii range, so we will stop
+        // start with a byte that is not in the ASCII range, so we will stop
         // there already.
         self.as_bytes().is_ascii()
     }
@@ -2638,3 +2640,7 @@ impl_fn_for_zst! {
         unsafe { from_utf8_unchecked(bytes) }
     };
 }
+
+#[stable(feature = "rust1", since = "1.0.0")]
+#[cfg(not(bootstrap))]
+impl !crate::error::Error for &str {}
diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs
index 04bc66523..2acef432f 100644
--- a/library/core/src/str/validations.rs
+++ b/library/core/src/str/validations.rs
@@ -216,12 +216,12 @@ pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
                     // SAFETY: since `align - index` and `ascii_block_size` are
                     // multiples of `usize_bytes`, `block = ptr.add(index)` is
                     // always aligned with a `usize` so it's safe to dereference
-                    // both `block` and `block.offset(1)`.
+                    // both `block` and `block.add(1)`.
                     unsafe {
                         let block = ptr.add(index) as *const usize;
                         // break if there is a nonascii byte
                         let zu = contains_nonascii(*block);
-                        let zv = contains_nonascii(*block.offset(1));
+                        let zv = contains_nonascii(*block.add(1));
                         if zu || zv {
                             break;
                         }