diff options
Diffstat (limited to 'library/core/src/str')
-rw-r--r-- | library/core/src/str/error.rs | 20 | ||||
-rw-r--r-- | library/core/src/str/lossy.rs | 244 | ||||
-rw-r--r-- | library/core/src/str/mod.rs | 14 | ||||
-rw-r--r-- | library/core/src/str/validations.rs | 4 |
4 files changed, 189 insertions, 93 deletions
diff --git a/library/core/src/str/error.rs b/library/core/src/str/error.rs index 4e569fcc8..343889b69 100644 --- a/library/core/src/str/error.rs +++ b/library/core/src/str/error.rs @@ -1,5 +1,7 @@ //! Defines utf8 error type. +#[cfg(not(bootstrap))] +use crate::error::Error; use crate::fmt; /// Errors which can occur when attempting to interpret a sequence of [`u8`] @@ -122,6 +124,15 @@ impl fmt::Display for Utf8Error { } } +#[cfg(not(bootstrap))] +#[stable(feature = "rust1", since = "1.0.0")] +impl Error for Utf8Error { + #[allow(deprecated)] + fn description(&self) -> &str { + "invalid utf-8: corrupt contents" + } +} + /// An error returned when parsing a `bool` using [`from_str`] fails /// /// [`from_str`]: super::FromStr::from_str @@ -136,3 +147,12 @@ impl fmt::Display for ParseBoolError { "provided string was not `true` or `false`".fmt(f) } } + +#[cfg(not(bootstrap))] +#[stable(feature = "rust1", since = "1.0.0")] +impl Error for ParseBoolError { + #[allow(deprecated)] + fn description(&self) -> &str { + "failed to parse bool" + } +} diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs index 6ec1c9390..59f873d12 100644 --- a/library/core/src/str/lossy.rs +++ b/library/core/src/str/lossy.rs @@ -1,51 +1,170 @@ -use crate::char; -use crate::fmt::{self, Write}; -use crate::mem; +use crate::fmt; +use crate::fmt::Formatter; +use crate::fmt::Write; +use crate::iter::FusedIterator; use super::from_utf8_unchecked; use super::validations::utf8_char_width; -/// Lossy UTF-8 string. -#[unstable(feature = "str_internals", issue = "none")] -pub struct Utf8Lossy { - bytes: [u8], +/// An item returned by the [`Utf8Chunks`] iterator. +/// +/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character +/// when decoding a UTF-8 string. +/// +/// # Examples +/// +/// ``` +/// #![feature(utf8_chunks)] +/// +/// use std::str::Utf8Chunks; +/// +/// // An invalid UTF-8 string +/// let bytes = b"foo\xF1\x80bar"; +/// +/// // Decode the first `Utf8Chunk` +/// let chunk = Utf8Chunks::new(bytes).next().unwrap(); +/// +/// // The first three characters are valid UTF-8 +/// assert_eq!("foo", chunk.valid()); +/// +/// // The fourth character is broken +/// assert_eq!(b"\xF1\x80", chunk.invalid()); +/// ``` +#[unstable(feature = "utf8_chunks", issue = "99543")] +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct Utf8Chunk<'a> { + valid: &'a str, + invalid: &'a [u8], } -impl Utf8Lossy { +impl<'a> Utf8Chunk<'a> { + /// Returns the next validated UTF-8 substring. + /// + /// This substring can be empty at the start of the string or between + /// broken UTF-8 characters. #[must_use] - pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy { - // SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required. - unsafe { mem::transmute(bytes) } + #[unstable(feature = "utf8_chunks", issue = "99543")] + pub fn valid(&self) -> &'a str { + self.valid } - pub fn chunks(&self) -> Utf8LossyChunksIter<'_> { - Utf8LossyChunksIter { source: &self.bytes } + /// Returns the invalid sequence that caused a failure. + /// + /// The returned slice will have a maximum length of 3 and starts after the + /// substring given by [`valid`]. Decoding will resume after this sequence. + /// + /// If empty, this is the last chunk in the string. If non-empty, an + /// unexpected byte was encountered or the end of the input was reached + /// unexpectedly. + /// + /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT + /// CHARACTER`]. + /// + /// [`valid`]: Self::valid + /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER + #[must_use] + #[unstable(feature = "utf8_chunks", issue = "99543")] + pub fn invalid(&self) -> &'a [u8] { + self.invalid } } -/// Iterator over lossy UTF-8 string -#[must_use = "iterators are lazy and do nothing unless consumed"] +#[must_use] +#[unstable(feature = "str_internals", issue = "none")] +pub struct Debug<'a>(&'a [u8]); + #[unstable(feature = "str_internals", issue = "none")] -#[allow(missing_debug_implementations)] -pub struct Utf8LossyChunksIter<'a> { +impl fmt::Debug for Debug<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.write_char('"')?; + + for chunk in Utf8Chunks::new(self.0) { + // Valid part. + // Here we partially parse UTF-8 again which is suboptimal. + { + let valid = chunk.valid(); + let mut from = 0; + for (i, c) in valid.char_indices() { + let esc = c.escape_debug(); + // If char needs escaping, flush backlog so far and write, else skip + if esc.len() != 1 { + f.write_str(&valid[from..i])?; + for c in esc { + f.write_char(c)?; + } + from = i + c.len_utf8(); + } + } + f.write_str(&valid[from..])?; + } + + // Broken parts of string as hex escape. + for &b in chunk.invalid() { + write!(f, "\\x{:02X}", b)?; + } + } + + f.write_char('"') + } +} + +/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices +/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]). +/// +/// If you want a simple conversion from UTF-8 byte slices to string slices, +/// [`from_utf8`] is easier to use. +/// +/// [byteslice]: slice +/// [`from_utf8`]: super::from_utf8 +/// +/// # Examples +/// +/// This can be used to create functionality similar to +/// [`String::from_utf8_lossy`] without allocating heap memory: +/// +/// ``` +/// #![feature(utf8_chunks)] +/// +/// use std::str::Utf8Chunks; +/// +/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) { +/// for chunk in Utf8Chunks::new(input) { +/// push(chunk.valid()); +/// +/// if !chunk.invalid().is_empty() { +/// push("\u{FFFD}"); +/// } +/// } +/// } +/// ``` +/// +/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy +#[must_use = "iterators are lazy and do nothing unless consumed"] +#[unstable(feature = "utf8_chunks", issue = "99543")] +#[derive(Clone)] +pub struct Utf8Chunks<'a> { source: &'a [u8], } -#[unstable(feature = "str_internals", issue = "none")] -#[derive(PartialEq, Eq, Debug)] -pub struct Utf8LossyChunk<'a> { - /// Sequence of valid chars. - /// Can be empty between broken UTF-8 chars. - pub valid: &'a str, - /// Single broken char, empty if none. - /// Empty iff iterator item is last. - pub broken: &'a [u8], +impl<'a> Utf8Chunks<'a> { + /// Creates a new iterator to decode the bytes. + #[unstable(feature = "utf8_chunks", issue = "99543")] + pub fn new(bytes: &'a [u8]) -> Self { + Self { source: bytes } + } + + #[doc(hidden)] + #[unstable(feature = "str_internals", issue = "none")] + pub fn debug(&self) -> Debug<'_> { + Debug(self.source) + } } -impl<'a> Iterator for Utf8LossyChunksIter<'a> { - type Item = Utf8LossyChunk<'a>; +#[unstable(feature = "utf8_chunks", issue = "99543")] +impl<'a> Iterator for Utf8Chunks<'a> { + type Item = Utf8Chunk<'a>; - fn next(&mut self) -> Option<Utf8LossyChunk<'a>> { + fn next(&mut self) -> Option<Utf8Chunk<'a>> { if self.source.is_empty() { return None; } @@ -130,71 +249,22 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> { // SAFETY: `valid_up_to <= i` because it is only ever assigned via // `valid_up_to = i` and `i` only increases. - let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) }; + let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) }; - Some(Utf8LossyChunk { + Some(Utf8Chunk { // SAFETY: All bytes up to `valid_up_to` are valid UTF-8. valid: unsafe { from_utf8_unchecked(valid) }, - broken, + invalid, }) } } -impl fmt::Display for Utf8Lossy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - // If we're the empty string then our iterator won't actually yield - // anything, so perform the formatting manually - if self.bytes.is_empty() { - return "".fmt(f); - } - - for Utf8LossyChunk { valid, broken } in self.chunks() { - // If we successfully decoded the whole chunk as a valid string then - // we can return a direct formatting of the string which will also - // respect various formatting flags if possible. - if valid.len() == self.bytes.len() { - assert!(broken.is_empty()); - return valid.fmt(f); - } - - f.write_str(valid)?; - if !broken.is_empty() { - f.write_char(char::REPLACEMENT_CHARACTER)?; - } - } - Ok(()) - } -} - -impl fmt::Debug for Utf8Lossy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_char('"')?; +#[unstable(feature = "utf8_chunks", issue = "99543")] +impl FusedIterator for Utf8Chunks<'_> {} - for Utf8LossyChunk { valid, broken } in self.chunks() { - // Valid part. - // Here we partially parse UTF-8 again which is suboptimal. - { - let mut from = 0; - for (i, c) in valid.char_indices() { - let esc = c.escape_debug(); - // If char needs escaping, flush backlog so far and write, else skip - if esc.len() != 1 { - f.write_str(&valid[from..i])?; - for c in esc { - f.write_char(c)?; - } - from = i + c.len_utf8(); - } - } - f.write_str(&valid[from..])?; - } - - // Broken parts of string as hex escape. - for &b in broken { - write!(f, "\\x{:02x}", b)?; - } - } - - f.write_char('"') +#[unstable(feature = "utf8_chunks", issue = "99543")] +impl fmt::Debug for Utf8Chunks<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish() } } diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index c4f2e283e..f673aa2a4 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -22,9 +22,9 @@ use crate::slice::{self, SliceIndex}; pub mod pattern; -#[unstable(feature = "str_internals", issue = "none")] -#[allow(missing_docs)] -pub mod lossy; +mod lossy; +#[unstable(feature = "utf8_chunks", issue = "99543")] +pub use lossy::{Utf8Chunk, Utf8Chunks}; #[stable(feature = "rust1", since = "1.0.0")] pub use converts::{from_utf8, from_utf8_unchecked}; @@ -91,10 +91,12 @@ const fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! { } } +#[track_caller] const fn slice_error_fail_ct(_: &str, _: usize, _: usize) -> ! { panic!("failed to slice string"); } +#[track_caller] fn slice_error_fail_rt(s: &str, begin: usize, end: usize) -> ! { const MAX_DISPLAY_LENGTH: usize = 256; let trunc_len = s.floor_char_boundary(MAX_DISPLAY_LENGTH); @@ -2353,7 +2355,7 @@ impl str { #[inline] pub fn is_ascii(&self) -> bool { // We can treat each byte as character here: all multibyte characters - // start with a byte that is not in the ascii range, so we will stop + // start with a byte that is not in the ASCII range, so we will stop // there already. self.as_bytes().is_ascii() } @@ -2638,3 +2640,7 @@ impl_fn_for_zst! { unsafe { from_utf8_unchecked(bytes) } }; } + +#[stable(feature = "rust1", since = "1.0.0")] +#[cfg(not(bootstrap))] +impl !crate::error::Error for &str {} diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs index 04bc66523..2acef432f 100644 --- a/library/core/src/str/validations.rs +++ b/library/core/src/str/validations.rs @@ -216,12 +216,12 @@ pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { // SAFETY: since `align - index` and `ascii_block_size` are // multiples of `usize_bytes`, `block = ptr.add(index)` is // always aligned with a `usize` so it's safe to dereference - // both `block` and `block.offset(1)`. + // both `block` and `block.add(1)`. unsafe { let block = ptr.add(index) as *const usize; // break if there is a nonascii byte let zu = contains_nonascii(*block); - let zv = contains_nonascii(*block.offset(1)); + let zv = contains_nonascii(*block.add(1)); if zu || zv { break; } |