diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-19 09:26:03 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-19 09:26:03 +0000 |
commit | 9918693037dce8aa4bb6f08741b6812923486c18 (patch) | |
tree | 21d2b40bec7e6a7ea664acee056eb3d08e15a1cf /library/core/src/str | |
parent | Releasing progress-linux version 1.75.0+dfsg1-5~progress7.99u1. (diff) | |
download | rustc-9918693037dce8aa4bb6f08741b6812923486c18.tar.xz rustc-9918693037dce8aa4bb6f08741b6812923486c18.zip |
Merging upstream version 1.76.0+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'library/core/src/str')
-rw-r--r-- | library/core/src/str/iter.rs | 50 | ||||
-rw-r--r-- | library/core/src/str/mod.rs | 79 | ||||
-rw-r--r-- | library/core/src/str/pattern.rs | 20 | ||||
-rw-r--r-- | library/core/src/str/traits.rs | 46 |
4 files changed, 160 insertions, 35 deletions
diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index c30f01b3c..dd2efb005 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -8,6 +8,7 @@ use crate::iter::{TrustedRandomAccess, TrustedRandomAccessNoCoerce}; use crate::ops::Try; use crate::option; use crate::slice::{self, Split as SliceSplit}; +use core::num::NonZeroUsize; use super::from_utf8_unchecked; use super::pattern::Pattern; @@ -50,6 +51,55 @@ impl<'a> Iterator for Chars<'a> { } #[inline] + fn advance_by(&mut self, mut remainder: usize) -> Result<(), NonZeroUsize> { + const CHUNK_SIZE: usize = 32; + + if remainder >= CHUNK_SIZE { + let mut chunks = self.iter.as_slice().array_chunks::<CHUNK_SIZE>(); + let mut bytes_skipped: usize = 0; + + while remainder > CHUNK_SIZE + && let Some(chunk) = chunks.next() + { + bytes_skipped += CHUNK_SIZE; + + let mut start_bytes = [false; CHUNK_SIZE]; + + for i in 0..CHUNK_SIZE { + start_bytes[i] = !super::validations::utf8_is_cont_byte(chunk[i]); + } + + remainder -= start_bytes.into_iter().map(|i| i as u8).sum::<u8>() as usize; + } + + // SAFETY: The amount of bytes exists since we just iterated over them, + // so advance_by will succeed. + unsafe { self.iter.advance_by(bytes_skipped).unwrap_unchecked() }; + + // skip trailing continuation bytes + while self.iter.len() > 0 { + let b = self.iter.as_slice()[0]; + if !super::validations::utf8_is_cont_byte(b) { + break; + } + // SAFETY: We just peeked at the byte, therefore it exists + unsafe { self.iter.advance_by(1).unwrap_unchecked() }; + } + } + + while (remainder > 0) && (self.iter.len() > 0) { + remainder -= 1; + let b = self.iter.as_slice()[0]; + let slurp = super::validations::utf8_char_width(b); + // SAFETY: utf8 validity requires that the string must contain + // the continuation bytes (if any) + unsafe { self.iter.advance_by(slurp).unwrap_unchecked() }; + } + + NonZeroUsize::new(remainder).map_or(Ok(()), Err) + } + + #[inline] fn size_hint(&self) -> (usize, Option<usize>) { let len = self.iter.len(); // `(len + 3)` can't overflow, because we know that the `slice::Iter` diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 27178328b..a22c46edc 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -2423,6 +2423,85 @@ impl str { me.make_ascii_lowercase() } + /// Returns a string slice with leading ASCII whitespace removed. + /// + /// 'Whitespace' refers to the definition used by + /// [`u8::is_ascii_whitespace`]. + /// + /// [`u8::is_ascii_whitespace`]: u8::is_ascii_whitespace + /// + /// # Examples + /// + /// ``` + /// #![feature(byte_slice_trim_ascii)] + /// + /// assert_eq!(" \t \u{3000}hello world\n".trim_ascii_start(), "\u{3000}hello world\n"); + /// assert_eq!(" ".trim_ascii_start(), ""); + /// assert_eq!("".trim_ascii_start(), ""); + /// ``` + #[unstable(feature = "byte_slice_trim_ascii", issue = "94035")] + #[must_use = "this returns the trimmed string as a new slice, \ + without modifying the original"] + #[inline] + pub const fn trim_ascii_start(&self) -> &str { + // SAFETY: Removing ASCII characters from a `&str` does not invalidate + // UTF-8. + unsafe { core::str::from_utf8_unchecked(self.as_bytes().trim_ascii_start()) } + } + + /// Returns a string slice with trailing ASCII whitespace removed. + /// + /// 'Whitespace' refers to the definition used by + /// [`u8::is_ascii_whitespace`]. + /// + /// [`u8::is_ascii_whitespace`]: u8::is_ascii_whitespace + /// + /// # Examples + /// + /// ``` + /// #![feature(byte_slice_trim_ascii)] + /// + /// assert_eq!("\r hello world\u{3000}\n ".trim_ascii_end(), "\r hello world\u{3000}"); + /// assert_eq!(" ".trim_ascii_end(), ""); + /// assert_eq!("".trim_ascii_end(), ""); + /// ``` + #[unstable(feature = "byte_slice_trim_ascii", issue = "94035")] + #[must_use = "this returns the trimmed string as a new slice, \ + without modifying the original"] + #[inline] + pub const fn trim_ascii_end(&self) -> &str { + // SAFETY: Removing ASCII characters from a `&str` does not invalidate + // UTF-8. + unsafe { core::str::from_utf8_unchecked(self.as_bytes().trim_ascii_end()) } + } + + /// Returns a string slice with leading and trailing ASCII whitespace + /// removed. + /// + /// 'Whitespace' refers to the definition used by + /// [`u8::is_ascii_whitespace`]. + /// + /// [`u8::is_ascii_whitespace`]: u8::is_ascii_whitespace + /// + /// # Examples + /// + /// ``` + /// #![feature(byte_slice_trim_ascii)] + /// + /// assert_eq!("\r hello world\n ".trim_ascii(), "hello world"); + /// assert_eq!(" ".trim_ascii(), ""); + /// assert_eq!("".trim_ascii(), ""); + /// ``` + #[unstable(feature = "byte_slice_trim_ascii", issue = "94035")] + #[must_use = "this returns the trimmed string as a new slice, \ + without modifying the original"] + #[inline] + pub const fn trim_ascii(&self) -> &str { + // SAFETY: Removing ASCII characters from a `&str` does not invalidate + // UTF-8. + unsafe { core::str::from_utf8_unchecked(self.as_bytes().trim_ascii()) } + } + /// Return an iterator that escapes each char in `self` with [`char::escape_debug`]. /// /// Note: only extended grapheme codepoints that begin the string will be diff --git a/library/core/src/str/pattern.rs b/library/core/src/str/pattern.rs index 701e61e66..caa54e00f 100644 --- a/library/core/src/str/pattern.rs +++ b/library/core/src/str/pattern.rs @@ -1740,9 +1740,9 @@ fn simd_contains(needle: &str, haystack: &str) -> Option<bool> { debug_assert!(needle.len() > 1); use crate::ops::BitAnd; + use crate::simd::cmp::SimdPartialEq; use crate::simd::mask8x16 as Mask; use crate::simd::u8x16 as Block; - use crate::simd::{SimdPartialEq, ToBitMask}; let first_probe = needle[0]; let last_byte_offset = needle.len() - 1; @@ -1765,7 +1765,7 @@ fn simd_contains(needle: &str, haystack: &str) -> Option<bool> { }; // do a naive search if the haystack is too small to fit - if haystack.len() < Block::LANES + last_byte_offset { + if haystack.len() < Block::LEN + last_byte_offset { return Some(haystack.windows(needle.len()).any(|c| c == needle)); } @@ -1812,7 +1812,7 @@ fn simd_contains(needle: &str, haystack: &str) -> Option<bool> { let eq_first: Mask = a.simd_eq(first_probe); let eq_last: Mask = b.simd_eq(second_probe); let both = eq_first.bitand(eq_last); - let mask = both.to_bitmask(); + let mask = both.to_bitmask() as u16; return mask; }; @@ -1822,32 +1822,32 @@ fn simd_contains(needle: &str, haystack: &str) -> Option<bool> { // The loop condition must ensure that there's enough headroom to read LANE bytes, // and not only at the current index but also at the index shifted by block_offset const UNROLL: usize = 4; - while i + last_byte_offset + UNROLL * Block::LANES < haystack.len() && !result { + while i + last_byte_offset + UNROLL * Block::LEN < haystack.len() && !result { let mut masks = [0u16; UNROLL]; for j in 0..UNROLL { - masks[j] = test_chunk(i + j * Block::LANES); + masks[j] = test_chunk(i + j * Block::LEN); } for j in 0..UNROLL { let mask = masks[j]; if mask != 0 { - result |= check_mask(i + j * Block::LANES, mask, result); + result |= check_mask(i + j * Block::LEN, mask, result); } } - i += UNROLL * Block::LANES; + i += UNROLL * Block::LEN; } - while i + last_byte_offset + Block::LANES < haystack.len() && !result { + while i + last_byte_offset + Block::LEN < haystack.len() && !result { let mask = test_chunk(i); if mask != 0 { result |= check_mask(i, mask, result); } - i += Block::LANES; + i += Block::LEN; } // Process the tail that didn't fit into LANES-sized steps. // This simply repeats the same procedure but as right-aligned chunk instead // of a left-aligned one. The last byte must be exactly flush with the string end so // we don't miss a single byte or read out of bounds. - let i = haystack.len() - last_byte_offset - Block::LANES; + let i = haystack.len() - last_byte_offset - Block::LEN; let mask = test_chunk(i); if mask != 0 { result |= check_mask(i, mask, result); diff --git a/library/core/src/str/traits.rs b/library/core/src/str/traits.rs index 16fb1dad7..777ad0d81 100644 --- a/library/core/src/str/traits.rs +++ b/library/core/src/str/traits.rs @@ -1,8 +1,8 @@ //! Trait implementations for `str`. use crate::cmp::Ordering; -use crate::intrinsics::assert_unsafe_precondition; use crate::ops; +use crate::panic::debug_assert_nounwind; use crate::ptr; use crate::slice::SliceIndex; @@ -191,39 +191,35 @@ unsafe impl SliceIndex<str> for ops::Range<usize> { #[inline] unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { let slice = slice as *const [u8]; + + debug_assert_nounwind!( + // We'd like to check that the bounds are on char boundaries, + // but there's not really a way to do so without reading + // behind the pointer, which has aliasing implications. + // It's also not possible to move this check up to + // `str::get_unchecked` without adding a special function + // to `SliceIndex` just for this. + self.end >= self.start && self.end <= slice.len(), + "str::get_unchecked requires that the range is within the string slice", + ); + // SAFETY: the caller guarantees that `self` is in bounds of `slice` // which satisfies all the conditions for `add`. - let ptr = unsafe { - let this = ops::Range { ..self }; - assert_unsafe_precondition!( - "str::get_unchecked requires that the range is within the string slice", - (this: ops::Range<usize>, slice: *const [u8]) => - // We'd like to check that the bounds are on char boundaries, - // but there's not really a way to do so without reading - // behind the pointer, which has aliasing implications. - // It's also not possible to move this check up to - // `str::get_unchecked` without adding a special function - // to `SliceIndex` just for this. - this.end >= this.start && this.end <= slice.len() - ); - slice.as_ptr().add(self.start) - }; + let ptr = unsafe { slice.as_ptr().add(self.start) }; let len = self.end - self.start; ptr::slice_from_raw_parts(ptr, len) as *const str } #[inline] unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { let slice = slice as *mut [u8]; + + debug_assert_nounwind!( + self.end >= self.start && self.end <= slice.len(), + "str::get_unchecked_mut requires that the range is within the string slice", + ); + // SAFETY: see comments for `get_unchecked`. - let ptr = unsafe { - let this = ops::Range { ..self }; - assert_unsafe_precondition!( - "str::get_unchecked_mut requires that the range is within the string slice", - (this: ops::Range<usize>, slice: *mut [u8]) => - this.end >= this.start && this.end <= slice.len() - ); - slice.as_mut_ptr().add(self.start) - }; + let ptr = unsafe { slice.as_mut_ptr().add(self.start) }; let len = self.end - self.start; ptr::slice_from_raw_parts_mut(ptr, len) as *mut str } |