path: root/library/core/src/str
author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-17 12:18:32 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-17 12:18:32 +0000
commit     4547b622d8d29df964fa2914213088b148c498fc (patch)
tree       9fc6b25f3c3add6b745be9a2400a6e96140046e9 /library/core/src/str
parent     Releasing progress-linux version 1.66.0+dfsg1-1~progress7.99u1. (diff)
download   rustc-4547b622d8d29df964fa2914213088b148c498fc.tar.xz
           rustc-4547b622d8d29df964fa2914213088b148c498fc.zip
Merging upstream version 1.67.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'library/core/src/str')
-rw-r--r--  library/core/src/str/converts.rs |   2
-rw-r--r--  library/core/src/str/mod.rs      |  16
-rw-r--r--  library/core/src/str/pattern.rs  | 234
3 files changed, 249 insertions(+), 3 deletions(-)
diff --git a/library/core/src/str/converts.rs b/library/core/src/str/converts.rs
index b0c55ca4f..5f8748206 100644
--- a/library/core/src/str/converts.rs
+++ b/library/core/src/str/converts.rs
@@ -77,7 +77,7 @@ use super::Utf8Error;
/// let sparkle_heart = [240, 159, 146, 150];
///
/// // We know these bytes are valid, so just use `unwrap()`.
-/// let sparkle_heart = str::from_utf8(&sparkle_heart).unwrap();
+/// let sparkle_heart: &str = str::from_utf8(&sparkle_heart).unwrap();
///
/// assert_eq!("💖", sparkle_heart);
/// ```
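For reference, the behaviour documented in this example can be checked in a standalone program (not part of this patch); the truncated-sequence input below is purely illustrative:

```
fn main() {
    // Valid UTF-8 decodes to a &str, as in the doc example above.
    let sparkle_heart = [240, 159, 146, 150];
    assert_eq!(std::str::from_utf8(&sparkle_heart), Ok("💖"));

    // An incomplete multi-byte sequence is rejected with a Utf8Error.
    let truncated = [240, 159, 146];
    assert!(std::str::from_utf8(&truncated).is_err());
}
```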
diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs
index fbc0fc397..45fd2caae 100644
--- a/library/core/src/str/mod.rs
+++ b/library/core/src/str/mod.rs
@@ -396,7 +396,7 @@ impl str {
#[stable(feature = "rust1", since = "1.0.0")]
#[rustc_const_stable(feature = "rustc_str_as_ptr", since = "1.32.0")]
#[must_use]
- #[inline]
+ #[inline(always)]
pub const fn as_ptr(&self) -> *const u8 {
self as *const str as *const u8
}
@@ -411,7 +411,7 @@ impl str {
/// modified in a way that it remains valid UTF-8.
#[stable(feature = "str_as_mut_ptr", since = "1.36.0")]
#[must_use]
- #[inline]
+ #[inline(always)]
pub fn as_mut_ptr(&mut self) -> *mut u8 {
self as *mut str as *mut u8
}
@@ -902,6 +902,12 @@ impl str {
///
/// assert_eq!(None, iter.next());
/// ```
+ ///
+ /// If the string is empty or all whitespace, the iterator yields no string slices:
+ /// ```
+ /// assert_eq!("".split_whitespace().next(), None);
+ /// assert_eq!(" ".split_whitespace().next(), None);
+ /// ```
#[must_use = "this returns the split string as an iterator, \
without modifying the original"]
#[stable(feature = "split_whitespace", since = "1.1.0")]
@@ -946,6 +952,12 @@ impl str {
///
/// assert_eq!(None, iter.next());
/// ```
+ ///
+ /// If the string is empty or all ASCII whitespace, the iterator yields no string slices:
+ /// ```
+ /// assert_eq!("".split_ascii_whitespace().next(), None);
+ /// assert_eq!(" ".split_ascii_whitespace().next(), None);
+ /// ```
#[must_use = "this returns the split string as an iterator, \
without modifying the original"]
#[stable(feature = "split_ascii_whitespace", since = "1.34.0")]
diff --git a/library/core/src/str/pattern.rs b/library/core/src/str/pattern.rs
index ec2cb429e..19da6d2fb 100644
--- a/library/core/src/str/pattern.rs
+++ b/library/core/src/str/pattern.rs
@@ -39,6 +39,7 @@
)]
use crate::cmp;
+use crate::cmp::Ordering;
use crate::fmt;
use crate::slice::memchr;
@@ -946,6 +947,32 @@ impl<'a, 'b> Pattern<'a> for &'b str {
haystack.as_bytes().starts_with(self.as_bytes())
}
+ /// Checks whether the pattern matches anywhere in the haystack
+ #[inline]
+ fn is_contained_in(self, haystack: &'a str) -> bool {
+ if self.len() == 0 {
+ return true;
+ }
+
+ match self.len().cmp(&haystack.len()) {
+ Ordering::Less => {
+ if self.len() == 1 {
+ return haystack.as_bytes().contains(&self.as_bytes()[0]);
+ }
+
+ #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+ if self.len() <= 32 {
+ if let Some(result) = simd_contains(self, haystack) {
+ return result;
+ }
+ }
+
+ self.into_searcher(haystack).next_match().is_some()
+ }
+ _ => self == haystack,
+ }
+ }
+
/// Removes the pattern from the front of haystack, if it matches.
#[inline]
fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> {
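The dispatch added above is what `str::contains` exercises for `&str` patterns: an empty needle always matches, a one-byte needle falls back to a plain byte search, needles of up to 32 bytes may take the new `simd_contains` path on x86_64 with SSE2, and a needle at least as long as the haystack reduces to an equality check. A rough standalone illustration (not part of this patch):

```
fn main() {
    let haystack = "SIMD-friendly algorithms for substring searching";

    assert!(haystack.contains(""));          // empty needle: trivially true
    assert!(haystack.contains("g"));         // single byte: plain byte search
    assert!(haystack.contains("substring")); // short needle: SIMD-eligible on x86_64 with SSE2
    assert!(!"short".contains(haystack));    // needle longer than the haystack: equality check fails
}
```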
@@ -1684,3 +1711,210 @@ impl TwoWayStrategy for RejectAndMatch {
SearchStep::Match(a, b)
}
}
+
+/// SIMD search for short needles based on
+/// Wojciech Muła's "SIMD-friendly algorithms for substring searching"[0]
+///
+/// It skips ahead by the vector width on each iteration (rather than the needle length as two-way
+/// does) by probing the first and last byte of the needle for the whole vector width
+/// and only doing full needle comparisons when the vectorized probe indicates potential matches.
+///
+/// Since the x86_64 baseline only offers SSE2 we only use u8x16 here.
+/// If we ever ship std for x86-64-v3 or adapt this for other platforms, then wider vectors
+/// should be evaluated.
+///
+/// For haystacks smaller than vector-size + needle length it falls back to
+/// a naive O(n*m) search so this implementation should not be called on larger needles.
+///
+/// [0]: http://0x80.pl/articles/simd-strfind.html#sse-avx2
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+#[inline]
+fn simd_contains(needle: &str, haystack: &str) -> Option<bool> {
+ let needle = needle.as_bytes();
+ let haystack = haystack.as_bytes();
+
+ debug_assert!(needle.len() > 1);
+
+ use crate::ops::BitAnd;
+ use crate::simd::mask8x16 as Mask;
+ use crate::simd::u8x16 as Block;
+ use crate::simd::{SimdPartialEq, ToBitMask};
+
+ let first_probe = needle[0];
+ let last_byte_offset = needle.len() - 1;
+
+ // the offset used for the 2nd vector
+ let second_probe_offset = if needle.len() == 2 {
+ // never bail out on len=2 needles because the probes will fully cover them and have
+ // no degenerate cases.
+ 1
+ } else {
+ // try a few bytes in case first and last byte of the needle are the same
+ let Some(second_probe_offset) = (needle.len().saturating_sub(4)..needle.len()).rfind(|&idx| needle[idx] != first_probe) else {
+ // fall back to other search methods if we can't find any different bytes
+ // since we could otherwise hit some degenerate cases
+ return None;
+ };
+ second_probe_offset
+ };
+
+ // do a naive search if the haystack is too small to fit a full vector read plus the needle tail
+ if haystack.len() < Block::LANES + last_byte_offset {
+ return Some(haystack.windows(needle.len()).any(|c| c == needle));
+ }
+
+ let first_probe: Block = Block::splat(first_probe);
+ let second_probe: Block = Block::splat(needle[second_probe_offset]);
+ // the first byte is already checked by the outer loop, so to verify a match only the
+ // remainder has to be compared.
+ let trimmed_needle = &needle[1..];
+
+ // this #[cold] is load-bearing, benchmark before removing it...
+ let check_mask = #[cold]
+ |idx, mask: u16, skip: bool| -> bool {
+ if skip {
+ return false;
+ }
+
+ // and so is this. optimizations are weird.
+ let mut mask = mask;
+
+ while mask != 0 {
+ let trailing = mask.trailing_zeros();
+ let offset = idx + trailing as usize + 1;
+ // SAFETY: the mask has between 0 and 15 trailing zeros; we skip one additional byte that was already compared
+ // and then take trimmed_needle.len() bytes. This is within the bounds defined by the outer loop
+ unsafe {
+ let sub = haystack.get_unchecked(offset..).get_unchecked(..trimmed_needle.len());
+ if small_slice_eq(sub, trimmed_needle) {
+ return true;
+ }
+ }
+ mask &= !(1 << trailing);
+ }
+ return false;
+ };
+
+ let test_chunk = |idx| -> u16 {
+ // SAFETY: this requires at least LANES bytes being readable at idx
+ // that is ensured by the loop ranges (see comments below)
+ let a: Block = unsafe { haystack.as_ptr().add(idx).cast::<Block>().read_unaligned() };
+ // SAFETY: this requires LANES + second_probe_offset bytes being readable at idx
+ let b: Block = unsafe {
+ haystack.as_ptr().add(idx).add(second_probe_offset).cast::<Block>().read_unaligned()
+ };
+ let eq_first: Mask = a.simd_eq(first_probe);
+ let eq_last: Mask = b.simd_eq(second_probe);
+ let both = eq_first.bitand(eq_last);
+ let mask = both.to_bitmask();
+
+ return mask;
+ };
+
+ let mut i = 0;
+ let mut result = false;
+ // The loop condition must ensure that there's enough headroom to read LANES bytes,
+ // and not only at the current index but also at the index shifted by second_probe_offset
+ const UNROLL: usize = 4;
+ while i + last_byte_offset + UNROLL * Block::LANES < haystack.len() && !result {
+ let mut masks = [0u16; UNROLL];
+ for j in 0..UNROLL {
+ masks[j] = test_chunk(i + j * Block::LANES);
+ }
+ for j in 0..UNROLL {
+ let mask = masks[j];
+ if mask != 0 {
+ result |= check_mask(i + j * Block::LANES, mask, result);
+ }
+ }
+ i += UNROLL * Block::LANES;
+ }
+ while i + last_byte_offset + Block::LANES < haystack.len() && !result {
+ let mask = test_chunk(i);
+ if mask != 0 {
+ result |= check_mask(i, mask, result);
+ }
+ i += Block::LANES;
+ }
+
+ // Process the tail that didn't fit into LANES-sized steps.
+ // This simply repeats the same procedure but as right-aligned chunk instead
+ // of a left-aligned one. The last byte must be exactly flush with the string end so
+ // we don't miss a single byte or read out of bounds.
+ let i = haystack.len() - last_byte_offset - Block::LANES;
+ let mask = test_chunk(i);
+ if mask != 0 {
+ result |= check_mask(i, mask, result);
+ }
+
+ Some(result)
+}
+
+/// Compares short slices for equality.
+///
+/// It avoids a call to libc's memcmp, which is faster on long slices
+/// due to SIMD optimizations but incurs a function call overhead.
+///
+/// # Safety
+///
+/// Both slices must have the same length.
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] // only called on x86_64
+#[inline]
+unsafe fn small_slice_eq(x: &[u8], y: &[u8]) -> bool {
+ debug_assert_eq!(x.len(), y.len());
+ // This function is adapted from
+ // https://github.com/BurntSushi/memchr/blob/8037d11b4357b0f07be2bb66dc2659d9cf28ad32/src/memmem/util.rs#L32
+
+ // If we don't have enough bytes to do 4-byte at a time loads, then
+ // fall back to the naive slow version.
+ //
+ // Potential alternative: We could do a copy_nonoverlapping combined with a mask instead
+ // of a loop. Benchmark it.
+ if x.len() < 4 {
+ for (&b1, &b2) in x.iter().zip(y) {
+ if b1 != b2 {
+ return false;
+ }
+ }
+ return true;
+ }
+ // When we have 4 or more bytes to compare, then proceed in chunks of 4 at
+ // a time using unaligned loads.
+ //
+ // Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is
+ // that this particular version of memcmp is likely to be called with tiny
+ // needles. That means that if we do 8 byte loads, then a higher proportion
+ // of memcmp calls will use the slower variant above. With that said, this
+ // is a hypothesis and is only loosely supported by benchmarks. There's
+ // likely some improvement that could be made here. The main thing here
+ // though is to optimize for latency, not throughput.
+
+ // SAFETY: Via the conditional above, we know that both `px` and `py`
+ // have the same length, so `px < pxend` implies that `py < pyend`.
+ // Thus, dereferencing both `px` and `py` in the loop below is safe.
+ //
+ // Moreover, we set `pxend` and `pyend` to be 4 bytes before the actual
+ // end of `px` and `py`. Thus, the final dereference outside of the
+ // loop is guaranteed to be valid. (The final comparison will overlap with
+ // the last comparison done in the loop for lengths that aren't multiples
+ // of four.)
+ //
+ // Finally, we needn't worry about alignment here, since we do unaligned
+ // loads.
+ unsafe {
+ let (mut px, mut py) = (x.as_ptr(), y.as_ptr());
+ let (pxend, pyend) = (px.add(x.len() - 4), py.add(y.len() - 4));
+ while px < pxend {
+ let vx = (px as *const u32).read_unaligned();
+ let vy = (py as *const u32).read_unaligned();
+ if vx != vy {
+ return false;
+ }
+ px = px.add(4);
+ py = py.add(4);
+ }
+ let vx = (pxend as *const u32).read_unaligned();
+ let vy = (pyend as *const u32).read_unaligned();
+ vx == vy
+ }
+}
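For readers who want the shape of the new search without the SIMD details, the following scalar sketch (illustrative only, not the code in this patch) mirrors the probe-and-verify structure that `simd_contains` performs 16 lanes at a time: probe two positions of every candidate window cheaply, and only compare the remaining bytes when both probes hit.

```
/// Scalar sketch of the probe-and-verify idea behind `simd_contains`.
fn probe_and_verify(needle: &[u8], haystack: &[u8]) -> bool {
    assert!(needle.len() > 1);
    if haystack.len() < needle.len() {
        return false;
    }
    let first = needle[0];
    let second_offset = needle.len() - 1; // simd_contains may pick an earlier, distinct byte
    let second = needle[second_offset];
    for i in 0..=haystack.len() - needle.len() {
        // Cheap probes on two positions; only a double hit triggers the full comparison.
        if haystack[i] == first
            && haystack[i + second_offset] == second
            && haystack[i + 1..i + needle.len()] == needle[1..]
        {
            return true;
        }
    }
    false
}

fn main() {
    assert!(probe_and_verify(b"needle", b"haystack with a needle in it"));
    assert!(!probe_and_verify(b"needle", b"haystack without it"));
}
```

The verification helper `small_slice_eq` relies on overlapping 4-byte loads so that lengths that are not a multiple of four are still covered by a final, right-aligned load. A safe sketch of that trick (again illustrative, not the std code):

```
fn chunked_eq(x: &[u8], y: &[u8]) -> bool {
    assert_eq!(x.len(), y.len());
    if x.len() < 4 {
        // Too short for 4-byte loads; compare byte-wise.
        return x == y;
    }
    let word = |s: &[u8], i: usize| u32::from_ne_bytes(s[i..i + 4].try_into().unwrap());
    let last = x.len() - 4;
    let mut i = 0;
    // Compare full 4-byte chunks...
    while i < last {
        if word(x, i) != word(y, i) {
            return false;
        }
        i += 4;
    }
    // ...then one chunk flush with the end, which may overlap the previous one.
    word(x, last) == word(y, last)
}

fn main() {
    assert!(chunked_eq(b"abcdefg", b"abcdefg"));
    assert!(!chunked_eq(b"abcdefg", b"abcdefh"));
    assert!(chunked_eq(b"ab", b"ab"));
}
```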