From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:02:58 +0200 Subject: Adding upstream version 1.64.0+dfsg1. Signed-off-by: Daniel Baumann --- library/core/src/str/converts.rs | 203 +++ library/core/src/str/count.rs | 136 ++ library/core/src/str/error.rs | 138 ++ library/core/src/str/iter.rs | 1499 ++++++++++++++++++++ library/core/src/str/lossy.rs | 200 +++ library/core/src/str/mod.rs | 2640 +++++++++++++++++++++++++++++++++++ library/core/src/str/pattern.rs | 1686 ++++++++++++++++++++++ library/core/src/str/traits.rs | 604 ++++++++ library/core/src/str/validations.rs | 274 ++++ 9 files changed, 7380 insertions(+) create mode 100644 library/core/src/str/converts.rs create mode 100644 library/core/src/str/count.rs create mode 100644 library/core/src/str/error.rs create mode 100644 library/core/src/str/iter.rs create mode 100644 library/core/src/str/lossy.rs create mode 100644 library/core/src/str/mod.rs create mode 100644 library/core/src/str/pattern.rs create mode 100644 library/core/src/str/traits.rs create mode 100644 library/core/src/str/validations.rs (limited to 'library/core/src/str') diff --git a/library/core/src/str/converts.rs b/library/core/src/str/converts.rs new file mode 100644 index 000000000..b0c55ca4f --- /dev/null +++ b/library/core/src/str/converts.rs @@ -0,0 +1,203 @@ +//! Ways to create a `str` from bytes slice. + +use crate::mem; + +use super::validations::run_utf8_validation; +use super::Utf8Error; + +/// Converts a slice of bytes to a string slice. +/// +/// A string slice ([`&str`]) is made of bytes ([`u8`]), and a byte slice +/// ([`&[u8]`][byteslice]) is made of bytes, so this function converts between +/// the two. Not all byte slices are valid string slices, however: [`&str`] requires +/// that it is valid UTF-8. `from_utf8()` checks to ensure that the bytes are valid +/// UTF-8, and then does the conversion. +/// +/// [`&str`]: str +/// [byteslice]: slice +/// +/// If you are sure that the byte slice is valid UTF-8, and you don't want to +/// incur the overhead of the validity check, there is an unsafe version of +/// this function, [`from_utf8_unchecked`], which has the same +/// behavior but skips the check. +/// +/// If you need a `String` instead of a `&str`, consider +/// [`String::from_utf8`][string]. +/// +/// [string]: ../../std/string/struct.String.html#method.from_utf8 +/// +/// Because you can stack-allocate a `[u8; N]`, and you can take a +/// [`&[u8]`][byteslice] of it, this function is one way to have a +/// stack-allocated string. There is an example of this in the +/// examples section below. +/// +/// [byteslice]: slice +/// +/// # Errors +/// +/// Returns `Err` if the slice is not UTF-8 with a description as to why the +/// provided slice is not UTF-8. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use std::str; +/// +/// // some bytes, in a vector +/// let sparkle_heart = vec![240, 159, 146, 150]; +/// +/// // We know these bytes are valid, so just use `unwrap()`. +/// let sparkle_heart = str::from_utf8(&sparkle_heart).unwrap(); +/// +/// assert_eq!("💖", sparkle_heart); +/// ``` +/// +/// Incorrect bytes: +/// +/// ``` +/// use std::str; +/// +/// // some invalid bytes, in a vector +/// let sparkle_heart = vec![0, 159, 146, 150]; +/// +/// assert!(str::from_utf8(&sparkle_heart).is_err()); +/// ``` +/// +/// See the docs for [`Utf8Error`] for more details on the kinds of +/// errors that can be returned. +/// +/// A "stack allocated string": +/// +/// ``` +/// use std::str; +/// +/// // some bytes, in a stack-allocated array +/// let sparkle_heart = [240, 159, 146, 150]; +/// +/// // We know these bytes are valid, so just use `unwrap()`. +/// let sparkle_heart = str::from_utf8(&sparkle_heart).unwrap(); +/// +/// assert_eq!("💖", sparkle_heart); +/// ``` +#[stable(feature = "rust1", since = "1.0.0")] +#[rustc_const_stable(feature = "const_str_from_utf8_shared", since = "1.63.0")] +#[rustc_allow_const_fn_unstable(str_internals)] +pub const fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> { + // FIXME: This should use `?` again, once it's `const` + match run_utf8_validation(v) { + Ok(_) => { + // SAFETY: validation succeeded. + Ok(unsafe { from_utf8_unchecked(v) }) + } + Err(err) => Err(err), + } +} + +/// Converts a mutable slice of bytes to a mutable string slice. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use std::str; +/// +/// // "Hello, Rust!" as a mutable vector +/// let mut hellorust = vec![72, 101, 108, 108, 111, 44, 32, 82, 117, 115, 116, 33]; +/// +/// // As we know these bytes are valid, we can use `unwrap()` +/// let outstr = str::from_utf8_mut(&mut hellorust).unwrap(); +/// +/// assert_eq!("Hello, Rust!", outstr); +/// ``` +/// +/// Incorrect bytes: +/// +/// ``` +/// use std::str; +/// +/// // Some invalid bytes in a mutable vector +/// let mut invalid = vec![128, 223]; +/// +/// assert!(str::from_utf8_mut(&mut invalid).is_err()); +/// ``` +/// See the docs for [`Utf8Error`] for more details on the kinds of +/// errors that can be returned. +#[stable(feature = "str_mut_extras", since = "1.20.0")] +#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "91006")] +pub const fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> { + // This should use `?` again, once it's `const` + match run_utf8_validation(v) { + Ok(_) => { + // SAFETY: validation succeeded. + Ok(unsafe { from_utf8_unchecked_mut(v) }) + } + Err(err) => Err(err), + } +} + +/// Converts a slice of bytes to a string slice without checking +/// that the string contains valid UTF-8. +/// +/// See the safe version, [`from_utf8`], for more information. +/// +/// # Safety +/// +/// The bytes passed in must be valid UTF-8. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use std::str; +/// +/// // some bytes, in a vector +/// let sparkle_heart = vec![240, 159, 146, 150]; +/// +/// let sparkle_heart = unsafe { +/// str::from_utf8_unchecked(&sparkle_heart) +/// }; +/// +/// assert_eq!("💖", sparkle_heart); +/// ``` +#[inline] +#[must_use] +#[stable(feature = "rust1", since = "1.0.0")] +#[rustc_const_stable(feature = "const_str_from_utf8_unchecked", since = "1.55.0")] +pub const unsafe fn from_utf8_unchecked(v: &[u8]) -> &str { + // SAFETY: the caller must guarantee that the bytes `v` are valid UTF-8. + // Also relies on `&str` and `&[u8]` having the same layout. + unsafe { mem::transmute(v) } +} + +/// Converts a slice of bytes to a string slice without checking +/// that the string contains valid UTF-8; mutable version. +/// +/// See the immutable version, [`from_utf8_unchecked()`] for more information. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use std::str; +/// +/// let mut heart = vec![240, 159, 146, 150]; +/// let heart = unsafe { str::from_utf8_unchecked_mut(&mut heart) }; +/// +/// assert_eq!("💖", heart); +/// ``` +#[inline] +#[must_use] +#[stable(feature = "str_mut_extras", since = "1.20.0")] +#[rustc_const_unstable(feature = "const_str_from_utf8_unchecked_mut", issue = "91005")] +pub const unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str { + // SAFETY: the caller must guarantee that the bytes `v` + // are valid UTF-8, thus the cast to `*mut str` is safe. + // Also, the pointer dereference is safe because that pointer + // comes from a reference which is guaranteed to be valid for writes. + unsafe { &mut *(v as *mut [u8] as *mut str) } +} diff --git a/library/core/src/str/count.rs b/library/core/src/str/count.rs new file mode 100644 index 000000000..28567a7e7 --- /dev/null +++ b/library/core/src/str/count.rs @@ -0,0 +1,136 @@ +//! Code for efficiently counting the number of `char`s in a UTF-8 encoded +//! string. +//! +//! Broadly, UTF-8 encodes `char`s as a "leading" byte which begins the `char`, +//! followed by some number (possibly 0) of continuation bytes. +//! +//! The leading byte can have a number of bit-patterns (with the specific +//! pattern indicating how many continuation bytes follow), but the continuation +//! bytes are always in the format `0b10XX_XXXX` (where the `X`s can take any +//! value). That is, the most significant bit is set, and the second most +//! significant bit is unset. +//! +//! To count the number of characters, we can just count the number of bytes in +//! the string which are not continuation bytes, which can be done many bytes at +//! a time fairly easily. +//! +//! Note: Because the term "leading byte" can sometimes be ambiguous (for +//! example, it could also refer to the first byte of a slice), we'll often use +//! the term "non-continuation byte" to refer to these bytes in the code. +use core::intrinsics::unlikely; + +const USIZE_SIZE: usize = core::mem::size_of::(); +const UNROLL_INNER: usize = 4; + +#[inline] +pub(super) fn count_chars(s: &str) -> usize { + if s.len() < USIZE_SIZE * UNROLL_INNER { + // Avoid entering the optimized implementation for strings where the + // difference is not likely to matter, or where it might even be slower. + // That said, a ton of thought was not spent on the particular threshold + // here, beyond "this value seems to make sense". + char_count_general_case(s.as_bytes()) + } else { + do_count_chars(s) + } +} + +fn do_count_chars(s: &str) -> usize { + // For correctness, `CHUNK_SIZE` must be: + // + // - Less than or equal to 255, otherwise we'll overflow bytes in `counts`. + // - A multiple of `UNROLL_INNER`, otherwise our `break` inside the + // `body.chunks(CHUNK_SIZE)` loop is incorrect. + // + // For performance, `CHUNK_SIZE` should be: + // - Relatively cheap to `/` against (so some simple sum of powers of two). + // - Large enough to avoid paying for the cost of the `sum_bytes_in_usize` + // too often. + const CHUNK_SIZE: usize = 192; + + // Check the properties of `CHUNK_SIZE` and `UNROLL_INNER` that are required + // for correctness. + const _: () = assert!(CHUNK_SIZE < 256); + const _: () = assert!(CHUNK_SIZE % UNROLL_INNER == 0); + + // SAFETY: transmuting `[u8]` to `[usize]` is safe except for size + // differences which are handled by `align_to`. + let (head, body, tail) = unsafe { s.as_bytes().align_to::() }; + + // This should be quite rare, and basically exists to handle the degenerate + // cases where align_to fails (as well as miri under symbolic alignment + // mode). + // + // The `unlikely` helps discourage LLVM from inlining the body, which is + // nice, as we would rather not mark the `char_count_general_case` function + // as cold. + if unlikely(body.is_empty() || head.len() > USIZE_SIZE || tail.len() > USIZE_SIZE) { + return char_count_general_case(s.as_bytes()); + } + + let mut total = char_count_general_case(head) + char_count_general_case(tail); + // Split `body` into `CHUNK_SIZE` chunks to reduce the frequency with which + // we call `sum_bytes_in_usize`. + for chunk in body.chunks(CHUNK_SIZE) { + // We accumulate intermediate sums in `counts`, where each byte contains + // a subset of the sum of this chunk, like a `[u8; size_of::()]`. + let mut counts = 0; + + let (unrolled_chunks, remainder) = chunk.as_chunks::(); + for unrolled in unrolled_chunks { + for &word in unrolled { + // Because `CHUNK_SIZE` is < 256, this addition can't cause the + // count in any of the bytes to overflow into a subsequent byte. + counts += contains_non_continuation_byte(word); + } + } + + // Sum the values in `counts` (which, again, is conceptually a `[u8; + // size_of::()]`), and accumulate the result into `total`. + total += sum_bytes_in_usize(counts); + + // If there's any data in `remainder`, then handle it. This will only + // happen for the last `chunk` in `body.chunks()` (because `CHUNK_SIZE` + // is divisible by `UNROLL_INNER`), so we explicitly break at the end + // (which seems to help LLVM out). + if !remainder.is_empty() { + // Accumulate all the data in the remainder. + let mut counts = 0; + for &word in remainder { + counts += contains_non_continuation_byte(word); + } + total += sum_bytes_in_usize(counts); + break; + } + } + total +} + +// Checks each byte of `w` to see if it contains the first byte in a UTF-8 +// sequence. Bytes in `w` which are continuation bytes are left as `0x00` (e.g. +// false), and bytes which are non-continuation bytes are left as `0x01` (e.g. +// true) +#[inline] +fn contains_non_continuation_byte(w: usize) -> usize { + const LSB: usize = usize::repeat_u8(0x01); + ((!w >> 7) | (w >> 6)) & LSB +} + +// Morally equivalent to `values.to_ne_bytes().into_iter().sum::()`, but +// more efficient. +#[inline] +fn sum_bytes_in_usize(values: usize) -> usize { + const LSB_SHORTS: usize = usize::repeat_u16(0x0001); + const SKIP_BYTES: usize = usize::repeat_u16(0x00ff); + + let pair_sum: usize = (values & SKIP_BYTES) + ((values >> 8) & SKIP_BYTES); + pair_sum.wrapping_mul(LSB_SHORTS) >> ((USIZE_SIZE - 2) * 8) +} + +// This is the most direct implementation of the concept of "count the number of +// bytes in the string which are not continuation bytes", and is used for the +// head and tail of the input string (the first and last item in the tuple +// returned by `slice::align_to`). +fn char_count_general_case(s: &[u8]) -> usize { + s.iter().filter(|&&byte| !super::validations::utf8_is_cont_byte(byte)).count() +} diff --git a/library/core/src/str/error.rs b/library/core/src/str/error.rs new file mode 100644 index 000000000..4e569fcc8 --- /dev/null +++ b/library/core/src/str/error.rs @@ -0,0 +1,138 @@ +//! Defines utf8 error type. + +use crate::fmt; + +/// Errors which can occur when attempting to interpret a sequence of [`u8`] +/// as a string. +/// +/// As such, the `from_utf8` family of functions and methods for both [`String`]s +/// and [`&str`]s make use of this error, for example. +/// +/// [`String`]: ../../std/string/struct.String.html#method.from_utf8 +/// [`&str`]: super::from_utf8 +/// +/// # Examples +/// +/// This error type’s methods can be used to create functionality +/// similar to `String::from_utf8_lossy` without allocating heap memory: +/// +/// ``` +/// fn from_utf8_lossy(mut input: &[u8], mut push: F) where F: FnMut(&str) { +/// loop { +/// match std::str::from_utf8(input) { +/// Ok(valid) => { +/// push(valid); +/// break +/// } +/// Err(error) => { +/// let (valid, after_valid) = input.split_at(error.valid_up_to()); +/// unsafe { +/// push(std::str::from_utf8_unchecked(valid)) +/// } +/// push("\u{FFFD}"); +/// +/// if let Some(invalid_sequence_length) = error.error_len() { +/// input = &after_valid[invalid_sequence_length..] +/// } else { +/// break +/// } +/// } +/// } +/// } +/// } +/// ``` +#[derive(Copy, Eq, PartialEq, Clone, Debug)] +#[stable(feature = "rust1", since = "1.0.0")] +pub struct Utf8Error { + pub(super) valid_up_to: usize, + pub(super) error_len: Option, +} + +impl Utf8Error { + /// Returns the index in the given string up to which valid UTF-8 was + /// verified. + /// + /// It is the maximum index such that `from_utf8(&input[..index])` + /// would return `Ok(_)`. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use std::str; + /// + /// // some invalid bytes, in a vector + /// let sparkle_heart = vec![0, 159, 146, 150]; + /// + /// // std::str::from_utf8 returns a Utf8Error + /// let error = str::from_utf8(&sparkle_heart).unwrap_err(); + /// + /// // the second byte is invalid here + /// assert_eq!(1, error.valid_up_to()); + /// ``` + #[stable(feature = "utf8_error", since = "1.5.0")] + #[rustc_const_stable(feature = "const_str_from_utf8_shared", since = "1.63.0")] + #[must_use] + #[inline] + pub const fn valid_up_to(&self) -> usize { + self.valid_up_to + } + + /// Provides more information about the failure: + /// + /// * `None`: the end of the input was reached unexpectedly. + /// `self.valid_up_to()` is 1 to 3 bytes from the end of the input. + /// If a byte stream (such as a file or a network socket) is being decoded incrementally, + /// this could be a valid `char` whose UTF-8 byte sequence is spanning multiple chunks. + /// + /// * `Some(len)`: an unexpected byte was encountered. + /// The length provided is that of the invalid byte sequence + /// that starts at the index given by `valid_up_to()`. + /// Decoding should resume after that sequence + /// (after inserting a [`U+FFFD REPLACEMENT CHARACTER`][U+FFFD]) in case of + /// lossy decoding. + /// + /// [U+FFFD]: ../../std/char/constant.REPLACEMENT_CHARACTER.html + #[stable(feature = "utf8_error_error_len", since = "1.20.0")] + #[rustc_const_stable(feature = "const_str_from_utf8_shared", since = "1.63.0")] + #[must_use] + #[inline] + pub const fn error_len(&self) -> Option { + // FIXME: This should become `map` again, once it's `const` + match self.error_len { + Some(len) => Some(len as usize), + None => None, + } + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl fmt::Display for Utf8Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(error_len) = self.error_len { + write!( + f, + "invalid utf-8 sequence of {} bytes from index {}", + error_len, self.valid_up_to + ) + } else { + write!(f, "incomplete utf-8 byte sequence from index {}", self.valid_up_to) + } + } +} + +/// An error returned when parsing a `bool` using [`from_str`] fails +/// +/// [`from_str`]: super::FromStr::from_str +#[derive(Debug, Clone, PartialEq, Eq)] +#[non_exhaustive] +#[stable(feature = "rust1", since = "1.0.0")] +pub struct ParseBoolError; + +#[stable(feature = "rust1", since = "1.0.0")] +impl fmt::Display for ParseBoolError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + "provided string was not `true` or `false`".fmt(f) + } +} diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs new file mode 100644 index 000000000..24083ee6a --- /dev/null +++ b/library/core/src/str/iter.rs @@ -0,0 +1,1499 @@ +//! Iterators for `str` methods. + +use crate::char; +use crate::fmt::{self, Write}; +use crate::iter::{Chain, FlatMap, Flatten}; +use crate::iter::{Copied, Filter, FusedIterator, Map, TrustedLen}; +use crate::iter::{TrustedRandomAccess, TrustedRandomAccessNoCoerce}; +use crate::ops::Try; +use crate::option; +use crate::slice::{self, Split as SliceSplit}; + +use super::from_utf8_unchecked; +use super::pattern::Pattern; +use super::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; +use super::validations::{next_code_point, next_code_point_reverse}; +use super::LinesAnyMap; +use super::{BytesIsNotEmpty, UnsafeBytesToStr}; +use super::{CharEscapeDebugContinue, CharEscapeDefault, CharEscapeUnicode}; +use super::{IsAsciiWhitespace, IsNotEmpty, IsWhitespace}; + +/// An iterator over the [`char`]s of a string slice. +/// +/// +/// This struct is created by the [`chars`] method on [`str`]. +/// See its documentation for more. +/// +/// [`char`]: prim@char +/// [`chars`]: str::chars +#[derive(Clone)] +#[must_use = "iterators are lazy and do nothing unless consumed"] +#[stable(feature = "rust1", since = "1.0.0")] +pub struct Chars<'a> { + pub(super) iter: slice::Iter<'a, u8>, +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> Iterator for Chars<'a> { + type Item = char; + + #[inline] + fn next(&mut self) -> Option { + // SAFETY: `str` invariant says `self.iter` is a valid UTF-8 string and + // the resulting `ch` is a valid Unicode Scalar Value. + unsafe { next_code_point(&mut self.iter).map(|ch| char::from_u32_unchecked(ch)) } + } + + #[inline] + fn count(self) -> usize { + super::count::count_chars(self.as_str()) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = self.iter.len(); + // `(len + 3)` can't overflow, because we know that the `slice::Iter` + // belongs to a slice in memory which has a maximum length of + // `isize::MAX` (that's well below `usize::MAX`). + ((len + 3) / 4, Some(len)) + } + + #[inline] + fn last(mut self) -> Option { + // No need to go through the entire string. + self.next_back() + } +} + +#[stable(feature = "chars_debug_impl", since = "1.38.0")] +impl fmt::Debug for Chars<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "Chars(")?; + f.debug_list().entries(self.clone()).finish()?; + write!(f, ")")?; + Ok(()) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> DoubleEndedIterator for Chars<'a> { + #[inline] + fn next_back(&mut self) -> Option { + // SAFETY: `str` invariant says `self.iter` is a valid UTF-8 string and + // the resulting `ch` is a valid Unicode Scalar Value. + unsafe { next_code_point_reverse(&mut self.iter).map(|ch| char::from_u32_unchecked(ch)) } + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for Chars<'_> {} + +impl<'a> Chars<'a> { + /// Views the underlying data as a subslice of the original data. + /// + /// This has the same lifetime as the original slice, and so the + /// iterator can continue to be used while this exists. + /// + /// # Examples + /// + /// ``` + /// let mut chars = "abc".chars(); + /// + /// assert_eq!(chars.as_str(), "abc"); + /// chars.next(); + /// assert_eq!(chars.as_str(), "bc"); + /// chars.next(); + /// chars.next(); + /// assert_eq!(chars.as_str(), ""); + /// ``` + #[stable(feature = "iter_to_slice", since = "1.4.0")] + #[must_use] + #[inline] + pub fn as_str(&self) -> &'a str { + // SAFETY: `Chars` is only made from a str, which guarantees the iter is valid UTF-8. + unsafe { from_utf8_unchecked(self.iter.as_slice()) } + } +} + +/// An iterator over the [`char`]s of a string slice, and their positions. +/// +/// This struct is created by the [`char_indices`] method on [`str`]. +/// See its documentation for more. +/// +/// [`char`]: prim@char +/// [`char_indices`]: str::char_indices +#[derive(Clone, Debug)] +#[must_use = "iterators are lazy and do nothing unless consumed"] +#[stable(feature = "rust1", since = "1.0.0")] +pub struct CharIndices<'a> { + pub(super) front_offset: usize, + pub(super) iter: Chars<'a>, +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> Iterator for CharIndices<'a> { + type Item = (usize, char); + + #[inline] + fn next(&mut self) -> Option<(usize, char)> { + let pre_len = self.iter.iter.len(); + match self.iter.next() { + None => None, + Some(ch) => { + let index = self.front_offset; + let len = self.iter.iter.len(); + self.front_offset += pre_len - len; + Some((index, ch)) + } + } + } + + #[inline] + fn count(self) -> usize { + self.iter.count() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } + + #[inline] + fn last(mut self) -> Option<(usize, char)> { + // No need to go through the entire string. + self.next_back() + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> DoubleEndedIterator for CharIndices<'a> { + #[inline] + fn next_back(&mut self) -> Option<(usize, char)> { + self.iter.next_back().map(|ch| { + let index = self.front_offset + self.iter.iter.len(); + (index, ch) + }) + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for CharIndices<'_> {} + +impl<'a> CharIndices<'a> { + /// Views the underlying data as a subslice of the original data. + /// + /// This has the same lifetime as the original slice, and so the + /// iterator can continue to be used while this exists. + #[stable(feature = "iter_to_slice", since = "1.4.0")] + #[must_use] + #[inline] + pub fn as_str(&self) -> &'a str { + self.iter.as_str() + } + + /// Returns the byte position of the next character, or the length + /// of the underlying string if there are no more characters. + /// + /// # Examples + /// + /// ``` + /// #![feature(char_indices_offset)] + /// let mut chars = "a楽".char_indices(); + /// + /// assert_eq!(chars.offset(), 0); + /// assert_eq!(chars.next(), Some((0, 'a'))); + /// + /// assert_eq!(chars.offset(), 1); + /// assert_eq!(chars.next(), Some((1, '楽'))); + /// + /// assert_eq!(chars.offset(), 4); + /// assert_eq!(chars.next(), None); + /// ``` + #[inline] + #[must_use] + #[unstable(feature = "char_indices_offset", issue = "83871")] + pub fn offset(&self) -> usize { + self.front_offset + } +} + +/// An iterator over the bytes of a string slice. +/// +/// This struct is created by the [`bytes`] method on [`str`]. +/// See its documentation for more. +/// +/// [`bytes`]: str::bytes +#[must_use = "iterators are lazy and do nothing unless consumed"] +#[stable(feature = "rust1", since = "1.0.0")] +#[derive(Clone, Debug)] +pub struct Bytes<'a>(pub(super) Copied>); + +#[stable(feature = "rust1", since = "1.0.0")] +impl Iterator for Bytes<'_> { + type Item = u8; + + #[inline] + fn next(&mut self) -> Option { + self.0.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.0.size_hint() + } + + #[inline] + fn count(self) -> usize { + self.0.count() + } + + #[inline] + fn last(self) -> Option { + self.0.last() + } + + #[inline] + fn nth(&mut self, n: usize) -> Option { + self.0.nth(n) + } + + #[inline] + fn all(&mut self, f: F) -> bool + where + F: FnMut(Self::Item) -> bool, + { + self.0.all(f) + } + + #[inline] + fn any(&mut self, f: F) -> bool + where + F: FnMut(Self::Item) -> bool, + { + self.0.any(f) + } + + #[inline] + fn find

(&mut self, predicate: P) -> Option + where + P: FnMut(&Self::Item) -> bool, + { + self.0.find(predicate) + } + + #[inline] + fn position

(&mut self, predicate: P) -> Option + where + P: FnMut(Self::Item) -> bool, + { + self.0.position(predicate) + } + + #[inline] + fn rposition

(&mut self, predicate: P) -> Option + where + P: FnMut(Self::Item) -> bool, + { + self.0.rposition(predicate) + } + + #[inline] + unsafe fn __iterator_get_unchecked(&mut self, idx: usize) -> u8 { + // SAFETY: the caller must uphold the safety contract + // for `Iterator::__iterator_get_unchecked`. + unsafe { self.0.__iterator_get_unchecked(idx) } + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl DoubleEndedIterator for Bytes<'_> { + #[inline] + fn next_back(&mut self) -> Option { + self.0.next_back() + } + + #[inline] + fn nth_back(&mut self, n: usize) -> Option { + self.0.nth_back(n) + } + + #[inline] + fn rfind

(&mut self, predicate: P) -> Option + where + P: FnMut(&Self::Item) -> bool, + { + self.0.rfind(predicate) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl ExactSizeIterator for Bytes<'_> { + #[inline] + fn len(&self) -> usize { + self.0.len() + } + + #[inline] + fn is_empty(&self) -> bool { + self.0.is_empty() + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for Bytes<'_> {} + +#[unstable(feature = "trusted_len", issue = "37572")] +unsafe impl TrustedLen for Bytes<'_> {} + +#[doc(hidden)] +#[unstable(feature = "trusted_random_access", issue = "none")] +unsafe impl TrustedRandomAccess for Bytes<'_> {} + +#[doc(hidden)] +#[unstable(feature = "trusted_random_access", issue = "none")] +unsafe impl TrustedRandomAccessNoCoerce for Bytes<'_> { + const MAY_HAVE_SIDE_EFFECT: bool = false; +} + +/// This macro generates a Clone impl for string pattern API +/// wrapper types of the form X<'a, P> +macro_rules! derive_pattern_clone { + (clone $t:ident with |$s:ident| $e:expr) => { + impl<'a, P> Clone for $t<'a, P> + where + P: Pattern<'a, Searcher: Clone>, + { + fn clone(&self) -> Self { + let $s = self; + $e + } + } + }; +} + +/// This macro generates two public iterator structs +/// wrapping a private internal one that makes use of the `Pattern` API. +/// +/// For all patterns `P: Pattern<'a>` the following items will be +/// generated (generics omitted): +/// +/// struct $forward_iterator($internal_iterator); +/// struct $reverse_iterator($internal_iterator); +/// +/// impl Iterator for $forward_iterator +/// { /* internal ends up calling Searcher::next_match() */ } +/// +/// impl DoubleEndedIterator for $forward_iterator +/// where P::Searcher: DoubleEndedSearcher +/// { /* internal ends up calling Searcher::next_match_back() */ } +/// +/// impl Iterator for $reverse_iterator +/// where P::Searcher: ReverseSearcher +/// { /* internal ends up calling Searcher::next_match_back() */ } +/// +/// impl DoubleEndedIterator for $reverse_iterator +/// where P::Searcher: DoubleEndedSearcher +/// { /* internal ends up calling Searcher::next_match() */ } +/// +/// The internal one is defined outside the macro, and has almost the same +/// semantic as a DoubleEndedIterator by delegating to `pattern::Searcher` and +/// `pattern::ReverseSearcher` for both forward and reverse iteration. +/// +/// "Almost", because a `Searcher` and a `ReverseSearcher` for a given +/// `Pattern` might not return the same elements, so actually implementing +/// `DoubleEndedIterator` for it would be incorrect. +/// (See the docs in `str::pattern` for more details) +/// +/// However, the internal struct still represents a single ended iterator from +/// either end, and depending on pattern is also a valid double ended iterator, +/// so the two wrapper structs implement `Iterator` +/// and `DoubleEndedIterator` depending on the concrete pattern type, leading +/// to the complex impls seen above. +macro_rules! generate_pattern_iterators { + { + // Forward iterator + forward: + $(#[$forward_iterator_attribute:meta])* + struct $forward_iterator:ident; + + // Reverse iterator + reverse: + $(#[$reverse_iterator_attribute:meta])* + struct $reverse_iterator:ident; + + // Stability of all generated items + stability: + $(#[$common_stability_attribute:meta])* + + // Internal almost-iterator that is being delegated to + internal: + $internal_iterator:ident yielding ($iterty:ty); + + // Kind of delegation - either single ended or double ended + delegate $($t:tt)* + } => { + $(#[$forward_iterator_attribute])* + $(#[$common_stability_attribute])* + pub struct $forward_iterator<'a, P: Pattern<'a>>(pub(super) $internal_iterator<'a, P>); + + $(#[$common_stability_attribute])* + impl<'a, P> fmt::Debug for $forward_iterator<'a, P> + where + P: Pattern<'a, Searcher: fmt::Debug>, + { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple(stringify!($forward_iterator)) + .field(&self.0) + .finish() + } + } + + $(#[$common_stability_attribute])* + impl<'a, P: Pattern<'a>> Iterator for $forward_iterator<'a, P> { + type Item = $iterty; + + #[inline] + fn next(&mut self) -> Option<$iterty> { + self.0.next() + } + } + + $(#[$common_stability_attribute])* + impl<'a, P> Clone for $forward_iterator<'a, P> + where + P: Pattern<'a, Searcher: Clone>, + { + fn clone(&self) -> Self { + $forward_iterator(self.0.clone()) + } + } + + $(#[$reverse_iterator_attribute])* + $(#[$common_stability_attribute])* + pub struct $reverse_iterator<'a, P: Pattern<'a>>(pub(super) $internal_iterator<'a, P>); + + $(#[$common_stability_attribute])* + impl<'a, P> fmt::Debug for $reverse_iterator<'a, P> + where + P: Pattern<'a, Searcher: fmt::Debug>, + { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple(stringify!($reverse_iterator)) + .field(&self.0) + .finish() + } + } + + $(#[$common_stability_attribute])* + impl<'a, P> Iterator for $reverse_iterator<'a, P> + where + P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + { + type Item = $iterty; + + #[inline] + fn next(&mut self) -> Option<$iterty> { + self.0.next_back() + } + } + + $(#[$common_stability_attribute])* + impl<'a, P> Clone for $reverse_iterator<'a, P> + where + P: Pattern<'a, Searcher: Clone>, + { + fn clone(&self) -> Self { + $reverse_iterator(self.0.clone()) + } + } + + #[stable(feature = "fused", since = "1.26.0")] + impl<'a, P: Pattern<'a>> FusedIterator for $forward_iterator<'a, P> {} + + #[stable(feature = "fused", since = "1.26.0")] + impl<'a, P> FusedIterator for $reverse_iterator<'a, P> + where + P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + {} + + generate_pattern_iterators!($($t)* with $(#[$common_stability_attribute])*, + $forward_iterator, + $reverse_iterator, $iterty); + }; + { + double ended; with $(#[$common_stability_attribute:meta])*, + $forward_iterator:ident, + $reverse_iterator:ident, $iterty:ty + } => { + $(#[$common_stability_attribute])* + impl<'a, P> DoubleEndedIterator for $forward_iterator<'a, P> + where + P: Pattern<'a, Searcher: DoubleEndedSearcher<'a>>, + { + #[inline] + fn next_back(&mut self) -> Option<$iterty> { + self.0.next_back() + } + } + + $(#[$common_stability_attribute])* + impl<'a, P> DoubleEndedIterator for $reverse_iterator<'a, P> + where + P: Pattern<'a, Searcher: DoubleEndedSearcher<'a>>, + { + #[inline] + fn next_back(&mut self) -> Option<$iterty> { + self.0.next() + } + } + }; + { + single ended; with $(#[$common_stability_attribute:meta])*, + $forward_iterator:ident, + $reverse_iterator:ident, $iterty:ty + } => {} +} + +derive_pattern_clone! { + clone SplitInternal + with |s| SplitInternal { matcher: s.matcher.clone(), ..*s } +} + +pub(super) struct SplitInternal<'a, P: Pattern<'a>> { + pub(super) start: usize, + pub(super) end: usize, + pub(super) matcher: P::Searcher, + pub(super) allow_trailing_empty: bool, + pub(super) finished: bool, +} + +impl<'a, P> fmt::Debug for SplitInternal<'a, P> +where + P: Pattern<'a, Searcher: fmt::Debug>, +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SplitInternal") + .field("start", &self.start) + .field("end", &self.end) + .field("matcher", &self.matcher) + .field("allow_trailing_empty", &self.allow_trailing_empty) + .field("finished", &self.finished) + .finish() + } +} + +impl<'a, P: Pattern<'a>> SplitInternal<'a, P> { + #[inline] + fn get_end(&mut self) -> Option<&'a str> { + if !self.finished && (self.allow_trailing_empty || self.end - self.start > 0) { + self.finished = true; + // SAFETY: `self.start` and `self.end` always lie on unicode boundaries. + unsafe { + let string = self.matcher.haystack().get_unchecked(self.start..self.end); + Some(string) + } + } else { + None + } + } + + #[inline] + fn next(&mut self) -> Option<&'a str> { + if self.finished { + return None; + } + + let haystack = self.matcher.haystack(); + match self.matcher.next_match() { + // SAFETY: `Searcher` guarantees that `a` and `b` lie on unicode boundaries. + Some((a, b)) => unsafe { + let elt = haystack.get_unchecked(self.start..a); + self.start = b; + Some(elt) + }, + None => self.get_end(), + } + } + + #[inline] + fn next_inclusive(&mut self) -> Option<&'a str> { + if self.finished { + return None; + } + + let haystack = self.matcher.haystack(); + match self.matcher.next_match() { + // SAFETY: `Searcher` guarantees that `b` lies on unicode boundary, + // and self.start is either the start of the original string, + // or `b` was assigned to it, so it also lies on unicode boundary. + Some((_, b)) => unsafe { + let elt = haystack.get_unchecked(self.start..b); + self.start = b; + Some(elt) + }, + None => self.get_end(), + } + } + + #[inline] + fn next_back(&mut self) -> Option<&'a str> + where + P::Searcher: ReverseSearcher<'a>, + { + if self.finished { + return None; + } + + if !self.allow_trailing_empty { + self.allow_trailing_empty = true; + match self.next_back() { + Some(elt) if !elt.is_empty() => return Some(elt), + _ => { + if self.finished { + return None; + } + } + } + } + + let haystack = self.matcher.haystack(); + match self.matcher.next_match_back() { + // SAFETY: `Searcher` guarantees that `a` and `b` lie on unicode boundaries. + Some((a, b)) => unsafe { + let elt = haystack.get_unchecked(b..self.end); + self.end = a; + Some(elt) + }, + // SAFETY: `self.start` and `self.end` always lie on unicode boundaries. + None => unsafe { + self.finished = true; + Some(haystack.get_unchecked(self.start..self.end)) + }, + } + } + + #[inline] + fn next_back_inclusive(&mut self) -> Option<&'a str> + where + P::Searcher: ReverseSearcher<'a>, + { + if self.finished { + return None; + } + + if !self.allow_trailing_empty { + self.allow_trailing_empty = true; + match self.next_back_inclusive() { + Some(elt) if !elt.is_empty() => return Some(elt), + _ => { + if self.finished { + return None; + } + } + } + } + + let haystack = self.matcher.haystack(); + match self.matcher.next_match_back() { + // SAFETY: `Searcher` guarantees that `b` lies on unicode boundary, + // and self.end is either the end of the original string, + // or `b` was assigned to it, so it also lies on unicode boundary. + Some((_, b)) => unsafe { + let elt = haystack.get_unchecked(b..self.end); + self.end = b; + Some(elt) + }, + // SAFETY: self.start is either the start of the original string, + // or start of a substring that represents the part of the string that hasn't + // iterated yet. Either way, it is guaranteed to lie on unicode boundary. + // self.end is either the end of the original string, + // or `b` was assigned to it, so it also lies on unicode boundary. + None => unsafe { + self.finished = true; + Some(haystack.get_unchecked(self.start..self.end)) + }, + } + } + + #[inline] + fn as_str(&self) -> &'a str { + // `Self::get_end` doesn't change `self.start` + if self.finished { + return ""; + } + + // SAFETY: `self.start` and `self.end` always lie on unicode boundaries. + unsafe { self.matcher.haystack().get_unchecked(self.start..self.end) } + } +} + +generate_pattern_iterators! { + forward: + /// Created with the method [`split`]. + /// + /// [`split`]: str::split + struct Split; + reverse: + /// Created with the method [`rsplit`]. + /// + /// [`rsplit`]: str::rsplit + struct RSplit; + stability: + #[stable(feature = "rust1", since = "1.0.0")] + internal: + SplitInternal yielding (&'a str); + delegate double ended; +} + +impl<'a, P: Pattern<'a>> Split<'a, P> { + /// Returns remainder of the split string + /// + /// # Examples + /// + /// ``` + /// #![feature(str_split_as_str)] + /// let mut split = "Mary had a little lamb".split(' '); + /// assert_eq!(split.as_str(), "Mary had a little lamb"); + /// split.next(); + /// assert_eq!(split.as_str(), "had a little lamb"); + /// split.by_ref().for_each(drop); + /// assert_eq!(split.as_str(), ""); + /// ``` + #[inline] + #[unstable(feature = "str_split_as_str", issue = "77998")] + pub fn as_str(&self) -> &'a str { + self.0.as_str() + } +} + +impl<'a, P: Pattern<'a>> RSplit<'a, P> { + /// Returns remainder of the split string + /// + /// # Examples + /// + /// ``` + /// #![feature(str_split_as_str)] + /// let mut split = "Mary had a little lamb".rsplit(' '); + /// assert_eq!(split.as_str(), "Mary had a little lamb"); + /// split.next(); + /// assert_eq!(split.as_str(), "Mary had a little"); + /// split.by_ref().for_each(drop); + /// assert_eq!(split.as_str(), ""); + /// ``` + #[inline] + #[unstable(feature = "str_split_as_str", issue = "77998")] + pub fn as_str(&self) -> &'a str { + self.0.as_str() + } +} + +generate_pattern_iterators! { + forward: + /// Created with the method [`split_terminator`]. + /// + /// [`split_terminator`]: str::split_terminator + struct SplitTerminator; + reverse: + /// Created with the method [`rsplit_terminator`]. + /// + /// [`rsplit_terminator`]: str::rsplit_terminator + struct RSplitTerminator; + stability: + #[stable(feature = "rust1", since = "1.0.0")] + internal: + SplitInternal yielding (&'a str); + delegate double ended; +} + +impl<'a, P: Pattern<'a>> SplitTerminator<'a, P> { + /// Returns remainder of the split string + /// + /// # Examples + /// + /// ``` + /// #![feature(str_split_as_str)] + /// let mut split = "A..B..".split_terminator('.'); + /// assert_eq!(split.as_str(), "A..B.."); + /// split.next(); + /// assert_eq!(split.as_str(), ".B.."); + /// split.by_ref().for_each(drop); + /// assert_eq!(split.as_str(), ""); + /// ``` + #[inline] + #[unstable(feature = "str_split_as_str", issue = "77998")] + pub fn as_str(&self) -> &'a str { + self.0.as_str() + } +} + +impl<'a, P: Pattern<'a>> RSplitTerminator<'a, P> { + /// Returns remainder of the split string + /// + /// # Examples + /// + /// ``` + /// #![feature(str_split_as_str)] + /// let mut split = "A..B..".rsplit_terminator('.'); + /// assert_eq!(split.as_str(), "A..B.."); + /// split.next(); + /// assert_eq!(split.as_str(), "A..B"); + /// split.by_ref().for_each(drop); + /// assert_eq!(split.as_str(), ""); + /// ``` + #[inline] + #[unstable(feature = "str_split_as_str", issue = "77998")] + pub fn as_str(&self) -> &'a str { + self.0.as_str() + } +} + +derive_pattern_clone! { + clone SplitNInternal + with |s| SplitNInternal { iter: s.iter.clone(), ..*s } +} + +pub(super) struct SplitNInternal<'a, P: Pattern<'a>> { + pub(super) iter: SplitInternal<'a, P>, + /// The number of splits remaining + pub(super) count: usize, +} + +impl<'a, P> fmt::Debug for SplitNInternal<'a, P> +where + P: Pattern<'a, Searcher: fmt::Debug>, +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SplitNInternal") + .field("iter", &self.iter) + .field("count", &self.count) + .finish() + } +} + +impl<'a, P: Pattern<'a>> SplitNInternal<'a, P> { + #[inline] + fn next(&mut self) -> Option<&'a str> { + match self.count { + 0 => None, + 1 => { + self.count = 0; + self.iter.get_end() + } + _ => { + self.count -= 1; + self.iter.next() + } + } + } + + #[inline] + fn next_back(&mut self) -> Option<&'a str> + where + P::Searcher: ReverseSearcher<'a>, + { + match self.count { + 0 => None, + 1 => { + self.count = 0; + self.iter.get_end() + } + _ => { + self.count -= 1; + self.iter.next_back() + } + } + } + + #[inline] + fn as_str(&self) -> &'a str { + self.iter.as_str() + } +} + +generate_pattern_iterators! { + forward: + /// Created with the method [`splitn`]. + /// + /// [`splitn`]: str::splitn + struct SplitN; + reverse: + /// Created with the method [`rsplitn`]. + /// + /// [`rsplitn`]: str::rsplitn + struct RSplitN; + stability: + #[stable(feature = "rust1", since = "1.0.0")] + internal: + SplitNInternal yielding (&'a str); + delegate single ended; +} + +impl<'a, P: Pattern<'a>> SplitN<'a, P> { + /// Returns remainder of the split string + /// + /// # Examples + /// + /// ``` + /// #![feature(str_split_as_str)] + /// let mut split = "Mary had a little lamb".splitn(3, ' '); + /// assert_eq!(split.as_str(), "Mary had a little lamb"); + /// split.next(); + /// assert_eq!(split.as_str(), "had a little lamb"); + /// split.by_ref().for_each(drop); + /// assert_eq!(split.as_str(), ""); + /// ``` + #[inline] + #[unstable(feature = "str_split_as_str", issue = "77998")] + pub fn as_str(&self) -> &'a str { + self.0.as_str() + } +} + +impl<'a, P: Pattern<'a>> RSplitN<'a, P> { + /// Returns remainder of the split string + /// + /// # Examples + /// + /// ``` + /// #![feature(str_split_as_str)] + /// let mut split = "Mary had a little lamb".rsplitn(3, ' '); + /// assert_eq!(split.as_str(), "Mary had a little lamb"); + /// split.next(); + /// assert_eq!(split.as_str(), "Mary had a little"); + /// split.by_ref().for_each(drop); + /// assert_eq!(split.as_str(), ""); + /// ``` + #[inline] + #[unstable(feature = "str_split_as_str", issue = "77998")] + pub fn as_str(&self) -> &'a str { + self.0.as_str() + } +} + +derive_pattern_clone! { + clone MatchIndicesInternal + with |s| MatchIndicesInternal(s.0.clone()) +} + +pub(super) struct MatchIndicesInternal<'a, P: Pattern<'a>>(pub(super) P::Searcher); + +impl<'a, P> fmt::Debug for MatchIndicesInternal<'a, P> +where + P: Pattern<'a, Searcher: fmt::Debug>, +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("MatchIndicesInternal").field(&self.0).finish() + } +} + +impl<'a, P: Pattern<'a>> MatchIndicesInternal<'a, P> { + #[inline] + fn next(&mut self) -> Option<(usize, &'a str)> { + self.0 + .next_match() + // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. + .map(|(start, end)| unsafe { (start, self.0.haystack().get_unchecked(start..end)) }) + } + + #[inline] + fn next_back(&mut self) -> Option<(usize, &'a str)> + where + P::Searcher: ReverseSearcher<'a>, + { + self.0 + .next_match_back() + // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. + .map(|(start, end)| unsafe { (start, self.0.haystack().get_unchecked(start..end)) }) + } +} + +generate_pattern_iterators! { + forward: + /// Created with the method [`match_indices`]. + /// + /// [`match_indices`]: str::match_indices + struct MatchIndices; + reverse: + /// Created with the method [`rmatch_indices`]. + /// + /// [`rmatch_indices`]: str::rmatch_indices + struct RMatchIndices; + stability: + #[stable(feature = "str_match_indices", since = "1.5.0")] + internal: + MatchIndicesInternal yielding ((usize, &'a str)); + delegate double ended; +} + +derive_pattern_clone! { + clone MatchesInternal + with |s| MatchesInternal(s.0.clone()) +} + +pub(super) struct MatchesInternal<'a, P: Pattern<'a>>(pub(super) P::Searcher); + +impl<'a, P> fmt::Debug for MatchesInternal<'a, P> +where + P: Pattern<'a, Searcher: fmt::Debug>, +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("MatchesInternal").field(&self.0).finish() + } +} + +impl<'a, P: Pattern<'a>> MatchesInternal<'a, P> { + #[inline] + fn next(&mut self) -> Option<&'a str> { + // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. + self.0.next_match().map(|(a, b)| unsafe { + // Indices are known to be on utf8 boundaries + self.0.haystack().get_unchecked(a..b) + }) + } + + #[inline] + fn next_back(&mut self) -> Option<&'a str> + where + P::Searcher: ReverseSearcher<'a>, + { + // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. + self.0.next_match_back().map(|(a, b)| unsafe { + // Indices are known to be on utf8 boundaries + self.0.haystack().get_unchecked(a..b) + }) + } +} + +generate_pattern_iterators! { + forward: + /// Created with the method [`matches`]. + /// + /// [`matches`]: str::matches + struct Matches; + reverse: + /// Created with the method [`rmatches`]. + /// + /// [`rmatches`]: str::rmatches + struct RMatches; + stability: + #[stable(feature = "str_matches", since = "1.2.0")] + internal: + MatchesInternal yielding (&'a str); + delegate double ended; +} + +/// An iterator over the lines of a string, as string slices. +/// +/// This struct is created with the [`lines`] method on [`str`]. +/// See its documentation for more. +/// +/// [`lines`]: str::lines +#[stable(feature = "rust1", since = "1.0.0")] +#[must_use = "iterators are lazy and do nothing unless consumed"] +#[derive(Clone, Debug)] +pub struct Lines<'a>(pub(super) Map, LinesAnyMap>); + +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> Iterator for Lines<'a> { + type Item = &'a str; + + #[inline] + fn next(&mut self) -> Option<&'a str> { + self.0.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.0.size_hint() + } + + #[inline] + fn last(mut self) -> Option<&'a str> { + self.next_back() + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> DoubleEndedIterator for Lines<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + self.0.next_back() + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for Lines<'_> {} + +/// Created with the method [`lines_any`]. +/// +/// [`lines_any`]: str::lines_any +#[stable(feature = "rust1", since = "1.0.0")] +#[deprecated(since = "1.4.0", note = "use lines()/Lines instead now")] +#[must_use = "iterators are lazy and do nothing unless consumed"] +#[derive(Clone, Debug)] +#[allow(deprecated)] +pub struct LinesAny<'a>(pub(super) Lines<'a>); + +#[stable(feature = "rust1", since = "1.0.0")] +#[allow(deprecated)] +impl<'a> Iterator for LinesAny<'a> { + type Item = &'a str; + + #[inline] + fn next(&mut self) -> Option<&'a str> { + self.0.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.0.size_hint() + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +#[allow(deprecated)] +impl<'a> DoubleEndedIterator for LinesAny<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + self.0.next_back() + } +} + +#[stable(feature = "fused", since = "1.26.0")] +#[allow(deprecated)] +impl FusedIterator for LinesAny<'_> {} + +/// An iterator over the non-whitespace substrings of a string, +/// separated by any amount of whitespace. +/// +/// This struct is created by the [`split_whitespace`] method on [`str`]. +/// See its documentation for more. +/// +/// [`split_whitespace`]: str::split_whitespace +#[stable(feature = "split_whitespace", since = "1.1.0")] +#[derive(Clone, Debug)] +pub struct SplitWhitespace<'a> { + pub(super) inner: Filter, IsNotEmpty>, +} + +/// An iterator over the non-ASCII-whitespace substrings of a string, +/// separated by any amount of ASCII whitespace. +/// +/// This struct is created by the [`split_ascii_whitespace`] method on [`str`]. +/// See its documentation for more. +/// +/// [`split_ascii_whitespace`]: str::split_ascii_whitespace +#[stable(feature = "split_ascii_whitespace", since = "1.34.0")] +#[derive(Clone, Debug)] +pub struct SplitAsciiWhitespace<'a> { + pub(super) inner: + Map, BytesIsNotEmpty>, UnsafeBytesToStr>, +} + +/// An iterator over the substrings of a string, +/// terminated by a substring matching to a predicate function +/// Unlike `Split`, it contains the matched part as a terminator +/// of the subslice. +/// +/// This struct is created by the [`split_inclusive`] method on [`str`]. +/// See its documentation for more. +/// +/// [`split_inclusive`]: str::split_inclusive +#[stable(feature = "split_inclusive", since = "1.51.0")] +pub struct SplitInclusive<'a, P: Pattern<'a>>(pub(super) SplitInternal<'a, P>); + +#[stable(feature = "split_whitespace", since = "1.1.0")] +impl<'a> Iterator for SplitWhitespace<'a> { + type Item = &'a str; + + #[inline] + fn next(&mut self) -> Option<&'a str> { + self.inner.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } + + #[inline] + fn last(mut self) -> Option<&'a str> { + self.next_back() + } +} + +#[stable(feature = "split_whitespace", since = "1.1.0")] +impl<'a> DoubleEndedIterator for SplitWhitespace<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + self.inner.next_back() + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for SplitWhitespace<'_> {} + +impl<'a> SplitWhitespace<'a> { + /// Returns remainder of the split string + /// + /// # Examples + /// + /// ``` + /// #![feature(str_split_whitespace_as_str)] + /// + /// let mut split = "Mary had a little lamb".split_whitespace(); + /// assert_eq!(split.as_str(), "Mary had a little lamb"); + /// + /// split.next(); + /// assert_eq!(split.as_str(), "had a little lamb"); + /// + /// split.by_ref().for_each(drop); + /// assert_eq!(split.as_str(), ""); + /// ``` + #[inline] + #[must_use] + #[unstable(feature = "str_split_whitespace_as_str", issue = "77998")] + pub fn as_str(&self) -> &'a str { + self.inner.iter.as_str() + } +} + +#[stable(feature = "split_ascii_whitespace", since = "1.34.0")] +impl<'a> Iterator for SplitAsciiWhitespace<'a> { + type Item = &'a str; + + #[inline] + fn next(&mut self) -> Option<&'a str> { + self.inner.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } + + #[inline] + fn last(mut self) -> Option<&'a str> { + self.next_back() + } +} + +#[stable(feature = "split_ascii_whitespace", since = "1.34.0")] +impl<'a> DoubleEndedIterator for SplitAsciiWhitespace<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + self.inner.next_back() + } +} + +#[stable(feature = "split_ascii_whitespace", since = "1.34.0")] +impl FusedIterator for SplitAsciiWhitespace<'_> {} + +impl<'a> SplitAsciiWhitespace<'a> { + /// Returns remainder of the split string + /// + /// # Examples + /// + /// ``` + /// #![feature(str_split_whitespace_as_str)] + /// + /// let mut split = "Mary had a little lamb".split_ascii_whitespace(); + /// assert_eq!(split.as_str(), "Mary had a little lamb"); + /// + /// split.next(); + /// assert_eq!(split.as_str(), "had a little lamb"); + /// + /// split.by_ref().for_each(drop); + /// assert_eq!(split.as_str(), ""); + /// ``` + #[inline] + #[must_use] + #[unstable(feature = "str_split_whitespace_as_str", issue = "77998")] + pub fn as_str(&self) -> &'a str { + if self.inner.iter.iter.finished { + return ""; + } + + // SAFETY: Slice is created from str. + unsafe { crate::str::from_utf8_unchecked(&self.inner.iter.iter.v) } + } +} + +#[stable(feature = "split_inclusive", since = "1.51.0")] +impl<'a, P: Pattern<'a>> Iterator for SplitInclusive<'a, P> { + type Item = &'a str; + + #[inline] + fn next(&mut self) -> Option<&'a str> { + self.0.next_inclusive() + } +} + +#[stable(feature = "split_inclusive", since = "1.51.0")] +impl<'a, P: Pattern<'a, Searcher: fmt::Debug>> fmt::Debug for SplitInclusive<'a, P> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SplitInclusive").field("0", &self.0).finish() + } +} + +// FIXME(#26925) Remove in favor of `#[derive(Clone)]` +#[stable(feature = "split_inclusive", since = "1.51.0")] +impl<'a, P: Pattern<'a, Searcher: Clone>> Clone for SplitInclusive<'a, P> { + fn clone(&self) -> Self { + SplitInclusive(self.0.clone()) + } +} + +#[stable(feature = "split_inclusive", since = "1.51.0")] +impl<'a, P: Pattern<'a, Searcher: ReverseSearcher<'a>>> DoubleEndedIterator + for SplitInclusive<'a, P> +{ + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + self.0.next_back_inclusive() + } +} + +#[stable(feature = "split_inclusive", since = "1.51.0")] +impl<'a, P: Pattern<'a>> FusedIterator for SplitInclusive<'a, P> {} + +impl<'a, P: Pattern<'a>> SplitInclusive<'a, P> { + /// Returns remainder of the split string + /// + /// # Examples + /// + /// ``` + /// #![feature(str_split_inclusive_as_str)] + /// let mut split = "Mary had a little lamb".split_inclusive(' '); + /// assert_eq!(split.as_str(), "Mary had a little lamb"); + /// split.next(); + /// assert_eq!(split.as_str(), "had a little lamb"); + /// split.by_ref().for_each(drop); + /// assert_eq!(split.as_str(), ""); + /// ``` + #[inline] + #[unstable(feature = "str_split_inclusive_as_str", issue = "77998")] + pub fn as_str(&self) -> &'a str { + self.0.as_str() + } +} + +/// An iterator of [`u16`] over the string encoded as UTF-16. +/// +/// This struct is created by the [`encode_utf16`] method on [`str`]. +/// See its documentation for more. +/// +/// [`encode_utf16`]: str::encode_utf16 +#[derive(Clone)] +#[stable(feature = "encode_utf16", since = "1.8.0")] +pub struct EncodeUtf16<'a> { + pub(super) chars: Chars<'a>, + pub(super) extra: u16, +} + +#[stable(feature = "collection_debug", since = "1.17.0")] +impl fmt::Debug for EncodeUtf16<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("EncodeUtf16").finish_non_exhaustive() + } +} + +#[stable(feature = "encode_utf16", since = "1.8.0")] +impl<'a> Iterator for EncodeUtf16<'a> { + type Item = u16; + + #[inline] + fn next(&mut self) -> Option { + if self.extra != 0 { + let tmp = self.extra; + self.extra = 0; + return Some(tmp); + } + + let mut buf = [0; 2]; + self.chars.next().map(|ch| { + let n = ch.encode_utf16(&mut buf).len(); + if n == 2 { + self.extra = buf[1]; + } + buf[0] + }) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let (low, high) = self.chars.size_hint(); + // every char gets either one u16 or two u16, + // so this iterator is between 1 or 2 times as + // long as the underlying iterator. + (low, high.and_then(|n| n.checked_mul(2))) + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for EncodeUtf16<'_> {} + +/// The return type of [`str::escape_debug`]. +#[stable(feature = "str_escape", since = "1.34.0")] +#[derive(Clone, Debug)] +pub struct EscapeDebug<'a> { + pub(super) inner: Chain< + Flatten>, + FlatMap, char::EscapeDebug, CharEscapeDebugContinue>, + >, +} + +/// The return type of [`str::escape_default`]. +#[stable(feature = "str_escape", since = "1.34.0")] +#[derive(Clone, Debug)] +pub struct EscapeDefault<'a> { + pub(super) inner: FlatMap, char::EscapeDefault, CharEscapeDefault>, +} + +/// The return type of [`str::escape_unicode`]. +#[stable(feature = "str_escape", since = "1.34.0")] +#[derive(Clone, Debug)] +pub struct EscapeUnicode<'a> { + pub(super) inner: FlatMap, char::EscapeUnicode, CharEscapeUnicode>, +} + +macro_rules! escape_types_impls { + ($( $Name: ident ),+) => {$( + #[stable(feature = "str_escape", since = "1.34.0")] + impl<'a> fmt::Display for $Name<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.clone().try_for_each(|c| f.write_char(c)) + } + } + + #[stable(feature = "str_escape", since = "1.34.0")] + impl<'a> Iterator for $Name<'a> { + type Item = char; + + #[inline] + fn next(&mut self) -> Option { self.inner.next() } + + #[inline] + fn size_hint(&self) -> (usize, Option) { self.inner.size_hint() } + + #[inline] + fn try_fold(&mut self, init: Acc, fold: Fold) -> R where + Self: Sized, Fold: FnMut(Acc, Self::Item) -> R, R: Try + { + self.inner.try_fold(init, fold) + } + + #[inline] + fn fold(self, init: Acc, fold: Fold) -> Acc + where Fold: FnMut(Acc, Self::Item) -> Acc, + { + self.inner.fold(init, fold) + } + } + + #[stable(feature = "str_escape", since = "1.34.0")] + impl<'a> FusedIterator for $Name<'a> {} + )+} +} + +escape_types_impls!(EscapeDebug, EscapeDefault, EscapeUnicode); diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs new file mode 100644 index 000000000..6ec1c9390 --- /dev/null +++ b/library/core/src/str/lossy.rs @@ -0,0 +1,200 @@ +use crate::char; +use crate::fmt::{self, Write}; +use crate::mem; + +use super::from_utf8_unchecked; +use super::validations::utf8_char_width; + +/// Lossy UTF-8 string. +#[unstable(feature = "str_internals", issue = "none")] +pub struct Utf8Lossy { + bytes: [u8], +} + +impl Utf8Lossy { + #[must_use] + pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy { + // SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required. + unsafe { mem::transmute(bytes) } + } + + pub fn chunks(&self) -> Utf8LossyChunksIter<'_> { + Utf8LossyChunksIter { source: &self.bytes } + } +} + +/// Iterator over lossy UTF-8 string +#[must_use = "iterators are lazy and do nothing unless consumed"] +#[unstable(feature = "str_internals", issue = "none")] +#[allow(missing_debug_implementations)] +pub struct Utf8LossyChunksIter<'a> { + source: &'a [u8], +} + +#[unstable(feature = "str_internals", issue = "none")] +#[derive(PartialEq, Eq, Debug)] +pub struct Utf8LossyChunk<'a> { + /// Sequence of valid chars. + /// Can be empty between broken UTF-8 chars. + pub valid: &'a str, + /// Single broken char, empty if none. + /// Empty iff iterator item is last. + pub broken: &'a [u8], +} + +impl<'a> Iterator for Utf8LossyChunksIter<'a> { + type Item = Utf8LossyChunk<'a>; + + fn next(&mut self) -> Option> { + if self.source.is_empty() { + return None; + } + + const TAG_CONT_U8: u8 = 128; + fn safe_get(xs: &[u8], i: usize) -> u8 { + *xs.get(i).unwrap_or(&0) + } + + let mut i = 0; + let mut valid_up_to = 0; + while i < self.source.len() { + // SAFETY: `i < self.source.len()` per previous line. + // For some reason the following are both significantly slower: + // while let Some(&byte) = self.source.get(i) { + // while let Some(byte) = self.source.get(i).copied() { + let byte = unsafe { *self.source.get_unchecked(i) }; + i += 1; + + if byte < 128 { + // This could be a `1 => ...` case in the match below, but for + // the common case of all-ASCII inputs, we bypass loading the + // sizeable UTF8_CHAR_WIDTH table into cache. + } else { + let w = utf8_char_width(byte); + + match w { + 2 => { + if safe_get(self.source, i) & 192 != TAG_CONT_U8 { + break; + } + i += 1; + } + 3 => { + match (byte, safe_get(self.source, i)) { + (0xE0, 0xA0..=0xBF) => (), + (0xE1..=0xEC, 0x80..=0xBF) => (), + (0xED, 0x80..=0x9F) => (), + (0xEE..=0xEF, 0x80..=0xBF) => (), + _ => break, + } + i += 1; + if safe_get(self.source, i) & 192 != TAG_CONT_U8 { + break; + } + i += 1; + } + 4 => { + match (byte, safe_get(self.source, i)) { + (0xF0, 0x90..=0xBF) => (), + (0xF1..=0xF3, 0x80..=0xBF) => (), + (0xF4, 0x80..=0x8F) => (), + _ => break, + } + i += 1; + if safe_get(self.source, i) & 192 != TAG_CONT_U8 { + break; + } + i += 1; + if safe_get(self.source, i) & 192 != TAG_CONT_U8 { + break; + } + i += 1; + } + _ => break, + } + } + + valid_up_to = i; + } + + // SAFETY: `i <= self.source.len()` because it is only ever incremented + // via `i += 1` and in between every single one of those increments, `i` + // is compared against `self.source.len()`. That happens either + // literally by `i < self.source.len()` in the while-loop's condition, + // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The + // loop is terminated as soon as the latest `i += 1` has made `i` no + // longer less than `self.source.len()`, which means it'll be at most + // equal to `self.source.len()`. + let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) }; + self.source = remaining; + + // SAFETY: `valid_up_to <= i` because it is only ever assigned via + // `valid_up_to = i` and `i` only increases. + let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) }; + + Some(Utf8LossyChunk { + // SAFETY: All bytes up to `valid_up_to` are valid UTF-8. + valid: unsafe { from_utf8_unchecked(valid) }, + broken, + }) + } +} + +impl fmt::Display for Utf8Lossy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // If we're the empty string then our iterator won't actually yield + // anything, so perform the formatting manually + if self.bytes.is_empty() { + return "".fmt(f); + } + + for Utf8LossyChunk { valid, broken } in self.chunks() { + // If we successfully decoded the whole chunk as a valid string then + // we can return a direct formatting of the string which will also + // respect various formatting flags if possible. + if valid.len() == self.bytes.len() { + assert!(broken.is_empty()); + return valid.fmt(f); + } + + f.write_str(valid)?; + if !broken.is_empty() { + f.write_char(char::REPLACEMENT_CHARACTER)?; + } + } + Ok(()) + } +} + +impl fmt::Debug for Utf8Lossy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_char('"')?; + + for Utf8LossyChunk { valid, broken } in self.chunks() { + // Valid part. + // Here we partially parse UTF-8 again which is suboptimal. + { + let mut from = 0; + for (i, c) in valid.char_indices() { + let esc = c.escape_debug(); + // If char needs escaping, flush backlog so far and write, else skip + if esc.len() != 1 { + f.write_str(&valid[from..i])?; + for c in esc { + f.write_char(c)?; + } + from = i + c.len_utf8(); + } + } + f.write_str(&valid[from..])?; + } + + // Broken parts of string as hex escape. + for &b in broken { + write!(f, "\\x{:02x}", b)?; + } + } + + f.write_char('"') + } +} diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs new file mode 100644 index 000000000..c4f2e283e --- /dev/null +++ b/library/core/src/str/mod.rs @@ -0,0 +1,2640 @@ +//! String manipulation. +//! +//! For more details, see the [`std::str`] module. +//! +//! [`std::str`]: ../../std/str/index.html + +#![stable(feature = "rust1", since = "1.0.0")] + +mod converts; +mod count; +mod error; +mod iter; +mod traits; +mod validations; + +use self::pattern::Pattern; +use self::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; + +use crate::char::{self, EscapeDebugExtArgs}; +use crate::mem; +use crate::slice::{self, SliceIndex}; + +pub mod pattern; + +#[unstable(feature = "str_internals", issue = "none")] +#[allow(missing_docs)] +pub mod lossy; + +#[stable(feature = "rust1", since = "1.0.0")] +pub use converts::{from_utf8, from_utf8_unchecked}; + +#[stable(feature = "str_mut_extras", since = "1.20.0")] +pub use converts::{from_utf8_mut, from_utf8_unchecked_mut}; + +#[stable(feature = "rust1", since = "1.0.0")] +pub use error::{ParseBoolError, Utf8Error}; + +#[stable(feature = "rust1", since = "1.0.0")] +pub use traits::FromStr; + +#[stable(feature = "rust1", since = "1.0.0")] +pub use iter::{Bytes, CharIndices, Chars, Lines, SplitWhitespace}; + +#[stable(feature = "rust1", since = "1.0.0")] +#[allow(deprecated)] +pub use iter::LinesAny; + +#[stable(feature = "rust1", since = "1.0.0")] +pub use iter::{RSplit, RSplitTerminator, Split, SplitTerminator}; + +#[stable(feature = "rust1", since = "1.0.0")] +pub use iter::{RSplitN, SplitN}; + +#[stable(feature = "str_matches", since = "1.2.0")] +pub use iter::{Matches, RMatches}; + +#[stable(feature = "str_match_indices", since = "1.5.0")] +pub use iter::{MatchIndices, RMatchIndices}; + +#[stable(feature = "encode_utf16", since = "1.8.0")] +pub use iter::EncodeUtf16; + +#[stable(feature = "str_escape", since = "1.34.0")] +pub use iter::{EscapeDebug, EscapeDefault, EscapeUnicode}; + +#[stable(feature = "split_ascii_whitespace", since = "1.34.0")] +pub use iter::SplitAsciiWhitespace; + +#[stable(feature = "split_inclusive", since = "1.51.0")] +pub use iter::SplitInclusive; + +#[unstable(feature = "str_internals", issue = "none")] +pub use validations::{next_code_point, utf8_char_width}; + +use iter::MatchIndicesInternal; +use iter::SplitInternal; +use iter::{MatchesInternal, SplitNInternal}; + +#[inline(never)] +#[cold] +#[track_caller] +#[rustc_allow_const_fn_unstable(const_eval_select)] +const fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! { + // SAFETY: panics for both branches + unsafe { + crate::intrinsics::const_eval_select( + (s, begin, end), + slice_error_fail_ct, + slice_error_fail_rt, + ) + } +} + +const fn slice_error_fail_ct(_: &str, _: usize, _: usize) -> ! { + panic!("failed to slice string"); +} + +fn slice_error_fail_rt(s: &str, begin: usize, end: usize) -> ! { + const MAX_DISPLAY_LENGTH: usize = 256; + let trunc_len = s.floor_char_boundary(MAX_DISPLAY_LENGTH); + let s_trunc = &s[..trunc_len]; + let ellipsis = if trunc_len < s.len() { "[...]" } else { "" }; + + // 1. out of bounds + if begin > s.len() || end > s.len() { + let oob_index = if begin > s.len() { begin } else { end }; + panic!("byte index {oob_index} is out of bounds of `{s_trunc}`{ellipsis}"); + } + + // 2. begin <= end + assert!( + begin <= end, + "begin <= end ({} <= {}) when slicing `{}`{}", + begin, + end, + s_trunc, + ellipsis + ); + + // 3. character boundary + let index = if !s.is_char_boundary(begin) { begin } else { end }; + // find the character + let char_start = s.floor_char_boundary(index); + // `char_start` must be less than len and a char boundary + let ch = s[char_start..].chars().next().unwrap(); + let char_range = char_start..char_start + ch.len_utf8(); + panic!( + "byte index {} is not a char boundary; it is inside {:?} (bytes {:?}) of `{}`{}", + index, ch, char_range, s_trunc, ellipsis + ); +} + +#[cfg(not(test))] +impl str { + /// Returns the length of `self`. + /// + /// This length is in bytes, not [`char`]s or graphemes. In other words, + /// it might not be what a human considers the length of the string. + /// + /// [`char`]: prim@char + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let len = "foo".len(); + /// assert_eq!(3, len); + /// + /// assert_eq!("ƒoo".len(), 4); // fancy f! + /// assert_eq!("ƒoo".chars().count(), 3); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[rustc_const_stable(feature = "const_str_len", since = "1.39.0")] + #[must_use] + #[inline] + pub const fn len(&self) -> usize { + self.as_bytes().len() + } + + /// Returns `true` if `self` has a length of zero bytes. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = ""; + /// assert!(s.is_empty()); + /// + /// let s = "not empty"; + /// assert!(!s.is_empty()); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[rustc_const_stable(feature = "const_str_is_empty", since = "1.39.0")] + #[must_use] + #[inline] + pub const fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Checks that `index`-th byte is the first byte in a UTF-8 code point + /// sequence or the end of the string. + /// + /// The start and end of the string (when `index == self.len()`) are + /// considered to be boundaries. + /// + /// Returns `false` if `index` is greater than `self.len()`. + /// + /// # Examples + /// + /// ``` + /// let s = "Löwe 老虎 Léopard"; + /// assert!(s.is_char_boundary(0)); + /// // start of `老` + /// assert!(s.is_char_boundary(6)); + /// assert!(s.is_char_boundary(s.len())); + /// + /// // second byte of `ö` + /// assert!(!s.is_char_boundary(2)); + /// + /// // third byte of `老` + /// assert!(!s.is_char_boundary(8)); + /// ``` + #[must_use] + #[stable(feature = "is_char_boundary", since = "1.9.0")] + #[rustc_const_unstable(feature = "const_is_char_boundary", issue = "none")] + #[inline] + pub const fn is_char_boundary(&self, index: usize) -> bool { + // 0 is always ok. + // Test for 0 explicitly so that it can optimize out the check + // easily and skip reading string data for that case. + // Note that optimizing `self.get(..index)` relies on this. + if index == 0 { + return true; + } + + match self.as_bytes().get(index) { + // For `None` we have two options: + // + // - index == self.len() + // Empty strings are valid, so return true + // - index > self.len() + // In this case return false + // + // The check is placed exactly here, because it improves generated + // code on higher opt-levels. See PR #84751 for more details. + None => index == self.len(), + + Some(&b) => b.is_utf8_char_boundary(), + } + } + + /// Finds the closest `x` not exceeding `index` where `is_char_boundary(x)` is `true`. + /// + /// This method can help you truncate a string so that it's still valid UTF-8, but doesn't + /// exceed a given number of bytes. Note that this is done purely at the character level + /// and can still visually split graphemes, even though the underlying characters aren't + /// split. For example, the emoji 🧑‍🔬 (scientist) could be split so that the string only + /// includes 🧑 (person) instead. + /// + /// # Examples + /// + /// ``` + /// #![feature(round_char_boundary)] + /// let s = "❤️🧡💛💚💙💜"; + /// assert_eq!(s.len(), 26); + /// assert!(!s.is_char_boundary(13)); + /// + /// let closest = s.floor_char_boundary(13); + /// assert_eq!(closest, 10); + /// assert_eq!(&s[..closest], "❤️🧡"); + /// ``` + #[unstable(feature = "round_char_boundary", issue = "93743")] + #[inline] + pub fn floor_char_boundary(&self, index: usize) -> usize { + if index >= self.len() { + self.len() + } else { + let lower_bound = index.saturating_sub(3); + let new_index = self.as_bytes()[lower_bound..=index] + .iter() + .rposition(|b| b.is_utf8_char_boundary()); + + // SAFETY: we know that the character boundary will be within four bytes + unsafe { lower_bound + new_index.unwrap_unchecked() } + } + } + + /// Finds the closest `x` not below `index` where `is_char_boundary(x)` is `true`. + /// + /// This method is the natural complement to [`floor_char_boundary`]. See that method + /// for more details. + /// + /// [`floor_char_boundary`]: str::floor_char_boundary + /// + /// # Panics + /// + /// Panics if `index > self.len()`. + /// + /// # Examples + /// + /// ``` + /// #![feature(round_char_boundary)] + /// let s = "❤️🧡💛💚💙💜"; + /// assert_eq!(s.len(), 26); + /// assert!(!s.is_char_boundary(13)); + /// + /// let closest = s.ceil_char_boundary(13); + /// assert_eq!(closest, 14); + /// assert_eq!(&s[..closest], "❤️🧡💛"); + /// ``` + #[unstable(feature = "round_char_boundary", issue = "93743")] + #[inline] + pub fn ceil_char_boundary(&self, index: usize) -> usize { + if index > self.len() { + slice_error_fail(self, index, index) + } else { + let upper_bound = Ord::min(index + 4, self.len()); + self.as_bytes()[index..upper_bound] + .iter() + .position(|b| b.is_utf8_char_boundary()) + .map_or(upper_bound, |pos| pos + index) + } + } + + /// Converts a string slice to a byte slice. To convert the byte slice back + /// into a string slice, use the [`from_utf8`] function. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let bytes = "bors".as_bytes(); + /// assert_eq!(b"bors", bytes); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[rustc_const_stable(feature = "str_as_bytes", since = "1.39.0")] + #[must_use] + #[inline(always)] + #[allow(unused_attributes)] + pub const fn as_bytes(&self) -> &[u8] { + // SAFETY: const sound because we transmute two types with the same layout + unsafe { mem::transmute(self) } + } + + /// Converts a mutable string slice to a mutable byte slice. + /// + /// # Safety + /// + /// The caller must ensure that the content of the slice is valid UTF-8 + /// before the borrow ends and the underlying `str` is used. + /// + /// Use of a `str` whose contents are not valid UTF-8 is undefined behavior. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = String::from("Hello"); + /// let bytes = unsafe { s.as_bytes_mut() }; + /// + /// assert_eq!(b"Hello", bytes); + /// ``` + /// + /// Mutability: + /// + /// ``` + /// let mut s = String::from("🗻∈🌏"); + /// + /// unsafe { + /// let bytes = s.as_bytes_mut(); + /// + /// bytes[0] = 0xF0; + /// bytes[1] = 0x9F; + /// bytes[2] = 0x8D; + /// bytes[3] = 0x94; + /// } + /// + /// assert_eq!("🍔∈🌏", s); + /// ``` + #[stable(feature = "str_mut_extras", since = "1.20.0")] + #[must_use] + #[inline(always)] + pub unsafe fn as_bytes_mut(&mut self) -> &mut [u8] { + // SAFETY: the cast from `&str` to `&[u8]` is safe since `str` + // has the same layout as `&[u8]` (only libstd can make this guarantee). + // The pointer dereference is safe since it comes from a mutable reference which + // is guaranteed to be valid for writes. + unsafe { &mut *(self as *mut str as *mut [u8]) } + } + + /// Converts a string slice to a raw pointer. + /// + /// As string slices are a slice of bytes, the raw pointer points to a + /// [`u8`]. This pointer will be pointing to the first byte of the string + /// slice. + /// + /// The caller must ensure that the returned pointer is never written to. + /// If you need to mutate the contents of the string slice, use [`as_mut_ptr`]. + /// + /// [`as_mut_ptr`]: str::as_mut_ptr + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = "Hello"; + /// let ptr = s.as_ptr(); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[rustc_const_stable(feature = "rustc_str_as_ptr", since = "1.32.0")] + #[must_use] + #[inline] + pub const fn as_ptr(&self) -> *const u8 { + self as *const str as *const u8 + } + + /// Converts a mutable string slice to a raw pointer. + /// + /// As string slices are a slice of bytes, the raw pointer points to a + /// [`u8`]. This pointer will be pointing to the first byte of the string + /// slice. + /// + /// It is your responsibility to make sure that the string slice only gets + /// modified in a way that it remains valid UTF-8. + #[stable(feature = "str_as_mut_ptr", since = "1.36.0")] + #[must_use] + #[inline] + pub fn as_mut_ptr(&mut self) -> *mut u8 { + self as *mut str as *mut u8 + } + + /// Returns a subslice of `str`. + /// + /// This is the non-panicking alternative to indexing the `str`. Returns + /// [`None`] whenever equivalent indexing operation would panic. + /// + /// # Examples + /// + /// ``` + /// let v = String::from("🗻∈🌏"); + /// + /// assert_eq!(Some("🗻"), v.get(0..4)); + /// + /// // indices not on UTF-8 sequence boundaries + /// assert!(v.get(1..).is_none()); + /// assert!(v.get(..8).is_none()); + /// + /// // out of bounds + /// assert!(v.get(..42).is_none()); + /// ``` + #[stable(feature = "str_checked_slicing", since = "1.20.0")] + #[rustc_const_unstable(feature = "const_slice_index", issue = "none")] + #[inline] + pub const fn get>(&self, i: I) -> Option<&I::Output> { + i.get(self) + } + + /// Returns a mutable subslice of `str`. + /// + /// This is the non-panicking alternative to indexing the `str`. Returns + /// [`None`] whenever equivalent indexing operation would panic. + /// + /// # Examples + /// + /// ``` + /// let mut v = String::from("hello"); + /// // correct length + /// assert!(v.get_mut(0..5).is_some()); + /// // out of bounds + /// assert!(v.get_mut(..42).is_none()); + /// assert_eq!(Some("he"), v.get_mut(0..2).map(|v| &*v)); + /// + /// assert_eq!("hello", v); + /// { + /// let s = v.get_mut(0..2); + /// let s = s.map(|s| { + /// s.make_ascii_uppercase(); + /// &*s + /// }); + /// assert_eq!(Some("HE"), s); + /// } + /// assert_eq!("HEllo", v); + /// ``` + #[stable(feature = "str_checked_slicing", since = "1.20.0")] + #[rustc_const_unstable(feature = "const_slice_index", issue = "none")] + #[inline] + pub const fn get_mut>(&mut self, i: I) -> Option<&mut I::Output> { + i.get_mut(self) + } + + /// Returns an unchecked subslice of `str`. + /// + /// This is the unchecked alternative to indexing the `str`. + /// + /// # Safety + /// + /// Callers of this function are responsible that these preconditions are + /// satisfied: + /// + /// * The starting index must not exceed the ending index; + /// * Indexes must be within bounds of the original slice; + /// * Indexes must lie on UTF-8 sequence boundaries. + /// + /// Failing that, the returned string slice may reference invalid memory or + /// violate the invariants communicated by the `str` type. + /// + /// # Examples + /// + /// ``` + /// let v = "🗻∈🌏"; + /// unsafe { + /// assert_eq!("🗻", v.get_unchecked(0..4)); + /// assert_eq!("∈", v.get_unchecked(4..7)); + /// assert_eq!("🌏", v.get_unchecked(7..11)); + /// } + /// ``` + #[stable(feature = "str_checked_slicing", since = "1.20.0")] + #[rustc_const_unstable(feature = "const_slice_index", issue = "none")] + #[inline] + pub const unsafe fn get_unchecked>(&self, i: I) -> &I::Output { + // SAFETY: the caller must uphold the safety contract for `get_unchecked`; + // the slice is dereferenceable because `self` is a safe reference. + // The returned pointer is safe because impls of `SliceIndex` have to guarantee that it is. + unsafe { &*i.get_unchecked(self) } + } + + /// Returns a mutable, unchecked subslice of `str`. + /// + /// This is the unchecked alternative to indexing the `str`. + /// + /// # Safety + /// + /// Callers of this function are responsible that these preconditions are + /// satisfied: + /// + /// * The starting index must not exceed the ending index; + /// * Indexes must be within bounds of the original slice; + /// * Indexes must lie on UTF-8 sequence boundaries. + /// + /// Failing that, the returned string slice may reference invalid memory or + /// violate the invariants communicated by the `str` type. + /// + /// # Examples + /// + /// ``` + /// let mut v = String::from("🗻∈🌏"); + /// unsafe { + /// assert_eq!("🗻", v.get_unchecked_mut(0..4)); + /// assert_eq!("∈", v.get_unchecked_mut(4..7)); + /// assert_eq!("🌏", v.get_unchecked_mut(7..11)); + /// } + /// ``` + #[stable(feature = "str_checked_slicing", since = "1.20.0")] + #[rustc_const_unstable(feature = "const_slice_index", issue = "none")] + #[inline] + pub const unsafe fn get_unchecked_mut>( + &mut self, + i: I, + ) -> &mut I::Output { + // SAFETY: the caller must uphold the safety contract for `get_unchecked_mut`; + // the slice is dereferenceable because `self` is a safe reference. + // The returned pointer is safe because impls of `SliceIndex` have to guarantee that it is. + unsafe { &mut *i.get_unchecked_mut(self) } + } + + /// Creates a string slice from another string slice, bypassing safety + /// checks. + /// + /// This is generally not recommended, use with caution! For a safe + /// alternative see [`str`] and [`Index`]. + /// + /// [`Index`]: crate::ops::Index + /// + /// This new slice goes from `begin` to `end`, including `begin` but + /// excluding `end`. + /// + /// To get a mutable string slice instead, see the + /// [`slice_mut_unchecked`] method. + /// + /// [`slice_mut_unchecked`]: str::slice_mut_unchecked + /// + /// # Safety + /// + /// Callers of this function are responsible that three preconditions are + /// satisfied: + /// + /// * `begin` must not exceed `end`. + /// * `begin` and `end` must be byte positions within the string slice. + /// * `begin` and `end` must lie on UTF-8 sequence boundaries. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = "Löwe 老虎 Léopard"; + /// + /// unsafe { + /// assert_eq!("Löwe 老虎 Léopard", s.slice_unchecked(0, 21)); + /// } + /// + /// let s = "Hello, world!"; + /// + /// unsafe { + /// assert_eq!("world", s.slice_unchecked(7, 12)); + /// } + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[deprecated(since = "1.29.0", note = "use `get_unchecked(begin..end)` instead")] + #[must_use] + #[inline] + pub unsafe fn slice_unchecked(&self, begin: usize, end: usize) -> &str { + // SAFETY: the caller must uphold the safety contract for `get_unchecked`; + // the slice is dereferenceable because `self` is a safe reference. + // The returned pointer is safe because impls of `SliceIndex` have to guarantee that it is. + unsafe { &*(begin..end).get_unchecked(self) } + } + + /// Creates a string slice from another string slice, bypassing safety + /// checks. + /// This is generally not recommended, use with caution! For a safe + /// alternative see [`str`] and [`IndexMut`]. + /// + /// [`IndexMut`]: crate::ops::IndexMut + /// + /// This new slice goes from `begin` to `end`, including `begin` but + /// excluding `end`. + /// + /// To get an immutable string slice instead, see the + /// [`slice_unchecked`] method. + /// + /// [`slice_unchecked`]: str::slice_unchecked + /// + /// # Safety + /// + /// Callers of this function are responsible that three preconditions are + /// satisfied: + /// + /// * `begin` must not exceed `end`. + /// * `begin` and `end` must be byte positions within the string slice. + /// * `begin` and `end` must lie on UTF-8 sequence boundaries. + #[stable(feature = "str_slice_mut", since = "1.5.0")] + #[deprecated(since = "1.29.0", note = "use `get_unchecked_mut(begin..end)` instead")] + #[inline] + pub unsafe fn slice_mut_unchecked(&mut self, begin: usize, end: usize) -> &mut str { + // SAFETY: the caller must uphold the safety contract for `get_unchecked_mut`; + // the slice is dereferenceable because `self` is a safe reference. + // The returned pointer is safe because impls of `SliceIndex` have to guarantee that it is. + unsafe { &mut *(begin..end).get_unchecked_mut(self) } + } + + /// Divide one string slice into two at an index. + /// + /// The argument, `mid`, should be a byte offset from the start of the + /// string. It must also be on the boundary of a UTF-8 code point. + /// + /// The two slices returned go from the start of the string slice to `mid`, + /// and from `mid` to the end of the string slice. + /// + /// To get mutable string slices instead, see the [`split_at_mut`] + /// method. + /// + /// [`split_at_mut`]: str::split_at_mut + /// + /// # Panics + /// + /// Panics if `mid` is not on a UTF-8 code point boundary, or if it is + /// past the end of the last code point of the string slice. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = "Per Martin-Löf"; + /// + /// let (first, last) = s.split_at(3); + /// + /// assert_eq!("Per", first); + /// assert_eq!(" Martin-Löf", last); + /// ``` + #[inline] + #[must_use] + #[stable(feature = "str_split_at", since = "1.4.0")] + pub fn split_at(&self, mid: usize) -> (&str, &str) { + // is_char_boundary checks that the index is in [0, .len()] + if self.is_char_boundary(mid) { + // SAFETY: just checked that `mid` is on a char boundary. + unsafe { (self.get_unchecked(0..mid), self.get_unchecked(mid..self.len())) } + } else { + slice_error_fail(self, 0, mid) + } + } + + /// Divide one mutable string slice into two at an index. + /// + /// The argument, `mid`, should be a byte offset from the start of the + /// string. It must also be on the boundary of a UTF-8 code point. + /// + /// The two slices returned go from the start of the string slice to `mid`, + /// and from `mid` to the end of the string slice. + /// + /// To get immutable string slices instead, see the [`split_at`] method. + /// + /// [`split_at`]: str::split_at + /// + /// # Panics + /// + /// Panics if `mid` is not on a UTF-8 code point boundary, or if it is + /// past the end of the last code point of the string slice. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = "Per Martin-Löf".to_string(); + /// { + /// let (first, last) = s.split_at_mut(3); + /// first.make_ascii_uppercase(); + /// assert_eq!("PER", first); + /// assert_eq!(" Martin-Löf", last); + /// } + /// assert_eq!("PER Martin-Löf", s); + /// ``` + #[inline] + #[must_use] + #[stable(feature = "str_split_at", since = "1.4.0")] + pub fn split_at_mut(&mut self, mid: usize) -> (&mut str, &mut str) { + // is_char_boundary checks that the index is in [0, .len()] + if self.is_char_boundary(mid) { + let len = self.len(); + let ptr = self.as_mut_ptr(); + // SAFETY: just checked that `mid` is on a char boundary. + unsafe { + ( + from_utf8_unchecked_mut(slice::from_raw_parts_mut(ptr, mid)), + from_utf8_unchecked_mut(slice::from_raw_parts_mut(ptr.add(mid), len - mid)), + ) + } + } else { + slice_error_fail(self, 0, mid) + } + } + + /// Returns an iterator over the [`char`]s of a string slice. + /// + /// As a string slice consists of valid UTF-8, we can iterate through a + /// string slice by [`char`]. This method returns such an iterator. + /// + /// It's important to remember that [`char`] represents a Unicode Scalar + /// Value, and might not match your idea of what a 'character' is. Iteration + /// over grapheme clusters may be what you actually want. This functionality + /// is not provided by Rust's standard library, check crates.io instead. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let word = "goodbye"; + /// + /// let count = word.chars().count(); + /// assert_eq!(7, count); + /// + /// let mut chars = word.chars(); + /// + /// assert_eq!(Some('g'), chars.next()); + /// assert_eq!(Some('o'), chars.next()); + /// assert_eq!(Some('o'), chars.next()); + /// assert_eq!(Some('d'), chars.next()); + /// assert_eq!(Some('b'), chars.next()); + /// assert_eq!(Some('y'), chars.next()); + /// assert_eq!(Some('e'), chars.next()); + /// + /// assert_eq!(None, chars.next()); + /// ``` + /// + /// Remember, [`char`]s might not match your intuition about characters: + /// + /// [`char`]: prim@char + /// + /// ``` + /// let y = "y̆"; + /// + /// let mut chars = y.chars(); + /// + /// assert_eq!(Some('y'), chars.next()); // not 'y̆' + /// assert_eq!(Some('\u{0306}'), chars.next()); + /// + /// assert_eq!(None, chars.next()); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn chars(&self) -> Chars<'_> { + Chars { iter: self.as_bytes().iter() } + } + + /// Returns an iterator over the [`char`]s of a string slice, and their + /// positions. + /// + /// As a string slice consists of valid UTF-8, we can iterate through a + /// string slice by [`char`]. This method returns an iterator of both + /// these [`char`]s, as well as their byte positions. + /// + /// The iterator yields tuples. The position is first, the [`char`] is + /// second. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let word = "goodbye"; + /// + /// let count = word.char_indices().count(); + /// assert_eq!(7, count); + /// + /// let mut char_indices = word.char_indices(); + /// + /// assert_eq!(Some((0, 'g')), char_indices.next()); + /// assert_eq!(Some((1, 'o')), char_indices.next()); + /// assert_eq!(Some((2, 'o')), char_indices.next()); + /// assert_eq!(Some((3, 'd')), char_indices.next()); + /// assert_eq!(Some((4, 'b')), char_indices.next()); + /// assert_eq!(Some((5, 'y')), char_indices.next()); + /// assert_eq!(Some((6, 'e')), char_indices.next()); + /// + /// assert_eq!(None, char_indices.next()); + /// ``` + /// + /// Remember, [`char`]s might not match your intuition about characters: + /// + /// [`char`]: prim@char + /// + /// ``` + /// let yes = "y̆es"; + /// + /// let mut char_indices = yes.char_indices(); + /// + /// assert_eq!(Some((0, 'y')), char_indices.next()); // not (0, 'y̆') + /// assert_eq!(Some((1, '\u{0306}')), char_indices.next()); + /// + /// // note the 3 here - the last character took up two bytes + /// assert_eq!(Some((3, 'e')), char_indices.next()); + /// assert_eq!(Some((4, 's')), char_indices.next()); + /// + /// assert_eq!(None, char_indices.next()); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn char_indices(&self) -> CharIndices<'_> { + CharIndices { front_offset: 0, iter: self.chars() } + } + + /// An iterator over the bytes of a string slice. + /// + /// As a string slice consists of a sequence of bytes, we can iterate + /// through a string slice by byte. This method returns such an iterator. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut bytes = "bors".bytes(); + /// + /// assert_eq!(Some(b'b'), bytes.next()); + /// assert_eq!(Some(b'o'), bytes.next()); + /// assert_eq!(Some(b'r'), bytes.next()); + /// assert_eq!(Some(b's'), bytes.next()); + /// + /// assert_eq!(None, bytes.next()); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn bytes(&self) -> Bytes<'_> { + Bytes(self.as_bytes().iter().copied()) + } + + /// Splits a string slice by whitespace. + /// + /// The iterator returned will return string slices that are sub-slices of + /// the original string slice, separated by any amount of whitespace. + /// + /// 'Whitespace' is defined according to the terms of the Unicode Derived + /// Core Property `White_Space`. If you only want to split on ASCII whitespace + /// instead, use [`split_ascii_whitespace`]. + /// + /// [`split_ascii_whitespace`]: str::split_ascii_whitespace + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut iter = "A few words".split_whitespace(); + /// + /// assert_eq!(Some("A"), iter.next()); + /// assert_eq!(Some("few"), iter.next()); + /// assert_eq!(Some("words"), iter.next()); + /// + /// assert_eq!(None, iter.next()); + /// ``` + /// + /// All kinds of whitespace are considered: + /// + /// ``` + /// let mut iter = " Mary had\ta\u{2009}little \n\t lamb".split_whitespace(); + /// assert_eq!(Some("Mary"), iter.next()); + /// assert_eq!(Some("had"), iter.next()); + /// assert_eq!(Some("a"), iter.next()); + /// assert_eq!(Some("little"), iter.next()); + /// assert_eq!(Some("lamb"), iter.next()); + /// + /// assert_eq!(None, iter.next()); + /// ``` + #[must_use = "this returns the split string as an iterator, \ + without modifying the original"] + #[stable(feature = "split_whitespace", since = "1.1.0")] + #[cfg_attr(not(test), rustc_diagnostic_item = "str_split_whitespace")] + #[inline] + pub fn split_whitespace(&self) -> SplitWhitespace<'_> { + SplitWhitespace { inner: self.split(IsWhitespace).filter(IsNotEmpty) } + } + + /// Splits a string slice by ASCII whitespace. + /// + /// The iterator returned will return string slices that are sub-slices of + /// the original string slice, separated by any amount of ASCII whitespace. + /// + /// To split by Unicode `Whitespace` instead, use [`split_whitespace`]. + /// + /// [`split_whitespace`]: str::split_whitespace + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut iter = "A few words".split_ascii_whitespace(); + /// + /// assert_eq!(Some("A"), iter.next()); + /// assert_eq!(Some("few"), iter.next()); + /// assert_eq!(Some("words"), iter.next()); + /// + /// assert_eq!(None, iter.next()); + /// ``` + /// + /// All kinds of ASCII whitespace are considered: + /// + /// ``` + /// let mut iter = " Mary had\ta little \n\t lamb".split_ascii_whitespace(); + /// assert_eq!(Some("Mary"), iter.next()); + /// assert_eq!(Some("had"), iter.next()); + /// assert_eq!(Some("a"), iter.next()); + /// assert_eq!(Some("little"), iter.next()); + /// assert_eq!(Some("lamb"), iter.next()); + /// + /// assert_eq!(None, iter.next()); + /// ``` + #[must_use = "this returns the split string as an iterator, \ + without modifying the original"] + #[stable(feature = "split_ascii_whitespace", since = "1.34.0")] + #[inline] + pub fn split_ascii_whitespace(&self) -> SplitAsciiWhitespace<'_> { + let inner = + self.as_bytes().split(IsAsciiWhitespace).filter(BytesIsNotEmpty).map(UnsafeBytesToStr); + SplitAsciiWhitespace { inner } + } + + /// An iterator over the lines of a string, as string slices. + /// + /// Lines are ended with either a newline (`\n`) or a carriage return with + /// a line feed (`\r\n`). + /// + /// The final line ending is optional. A string that ends with a final line + /// ending will return the same lines as an otherwise identical string + /// without a final line ending. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let text = "foo\r\nbar\n\nbaz\n"; + /// let mut lines = text.lines(); + /// + /// assert_eq!(Some("foo"), lines.next()); + /// assert_eq!(Some("bar"), lines.next()); + /// assert_eq!(Some(""), lines.next()); + /// assert_eq!(Some("baz"), lines.next()); + /// + /// assert_eq!(None, lines.next()); + /// ``` + /// + /// The final line ending isn't required: + /// + /// ``` + /// let text = "foo\nbar\n\r\nbaz"; + /// let mut lines = text.lines(); + /// + /// assert_eq!(Some("foo"), lines.next()); + /// assert_eq!(Some("bar"), lines.next()); + /// assert_eq!(Some(""), lines.next()); + /// assert_eq!(Some("baz"), lines.next()); + /// + /// assert_eq!(None, lines.next()); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn lines(&self) -> Lines<'_> { + Lines(self.split_terminator('\n').map(LinesAnyMap)) + } + + /// An iterator over the lines of a string. + #[stable(feature = "rust1", since = "1.0.0")] + #[deprecated(since = "1.4.0", note = "use lines() instead now")] + #[inline] + #[allow(deprecated)] + pub fn lines_any(&self) -> LinesAny<'_> { + LinesAny(self.lines()) + } + + /// Returns an iterator of `u16` over the string encoded as UTF-16. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let text = "Zażółć gęślą jaźń"; + /// + /// let utf8_len = text.len(); + /// let utf16_len = text.encode_utf16().count(); + /// + /// assert!(utf16_len <= utf8_len); + /// ``` + #[must_use = "this returns the encoded string as an iterator, \ + without modifying the original"] + #[stable(feature = "encode_utf16", since = "1.8.0")] + pub fn encode_utf16(&self) -> EncodeUtf16<'_> { + EncodeUtf16 { chars: self.chars(), extra: 0 } + } + + /// Returns `true` if the given pattern matches a sub-slice of + /// this string slice. + /// + /// Returns `false` if it does not. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let bananas = "bananas"; + /// + /// assert!(bananas.contains("nana")); + /// assert!(!bananas.contains("apples")); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn contains<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { + pat.is_contained_in(self) + } + + /// Returns `true` if the given pattern matches a prefix of this + /// string slice. + /// + /// Returns `false` if it does not. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let bananas = "bananas"; + /// + /// assert!(bananas.starts_with("bana")); + /// assert!(!bananas.starts_with("nana")); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + pub fn starts_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { + pat.is_prefix_of(self) + } + + /// Returns `true` if the given pattern matches a suffix of this + /// string slice. + /// + /// Returns `false` if it does not. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let bananas = "bananas"; + /// + /// assert!(bananas.ends_with("anas")); + /// assert!(!bananas.ends_with("nana")); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + pub fn ends_with<'a, P>(&'a self, pat: P) -> bool + where + P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + { + pat.is_suffix_of(self) + } + + /// Returns the byte index of the first character of this string slice that + /// matches the pattern. + /// + /// Returns [`None`] if the pattern doesn't match. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Examples + /// + /// Simple patterns: + /// + /// ``` + /// let s = "Löwe 老虎 Léopard Gepardi"; + /// + /// assert_eq!(s.find('L'), Some(0)); + /// assert_eq!(s.find('é'), Some(14)); + /// assert_eq!(s.find("pard"), Some(17)); + /// ``` + /// + /// More complex patterns using point-free style and closures: + /// + /// ``` + /// let s = "Löwe 老虎 Léopard"; + /// + /// assert_eq!(s.find(char::is_whitespace), Some(5)); + /// assert_eq!(s.find(char::is_lowercase), Some(1)); + /// assert_eq!(s.find(|c: char| c.is_whitespace() || c.is_lowercase()), Some(1)); + /// assert_eq!(s.find(|c: char| (c < 'o') && (c > 'a')), Some(4)); + /// ``` + /// + /// Not finding the pattern: + /// + /// ``` + /// let s = "Löwe 老虎 Léopard"; + /// let x: &[_] = &['1', '2']; + /// + /// assert_eq!(s.find(x), None); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn find<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option { + pat.into_searcher(self).next_match().map(|(i, _)| i) + } + + /// Returns the byte index for the first character of the last match of the pattern in + /// this string slice. + /// + /// Returns [`None`] if the pattern doesn't match. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Examples + /// + /// Simple patterns: + /// + /// ``` + /// let s = "Löwe 老虎 Léopard Gepardi"; + /// + /// assert_eq!(s.rfind('L'), Some(13)); + /// assert_eq!(s.rfind('é'), Some(14)); + /// assert_eq!(s.rfind("pard"), Some(24)); + /// ``` + /// + /// More complex patterns with closures: + /// + /// ``` + /// let s = "Löwe 老虎 Léopard"; + /// + /// assert_eq!(s.rfind(char::is_whitespace), Some(12)); + /// assert_eq!(s.rfind(char::is_lowercase), Some(20)); + /// ``` + /// + /// Not finding the pattern: + /// + /// ``` + /// let s = "Löwe 老虎 Léopard"; + /// let x: &[_] = &['1', '2']; + /// + /// assert_eq!(s.rfind(x), None); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn rfind<'a, P>(&'a self, pat: P) -> Option + where + P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + { + pat.into_searcher(self).next_match_back().map(|(i, _)| i) + } + + /// An iterator over substrings of this string slice, separated by + /// characters matched by a pattern. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Iterator behavior + /// + /// The returned iterator will be a [`DoubleEndedIterator`] if the pattern + /// allows a reverse search and forward/reverse search yields the same + /// elements. This is true for, e.g., [`char`], but not for `&str`. + /// + /// If the pattern allows a reverse search but its results might differ + /// from a forward search, the [`rsplit`] method can be used. + /// + /// [`rsplit`]: str::rsplit + /// + /// # Examples + /// + /// Simple patterns: + /// + /// ``` + /// let v: Vec<&str> = "Mary had a little lamb".split(' ').collect(); + /// assert_eq!(v, ["Mary", "had", "a", "little", "lamb"]); + /// + /// let v: Vec<&str> = "".split('X').collect(); + /// assert_eq!(v, [""]); + /// + /// let v: Vec<&str> = "lionXXtigerXleopard".split('X').collect(); + /// assert_eq!(v, ["lion", "", "tiger", "leopard"]); + /// + /// let v: Vec<&str> = "lion::tiger::leopard".split("::").collect(); + /// assert_eq!(v, ["lion", "tiger", "leopard"]); + /// + /// let v: Vec<&str> = "abc1def2ghi".split(char::is_numeric).collect(); + /// assert_eq!(v, ["abc", "def", "ghi"]); + /// + /// let v: Vec<&str> = "lionXtigerXleopard".split(char::is_uppercase).collect(); + /// assert_eq!(v, ["lion", "tiger", "leopard"]); + /// ``` + /// + /// If the pattern is a slice of chars, split on each occurrence of any of the characters: + /// + /// ``` + /// let v: Vec<&str> = "2020-11-03 23:59".split(&['-', ' ', ':', '@'][..]).collect(); + /// assert_eq!(v, ["2020", "11", "03", "23", "59"]); + /// ``` + /// + /// A more complex pattern, using a closure: + /// + /// ``` + /// let v: Vec<&str> = "abc1defXghi".split(|c| c == '1' || c == 'X').collect(); + /// assert_eq!(v, ["abc", "def", "ghi"]); + /// ``` + /// + /// If a string contains multiple contiguous separators, you will end up + /// with empty strings in the output: + /// + /// ``` + /// let x = "||||a||b|c".to_string(); + /// let d: Vec<_> = x.split('|').collect(); + /// + /// assert_eq!(d, &["", "", "", "", "a", "", "b", "c"]); + /// ``` + /// + /// Contiguous separators are separated by the empty string. + /// + /// ``` + /// let x = "(///)".to_string(); + /// let d: Vec<_> = x.split('/').collect(); + /// + /// assert_eq!(d, &["(", "", "", ")"]); + /// ``` + /// + /// Separators at the start or end of a string are neighbored + /// by empty strings. + /// + /// ``` + /// let d: Vec<_> = "010".split("0").collect(); + /// assert_eq!(d, &["", "1", ""]); + /// ``` + /// + /// When the empty string is used as a separator, it separates + /// every character in the string, along with the beginning + /// and end of the string. + /// + /// ``` + /// let f: Vec<_> = "rust".split("").collect(); + /// assert_eq!(f, &["", "r", "u", "s", "t", ""]); + /// ``` + /// + /// Contiguous separators can lead to possibly surprising behavior + /// when whitespace is used as the separator. This code is correct: + /// + /// ``` + /// let x = " a b c".to_string(); + /// let d: Vec<_> = x.split(' ').collect(); + /// + /// assert_eq!(d, &["", "", "", "", "a", "", "b", "c"]); + /// ``` + /// + /// It does _not_ give you: + /// + /// ```,ignore + /// assert_eq!(d, &["a", "b", "c"]); + /// ``` + /// + /// Use [`split_whitespace`] for this behavior. + /// + /// [`split_whitespace`]: str::split_whitespace + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn split<'a, P: Pattern<'a>>(&'a self, pat: P) -> Split<'a, P> { + Split(SplitInternal { + start: 0, + end: self.len(), + matcher: pat.into_searcher(self), + allow_trailing_empty: true, + finished: false, + }) + } + + /// An iterator over substrings of this string slice, separated by + /// characters matched by a pattern. Differs from the iterator produced by + /// `split` in that `split_inclusive` leaves the matched part as the + /// terminator of the substring. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Examples + /// + /// ``` + /// let v: Vec<&str> = "Mary had a little lamb\nlittle lamb\nlittle lamb." + /// .split_inclusive('\n').collect(); + /// assert_eq!(v, ["Mary had a little lamb\n", "little lamb\n", "little lamb."]); + /// ``` + /// + /// If the last element of the string is matched, + /// that element will be considered the terminator of the preceding substring. + /// That substring will be the last item returned by the iterator. + /// + /// ``` + /// let v: Vec<&str> = "Mary had a little lamb\nlittle lamb\nlittle lamb.\n" + /// .split_inclusive('\n').collect(); + /// assert_eq!(v, ["Mary had a little lamb\n", "little lamb\n", "little lamb.\n"]); + /// ``` + #[stable(feature = "split_inclusive", since = "1.51.0")] + #[inline] + pub fn split_inclusive<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitInclusive<'a, P> { + SplitInclusive(SplitInternal { + start: 0, + end: self.len(), + matcher: pat.into_searcher(self), + allow_trailing_empty: false, + finished: false, + }) + } + + /// An iterator over substrings of the given string slice, separated by + /// characters matched by a pattern and yielded in reverse order. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Iterator behavior + /// + /// The returned iterator requires that the pattern supports a reverse + /// search, and it will be a [`DoubleEndedIterator`] if a forward/reverse + /// search yields the same elements. + /// + /// For iterating from the front, the [`split`] method can be used. + /// + /// [`split`]: str::split + /// + /// # Examples + /// + /// Simple patterns: + /// + /// ``` + /// let v: Vec<&str> = "Mary had a little lamb".rsplit(' ').collect(); + /// assert_eq!(v, ["lamb", "little", "a", "had", "Mary"]); + /// + /// let v: Vec<&str> = "".rsplit('X').collect(); + /// assert_eq!(v, [""]); + /// + /// let v: Vec<&str> = "lionXXtigerXleopard".rsplit('X').collect(); + /// assert_eq!(v, ["leopard", "tiger", "", "lion"]); + /// + /// let v: Vec<&str> = "lion::tiger::leopard".rsplit("::").collect(); + /// assert_eq!(v, ["leopard", "tiger", "lion"]); + /// ``` + /// + /// A more complex pattern, using a closure: + /// + /// ``` + /// let v: Vec<&str> = "abc1defXghi".rsplit(|c| c == '1' || c == 'X').collect(); + /// assert_eq!(v, ["ghi", "def", "abc"]); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn rsplit<'a, P>(&'a self, pat: P) -> RSplit<'a, P> + where + P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + { + RSplit(self.split(pat).0) + } + + /// An iterator over substrings of the given string slice, separated by + /// characters matched by a pattern. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// Equivalent to [`split`], except that the trailing substring + /// is skipped if empty. + /// + /// [`split`]: str::split + /// + /// This method can be used for string data that is _terminated_, + /// rather than _separated_ by a pattern. + /// + /// # Iterator behavior + /// + /// The returned iterator will be a [`DoubleEndedIterator`] if the pattern + /// allows a reverse search and forward/reverse search yields the same + /// elements. This is true for, e.g., [`char`], but not for `&str`. + /// + /// If the pattern allows a reverse search but its results might differ + /// from a forward search, the [`rsplit_terminator`] method can be used. + /// + /// [`rsplit_terminator`]: str::rsplit_terminator + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let v: Vec<&str> = "A.B.".split_terminator('.').collect(); + /// assert_eq!(v, ["A", "B"]); + /// + /// let v: Vec<&str> = "A..B..".split_terminator(".").collect(); + /// assert_eq!(v, ["A", "", "B", ""]); + /// + /// let v: Vec<&str> = "A.B:C.D".split_terminator(&['.', ':'][..]).collect(); + /// assert_eq!(v, ["A", "B", "C", "D"]); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn split_terminator<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitTerminator<'a, P> { + SplitTerminator(SplitInternal { allow_trailing_empty: false, ..self.split(pat).0 }) + } + + /// An iterator over substrings of `self`, separated by characters + /// matched by a pattern and yielded in reverse order. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// Equivalent to [`split`], except that the trailing substring is + /// skipped if empty. + /// + /// [`split`]: str::split + /// + /// This method can be used for string data that is _terminated_, + /// rather than _separated_ by a pattern. + /// + /// # Iterator behavior + /// + /// The returned iterator requires that the pattern supports a + /// reverse search, and it will be double ended if a forward/reverse + /// search yields the same elements. + /// + /// For iterating from the front, the [`split_terminator`] method can be + /// used. + /// + /// [`split_terminator`]: str::split_terminator + /// + /// # Examples + /// + /// ``` + /// let v: Vec<&str> = "A.B.".rsplit_terminator('.').collect(); + /// assert_eq!(v, ["B", "A"]); + /// + /// let v: Vec<&str> = "A..B..".rsplit_terminator(".").collect(); + /// assert_eq!(v, ["", "B", "", "A"]); + /// + /// let v: Vec<&str> = "A.B:C.D".rsplit_terminator(&['.', ':'][..]).collect(); + /// assert_eq!(v, ["D", "C", "B", "A"]); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn rsplit_terminator<'a, P>(&'a self, pat: P) -> RSplitTerminator<'a, P> + where + P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + { + RSplitTerminator(self.split_terminator(pat).0) + } + + /// An iterator over substrings of the given string slice, separated by a + /// pattern, restricted to returning at most `n` items. + /// + /// If `n` substrings are returned, the last substring (the `n`th substring) + /// will contain the remainder of the string. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Iterator behavior + /// + /// The returned iterator will not be double ended, because it is + /// not efficient to support. + /// + /// If the pattern allows a reverse search, the [`rsplitn`] method can be + /// used. + /// + /// [`rsplitn`]: str::rsplitn + /// + /// # Examples + /// + /// Simple patterns: + /// + /// ``` + /// let v: Vec<&str> = "Mary had a little lambda".splitn(3, ' ').collect(); + /// assert_eq!(v, ["Mary", "had", "a little lambda"]); + /// + /// let v: Vec<&str> = "lionXXtigerXleopard".splitn(3, "X").collect(); + /// assert_eq!(v, ["lion", "", "tigerXleopard"]); + /// + /// let v: Vec<&str> = "abcXdef".splitn(1, 'X').collect(); + /// assert_eq!(v, ["abcXdef"]); + /// + /// let v: Vec<&str> = "".splitn(1, 'X').collect(); + /// assert_eq!(v, [""]); + /// ``` + /// + /// A more complex pattern, using a closure: + /// + /// ``` + /// let v: Vec<&str> = "abc1defXghi".splitn(2, |c| c == '1' || c == 'X').collect(); + /// assert_eq!(v, ["abc", "defXghi"]); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn splitn<'a, P: Pattern<'a>>(&'a self, n: usize, pat: P) -> SplitN<'a, P> { + SplitN(SplitNInternal { iter: self.split(pat).0, count: n }) + } + + /// An iterator over substrings of this string slice, separated by a + /// pattern, starting from the end of the string, restricted to returning + /// at most `n` items. + /// + /// If `n` substrings are returned, the last substring (the `n`th substring) + /// will contain the remainder of the string. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Iterator behavior + /// + /// The returned iterator will not be double ended, because it is not + /// efficient to support. + /// + /// For splitting from the front, the [`splitn`] method can be used. + /// + /// [`splitn`]: str::splitn + /// + /// # Examples + /// + /// Simple patterns: + /// + /// ``` + /// let v: Vec<&str> = "Mary had a little lamb".rsplitn(3, ' ').collect(); + /// assert_eq!(v, ["lamb", "little", "Mary had a"]); + /// + /// let v: Vec<&str> = "lionXXtigerXleopard".rsplitn(3, 'X').collect(); + /// assert_eq!(v, ["leopard", "tiger", "lionX"]); + /// + /// let v: Vec<&str> = "lion::tiger::leopard".rsplitn(2, "::").collect(); + /// assert_eq!(v, ["leopard", "lion::tiger"]); + /// ``` + /// + /// A more complex pattern, using a closure: + /// + /// ``` + /// let v: Vec<&str> = "abc1defXghi".rsplitn(2, |c| c == '1' || c == 'X').collect(); + /// assert_eq!(v, ["ghi", "abc1def"]); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn rsplitn<'a, P>(&'a self, n: usize, pat: P) -> RSplitN<'a, P> + where + P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + { + RSplitN(self.splitn(n, pat).0) + } + + /// Splits the string on the first occurrence of the specified delimiter and + /// returns prefix before delimiter and suffix after delimiter. + /// + /// # Examples + /// + /// ``` + /// assert_eq!("cfg".split_once('='), None); + /// assert_eq!("cfg=".split_once('='), Some(("cfg", ""))); + /// assert_eq!("cfg=foo".split_once('='), Some(("cfg", "foo"))); + /// assert_eq!("cfg=foo=bar".split_once('='), Some(("cfg", "foo=bar"))); + /// ``` + #[stable(feature = "str_split_once", since = "1.52.0")] + #[inline] + pub fn split_once<'a, P: Pattern<'a>>(&'a self, delimiter: P) -> Option<(&'a str, &'a str)> { + let (start, end) = delimiter.into_searcher(self).next_match()?; + // SAFETY: `Searcher` is known to return valid indices. + unsafe { Some((self.get_unchecked(..start), self.get_unchecked(end..))) } + } + + /// Splits the string on the last occurrence of the specified delimiter and + /// returns prefix before delimiter and suffix after delimiter. + /// + /// # Examples + /// + /// ``` + /// assert_eq!("cfg".rsplit_once('='), None); + /// assert_eq!("cfg=foo".rsplit_once('='), Some(("cfg", "foo"))); + /// assert_eq!("cfg=foo=bar".rsplit_once('='), Some(("cfg=foo", "bar"))); + /// ``` + #[stable(feature = "str_split_once", since = "1.52.0")] + #[inline] + pub fn rsplit_once<'a, P>(&'a self, delimiter: P) -> Option<(&'a str, &'a str)> + where + P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + { + let (start, end) = delimiter.into_searcher(self).next_match_back()?; + // SAFETY: `Searcher` is known to return valid indices. + unsafe { Some((self.get_unchecked(..start), self.get_unchecked(end..))) } + } + + /// An iterator over the disjoint matches of a pattern within the given string + /// slice. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Iterator behavior + /// + /// The returned iterator will be a [`DoubleEndedIterator`] if the pattern + /// allows a reverse search and forward/reverse search yields the same + /// elements. This is true for, e.g., [`char`], but not for `&str`. + /// + /// If the pattern allows a reverse search but its results might differ + /// from a forward search, the [`rmatches`] method can be used. + /// + /// [`rmatches`]: str::matches + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let v: Vec<&str> = "abcXXXabcYYYabc".matches("abc").collect(); + /// assert_eq!(v, ["abc", "abc", "abc"]); + /// + /// let v: Vec<&str> = "1abc2abc3".matches(char::is_numeric).collect(); + /// assert_eq!(v, ["1", "2", "3"]); + /// ``` + #[stable(feature = "str_matches", since = "1.2.0")] + #[inline] + pub fn matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> Matches<'a, P> { + Matches(MatchesInternal(pat.into_searcher(self))) + } + + /// An iterator over the disjoint matches of a pattern within this string slice, + /// yielded in reverse order. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Iterator behavior + /// + /// The returned iterator requires that the pattern supports a reverse + /// search, and it will be a [`DoubleEndedIterator`] if a forward/reverse + /// search yields the same elements. + /// + /// For iterating from the front, the [`matches`] method can be used. + /// + /// [`matches`]: str::matches + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let v: Vec<&str> = "abcXXXabcYYYabc".rmatches("abc").collect(); + /// assert_eq!(v, ["abc", "abc", "abc"]); + /// + /// let v: Vec<&str> = "1abc2abc3".rmatches(char::is_numeric).collect(); + /// assert_eq!(v, ["3", "2", "1"]); + /// ``` + #[stable(feature = "str_matches", since = "1.2.0")] + #[inline] + pub fn rmatches<'a, P>(&'a self, pat: P) -> RMatches<'a, P> + where + P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + { + RMatches(self.matches(pat).0) + } + + /// An iterator over the disjoint matches of a pattern within this string + /// slice as well as the index that the match starts at. + /// + /// For matches of `pat` within `self` that overlap, only the indices + /// corresponding to the first match are returned. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Iterator behavior + /// + /// The returned iterator will be a [`DoubleEndedIterator`] if the pattern + /// allows a reverse search and forward/reverse search yields the same + /// elements. This is true for, e.g., [`char`], but not for `&str`. + /// + /// If the pattern allows a reverse search but its results might differ + /// from a forward search, the [`rmatch_indices`] method can be used. + /// + /// [`rmatch_indices`]: str::rmatch_indices + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let v: Vec<_> = "abcXXXabcYYYabc".match_indices("abc").collect(); + /// assert_eq!(v, [(0, "abc"), (6, "abc"), (12, "abc")]); + /// + /// let v: Vec<_> = "1abcabc2".match_indices("abc").collect(); + /// assert_eq!(v, [(1, "abc"), (4, "abc")]); + /// + /// let v: Vec<_> = "ababa".match_indices("aba").collect(); + /// assert_eq!(v, [(0, "aba")]); // only the first `aba` + /// ``` + #[stable(feature = "str_match_indices", since = "1.5.0")] + #[inline] + pub fn match_indices<'a, P: Pattern<'a>>(&'a self, pat: P) -> MatchIndices<'a, P> { + MatchIndices(MatchIndicesInternal(pat.into_searcher(self))) + } + + /// An iterator over the disjoint matches of a pattern within `self`, + /// yielded in reverse order along with the index of the match. + /// + /// For matches of `pat` within `self` that overlap, only the indices + /// corresponding to the last match are returned. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Iterator behavior + /// + /// The returned iterator requires that the pattern supports a reverse + /// search, and it will be a [`DoubleEndedIterator`] if a forward/reverse + /// search yields the same elements. + /// + /// For iterating from the front, the [`match_indices`] method can be used. + /// + /// [`match_indices`]: str::match_indices + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let v: Vec<_> = "abcXXXabcYYYabc".rmatch_indices("abc").collect(); + /// assert_eq!(v, [(12, "abc"), (6, "abc"), (0, "abc")]); + /// + /// let v: Vec<_> = "1abcabc2".rmatch_indices("abc").collect(); + /// assert_eq!(v, [(4, "abc"), (1, "abc")]); + /// + /// let v: Vec<_> = "ababa".rmatch_indices("aba").collect(); + /// assert_eq!(v, [(2, "aba")]); // only the last `aba` + /// ``` + #[stable(feature = "str_match_indices", since = "1.5.0")] + #[inline] + pub fn rmatch_indices<'a, P>(&'a self, pat: P) -> RMatchIndices<'a, P> + where + P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + { + RMatchIndices(self.match_indices(pat).0) + } + + /// Returns a string slice with leading and trailing whitespace removed. + /// + /// 'Whitespace' is defined according to the terms of the Unicode Derived + /// Core Property `White_Space`, which includes newlines. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = "\n Hello\tworld\t\n"; + /// + /// assert_eq!("Hello\tworld", s.trim()); + /// ``` + #[inline] + #[must_use = "this returns the trimmed string as a slice, \ + without modifying the original"] + #[stable(feature = "rust1", since = "1.0.0")] + #[cfg_attr(not(test), rustc_diagnostic_item = "str_trim")] + pub fn trim(&self) -> &str { + self.trim_matches(|c: char| c.is_whitespace()) + } + + /// Returns a string slice with leading whitespace removed. + /// + /// 'Whitespace' is defined according to the terms of the Unicode Derived + /// Core Property `White_Space`, which includes newlines. + /// + /// # Text directionality + /// + /// A string is a sequence of bytes. `start` in this context means the first + /// position of that byte string; for a left-to-right language like English or + /// Russian, this will be left side, and for right-to-left languages like + /// Arabic or Hebrew, this will be the right side. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = "\n Hello\tworld\t\n"; + /// assert_eq!("Hello\tworld\t\n", s.trim_start()); + /// ``` + /// + /// Directionality: + /// + /// ``` + /// let s = " English "; + /// assert!(Some('E') == s.trim_start().chars().next()); + /// + /// let s = " עברית "; + /// assert!(Some('ע') == s.trim_start().chars().next()); + /// ``` + #[inline] + #[must_use = "this returns the trimmed string as a new slice, \ + without modifying the original"] + #[stable(feature = "trim_direction", since = "1.30.0")] + #[cfg_attr(not(test), rustc_diagnostic_item = "str_trim_start")] + pub fn trim_start(&self) -> &str { + self.trim_start_matches(|c: char| c.is_whitespace()) + } + + /// Returns a string slice with trailing whitespace removed. + /// + /// 'Whitespace' is defined according to the terms of the Unicode Derived + /// Core Property `White_Space`, which includes newlines. + /// + /// # Text directionality + /// + /// A string is a sequence of bytes. `end` in this context means the last + /// position of that byte string; for a left-to-right language like English or + /// Russian, this will be right side, and for right-to-left languages like + /// Arabic or Hebrew, this will be the left side. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = "\n Hello\tworld\t\n"; + /// assert_eq!("\n Hello\tworld", s.trim_end()); + /// ``` + /// + /// Directionality: + /// + /// ``` + /// let s = " English "; + /// assert!(Some('h') == s.trim_end().chars().rev().next()); + /// + /// let s = " עברית "; + /// assert!(Some('ת') == s.trim_end().chars().rev().next()); + /// ``` + #[inline] + #[must_use = "this returns the trimmed string as a new slice, \ + without modifying the original"] + #[stable(feature = "trim_direction", since = "1.30.0")] + #[cfg_attr(not(test), rustc_diagnostic_item = "str_trim_end")] + pub fn trim_end(&self) -> &str { + self.trim_end_matches(|c: char| c.is_whitespace()) + } + + /// Returns a string slice with leading whitespace removed. + /// + /// 'Whitespace' is defined according to the terms of the Unicode Derived + /// Core Property `White_Space`. + /// + /// # Text directionality + /// + /// A string is a sequence of bytes. 'Left' in this context means the first + /// position of that byte string; for a language like Arabic or Hebrew + /// which are 'right to left' rather than 'left to right', this will be + /// the _right_ side, not the left. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = " Hello\tworld\t"; + /// + /// assert_eq!("Hello\tworld\t", s.trim_left()); + /// ``` + /// + /// Directionality: + /// + /// ``` + /// let s = " English"; + /// assert!(Some('E') == s.trim_left().chars().next()); + /// + /// let s = " עברית"; + /// assert!(Some('ע') == s.trim_left().chars().next()); + /// ``` + #[must_use = "this returns the trimmed string as a new slice, \ + without modifying the original"] + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + #[deprecated(since = "1.33.0", note = "superseded by `trim_start`", suggestion = "trim_start")] + pub fn trim_left(&self) -> &str { + self.trim_start() + } + + /// Returns a string slice with trailing whitespace removed. + /// + /// 'Whitespace' is defined according to the terms of the Unicode Derived + /// Core Property `White_Space`. + /// + /// # Text directionality + /// + /// A string is a sequence of bytes. 'Right' in this context means the last + /// position of that byte string; for a language like Arabic or Hebrew + /// which are 'right to left' rather than 'left to right', this will be + /// the _left_ side, not the right. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = " Hello\tworld\t"; + /// + /// assert_eq!(" Hello\tworld", s.trim_right()); + /// ``` + /// + /// Directionality: + /// + /// ``` + /// let s = "English "; + /// assert!(Some('h') == s.trim_right().chars().rev().next()); + /// + /// let s = "עברית "; + /// assert!(Some('ת') == s.trim_right().chars().rev().next()); + /// ``` + #[must_use = "this returns the trimmed string as a new slice, \ + without modifying the original"] + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + #[deprecated(since = "1.33.0", note = "superseded by `trim_end`", suggestion = "trim_end")] + pub fn trim_right(&self) -> &str { + self.trim_end() + } + + /// Returns a string slice with all prefixes and suffixes that match a + /// pattern repeatedly removed. + /// + /// The [pattern] can be a [`char`], a slice of [`char`]s, or a function + /// or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Examples + /// + /// Simple patterns: + /// + /// ``` + /// assert_eq!("11foo1bar11".trim_matches('1'), "foo1bar"); + /// assert_eq!("123foo1bar123".trim_matches(char::is_numeric), "foo1bar"); + /// + /// let x: &[_] = &['1', '2']; + /// assert_eq!("12foo1bar12".trim_matches(x), "foo1bar"); + /// ``` + /// + /// A more complex pattern, using a closure: + /// + /// ``` + /// assert_eq!("1foo1barXX".trim_matches(|c| c == '1' || c == 'X'), "foo1bar"); + /// ``` + #[must_use = "this returns the trimmed string as a new slice, \ + without modifying the original"] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn trim_matches<'a, P>(&'a self, pat: P) -> &'a str + where + P: Pattern<'a, Searcher: DoubleEndedSearcher<'a>>, + { + let mut i = 0; + let mut j = 0; + let mut matcher = pat.into_searcher(self); + if let Some((a, b)) = matcher.next_reject() { + i = a; + j = b; // Remember earliest known match, correct it below if + // last match is different + } + if let Some((_, b)) = matcher.next_reject_back() { + j = b; + } + // SAFETY: `Searcher` is known to return valid indices. + unsafe { self.get_unchecked(i..j) } + } + + /// Returns a string slice with all prefixes that match a pattern + /// repeatedly removed. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Text directionality + /// + /// A string is a sequence of bytes. `start` in this context means the first + /// position of that byte string; for a left-to-right language like English or + /// Russian, this will be left side, and for right-to-left languages like + /// Arabic or Hebrew, this will be the right side. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// assert_eq!("11foo1bar11".trim_start_matches('1'), "foo1bar11"); + /// assert_eq!("123foo1bar123".trim_start_matches(char::is_numeric), "foo1bar123"); + /// + /// let x: &[_] = &['1', '2']; + /// assert_eq!("12foo1bar12".trim_start_matches(x), "foo1bar12"); + /// ``` + #[must_use = "this returns the trimmed string as a new slice, \ + without modifying the original"] + #[stable(feature = "trim_direction", since = "1.30.0")] + pub fn trim_start_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str { + let mut i = self.len(); + let mut matcher = pat.into_searcher(self); + if let Some((a, _)) = matcher.next_reject() { + i = a; + } + // SAFETY: `Searcher` is known to return valid indices. + unsafe { self.get_unchecked(i..self.len()) } + } + + /// Returns a string slice with the prefix removed. + /// + /// If the string starts with the pattern `prefix`, returns substring after the prefix, wrapped + /// in `Some`. Unlike `trim_start_matches`, this method removes the prefix exactly once. + /// + /// If the string does not start with `prefix`, returns `None`. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Examples + /// + /// ``` + /// assert_eq!("foo:bar".strip_prefix("foo:"), Some("bar")); + /// assert_eq!("foo:bar".strip_prefix("bar"), None); + /// assert_eq!("foofoo".strip_prefix("foo"), Some("foo")); + /// ``` + #[must_use = "this returns the remaining substring as a new slice, \ + without modifying the original"] + #[stable(feature = "str_strip", since = "1.45.0")] + pub fn strip_prefix<'a, P: Pattern<'a>>(&'a self, prefix: P) -> Option<&'a str> { + prefix.strip_prefix_of(self) + } + + /// Returns a string slice with the suffix removed. + /// + /// If the string ends with the pattern `suffix`, returns the substring before the suffix, + /// wrapped in `Some`. Unlike `trim_end_matches`, this method removes the suffix exactly once. + /// + /// If the string does not end with `suffix`, returns `None`. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Examples + /// + /// ``` + /// assert_eq!("bar:foo".strip_suffix(":foo"), Some("bar")); + /// assert_eq!("bar:foo".strip_suffix("bar"), None); + /// assert_eq!("foofoo".strip_suffix("foo"), Some("foo")); + /// ``` + #[must_use = "this returns the remaining substring as a new slice, \ + without modifying the original"] + #[stable(feature = "str_strip", since = "1.45.0")] + pub fn strip_suffix<'a, P>(&'a self, suffix: P) -> Option<&'a str> + where + P: Pattern<'a>, +

>::Searcher: ReverseSearcher<'a>, + { + suffix.strip_suffix_of(self) + } + + /// Returns a string slice with all suffixes that match a pattern + /// repeatedly removed. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Text directionality + /// + /// A string is a sequence of bytes. `end` in this context means the last + /// position of that byte string; for a left-to-right language like English or + /// Russian, this will be right side, and for right-to-left languages like + /// Arabic or Hebrew, this will be the left side. + /// + /// # Examples + /// + /// Simple patterns: + /// + /// ``` + /// assert_eq!("11foo1bar11".trim_end_matches('1'), "11foo1bar"); + /// assert_eq!("123foo1bar123".trim_end_matches(char::is_numeric), "123foo1bar"); + /// + /// let x: &[_] = &['1', '2']; + /// assert_eq!("12foo1bar12".trim_end_matches(x), "12foo1bar"); + /// ``` + /// + /// A more complex pattern, using a closure: + /// + /// ``` + /// assert_eq!("1fooX".trim_end_matches(|c| c == '1' || c == 'X'), "1foo"); + /// ``` + #[must_use = "this returns the trimmed string as a new slice, \ + without modifying the original"] + #[stable(feature = "trim_direction", since = "1.30.0")] + pub fn trim_end_matches<'a, P>(&'a self, pat: P) -> &'a str + where + P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + { + let mut j = 0; + let mut matcher = pat.into_searcher(self); + if let Some((_, b)) = matcher.next_reject_back() { + j = b; + } + // SAFETY: `Searcher` is known to return valid indices. + unsafe { self.get_unchecked(0..j) } + } + + /// Returns a string slice with all prefixes that match a pattern + /// repeatedly removed. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Text directionality + /// + /// A string is a sequence of bytes. 'Left' in this context means the first + /// position of that byte string; for a language like Arabic or Hebrew + /// which are 'right to left' rather than 'left to right', this will be + /// the _right_ side, not the left. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// assert_eq!("11foo1bar11".trim_left_matches('1'), "foo1bar11"); + /// assert_eq!("123foo1bar123".trim_left_matches(char::is_numeric), "foo1bar123"); + /// + /// let x: &[_] = &['1', '2']; + /// assert_eq!("12foo1bar12".trim_left_matches(x), "foo1bar12"); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[deprecated( + since = "1.33.0", + note = "superseded by `trim_start_matches`", + suggestion = "trim_start_matches" + )] + pub fn trim_left_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str { + self.trim_start_matches(pat) + } + + /// Returns a string slice with all suffixes that match a pattern + /// repeatedly removed. + /// + /// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a + /// function or closure that determines if a character matches. + /// + /// [`char`]: prim@char + /// [pattern]: self::pattern + /// + /// # Text directionality + /// + /// A string is a sequence of bytes. 'Right' in this context means the last + /// position of that byte string; for a language like Arabic or Hebrew + /// which are 'right to left' rather than 'left to right', this will be + /// the _left_ side, not the right. + /// + /// # Examples + /// + /// Simple patterns: + /// + /// ``` + /// assert_eq!("11foo1bar11".trim_right_matches('1'), "11foo1bar"); + /// assert_eq!("123foo1bar123".trim_right_matches(char::is_numeric), "123foo1bar"); + /// + /// let x: &[_] = &['1', '2']; + /// assert_eq!("12foo1bar12".trim_right_matches(x), "12foo1bar"); + /// ``` + /// + /// A more complex pattern, using a closure: + /// + /// ``` + /// assert_eq!("1fooX".trim_right_matches(|c| c == '1' || c == 'X'), "1foo"); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[deprecated( + since = "1.33.0", + note = "superseded by `trim_end_matches`", + suggestion = "trim_end_matches" + )] + pub fn trim_right_matches<'a, P>(&'a self, pat: P) -> &'a str + where + P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + { + self.trim_end_matches(pat) + } + + /// Parses this string slice into another type. + /// + /// Because `parse` is so general, it can cause problems with type + /// inference. As such, `parse` is one of the few times you'll see + /// the syntax affectionately known as the 'turbofish': `::<>`. This + /// helps the inference algorithm understand specifically which type + /// you're trying to parse into. + /// + /// `parse` can parse into any type that implements the [`FromStr`] trait. + + /// + /// # Errors + /// + /// Will return [`Err`] if it's not possible to parse this string slice into + /// the desired type. + /// + /// [`Err`]: FromStr::Err + /// + /// # Examples + /// + /// Basic usage + /// + /// ``` + /// let four: u32 = "4".parse().unwrap(); + /// + /// assert_eq!(4, four); + /// ``` + /// + /// Using the 'turbofish' instead of annotating `four`: + /// + /// ``` + /// let four = "4".parse::(); + /// + /// assert_eq!(Ok(4), four); + /// ``` + /// + /// Failing to parse: + /// + /// ``` + /// let nope = "j".parse::(); + /// + /// assert!(nope.is_err()); + /// ``` + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn parse(&self) -> Result { + FromStr::from_str(self) + } + + /// Checks if all characters in this string are within the ASCII range. + /// + /// # Examples + /// + /// ``` + /// let ascii = "hello!\n"; + /// let non_ascii = "Grüße, Jürgen ❤"; + /// + /// assert!(ascii.is_ascii()); + /// assert!(!non_ascii.is_ascii()); + /// ``` + #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] + #[must_use] + #[inline] + pub fn is_ascii(&self) -> bool { + // We can treat each byte as character here: all multibyte characters + // start with a byte that is not in the ascii range, so we will stop + // there already. + self.as_bytes().is_ascii() + } + + /// Checks that two strings are an ASCII case-insensitive match. + /// + /// Same as `to_ascii_lowercase(a) == to_ascii_lowercase(b)`, + /// but without allocating and copying temporaries. + /// + /// # Examples + /// + /// ``` + /// assert!("Ferris".eq_ignore_ascii_case("FERRIS")); + /// assert!("Ferrös".eq_ignore_ascii_case("FERRöS")); + /// assert!(!"Ferrös".eq_ignore_ascii_case("FERRÖS")); + /// ``` + #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] + #[must_use] + #[inline] + pub fn eq_ignore_ascii_case(&self, other: &str) -> bool { + self.as_bytes().eq_ignore_ascii_case(other.as_bytes()) + } + + /// Converts this string to its ASCII upper case equivalent in-place. + /// + /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', + /// but non-ASCII letters are unchanged. + /// + /// To return a new uppercased value without modifying the existing one, use + /// [`to_ascii_uppercase()`]. + /// + /// [`to_ascii_uppercase()`]: #method.to_ascii_uppercase + /// + /// # Examples + /// + /// ``` + /// let mut s = String::from("Grüße, Jürgen ❤"); + /// + /// s.make_ascii_uppercase(); + /// + /// assert_eq!("GRüßE, JüRGEN ❤", s); + /// ``` + #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] + #[inline] + pub fn make_ascii_uppercase(&mut self) { + // SAFETY: changing ASCII letters only does not invalidate UTF-8. + let me = unsafe { self.as_bytes_mut() }; + me.make_ascii_uppercase() + } + + /// Converts this string to its ASCII lower case equivalent in-place. + /// + /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', + /// but non-ASCII letters are unchanged. + /// + /// To return a new lowercased value without modifying the existing one, use + /// [`to_ascii_lowercase()`]. + /// + /// [`to_ascii_lowercase()`]: #method.to_ascii_lowercase + /// + /// # Examples + /// + /// ``` + /// let mut s = String::from("GRÜßE, JÜRGEN ❤"); + /// + /// s.make_ascii_lowercase(); + /// + /// assert_eq!("grÜße, jÜrgen ❤", s); + /// ``` + #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] + #[inline] + pub fn make_ascii_lowercase(&mut self) { + // SAFETY: changing ASCII letters only does not invalidate UTF-8. + let me = unsafe { self.as_bytes_mut() }; + me.make_ascii_lowercase() + } + + /// Return an iterator that escapes each char in `self` with [`char::escape_debug`]. + /// + /// Note: only extended grapheme codepoints that begin the string will be + /// escaped. + /// + /// # Examples + /// + /// As an iterator: + /// + /// ``` + /// for c in "❤\n!".escape_debug() { + /// print!("{c}"); + /// } + /// println!(); + /// ``` + /// + /// Using `println!` directly: + /// + /// ``` + /// println!("{}", "❤\n!".escape_debug()); + /// ``` + /// + /// + /// Both are equivalent to: + /// + /// ``` + /// println!("❤\\n!"); + /// ``` + /// + /// Using `to_string`: + /// + /// ``` + /// assert_eq!("❤\n!".escape_debug().to_string(), "❤\\n!"); + /// ``` + #[must_use = "this returns the escaped string as an iterator, \ + without modifying the original"] + #[stable(feature = "str_escape", since = "1.34.0")] + pub fn escape_debug(&self) -> EscapeDebug<'_> { + let mut chars = self.chars(); + EscapeDebug { + inner: chars + .next() + .map(|first| first.escape_debug_ext(EscapeDebugExtArgs::ESCAPE_ALL)) + .into_iter() + .flatten() + .chain(chars.flat_map(CharEscapeDebugContinue)), + } + } + + /// Return an iterator that escapes each char in `self` with [`char::escape_default`]. + /// + /// # Examples + /// + /// As an iterator: + /// + /// ``` + /// for c in "❤\n!".escape_default() { + /// print!("{c}"); + /// } + /// println!(); + /// ``` + /// + /// Using `println!` directly: + /// + /// ``` + /// println!("{}", "❤\n!".escape_default()); + /// ``` + /// + /// + /// Both are equivalent to: + /// + /// ``` + /// println!("\\u{{2764}}\\n!"); + /// ``` + /// + /// Using `to_string`: + /// + /// ``` + /// assert_eq!("❤\n!".escape_default().to_string(), "\\u{2764}\\n!"); + /// ``` + #[must_use = "this returns the escaped string as an iterator, \ + without modifying the original"] + #[stable(feature = "str_escape", since = "1.34.0")] + pub fn escape_default(&self) -> EscapeDefault<'_> { + EscapeDefault { inner: self.chars().flat_map(CharEscapeDefault) } + } + + /// Return an iterator that escapes each char in `self` with [`char::escape_unicode`]. + /// + /// # Examples + /// + /// As an iterator: + /// + /// ``` + /// for c in "❤\n!".escape_unicode() { + /// print!("{c}"); + /// } + /// println!(); + /// ``` + /// + /// Using `println!` directly: + /// + /// ``` + /// println!("{}", "❤\n!".escape_unicode()); + /// ``` + /// + /// + /// Both are equivalent to: + /// + /// ``` + /// println!("\\u{{2764}}\\u{{a}}\\u{{21}}"); + /// ``` + /// + /// Using `to_string`: + /// + /// ``` + /// assert_eq!("❤\n!".escape_unicode().to_string(), "\\u{2764}\\u{a}\\u{21}"); + /// ``` + #[must_use = "this returns the escaped string as an iterator, \ + without modifying the original"] + #[stable(feature = "str_escape", since = "1.34.0")] + pub fn escape_unicode(&self) -> EscapeUnicode<'_> { + EscapeUnicode { inner: self.chars().flat_map(CharEscapeUnicode) } + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl AsRef<[u8]> for str { + #[inline] + fn as_ref(&self) -> &[u8] { + self.as_bytes() + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +#[rustc_const_unstable(feature = "const_default_impls", issue = "87864")] +impl const Default for &str { + /// Creates an empty str + #[inline] + fn default() -> Self { + "" + } +} + +#[stable(feature = "default_mut_str", since = "1.28.0")] +impl Default for &mut str { + /// Creates an empty mutable str + #[inline] + fn default() -> Self { + // SAFETY: The empty string is valid UTF-8. + unsafe { from_utf8_unchecked_mut(&mut []) } + } +} + +impl_fn_for_zst! { + /// A nameable, cloneable fn type + #[derive(Clone)] + struct LinesAnyMap impl<'a> Fn = |line: &'a str| -> &'a str { + let l = line.len(); + if l > 0 && line.as_bytes()[l - 1] == b'\r' { &line[0 .. l - 1] } + else { line } + }; + + #[derive(Clone)] + struct CharEscapeDebugContinue impl Fn = |c: char| -> char::EscapeDebug { + c.escape_debug_ext(EscapeDebugExtArgs { + escape_grapheme_extended: false, + escape_single_quote: true, + escape_double_quote: true + }) + }; + + #[derive(Clone)] + struct CharEscapeUnicode impl Fn = |c: char| -> char::EscapeUnicode { + c.escape_unicode() + }; + #[derive(Clone)] + struct CharEscapeDefault impl Fn = |c: char| -> char::EscapeDefault { + c.escape_default() + }; + + #[derive(Clone)] + struct IsWhitespace impl Fn = |c: char| -> bool { + c.is_whitespace() + }; + + #[derive(Clone)] + struct IsAsciiWhitespace impl Fn = |byte: &u8| -> bool { + byte.is_ascii_whitespace() + }; + + #[derive(Clone)] + struct IsNotEmpty impl<'a, 'b> Fn = |s: &'a &'b str| -> bool { + !s.is_empty() + }; + + #[derive(Clone)] + struct BytesIsNotEmpty impl<'a, 'b> Fn = |s: &'a &'b [u8]| -> bool { + !s.is_empty() + }; + + #[derive(Clone)] + struct UnsafeBytesToStr impl<'a> Fn = |bytes: &'a [u8]| -> &'a str { + // SAFETY: not safe + unsafe { from_utf8_unchecked(bytes) } + }; +} diff --git a/library/core/src/str/pattern.rs b/library/core/src/str/pattern.rs new file mode 100644 index 000000000..031fb8e8b --- /dev/null +++ b/library/core/src/str/pattern.rs @@ -0,0 +1,1686 @@ +//! The string Pattern API. +//! +//! The Pattern API provides a generic mechanism for using different pattern +//! types when searching through a string. +//! +//! For more details, see the traits [`Pattern`], [`Searcher`], +//! [`ReverseSearcher`], and [`DoubleEndedSearcher`]. +//! +//! Although this API is unstable, it is exposed via stable APIs on the +//! [`str`] type. +//! +//! # Examples +//! +//! [`Pattern`] is [implemented][pattern-impls] in the stable API for +//! [`&str`][`str`], [`char`], slices of [`char`], and functions and closures +//! implementing `FnMut(char) -> bool`. +//! +//! ``` +//! let s = "Can you find a needle in a haystack?"; +//! +//! // &str pattern +//! assert_eq!(s.find("you"), Some(4)); +//! // char pattern +//! assert_eq!(s.find('n'), Some(2)); +//! // array of chars pattern +//! assert_eq!(s.find(&['a', 'e', 'i', 'o', 'u']), Some(1)); +//! // slice of chars pattern +//! assert_eq!(s.find(&['a', 'e', 'i', 'o', 'u'][..]), Some(1)); +//! // closure pattern +//! assert_eq!(s.find(|c: char| c.is_ascii_punctuation()), Some(35)); +//! ``` +//! +//! [pattern-impls]: Pattern#implementors + +#![unstable( + feature = "pattern", + reason = "API not fully fleshed out and ready to be stabilized", + issue = "27721" +)] + +use crate::cmp; +use crate::fmt; +use crate::slice::memchr; + +// Pattern + +/// A string pattern. +/// +/// A `Pattern<'a>` expresses that the implementing type +/// can be used as a string pattern for searching in a [`&'a str`][str]. +/// +/// For example, both `'a'` and `"aa"` are patterns that +/// would match at index `1` in the string `"baaaab"`. +/// +/// The trait itself acts as a builder for an associated +/// [`Searcher`] type, which does the actual work of finding +/// occurrences of the pattern in a string. +/// +/// Depending on the type of the pattern, the behaviour of methods like +/// [`str::find`] and [`str::contains`] can change. The table below describes +/// some of those behaviours. +/// +/// | Pattern type | Match condition | +/// |--------------------------|-------------------------------------------| +/// | `&str` | is substring | +/// | `char` | is contained in string | +/// | `&[char]` | any char in slice is contained in string | +/// | `F: FnMut(char) -> bool` | `F` returns `true` for a char in string | +/// | `&&str` | is substring | +/// | `&String` | is substring | +/// +/// # Examples +/// +/// ``` +/// // &str +/// assert_eq!("abaaa".find("ba"), Some(1)); +/// assert_eq!("abaaa".find("bac"), None); +/// +/// // char +/// assert_eq!("abaaa".find('a'), Some(0)); +/// assert_eq!("abaaa".find('b'), Some(1)); +/// assert_eq!("abaaa".find('c'), None); +/// +/// // &[char; N] +/// assert_eq!("ab".find(&['b', 'a']), Some(0)); +/// assert_eq!("abaaa".find(&['a', 'z']), Some(0)); +/// assert_eq!("abaaa".find(&['c', 'd']), None); +/// +/// // &[char] +/// assert_eq!("ab".find(&['b', 'a'][..]), Some(0)); +/// assert_eq!("abaaa".find(&['a', 'z'][..]), Some(0)); +/// assert_eq!("abaaa".find(&['c', 'd'][..]), None); +/// +/// // FnMut(char) -> bool +/// assert_eq!("abcdef_z".find(|ch| ch > 'd' && ch < 'y'), Some(4)); +/// assert_eq!("abcddd_z".find(|ch| ch > 'd' && ch < 'y'), None); +/// ``` +pub trait Pattern<'a>: Sized { + /// Associated searcher for this pattern + type Searcher: Searcher<'a>; + + /// Constructs the associated searcher from + /// `self` and the `haystack` to search in. + fn into_searcher(self, haystack: &'a str) -> Self::Searcher; + + /// Checks whether the pattern matches anywhere in the haystack + #[inline] + fn is_contained_in(self, haystack: &'a str) -> bool { + self.into_searcher(haystack).next_match().is_some() + } + + /// Checks whether the pattern matches at the front of the haystack + #[inline] + fn is_prefix_of(self, haystack: &'a str) -> bool { + matches!(self.into_searcher(haystack).next(), SearchStep::Match(0, _)) + } + + /// Checks whether the pattern matches at the back of the haystack + #[inline] + fn is_suffix_of(self, haystack: &'a str) -> bool + where + Self::Searcher: ReverseSearcher<'a>, + { + matches!(self.into_searcher(haystack).next_back(), SearchStep::Match(_, j) if haystack.len() == j) + } + + /// Removes the pattern from the front of haystack, if it matches. + #[inline] + fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> { + if let SearchStep::Match(start, len) = self.into_searcher(haystack).next() { + debug_assert_eq!( + start, 0, + "The first search step from Searcher \ + must include the first character" + ); + // SAFETY: `Searcher` is known to return valid indices. + unsafe { Some(haystack.get_unchecked(len..)) } + } else { + None + } + } + + /// Removes the pattern from the back of haystack, if it matches. + #[inline] + fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> + where + Self::Searcher: ReverseSearcher<'a>, + { + if let SearchStep::Match(start, end) = self.into_searcher(haystack).next_back() { + debug_assert_eq!( + end, + haystack.len(), + "The first search step from ReverseSearcher \ + must include the last character" + ); + // SAFETY: `Searcher` is known to return valid indices. + unsafe { Some(haystack.get_unchecked(..start)) } + } else { + None + } + } +} + +// Searcher + +/// Result of calling [`Searcher::next()`] or [`ReverseSearcher::next_back()`]. +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub enum SearchStep { + /// Expresses that a match of the pattern has been found at + /// `haystack[a..b]`. + Match(usize, usize), + /// Expresses that `haystack[a..b]` has been rejected as a possible match + /// of the pattern. + /// + /// Note that there might be more than one `Reject` between two `Match`es, + /// there is no requirement for them to be combined into one. + Reject(usize, usize), + /// Expresses that every byte of the haystack has been visited, ending + /// the iteration. + Done, +} + +/// A searcher for a string pattern. +/// +/// This trait provides methods for searching for non-overlapping +/// matches of a pattern starting from the front (left) of a string. +/// +/// It will be implemented by associated `Searcher` +/// types of the [`Pattern`] trait. +/// +/// The trait is marked unsafe because the indices returned by the +/// [`next()`][Searcher::next] methods are required to lie on valid utf8 +/// boundaries in the haystack. This enables consumers of this trait to +/// slice the haystack without additional runtime checks. +pub unsafe trait Searcher<'a> { + /// Getter for the underlying string to be searched in + /// + /// Will always return the same [`&str`][str]. + fn haystack(&self) -> &'a str; + + /// Performs the next search step starting from the front. + /// + /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` matches + /// the pattern. + /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]` can + /// not match the pattern, even partially. + /// - Returns [`Done`][SearchStep::Done] if every byte of the haystack has + /// been visited. + /// + /// The stream of [`Match`][SearchStep::Match] and + /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done] + /// will contain index ranges that are adjacent, non-overlapping, + /// covering the whole haystack, and laying on utf8 boundaries. + /// + /// A [`Match`][SearchStep::Match] result needs to contain the whole matched + /// pattern, however [`Reject`][SearchStep::Reject] results may be split up + /// into arbitrary many adjacent fragments. Both ranges may have zero length. + /// + /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` + /// might produce the stream + /// `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, 8)]` + fn next(&mut self) -> SearchStep; + + /// Finds the next [`Match`][SearchStep::Match] result. See [`next()`][Searcher::next]. + /// + /// Unlike [`next()`][Searcher::next], there is no guarantee that the returned ranges + /// of this and [`next_reject`][Searcher::next_reject] will overlap. This will return + /// `(start_match, end_match)`, where start_match is the index of where + /// the match begins, and end_match is the index after the end of the match. + #[inline] + fn next_match(&mut self) -> Option<(usize, usize)> { + loop { + match self.next() { + SearchStep::Match(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } + + /// Finds the next [`Reject`][SearchStep::Reject] result. See [`next()`][Searcher::next] + /// and [`next_match()`][Searcher::next_match]. + /// + /// Unlike [`next()`][Searcher::next], there is no guarantee that the returned ranges + /// of this and [`next_match`][Searcher::next_match] will overlap. + #[inline] + fn next_reject(&mut self) -> Option<(usize, usize)> { + loop { + match self.next() { + SearchStep::Reject(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } +} + +/// A reverse searcher for a string pattern. +/// +/// This trait provides methods for searching for non-overlapping +/// matches of a pattern starting from the back (right) of a string. +/// +/// It will be implemented by associated [`Searcher`] +/// types of the [`Pattern`] trait if the pattern supports searching +/// for it from the back. +/// +/// The index ranges returned by this trait are not required +/// to exactly match those of the forward search in reverse. +/// +/// For the reason why this trait is marked unsafe, see them +/// parent trait [`Searcher`]. +pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { + /// Performs the next search step starting from the back. + /// + /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` + /// matches the pattern. + /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]` + /// can not match the pattern, even partially. + /// - Returns [`Done`][SearchStep::Done] if every byte of the haystack + /// has been visited + /// + /// The stream of [`Match`][SearchStep::Match] and + /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done] + /// will contain index ranges that are adjacent, non-overlapping, + /// covering the whole haystack, and laying on utf8 boundaries. + /// + /// A [`Match`][SearchStep::Match] result needs to contain the whole matched + /// pattern, however [`Reject`][SearchStep::Reject] results may be split up + /// into arbitrary many adjacent fragments. Both ranges may have zero length. + /// + /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` + /// might produce the stream + /// `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, 1)]`. + fn next_back(&mut self) -> SearchStep; + + /// Finds the next [`Match`][SearchStep::Match] result. + /// See [`next_back()`][ReverseSearcher::next_back]. + #[inline] + fn next_match_back(&mut self) -> Option<(usize, usize)> { + loop { + match self.next_back() { + SearchStep::Match(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } + + /// Finds the next [`Reject`][SearchStep::Reject] result. + /// See [`next_back()`][ReverseSearcher::next_back]. + #[inline] + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + loop { + match self.next_back() { + SearchStep::Reject(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } +} + +/// A marker trait to express that a [`ReverseSearcher`] +/// can be used for a [`DoubleEndedIterator`] implementation. +/// +/// For this, the impl of [`Searcher`] and [`ReverseSearcher`] need +/// to follow these conditions: +/// +/// - All results of `next()` need to be identical +/// to the results of `next_back()` in reverse order. +/// - `next()` and `next_back()` need to behave as +/// the two ends of a range of values, that is they +/// can not "walk past each other". +/// +/// # Examples +/// +/// `char::Searcher` is a `DoubleEndedSearcher` because searching for a +/// [`char`] only requires looking at one at a time, which behaves the same +/// from both ends. +/// +/// `(&str)::Searcher` is not a `DoubleEndedSearcher` because +/// the pattern `"aa"` in the haystack `"aaa"` matches as either +/// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched. +pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} + +///////////////////////////////////////////////////////////////////////////// +// Impl for char +///////////////////////////////////////////////////////////////////////////// + +/// Associated type for `>::Searcher`. +#[derive(Clone, Debug)] +pub struct CharSearcher<'a> { + haystack: &'a str, + // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack` + // This invariant can be broken *within* next_match and next_match_back, however + // they must exit with fingers on valid code point boundaries. + /// `finger` is the current byte index of the forward search. + /// Imagine that it exists before the byte at its index, i.e. + /// `haystack[finger]` is the first byte of the slice we must inspect during + /// forward searching + finger: usize, + /// `finger_back` is the current byte index of the reverse search. + /// Imagine that it exists after the byte at its index, i.e. + /// haystack[finger_back - 1] is the last byte of the slice we must inspect during + /// forward searching (and thus the first byte to be inspected when calling next_back()). + finger_back: usize, + /// The character being searched for + needle: char, + + // safety invariant: `utf8_size` must be less than 5 + /// The number of bytes `needle` takes up when encoded in utf8. + utf8_size: usize, + /// A utf8 encoded copy of the `needle` + utf8_encoded: [u8; 4], +} + +unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { + #[inline] + fn haystack(&self) -> &'a str { + self.haystack + } + #[inline] + fn next(&mut self) -> SearchStep { + let old_finger = self.finger; + // SAFETY: 1-4 guarantee safety of `get_unchecked` + // 1. `self.finger` and `self.finger_back` are kept on unicode boundaries + // (this is invariant) + // 2. `self.finger >= 0` since it starts at 0 and only increases + // 3. `self.finger < self.finger_back` because otherwise the char `iter` + // would return `SearchStep::Done` + // 4. `self.finger` comes before the end of the haystack because `self.finger_back` + // starts at the end and only decreases + let slice = unsafe { self.haystack.get_unchecked(old_finger..self.finger_back) }; + let mut iter = slice.chars(); + let old_len = iter.iter.len(); + if let Some(ch) = iter.next() { + // add byte offset of current character + // without re-encoding as utf-8 + self.finger += old_len - iter.iter.len(); + if ch == self.needle { + SearchStep::Match(old_finger, self.finger) + } else { + SearchStep::Reject(old_finger, self.finger) + } + } else { + SearchStep::Done + } + } + #[inline] + fn next_match(&mut self) -> Option<(usize, usize)> { + loop { + // get the haystack after the last character found + let bytes = self.haystack.as_bytes().get(self.finger..self.finger_back)?; + // the last byte of the utf8 encoded needle + // SAFETY: we have an invariant that `utf8_size < 5` + let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; + if let Some(index) = memchr::memchr(last_byte, bytes) { + // The new finger is the index of the byte we found, + // plus one, since we memchr'd for the last byte of the character. + // + // Note that this doesn't always give us a finger on a UTF8 boundary. + // If we *didn't* find our character + // we may have indexed to the non-last byte of a 3-byte or 4-byte character. + // We can't just skip to the next valid starting byte because a character like + // ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find + // the second byte when searching for the third. + // + // However, this is totally okay. While we have the invariant that + // self.finger is on a UTF8 boundary, this invariant is not relied upon + // within this method (it is relied upon in CharSearcher::next()). + // + // We only exit this method when we reach the end of the string, or if we + // find something. When we find something the `finger` will be set + // to a UTF8 boundary. + self.finger += index + 1; + if self.finger >= self.utf8_size { + let found_char = self.finger - self.utf8_size; + if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) { + if slice == &self.utf8_encoded[0..self.utf8_size] { + return Some((found_char, self.finger)); + } + } + } + } else { + // found nothing, exit + self.finger = self.finger_back; + return None; + } + } + } + + // let next_reject use the default implementation from the Searcher trait +} + +unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { + #[inline] + fn next_back(&mut self) -> SearchStep { + let old_finger = self.finger_back; + // SAFETY: see the comment for next() above + let slice = unsafe { self.haystack.get_unchecked(self.finger..old_finger) }; + let mut iter = slice.chars(); + let old_len = iter.iter.len(); + if let Some(ch) = iter.next_back() { + // subtract byte offset of current character + // without re-encoding as utf-8 + self.finger_back -= old_len - iter.iter.len(); + if ch == self.needle { + SearchStep::Match(self.finger_back, old_finger) + } else { + SearchStep::Reject(self.finger_back, old_finger) + } + } else { + SearchStep::Done + } + } + #[inline] + fn next_match_back(&mut self) -> Option<(usize, usize)> { + let haystack = self.haystack.as_bytes(); + loop { + // get the haystack up to but not including the last character searched + let bytes = haystack.get(self.finger..self.finger_back)?; + // the last byte of the utf8 encoded needle + // SAFETY: we have an invariant that `utf8_size < 5` + let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; + if let Some(index) = memchr::memrchr(last_byte, bytes) { + // we searched a slice that was offset by self.finger, + // add self.finger to recoup the original index + let index = self.finger + index; + // memrchr will return the index of the byte we wish to + // find. In case of an ASCII character, this is indeed + // were we wish our new finger to be ("after" the found + // char in the paradigm of reverse iteration). For + // multibyte chars we need to skip down by the number of more + // bytes they have than ASCII + let shift = self.utf8_size - 1; + if index >= shift { + let found_char = index - shift; + if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) { + if slice == &self.utf8_encoded[0..self.utf8_size] { + // move finger to before the character found (i.e., at its start index) + self.finger_back = found_char; + return Some((self.finger_back, self.finger_back + self.utf8_size)); + } + } + } + // We can't use finger_back = index - size + 1 here. If we found the last char + // of a different-sized character (or the middle byte of a different character) + // we need to bump the finger_back down to `index`. This similarly makes + // `finger_back` have the potential to no longer be on a boundary, + // but this is OK since we only exit this function on a boundary + // or when the haystack has been searched completely. + // + // Unlike next_match this does not + // have the problem of repeated bytes in utf-8 because + // we're searching for the last byte, and we can only have + // found the last byte when searching in reverse. + self.finger_back = index; + } else { + self.finger_back = self.finger; + // found nothing, exit + return None; + } + } + } + + // let next_reject_back use the default implementation from the Searcher trait +} + +impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} + +/// Searches for chars that are equal to a given [`char`]. +/// +/// # Examples +/// +/// ``` +/// assert_eq!("Hello world".find('o'), Some(4)); +/// ``` +impl<'a> Pattern<'a> for char { + type Searcher = CharSearcher<'a>; + + #[inline] + fn into_searcher(self, haystack: &'a str) -> Self::Searcher { + let mut utf8_encoded = [0; 4]; + let utf8_size = self.encode_utf8(&mut utf8_encoded).len(); + CharSearcher { + haystack, + finger: 0, + finger_back: haystack.len(), + needle: self, + utf8_size, + utf8_encoded, + } + } + + #[inline] + fn is_contained_in(self, haystack: &'a str) -> bool { + if (self as u32) < 128 { + haystack.as_bytes().contains(&(self as u8)) + } else { + let mut buffer = [0u8; 4]; + self.encode_utf8(&mut buffer).is_contained_in(haystack) + } + } + + #[inline] + fn is_prefix_of(self, haystack: &'a str) -> bool { + self.encode_utf8(&mut [0u8; 4]).is_prefix_of(haystack) + } + + #[inline] + fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> { + self.encode_utf8(&mut [0u8; 4]).strip_prefix_of(haystack) + } + + #[inline] + fn is_suffix_of(self, haystack: &'a str) -> bool + where + Self::Searcher: ReverseSearcher<'a>, + { + self.encode_utf8(&mut [0u8; 4]).is_suffix_of(haystack) + } + + #[inline] + fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> + where + Self::Searcher: ReverseSearcher<'a>, + { + self.encode_utf8(&mut [0u8; 4]).strip_suffix_of(haystack) + } +} + +///////////////////////////////////////////////////////////////////////////// +// Impl for a MultiCharEq wrapper +///////////////////////////////////////////////////////////////////////////// + +#[doc(hidden)] +trait MultiCharEq { + fn matches(&mut self, c: char) -> bool; +} + +impl MultiCharEq for F +where + F: FnMut(char) -> bool, +{ + #[inline] + fn matches(&mut self, c: char) -> bool { + (*self)(c) + } +} + +impl MultiCharEq for [char; N] { + #[inline] + fn matches(&mut self, c: char) -> bool { + self.iter().any(|&m| m == c) + } +} + +impl MultiCharEq for &[char; N] { + #[inline] + fn matches(&mut self, c: char) -> bool { + self.iter().any(|&m| m == c) + } +} + +impl MultiCharEq for &[char] { + #[inline] + fn matches(&mut self, c: char) -> bool { + self.iter().any(|&m| m == c) + } +} + +struct MultiCharEqPattern(C); + +#[derive(Clone, Debug)] +struct MultiCharEqSearcher<'a, C: MultiCharEq> { + char_eq: C, + haystack: &'a str, + char_indices: super::CharIndices<'a>, +} + +impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern { + type Searcher = MultiCharEqSearcher<'a, C>; + + #[inline] + fn into_searcher(self, haystack: &'a str) -> MultiCharEqSearcher<'a, C> { + MultiCharEqSearcher { haystack, char_eq: self.0, char_indices: haystack.char_indices() } + } +} + +unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> { + #[inline] + fn haystack(&self) -> &'a str { + self.haystack + } + + #[inline] + fn next(&mut self) -> SearchStep { + let s = &mut self.char_indices; + // Compare lengths of the internal byte slice iterator + // to find length of current char + let pre_len = s.iter.iter.len(); + if let Some((i, c)) = s.next() { + let len = s.iter.iter.len(); + let char_len = pre_len - len; + if self.char_eq.matches(c) { + return SearchStep::Match(i, i + char_len); + } else { + return SearchStep::Reject(i, i + char_len); + } + } + SearchStep::Done + } +} + +unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> { + #[inline] + fn next_back(&mut self) -> SearchStep { + let s = &mut self.char_indices; + // Compare lengths of the internal byte slice iterator + // to find length of current char + let pre_len = s.iter.iter.len(); + if let Some((i, c)) = s.next_back() { + let len = s.iter.iter.len(); + let char_len = pre_len - len; + if self.char_eq.matches(c) { + return SearchStep::Match(i, i + char_len); + } else { + return SearchStep::Reject(i, i + char_len); + } + } + SearchStep::Done + } +} + +impl<'a, C: MultiCharEq> DoubleEndedSearcher<'a> for MultiCharEqSearcher<'a, C> {} + +///////////////////////////////////////////////////////////////////////////// + +macro_rules! pattern_methods { + ($t:ty, $pmap:expr, $smap:expr) => { + type Searcher = $t; + + #[inline] + fn into_searcher(self, haystack: &'a str) -> $t { + ($smap)(($pmap)(self).into_searcher(haystack)) + } + + #[inline] + fn is_contained_in(self, haystack: &'a str) -> bool { + ($pmap)(self).is_contained_in(haystack) + } + + #[inline] + fn is_prefix_of(self, haystack: &'a str) -> bool { + ($pmap)(self).is_prefix_of(haystack) + } + + #[inline] + fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> { + ($pmap)(self).strip_prefix_of(haystack) + } + + #[inline] + fn is_suffix_of(self, haystack: &'a str) -> bool + where + $t: ReverseSearcher<'a>, + { + ($pmap)(self).is_suffix_of(haystack) + } + + #[inline] + fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> + where + $t: ReverseSearcher<'a>, + { + ($pmap)(self).strip_suffix_of(haystack) + } + }; +} + +macro_rules! searcher_methods { + (forward) => { + #[inline] + fn haystack(&self) -> &'a str { + self.0.haystack() + } + #[inline] + fn next(&mut self) -> SearchStep { + self.0.next() + } + #[inline] + fn next_match(&mut self) -> Option<(usize, usize)> { + self.0.next_match() + } + #[inline] + fn next_reject(&mut self) -> Option<(usize, usize)> { + self.0.next_reject() + } + }; + (reverse) => { + #[inline] + fn next_back(&mut self) -> SearchStep { + self.0.next_back() + } + #[inline] + fn next_match_back(&mut self) -> Option<(usize, usize)> { + self.0.next_match_back() + } + #[inline] + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + self.0.next_reject_back() + } + }; +} + +/// Associated type for `<[char; N] as Pattern<'a>>::Searcher`. +#[derive(Clone, Debug)] +pub struct CharArraySearcher<'a, const N: usize>( + as Pattern<'a>>::Searcher, +); + +/// Associated type for `<&[char; N] as Pattern<'a>>::Searcher`. +#[derive(Clone, Debug)] +pub struct CharArrayRefSearcher<'a, 'b, const N: usize>( + as Pattern<'a>>::Searcher, +); + +/// Searches for chars that are equal to any of the [`char`]s in the array. +/// +/// # Examples +/// +/// ``` +/// assert_eq!("Hello world".find(['l', 'l']), Some(2)); +/// assert_eq!("Hello world".find(['l', 'l']), Some(2)); +/// ``` +impl<'a, const N: usize> Pattern<'a> for [char; N] { + pattern_methods!(CharArraySearcher<'a, N>, MultiCharEqPattern, CharArraySearcher); +} + +unsafe impl<'a, const N: usize> Searcher<'a> for CharArraySearcher<'a, N> { + searcher_methods!(forward); +} + +unsafe impl<'a, const N: usize> ReverseSearcher<'a> for CharArraySearcher<'a, N> { + searcher_methods!(reverse); +} + +/// Searches for chars that are equal to any of the [`char`]s in the array. +/// +/// # Examples +/// +/// ``` +/// assert_eq!("Hello world".find(&['l', 'l']), Some(2)); +/// assert_eq!("Hello world".find(&['l', 'l']), Some(2)); +/// ``` +impl<'a, 'b, const N: usize> Pattern<'a> for &'b [char; N] { + pattern_methods!(CharArrayRefSearcher<'a, 'b, N>, MultiCharEqPattern, CharArrayRefSearcher); +} + +unsafe impl<'a, 'b, const N: usize> Searcher<'a> for CharArrayRefSearcher<'a, 'b, N> { + searcher_methods!(forward); +} + +unsafe impl<'a, 'b, const N: usize> ReverseSearcher<'a> for CharArrayRefSearcher<'a, 'b, N> { + searcher_methods!(reverse); +} + +///////////////////////////////////////////////////////////////////////////// +// Impl for &[char] +///////////////////////////////////////////////////////////////////////////// + +// Todo: Change / Remove due to ambiguity in meaning. + +/// Associated type for `<&[char] as Pattern<'a>>::Searcher`. +#[derive(Clone, Debug)] +pub struct CharSliceSearcher<'a, 'b>( as Pattern<'a>>::Searcher); + +unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> { + searcher_methods!(forward); +} + +unsafe impl<'a, 'b> ReverseSearcher<'a> for CharSliceSearcher<'a, 'b> { + searcher_methods!(reverse); +} + +impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {} + +/// Searches for chars that are equal to any of the [`char`]s in the slice. +/// +/// # Examples +/// +/// ``` +/// assert_eq!("Hello world".find(&['l', 'l'] as &[_]), Some(2)); +/// assert_eq!("Hello world".find(&['l', 'l'][..]), Some(2)); +/// ``` +impl<'a, 'b> Pattern<'a> for &'b [char] { + pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher); +} + +///////////////////////////////////////////////////////////////////////////// +// Impl for F: FnMut(char) -> bool +///////////////////////////////////////////////////////////////////////////// + +/// Associated type for `>::Searcher`. +#[derive(Clone)] +pub struct CharPredicateSearcher<'a, F>( as Pattern<'a>>::Searcher) +where + F: FnMut(char) -> bool; + +impl fmt::Debug for CharPredicateSearcher<'_, F> +where + F: FnMut(char) -> bool, +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("CharPredicateSearcher") + .field("haystack", &self.0.haystack) + .field("char_indices", &self.0.char_indices) + .finish() + } +} +unsafe impl<'a, F> Searcher<'a> for CharPredicateSearcher<'a, F> +where + F: FnMut(char) -> bool, +{ + searcher_methods!(forward); +} + +unsafe impl<'a, F> ReverseSearcher<'a> for CharPredicateSearcher<'a, F> +where + F: FnMut(char) -> bool, +{ + searcher_methods!(reverse); +} + +impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> where F: FnMut(char) -> bool {} + +/// Searches for [`char`]s that match the given predicate. +/// +/// # Examples +/// +/// ``` +/// assert_eq!("Hello world".find(char::is_uppercase), Some(0)); +/// assert_eq!("Hello world".find(|c| "aeiou".contains(c)), Some(1)); +/// ``` +impl<'a, F> Pattern<'a> for F +where + F: FnMut(char) -> bool, +{ + pattern_methods!(CharPredicateSearcher<'a, F>, MultiCharEqPattern, CharPredicateSearcher); +} + +///////////////////////////////////////////////////////////////////////////// +// Impl for &&str +///////////////////////////////////////////////////////////////////////////// + +/// Delegates to the `&str` impl. +impl<'a, 'b, 'c> Pattern<'a> for &'c &'b str { + pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s); +} + +///////////////////////////////////////////////////////////////////////////// +// Impl for &str +///////////////////////////////////////////////////////////////////////////// + +/// Non-allocating substring search. +/// +/// Will handle the pattern `""` as returning empty matches at each character +/// boundary. +/// +/// # Examples +/// +/// ``` +/// assert_eq!("Hello world".find("world"), Some(6)); +/// ``` +impl<'a, 'b> Pattern<'a> for &'b str { + type Searcher = StrSearcher<'a, 'b>; + + #[inline] + fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> { + StrSearcher::new(haystack, self) + } + + /// Checks whether the pattern matches at the front of the haystack. + #[inline] + fn is_prefix_of(self, haystack: &'a str) -> bool { + haystack.as_bytes().starts_with(self.as_bytes()) + } + + /// Removes the pattern from the front of haystack, if it matches. + #[inline] + fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> { + if self.is_prefix_of(haystack) { + // SAFETY: prefix was just verified to exist. + unsafe { Some(haystack.get_unchecked(self.as_bytes().len()..)) } + } else { + None + } + } + + /// Checks whether the pattern matches at the back of the haystack. + #[inline] + fn is_suffix_of(self, haystack: &'a str) -> bool { + haystack.as_bytes().ends_with(self.as_bytes()) + } + + /// Removes the pattern from the back of haystack, if it matches. + #[inline] + fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> { + if self.is_suffix_of(haystack) { + let i = haystack.len() - self.as_bytes().len(); + // SAFETY: suffix was just verified to exist. + unsafe { Some(haystack.get_unchecked(..i)) } + } else { + None + } + } +} + +///////////////////////////////////////////////////////////////////////////// +// Two Way substring searcher +///////////////////////////////////////////////////////////////////////////// + +#[derive(Clone, Debug)] +/// Associated type for `<&str as Pattern<'a>>::Searcher`. +pub struct StrSearcher<'a, 'b> { + haystack: &'a str, + needle: &'b str, + + searcher: StrSearcherImpl, +} + +#[derive(Clone, Debug)] +enum StrSearcherImpl { + Empty(EmptyNeedle), + TwoWay(TwoWaySearcher), +} + +#[derive(Clone, Debug)] +struct EmptyNeedle { + position: usize, + end: usize, + is_match_fw: bool, + is_match_bw: bool, + // Needed in case of an empty haystack, see #85462 + is_finished: bool, +} + +impl<'a, 'b> StrSearcher<'a, 'b> { + fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> { + if needle.is_empty() { + StrSearcher { + haystack, + needle, + searcher: StrSearcherImpl::Empty(EmptyNeedle { + position: 0, + end: haystack.len(), + is_match_fw: true, + is_match_bw: true, + is_finished: false, + }), + } + } else { + StrSearcher { + haystack, + needle, + searcher: StrSearcherImpl::TwoWay(TwoWaySearcher::new( + needle.as_bytes(), + haystack.len(), + )), + } + } + } +} + +unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { + #[inline] + fn haystack(&self) -> &'a str { + self.haystack + } + + #[inline] + fn next(&mut self) -> SearchStep { + match self.searcher { + StrSearcherImpl::Empty(ref mut searcher) => { + if searcher.is_finished { + return SearchStep::Done; + } + // empty needle rejects every char and matches every empty string between them + let is_match = searcher.is_match_fw; + searcher.is_match_fw = !searcher.is_match_fw; + let pos = searcher.position; + match self.haystack[pos..].chars().next() { + _ if is_match => SearchStep::Match(pos, pos), + None => { + searcher.is_finished = true; + SearchStep::Done + } + Some(ch) => { + searcher.position += ch.len_utf8(); + SearchStep::Reject(pos, searcher.position) + } + } + } + StrSearcherImpl::TwoWay(ref mut searcher) => { + // TwoWaySearcher produces valid *Match* indices that split at char boundaries + // as long as it does correct matching and that haystack and needle are + // valid UTF-8 + // *Rejects* from the algorithm can fall on any indices, but we will walk them + // manually to the next character boundary, so that they are utf-8 safe. + if searcher.position == self.haystack.len() { + return SearchStep::Done; + } + let is_long = searcher.memory == usize::MAX; + match searcher.next::( + self.haystack.as_bytes(), + self.needle.as_bytes(), + is_long, + ) { + SearchStep::Reject(a, mut b) => { + // skip to next char boundary + while !self.haystack.is_char_boundary(b) { + b += 1; + } + searcher.position = cmp::max(b, searcher.position); + SearchStep::Reject(a, b) + } + otherwise => otherwise, + } + } + } + } + + #[inline] + fn next_match(&mut self) -> Option<(usize, usize)> { + match self.searcher { + StrSearcherImpl::Empty(..) => loop { + match self.next() { + SearchStep::Match(a, b) => return Some((a, b)), + SearchStep::Done => return None, + SearchStep::Reject(..) => {} + } + }, + StrSearcherImpl::TwoWay(ref mut searcher) => { + let is_long = searcher.memory == usize::MAX; + // write out `true` and `false` cases to encourage the compiler + // to specialize the two cases separately. + if is_long { + searcher.next::( + self.haystack.as_bytes(), + self.needle.as_bytes(), + true, + ) + } else { + searcher.next::( + self.haystack.as_bytes(), + self.needle.as_bytes(), + false, + ) + } + } + } + } +} + +unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { + #[inline] + fn next_back(&mut self) -> SearchStep { + match self.searcher { + StrSearcherImpl::Empty(ref mut searcher) => { + if searcher.is_finished { + return SearchStep::Done; + } + let is_match = searcher.is_match_bw; + searcher.is_match_bw = !searcher.is_match_bw; + let end = searcher.end; + match self.haystack[..end].chars().next_back() { + _ if is_match => SearchStep::Match(end, end), + None => { + searcher.is_finished = true; + SearchStep::Done + } + Some(ch) => { + searcher.end -= ch.len_utf8(); + SearchStep::Reject(searcher.end, end) + } + } + } + StrSearcherImpl::TwoWay(ref mut searcher) => { + if searcher.end == 0 { + return SearchStep::Done; + } + let is_long = searcher.memory == usize::MAX; + match searcher.next_back::( + self.haystack.as_bytes(), + self.needle.as_bytes(), + is_long, + ) { + SearchStep::Reject(mut a, b) => { + // skip to next char boundary + while !self.haystack.is_char_boundary(a) { + a -= 1; + } + searcher.end = cmp::min(a, searcher.end); + SearchStep::Reject(a, b) + } + otherwise => otherwise, + } + } + } + } + + #[inline] + fn next_match_back(&mut self) -> Option<(usize, usize)> { + match self.searcher { + StrSearcherImpl::Empty(..) => loop { + match self.next_back() { + SearchStep::Match(a, b) => return Some((a, b)), + SearchStep::Done => return None, + SearchStep::Reject(..) => {} + } + }, + StrSearcherImpl::TwoWay(ref mut searcher) => { + let is_long = searcher.memory == usize::MAX; + // write out `true` and `false`, like `next_match` + if is_long { + searcher.next_back::( + self.haystack.as_bytes(), + self.needle.as_bytes(), + true, + ) + } else { + searcher.next_back::( + self.haystack.as_bytes(), + self.needle.as_bytes(), + false, + ) + } + } + } + } +} + +/// The internal state of the two-way substring search algorithm. +#[derive(Clone, Debug)] +struct TwoWaySearcher { + // constants + /// critical factorization index + crit_pos: usize, + /// critical factorization index for reversed needle + crit_pos_back: usize, + period: usize, + /// `byteset` is an extension (not part of the two way algorithm); + /// it's a 64-bit "fingerprint" where each set bit `j` corresponds + /// to a (byte & 63) == j present in the needle. + byteset: u64, + + // variables + position: usize, + end: usize, + /// index into needle before which we have already matched + memory: usize, + /// index into needle after which we have already matched + memory_back: usize, +} + +/* + This is the Two-Way search algorithm, which was introduced in the paper: + Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675. + + Here's some background information. + + A *word* is a string of symbols. The *length* of a word should be a familiar + notion, and here we denote it for any word x by |x|. + (We also allow for the possibility of the *empty word*, a word of length zero). + + If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a + *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p]. + For example, both 1 and 2 are periods for the string "aa". As another example, + the only period of the string "abcd" is 4. + + We denote by period(x) the *smallest* period of x (provided that x is non-empty). + This is always well-defined since every non-empty word x has at least one period, + |x|. We sometimes call this *the period* of x. + + If u, v and x are words such that x = uv, where uv is the concatenation of u and + v, then we say that (u, v) is a *factorization* of x. + + Let (u, v) be a factorization for a word x. Then if w is a non-empty word such + that both of the following hold + + - either w is a suffix of u or u is a suffix of w + - either w is a prefix of v or v is a prefix of w + + then w is said to be a *repetition* for the factorization (u, v). + + Just to unpack this, there are four possibilities here. Let w = "abc". Then we + might have: + + - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde") + - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab") + - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi") + - u is a suffix of w and v is a prefix of w. ex: ("bc", "a") + + Note that the word vu is a repetition for any factorization (u,v) of x = uv, + so every factorization has at least one repetition. + + If x is a string and (u, v) is a factorization for x, then a *local period* for + (u, v) is an integer r such that there is some word w such that |w| = r and w is + a repetition for (u, v). + + We denote by local_period(u, v) the smallest local period of (u, v). We sometimes + call this *the local period* of (u, v). Provided that x = uv is non-empty, this + is well-defined (because each non-empty word has at least one factorization, as + noted above). + + It can be proven that the following is an equivalent definition of a local period + for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for + all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are + defined. (i.e., i > 0 and i + r < |x|). + + Using the above reformulation, it is easy to prove that + + 1 <= local_period(u, v) <= period(uv) + + A factorization (u, v) of x such that local_period(u,v) = period(x) is called a + *critical factorization*. + + The algorithm hinges on the following theorem, which is stated without proof: + + **Critical Factorization Theorem** Any word x has at least one critical + factorization (u, v) such that |u| < period(x). + + The purpose of maximal_suffix is to find such a critical factorization. + + If the period is short, compute another factorization x = u' v' to use + for reverse search, chosen instead so that |v'| < period(x). + +*/ +impl TwoWaySearcher { + fn new(needle: &[u8], end: usize) -> TwoWaySearcher { + let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false); + let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true); + + let (crit_pos, period) = if crit_pos_false > crit_pos_true { + (crit_pos_false, period_false) + } else { + (crit_pos_true, period_true) + }; + + // A particularly readable explanation of what's going on here can be found + // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically + // see the code for "Algorithm CP" on p. 323. + // + // What's going on is we have some critical factorization (u, v) of the + // needle, and we want to determine whether u is a suffix of + // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use + // "Algorithm CP2", which is optimized for when the period of the needle + // is large. + if needle[..crit_pos] == needle[period..period + crit_pos] { + // short period case -- the period is exact + // compute a separate critical factorization for the reversed needle + // x = u' v' where |v'| < period(x). + // + // This is sped up by the period being known already. + // Note that a case like x = "acba" may be factored exactly forwards + // (crit_pos = 1, period = 3) while being factored with approximate + // period in reverse (crit_pos = 2, period = 2). We use the given + // reverse factorization but keep the exact period. + let crit_pos_back = needle.len() + - cmp::max( + TwoWaySearcher::reverse_maximal_suffix(needle, period, false), + TwoWaySearcher::reverse_maximal_suffix(needle, period, true), + ); + + TwoWaySearcher { + crit_pos, + crit_pos_back, + period, + byteset: Self::byteset_create(&needle[..period]), + + position: 0, + end, + memory: 0, + memory_back: needle.len(), + } + } else { + // long period case -- we have an approximation to the actual period, + // and don't use memorization. + // + // Approximate the period by lower bound max(|u|, |v|) + 1. + // The critical factorization is efficient to use for both forward and + // reverse search. + + TwoWaySearcher { + crit_pos, + crit_pos_back: crit_pos, + period: cmp::max(crit_pos, needle.len() - crit_pos) + 1, + byteset: Self::byteset_create(needle), + + position: 0, + end, + memory: usize::MAX, // Dummy value to signify that the period is long + memory_back: usize::MAX, + } + } + } + + #[inline] + fn byteset_create(bytes: &[u8]) -> u64 { + bytes.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a) + } + + #[inline] + fn byteset_contains(&self, byte: u8) -> bool { + (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0 + } + + // One of the main ideas of Two-Way is that we factorize the needle into + // two halves, (u, v), and begin trying to find v in the haystack by scanning + // left to right. If v matches, we try to match u by scanning right to left. + // How far we can jump when we encounter a mismatch is all based on the fact + // that (u, v) is a critical factorization for the needle. + #[inline] + fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> S::Output + where + S: TwoWayStrategy, + { + // `next()` uses `self.position` as its cursor + let old_pos = self.position; + let needle_last = needle.len() - 1; + 'search: loop { + // Check that we have room to search in + // position + needle_last can not overflow if we assume slices + // are bounded by isize's range. + let tail_byte = match haystack.get(self.position + needle_last) { + Some(&b) => b, + None => { + self.position = haystack.len(); + return S::rejecting(old_pos, self.position); + } + }; + + if S::use_early_reject() && old_pos != self.position { + return S::rejecting(old_pos, self.position); + } + + // Quickly skip by large portions unrelated to our substring + if !self.byteset_contains(tail_byte) { + self.position += needle.len(); + if !long_period { + self.memory = 0; + } + continue 'search; + } + + // See if the right part of the needle matches + let start = + if long_period { self.crit_pos } else { cmp::max(self.crit_pos, self.memory) }; + for i in start..needle.len() { + if needle[i] != haystack[self.position + i] { + self.position += i - self.crit_pos + 1; + if !long_period { + self.memory = 0; + } + continue 'search; + } + } + + // See if the left part of the needle matches + let start = if long_period { 0 } else { self.memory }; + for i in (start..self.crit_pos).rev() { + if needle[i] != haystack[self.position + i] { + self.position += self.period; + if !long_period { + self.memory = needle.len() - self.period; + } + continue 'search; + } + } + + // We have found a match! + let match_pos = self.position; + + // Note: add self.period instead of needle.len() to have overlapping matches + self.position += needle.len(); + if !long_period { + self.memory = 0; // set to needle.len() - self.period for overlapping matches + } + + return S::matching(match_pos, match_pos + needle.len()); + } + } + + // Follows the ideas in `next()`. + // + // The definitions are symmetrical, with period(x) = period(reverse(x)) + // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v) + // is a critical factorization, so is (reverse(v), reverse(u)). + // + // For the reverse case we have computed a critical factorization x = u' v' + // (field `crit_pos_back`). We need |u| < period(x) for the forward case and + // thus |v'| < period(x) for the reverse. + // + // To search in reverse through the haystack, we search forward through + // a reversed haystack with a reversed needle, matching first u' and then v'. + #[inline] + fn next_back(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> S::Output + where + S: TwoWayStrategy, + { + // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()` + // are independent. + let old_end = self.end; + 'search: loop { + // Check that we have room to search in + // end - needle.len() will wrap around when there is no more room, + // but due to slice length limits it can never wrap all the way back + // into the length of haystack. + let front_byte = match haystack.get(self.end.wrapping_sub(needle.len())) { + Some(&b) => b, + None => { + self.end = 0; + return S::rejecting(0, old_end); + } + }; + + if S::use_early_reject() && old_end != self.end { + return S::rejecting(self.end, old_end); + } + + // Quickly skip by large portions unrelated to our substring + if !self.byteset_contains(front_byte) { + self.end -= needle.len(); + if !long_period { + self.memory_back = needle.len(); + } + continue 'search; + } + + // See if the left part of the needle matches + let crit = if long_period { + self.crit_pos_back + } else { + cmp::min(self.crit_pos_back, self.memory_back) + }; + for i in (0..crit).rev() { + if needle[i] != haystack[self.end - needle.len() + i] { + self.end -= self.crit_pos_back - i; + if !long_period { + self.memory_back = needle.len(); + } + continue 'search; + } + } + + // See if the right part of the needle matches + let needle_end = if long_period { needle.len() } else { self.memory_back }; + for i in self.crit_pos_back..needle_end { + if needle[i] != haystack[self.end - needle.len() + i] { + self.end -= self.period; + if !long_period { + self.memory_back = self.period; + } + continue 'search; + } + } + + // We have found a match! + let match_pos = self.end - needle.len(); + // Note: sub self.period instead of needle.len() to have overlapping matches + self.end -= needle.len(); + if !long_period { + self.memory_back = needle.len(); + } + + return S::matching(match_pos, match_pos + needle.len()); + } + } + + // Compute the maximal suffix of `arr`. + // + // The maximal suffix is a possible critical factorization (u, v) of `arr`. + // + // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the + // period of v. + // + // `order_greater` determines if lexical order is `<` or `>`. Both + // orders must be computed -- the ordering with the largest `i` gives + // a critical factorization. + // + // For long period cases, the resulting period is not exact (it is too short). + #[inline] + fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) { + let mut left = 0; // Corresponds to i in the paper + let mut right = 1; // Corresponds to j in the paper + let mut offset = 0; // Corresponds to k in the paper, but starting at 0 + // to match 0-based indexing. + let mut period = 1; // Corresponds to p in the paper + + while let Some(&a) = arr.get(right + offset) { + // `left` will be inbounds when `right` is. + let b = arr[left + offset]; + if (a < b && !order_greater) || (a > b && order_greater) { + // Suffix is smaller, period is entire prefix so far. + right += offset + 1; + offset = 0; + period = right - left; + } else if a == b { + // Advance through repetition of the current period. + if offset + 1 == period { + right += offset + 1; + offset = 0; + } else { + offset += 1; + } + } else { + // Suffix is larger, start over from current location. + left = right; + right += 1; + offset = 0; + period = 1; + } + } + (left, period) + } + + // Compute the maximal suffix of the reverse of `arr`. + // + // The maximal suffix is a possible critical factorization (u', v') of `arr`. + // + // Returns `i` where `i` is the starting index of v', from the back; + // returns immediately when a period of `known_period` is reached. + // + // `order_greater` determines if lexical order is `<` or `>`. Both + // orders must be computed -- the ordering with the largest `i` gives + // a critical factorization. + // + // For long period cases, the resulting period is not exact (it is too short). + fn reverse_maximal_suffix(arr: &[u8], known_period: usize, order_greater: bool) -> usize { + let mut left = 0; // Corresponds to i in the paper + let mut right = 1; // Corresponds to j in the paper + let mut offset = 0; // Corresponds to k in the paper, but starting at 0 + // to match 0-based indexing. + let mut period = 1; // Corresponds to p in the paper + let n = arr.len(); + + while right + offset < n { + let a = arr[n - (1 + right + offset)]; + let b = arr[n - (1 + left + offset)]; + if (a < b && !order_greater) || (a > b && order_greater) { + // Suffix is smaller, period is entire prefix so far. + right += offset + 1; + offset = 0; + period = right - left; + } else if a == b { + // Advance through repetition of the current period. + if offset + 1 == period { + right += offset + 1; + offset = 0; + } else { + offset += 1; + } + } else { + // Suffix is larger, start over from current location. + left = right; + right += 1; + offset = 0; + period = 1; + } + if period == known_period { + break; + } + } + debug_assert!(period <= known_period); + left + } +} + +// TwoWayStrategy allows the algorithm to either skip non-matches as quickly +// as possible, or to work in a mode where it emits Rejects relatively quickly. +trait TwoWayStrategy { + type Output; + fn use_early_reject() -> bool; + fn rejecting(a: usize, b: usize) -> Self::Output; + fn matching(a: usize, b: usize) -> Self::Output; +} + +/// Skip to match intervals as quickly as possible +enum MatchOnly {} + +impl TwoWayStrategy for MatchOnly { + type Output = Option<(usize, usize)>; + + #[inline] + fn use_early_reject() -> bool { + false + } + #[inline] + fn rejecting(_a: usize, _b: usize) -> Self::Output { + None + } + #[inline] + fn matching(a: usize, b: usize) -> Self::Output { + Some((a, b)) + } +} + +/// Emit Rejects regularly +enum RejectAndMatch {} + +impl TwoWayStrategy for RejectAndMatch { + type Output = SearchStep; + + #[inline] + fn use_early_reject() -> bool { + true + } + #[inline] + fn rejecting(a: usize, b: usize) -> Self::Output { + SearchStep::Reject(a, b) + } + #[inline] + fn matching(a: usize, b: usize) -> Self::Output { + SearchStep::Match(a, b) + } +} diff --git a/library/core/src/str/traits.rs b/library/core/src/str/traits.rs new file mode 100644 index 000000000..e9649fc91 --- /dev/null +++ b/library/core/src/str/traits.rs @@ -0,0 +1,604 @@ +//! Trait implementations for `str`. + +use crate::cmp::Ordering; +use crate::ops; +use crate::ptr; +use crate::slice::SliceIndex; + +use super::ParseBoolError; + +/// Implements ordering of strings. +/// +/// Strings are ordered [lexicographically](Ord#lexicographical-comparison) by their byte values. This orders Unicode code +/// points based on their positions in the code charts. This is not necessarily the same as +/// "alphabetical" order, which varies by language and locale. Sorting strings according to +/// culturally-accepted standards requires locale-specific data that is outside the scope of +/// the `str` type. +#[stable(feature = "rust1", since = "1.0.0")] +impl Ord for str { + #[inline] + fn cmp(&self, other: &str) -> Ordering { + self.as_bytes().cmp(other.as_bytes()) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl PartialEq for str { + #[inline] + fn eq(&self, other: &str) -> bool { + self.as_bytes() == other.as_bytes() + } + #[inline] + fn ne(&self, other: &str) -> bool { + !(*self).eq(other) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Eq for str {} + +/// Implements comparison operations on strings. +/// +/// Strings are compared [lexicographically](Ord#lexicographical-comparison) by their byte values. This compares Unicode code +/// points based on their positions in the code charts. This is not necessarily the same as +/// "alphabetical" order, which varies by language and locale. Comparing strings according to +/// culturally-accepted standards requires locale-specific data that is outside the scope of +/// the `str` type. +#[stable(feature = "rust1", since = "1.0.0")] +impl PartialOrd for str { + #[inline] + fn partial_cmp(&self, other: &str) -> Option { + Some(self.cmp(other)) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +#[rustc_const_unstable(feature = "const_slice_index", issue = "none")] +impl const ops::Index for str +where + I: ~const SliceIndex, +{ + type Output = I::Output; + + #[inline] + fn index(&self, index: I) -> &I::Output { + index.index(self) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +#[rustc_const_unstable(feature = "const_slice_index", issue = "none")] +impl const ops::IndexMut for str +where + I: ~const SliceIndex, +{ + #[inline] + fn index_mut(&mut self, index: I) -> &mut I::Output { + index.index_mut(self) + } +} + +#[inline(never)] +#[cold] +#[track_caller] +const fn str_index_overflow_fail() -> ! { + panic!("attempted to index str up to maximum usize"); +} + +/// Implements substring slicing with syntax `&self[..]` or `&mut self[..]`. +/// +/// Returns a slice of the whole string, i.e., returns `&self` or `&mut +/// self`. Equivalent to `&self[0 .. len]` or `&mut self[0 .. len]`. Unlike +/// other indexing operations, this can never panic. +/// +/// This operation is *O*(1). +/// +/// Prior to 1.20.0, these indexing operations were still supported by +/// direct implementation of `Index` and `IndexMut`. +/// +/// Equivalent to `&self[0 .. len]` or `&mut self[0 .. len]`. +#[stable(feature = "str_checked_slicing", since = "1.20.0")] +#[rustc_const_unstable(feature = "const_slice_index", issue = "none")] +unsafe impl const SliceIndex for ops::RangeFull { + type Output = str; + #[inline] + fn get(self, slice: &str) -> Option<&Self::Output> { + Some(slice) + } + #[inline] + fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { + Some(slice) + } + #[inline] + unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { + slice + } + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { + slice + } + #[inline] + fn index(self, slice: &str) -> &Self::Output { + slice + } + #[inline] + fn index_mut(self, slice: &mut str) -> &mut Self::Output { + slice + } +} + +/// Implements substring slicing with syntax `&self[begin .. end]` or `&mut +/// self[begin .. end]`. +/// +/// Returns a slice of the given string from the byte range +/// [`begin`, `end`). +/// +/// This operation is *O*(1). +/// +/// Prior to 1.20.0, these indexing operations were still supported by +/// direct implementation of `Index` and `IndexMut`. +/// +/// # Panics +/// +/// Panics if `begin` or `end` does not point to the starting byte offset of +/// a character (as defined by `is_char_boundary`), if `begin > end`, or if +/// `end > len`. +/// +/// # Examples +/// +/// ``` +/// let s = "Löwe 老虎 Léopard"; +/// assert_eq!(&s[0 .. 1], "L"); +/// +/// assert_eq!(&s[1 .. 9], "öwe 老"); +/// +/// // these will panic: +/// // byte 2 lies within `ö`: +/// // &s[2 ..3]; +/// +/// // byte 8 lies within `老` +/// // &s[1 .. 8]; +/// +/// // byte 100 is outside the string +/// // &s[3 .. 100]; +/// ``` +#[stable(feature = "str_checked_slicing", since = "1.20.0")] +#[rustc_const_unstable(feature = "const_slice_index", issue = "none")] +unsafe impl const SliceIndex for ops::Range { + type Output = str; + #[inline] + fn get(self, slice: &str) -> Option<&Self::Output> { + if self.start <= self.end + && slice.is_char_boundary(self.start) + && slice.is_char_boundary(self.end) + { + // SAFETY: just checked that `start` and `end` are on a char boundary, + // and we are passing in a safe reference, so the return value will also be one. + // We also checked char boundaries, so this is valid UTF-8. + Some(unsafe { &*self.get_unchecked(slice) }) + } else { + None + } + } + #[inline] + fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { + if self.start <= self.end + && slice.is_char_boundary(self.start) + && slice.is_char_boundary(self.end) + { + // SAFETY: just checked that `start` and `end` are on a char boundary. + // We know the pointer is unique because we got it from `slice`. + Some(unsafe { &mut *self.get_unchecked_mut(slice) }) + } else { + None + } + } + #[inline] + unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { + let slice = slice as *const [u8]; + // SAFETY: the caller guarantees that `self` is in bounds of `slice` + // which satisfies all the conditions for `add`. + let ptr = unsafe { slice.as_ptr().add(self.start) }; + let len = self.end - self.start; + ptr::slice_from_raw_parts(ptr, len) as *const str + } + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { + let slice = slice as *mut [u8]; + // SAFETY: see comments for `get_unchecked`. + let ptr = unsafe { slice.as_mut_ptr().add(self.start) }; + let len = self.end - self.start; + ptr::slice_from_raw_parts_mut(ptr, len) as *mut str + } + #[inline] + fn index(self, slice: &str) -> &Self::Output { + let (start, end) = (self.start, self.end); + match self.get(slice) { + Some(s) => s, + None => super::slice_error_fail(slice, start, end), + } + } + #[inline] + fn index_mut(self, slice: &mut str) -> &mut Self::Output { + // is_char_boundary checks that the index is in [0, .len()] + // cannot reuse `get` as above, because of NLL trouble + if self.start <= self.end + && slice.is_char_boundary(self.start) + && slice.is_char_boundary(self.end) + { + // SAFETY: just checked that `start` and `end` are on a char boundary, + // and we are passing in a safe reference, so the return value will also be one. + unsafe { &mut *self.get_unchecked_mut(slice) } + } else { + super::slice_error_fail(slice, self.start, self.end) + } + } +} + +/// Implements substring slicing with syntax `&self[.. end]` or `&mut +/// self[.. end]`. +/// +/// Returns a slice of the given string from the byte range \[0, `end`). +/// Equivalent to `&self[0 .. end]` or `&mut self[0 .. end]`. +/// +/// This operation is *O*(1). +/// +/// Prior to 1.20.0, these indexing operations were still supported by +/// direct implementation of `Index` and `IndexMut`. +/// +/// # Panics +/// +/// Panics if `end` does not point to the starting byte offset of a +/// character (as defined by `is_char_boundary`), or if `end > len`. +#[stable(feature = "str_checked_slicing", since = "1.20.0")] +#[rustc_const_unstable(feature = "const_slice_index", issue = "none")] +unsafe impl const SliceIndex for ops::RangeTo { + type Output = str; + #[inline] + fn get(self, slice: &str) -> Option<&Self::Output> { + if slice.is_char_boundary(self.end) { + // SAFETY: just checked that `end` is on a char boundary, + // and we are passing in a safe reference, so the return value will also be one. + Some(unsafe { &*self.get_unchecked(slice) }) + } else { + None + } + } + #[inline] + fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { + if slice.is_char_boundary(self.end) { + // SAFETY: just checked that `end` is on a char boundary, + // and we are passing in a safe reference, so the return value will also be one. + Some(unsafe { &mut *self.get_unchecked_mut(slice) }) + } else { + None + } + } + #[inline] + unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { + let slice = slice as *const [u8]; + let ptr = slice.as_ptr(); + ptr::slice_from_raw_parts(ptr, self.end) as *const str + } + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { + let slice = slice as *mut [u8]; + let ptr = slice.as_mut_ptr(); + ptr::slice_from_raw_parts_mut(ptr, self.end) as *mut str + } + #[inline] + fn index(self, slice: &str) -> &Self::Output { + let end = self.end; + match self.get(slice) { + Some(s) => s, + None => super::slice_error_fail(slice, 0, end), + } + } + #[inline] + fn index_mut(self, slice: &mut str) -> &mut Self::Output { + if slice.is_char_boundary(self.end) { + // SAFETY: just checked that `end` is on a char boundary, + // and we are passing in a safe reference, so the return value will also be one. + unsafe { &mut *self.get_unchecked_mut(slice) } + } else { + super::slice_error_fail(slice, 0, self.end) + } + } +} + +/// Implements substring slicing with syntax `&self[begin ..]` or `&mut +/// self[begin ..]`. +/// +/// Returns a slice of the given string from the byte range \[`begin`, `len`). +/// Equivalent to `&self[begin .. len]` or `&mut self[begin .. len]`. +/// +/// This operation is *O*(1). +/// +/// Prior to 1.20.0, these indexing operations were still supported by +/// direct implementation of `Index` and `IndexMut`. +/// +/// # Panics +/// +/// Panics if `begin` does not point to the starting byte offset of +/// a character (as defined by `is_char_boundary`), or if `begin > len`. +#[stable(feature = "str_checked_slicing", since = "1.20.0")] +#[rustc_const_unstable(feature = "const_slice_index", issue = "none")] +unsafe impl const SliceIndex for ops::RangeFrom { + type Output = str; + #[inline] + fn get(self, slice: &str) -> Option<&Self::Output> { + if slice.is_char_boundary(self.start) { + // SAFETY: just checked that `start` is on a char boundary, + // and we are passing in a safe reference, so the return value will also be one. + Some(unsafe { &*self.get_unchecked(slice) }) + } else { + None + } + } + #[inline] + fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { + if slice.is_char_boundary(self.start) { + // SAFETY: just checked that `start` is on a char boundary, + // and we are passing in a safe reference, so the return value will also be one. + Some(unsafe { &mut *self.get_unchecked_mut(slice) }) + } else { + None + } + } + #[inline] + unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { + let slice = slice as *const [u8]; + // SAFETY: the caller guarantees that `self` is in bounds of `slice` + // which satisfies all the conditions for `add`. + let ptr = unsafe { slice.as_ptr().add(self.start) }; + let len = slice.len() - self.start; + ptr::slice_from_raw_parts(ptr, len) as *const str + } + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { + let slice = slice as *mut [u8]; + // SAFETY: identical to `get_unchecked`. + let ptr = unsafe { slice.as_mut_ptr().add(self.start) }; + let len = slice.len() - self.start; + ptr::slice_from_raw_parts_mut(ptr, len) as *mut str + } + #[inline] + fn index(self, slice: &str) -> &Self::Output { + let (start, end) = (self.start, slice.len()); + match self.get(slice) { + Some(s) => s, + None => super::slice_error_fail(slice, start, end), + } + } + #[inline] + fn index_mut(self, slice: &mut str) -> &mut Self::Output { + if slice.is_char_boundary(self.start) { + // SAFETY: just checked that `start` is on a char boundary, + // and we are passing in a safe reference, so the return value will also be one. + unsafe { &mut *self.get_unchecked_mut(slice) } + } else { + super::slice_error_fail(slice, self.start, slice.len()) + } + } +} + +/// Implements substring slicing with syntax `&self[begin ..= end]` or `&mut +/// self[begin ..= end]`. +/// +/// Returns a slice of the given string from the byte range +/// [`begin`, `end`]. Equivalent to `&self [begin .. end + 1]` or `&mut +/// self[begin .. end + 1]`, except if `end` has the maximum value for +/// `usize`. +/// +/// This operation is *O*(1). +/// +/// # Panics +/// +/// Panics if `begin` does not point to the starting byte offset of +/// a character (as defined by `is_char_boundary`), if `end` does not point +/// to the ending byte offset of a character (`end + 1` is either a starting +/// byte offset or equal to `len`), if `begin > end`, or if `end >= len`. +#[stable(feature = "inclusive_range", since = "1.26.0")] +#[rustc_const_unstable(feature = "const_slice_index", issue = "none")] +unsafe impl const SliceIndex for ops::RangeInclusive { + type Output = str; + #[inline] + fn get(self, slice: &str) -> Option<&Self::Output> { + if *self.end() == usize::MAX { None } else { self.into_slice_range().get(slice) } + } + #[inline] + fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { + if *self.end() == usize::MAX { None } else { self.into_slice_range().get_mut(slice) } + } + #[inline] + unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { + // SAFETY: the caller must uphold the safety contract for `get_unchecked`. + unsafe { self.into_slice_range().get_unchecked(slice) } + } + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { + // SAFETY: the caller must uphold the safety contract for `get_unchecked_mut`. + unsafe { self.into_slice_range().get_unchecked_mut(slice) } + } + #[inline] + fn index(self, slice: &str) -> &Self::Output { + if *self.end() == usize::MAX { + str_index_overflow_fail(); + } + self.into_slice_range().index(slice) + } + #[inline] + fn index_mut(self, slice: &mut str) -> &mut Self::Output { + if *self.end() == usize::MAX { + str_index_overflow_fail(); + } + self.into_slice_range().index_mut(slice) + } +} + +/// Implements substring slicing with syntax `&self[..= end]` or `&mut +/// self[..= end]`. +/// +/// Returns a slice of the given string from the byte range \[0, `end`\]. +/// Equivalent to `&self [0 .. end + 1]`, except if `end` has the maximum +/// value for `usize`. +/// +/// This operation is *O*(1). +/// +/// # Panics +/// +/// Panics if `end` does not point to the ending byte offset of a character +/// (`end + 1` is either a starting byte offset as defined by +/// `is_char_boundary`, or equal to `len`), or if `end >= len`. +#[stable(feature = "inclusive_range", since = "1.26.0")] +#[rustc_const_unstable(feature = "const_slice_index", issue = "none")] +unsafe impl const SliceIndex for ops::RangeToInclusive { + type Output = str; + #[inline] + fn get(self, slice: &str) -> Option<&Self::Output> { + if self.end == usize::MAX { None } else { (..self.end + 1).get(slice) } + } + #[inline] + fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { + if self.end == usize::MAX { None } else { (..self.end + 1).get_mut(slice) } + } + #[inline] + unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { + // SAFETY: the caller must uphold the safety contract for `get_unchecked`. + unsafe { (..self.end + 1).get_unchecked(slice) } + } + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { + // SAFETY: the caller must uphold the safety contract for `get_unchecked_mut`. + unsafe { (..self.end + 1).get_unchecked_mut(slice) } + } + #[inline] + fn index(self, slice: &str) -> &Self::Output { + if self.end == usize::MAX { + str_index_overflow_fail(); + } + (..self.end + 1).index(slice) + } + #[inline] + fn index_mut(self, slice: &mut str) -> &mut Self::Output { + if self.end == usize::MAX { + str_index_overflow_fail(); + } + (..self.end + 1).index_mut(slice) + } +} + +/// Parse a value from a string +/// +/// `FromStr`'s [`from_str`] method is often used implicitly, through +/// [`str`]'s [`parse`] method. See [`parse`]'s documentation for examples. +/// +/// [`from_str`]: FromStr::from_str +/// [`parse`]: str::parse +/// +/// `FromStr` does not have a lifetime parameter, and so you can only parse types +/// that do not contain a lifetime parameter themselves. In other words, you can +/// parse an `i32` with `FromStr`, but not a `&i32`. You can parse a struct that +/// contains an `i32`, but not one that contains an `&i32`. +/// +/// # Examples +/// +/// Basic implementation of `FromStr` on an example `Point` type: +/// +/// ``` +/// use std::str::FromStr; +/// use std::num::ParseIntError; +/// +/// #[derive(Debug, PartialEq)] +/// struct Point { +/// x: i32, +/// y: i32 +/// } +/// +/// impl FromStr for Point { +/// type Err = ParseIntError; +/// +/// fn from_str(s: &str) -> Result { +/// let (x, y) = s +/// .strip_prefix('(') +/// .and_then(|s| s.strip_suffix(')')) +/// .and_then(|s| s.split_once(',')) +/// .unwrap(); +/// +/// let x_fromstr = x.parse::()?; +/// let y_fromstr = y.parse::()?; +/// +/// Ok(Point { x: x_fromstr, y: y_fromstr }) +/// } +/// } +/// +/// let expected = Ok(Point { x: 1, y: 2 }); +/// // Explicit call +/// assert_eq!(Point::from_str("(1,2)"), expected); +/// // Implicit calls, through parse +/// assert_eq!("(1,2)".parse(), expected); +/// assert_eq!("(1,2)".parse::(), expected); +/// ``` +#[stable(feature = "rust1", since = "1.0.0")] +pub trait FromStr: Sized { + /// The associated error which can be returned from parsing. + #[stable(feature = "rust1", since = "1.0.0")] + type Err; + + /// Parses a string `s` to return a value of this type. + /// + /// If parsing succeeds, return the value inside [`Ok`], otherwise + /// when the string is ill-formatted return an error specific to the + /// inside [`Err`]. The error type is specific to the implementation of the trait. + /// + /// # Examples + /// + /// Basic usage with [`i32`], a type that implements `FromStr`: + /// + /// ``` + /// use std::str::FromStr; + /// + /// let s = "5"; + /// let x = i32::from_str(s).unwrap(); + /// + /// assert_eq!(5, x); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + fn from_str(s: &str) -> Result; +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl FromStr for bool { + type Err = ParseBoolError; + + /// Parse a `bool` from a string. + /// + /// Yields a `Result`, because `s` may or may not + /// actually be parseable. + /// + /// # Examples + /// + /// ``` + /// use std::str::FromStr; + /// + /// assert_eq!(FromStr::from_str("true"), Ok(true)); + /// assert_eq!(FromStr::from_str("false"), Ok(false)); + /// assert!(::from_str("not even a boolean").is_err()); + /// ``` + /// + /// Note, in many cases, the `.parse()` method on `str` is more proper. + /// + /// ``` + /// assert_eq!("true".parse(), Ok(true)); + /// assert_eq!("false".parse(), Ok(false)); + /// assert!("not even a boolean".parse::().is_err()); + /// ``` + #[inline] + fn from_str(s: &str) -> Result { + match s { + "true" => Ok(true), + "false" => Ok(false), + _ => Err(ParseBoolError), + } + } +} diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs new file mode 100644 index 000000000..04bc66523 --- /dev/null +++ b/library/core/src/str/validations.rs @@ -0,0 +1,274 @@ +//! Operations related to UTF-8 validation. + +use crate::mem; + +use super::Utf8Error; + +/// Returns the initial codepoint accumulator for the first byte. +/// The first byte is special, only want bottom 5 bits for width 2, 4 bits +/// for width 3, and 3 bits for width 4. +#[inline] +const fn utf8_first_byte(byte: u8, width: u32) -> u32 { + (byte & (0x7F >> width)) as u32 +} + +/// Returns the value of `ch` updated with continuation byte `byte`. +#[inline] +const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { + (ch << 6) | (byte & CONT_MASK) as u32 +} + +/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the +/// bits `10`). +#[inline] +pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool { + (byte as i8) < -64 +} + +/// Reads the next code point out of a byte iterator (assuming a +/// UTF-8-like encoding). +/// +/// # Safety +/// +/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string +#[unstable(feature = "str_internals", issue = "none")] +#[inline] +pub unsafe fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> Option { + // Decode UTF-8 + let x = *bytes.next()?; + if x < 128 { + return Some(x as u32); + } + + // Multibyte case follows + // Decode from a byte combination out of: [[[x y] z] w] + // NOTE: Performance is sensitive to the exact formulation here + let init = utf8_first_byte(x, 2); + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let y = unsafe { *bytes.next().unwrap_unchecked() }; + let mut ch = utf8_acc_cont_byte(init, y); + if x >= 0xE0 { + // [[x y z] w] case + // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let z = unsafe { *bytes.next().unwrap_unchecked() }; + let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); + ch = init << 12 | y_z; + if x >= 0xF0 { + // [x y z w] case + // use only the lower 3 bits of `init` + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let w = unsafe { *bytes.next().unwrap_unchecked() }; + ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); + } + } + + Some(ch) +} + +/// Reads the last code point out of a byte iterator (assuming a +/// UTF-8-like encoding). +/// +/// # Safety +/// +/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string +#[inline] +pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option +where + I: DoubleEndedIterator, +{ + // Decode UTF-8 + let w = match *bytes.next_back()? { + next_byte if next_byte < 128 => return Some(next_byte as u32), + back_byte => back_byte, + }; + + // Multibyte case follows + // Decode from a byte combination out of: [x [y [z w]]] + let mut ch; + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let z = unsafe { *bytes.next_back().unwrap_unchecked() }; + ch = utf8_first_byte(z, 2); + if utf8_is_cont_byte(z) { + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let y = unsafe { *bytes.next_back().unwrap_unchecked() }; + ch = utf8_first_byte(y, 3); + if utf8_is_cont_byte(y) { + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let x = unsafe { *bytes.next_back().unwrap_unchecked() }; + ch = utf8_first_byte(x, 4); + ch = utf8_acc_cont_byte(ch, y); + } + ch = utf8_acc_cont_byte(ch, z); + } + ch = utf8_acc_cont_byte(ch, w); + + Some(ch) +} + +const NONASCII_MASK: usize = usize::repeat_u8(0x80); + +/// Returns `true` if any byte in the word `x` is nonascii (>= 128). +#[inline] +const fn contains_nonascii(x: usize) -> bool { + (x & NONASCII_MASK) != 0 +} + +/// Walks through `v` checking that it's a valid UTF-8 sequence, +/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`. +#[inline(always)] +#[rustc_const_unstable(feature = "str_internals", issue = "none")] +pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { + let mut index = 0; + let len = v.len(); + + let usize_bytes = mem::size_of::(); + let ascii_block_size = 2 * usize_bytes; + let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 }; + let align = v.as_ptr().align_offset(usize_bytes); + + while index < len { + let old_offset = index; + macro_rules! err { + ($error_len: expr) => { + return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len }) + }; + } + + macro_rules! next { + () => {{ + index += 1; + // we needed data, but there was none: error! + if index >= len { + err!(None) + } + v[index] + }}; + } + + let first = v[index]; + if first >= 128 { + let w = utf8_char_width(first); + // 2-byte encoding is for codepoints \u{0080} to \u{07ff} + // first C2 80 last DF BF + // 3-byte encoding is for codepoints \u{0800} to \u{ffff} + // first E0 A0 80 last EF BF BF + // excluding surrogates codepoints \u{d800} to \u{dfff} + // ED A0 80 to ED BF BF + // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff + // first F0 90 80 80 last F4 8F BF BF + // + // Use the UTF-8 syntax from the RFC + // + // https://tools.ietf.org/html/rfc3629 + // UTF8-1 = %x00-7F + // UTF8-2 = %xC2-DF UTF8-tail + // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / + // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) + // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / + // %xF4 %x80-8F 2( UTF8-tail ) + match w { + 2 => { + if next!() as i8 >= -64 { + err!(Some(1)) + } + } + 3 => { + match (first, next!()) { + (0xE0, 0xA0..=0xBF) + | (0xE1..=0xEC, 0x80..=0xBF) + | (0xED, 0x80..=0x9F) + | (0xEE..=0xEF, 0x80..=0xBF) => {} + _ => err!(Some(1)), + } + if next!() as i8 >= -64 { + err!(Some(2)) + } + } + 4 => { + match (first, next!()) { + (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {} + _ => err!(Some(1)), + } + if next!() as i8 >= -64 { + err!(Some(2)) + } + if next!() as i8 >= -64 { + err!(Some(3)) + } + } + _ => err!(Some(1)), + } + index += 1; + } else { + // Ascii case, try to skip forward quickly. + // When the pointer is aligned, read 2 words of data per iteration + // until we find a word containing a non-ascii byte. + if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 { + let ptr = v.as_ptr(); + while index < blocks_end { + // SAFETY: since `align - index` and `ascii_block_size` are + // multiples of `usize_bytes`, `block = ptr.add(index)` is + // always aligned with a `usize` so it's safe to dereference + // both `block` and `block.offset(1)`. + unsafe { + let block = ptr.add(index) as *const usize; + // break if there is a nonascii byte + let zu = contains_nonascii(*block); + let zv = contains_nonascii(*block.offset(1)); + if zu || zv { + break; + } + } + index += ascii_block_size; + } + // step from the point where the wordwise loop stopped + while index < len && v[index] < 128 { + index += 1; + } + } else { + index += 1; + } + } + } + + Ok(()) +} + +// https://tools.ietf.org/html/rfc3629 +const UTF8_CHAR_WIDTH: &[u8; 256] = &[ + // 1 2 3 4 5 6 7 8 9 A B C D E F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B + 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F +]; + +/// Given a first byte, determines how many bytes are in this UTF-8 character. +#[unstable(feature = "str_internals", issue = "none")] +#[must_use] +#[inline] +pub const fn utf8_char_width(b: u8) -> usize { + UTF8_CHAR_WIDTH[b as usize] as usize +} + +/// Mask of the value bits of a continuation byte. +const CONT_MASK: u8 = 0b0011_1111; -- cgit v1.2.3