summaryrefslogtreecommitdiffstats
path: root/library/std/src/sys_common/wtf8.rs
diff options
context:
space:
mode:
Diffstat (limited to 'library/std/src/sys_common/wtf8.rs')
-rw-r--r--library/std/src/sys_common/wtf8.rs95
1 files changed, 77 insertions, 18 deletions
diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs
index 57fa49893..dd53767d4 100644
--- a/library/std/src/sys_common/wtf8.rs
+++ b/library/std/src/sys_common/wtf8.rs
@@ -89,6 +89,24 @@ impl CodePoint {
self.value
}
+ /// Returns the numeric value of the code point if it is a leading surrogate.
+ #[inline]
+ pub fn to_lead_surrogate(&self) -> Option<u16> {
+ match self.value {
+ lead @ 0xD800..=0xDBFF => Some(lead as u16),
+ _ => None,
+ }
+ }
+
+ /// Returns the numeric value of the code point if it is a trailing surrogate.
+ #[inline]
+ pub fn to_trail_surrogate(&self) -> Option<u16> {
+ match self.value {
+ trail @ 0xDC00..=0xDFFF => Some(trail as u16),
+ _ => None,
+ }
+ }
+
/// Optionally returns a Unicode scalar value for the code point.
///
/// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
@@ -117,6 +135,14 @@ impl CodePoint {
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone)]
pub struct Wtf8Buf {
bytes: Vec<u8>,
+
+ /// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily
+ /// know this if we're constructed from a `String` or `&str`.
+ ///
+ /// It is possible for `bytes` to have valid UTF-8 without this being
+ /// set, such as when we're concatenating `&Wtf8`'s and surrogates become
+ /// paired, as we don't bother to rescan the entire string.
+ is_known_utf8: bool,
}
impl ops::Deref for Wtf8Buf {
@@ -147,13 +173,13 @@ impl Wtf8Buf {
/// Creates a new, empty WTF-8 string.
#[inline]
pub fn new() -> Wtf8Buf {
- Wtf8Buf { bytes: Vec::new() }
+ Wtf8Buf { bytes: Vec::new(), is_known_utf8: true }
}
/// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes.
#[inline]
pub fn with_capacity(capacity: usize) -> Wtf8Buf {
- Wtf8Buf { bytes: Vec::with_capacity(capacity) }
+ Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: true }
}
/// Creates a WTF-8 string from a UTF-8 `String`.
@@ -163,7 +189,7 @@ impl Wtf8Buf {
/// Since WTF-8 is a superset of UTF-8, this always succeeds.
#[inline]
pub fn from_string(string: String) -> Wtf8Buf {
- Wtf8Buf { bytes: string.into_bytes() }
+ Wtf8Buf { bytes: string.into_bytes(), is_known_utf8: true }
}
/// Creates a WTF-8 string from a UTF-8 `&str` slice.
@@ -173,11 +199,12 @@ impl Wtf8Buf {
/// Since WTF-8 is a superset of UTF-8, this always succeeds.
#[inline]
pub fn from_str(str: &str) -> Wtf8Buf {
- Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()) }
+ Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()), is_known_utf8: true }
}
pub fn clear(&mut self) {
- self.bytes.clear()
+ self.bytes.clear();
+ self.is_known_utf8 = true;
}
/// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
@@ -193,9 +220,11 @@ impl Wtf8Buf {
let surrogate = surrogate.unpaired_surrogate();
// Surrogates are known to be in the code point range.
let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
+ // The string will now contain an unpaired surrogate.
+ string.is_known_utf8 = false;
// Skip the WTF-8 concatenation check,
// surrogate pairs are already decoded by decode_utf16
- string.push_code_point_unchecked(code_point)
+ string.push_code_point_unchecked(code_point);
}
}
}
@@ -203,7 +232,7 @@ impl Wtf8Buf {
}
/// Copied from String::push
- /// This does **not** include the WTF-8 concatenation check.
+ /// This does **not** include the WTF-8 concatenation check or `is_known_utf8` check.
fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
let mut bytes = [0; 4];
let bytes = char::encode_utf8_raw(code_point.value, &mut bytes);
@@ -217,6 +246,9 @@ impl Wtf8Buf {
#[inline]
pub fn as_mut_slice(&mut self) -> &mut Wtf8 {
+ // Safety: `Wtf8` doesn't expose any way to mutate the bytes that would
+ // cause them to change from well-formed UTF-8 to ill-formed UTF-8,
+ // which would break the assumptions of the `is_known_utf8` field.
unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) }
}
@@ -236,7 +268,8 @@ impl Wtf8Buf {
/// in the given `Wtf8Buf`. The `Wtf8Buf` may reserve more space to avoid
/// frequent reallocations. After calling `try_reserve`, capacity will be
/// greater than or equal to `self.len() + additional`. Does nothing if
- /// capacity is already sufficient.
+ /// capacity is already sufficient. This method preserves the contents even
+ /// if an error occurs.
///
/// # Errors
///
@@ -313,7 +346,15 @@ impl Wtf8Buf {
self.push_char(decode_surrogate_pair(lead, trail));
self.bytes.extend_from_slice(other_without_trail_surrogate);
}
- _ => self.bytes.extend_from_slice(&other.bytes),
+ _ => {
+ // If we'll be pushing a string containing a surrogate, we may
+ // no longer have UTF-8.
+ if other.next_surrogate(0).is_some() {
+ self.is_known_utf8 = false;
+ }
+
+ self.bytes.extend_from_slice(&other.bytes);
+ }
}
}
@@ -330,13 +371,19 @@ impl Wtf8Buf {
/// like concatenating ill-formed UTF-16 strings effectively would.
#[inline]
pub fn push(&mut self, code_point: CodePoint) {
- if let trail @ 0xDC00..=0xDFFF = code_point.to_u32() {
+ if let Some(trail) = code_point.to_trail_surrogate() {
if let Some(lead) = (&*self).final_lead_surrogate() {
let len_without_lead_surrogate = self.len() - 3;
self.bytes.truncate(len_without_lead_surrogate);
- self.push_char(decode_surrogate_pair(lead, trail as u16));
+ self.push_char(decode_surrogate_pair(lead, trail));
return;
}
+
+ // We're pushing a trailing surrogate.
+ self.is_known_utf8 = false;
+ } else if code_point.to_lead_surrogate().is_some() {
+ // We're pushing a leading surrogate.
+ self.is_known_utf8 = false;
}
// No newly paired surrogates at the boundary.
@@ -363,9 +410,10 @@ impl Wtf8Buf {
/// (that is, if the string contains surrogates),
/// the original WTF-8 string is returned instead.
pub fn into_string(self) -> Result<String, Wtf8Buf> {
- match self.next_surrogate(0) {
- None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }),
- Some(_) => Err(self),
+ if self.is_known_utf8 || self.next_surrogate(0).is_none() {
+ Ok(unsafe { String::from_utf8_unchecked(self.bytes) })
+ } else {
+ Err(self)
}
}
@@ -375,6 +423,11 @@ impl Wtf8Buf {
///
/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
pub fn into_string_lossy(mut self) -> String {
+ // Fast path: If we already have UTF-8, we can return it immediately.
+ if self.is_known_utf8 {
+ return unsafe { String::from_utf8_unchecked(self.bytes) };
+ }
+
let mut pos = 0;
loop {
match self.next_surrogate(pos) {
@@ -397,7 +450,7 @@ impl Wtf8Buf {
/// Converts a `Box<Wtf8>` into a `Wtf8Buf`.
pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf {
let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) };
- Wtf8Buf { bytes: bytes.into_vec() }
+ Wtf8Buf { bytes: bytes.into_vec(), is_known_utf8: false }
}
}
@@ -575,6 +628,11 @@ impl Wtf8 {
}
}
+ /// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`.
+ pub fn to_owned(&self) -> Wtf8Buf {
+ Wtf8Buf { bytes: self.bytes.to_vec(), is_known_utf8: false }
+ }
+
/// Lossily converts the string to UTF-8.
/// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
///
@@ -664,7 +722,8 @@ impl Wtf8 {
}
pub fn clone_into(&self, buf: &mut Wtf8Buf) {
- self.bytes.clone_into(&mut buf.bytes)
+ buf.is_known_utf8 = false;
+ self.bytes.clone_into(&mut buf.bytes);
}
/// Boxes this `Wtf8`.
@@ -704,12 +763,12 @@ impl Wtf8 {
#[inline]
pub fn to_ascii_lowercase(&self) -> Wtf8Buf {
- Wtf8Buf { bytes: self.bytes.to_ascii_lowercase() }
+ Wtf8Buf { bytes: self.bytes.to_ascii_lowercase(), is_known_utf8: false }
}
#[inline]
pub fn to_ascii_uppercase(&self) -> Wtf8Buf {
- Wtf8Buf { bytes: self.bytes.to_ascii_uppercase() }
+ Wtf8Buf { bytes: self.bytes.to_ascii_uppercase(), is_known_utf8: false }
}
#[inline]