diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch) | |
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/rust/os_str_bytes/src/windows | |
parent | Initial commit. (diff) | |
download | firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip |
Adding upstream version 115.7.0esr.upstream/115.7.0esr
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/os_str_bytes/src/windows')
6 files changed, 554 insertions, 0 deletions
diff --git a/third_party/rust/os_str_bytes/src/windows/mod.rs b/third_party/rust/os_str_bytes/src/windows/mod.rs new file mode 100644 index 0000000000..ed9e60b050 --- /dev/null +++ b/third_party/rust/os_str_bytes/src/windows/mod.rs @@ -0,0 +1,113 @@ +// These functions are necessarily inefficient, because they must revert +// encoding conversions performed by the standard library. However, there is +// currently no better alternative. + +use std::borrow::Cow; +use std::error::Error; +use std::ffi::OsStr; +use std::ffi::OsString; +use std::fmt; +use std::fmt::Display; +use std::fmt::Formatter; +use std::ops::Not; +use std::os::windows::ffi::OsStrExt; +use std::os::windows::ffi::OsStringExt; +use std::result; +use std::str; + +if_raw_str! { + pub(super) mod raw; +} + +mod wtf8; +use wtf8::DecodeWide; + +#[cfg(test)] +mod tests; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(super) enum EncodingError { + Byte(u8), + CodePoint(u32), + End(), +} + +impl EncodingError { + fn position(&self) -> Cow<'_, str> { + match self { + Self::Byte(byte) => Cow::Owned(format!("byte b'\\x{:02X}'", byte)), + Self::CodePoint(code_point) => { + Cow::Owned(format!("code point U+{:04X}", code_point)) + } + Self::End() => Cow::Borrowed("end of string"), + } + } +} + +impl Display for EncodingError { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!( + f, + "byte sequence is not representable in the platform encoding; \ + error at {}", + self.position(), + ) + } +} + +impl Error for EncodingError {} + +type Result<T> = result::Result<T, EncodingError>; + +fn from_bytes(string: &[u8]) -> Result<Option<OsString>> { + let mut encoder = wtf8::encode_wide(string); + + // Collecting an iterator into a result ignores the size hint: + // https://github.com/rust-lang/rust/issues/48994 + let mut encoded_string = Vec::with_capacity(encoder.size_hint().0); + for wchar in &mut encoder { + encoded_string.push(wchar?); + } + + debug_assert_eq!(str::from_utf8(string).is_ok(), encoder.is_still_utf8()); + Ok(encoder + .is_still_utf8() + .not() + .then(|| OsStringExt::from_wide(&encoded_string))) +} + +fn to_bytes(os_string: &OsStr) -> Vec<u8> { + let encoder = OsStrExt::encode_wide(os_string); + + let mut string = Vec::with_capacity(encoder.size_hint().0); + string.extend(DecodeWide::new(encoder)); + string +} + +pub(super) fn os_str_from_bytes(string: &[u8]) -> Result<Cow<'_, OsStr>> { + from_bytes(string).map(|os_string| { + os_string.map(Cow::Owned).unwrap_or_else(|| { + // SAFETY: This slice was validated to be UTF-8. + Cow::Borrowed(OsStr::new(unsafe { + str::from_utf8_unchecked(string) + })) + }) + }) +} + +pub(super) fn os_str_to_bytes(os_string: &OsStr) -> Cow<'_, [u8]> { + Cow::Owned(to_bytes(os_string)) +} + +pub(super) fn os_string_from_vec(string: Vec<u8>) -> Result<OsString> { + from_bytes(&string).map(|os_string| { + os_string.unwrap_or_else(|| { + // SAFETY: This slice was validated to be UTF-8. + unsafe { String::from_utf8_unchecked(string) }.into() + }) + }) +} + +pub(super) fn os_string_into_vec(os_string: OsString) -> Vec<u8> { + to_bytes(&os_string) +} diff --git a/third_party/rust/os_str_bytes/src/windows/raw.rs b/third_party/rust/os_str_bytes/src/windows/raw.rs new file mode 100644 index 0000000000..80953dea79 --- /dev/null +++ b/third_party/rust/os_str_bytes/src/windows/raw.rs @@ -0,0 +1,46 @@ +use std::fmt; +use std::fmt::Formatter; + +pub(crate) use crate::util::is_continuation; + +use super::wtf8; +pub(crate) use super::wtf8::ends_with; +pub(crate) use super::wtf8::starts_with; +use super::wtf8::CodePoints; +use super::Result; + +pub(crate) fn validate_bytes(string: &[u8]) -> Result<()> { + wtf8::encode_wide(string).try_for_each(|x| x.map(drop)) +} + +pub(crate) fn encode_wide_unchecked( + string: &[u8], +) -> impl '_ + Iterator<Item = u16> { + wtf8::encode_wide(string).map(|x| expect_encoded!(x)) +} + +pub(crate) fn decode_code_point(string: &[u8]) -> u32 { + let mut code_points = CodePoints::new(string.iter().copied()); + let code_point = expect_encoded!(code_points + .next() + .expect("cannot parse code point from empty string")); + assert_eq!(None, code_points.next(), "multiple code points found"); + code_point +} + +pub(crate) fn debug(string: &[u8], f: &mut Formatter<'_>) -> fmt::Result { + for wchar in encode_wide_unchecked(string) { + write!(f, "\\u{{{:X}}}", wchar)?; + } + Ok(()) +} + +#[cfg(feature = "uniquote")] +pub(crate) mod uniquote { + use uniquote::Formatter; + use uniquote::Result; + + pub(crate) fn escape(string: &[u8], f: &mut Formatter<'_>) -> Result { + f.escape_utf16(super::encode_wide_unchecked(string)) + } +} diff --git a/third_party/rust/os_str_bytes/src/windows/wtf8/code_points.rs b/third_party/rust/os_str_bytes/src/windows/wtf8/code_points.rs new file mode 100644 index 0000000000..9800d781fc --- /dev/null +++ b/third_party/rust/os_str_bytes/src/windows/wtf8/code_points.rs @@ -0,0 +1,129 @@ +use std::iter::FusedIterator; +use std::iter::Peekable; +use std::mem; + +use crate::util::is_continuation; +use crate::util::BYTE_SHIFT; +use crate::util::CONT_MASK; + +use super::EncodingError; +use super::Result; + +pub(in super::super) struct CodePoints<I> +where + I: Iterator<Item = u8>, +{ + iter: Peekable<I>, + surrogate: bool, + still_utf8: bool, +} + +impl<I> CodePoints<I> +where + I: Iterator<Item = u8>, +{ + pub(in super::super) fn new<S>(string: S) -> Self + where + S: IntoIterator<IntoIter = I>, + { + Self { + iter: string.into_iter().peekable(), + surrogate: false, + still_utf8: true, + } + } + + pub(super) fn is_still_utf8(&self) -> bool { + self.still_utf8 + } + + fn consume_next(&mut self, code_point: &mut u32) -> Result<()> { + let &byte = self.iter.peek().ok_or(EncodingError::End())?; + + if !is_continuation(byte) { + self.surrogate = false; + // Not consuming this byte will be useful if this crate ever offers + // a way to encode lossily. + return Err(EncodingError::Byte(byte)); + } + *code_point = + (*code_point << BYTE_SHIFT) | u32::from(byte & CONT_MASK); + + let removed = self.iter.next(); + debug_assert_eq!(Some(byte), removed); + + Ok(()) + } + + pub(super) fn inner_size_hint(&self) -> (usize, Option<usize>) { + self.iter.size_hint() + } +} + +impl<I> FusedIterator for CodePoints<I> where + I: FusedIterator + Iterator<Item = u8> +{ +} + +impl<I> Iterator for CodePoints<I> +where + I: Iterator<Item = u8>, +{ + type Item = Result<u32>; + + fn next(&mut self) -> Option<Self::Item> { + let byte = self.iter.next()?; + let mut code_point: u32 = byte.into(); + + macro_rules! consume_next { + () => {{ + if let Err(error) = self.consume_next(&mut code_point) { + return Some(Err(error)); + } + }}; + } + + let prev_surrogate = mem::replace(&mut self.surrogate, false); + + let mut invalid = false; + if !byte.is_ascii() { + if byte < 0xC2 { + return Some(Err(EncodingError::Byte(byte))); + } + + if byte < 0xE0 { + code_point &= 0x1F; + } else { + code_point &= 0x0F; + consume_next!(); + + if byte >= 0xF0 { + if code_point.wrapping_sub(0x10) >= 0x100 { + invalid = true; + } + consume_next!(); + + // This condition is optimized to detect surrogate code points. + } else if code_point & 0xFE0 == 0x360 { + self.still_utf8 = false; + if code_point & 0x10 == 0 { + self.surrogate = true; + } else if prev_surrogate { + // Decoding a broken surrogate pair would be lossy. + invalid = true; + } + } + + if code_point < 0x20 { + invalid = true; + } + } + consume_next!(); + } + if invalid { + return Some(Err(EncodingError::CodePoint(code_point))); + } + + Some(Ok(code_point)) + } +} diff --git a/third_party/rust/os_str_bytes/src/windows/wtf8/convert.rs b/third_party/rust/os_str_bytes/src/windows/wtf8/convert.rs new file mode 100644 index 0000000000..70a8a9f58c --- /dev/null +++ b/third_party/rust/os_str_bytes/src/windows/wtf8/convert.rs @@ -0,0 +1,181 @@ +use std::char; +use std::char::DecodeUtf16; +use std::iter::FusedIterator; +use std::num::NonZeroU16; + +use crate::util::BYTE_SHIFT; +use crate::util::CONT_MASK; +use crate::util::CONT_TAG; + +use super::CodePoints; +use super::Result; + +const MIN_HIGH_SURROGATE: u16 = 0xD800; + +const MIN_LOW_SURROGATE: u16 = 0xDC00; + +const MIN_SURROGATE_CODE: u32 = (u16::MAX as u32) + 1; + +macro_rules! static_assert { + ( $condition:expr ) => { + const _: () = assert!($condition, "static assertion failed"); + }; +} + +pub(in super::super) struct DecodeWide<I> +where + I: Iterator<Item = u16>, +{ + iter: DecodeUtf16<I>, + code_point: u32, + shifts: u8, +} + +impl<I> DecodeWide<I> +where + I: Iterator<Item = u16>, +{ + pub(in super::super) fn new<S>(string: S) -> Self + where + S: IntoIterator<IntoIter = I, Item = I::Item>, + { + Self { + iter: char::decode_utf16(string), + code_point: 0, + shifts: 0, + } + } + + #[inline(always)] + fn get_raw_byte(&self) -> u8 { + (self.code_point >> (self.shifts * BYTE_SHIFT)) as u8 + } +} + +impl<I> Iterator for DecodeWide<I> +where + I: Iterator<Item = u16>, +{ + type Item = u8; + + fn next(&mut self) -> Option<Self::Item> { + if let Some(shifts) = self.shifts.checked_sub(1) { + self.shifts = shifts; + return Some((self.get_raw_byte() & CONT_MASK) | CONT_TAG); + } + + self.code_point = self + .iter + .next()? + .map(Into::into) + .unwrap_or_else(|x| x.unpaired_surrogate().into()); + + macro_rules! decode { + ( $tag:expr ) => { + Some(self.get_raw_byte() | $tag) + }; + } + macro_rules! try_decode { + ( $tag:expr , $upper_bound:expr ) => { + if self.code_point < $upper_bound { + return decode!($tag); + } + self.shifts += 1; + }; + } + try_decode!(0, 0x80); + try_decode!(0xC0, 0x800); + try_decode!(0xE0, MIN_SURROGATE_CODE); + decode!(0xF0) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + let (low, high) = self.iter.size_hint(); + let shifts = self.shifts.into(); + ( + low.saturating_add(shifts), + high.and_then(|x| x.checked_mul(4)) + .and_then(|x| x.checked_add(shifts)), + ) + } +} + +pub(in super::super) struct EncodeWide<I> +where + I: Iterator<Item = u8>, +{ + iter: CodePoints<I>, + surrogate: Option<NonZeroU16>, +} + +impl<I> EncodeWide<I> +where + I: Iterator<Item = u8>, +{ + fn new<S>(string: S) -> Self + where + S: IntoIterator<IntoIter = I>, + { + Self { + iter: CodePoints::new(string), + surrogate: None, + } + } + + pub(in super::super) fn is_still_utf8(&self) -> bool { + self.iter.is_still_utf8() + } +} + +impl<I> FusedIterator for EncodeWide<I> where + I: FusedIterator + Iterator<Item = u8> +{ +} + +impl<I> Iterator for EncodeWide<I> +where + I: Iterator<Item = u8>, +{ + type Item = Result<u16>; + + fn next(&mut self) -> Option<Self::Item> { + if let Some(surrogate) = self.surrogate.take() { + return Some(Ok(surrogate.get())); + } + + self.iter.next().map(|code_point| { + code_point.map(|code_point| { + code_point + .checked_sub(MIN_SURROGATE_CODE) + .map(|offset| { + static_assert!(MIN_LOW_SURROGATE != 0); + + // SAFETY: The above static assertion guarantees that + // this value will not be zero. + self.surrogate = Some(unsafe { + NonZeroU16::new_unchecked( + (offset & 0x3FF) as u16 | MIN_LOW_SURROGATE, + ) + }); + (offset >> 10) as u16 | MIN_HIGH_SURROGATE + }) + .unwrap_or(code_point as u16) + }) + }) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + let (low, high) = self.iter.inner_size_hint(); + let additional = self.surrogate.is_some().into(); + ( + (low.saturating_add(2) / 3).saturating_add(additional), + high.and_then(|x| x.checked_add(additional)), + ) + } +} + +pub(in super::super) fn encode_wide( + string: &[u8], +) -> EncodeWide<impl '_ + Iterator<Item = u8>> { + EncodeWide::new(string.iter().copied()) +} diff --git a/third_party/rust/os_str_bytes/src/windows/wtf8/mod.rs b/third_party/rust/os_str_bytes/src/windows/wtf8/mod.rs new file mode 100644 index 0000000000..d8b0dc4a7f --- /dev/null +++ b/third_party/rust/os_str_bytes/src/windows/wtf8/mod.rs @@ -0,0 +1,18 @@ +// This module implements the WTF-8 encoding specification: +// https://simonsapin.github.io/wtf-8/ + +use super::EncodingError; +use super::Result; + +mod code_points; +pub(super) use code_points::CodePoints; + +mod convert; +pub(super) use convert::encode_wide; +pub(super) use convert::DecodeWide; + +if_raw_str! { + mod string; + pub(crate) use string::ends_with; + pub(crate) use string::starts_with; +} diff --git a/third_party/rust/os_str_bytes/src/windows/wtf8/string.rs b/third_party/rust/os_str_bytes/src/windows/wtf8/string.rs new file mode 100644 index 0000000000..b3523a2eff --- /dev/null +++ b/third_party/rust/os_str_bytes/src/windows/wtf8/string.rs @@ -0,0 +1,67 @@ +use crate::util; + +const SURROGATE_LENGTH: usize = 3; + +pub(crate) fn ends_with(string: &[u8], mut suffix: &[u8]) -> bool { + let index = if let Some(index) = string.len().checked_sub(suffix.len()) { + index + } else { + return false; + }; + if let Some(&byte) = string.get(index) { + if util::is_continuation(byte) { + let index = expect_encoded!(index.checked_sub(1)); + let mut wide_surrogate = + if let Some(surrogate) = suffix.get(..SURROGATE_LENGTH) { + super::encode_wide(surrogate) + } else { + return false; + }; + let surrogate_wchar = wide_surrogate + .next() + .expect("failed decoding non-empty suffix"); + + if wide_surrogate.next().is_some() + || super::encode_wide(&string[index..]) + .take_while(Result::is_ok) + .nth(1) + != Some(surrogate_wchar) + { + return false; + } + suffix = &suffix[SURROGATE_LENGTH..]; + } + } + string.ends_with(suffix) +} + +pub(crate) fn starts_with(string: &[u8], mut prefix: &[u8]) -> bool { + if let Some(&byte) = string.get(prefix.len()) { + if util::is_continuation(byte) { + let index = if let Some(index) = + prefix.len().checked_sub(SURROGATE_LENGTH) + { + index + } else { + return false; + }; + let (substring, surrogate) = prefix.split_at(index); + let mut wide_surrogate = super::encode_wide(surrogate); + let surrogate_wchar = wide_surrogate + .next() + .expect("failed decoding non-empty prefix"); + + if surrogate_wchar.is_err() + || wide_surrogate.next().is_some() + || super::encode_wide(&string[index..]) + .next() + .expect("failed decoding non-empty substring") + != surrogate_wchar + { + return false; + } + prefix = substring; + } + } + string.starts_with(prefix) +} |