summary refs log tree commit diff stats
path: root/third_party/rust/os_str_bytes/src/windows
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 19:33:14 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 19:33:14 +0000
commit 36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree 105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/rust/os_str_bytes/src/windows
parent Initial commit. (diff)
download firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz
firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip
Adding upstream version 115.7.0esr. upstream/115.7.0esr
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/os_str_bytes/src/windows')
-rw-r--r-- third_party/rust/os_str_bytes/src/windows/mod.rs 113
-rw-r--r-- third_party/rust/os_str_bytes/src/windows/raw.rs 46
-rw-r--r-- third_party/rust/os_str_bytes/src/windows/wtf8/code_points.rs 129
-rw-r--r-- third_party/rust/os_str_bytes/src/windows/wtf8/convert.rs 181
-rw-r--r-- third_party/rust/os_str_bytes/src/windows/wtf8/mod.rs 18
-rw-r--r-- third_party/rust/os_str_bytes/src/windows/wtf8/string.rs 67
6 files changed, 554 insertions, 0 deletions
diff --git a/third_party/rust/os_str_bytes/src/windows/mod.rs b/third_party/rust/os_str_bytes/src/windows/mod.rs
new file mode 100644
index 0000000000..ed9e60b050
--- /dev/null
+++ b/third_party/rust/os_str_bytes/src/windows/mod.rs
@@ -0,0 +1,113 @@
+// These functions are necessarily inefficient, because they must revert
+// encoding conversions performed by the standard library. However, there is
+// currently no better alternative.
+
+use std::borrow::Cow;
+use std::error::Error;
+use std::ffi::OsStr;
+use std::ffi::OsString;
+use std::fmt;
+use std::fmt::Display;
+use std::fmt::Formatter;
+use std::ops::Not;
+use std::os::windows::ffi::OsStrExt;
+use std::os::windows::ffi::OsStringExt;
+use std::result;
+use std::str;
+
+if_raw_str! {
+ pub(super) mod raw;
+}
+
+mod wtf8;
+use wtf8::DecodeWide;
+
+#[cfg(test)]
+mod tests;
+
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(super) enum EncodingError { // why a byte string could not be converted; described by `position`
+ Byte(u8), // carries the byte at which decoding failed
+ CodePoint(u32), // carries the decoded value that was rejected
+ End(), // no payload: input ran out while another byte was required
+}
+
+impl EncodingError {
+ fn position(&self) -> Cow<'_, str> { // human-readable location of the failure, embedded in the `Display` message
+ match self {
+ Self::Byte(byte) => Cow::Owned(format!("byte b'\\x{:02X}'", byte)), // e.g. "byte b'\x80'"
+ Self::CodePoint(code_point) => {
+ Cow::Owned(format!("code point U+{:04X}", code_point)) // e.g. "code point U+DC00"
+ }
+ Self::End() => Cow::Borrowed("end of string"), // static text, so nothing is allocated
+ }
+ }
+}
+
+impl Display for EncodingError {
+ fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { // one fixed sentence followed by the failure position
+ write!(
+ f,
+ "byte sequence is not representable in the platform encoding; \
+ error at {}",
+ self.position(), // byte, code point, or end of string
+ )
+ }
+}
+
+impl Error for EncodingError {} // relies on the trait's default methods only
+
+type Result<T> = result::Result<T, EncodingError>; // module-local alias: fallible conversions here fail with `EncodingError`
+
+fn from_bytes(string: &[u8]) -> Result<Option<OsString>> { // Ok(None) means the input stayed UTF-8 and no OsString was built
+ let mut encoder = wtf8::encode_wide(string);
+
+ // Collecting an iterator into a result ignores the size hint:
+ // https://github.com/rust-lang/rust/issues/48994
+ let mut encoded_string = Vec::with_capacity(encoder.size_hint().0);
+ for wchar in &mut encoder { // iterate by `&mut` so `encoder` can still be queried afterwards
+ encoded_string.push(wchar?); // return the first encoding error to the caller
+ }
+
+ debug_assert_eq!(str::from_utf8(string).is_ok(), encoder.is_still_utf8());
+ Ok(encoder
+ .is_still_utf8()
+ .not()
+ .then(|| OsStringExt::from_wide(&encoded_string))) // only non-UTF-8 input is rebuilt from the UTF-16 units
+}
+
+fn to_bytes(os_string: &OsStr) -> Vec<u8> { // infallible: turns the string's UTF-16 units into bytes via `DecodeWide`
+ let encoder = OsStrExt::encode_wide(os_string);
+
+ let mut string = Vec::with_capacity(encoder.size_hint().0); // reserve the lower bound on UTF-16 units; `extend` grows as needed
+ string.extend(DecodeWide::new(encoder));
+ string
+}
+
+pub(super) fn os_str_from_bytes(string: &[u8]) -> Result<Cow<'_, OsStr>> { // borrows the input when it is UTF-8, allocates otherwise
+ from_bytes(string).map(|os_string| {
+ os_string.map(Cow::Owned).unwrap_or_else(|| { // `None` from `from_bytes` signals that the input stayed UTF-8
+ // SAFETY: This slice was validated to be UTF-8.
+ Cow::Borrowed(OsStr::new(unsafe {
+ str::from_utf8_unchecked(string)
+ }))
+ })
+ })
+}
+
+pub(super) fn os_str_to_bytes(os_string: &OsStr) -> Cow<'_, [u8]> { // always returns `Cow::Owned`; the bytes come from `to_bytes`
+ Cow::Owned(to_bytes(os_string))
+}
+
+pub(super) fn os_string_from_vec(string: Vec<u8>) -> Result<OsString> { // owned counterpart of `os_str_from_bytes`
+ from_bytes(&string).map(|os_string| {
+ os_string.unwrap_or_else(|| { // UTF-8 input: convert the owned vector itself
+ // SAFETY: This slice was validated to be UTF-8.
+ unsafe { String::from_utf8_unchecked(string) }.into()
+ })
+ })
+}
+
+pub(super) fn os_string_into_vec(os_string: OsString) -> Vec<u8> { // consumes the string but only borrows it for `to_bytes`
+ to_bytes(&os_string)
+}
diff --git a/third_party/rust/os_str_bytes/src/windows/raw.rs b/third_party/rust/os_str_bytes/src/windows/raw.rs
new file mode 100644
index 0000000000..80953dea79
--- /dev/null
+++ b/third_party/rust/os_str_bytes/src/windows/raw.rs
@@ -0,0 +1,46 @@
+use std::fmt;
+use std::fmt::Formatter;
+
+pub(crate) use crate::util::is_continuation;
+
+use super::wtf8;
+pub(crate) use super::wtf8::ends_with;
+pub(crate) use super::wtf8::starts_with;
+use super::wtf8::CodePoints;
+use super::Result;
+
+pub(crate) fn validate_bytes(string: &[u8]) -> Result<()> { // Ok(()) when every item of `wtf8::encode_wide` is Ok
+ wtf8::encode_wide(string).try_for_each(|x| x.map(drop)) // discards the encoded units; stops at the first error
+}
+
+pub(crate) fn encode_wide_unchecked(
+ string: &[u8],
+) -> impl '_ + Iterator<Item = u16> { // yields bare u16 units instead of `Result`s
+ wtf8::encode_wide(string).map(|x| expect_encoded!(x)) // NOTE(review): `expect_encoded!` is defined outside this file; presumably it panics on `Err` — confirm
+}
+
+pub(crate) fn decode_code_point(string: &[u8]) -> u32 { // decodes a byte string that must hold exactly one code point
+ let mut code_points = CodePoints::new(string.iter().copied());
+ let code_point = expect_encoded!(code_points
+ .next()
+ .expect("cannot parse code point from empty string")); // panics on empty input
+ assert_eq!(None, code_points.next(), "multiple code points found"); // panics if anything follows the first code point
+ code_point
+}
+
+pub(crate) fn debug(string: &[u8], f: &mut Formatter<'_>) -> fmt::Result { // writes each UTF-16 unit as a `\u{...}` escape
+ for wchar in encode_wide_unchecked(string) {
+ write!(f, "\\u{{{:X}}}", wchar)?; // uppercase hex without padding, e.g. `\u{41}`
+ }
+ Ok(())
+}
+
+#[cfg(feature = "uniquote")]
+pub(crate) mod uniquote { // compiled only when the "uniquote" feature is enabled
+ use uniquote::Formatter;
+ use uniquote::Result;
+
+ pub(crate) fn escape(string: &[u8], f: &mut Formatter<'_>) -> Result { // forwards the UTF-16 units to `Formatter::escape_utf16`
+ f.escape_utf16(super::encode_wide_unchecked(string))
+ }
+}
diff --git a/third_party/rust/os_str_bytes/src/windows/wtf8/code_points.rs b/third_party/rust/os_str_bytes/src/windows/wtf8/code_points.rs
new file mode 100644
index 0000000000..9800d781fc
--- /dev/null
+++ b/third_party/rust/os_str_bytes/src/windows/wtf8/code_points.rs
@@ -0,0 +1,129 @@
+use std::iter::FusedIterator;
+use std::iter::Peekable;
+use std::mem;
+
+use crate::util::is_continuation;
+use crate::util::BYTE_SHIFT;
+use crate::util::CONT_MASK;
+
+use super::EncodingError;
+use super::Result;
+
+pub(in super::super) struct CodePoints<I> // iterator adapter: bytes in, `Result<u32>` code points out
+where
+ I: Iterator<Item = u8>,
+{
+ iter: Peekable<I>, // peekable so a rejected byte can be reported without being consumed
+ surrogate: bool, // set while decoding a high surrogate so the next call can detect a following low surrogate
+ still_utf8: bool, // cleared once any surrogate code point is decoded
+}
+
+impl<I> CodePoints<I>
+where
+ I: Iterator<Item = u8>,
+{
+ pub(in super::super) fn new<S>(string: S) -> Self // starts with no pending surrogate and the UTF-8 flag set
+ where
+ S: IntoIterator<IntoIter = I>,
+ {
+ Self {
+ iter: string.into_iter().peekable(),
+ surrogate: false,
+ still_utf8: true,
+ }
+ }
+
+ pub(super) fn is_still_utf8(&self) -> bool { // false once a surrogate code point has been decoded
+ self.still_utf8
+ }
+
+ fn consume_next(&mut self, code_point: &mut u32) -> Result<()> { // folds one continuation byte into `code_point`
+ let &byte = self.iter.peek().ok_or(EncodingError::End())?; // input ended inside a multi-byte sequence
+
+ if !is_continuation(byte) {
+ self.surrogate = false;
+ // Not consuming this byte will be useful if this crate ever offers
+ // a way to encode lossily.
+ return Err(EncodingError::Byte(byte));
+ }
+ *code_point =
+ (*code_point << BYTE_SHIFT) | u32::from(byte & CONT_MASK); // append the payload bits selected by `CONT_MASK`
+
+ let removed = self.iter.next(); // consume the byte only after it was accepted
+ debug_assert_eq!(Some(byte), removed);
+
+ Ok(())
+ }
+
+ pub(super) fn inner_size_hint(&self) -> (usize, Option<usize>) { // size hint of the byte iterator, not of the code points
+ self.iter.size_hint()
+ }
+}
+
+impl<I> FusedIterator for CodePoints<I> where
+ I: FusedIterator + Iterator<Item = u8> // fused only when the underlying byte iterator is fused
+{
+}
+
+impl<I> Iterator for CodePoints<I>
+where
+ I: Iterator<Item = u8>,
+{
+ type Item = Result<u32>;
+
+ fn next(&mut self) -> Option<Self::Item> { // decodes one code point; `None` only when no bytes remain
+ let byte = self.iter.next()?;
+ let mut code_point: u32 = byte.into(); // NOTE(review): the range comments below assume `BYTE_SHIFT` is 6 — confirm in `crate::util`
+
+ macro_rules! consume_next { // shorthand: read one continuation byte or return its error from `next`
+ () => {{
+ if let Err(error) = self.consume_next(&mut code_point) {
+ return Some(Err(error));
+ }
+ }};
+ }
+
+ let prev_surrogate = mem::replace(&mut self.surrogate, false); // remember the flag and clear it for this code point
+
+ let mut invalid = false; // checked only after the remaining bytes of the sequence have been read
+ if !byte.is_ascii() {
+ if byte < 0xC2 { // non-ASCII bytes below 0xC2 are rejected as lead bytes
+ return Some(Err(EncodingError::Byte(byte)));
+ }
+
+ if byte < 0xE0 {
+ code_point &= 0x1F; // two-byte lead: keep the low five bits
+ } else {
+ code_point &= 0x0F; // longer lead: keep the low four bits
+ consume_next!();
+
+ if byte >= 0xF0 {
+ if code_point.wrapping_sub(0x10) >= 0x100 { // after two more bytes the value must land in U+10000..=U+10FFFF
+ invalid = true;
+ }
+ consume_next!();
+
+ // This condition is optimized to detect surrogate code points.
+ } else if code_point & 0xFE0 == 0x360 {
+ self.still_utf8 = false;
+ if code_point & 0x10 == 0 { // bit clear: high surrogate (U+D800..=U+DBFF)
+ self.surrogate = true;
+ } else if prev_surrogate {
+ // Decoding a broken surrogate pair would be lossy.
+ invalid = true;
+ }
+ }
+
+ if code_point < 0x20 { // below 0x20 with one byte to go: the final value would be under U+0800
+ invalid = true;
+ }
+ }
+ consume_next!(); // last continuation byte of every multi-byte sequence
+ }
+ if invalid {
+ return Some(Err(EncodingError::CodePoint(code_point)));
+ }
+
+ Some(Ok(code_point))
+ }
+}
diff --git a/third_party/rust/os_str_bytes/src/windows/wtf8/convert.rs b/third_party/rust/os_str_bytes/src/windows/wtf8/convert.rs
new file mode 100644
index 0000000000..70a8a9f58c
--- /dev/null
+++ b/third_party/rust/os_str_bytes/src/windows/wtf8/convert.rs
@@ -0,0 +1,181 @@
+use std::char;
+use std::char::DecodeUtf16;
+use std::iter::FusedIterator;
+use std::num::NonZeroU16;
+
+use crate::util::BYTE_SHIFT;
+use crate::util::CONT_MASK;
+use crate::util::CONT_TAG;
+
+use super::CodePoints;
+use super::Result;
+
+const MIN_HIGH_SURROGATE: u16 = 0xD800; // OR-ed onto the upper bits of a supplementary offset in `EncodeWide::next`
+
+const MIN_LOW_SURROGATE: u16 = 0xDC00; // OR-ed onto the low ten bits of that offset
+
+const MIN_SURROGATE_CODE: u32 = (u16::MAX as u32) + 1; // 0x10000: first code point that does not fit in one u16
+
+macro_rules! static_assert { // compile-time assertion evaluated in an anonymous constant
+ ( $condition:expr ) => {
+ const _: () = assert!($condition, "static assertion failed");
+ };
+}
+
+pub(in super::super) struct DecodeWide<I> // iterator adapter: UTF-16 units in, encoded bytes out
+where
+ I: Iterator<Item = u16>,
+{
+ iter: DecodeUtf16<I>,
+ code_point: u32, // code point whose bytes are currently being emitted
+ shifts: u8, // number of continuation bytes still owed for `code_point`
+}
+
+impl<I> DecodeWide<I>
+where
+ I: Iterator<Item = u16>,
+{
+ pub(in super::super) fn new<S>(string: S) -> Self // wraps the units in `char::decode_utf16` with nothing pending
+ where
+ S: IntoIterator<IntoIter = I, Item = I::Item>,
+ {
+ Self {
+ iter: char::decode_utf16(string),
+ code_point: 0,
+ shifts: 0,
+ }
+ }
+
+ #[inline(always)]
+ fn get_raw_byte(&self) -> u8 { // bits of `code_point` for the current position, before masking or tagging
+ (self.code_point >> (self.shifts * BYTE_SHIFT)) as u8 // the cast keeps only the low eight bits
+ }
+}
+
+impl<I> Iterator for DecodeWide<I>
+where
+ I: Iterator<Item = u16>,
+{
+ type Item = u8;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ if let Some(shifts) = self.shifts.checked_sub(1) { // a continuation byte is still pending
+ self.shifts = shifts;
+ return Some((self.get_raw_byte() & CONT_MASK) | CONT_TAG);
+ }
+
+ self.code_point = self
+ .iter
+ .next()?
+ .map(Into::into)
+ .unwrap_or_else(|x| x.unpaired_surrogate().into()); // a lone surrogate is kept as its own code point rather than replaced
+
+ macro_rules! decode { // lead byte: shifted code point bits OR-ed with the length tag
+ ( $tag:expr ) => {
+ Some(self.get_raw_byte() | $tag)
+ };
+ }
+ macro_rules! try_decode { // emit the lead byte if the code point is below the bound, else owe one more continuation byte
+ ( $tag:expr , $upper_bound:expr ) => {
+ if self.code_point < $upper_bound {
+ return decode!($tag);
+ }
+ self.shifts += 1;
+ };
+ }
+ try_decode!(0, 0x80); // one byte
+ try_decode!(0xC0, 0x800); // two bytes
+ try_decode!(0xE0, MIN_SURROGATE_CODE); // three bytes
+ decode!(0xF0) // four bytes
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ let (low, high) = self.iter.size_hint();
+ let shifts = self.shifts.into(); // pending continuation bytes count toward both bounds
+ (
+ low.saturating_add(shifts),
+ high.and_then(|x| x.checked_mul(4)) // upper bound allows four bytes per remaining item; overflow yields `None`
+ .and_then(|x| x.checked_add(shifts)),
+ )
+ }
+}
+
+pub(in super::super) struct EncodeWide<I> // iterator adapter: bytes in, `Result<u16>` UTF-16 units out
+where
+ I: Iterator<Item = u8>,
+{
+ iter: CodePoints<I>,
+ surrogate: Option<NonZeroU16>, // low surrogate waiting to be returned by the next call
+}
+
+impl<I> EncodeWide<I>
+where
+ I: Iterator<Item = u8>,
+{
+ fn new<S>(string: S) -> Self // private constructor; `encode_wide` is the entry point used by other modules
+ where
+ S: IntoIterator<IntoIter = I>,
+ {
+ Self {
+ iter: CodePoints::new(string),
+ surrogate: None,
+ }
+ }
+
+ pub(in super::super) fn is_still_utf8(&self) -> bool { // forwards the flag tracked by the inner `CodePoints`
+ self.iter.is_still_utf8()
+ }
+}
+
+impl<I> FusedIterator for EncodeWide<I> where
+ I: FusedIterator + Iterator<Item = u8> // fused only when the underlying byte iterator is fused
+{
+}
+
+impl<I> Iterator for EncodeWide<I>
+where
+ I: Iterator<Item = u8>,
+{
+ type Item = Result<u16>;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ if let Some(surrogate) = self.surrogate.take() { // second half of a pair started by the previous call
+ return Some(Ok(surrogate.get()));
+ }
+
+ self.iter.next().map(|code_point| {
+ code_point.map(|code_point| {
+ code_point
+ .checked_sub(MIN_SURROGATE_CODE) // `Some` only for code points at or above 0x10000
+ .map(|offset| {
+ static_assert!(MIN_LOW_SURROGATE != 0);
+
+ // SAFETY: The above static assertion guarantees that
+ // this value will not be zero.
+ self.surrogate = Some(unsafe {
+ NonZeroU16::new_unchecked(
+ (offset & 0x3FF) as u16 | MIN_LOW_SURROGATE, // low ten bits form the low surrogate
+ )
+ });
+ (offset >> 10) as u16 | MIN_HIGH_SURROGATE // remaining upper bits form the high surrogate, returned now
+ })
+ .unwrap_or(code_point as u16) // reached only below 0x10000, so the cast keeps the full value
+ })
+ })
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ let (low, high) = self.iter.inner_size_hint(); // bounds on the remaining input bytes
+ let additional = self.surrogate.is_some().into(); // 1 when a low surrogate is still pending
+ (
+ (low.saturating_add(2) / 3).saturating_add(additional), // lower bound: a third of the bytes, rounded up
+ high.and_then(|x| x.checked_add(additional)), // upper bound: one unit per byte
+ )
+ }
+}
+
+pub(in super::super) fn encode_wide(
+ string: &[u8],
+) -> EncodeWide<impl '_ + Iterator<Item = u8>> { // lazy: no bytes are decoded until the iterator is advanced
+ EncodeWide::new(string.iter().copied())
+}
diff --git a/third_party/rust/os_str_bytes/src/windows/wtf8/mod.rs b/third_party/rust/os_str_bytes/src/windows/wtf8/mod.rs
new file mode 100644
index 0000000000..d8b0dc4a7f
--- /dev/null
+++ b/third_party/rust/os_str_bytes/src/windows/wtf8/mod.rs
@@ -0,0 +1,18 @@
+// This module implements the WTF-8 encoding specification:
+// https://simonsapin.github.io/wtf-8/
+
+use super::EncodingError;
+use super::Result;
+
+mod code_points;
+pub(super) use code_points::CodePoints;
+
+mod convert;
+pub(super) use convert::encode_wide;
+pub(super) use convert::DecodeWide;
+
+if_raw_str! {
+ mod string;
+ pub(crate) use string::ends_with;
+ pub(crate) use string::starts_with;
+}
diff --git a/third_party/rust/os_str_bytes/src/windows/wtf8/string.rs b/third_party/rust/os_str_bytes/src/windows/wtf8/string.rs
new file mode 100644
index 0000000000..b3523a2eff
--- /dev/null
+++ b/third_party/rust/os_str_bytes/src/windows/wtf8/string.rs
@@ -0,0 +1,67 @@
+use crate::util;
+
+const SURROGATE_LENGTH: usize = 3; // byte length of the surrogate slices cut from `suffix` and `prefix` below
+
+pub(crate) fn ends_with(string: &[u8], mut suffix: &[u8]) -> bool { // byte-wise `ends_with` plus one extra case for a suffix that starts mid-sequence
+ let index = if let Some(index) = string.len().checked_sub(suffix.len()) {
+ index
+ } else {
+ return false; // suffix is longer than string
+ };
+ if let Some(&byte) = string.get(index) {
+ if util::is_continuation(byte) { // the suffix would begin inside a multi-byte sequence of `string`
+ let index = expect_encoded!(index.checked_sub(1)); // step back one byte; NOTE(review): `expect_encoded!` is defined elsewhere — presumably panics on `None`
+ let mut wide_surrogate =
+ if let Some(surrogate) = suffix.get(..SURROGATE_LENGTH) {
+ super::encode_wide(surrogate)
+ } else {
+ return false; // suffix is shorter than `SURROGATE_LENGTH` bytes
+ };
+ let surrogate_wchar = wide_surrogate
+ .next()
+ .expect("failed decoding non-empty suffix");
+
+ if wide_surrogate.next().is_some() // the leading suffix bytes must encode exactly one unit
+ || super::encode_wide(&string[index..])
+ .take_while(Result::is_ok)
+ .nth(1) // second unit of `string` counted from the stepped-back position
+ != Some(surrogate_wchar)
+ {
+ return false;
+ }
+ suffix = &suffix[SURROGATE_LENGTH..]; // matched; the remaining bytes are compared normally
+ }
+ }
+ string.ends_with(suffix)
+}
+
+pub(crate) fn starts_with(string: &[u8], mut prefix: &[u8]) -> bool { // byte-wise `starts_with` plus one extra case for a prefix that ends mid-sequence
+ if let Some(&byte) = string.get(prefix.len()) {
+ if util::is_continuation(byte) { // the prefix would end inside a multi-byte sequence of `string`
+ let index = if let Some(index) =
+ prefix.len().checked_sub(SURROGATE_LENGTH)
+ {
+ index
+ } else {
+ return false; // prefix is shorter than `SURROGATE_LENGTH` bytes
+ };
+ let (substring, surrogate) = prefix.split_at(index); // `surrogate` is the last `SURROGATE_LENGTH` bytes of the prefix
+ let mut wide_surrogate = super::encode_wide(surrogate);
+ let surrogate_wchar = wide_surrogate
+ .next()
+ .expect("failed decoding non-empty prefix");
+
+ if surrogate_wchar.is_err()
+ || wide_surrogate.next().is_some() // the trailing prefix bytes must encode exactly one unit
+ || super::encode_wide(&string[index..])
+ .next() // first unit of `string` at the same offset
+ .expect("failed decoding non-empty substring")
+ != surrogate_wchar
+ {
+ return false;
+ }
+ prefix = substring; // matched; the bytes before the surrogate are compared normally
+ }
+ }
+ string.starts_with(prefix)
+}