Adding upstream version 115.7.0esr. (ref: upstream/115.7.0esr)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 19:33:14 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 19:33:14 +0000
commit: 36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree: 105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/rust/os_str_bytes/src/windows
parent: Initial commit. (diff)
download: firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz
firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip
6 files changed, 554 insertions, 0 deletions
diff --git a/third_party/rust/os_str_bytes/src/windows/mod.rs b/third_party/rust/os_str_bytes/src/windows/mod.rs
new file mode 100644
index 0000000000..ed9e60b050
--- /dev/null
+++ b/third_party/rust/os_str_bytes/src/windows/mod.rs
@@ -0,0 +1,113 @@
+// These functions are necessarily inefficient, because they must revert
+// encoding conversions performed by the standard library. However, there is
+// currently no better alternative.
+
+use std::borrow::Cow;
+use std::error::Error;
+use std::ffi::OsStr;
+use std::ffi::OsString;
+use std::fmt;
+use std::fmt::Display;
+use std::fmt::Formatter;
+use std::ops::Not;
+use std::os::windows::ffi::OsStrExt;
+use std::os::windows::ffi::OsStringExt;
+use std::result;
+use std::str;
+
+if_raw_str! {
+    pub(super) mod raw;
+}
+
+mod wtf8;
+use wtf8::DecodeWide;
+
+#[cfg(test)]
+mod tests;
+
+// The reason a byte sequence could not be converted to the platform
+// encoding. Each variant records where decoding stopped.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(super) enum EncodingError {
+    // An unexpected byte was encountered; carries that byte.
+    Byte(u8),
+    // A code point was decoded but rejected; carries the value that had
+    // been accumulated when the problem was detected.
+    CodePoint(u32),
+    // The input ended while more bytes were still required.
+    End(),
+}
+
+impl EncodingError {
+    // Builds the human-readable description of where decoding failed. Only
+    // the variants that carry a value need to allocate; the end-of-input
+    // case borrows a static string.
+    fn position(&self) -> Cow<'_, str> {
+        let description = match *self {
+            Self::End() => return Cow::Borrowed("end of string"),
+            Self::Byte(value) => format!("byte b'\\x{:02X}'", value),
+            Self::CodePoint(value) => format!("code point U+{:04X}", value),
+        };
+        Cow::Owned(description)
+    }
+}
+
+// User-facing message: a fixed explanation followed by the failure position.
+impl Display for EncodingError {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        let location = self.position();
+        f.write_fmt(format_args!(
+            "byte sequence is not representable in the platform encoding; \
+             error at {}",
+            location,
+        ))
+    }
+}
+
+// Lets `EncodingError` be used as a standard error type; the default trait
+// methods are sufficient, so the implementation body is empty.
+impl Error for EncodingError {}
+
+// Shorthand for results whose error type is always `EncodingError`.
+type Result<T> = result::Result<T, EncodingError>;
+
+// Validates `string` and re-encodes it as wide characters.
+//
+// Returns `Ok(None)` when the encoder reports that the input is still plain
+// UTF-8, so callers can reuse the original bytes instead of the re-encoded
+// copy. Returns the first `EncodingError` produced by the encoder otherwise.
+fn from_bytes(string: &[u8]) -> Result<Option<OsString>> {
+    let mut wide_units = wtf8::encode_wide(string);
+
+    // Collecting an iterator into a result ignores the size hint:
+    // https://github.com/rust-lang/rust/issues/48994
+    let (capacity, _) = wide_units.size_hint();
+    let mut buffer = Vec::with_capacity(capacity);
+    wide_units
+        .by_ref()
+        .try_for_each(|wchar| wchar.map(|wchar| buffer.push(wchar)))?;
+
+    let still_utf8 = wide_units.is_still_utf8();
+    debug_assert_eq!(str::from_utf8(string).is_ok(), still_utf8);
+    Ok(still_utf8
+        .not()
+        .then(|| OsStringExt::from_wide(&buffer)))
+}
+
+// Re-encodes the wide characters of `os_string` as bytes using `DecodeWide`.
+fn to_bytes(os_string: &OsStr) -> Vec<u8> {
+    let wide_units = OsStrExt::encode_wide(os_string);
+
+    // The lower bound of the hint is only used to pre-size the buffer.
+    let (lower_bound, _) = wide_units.size_hint();
+    let mut bytes = Vec::with_capacity(lower_bound);
+    bytes.extend(DecodeWide::new(wide_units));
+    bytes
+}
+
+// Converts bytes to an `OsStr`, borrowing the input when it is plain UTF-8
+// and allocating a new `OsString` otherwise. Fails with an `EncodingError`
+// when `from_bytes` rejects the input.
+pub(super) fn os_str_from_bytes(string: &[u8]) -> Result<Cow<'_, OsStr>> {
+    let converted = match from_bytes(string)? {
+        Some(os_string) => Cow::Owned(os_string),
+        None => {
+            // SAFETY: This slice was validated to be UTF-8.
+            let utf8 = unsafe { str::from_utf8_unchecked(string) };
+            Cow::Borrowed(OsStr::new(utf8))
+        }
+    };
+    Ok(converted)
+}
+
+// Converts an `OsStr` to bytes. The result is always owned, because the
+// wide representation has to be re-encoded by `to_bytes`.
+pub(super) fn os_str_to_bytes(os_string: &OsStr) -> Cow<'_, [u8]> {
+    let bytes = to_bytes(os_string);
+    Cow::Owned(bytes)
+}
+
+// Converts an owned byte vector to an `OsString`. When the bytes are plain
+// UTF-8, the vector's allocation is reused through `String`; otherwise the
+// re-encoded string produced by `from_bytes` is returned.
+pub(super) fn os_string_from_vec(string: Vec<u8>) -> Result<OsString> {
+    match from_bytes(&string)? {
+        Some(os_string) => Ok(os_string),
+        None => {
+            // SAFETY: This slice was validated to be UTF-8.
+            let utf8 = unsafe { String::from_utf8_unchecked(string) };
+            Ok(utf8.into())
+        }
+    }
+}
+
+// Converts an owned `OsString` to bytes. The string is only borrowed for
+// the conversion and dropped afterwards.
+pub(super) fn os_string_into_vec(os_string: OsString) -> Vec<u8> {
+    let borrowed: &OsStr = &os_string;
+    to_bytes(borrowed)
+}
diff --git a/third_party/rust/os_str_bytes/src/windows/raw.rs b/third_party/rust/os_str_bytes/src/windows/raw.rs
new file mode 100644
index 0000000000..80953dea79
--- /dev/null
+++ b/third_party/rust/os_str_bytes/src/windows/raw.rs
@@ -0,0 +1,46 @@
+use std::fmt;
+use std::fmt::Formatter;
+
+pub(crate) use crate::util::is_continuation;
+
+use super::wtf8;
+pub(crate) use super::wtf8::ends_with;
+pub(crate) use super::wtf8::starts_with;
+use super::wtf8::CodePoints;
+use super::Result;
+
+// Runs the wide-character encoder over `string` purely for validation,
+// discarding the output and returning the first error it reports.
+pub(crate) fn validate_bytes(string: &[u8]) -> Result<()> {
+    for wchar in wtf8::encode_wide(string) {
+        let _wchar = wchar?;
+    }
+    Ok(())
+}
+
+pub(crate) fn encode_wide_unchecked(
+    string: &[u8],
+) -> impl '_ + Iterator<Item = u16> {
+    wtf8::encode_wide(string).map(|x| expect_encoded!(x))
+}
+
+// Decodes `string` as exactly one code point.
+//
+// Panics when `string` is empty or when it contains more than one code
+// point; an invalid encoding is handled by `expect_encoded!`.
+pub(crate) fn decode_code_point(string: &[u8]) -> u32 {
+    let mut remaining = CodePoints::new(string.iter().copied());
+    let first = remaining
+        .next()
+        .expect("cannot parse code point from empty string");
+    let code_point = expect_encoded!(first);
+    assert_eq!(None, remaining.next(), "multiple code points found");
+    code_point
+}
+
+// Writes every wide character of `string` to `f` as a `\u{...}` escape with
+// uppercase hexadecimal digits, stopping at the first formatting error.
+pub(crate) fn debug(string: &[u8], f: &mut Formatter<'_>) -> fmt::Result {
+    encode_wide_unchecked(string)
+        .try_for_each(|wchar| write!(f, "\\u{{{:X}}}", wchar))
+}
+
+// Integration with the optional `uniquote` crate.
+#[cfg(feature = "uniquote")]
+pub(crate) mod uniquote {
+    use uniquote::Formatter;
+    use uniquote::Result;
+
+    // Escapes `string` by handing its wide-character form to the formatter.
+    pub(crate) fn escape(string: &[u8], f: &mut Formatter<'_>) -> Result {
+        let wide_units = super::encode_wide_unchecked(string);
+        f.escape_utf16(wide_units)
+    }
+}
diff --git a/third_party/rust/os_str_bytes/src/windows/wtf8/code_points.rs b/third_party/rust/os_str_bytes/src/windows/wtf8/code_points.rs
new file mode 100644
index 0000000000..9800d781fc
--- /dev/null
+++ b/third_party/rust/os_str_bytes/src/windows/wtf8/code_points.rs
@@ -0,0 +1,129 @@
+use std::iter::FusedIterator;
+use std::iter::Peekable;
+use std::mem;
+
+use crate::util::is_continuation;
+use crate::util::BYTE_SHIFT;
+use crate::util::CONT_MASK;
+
+use super::EncodingError;
+use super::Result;
+
+// Iterator adapter that decodes a stream of bytes into code points, yielding
+// one `Result<u32>` per decoded sequence.
+pub(in super::super) struct CodePoints<I>
+where
+    I: Iterator<Item = u8>,
+{
+    // Source bytes; peekable so that a byte can be inspected and left in
+    // place when it turns out not to belong to the current sequence.
+    iter: Peekable<I>,
+    // Set when the code point decoded last was the first half of a surrogate
+    // pair (a high surrogate); reset at the start of every decoding step.
+    surrogate: bool,
+    // Starts as `true` and is cleared once any surrogate code point is seen.
+    still_utf8: bool,
+}
+
+impl<I> CodePoints<I>
+where
+    I: Iterator<Item = u8>,
+{
+    // Creates a decoder over the bytes of `string` that has not seen any
+    // surrogate yet and still considers the input to be UTF-8.
+    pub(in super::super) fn new<S>(string: S) -> Self
+    where
+        S: IntoIterator<IntoIter = I>,
+    {
+        let iter = string.into_iter().peekable();
+        Self {
+            iter,
+            surrogate: false,
+            still_utf8: true,
+        }
+    }
+
+    // Reports whether no surrogate code point has been decoded so far.
+    pub(super) fn is_still_utf8(&self) -> bool {
+        self.still_utf8
+    }
+
+    // Appends the payload bits of the next byte to `code_point`.
+    //
+    // Fails with `End` when no byte is left, and with `Byte` when the next
+    // byte is not a continuation byte; in the latter case the byte stays in
+    // the iterator and the surrogate flag is cleared.
+    fn consume_next(&mut self, code_point: &mut u32) -> Result<()> {
+        let byte = match self.iter.peek() {
+            Some(&byte) => byte,
+            None => return Err(EncodingError::End()),
+        };
+
+        if is_continuation(byte) {
+            let consumed = self.iter.next();
+            debug_assert_eq!(Some(byte), consumed);
+
+            let payload = u32::from(byte & CONT_MASK);
+            *code_point = (*code_point << BYTE_SHIFT) | payload;
+            Ok(())
+        } else {
+            self.surrogate = false;
+            // Not consuming this byte will be useful if this crate ever offers
+            // a way to encode lossily.
+            Err(EncodingError::Byte(byte))
+        }
+    }
+
+    // Exposes the size hint of the underlying byte iterator, i.e. a count of
+    // remaining bytes rather than of remaining code points.
+    pub(super) fn inner_size_hint(&self) -> (usize, Option<usize>) {
+        let bytes = &self.iter;
+        bytes.size_hint()
+    }
+}
+
+// `CodePoints` is declared fused whenever the underlying byte iterator is.
+impl<I> FusedIterator for CodePoints<I> where
+    I: FusedIterator + Iterator<Item = u8>
+{
+}
+
+// Decodes one code point per call. Besides well-formed UTF-8 sequences,
+// three-byte encodings of surrogate code points are accepted, unless a low
+// surrogate directly follows a high surrogate.
+impl<I> Iterator for CodePoints<I>
+where
+    I: Iterator<Item = u8>,
+{
+    type Item = Result<u32>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // `None` is only returned when no bytes are left at a sequence
+        // boundary; running out of bytes mid-sequence is an `End` error.
+        let byte = self.iter.next()?;
+        let mut code_point: u32 = byte.into();
+
+        // Pulls one continuation byte into `code_point`, returning the error
+        // from the enclosing function if that fails.
+        macro_rules! consume_next {
+            () => {{
+                if let Err(error) = self.consume_next(&mut code_point) {
+                    return Some(Err(error));
+                }
+            }};
+        }
+
+        // Remember whether the previous code point was a high surrogate and
+        // clear the flag for this one.
+        let prev_surrogate = mem::replace(&mut self.surrogate, false);
+
+        // Validation problems are recorded here instead of returning at
+        // once, so the remaining bytes of the sequence are still consumed.
+        let mut invalid = false;
+        if !byte.is_ascii() {
+            // Values below 0xC2 cannot start a sequence: they are either
+            // continuation bytes or leads of overlong two-byte encodings.
+            if byte < 0xC2 {
+                return Some(Err(EncodingError::Byte(byte)));
+            }
+
+            if byte < 0xE0 {
+                // Two-byte sequence: keep the five payload bits of the lead.
+                code_point &= 0x1F;
+            } else {
+                // Three- or four-byte sequence: keep the low four bits of
+                // the lead and read the first continuation byte.
+                code_point &= 0x0F;
+                consume_next!();
+
+                if byte >= 0xF0 {
+                    // Four-byte sequence. The value accumulated so far must
+                    // lie in 0x10..0x110; the wrapping subtraction checks
+                    // both bounds with a single comparison.
+                    if code_point.wrapping_sub(0x10) >= 0x100 {
+                        invalid = true;
+                    }
+                    consume_next!();
+
+                // This condition is optimized to detect surrogate code points.
+                } else if code_point & 0xFE0 == 0x360 {
+                    self.still_utf8 = false;
+                    // Bit 0x10 is clear for the first (high) half of the
+                    // surrogate range and set for the second (low) half.
+                    if code_point & 0x10 == 0 {
+                        self.surrogate = true;
+                    } else if prev_surrogate {
+                        // Decoding a broken surrogate pair would be lossy.
+                        invalid = true;
+                    }
+                }
+
+                // A value this small means the code point would have fit in
+                // a shorter sequence (overlong encoding).
+                if code_point < 0x20 {
+                    invalid = true;
+                }
+            }
+            // Every multi-byte sequence ends with one more continuation byte.
+            consume_next!();
+        }
+        if invalid {
+            return Some(Err(EncodingError::CodePoint(code_point)));
+        }
+
+        Some(Ok(code_point))
+    }
+}
diff --git a/third_party/rust/os_str_bytes/src/windows/wtf8/convert.rs b/third_party/rust/os_str_bytes/src/windows/wtf8/convert.rs
new file mode 100644
index 0000000000..70a8a9f58c
--- /dev/null
+++ b/third_party/rust/os_str_bytes/src/windows/wtf8/convert.rs
@@ -0,0 +1,181 @@
+use std::char;
+use std::char::DecodeUtf16;
+use std::iter::FusedIterator;
+use std::num::NonZeroU16;
+
+use crate::util::BYTE_SHIFT;
+use crate::util::CONT_MASK;
+use crate::util::CONT_TAG;
+
+use super::CodePoints;
+use super::Result;
+
+// Base value OR-ed into the first (high) code unit of a surrogate pair.
+const MIN_HIGH_SURROGATE: u16 = 0xD800;
+
+// Base value OR-ed into the second (low) code unit of a surrogate pair.
+const MIN_LOW_SURROGATE: u16 = 0xDC00;
+
+// Smallest code point that does not fit into a single `u16` (0x1_0000);
+// code points from here on are encoded as surrogate pairs.
+const MIN_SURROGATE_CODE: u32 = (u16::MAX as u32) + 1;
+
+// Evaluates `$condition` in a constant item, so that a false condition
+// fails compilation instead of panicking at run time.
+macro_rules! static_assert {
+    ( $condition:expr ) => {
+        const _: () = assert!($condition, "static assertion failed");
+    };
+}
+
+// Iterator adapter that turns wide characters (`u16`) into bytes, emitting
+// the bytes of each decoded code point one at a time.
+pub(in super::super) struct DecodeWide<I>
+where
+    I: Iterator<Item = u16>,
+{
+    // Decoder that groups wide characters into code points or reports
+    // unpaired surrogates.
+    iter: DecodeUtf16<I>,
+    // Code point whose bytes are currently being emitted.
+    code_point: u32,
+    // Number of continuation bytes still to be emitted for `code_point`.
+    shifts: u8,
+}
+
+impl<I> DecodeWide<I>
+where
+    I: Iterator<Item = u16>,
+{
+    // Creates a decoder over the wide characters of `string` with no
+    // partially emitted code point.
+    pub(in super::super) fn new<S>(string: S) -> Self
+    where
+        S: IntoIterator<IntoIter = I, Item = I::Item>,
+    {
+        let iter = char::decode_utf16(string);
+        Self {
+            iter,
+            code_point: 0,
+            shifts: 0,
+        }
+    }
+
+    // Returns the low eight bits of the current code point after shifting
+    // it right by `shifts` groups of `BYTE_SHIFT` bits. Tagging and masking
+    // are left to the caller.
+    #[inline(always)]
+    fn get_raw_byte(&self) -> u8 {
+        let shift = self.shifts * BYTE_SHIFT;
+        (self.code_point >> shift) as u8
+    }
+}
+
+// Emits the byte encoding of each code point: one lead byte, followed by
+// `shifts` continuation bytes on subsequent calls.
+impl<I> Iterator for DecodeWide<I>
+where
+    I: Iterator<Item = u16>,
+{
+    type Item = u8;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // A multi-byte sequence is in progress: emit its next continuation
+        // byte, taken from the next lower group of bits of the code point.
+        if let Some(shifts) = self.shifts.checked_sub(1) {
+            self.shifts = shifts;
+            return Some((self.get_raw_byte() & CONT_MASK) | CONT_TAG);
+        }
+
+        // Start a new code point. An unpaired surrogate is not an error
+        // here; its value is used as the code point to encode.
+        self.code_point = self
+            .iter
+            .next()?
+            .map(Into::into)
+            .unwrap_or_else(|x| x.unpaired_surrogate().into());
+
+        // Produces the lead byte: the bits selected by the current `shifts`
+        // value combined with the length tag.
+        macro_rules! decode {
+            ( $tag:expr ) => {
+                Some(self.get_raw_byte() | $tag)
+            };
+        }
+        // Returns the lead byte if the code point fits the current sequence
+        // length; otherwise accounts for one more continuation byte.
+        macro_rules! try_decode {
+            ( $tag:expr , $upper_bound:expr ) => {
+                if self.code_point < $upper_bound {
+                    return decode!($tag);
+                }
+                self.shifts += 1;
+            };
+        }
+        // One-, two-, three- and finally four-byte sequences.
+        try_decode!(0, 0x80);
+        try_decode!(0xC0, 0x800);
+        try_decode!(0xE0, MIN_SURROGATE_CODE);
+        decode!(0xF0)
+    }
+
+    // Lower bound: the inner lower bound plus the pending continuation
+    // bytes. Upper bound: four bytes for each item of the inner upper bound
+    // plus the pending continuation bytes, or `None` on overflow.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (low, high) = self.iter.size_hint();
+        let shifts = self.shifts.into();
+        (
+            low.saturating_add(shifts),
+            high.and_then(|x| x.checked_mul(4))
+                .and_then(|x| x.checked_add(shifts)),
+        )
+    }
+}
+
+// Iterator adapter that turns bytes into wide characters (`u16`), yielding
+// a `Result<u16>` per code unit.
+pub(in super::super) struct EncodeWide<I>
+where
+    I: Iterator<Item = u8>,
+{
+    // Decoder producing the code points to encode.
+    iter: CodePoints<I>,
+    // Second (low) half of a surrogate pair that is emitted on the next call.
+    surrogate: Option<NonZeroU16>,
+}
+
+impl<I> EncodeWide<I>
+where
+    I: Iterator<Item = u8>,
+{
+    // Creates an encoder over the bytes of `string` with no pending
+    // surrogate.
+    fn new<S>(string: S) -> Self
+    where
+        S: IntoIterator<IntoIter = I>,
+    {
+        let iter = CodePoints::new(string);
+        Self {
+            iter,
+            surrogate: None,
+        }
+    }
+
+    // Forwards the UTF-8 status tracked by the inner code point decoder.
+    pub(in super::super) fn is_still_utf8(&self) -> bool {
+        let code_points = &self.iter;
+        code_points.is_still_utf8()
+    }
+}
+
+// `EncodeWide` is declared fused whenever the underlying byte iterator is.
+impl<I> FusedIterator for EncodeWide<I> where
+    I: FusedIterator + Iterator<Item = u8>
+{
+}
+
+// Yields one wide character per call. Code points of `MIN_SURROGATE_CODE`
+// and above are split into a surrogate pair emitted over two calls.
+impl<I> Iterator for EncodeWide<I>
+where
+    I: Iterator<Item = u8>,
+{
+    type Item = Result<u16>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // Emit the second half of a pair started by the previous call.
+        if let Some(surrogate) = self.surrogate.take() {
+            return Some(Ok(surrogate.get()));
+        }
+
+        self.iter.next().map(|code_point| {
+            code_point.map(|code_point| {
+                code_point
+                    // `Some` only for code points needing a surrogate pair.
+                    .checked_sub(MIN_SURROGATE_CODE)
+                    .map(|offset| {
+                        static_assert!(MIN_LOW_SURROGATE != 0);
+
+                        // The low ten bits of the offset form the second
+                        // half, which is stored for the next call.
+                        // SAFETY: The above static assertion guarantees that
+                        // this value will not be zero.
+                        self.surrogate = Some(unsafe {
+                            NonZeroU16::new_unchecked(
+                                (offset & 0x3FF) as u16 | MIN_LOW_SURROGATE,
+                            )
+                        });
+                        // The remaining high bits form the first half.
+                        (offset >> 10) as u16 | MIN_HIGH_SURROGATE
+                    })
+                    // Smaller code points are emitted as a single `u16`.
+                    .unwrap_or(code_point as u16)
+            })
+        })
+    }
+
+    // The inner hint counts bytes. The lower bound is the byte count divided
+    // by three, rounded up; the upper bound is one wide character per byte.
+    // Both include a pending surrogate, if any.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (low, high) = self.iter.inner_size_hint();
+        let additional = self.surrogate.is_some().into();
+        (
+            (low.saturating_add(2) / 3).saturating_add(additional),
+            high.and_then(|x| x.checked_add(additional)),
+        )
+    }
+}
+
+// Creates an `EncodeWide` iterator over a copy of each byte in `string`.
+pub(in super::super) fn encode_wide(
+    string: &[u8],
+) -> EncodeWide<impl '_ + Iterator<Item = u8>> {
+    let bytes = string.iter().copied();
+    EncodeWide::new(bytes)
+}
diff --git a/third_party/rust/os_str_bytes/src/windows/wtf8/mod.rs b/third_party/rust/os_str_bytes/src/windows/wtf8/mod.rs
new file mode 100644
index 0000000000..d8b0dc4a7f
--- /dev/null
+++ b/third_party/rust/os_str_bytes/src/windows/wtf8/mod.rs
@@ -0,0 +1,18 @@
+// This module implements the WTF-8 encoding specification:
+// https://simonsapin.github.io/wtf-8/
+
+use super::EncodingError;
+use super::Result;
+
+mod code_points;
+pub(super) use code_points::CodePoints;
+
+mod convert;
+pub(super) use convert::encode_wide;
+pub(super) use convert::DecodeWide;
+
+if_raw_str! {
+    mod string;
+    pub(crate) use string::ends_with;
+    pub(crate) use string::starts_with;
+}
diff --git a/third_party/rust/os_str_bytes/src/windows/wtf8/string.rs b/third_party/rust/os_str_bytes/src/windows/wtf8/string.rs
new file mode 100644
index 0000000000..b3523a2eff
--- /dev/null
+++ b/third_party/rust/os_str_bytes/src/windows/wtf8/string.rs
@@ -0,0 +1,67 @@
+use crate::util;
+
+// Number of bytes sliced off a prefix or suffix when it is decoded as a
+// single surrogate below.
+const SURROGATE_LENGTH: usize = 3;
+
+// Tests whether `string` ends with `suffix`, also handling a suffix that
+// starts in the middle of a multi-byte sequence of `string`.
+//
+// NOTE(review): the special case presumably covers a suffix beginning with
+// a lone low surrogate that matches the second half of a four-byte sequence
+// in `string` - confirm against the WTF-8 specification.
+pub(crate) fn ends_with(string: &[u8], mut suffix: &[u8]) -> bool {
+    // Position in `string` where the suffix would have to start; a suffix
+    // longer than the string can never match.
+    let index = if let Some(index) = string.len().checked_sub(suffix.len()) {
+        index
+    } else {
+        return false;
+    };
+    if let Some(&byte) = string.get(index) {
+        // The suffix would start on a continuation byte, so a plain byte
+        // comparison cannot succeed for its first code point.
+        if util::is_continuation(byte) {
+            // Step back one byte; decoding of `string` starts from there.
+            let index = expect_encoded!(index.checked_sub(1));
+            // Decode the first `SURROGATE_LENGTH` bytes of the suffix; a
+            // shorter suffix cannot match.
+            let mut wide_surrogate =
+                if let Some(surrogate) = suffix.get(..SURROGATE_LENGTH) {
+                    super::encode_wide(surrogate)
+                } else {
+                    return false;
+                };
+            let surrogate_wchar = wide_surrogate
+                .next()
+                .expect("failed decoding non-empty suffix");
+
+            // Those bytes must decode to exactly one wide character, and it
+            // must equal the second wide character decoded from `string`
+            // starting at `index`.
+            if wide_surrogate.next().is_some()
+                || super::encode_wide(&string[index..])
+                    .take_while(Result::is_ok)
+                    .nth(1)
+                    != Some(surrogate_wchar)
+            {
+                return false;
+            }
+            // The leading bytes matched; compare the rest byte-wise.
+            suffix = &suffix[SURROGATE_LENGTH..];
+        }
+    }
+    string.ends_with(suffix)
+}
+
+// Tests whether `string` starts with `prefix`, also handling a prefix that
+// ends in the middle of a multi-byte sequence of `string`.
+//
+// NOTE(review): the special case presumably covers a prefix ending with a
+// lone high surrogate that matches the first half of a four-byte sequence
+// in `string` - confirm against the WTF-8 specification.
+pub(crate) fn starts_with(string: &[u8], mut prefix: &[u8]) -> bool {
+    if let Some(&byte) = string.get(prefix.len()) {
+        // The byte right after the prefix is a continuation byte, so the
+        // prefix would end inside a multi-byte sequence of `string`.
+        if util::is_continuation(byte) {
+            // Start of the last `SURROGATE_LENGTH` bytes of the prefix; a
+            // shorter prefix cannot match.
+            let index = if let Some(index) =
+                prefix.len().checked_sub(SURROGATE_LENGTH)
+            {
+                index
+            } else {
+                return false;
+            };
+            let (substring, surrogate) = prefix.split_at(index);
+            let mut wide_surrogate = super::encode_wide(surrogate);
+            let surrogate_wchar = wide_surrogate
+                .next()
+                .expect("failed decoding non-empty prefix");
+
+            // The trailing bytes must decode without error to exactly one
+            // wide character, and it must equal the first wide character
+            // decoded from `string` at the same position.
+            if surrogate_wchar.is_err()
+                || wide_surrogate.next().is_some()
+                || super::encode_wide(&string[index..])
+                    .next()
+                    .expect("failed decoding non-empty substring")
+                    != surrogate_wchar
+            {
+                return false;
+            }
+            // The trailing bytes matched; compare the rest byte-wise.
+            prefix = substring;
+        }
+    }
+    string.starts_with(prefix)
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 19:33:14 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 19:33:14 +0000
commit	36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree	105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/rust/os_str_bytes/src/windows
parent	Initial commit. (diff)
download	firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip