diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch) | |
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/rust/os_str_bytes/src | |
parent | Initial commit. (diff) | |
download | firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip |
Adding upstream version 115.7.0esr. (refs: upstream/115.7.0esr, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/os_str_bytes/src')
-rw-r--r-- | third_party/rust/os_str_bytes/src/common/mod.rs | 43 | ||||
-rw-r--r-- | third_party/rust/os_str_bytes/src/common/raw.rs | 45 | ||||
-rw-r--r-- | third_party/rust/os_str_bytes/src/iter.rs | 111 | ||||
-rw-r--r-- | third_party/rust/os_str_bytes/src/lib.rs | 623 | ||||
-rw-r--r-- | third_party/rust/os_str_bytes/src/pattern.rs | 71 | ||||
-rw-r--r-- | third_party/rust/os_str_bytes/src/raw_str.rs | 1547 | ||||
-rw-r--r-- | third_party/rust/os_str_bytes/src/util.rs | 9 | ||||
-rw-r--r-- | third_party/rust/os_str_bytes/src/wasm/mod.rs | 58 | ||||
-rw-r--r-- | third_party/rust/os_str_bytes/src/wasm/raw.rs | 34 | ||||
-rw-r--r-- | third_party/rust/os_str_bytes/src/windows/mod.rs | 113 | ||||
-rw-r--r-- | third_party/rust/os_str_bytes/src/windows/raw.rs | 46 | ||||
-rw-r--r-- | third_party/rust/os_str_bytes/src/windows/wtf8/code_points.rs | 129 | ||||
-rw-r--r-- | third_party/rust/os_str_bytes/src/windows/wtf8/convert.rs | 181 | ||||
-rw-r--r-- | third_party/rust/os_str_bytes/src/windows/wtf8/mod.rs | 18 | ||||
-rw-r--r-- | third_party/rust/os_str_bytes/src/windows/wtf8/string.rs | 67 |
15 files changed, 3095 insertions, 0 deletions
diff --git a/third_party/rust/os_str_bytes/src/common/mod.rs b/third_party/rust/os_str_bytes/src/common/mod.rs new file mode 100644 index 0000000000..e28aba6696 --- /dev/null +++ b/third_party/rust/os_str_bytes/src/common/mod.rs @@ -0,0 +1,43 @@ +use std::borrow::Cow; +use std::convert::Infallible; +use std::ffi::OsStr; +use std::ffi::OsString; +use std::result; + +#[cfg(all(target_vendor = "fortanix", target_env = "sgx"))] +use std::os::fortanix_sgx as os; +#[cfg(target_os = "solid_asp3")] +use std::os::solid as os; +#[cfg(any(target_os = "hermit", unix))] +use std::os::unix as os; +#[cfg(target_os = "wasi")] +use std::os::wasi as os; +#[cfg(target_os = "xous")] +use std::os::xous as os; + +use os::ffi::OsStrExt; +use os::ffi::OsStringExt; + +if_raw_str! { + pub(super) mod raw; +} + +pub(super) type EncodingError = Infallible; + +type Result<T> = result::Result<T, EncodingError>; + +pub(super) fn os_str_from_bytes(string: &[u8]) -> Result<Cow<'_, OsStr>> { + Ok(Cow::Borrowed(OsStrExt::from_bytes(string))) +} + +pub(super) fn os_str_to_bytes(os_string: &OsStr) -> Cow<'_, [u8]> { + Cow::Borrowed(OsStrExt::as_bytes(os_string)) +} + +pub(super) fn os_string_from_vec(string: Vec<u8>) -> Result<OsString> { + Ok(OsStringExt::from_vec(string)) +} + +pub(super) fn os_string_into_vec(os_string: OsString) -> Vec<u8> { + OsStringExt::into_vec(os_string) +} diff --git a/third_party/rust/os_str_bytes/src/common/raw.rs b/third_party/rust/os_str_bytes/src/common/raw.rs new file mode 100644 index 0000000000..97d0353d7e --- /dev/null +++ b/third_party/rust/os_str_bytes/src/common/raw.rs @@ -0,0 +1,45 @@ +use std::fmt; +use std::fmt::Formatter; + +use super::Result; + +#[inline(always)] +pub(crate) const fn is_continuation(_: u8) -> bool { + false +} + +#[inline(always)] +pub(crate) fn validate_bytes(_: &[u8]) -> Result<()> { + Ok(()) +} + +#[inline(always)] +pub(crate) fn decode_code_point(_: &[u8]) -> u32 { + unreachable!(); +} + +pub(crate) fn ends_with(string: &[u8], suffix: 
&[u8]) -> bool { + string.ends_with(suffix) +} + +pub(crate) fn starts_with(string: &[u8], prefix: &[u8]) -> bool { + string.starts_with(prefix) +} + +pub(crate) fn debug(string: &[u8], f: &mut Formatter<'_>) -> fmt::Result { + for byte in string { + write!(f, "\\x{:02X}", byte)?; + } + Ok(()) +} + +#[cfg(feature = "uniquote")] +pub(crate) mod uniquote { + use uniquote::Formatter; + use uniquote::Quote; + use uniquote::Result; + + pub(crate) fn escape(string: &[u8], f: &mut Formatter<'_>) -> Result { + string.escape(f) + } +} diff --git a/third_party/rust/os_str_bytes/src/iter.rs b/third_party/rust/os_str_bytes/src/iter.rs new file mode 100644 index 0000000000..03ff982412 --- /dev/null +++ b/third_party/rust/os_str_bytes/src/iter.rs @@ -0,0 +1,111 @@ +//! Iterators provided by this crate. + +#![cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "raw_os_str")))] + +use std::convert; +use std::fmt; +use std::fmt::Debug; +use std::fmt::Formatter; +use std::iter::FusedIterator; + +use super::pattern::Encoded; +use super::Pattern; +use super::RawOsStr; + +// [memchr::memmem::FindIter] is not currently used, since this struct would +// become self-referential. Additionally, that iterator does not implement +// [DoubleEndedIterator], and its implementation would likely require +// significant changes to implement that trait. +/// The iterator returned by [`RawOsStr::split`]. +pub struct Split<'a, P> +where + P: Pattern, +{ + string: Option<&'a RawOsStr>, + pat: P::__Encoded, +} + +impl<'a, P> Split<'a, P> +where + P: Pattern, +{ + #[track_caller] + pub(super) fn new(string: &'a RawOsStr, pat: P) -> Self { + let pat = pat.__encode(); + assert!( + !pat.__get().is_empty(), + "cannot split using an empty pattern", + ); + Self { + string: Some(string), + pat, + } + } +} + +macro_rules! impl_next { + ( $self:ident , $split_method:ident , $swap_fn:expr ) => {{ + $self + .string? 
+ .$split_method(&$self.pat) + .map(|substrings| { + let (substring, string) = $swap_fn(substrings); + $self.string = Some(string); + substring + }) + .or_else(|| $self.string.take()) + }}; +} + +impl<P> Clone for Split<'_, P> +where + P: Pattern, +{ + #[inline] + fn clone(&self) -> Self { + Self { + string: self.string, + pat: self.pat.clone(), + } + } +} + +impl<P> Debug for Split<'_, P> +where + P: Pattern, +{ + #[inline] + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("Split") + .field("string", &self.string) + .field("pat", &self.pat) + .finish() + } +} + +impl<P> DoubleEndedIterator for Split<'_, P> +where + P: Pattern, +{ + fn next_back(&mut self) -> Option<Self::Item> { + impl_next!(self, rsplit_once_raw, |(prefix, suffix)| (suffix, prefix)) + } +} + +impl<P> FusedIterator for Split<'_, P> where P: Pattern {} + +impl<'a, P> Iterator for Split<'a, P> +where + P: Pattern, +{ + type Item = &'a RawOsStr; + + #[inline] + fn last(mut self) -> Option<Self::Item> { + self.next_back() + } + + fn next(&mut self) -> Option<Self::Item> { + impl_next!(self, split_once_raw, convert::identity) + } +} diff --git a/third_party/rust/os_str_bytes/src/lib.rs b/third_party/rust/os_str_bytes/src/lib.rs new file mode 100644 index 0000000000..40154c99be --- /dev/null +++ b/third_party/rust/os_str_bytes/src/lib.rs @@ -0,0 +1,623 @@ +//! This crate allows interacting with the data stored by [`OsStr`] and +//! [`OsString`], without resorting to panics or corruption for invalid UTF-8. +//! Thus, methods can be used that are already defined on [`[u8]`][slice] and +//! [`Vec<u8>`]. +//! +//! Typically, the only way to losslessly construct [`OsStr`] or [`OsString`] +//! from a byte sequence is to use `OsStr::new(str::from_utf8(bytes)?)`, which +//! requires the bytes to be valid in UTF-8. However, since this crate makes +//! conversions directly between the platform encoding and raw bytes, even some +//! strings invalid in UTF-8 can be converted. +//! +//! 
In most cases, [`RawOsStr`] and [`RawOsString`] should be used. +//! [`OsStrBytes`] and [`OsStringBytes`] provide lower-level APIs that are +//! easier to misuse. +//! +//! # Encoding +//! +//! The encoding of bytes returned or accepted by methods of this crate is +//! intentionally left unspecified. It may vary for different platforms, so +//! defining it would run contrary to the goal of generic string handling. +//! However, the following invariants will always be upheld: +//! +//! - The encoding will be compatible with UTF-8. In particular, splitting an +//! encoded byte sequence by a UTF-8–encoded character always produces +//! other valid byte sequences. They can be re-encoded without error using +//! [`RawOsString::into_os_string`] and similar methods. +//! +//! - All characters valid in platform strings are representable. [`OsStr`] and +//! [`OsString`] can always be losslessly reconstructed from extracted bytes. +//! +//! Note that the chosen encoding may not match how Rust stores these strings +//! internally, which is undocumented. For instance, the result of calling +//! [`OsStr::len`] will not necessarily match the number of bytes this crate +//! uses to represent the same string. +//! +//! Additionally, concatenation may yield unexpected results without a UTF-8 +//! separator. If two platform strings need to be concatenated, the only safe +//! way to do so is using [`OsString::push`]. This limitation also makes it +//! undesirable to use the bytes in interchange. +//! +//! Since this encoding can change between versions and platforms, it should +//! not be used for storage. The standard library provides implementations of +//! [`OsStrExt`] and [`OsStringExt`] for various platforms, which should be +//! preferred for that use case. +//! +//! # User Input +//! +//! Traits in this crate should ideally not be used to convert byte sequences +//! that did not originate from [`OsStr`] or a related struct. The encoding +//! 
used by this crate is an implementation detail, so it does not make sense +//! to expose it to users. +//! +//! Crate [bstr] offers some useful alternative methods, such as +//! [`ByteSlice::to_os_str`] and [`ByteVec::into_os_string`], that are meant +//! for user input. But, they reject some byte sequences used to represent +//! valid platform strings, which would be undesirable for reliable path +//! handling. They are best used only when accepting unknown input. +//! +//! This crate is meant to help when you already have an instance of [`OsStr`] +//! and need to modify the data in a lossless way. +//! +//! # Features +//! +//! These features are optional and can be enabled or disabled in a +//! "Cargo.toml" file. +//! +//! ### Default Features +//! +//! - **memchr** - +//! Changes the implementation to use crate [memchr] for better performance. +//! This feature is useless when "raw\_os\_str" is disabled. +//! +//! For more information, see [`RawOsStr`][memchr complexity]. +//! +//! - **raw\_os\_str** - +//! Provides: +//! - [`iter`] +//! - [`Pattern`] +//! - [`RawOsStr`] +//! - [`RawOsStrCow`] +//! - [`RawOsString`] +//! +//! ### Optional Features +//! +//! - **checked\_conversions** - +//! Provides: +//! - [`EncodingError`] +//! - [`OsStrBytes::from_raw_bytes`] +//! - [`OsStringBytes::from_raw_vec`] +//! - [`RawOsStr::from_raw_bytes`] +//! - [`RawOsString::from_raw_vec`] +//! +//! Because this feature should not be used in libraries, the +//! "OS_STR_BYTES_CHECKED_CONVERSIONS" environment variable must be defined +//! during compilation. +//! +//! - **print\_bytes** - +//! Provides implementations of [`print_bytes::ToBytes`] for [`RawOsStr`] and +//! [`RawOsString`]. +//! +//! - **uniquote** - +//! Provides implementations of [`uniquote::Quote`] for [`RawOsStr`] and +//! [`RawOsString`]. +//! +//! # Implementation +//! +//! Some methods return [`Cow`] to account for platform differences. However, +//! 
no guarantee is made that the same variant of that enum will always be +//! returned for the same platform. Whichever can be constructed most +//! efficiently will be returned. +//! +//! All traits are [sealed], meaning that they can only be implemented by this +//! crate. Otherwise, backward compatibility would be more difficult to +//! maintain for new features. +//! +//! # Complexity +//! +//! Conversion method complexities will vary based on what functionality is +//! available for the platform. At worst, they will all be linear, but some can +//! take constant time. For example, [`RawOsString::into_os_string`] might be +//! able to reuse its allocation. +//! +//! # Examples +//! +//! ``` +//! # use std::io; +//! # +//! # #[cfg(feature = "raw_os_str")] +//! # { +//! # #[cfg(any())] +//! use std::env; +//! use std::fs; +//! +//! use os_str_bytes::RawOsStr; +//! +//! # mod env { +//! # use std::env; +//! # use std::ffi::OsString; +//! # +//! # pub fn args_os() -> impl Iterator<Item = OsString> { +//! # let mut file = env::temp_dir(); +//! # file.push("os_str_bytes\u{E9}.txt"); +//! # return vec![OsString::new(), file.into_os_string()].into_iter(); +//! # } +//! # } +//! # +//! for file in env::args_os().skip(1) { +//! if !RawOsStr::new(&file).starts_with('-') { +//! let string = "Hello, world!"; +//! fs::write(&file, string)?; +//! assert_eq!(string, fs::read_to_string(file)?); +//! } +//! } +//! # } +//! # +//! # Ok::<_, io::Error>(()) +//! ``` +//! +//! [bstr]: https://crates.io/crates/bstr +//! [`ByteSlice::to_os_str`]: https://docs.rs/bstr/0.2.12/bstr/trait.ByteSlice.html#method.to_os_str +//! [`ByteVec::into_os_string`]: https://docs.rs/bstr/0.2.12/bstr/trait.ByteVec.html#method.into_os_string +//! [memchr complexity]: RawOsStr#complexity +//! [memchr]: https://crates.io/crates/memchr +//! [`OsStrExt`]: ::std::os::unix::ffi::OsStrExt +//! [`OsStringExt`]: ::std::os::unix::ffi::OsStringExt +//! 
[sealed]: https://rust-lang.github.io/api-guidelines/future-proofing.html#c-sealed +//! [print\_bytes]: https://crates.io/crates/print_bytes + +#![cfg_attr(not(feature = "checked_conversions"), allow(deprecated))] +// Only require a nightly compiler when building documentation for docs.rs. +// This is a private option that should not be used. +// https://github.com/rust-lang/docs.rs/issues/147#issuecomment-389544407 +// https://github.com/dylni/os_str_bytes/issues/2 +#![cfg_attr(os_str_bytes_docs_rs, feature(doc_cfg))] +// Nightly is also currently required for the SGX platform. +#![cfg_attr( + all(target_vendor = "fortanix", target_env = "sgx"), + feature(sgx_platform) +)] +#![warn(unsafe_op_in_unsafe_fn)] +#![warn(unused_results)] + +use std::borrow::Cow; +use std::error::Error; +use std::ffi::OsStr; +use std::ffi::OsString; +use std::fmt; +use std::fmt::Display; +use std::fmt::Formatter; +use std::path::Path; +use std::path::PathBuf; +use std::result; + +macro_rules! if_checked_conversions { + ( $($item:item)+ ) => { + $( + #[cfg(feature = "checked_conversions")] + $item + )+ + }; +} + +#[cfg(not(os_str_bytes_docs_rs))] +if_checked_conversions! { + const _: &str = env!( + "OS_STR_BYTES_CHECKED_CONVERSIONS", + "The 'OS_STR_BYTES_CHECKED_CONVERSIONS' environment variable must be \ + defined to use the 'checked_conversions' feature.", + ); +} + +#[rustfmt::skip] +macro_rules! deprecated_checked_conversion { + ( $message:expr , $item:item ) => { + #[cfg_attr( + not(feature = "checked_conversions"), + deprecated = $message + )] + $item + }; +} + +macro_rules! expect_encoded { + ( $result:expr ) => { + $result.expect("invalid raw bytes") + }; +} + +macro_rules! 
if_raw_str { + ( $($item:item)+ ) => { + $( + #[cfg(feature = "raw_os_str")] + $item + )+ + }; +} + +#[cfg_attr( + all(target_family = "wasm", target_os = "unknown"), + path = "wasm/mod.rs" +)] +#[cfg_attr(windows, path = "windows/mod.rs")] +#[cfg_attr( + not(any(all(target_family = "wasm", target_os = "unknown"), windows)), + path = "common/mod.rs" +)] +mod imp; + +#[cfg(any( + all( + feature = "raw_os_str", + target_family = "wasm", + target_os = "unknown", + ), + windows, +))] +mod util; + +if_raw_str! { + pub mod iter; + + mod pattern; + pub use pattern::Pattern; + + mod raw_str; + pub use raw_str::RawOsStr; + pub use raw_str::RawOsStrCow; + pub use raw_str::RawOsString; +} + +deprecated_checked_conversion! { + "use `OsStrBytes::assert_from_raw_bytes` or \ + `OsStringBytes::assert_from_raw_vec` instead, or enable the \ + 'checked_conversions' feature", + /// The error that occurs when a byte sequence is not representable in the + /// platform encoding. + /// + /// [`Result::unwrap`] should almost always be called on results containing + /// this error. It should be known whether or not byte sequences are + /// properly encoded for the platform, since [the module-level + /// documentation][encoding] discourages using encoded bytes in + /// interchange. Results are returned primarily to make panicking behavior + /// explicit. + /// + /// On Unix, this error is never returned, but [`OsStrExt`] or + /// [`OsStringExt`] should be used instead if that needs to be guaranteed. 
+ /// + /// [encoding]: self#encoding + /// [`OsStrExt`]: ::std::os::unix::ffi::OsStrExt + /// [`OsStringExt`]: ::std::os::unix::ffi::OsStringExt + /// [`Result::unwrap`]: ::std::result::Result::unwrap + #[derive(Clone, Debug, Eq, PartialEq)] + #[cfg_attr( + os_str_bytes_docs_rs, + doc(cfg(feature = "checked_conversions")) + )] + pub struct EncodingError(imp::EncodingError); +} + +impl Display for EncodingError { + #[inline] + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + +impl Error for EncodingError {} + +type Result<T> = result::Result<T, EncodingError>; + +fn from_raw_bytes<'a, S>( + string: S, +) -> result::Result<Cow<'a, OsStr>, imp::EncodingError> +where + S: Into<Cow<'a, [u8]>>, +{ + match string.into() { + Cow::Borrowed(string) => imp::os_str_from_bytes(string), + Cow::Owned(string) => imp::os_string_from_vec(string).map(Cow::Owned), + } +} + +fn cow_os_str_into_path(string: Cow<'_, OsStr>) -> Cow<'_, Path> { + match string { + Cow::Borrowed(string) => Cow::Borrowed(Path::new(string)), + Cow::Owned(string) => Cow::Owned(string.into()), + } +} + +/// A platform agnostic variant of [`OsStrExt`]. +/// +/// For more information, see [the module-level documentation][module]. +/// +/// [module]: self +/// [`OsStrExt`]: ::std::os::unix::ffi::OsStrExt +pub trait OsStrBytes: private::Sealed + ToOwned { + /// Converts a byte string into an equivalent platform-native string. + /// + /// # Panics + /// + /// Panics if the string is not valid for the [unspecified encoding] used + /// by this crate. 
+ /// + /// # Examples + /// + /// ``` + /// use std::env; + /// use std::ffi::OsStr; + /// # use std::io; + /// + /// use os_str_bytes::OsStrBytes; + /// + /// let os_string = env::current_exe()?; + /// let os_bytes = os_string.to_raw_bytes(); + /// assert_eq!(os_string, OsStr::assert_from_raw_bytes(os_bytes)); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + /// + /// [unspecified encoding]: self#encoding + #[must_use = "method should not be used for validation"] + #[track_caller] + fn assert_from_raw_bytes<'a, S>(string: S) -> Cow<'a, Self> + where + S: Into<Cow<'a, [u8]>>; + + deprecated_checked_conversion! { + "use `assert_from_raw_bytes` instead, or enable the \ + 'checked_conversions' feature", + /// Converts a byte string into an equivalent platform-native string. + /// + /// [`assert_from_raw_bytes`] should almost always be used instead. For + /// more information, see [`EncodingError`]. + /// + /// # Errors + /// + /// See documentation for [`EncodingError`]. + /// + /// # Examples + /// + /// ``` + /// use std::env; + /// use std::ffi::OsStr; + /// # use std::io; + /// + /// use os_str_bytes::OsStrBytes; + /// + /// let os_string = env::current_exe()?; + /// let os_bytes = os_string.to_raw_bytes(); + /// assert_eq!(os_string, OsStr::from_raw_bytes(os_bytes).unwrap()); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + /// + /// [`assert_from_raw_bytes`]: Self::assert_from_raw_bytes + #[cfg_attr( + os_str_bytes_docs_rs, + doc(cfg(feature = "checked_conversions")) + )] + fn from_raw_bytes<'a, S>(string: S) -> Result<Cow<'a, Self>> + where + S: Into<Cow<'a, [u8]>>; + } + + /// Converts a platform-native string into an equivalent byte string. + /// + /// The returned string will use an [unspecified encoding]. 
+ /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsStr; + /// + /// use os_str_bytes::OsStrBytes; + /// + /// let string = "foobar"; + /// let os_string = OsStr::new(string); + /// assert_eq!(string.as_bytes(), &*os_string.to_raw_bytes()); + /// ``` + /// + /// [unspecified encoding]: self#encoding + #[must_use] + fn to_raw_bytes(&self) -> Cow<'_, [u8]>; +} + +impl OsStrBytes for OsStr { + #[inline] + fn assert_from_raw_bytes<'a, S>(string: S) -> Cow<'a, Self> + where + S: Into<Cow<'a, [u8]>>, + { + expect_encoded!(from_raw_bytes(string)) + } + + #[inline] + fn from_raw_bytes<'a, S>(string: S) -> Result<Cow<'a, Self>> + where + S: Into<Cow<'a, [u8]>>, + { + from_raw_bytes(string).map_err(EncodingError) + } + + #[inline] + fn to_raw_bytes(&self) -> Cow<'_, [u8]> { + imp::os_str_to_bytes(self) + } +} + +impl OsStrBytes for Path { + #[inline] + fn assert_from_raw_bytes<'a, S>(string: S) -> Cow<'a, Self> + where + S: Into<Cow<'a, [u8]>>, + { + cow_os_str_into_path(OsStr::assert_from_raw_bytes(string)) + } + + #[inline] + fn from_raw_bytes<'a, S>(string: S) -> Result<Cow<'a, Self>> + where + S: Into<Cow<'a, [u8]>>, + { + OsStr::from_raw_bytes(string).map(cow_os_str_into_path) + } + + #[inline] + fn to_raw_bytes(&self) -> Cow<'_, [u8]> { + self.as_os_str().to_raw_bytes() + } +} + +/// A platform agnostic variant of [`OsStringExt`]. +/// +/// For more information, see [the module-level documentation][module]. +/// +/// [module]: self +/// [`OsStringExt`]: ::std::os::unix::ffi::OsStringExt +pub trait OsStringBytes: private::Sealed + Sized { + /// Converts a byte string into an equivalent platform-native string. + /// + /// # Panics + /// + /// Panics if the string is not valid for the [unspecified encoding] used + /// by this crate. 
+ /// + /// # Examples + /// + /// ``` + /// use std::env; + /// use std::ffi::OsString; + /// # use std::io; + /// + /// use os_str_bytes::OsStringBytes; + /// + /// let os_string = env::current_exe()?; + /// let os_bytes = os_string.clone().into_raw_vec(); + /// assert_eq!(os_string, OsString::assert_from_raw_vec(os_bytes)); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + /// + /// [unspecified encoding]: self#encoding + #[must_use = "method should not be used for validation"] + #[track_caller] + fn assert_from_raw_vec(string: Vec<u8>) -> Self; + + deprecated_checked_conversion! { + "use `assert_from_raw_vec` instead, or enable the \ + 'checked_conversions' feature", + /// Converts a byte string into an equivalent platform-native string. + /// + /// [`assert_from_raw_vec`] should almost always be used instead. For + /// more information, see [`EncodingError`]. + /// + /// # Errors + /// + /// See documentation for [`EncodingError`]. + /// + /// # Examples + /// + /// ``` + /// use std::env; + /// use std::ffi::OsString; + /// # use std::io; + /// + /// use os_str_bytes::OsStringBytes; + /// + /// let os_string = env::current_exe()?; + /// let os_bytes = os_string.clone().into_raw_vec(); + /// assert_eq!(os_string, OsString::from_raw_vec(os_bytes).unwrap()); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + /// + /// [`assert_from_raw_vec`]: Self::assert_from_raw_vec + #[cfg_attr( + os_str_bytes_docs_rs, + doc(cfg(feature = "checked_conversions")) + )] + fn from_raw_vec(string: Vec<u8>) -> Result<Self>; + } + + /// Converts a platform-native string into an equivalent byte string. + /// + /// The returned string will use an [unspecified encoding]. 
+ /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// + /// use os_str_bytes::OsStringBytes; + /// + /// let string = "foobar".to_owned(); + /// let os_string: OsString = string.clone().into(); + /// assert_eq!(string.into_bytes(), os_string.into_raw_vec()); + /// ``` + /// + /// [unspecified encoding]: self#encoding + #[must_use] + fn into_raw_vec(self) -> Vec<u8>; +} + +impl OsStringBytes for OsString { + #[inline] + fn assert_from_raw_vec(string: Vec<u8>) -> Self { + expect_encoded!(imp::os_string_from_vec(string)) + } + + #[inline] + fn from_raw_vec(string: Vec<u8>) -> Result<Self> { + imp::os_string_from_vec(string).map_err(EncodingError) + } + + #[inline] + fn into_raw_vec(self) -> Vec<u8> { + imp::os_string_into_vec(self) + } +} + +impl OsStringBytes for PathBuf { + #[inline] + fn assert_from_raw_vec(string: Vec<u8>) -> Self { + OsString::assert_from_raw_vec(string).into() + } + + #[inline] + fn from_raw_vec(string: Vec<u8>) -> Result<Self> { + OsString::from_raw_vec(string).map(Into::into) + } + + #[inline] + fn into_raw_vec(self) -> Vec<u8> { + self.into_os_string().into_raw_vec() + } +} + +mod private { + use std::ffi::OsStr; + use std::ffi::OsString; + use std::path::Path; + use std::path::PathBuf; + + if_raw_str! { + use std::borrow::Cow; + + use super::RawOsStr; + } + + pub trait Sealed {} + + impl Sealed for char {} + impl Sealed for OsStr {} + impl Sealed for OsString {} + impl Sealed for Path {} + impl Sealed for PathBuf {} + impl Sealed for &str {} + impl Sealed for &String {} + + if_raw_str! 
{ + impl Sealed for Cow<'_, RawOsStr> {} + } +} diff --git a/third_party/rust/os_str_bytes/src/pattern.rs b/third_party/rust/os_str_bytes/src/pattern.rs new file mode 100644 index 0000000000..11f86bf31d --- /dev/null +++ b/third_party/rust/os_str_bytes/src/pattern.rs @@ -0,0 +1,71 @@ +use std::fmt::Debug; + +use super::private; + +pub trait Encoded { + fn __get(&self) -> &[u8]; +} + +#[derive(Clone, Debug)] +pub struct EncodedChar { + buffer: [u8; 4], + length: usize, +} + +impl Encoded for EncodedChar { + fn __get(&self) -> &[u8] { + &self.buffer[..self.length] + } +} + +impl Encoded for &str { + fn __get(&self) -> &[u8] { + self.as_bytes() + } +} + +/// Allows a type to be used for searching by [`RawOsStr`] and [`RawOsString`]. +/// +/// This trait is very similar to [`str::pattern::Pattern`], but its methods +/// are private and it is implemented for different types. +/// +/// [`RawOsStr`]: super::RawOsStr +/// [`RawOsString`]: super::RawOsString +/// [`str::pattern::Pattern`]: ::std::str::pattern::Pattern +#[cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "raw_os_str")))] +pub trait Pattern: private::Sealed { + #[doc(hidden)] + type __Encoded: Clone + Debug + Encoded; + + #[doc(hidden)] + fn __encode(self) -> Self::__Encoded; +} + +impl Pattern for char { + type __Encoded = EncodedChar; + + fn __encode(self) -> Self::__Encoded { + let mut encoded = EncodedChar { + buffer: [0; 4], + length: 0, + }; + encoded.length = self.encode_utf8(&mut encoded.buffer).len(); + encoded + } +} + +impl Pattern for &str { + type __Encoded = Self; + + fn __encode(self) -> Self::__Encoded { + self + } +} + +impl<'a> Pattern for &'a String { + type __Encoded = <&'a str as Pattern>::__Encoded; + + fn __encode(self) -> Self::__Encoded { + (**self).__encode() + } +} diff --git a/third_party/rust/os_str_bytes/src/raw_str.rs b/third_party/rust/os_str_bytes/src/raw_str.rs new file mode 100644 index 0000000000..659b34d9cb --- /dev/null +++ b/third_party/rust/os_str_bytes/src/raw_str.rs @@ 
-0,0 +1,1547 @@ +use std::borrow::Borrow; +use std::borrow::Cow; +use std::borrow::ToOwned; +use std::ffi::OsStr; +use std::ffi::OsString; +use std::fmt; +use std::fmt::Debug; +use std::fmt::Display; +use std::fmt::Formatter; +use std::mem; +use std::ops::Deref; +use std::ops::Index; +use std::ops::Range; +use std::ops::RangeFrom; +use std::ops::RangeFull; +use std::ops::RangeInclusive; +use std::ops::RangeTo; +use std::ops::RangeToInclusive; +use std::result; +use std::str; + +#[cfg(feature = "memchr")] +use memchr::memmem::find; +#[cfg(feature = "memchr")] +use memchr::memmem::rfind; + +use super::imp; +use super::imp::raw; +use super::iter::Split; +use super::pattern::Encoded as EncodedPattern; +use super::private; +use super::Pattern; + +if_checked_conversions! { + use super::EncodingError; + use super::Result; +} + +#[cfg(not(feature = "memchr"))] +fn find(string: &[u8], pat: &[u8]) -> Option<usize> { + (0..=string.len().checked_sub(pat.len())?) + .find(|&x| string[x..].starts_with(pat)) +} + +#[cfg(not(feature = "memchr"))] +fn rfind(string: &[u8], pat: &[u8]) -> Option<usize> { + (pat.len()..=string.len()) + .rfind(|&x| string[..x].ends_with(pat)) + .map(|x| x - pat.len()) +} + +#[allow(clippy::missing_safety_doc)] +unsafe trait TransmuteBox { + fn transmute_box<R>(self: Box<Self>) -> Box<R> + where + R: ?Sized + TransmuteBox, + { + let value = Box::into_raw(self); + // SAFETY: This trait is only implemented for types that can be + // transmuted. + unsafe { Box::from_raw(mem::transmute_copy(&value)) } + } +} + +// SAFETY: This struct has a layout that makes this operation safe. +unsafe impl TransmuteBox for RawOsStr {} +unsafe impl TransmuteBox for [u8] {} + +/// A container for borrowed byte strings converted by this crate. +/// +/// This wrapper is intended to prevent violating the invariants of the +/// [unspecified encoding] used by this crate and minimize encoding +/// conversions. 
+/// +/// # Indices +/// +/// Methods of this struct that accept indices require that the index lie on a +/// UTF-8 boundary. Although it is possible to manipulate platform strings +/// based on other indices, this crate currently does not support them for +/// slicing methods. They would add significant complication to the +/// implementation and are generally not necessary. However, all indices +/// returned by this struct can be used for slicing. +/// +/// On Unix, all indices are permitted, to avoid false positives. However, +/// relying on this implementation detail is discouraged. Platform-specific +/// indices are error-prone. +/// +/// # Complexity +/// +/// All searching methods have worst-case multiplicative time complexity (i.e., +/// `O(self.raw_len() * pat.len())`). Enabling the "memchr" feature allows +/// these methods to instead run in linear time in the worst case (documented +/// for [`memchr::memmem::find`][memchr complexity]). +/// +/// # Safety +/// +/// Although this type is annotated with `#[repr(transparent)]`, the inner +/// representation is not stable. Transmuting between this type and any other +/// causes immediate undefined behavior. +/// +/// [memchr complexity]: memchr::memmem::find#complexity +/// [unspecified encoding]: super#encoding +#[derive(Eq, Hash, Ord, PartialEq, PartialOrd)] +#[cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "raw_os_str")))] +#[repr(transparent)] +pub struct RawOsStr([u8]); + +impl RawOsStr { + const fn from_inner(string: &[u8]) -> &Self { + // SAFETY: This struct has a layout that makes this operation safe. + unsafe { mem::transmute(string) } + } + + /// Converts a platform-native string into a representation that can be + /// more easily manipulated. + /// + /// This method performs the necessary conversion immediately, so it can be + /// expensive to call. 
It is recommended to continue using the returned + /// instance as long as possible (instead of the original [`OsStr`]), to + /// avoid repeated conversions. + /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsStr; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// println!("{:?}", RawOsStr::new(&os_string)); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + #[inline] + #[must_use] + pub fn new(string: &OsStr) -> Cow<'_, Self> { + match imp::os_str_to_bytes(string) { + Cow::Borrowed(string) => Cow::Borrowed(Self::from_inner(string)), + Cow::Owned(string) => Cow::Owned(RawOsString(string)), + } + } + + /// Wraps a string, without copying or encoding conversion. + /// + /// This method is much more efficient than [`RawOsStr::new`], since the + /// [encoding] used by this crate is compatible with UTF-8. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let string = "foobar"; + /// let raw = RawOsStr::from_str(string); + /// assert_eq!(string, raw); + /// ``` + /// + /// [encoding]: super#encoding + #[allow(clippy::should_implement_trait)] + #[inline] + #[must_use] + pub fn from_str(string: &str) -> &Self { + Self::from_inner(string.as_bytes()) + } + + /// Wraps a byte string, without copying or encoding conversion. + /// + /// # Panics + /// + /// Panics if the string is not valid for the [unspecified encoding] used + /// by this crate. 
+ /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsStr; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// let raw = RawOsStr::new(&os_string); + /// let raw_bytes = raw.as_raw_bytes(); + /// assert_eq!(&*raw, RawOsStr::assert_from_raw_bytes(raw_bytes)); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + /// + /// [unspecified encoding]: super#encoding + #[inline] + #[must_use = "method should not be used for validation"] + #[track_caller] + pub fn assert_from_raw_bytes(string: &[u8]) -> &Self { + expect_encoded!(raw::validate_bytes(string)); + + Self::from_inner(string) + } + + if_checked_conversions! { + /// Wraps a byte string, without copying or encoding conversion. + /// + /// [`assert_from_raw_bytes`] should almost always be used instead. For + /// more information, see [`EncodingError`]. + /// + /// # Errors + /// + /// See documentation for [`EncodingError`]. + /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsStr; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// let raw = RawOsStr::new(&os_string); + /// assert_eq!(Ok(&*raw), RawOsStr::from_raw_bytes(raw.as_raw_bytes())); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + /// + /// [`assert_from_raw_bytes`]: Self::assert_from_raw_bytes + #[cfg_attr( + os_str_bytes_docs_rs, + doc(cfg(feature = "checked_conversions")) + )] + #[inline] + pub fn from_raw_bytes(string: &[u8]) -> Result<&Self> { + raw::validate_bytes(string) + .map(|()| Self::from_inner(string)) + .map_err(EncodingError) + } + } + + /// Wraps a byte string, without copying or encoding conversion. + /// + /// # Safety + /// + /// The string must be valid for the [unspecified encoding] used by this + /// crate. 
+ /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsStr; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// let raw = RawOsStr::new(&os_string); + /// let raw_bytes = raw.as_raw_bytes(); + /// assert_eq!(&*raw, unsafe { + /// RawOsStr::from_raw_bytes_unchecked(raw_bytes) + /// }); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + /// + /// [unspecified encoding]: super#encoding + #[inline] + #[must_use] + #[track_caller] + pub unsafe fn from_raw_bytes_unchecked(string: &[u8]) -> &Self { + if cfg!(debug_assertions) { + expect_encoded!(raw::validate_bytes(string)); + } + + Self::from_inner(string) + } + + /// Returns the byte string stored by this container. + /// + /// The returned string will use an [unspecified encoding]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let string = "foobar"; + /// let raw = RawOsStr::from_str(string); + /// assert_eq!(string.as_bytes(), raw.as_raw_bytes()); + /// ``` + /// + /// [unspecified encoding]: super#encoding + #[inline] + #[must_use] + pub fn as_raw_bytes(&self) -> &[u8] { + &self.0 + } + + /// Equivalent to [`str::contains`]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert!(raw.contains("oo")); + /// assert!(!raw.contains("of")); + /// ``` + #[inline] + #[must_use] + pub fn contains<P>(&self, pat: P) -> bool + where + P: Pattern, + { + self.find(pat).is_some() + } + + /// Equivalent to [`str::ends_with`]. 
+ /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert!(raw.ends_with("bar")); + /// assert!(!raw.ends_with("foo")); + /// ``` + #[inline] + #[must_use] + pub fn ends_with<P>(&self, pat: P) -> bool + where + P: Pattern, + { + let pat = pat.__encode(); + let pat = pat.__get(); + + self.0.ends_with(pat) + } + + /// Equivalent to [`str::ends_with`] but accepts this type for the pattern. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert!(raw.ends_with_os(RawOsStr::from_str("bar"))); + /// assert!(!raw.ends_with_os(RawOsStr::from_str("foo"))); + /// ``` + #[inline] + #[must_use] + pub fn ends_with_os(&self, pat: &Self) -> bool { + raw::ends_with(&self.0, &pat.0) + } + + /// Equivalent to [`str::find`]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert_eq!(Some(1), raw.find("o")); + /// assert_eq!(None, raw.find("of")); + /// ``` + #[inline] + #[must_use] + pub fn find<P>(&self, pat: P) -> Option<usize> + where + P: Pattern, + { + let pat = pat.__encode(); + let pat = pat.__get(); + + find(&self.0, pat) + } + + /// Equivalent to [`str::is_empty`]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// assert!(RawOsStr::from_str("").is_empty()); + /// assert!(!RawOsStr::from_str("foobar").is_empty()); + /// ``` + #[inline] + #[must_use] + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Returns the length of the byte string stored by this container. + /// + /// Only the following assumptions can be made about the result: + /// - The length of any Unicode character is the length of its UTF-8 + /// representation (i.e., [`char::len_utf8`]). + /// - Splitting a string at a UTF-8 boundary will return two strings with + /// lengths that sum to the length of the original string. 
+ /// + /// This method may return a different result than would [`OsStr::len`] + /// when called on same string, since [`OsStr`] uses an unspecified + /// encoding. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// assert_eq!(6, RawOsStr::from_str("foobar").raw_len()); + /// assert_eq!(0, RawOsStr::from_str("").raw_len()); + /// ``` + #[inline] + #[must_use] + pub fn raw_len(&self) -> usize { + self.0.len() + } + + /// Equivalent to [`str::rfind`]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert_eq!(Some(2), raw.rfind("o")); + /// assert_eq!(None, raw.rfind("of")); + /// ``` + #[inline] + #[must_use] + pub fn rfind<P>(&self, pat: P) -> Option<usize> + where + P: Pattern, + { + let pat = pat.__encode(); + let pat = pat.__get(); + + rfind(&self.0, pat) + } + + fn split_once_raw_with<P, F>( + &self, + pat: &P, + find_fn: F, + ) -> Option<(&Self, &Self)> + where + F: FnOnce(&[u8], &[u8]) -> Option<usize>, + P: EncodedPattern, + { + let pat = pat.__get(); + + let index = find_fn(&self.0, pat)?; + let prefix = &self.0[..index]; + let suffix = &self.0[index + pat.len()..]; + Some((Self::from_inner(prefix), Self::from_inner(suffix))) + } + + pub(super) fn rsplit_once_raw<P>(&self, pat: &P) -> Option<(&Self, &Self)> + where + P: EncodedPattern, + { + self.split_once_raw_with(pat, rfind) + } + + /// Equivalent to [`str::rsplit_once`]. 
+ /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert_eq!( + /// Some((RawOsStr::from_str("fo"), RawOsStr::from_str("bar"))), + /// raw.rsplit_once("o"), + /// ); + /// assert_eq!(None, raw.rsplit_once("of")); + /// ``` + #[inline] + #[must_use] + pub fn rsplit_once<P>(&self, pat: P) -> Option<(&Self, &Self)> + where + P: Pattern, + { + self.rsplit_once_raw(&pat.__encode()) + } + + // https://github.com/rust-lang/rust/blob/49c68bd53f90e375bfb3cbba8c1c67a9e0adb9c0/src/libcore/str/mod.rs#L2184-L2221 + #[cold] + #[inline(never)] + #[track_caller] + fn index_boundary_error(&self, index: usize) -> ! { + debug_assert!(raw::is_continuation(self.0[index])); + + let start = expect_encoded!(self.0[..index] + .iter() + .rposition(|&x| !raw::is_continuation(x))); + let mut end = index + 1; + end += self.0[end..] + .iter() + .take_while(|&&x| raw::is_continuation(x)) + .count(); + let code_point = raw::decode_code_point(&self.0[start..end]); + panic!( + "byte index {} is not a valid boundary; it is inside U+{:04X} \ + (bytes {}..{})", + index, code_point, start, end, + ); + } + + #[track_caller] + fn check_bound(&self, index: usize) { + if let Some(&byte) = self.0.get(index) { + if raw::is_continuation(byte) { + self.index_boundary_error(index); + } + } + } + + /// Equivalent to [`str::split`], but empty patterns are not accepted. + /// + /// # Panics + /// + /// Panics if the pattern is empty. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert_eq!(["f", "", "bar"], *raw.split("o").collect::<Vec<_>>()); + /// ``` + #[inline] + #[must_use] + #[track_caller] + pub fn split<P>(&self, pat: P) -> Split<'_, P> + where + P: Pattern, + { + Split::new(self, pat) + } + + /// Equivalent to [`str::split_at`]. + /// + /// # Panics + /// + /// Panics if the index is not a [valid boundary]. 
+ /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert_eq!( + /// ((RawOsStr::from_str("fo"), RawOsStr::from_str("obar"))), + /// raw.split_at(2), + /// ); + /// ``` + /// + /// [valid boundary]: #indices + #[inline] + #[must_use] + #[track_caller] + pub fn split_at(&self, mid: usize) -> (&Self, &Self) { + self.check_bound(mid); + + let (prefix, suffix) = self.0.split_at(mid); + (Self::from_inner(prefix), Self::from_inner(suffix)) + } + + pub(super) fn split_once_raw<P>(&self, pat: &P) -> Option<(&Self, &Self)> + where + P: EncodedPattern, + { + self.split_once_raw_with(pat, find) + } + + /// Equivalent to [`str::split_once`]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert_eq!( + /// Some((RawOsStr::from_str("f"), RawOsStr::from_str("obar"))), + /// raw.split_once("o"), + /// ); + /// assert_eq!(None, raw.split_once("of")); + /// ``` + #[inline] + #[must_use] + pub fn split_once<P>(&self, pat: P) -> Option<(&Self, &Self)> + where + P: Pattern, + { + self.split_once_raw(&pat.__encode()) + } + + /// Equivalent to [`str::starts_with`]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert!(raw.starts_with("foo")); + /// assert!(!raw.starts_with("bar")); + /// ``` + #[inline] + #[must_use] + pub fn starts_with<P>(&self, pat: P) -> bool + where + P: Pattern, + { + let pat = pat.__encode(); + let pat = pat.__get(); + + self.0.starts_with(pat) + } + + /// Equivalent to [`str::starts_with`] but accepts this type for the + /// pattern. 
+ /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert!(raw.starts_with_os(RawOsStr::from_str("foo"))); + /// assert!(!raw.starts_with_os(RawOsStr::from_str("bar"))); + /// ``` + #[inline] + #[must_use] + pub fn starts_with_os(&self, pat: &Self) -> bool { + raw::starts_with(&self.0, &pat.0) + } + + /// Equivalent to [`str::strip_prefix`]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("111foo1bar111"); + /// assert_eq!( + /// Some(RawOsStr::from_str("11foo1bar111")), + /// raw.strip_prefix("1"), + /// ); + /// assert_eq!(None, raw.strip_prefix("o")); + /// ``` + #[inline] + #[must_use] + pub fn strip_prefix<P>(&self, pat: P) -> Option<&Self> + where + P: Pattern, + { + let pat = pat.__encode(); + let pat = pat.__get(); + + self.0.strip_prefix(pat).map(Self::from_inner) + } + + /// Equivalent to [`str::strip_suffix`]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("111foo1bar111"); + /// assert_eq!( + /// Some(RawOsStr::from_str("111foo1bar11")), + /// raw.strip_suffix("1"), + /// ); + /// assert_eq!(None, raw.strip_suffix("o")); + /// ``` + #[inline] + #[must_use] + pub fn strip_suffix<P>(&self, pat: P) -> Option<&Self> + where + P: Pattern, + { + let pat = pat.__encode(); + let pat = pat.__get(); + + self.0.strip_suffix(pat).map(Self::from_inner) + } + + /// Converts this representation back to a platform-native string. + /// + /// When possible, use [`RawOsStrCow::into_os_str`] for a more efficient + /// conversion on some platforms. 
+ /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsStr; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// let raw = RawOsStr::new(&os_string); + /// assert_eq!(os_string, raw.to_os_str()); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + #[inline] + #[must_use] + pub fn to_os_str(&self) -> Cow<'_, OsStr> { + expect_encoded!(imp::os_str_from_bytes(&self.0)) + } + + /// Equivalent to [`OsStr::to_str`]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let string = "foobar"; + /// let raw = RawOsStr::from_str(string); + /// assert_eq!(Some(string), raw.to_str()); + /// ``` + #[inline] + #[must_use] + pub fn to_str(&self) -> Option<&str> { + str::from_utf8(&self.0).ok() + } + + /// Converts this string to the best UTF-8 representation possible. + /// + /// Invalid sequences will be replaced with + /// [`char::REPLACEMENT_CHARACTER`]. + /// + /// This method may return a different result than would + /// [`OsStr::to_string_lossy`] when called on same string, since [`OsStr`] + /// uses an unspecified encoding. 
+ /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsStr; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// let raw = RawOsStr::new(&os_string); + /// println!("{}", raw.to_str_lossy()); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + #[inline] + #[must_use] + pub fn to_str_lossy(&self) -> Cow<'_, str> { + String::from_utf8_lossy(&self.0) + } + + fn trim_matches_raw_with<P, F>(&self, pat: &P, strip_fn: F) -> &Self + where + F: for<'a> Fn(&'a [u8], &[u8]) -> Option<&'a [u8]>, + P: EncodedPattern, + { + let pat = pat.__get(); + if pat.is_empty() { + return self; + } + + let mut string = &self.0; + while let Some(substring) = strip_fn(string, pat) { + string = substring; + } + Self::from_inner(string) + } + + fn trim_end_matches_raw<P>(&self, pat: &P) -> &Self + where + P: EncodedPattern, + { + self.trim_matches_raw_with(pat, <[_]>::strip_suffix) + } + + /// Equivalent to [`str::trim_end_matches`]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("111foo1bar111"); + /// assert_eq!("111foo1bar", raw.trim_end_matches("1")); + /// assert_eq!("111foo1bar111", raw.trim_end_matches("o")); + /// ``` + #[inline] + #[must_use] + pub fn trim_end_matches<P>(&self, pat: P) -> &Self + where + P: Pattern, + { + self.trim_end_matches_raw(&pat.__encode()) + } + + /// Equivalent to [`str::trim_matches`]. 
+ /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("111foo1bar111"); + /// assert_eq!("foo1bar", raw.trim_matches("1")); + /// assert_eq!("111foo1bar111", raw.trim_matches("o")); + /// ``` + #[inline] + #[must_use] + pub fn trim_matches<P>(&self, pat: P) -> &Self + where + P: Pattern, + { + let pat = pat.__encode(); + self.trim_start_matches_raw(&pat).trim_end_matches_raw(&pat) + } + + fn trim_start_matches_raw<P>(&self, pat: &P) -> &Self + where + P: EncodedPattern, + { + self.trim_matches_raw_with(pat, <[_]>::strip_prefix) + } + + /// Equivalent to [`str::trim_start_matches`]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("111foo1bar111"); + /// assert_eq!("foo1bar111", raw.trim_start_matches("1")); + /// assert_eq!("111foo1bar111", raw.trim_start_matches("o")); + /// ``` + #[inline] + #[must_use] + pub fn trim_start_matches<P>(&self, pat: P) -> &Self + where + P: Pattern, + { + self.trim_start_matches_raw(&pat.__encode()) + } +} + +impl AsRef<Self> for RawOsStr { + #[inline] + fn as_ref(&self) -> &Self { + self + } +} + +impl AsRef<RawOsStr> for str { + #[inline] + fn as_ref(&self) -> &RawOsStr { + RawOsStr::from_str(self) + } +} + +impl AsRef<RawOsStr> for String { + #[inline] + fn as_ref(&self) -> &RawOsStr { + (**self).as_ref() + } +} + +impl Default for &RawOsStr { + #[inline] + fn default() -> Self { + RawOsStr::from_str("") + } +} + +impl<'a> From<&'a RawOsStr> for Cow<'a, RawOsStr> { + #[inline] + fn from(value: &'a RawOsStr) -> Self { + Cow::Borrowed(value) + } +} + +impl From<Box<str>> for Box<RawOsStr> { + #[inline] + fn from(value: Box<str>) -> Self { + value.into_boxed_bytes().transmute_box() + } +} + +impl ToOwned for RawOsStr { + type Owned = RawOsString; + + #[inline] + fn to_owned(&self) -> Self::Owned { + RawOsString(self.0.to_owned()) + } +} + +/// Extensions to [`Cow<RawOsStr>`] for additional conversions. 
+/// +/// [`Cow<RawOsStr>`]: Cow +#[cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "raw_os_str")))] +pub trait RawOsStrCow<'a>: private::Sealed { + /// Converts this representation back to a platform-native string. + /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsStr; + /// use os_str_bytes::RawOsStrCow; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// let raw = RawOsStr::new(&os_string); + /// assert_eq!(os_string, raw.into_os_str()); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + #[must_use] + fn into_os_str(self) -> Cow<'a, OsStr>; + + /// Returns the byte string stored by this container. + /// + /// The returned string will use an [unspecified encoding]. + /// + /// # Examples + /// + /// ``` + /// use std::borrow::Cow; + /// + /// use os_str_bytes::RawOsStr; + /// use os_str_bytes::RawOsStrCow; + /// + /// let string = "foobar"; + /// let raw = Cow::Borrowed(RawOsStr::from_str(string)); + /// assert_eq!(string.as_bytes(), &*raw.into_raw_bytes()); + /// ``` + /// + /// [unspecified encoding]: super#encoding + #[must_use] + fn into_raw_bytes(self) -> Cow<'a, [u8]>; +} + +impl<'a> RawOsStrCow<'a> for Cow<'a, RawOsStr> { + #[inline] + fn into_os_str(self) -> Cow<'a, OsStr> { + match self { + Cow::Borrowed(string) => string.to_os_str(), + Cow::Owned(string) => Cow::Owned(string.into_os_string()), + } + } + + #[inline] + fn into_raw_bytes(self) -> Cow<'a, [u8]> { + match self { + Cow::Borrowed(string) => Cow::Borrowed(&string.0), + Cow::Owned(string) => Cow::Owned(string.0), + } + } +} + +/// A container for owned byte strings converted by this crate. +/// +/// For more information, see [`RawOsStr`]. 
+#[derive(Clone, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "raw_os_str")))] +pub struct RawOsString(Vec<u8>); + +impl RawOsString { + /// Converts a platform-native string into a representation that can be + /// more easily manipulated. + /// + /// For more information, see [`RawOsStr::new`]. + /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsString; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// println!("{:?}", RawOsString::new(os_string)); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + #[inline] + #[must_use] + pub fn new(string: OsString) -> Self { + Self(imp::os_string_into_vec(string)) + } + + /// Wraps a string, without copying or encoding conversion. + /// + /// This method is much more efficient than [`RawOsString::new`], since the + /// [encoding] used by this crate is compatible with UTF-8. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsString; + /// + /// let string = "foobar".to_owned(); + /// let raw = RawOsString::from_string(string.clone()); + /// assert_eq!(string, raw); + /// ``` + /// + /// [encoding]: super#encoding + #[inline] + #[must_use] + pub fn from_string(string: String) -> Self { + Self(string.into_bytes()) + } + + /// Wraps a byte string, without copying or encoding conversion. + /// + /// # Panics + /// + /// Panics if the string is not valid for the [unspecified encoding] used + /// by this crate. 
+ /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsString; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// let raw = RawOsString::new(os_string); + /// let raw_bytes = raw.clone().into_raw_vec(); + /// assert_eq!(raw, RawOsString::assert_from_raw_vec(raw_bytes)); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + /// + /// [unspecified encoding]: super#encoding + #[inline] + #[must_use = "method should not be used for validation"] + #[track_caller] + pub fn assert_from_raw_vec(string: Vec<u8>) -> Self { + expect_encoded!(raw::validate_bytes(&string)); + + Self(string) + } + + if_checked_conversions! { + /// Wraps a byte string, without copying or encoding conversion. + /// + /// [`assert_from_raw_vec`] should almost always be used instead. For + /// more information, see [`EncodingError`]. + /// + /// # Errors + /// + /// See documentation for [`EncodingError`]. + /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsString; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// let raw = RawOsString::new(os_string); + /// let raw_clone = raw.clone(); + /// assert_eq!(Ok(raw), RawOsString::from_raw_vec(raw_clone.into_raw_vec())); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + /// + /// [`assert_from_raw_vec`]: Self::assert_from_raw_vec + #[cfg_attr( + os_str_bytes_docs_rs, + doc(cfg(feature = "checked_conversions")) + )] + #[inline] + pub fn from_raw_vec(string: Vec<u8>) -> Result<Self> { + raw::validate_bytes(&string) + .map(|()| Self(string)) + .map_err(EncodingError) + } + } + + /// Wraps a byte string, without copying or encoding conversion. + /// + /// # Safety + /// + /// The string must be valid for the [unspecified encoding] used by this + /// crate. 
+ /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsString; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// let raw = RawOsString::new(os_string); + /// let raw_bytes = raw.clone().into_raw_vec(); + /// assert_eq!(raw, unsafe { + /// RawOsString::from_raw_vec_unchecked(raw_bytes) + /// }); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + /// + /// [unspecified encoding]: super#encoding + #[inline] + #[must_use] + #[track_caller] + pub unsafe fn from_raw_vec_unchecked(string: Vec<u8>) -> Self { + if cfg!(debug_assertions) { + expect_encoded!(raw::validate_bytes(&string)); + } + + Self(string) + } + + /// Equivalent to [`String::clear`]. + /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsString; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// let mut raw = RawOsString::new(os_string); + /// raw.clear(); + /// assert!(raw.is_empty()); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + #[inline] + pub fn clear(&mut self) { + self.0.clear(); + } + + /// Equivalent to [`String::into_boxed_str`]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsString; + /// + /// let string = "foobar".to_owned(); + /// let raw = RawOsString::from_string(string.clone()); + /// assert_eq!(string, *raw.into_box()); + /// ``` + #[inline] + #[must_use] + pub fn into_box(self) -> Box<RawOsStr> { + self.0.into_boxed_slice().transmute_box() + } + + /// Converts this representation back to a platform-native string. 
+ /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsString; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// let raw = RawOsString::new(os_string.clone()); + /// assert_eq!(os_string, raw.into_os_string()); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + #[inline] + #[must_use] + pub fn into_os_string(self) -> OsString { + expect_encoded!(imp::os_string_from_vec(self.0)) + } + + /// Returns the byte string stored by this container. + /// + /// The returned string will use an [unspecified encoding]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsString; + /// + /// let string = "foobar".to_owned(); + /// let raw = RawOsString::from_string(string.clone()); + /// assert_eq!(string.into_bytes(), raw.into_raw_vec()); + /// ``` + /// + /// [unspecified encoding]: super#encoding + #[inline] + #[must_use] + pub fn into_raw_vec(self) -> Vec<u8> { + self.0 + } + + /// Equivalent to [`OsString::into_string`]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsString; + /// + /// let string = "foobar".to_owned(); + /// let raw = RawOsString::from_string(string.clone()); + /// assert_eq!(Ok(string), raw.into_string()); + /// ``` + #[inline] + pub fn into_string(self) -> result::Result<String, Self> { + String::from_utf8(self.0).map_err(|x| Self(x.into_bytes())) + } + + /// Equivalent to [`String::shrink_to_fit`]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsString; + /// + /// let string = "foobar".to_owned(); + /// let mut raw = RawOsString::from_string(string.clone()); + /// raw.shrink_to_fit(); + /// assert_eq!(string, raw); + /// ``` + #[inline] + pub fn shrink_to_fit(&mut self) { + self.0.shrink_to_fit(); + } + + /// Equivalent to [`String::split_off`]. + /// + /// # Panics + /// + /// Panics if the index is not a [valid boundary]. 
+ /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsString; + /// + /// let mut raw = RawOsString::from_string("foobar".to_owned()); + /// assert_eq!("bar", raw.split_off(3)); + /// assert_eq!("foo", raw); + /// ``` + /// + /// [valid boundary]: RawOsStr#indices + #[inline] + #[must_use] + #[track_caller] + pub fn split_off(&mut self, at: usize) -> Self { + self.check_bound(at); + + Self(self.0.split_off(at)) + } + + /// Equivalent to [`String::truncate`]. + /// + /// # Panics + /// + /// Panics if the index is not a [valid boundary]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsString; + /// + /// let mut raw = RawOsString::from_string("foobar".to_owned()); + /// raw.truncate(3); + /// assert_eq!("foo", raw); + /// ``` + /// + /// [valid boundary]: RawOsStr#indices + #[inline] + #[track_caller] + pub fn truncate(&mut self, new_len: usize) { + self.check_bound(new_len); + + self.0.truncate(new_len); + } +} + +impl AsRef<RawOsStr> for RawOsString { + #[inline] + fn as_ref(&self) -> &RawOsStr { + self + } +} + +impl Borrow<RawOsStr> for RawOsString { + #[inline] + fn borrow(&self) -> &RawOsStr { + self + } +} + +impl Deref for RawOsString { + type Target = RawOsStr; + + #[inline] + fn deref(&self) -> &Self::Target { + RawOsStr::from_inner(&self.0) + } +} + +impl From<RawOsString> for Box<RawOsStr> { + #[inline] + fn from(value: RawOsString) -> Self { + value.into_box() + } +} + +impl From<Box<RawOsStr>> for RawOsString { + #[inline] + fn from(value: Box<RawOsStr>) -> Self { + Self(value.transmute_box::<[_]>().into_vec()) + } +} + +impl From<RawOsString> for Cow<'_, RawOsStr> { + #[inline] + fn from(value: RawOsString) -> Self { + Cow::Owned(value) + } +} + +impl From<String> for RawOsString { + #[inline] + fn from(value: String) -> Self { + Self::from_string(value) + } +} + +struct DebugBuffer<'a>(&'a [u8]); + +impl Debug for DebugBuffer<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.write_str("\"")?; + + let 
mut string = self.0; + let mut invalid_length = 0; + while !string.is_empty() { + let (invalid, substring) = string.split_at(invalid_length); + + let valid = match str::from_utf8(substring) { + Ok(valid) => { + string = &[]; + valid + } + Err(error) => { + let (valid, substring) = + substring.split_at(error.valid_up_to()); + + let invalid_char_length = + error.error_len().unwrap_or_else(|| substring.len()); + if valid.is_empty() { + invalid_length += invalid_char_length; + continue; + } + string = substring; + invalid_length = invalid_char_length; + + // SAFETY: This slice was validated to be UTF-8. + unsafe { str::from_utf8_unchecked(valid) } + } + }; + + raw::debug(invalid, f)?; + Display::fmt(&valid.escape_debug(), f)?; + } + + f.write_str("\"") + } +} + +macro_rules! r#impl { + ( $type:ty ) => { + impl Debug for $type { + #[inline] + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_tuple(stringify!($type)) + .field(&DebugBuffer(&self.0)) + .finish() + } + } + }; +} +r#impl!(RawOsStr); +r#impl!(RawOsString); + +macro_rules! r#impl { + ( $index_type:ty $(, $index_var:ident , $($bound:expr),+)? ) => { + impl Index<$index_type> for RawOsStr { + type Output = Self; + + #[inline] + fn index(&self, idx: $index_type) -> &Self::Output { + $( + let $index_var = &idx; + $(self.check_bound($bound);)+ + )? + + Self::from_inner(&self.0[idx]) + } + } + + impl Index<$index_type> for RawOsString { + type Output = RawOsStr; + + #[allow(clippy::indexing_slicing)] + #[inline] + fn index(&self, idx: $index_type) -> &Self::Output { + &(**self)[idx] + } + } + }; +} +r#impl!(Range<usize>, x, x.start, x.end); +r#impl!(RangeFrom<usize>, x, x.start); +r#impl!(RangeFull); +// [usize::MAX] will always be a valid inclusive end index. +#[rustfmt::skip] +r#impl!(RangeInclusive<usize>, x, *x.start(), x.end().wrapping_add(1)); +r#impl!(RangeTo<usize>, x, x.end); +r#impl!(RangeToInclusive<usize>, x, x.end.wrapping_add(1)); + +macro_rules! 
r#impl { + ( $type:ty , $other_type:ty ) => { + impl PartialEq<$other_type> for $type { + #[inline] + fn eq(&self, other: &$other_type) -> bool { + let raw: &RawOsStr = self; + let other: &RawOsStr = other.as_ref(); + raw == other + } + } + + impl PartialEq<$type> for $other_type { + #[inline] + fn eq(&self, other: &$type) -> bool { + other == self + } + } + }; +} +r#impl!(RawOsStr, RawOsString); +r#impl!(&RawOsStr, RawOsString); +r#impl!(RawOsStr, str); +r#impl!(RawOsStr, String); +r#impl!(&RawOsStr, String); +r#impl!(RawOsString, str); +r#impl!(RawOsString, &str); +r#impl!(RawOsString, String); + +#[cfg(feature = "print_bytes")] +#[cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "print_bytes")))] +mod print_bytes { + use print_bytes::ByteStr; + use print_bytes::ToBytes; + #[cfg(windows)] + use print_bytes::WideStr; + + #[cfg(windows)] + use crate::imp::raw; + + use super::RawOsStr; + use super::RawOsString; + + impl ToBytes for RawOsStr { + #[inline] + fn to_bytes(&self) -> ByteStr<'_> { + self.0.to_bytes() + } + + #[cfg(windows)] + #[inline] + fn to_wide(&self) -> Option<WideStr> { + Some(WideStr::new(raw::encode_wide_unchecked(&self.0).collect())) + } + } + + impl ToBytes for RawOsString { + #[inline] + fn to_bytes(&self) -> ByteStr<'_> { + (**self).to_bytes() + } + + #[cfg(windows)] + #[inline] + fn to_wide(&self) -> Option<WideStr> { + (**self).to_wide() + } + } +} + +#[cfg(feature = "uniquote")] +#[cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "uniquote")))] +mod uniquote { + use uniquote::Formatter; + use uniquote::Quote; + use uniquote::Result; + + use crate::imp::raw; + + use super::RawOsStr; + use super::RawOsString; + + impl Quote for RawOsStr { + #[inline] + fn escape(&self, f: &mut Formatter<'_>) -> Result { + raw::uniquote::escape(&self.0, f) + } + } + + impl Quote for RawOsString { + #[inline] + fn escape(&self, f: &mut Formatter<'_>) -> Result { + (**self).escape(f) + } + } +} diff --git a/third_party/rust/os_str_bytes/src/util.rs 
b/third_party/rust/os_str_bytes/src/util.rs new file mode 100644 index 0000000000..f931969c52 --- /dev/null +++ b/third_party/rust/os_str_bytes/src/util.rs @@ -0,0 +1,9 @@ +pub(super) const BYTE_SHIFT: u8 = 6; + +pub(super) const CONT_MASK: u8 = (1 << BYTE_SHIFT) - 1; + +pub(super) const CONT_TAG: u8 = 0b1000_0000; + +pub(super) const fn is_continuation(byte: u8) -> bool { + byte & !CONT_MASK == CONT_TAG +} diff --git a/third_party/rust/os_str_bytes/src/wasm/mod.rs b/third_party/rust/os_str_bytes/src/wasm/mod.rs new file mode 100644 index 0000000000..a8a2996018 --- /dev/null +++ b/third_party/rust/os_str_bytes/src/wasm/mod.rs @@ -0,0 +1,58 @@ +use std::borrow::Cow; +use std::error::Error; +use std::ffi::OsStr; +use std::ffi::OsString; +use std::fmt; +use std::fmt::Display; +use std::fmt::Formatter; +use std::result; +use std::str; +use std::str::Utf8Error; + +if_raw_str! { + pub(super) mod raw; +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(super) struct EncodingError(Utf8Error); + +impl Display for EncodingError { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "os_str_bytes: {}", self.0) + } +} + +impl Error for EncodingError {} + +type Result<T> = result::Result<T, EncodingError>; + +macro_rules! 
expect_utf8 { + ( $result:expr ) => { + $result.expect( + "platform string contains invalid UTF-8, which should not be \ + possible", + ) + }; +} + +fn from_bytes(string: &[u8]) -> Result<&str> { + str::from_utf8(string).map_err(EncodingError) +} + +pub(super) fn os_str_from_bytes(string: &[u8]) -> Result<Cow<'_, OsStr>> { + from_bytes(string).map(|x| Cow::Borrowed(OsStr::new(x))) +} + +pub(super) fn os_str_to_bytes(os_string: &OsStr) -> Cow<'_, [u8]> { + Cow::Borrowed(expect_utf8!(os_string.to_str()).as_bytes()) +} + +pub(super) fn os_string_from_vec(string: Vec<u8>) -> Result<OsString> { + String::from_utf8(string) + .map(Into::into) + .map_err(|x| EncodingError(x.utf8_error())) +} + +pub(super) fn os_string_into_vec(os_string: OsString) -> Vec<u8> { + expect_utf8!(os_string.into_string()).into_bytes() +} diff --git a/third_party/rust/os_str_bytes/src/wasm/raw.rs b/third_party/rust/os_str_bytes/src/wasm/raw.rs new file mode 100644 index 0000000000..fb291a65fa --- /dev/null +++ b/third_party/rust/os_str_bytes/src/wasm/raw.rs @@ -0,0 +1,34 @@ +use std::fmt; +use std::fmt::Formatter; +use std::str; + +pub(crate) use crate::util::is_continuation; + +use super::Result; + +#[allow(dead_code)] +#[path = "../common/raw.rs"] +mod common_raw; +pub(crate) use common_raw::ends_with; +pub(crate) use common_raw::starts_with; +#[cfg(feature = "uniquote")] +pub(crate) use common_raw::uniquote; + +pub(crate) fn validate_bytes(string: &[u8]) -> Result<()> { + super::from_bytes(string).map(drop) +} + +pub(crate) fn decode_code_point(string: &[u8]) -> u32 { + let string = expect_encoded!(str::from_utf8(string)); + let mut chars = string.chars(); + let ch = chars + .next() + .expect("cannot parse code point from empty string"); + assert_eq!(None, chars.next(), "multiple code points found"); + ch.into() +} + +pub(crate) fn debug(string: &[u8], _: &mut Formatter<'_>) -> fmt::Result { + assert!(string.is_empty()); + Ok(()) +} diff --git a/third_party/rust/os_str_bytes/src/windows/mod.rs 
b/third_party/rust/os_str_bytes/src/windows/mod.rs new file mode 100644 index 0000000000..ed9e60b050 --- /dev/null +++ b/third_party/rust/os_str_bytes/src/windows/mod.rs @@ -0,0 +1,113 @@ +// These functions are necessarily inefficient, because they must revert +// encoding conversions performed by the standard library. However, there is +// currently no better alternative. + +use std::borrow::Cow; +use std::error::Error; +use std::ffi::OsStr; +use std::ffi::OsString; +use std::fmt; +use std::fmt::Display; +use std::fmt::Formatter; +use std::ops::Not; +use std::os::windows::ffi::OsStrExt; +use std::os::windows::ffi::OsStringExt; +use std::result; +use std::str; + +if_raw_str! { + pub(super) mod raw; +} + +mod wtf8; +use wtf8::DecodeWide; + +#[cfg(test)] +mod tests; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(super) enum EncodingError { + Byte(u8), + CodePoint(u32), + End(), +} + +impl EncodingError { + fn position(&self) -> Cow<'_, str> { + match self { + Self::Byte(byte) => Cow::Owned(format!("byte b'\\x{:02X}'", byte)), + Self::CodePoint(code_point) => { + Cow::Owned(format!("code point U+{:04X}", code_point)) + } + Self::End() => Cow::Borrowed("end of string"), + } + } +} + +impl Display for EncodingError { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!( + f, + "byte sequence is not representable in the platform encoding; \ + error at {}", + self.position(), + ) + } +} + +impl Error for EncodingError {} + +type Result<T> = result::Result<T, EncodingError>; + +fn from_bytes(string: &[u8]) -> Result<Option<OsString>> { + let mut encoder = wtf8::encode_wide(string); + + // Collecting an iterator into a result ignores the size hint: + // https://github.com/rust-lang/rust/issues/48994 + let mut encoded_string = Vec::with_capacity(encoder.size_hint().0); + for wchar in &mut encoder { + encoded_string.push(wchar?); + } + + debug_assert_eq!(str::from_utf8(string).is_ok(), encoder.is_still_utf8()); + Ok(encoder + .is_still_utf8() + .not() + 
.then(|| OsStringExt::from_wide(&encoded_string))) +} + +fn to_bytes(os_string: &OsStr) -> Vec<u8> { + let encoder = OsStrExt::encode_wide(os_string); + + let mut string = Vec::with_capacity(encoder.size_hint().0); + string.extend(DecodeWide::new(encoder)); + string +} + +pub(super) fn os_str_from_bytes(string: &[u8]) -> Result<Cow<'_, OsStr>> { + from_bytes(string).map(|os_string| { + os_string.map(Cow::Owned).unwrap_or_else(|| { + // SAFETY: This slice was validated to be UTF-8. + Cow::Borrowed(OsStr::new(unsafe { + str::from_utf8_unchecked(string) + })) + }) + }) +} + +pub(super) fn os_str_to_bytes(os_string: &OsStr) -> Cow<'_, [u8]> { + Cow::Owned(to_bytes(os_string)) +} + +pub(super) fn os_string_from_vec(string: Vec<u8>) -> Result<OsString> { + from_bytes(&string).map(|os_string| { + os_string.unwrap_or_else(|| { + // SAFETY: This slice was validated to be UTF-8. + unsafe { String::from_utf8_unchecked(string) }.into() + }) + }) +} + +pub(super) fn os_string_into_vec(os_string: OsString) -> Vec<u8> { + to_bytes(&os_string) +} diff --git a/third_party/rust/os_str_bytes/src/windows/raw.rs b/third_party/rust/os_str_bytes/src/windows/raw.rs new file mode 100644 index 0000000000..80953dea79 --- /dev/null +++ b/third_party/rust/os_str_bytes/src/windows/raw.rs @@ -0,0 +1,46 @@ +use std::fmt; +use std::fmt::Formatter; + +pub(crate) use crate::util::is_continuation; + +use super::wtf8; +pub(crate) use super::wtf8::ends_with; +pub(crate) use super::wtf8::starts_with; +use super::wtf8::CodePoints; +use super::Result; + +pub(crate) fn validate_bytes(string: &[u8]) -> Result<()> { + wtf8::encode_wide(string).try_for_each(|x| x.map(drop)) +} + +pub(crate) fn encode_wide_unchecked( + string: &[u8], +) -> impl '_ + Iterator<Item = u16> { + wtf8::encode_wide(string).map(|x| expect_encoded!(x)) +} + +pub(crate) fn decode_code_point(string: &[u8]) -> u32 { + let mut code_points = CodePoints::new(string.iter().copied()); + let code_point = expect_encoded!(code_points + .next() + 
.expect("cannot parse code point from empty string")); + assert_eq!(None, code_points.next(), "multiple code points found"); + code_point +} + +pub(crate) fn debug(string: &[u8], f: &mut Formatter<'_>) -> fmt::Result { + for wchar in encode_wide_unchecked(string) { + write!(f, "\\u{{{:X}}}", wchar)?; + } + Ok(()) +} + +#[cfg(feature = "uniquote")] +pub(crate) mod uniquote { + use uniquote::Formatter; + use uniquote::Result; + + pub(crate) fn escape(string: &[u8], f: &mut Formatter<'_>) -> Result { + f.escape_utf16(super::encode_wide_unchecked(string)) + } +} diff --git a/third_party/rust/os_str_bytes/src/windows/wtf8/code_points.rs b/third_party/rust/os_str_bytes/src/windows/wtf8/code_points.rs new file mode 100644 index 0000000000..9800d781fc --- /dev/null +++ b/third_party/rust/os_str_bytes/src/windows/wtf8/code_points.rs @@ -0,0 +1,129 @@ +use std::iter::FusedIterator; +use std::iter::Peekable; +use std::mem; + +use crate::util::is_continuation; +use crate::util::BYTE_SHIFT; +use crate::util::CONT_MASK; + +use super::EncodingError; +use super::Result; + +pub(in super::super) struct CodePoints<I> +where + I: Iterator<Item = u8>, +{ + iter: Peekable<I>, + surrogate: bool, + still_utf8: bool, +} + +impl<I> CodePoints<I> +where + I: Iterator<Item = u8>, +{ + pub(in super::super) fn new<S>(string: S) -> Self + where + S: IntoIterator<IntoIter = I>, + { + Self { + iter: string.into_iter().peekable(), + surrogate: false, + still_utf8: true, + } + } + + pub(super) fn is_still_utf8(&self) -> bool { + self.still_utf8 + } + + fn consume_next(&mut self, code_point: &mut u32) -> Result<()> { + let &byte = self.iter.peek().ok_or(EncodingError::End())?; + + if !is_continuation(byte) { + self.surrogate = false; + // Not consuming this byte will be useful if this crate ever offers + // a way to encode lossily. 
+ return Err(EncodingError::Byte(byte)); + } + *code_point = + (*code_point << BYTE_SHIFT) | u32::from(byte & CONT_MASK); + + let removed = self.iter.next(); + debug_assert_eq!(Some(byte), removed); + + Ok(()) + } + + pub(super) fn inner_size_hint(&self) -> (usize, Option<usize>) { + self.iter.size_hint() + } +} + +impl<I> FusedIterator for CodePoints<I> where + I: FusedIterator + Iterator<Item = u8> +{ +} + +impl<I> Iterator for CodePoints<I> +where + I: Iterator<Item = u8>, +{ + type Item = Result<u32>; + + fn next(&mut self) -> Option<Self::Item> { + let byte = self.iter.next()?; + let mut code_point: u32 = byte.into(); + + macro_rules! consume_next { + () => {{ + if let Err(error) = self.consume_next(&mut code_point) { + return Some(Err(error)); + } + }}; + } + + let prev_surrogate = mem::replace(&mut self.surrogate, false); + + let mut invalid = false; + if !byte.is_ascii() { + if byte < 0xC2 { + return Some(Err(EncodingError::Byte(byte))); + } + + if byte < 0xE0 { + code_point &= 0x1F; + } else { + code_point &= 0x0F; + consume_next!(); + + if byte >= 0xF0 { + if code_point.wrapping_sub(0x10) >= 0x100 { + invalid = true; + } + consume_next!(); + + // This condition is optimized to detect surrogate code points. + } else if code_point & 0xFE0 == 0x360 { + self.still_utf8 = false; + if code_point & 0x10 == 0 { + self.surrogate = true; + } else if prev_surrogate { + // Decoding a broken surrogate pair would be lossy. 
+ invalid = true; + } + } + + if code_point < 0x20 { + invalid = true; + } + } + consume_next!(); + } + if invalid { + return Some(Err(EncodingError::CodePoint(code_point))); + } + + Some(Ok(code_point)) + } +} diff --git a/third_party/rust/os_str_bytes/src/windows/wtf8/convert.rs b/third_party/rust/os_str_bytes/src/windows/wtf8/convert.rs new file mode 100644 index 0000000000..70a8a9f58c --- /dev/null +++ b/third_party/rust/os_str_bytes/src/windows/wtf8/convert.rs @@ -0,0 +1,181 @@ +use std::char; +use std::char::DecodeUtf16; +use std::iter::FusedIterator; +use std::num::NonZeroU16; + +use crate::util::BYTE_SHIFT; +use crate::util::CONT_MASK; +use crate::util::CONT_TAG; + +use super::CodePoints; +use super::Result; + +const MIN_HIGH_SURROGATE: u16 = 0xD800; + +const MIN_LOW_SURROGATE: u16 = 0xDC00; + +const MIN_SURROGATE_CODE: u32 = (u16::MAX as u32) + 1; + +macro_rules! static_assert { + ( $condition:expr ) => { + const _: () = assert!($condition, "static assertion failed"); + }; +} + +pub(in super::super) struct DecodeWide<I> +where + I: Iterator<Item = u16>, +{ + iter: DecodeUtf16<I>, + code_point: u32, + shifts: u8, +} + +impl<I> DecodeWide<I> +where + I: Iterator<Item = u16>, +{ + pub(in super::super) fn new<S>(string: S) -> Self + where + S: IntoIterator<IntoIter = I, Item = I::Item>, + { + Self { + iter: char::decode_utf16(string), + code_point: 0, + shifts: 0, + } + } + + #[inline(always)] + fn get_raw_byte(&self) -> u8 { + (self.code_point >> (self.shifts * BYTE_SHIFT)) as u8 + } +} + +impl<I> Iterator for DecodeWide<I> +where + I: Iterator<Item = u16>, +{ + type Item = u8; + + fn next(&mut self) -> Option<Self::Item> { + if let Some(shifts) = self.shifts.checked_sub(1) { + self.shifts = shifts; + return Some((self.get_raw_byte() & CONT_MASK) | CONT_TAG); + } + + self.code_point = self + .iter + .next()? + .map(Into::into) + .unwrap_or_else(|x| x.unpaired_surrogate().into()); + + macro_rules! 
decode { + ( $tag:expr ) => { + Some(self.get_raw_byte() | $tag) + }; + } + macro_rules! try_decode { + ( $tag:expr , $upper_bound:expr ) => { + if self.code_point < $upper_bound { + return decode!($tag); + } + self.shifts += 1; + }; + } + try_decode!(0, 0x80); + try_decode!(0xC0, 0x800); + try_decode!(0xE0, MIN_SURROGATE_CODE); + decode!(0xF0) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + let (low, high) = self.iter.size_hint(); + let shifts = self.shifts.into(); + ( + low.saturating_add(shifts), + high.and_then(|x| x.checked_mul(4)) + .and_then(|x| x.checked_add(shifts)), + ) + } +} + +pub(in super::super) struct EncodeWide<I> +where + I: Iterator<Item = u8>, +{ + iter: CodePoints<I>, + surrogate: Option<NonZeroU16>, +} + +impl<I> EncodeWide<I> +where + I: Iterator<Item = u8>, +{ + fn new<S>(string: S) -> Self + where + S: IntoIterator<IntoIter = I>, + { + Self { + iter: CodePoints::new(string), + surrogate: None, + } + } + + pub(in super::super) fn is_still_utf8(&self) -> bool { + self.iter.is_still_utf8() + } +} + +impl<I> FusedIterator for EncodeWide<I> where + I: FusedIterator + Iterator<Item = u8> +{ +} + +impl<I> Iterator for EncodeWide<I> +where + I: Iterator<Item = u8>, +{ + type Item = Result<u16>; + + fn next(&mut self) -> Option<Self::Item> { + if let Some(surrogate) = self.surrogate.take() { + return Some(Ok(surrogate.get())); + } + + self.iter.next().map(|code_point| { + code_point.map(|code_point| { + code_point + .checked_sub(MIN_SURROGATE_CODE) + .map(|offset| { + static_assert!(MIN_LOW_SURROGATE != 0); + + // SAFETY: The above static assertion guarantees that + // this value will not be zero. 
+ self.surrogate = Some(unsafe { + NonZeroU16::new_unchecked( + (offset & 0x3FF) as u16 | MIN_LOW_SURROGATE, + ) + }); + (offset >> 10) as u16 | MIN_HIGH_SURROGATE + }) + .unwrap_or(code_point as u16) + }) + }) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + let (low, high) = self.iter.inner_size_hint(); + let additional = self.surrogate.is_some().into(); + ( + (low.saturating_add(2) / 3).saturating_add(additional), + high.and_then(|x| x.checked_add(additional)), + ) + } +} + +pub(in super::super) fn encode_wide( + string: &[u8], +) -> EncodeWide<impl '_ + Iterator<Item = u8>> { + EncodeWide::new(string.iter().copied()) +} diff --git a/third_party/rust/os_str_bytes/src/windows/wtf8/mod.rs b/third_party/rust/os_str_bytes/src/windows/wtf8/mod.rs new file mode 100644 index 0000000000..d8b0dc4a7f --- /dev/null +++ b/third_party/rust/os_str_bytes/src/windows/wtf8/mod.rs @@ -0,0 +1,18 @@ +// This module implements the WTF-8 encoding specification: +// https://simonsapin.github.io/wtf-8/ + +use super::EncodingError; +use super::Result; + +mod code_points; +pub(super) use code_points::CodePoints; + +mod convert; +pub(super) use convert::encode_wide; +pub(super) use convert::DecodeWide; + +if_raw_str! 
{ + mod string; + pub(crate) use string::ends_with; + pub(crate) use string::starts_with; +} diff --git a/third_party/rust/os_str_bytes/src/windows/wtf8/string.rs b/third_party/rust/os_str_bytes/src/windows/wtf8/string.rs new file mode 100644 index 0000000000..b3523a2eff --- /dev/null +++ b/third_party/rust/os_str_bytes/src/windows/wtf8/string.rs @@ -0,0 +1,67 @@ +use crate::util; + +const SURROGATE_LENGTH: usize = 3; + +pub(crate) fn ends_with(string: &[u8], mut suffix: &[u8]) -> bool { + let index = if let Some(index) = string.len().checked_sub(suffix.len()) { + index + } else { + return false; + }; + if let Some(&byte) = string.get(index) { + if util::is_continuation(byte) { + let index = expect_encoded!(index.checked_sub(1)); + let mut wide_surrogate = + if let Some(surrogate) = suffix.get(..SURROGATE_LENGTH) { + super::encode_wide(surrogate) + } else { + return false; + }; + let surrogate_wchar = wide_surrogate + .next() + .expect("failed decoding non-empty suffix"); + + if wide_surrogate.next().is_some() + || super::encode_wide(&string[index..]) + .take_while(Result::is_ok) + .nth(1) + != Some(surrogate_wchar) + { + return false; + } + suffix = &suffix[SURROGATE_LENGTH..]; + } + } + string.ends_with(suffix) +} + +pub(crate) fn starts_with(string: &[u8], mut prefix: &[u8]) -> bool { + if let Some(&byte) = string.get(prefix.len()) { + if util::is_continuation(byte) { + let index = if let Some(index) = + prefix.len().checked_sub(SURROGATE_LENGTH) + { + index + } else { + return false; + }; + let (substring, surrogate) = prefix.split_at(index); + let mut wide_surrogate = super::encode_wide(surrogate); + let surrogate_wchar = wide_surrogate + .next() + .expect("failed decoding non-empty prefix"); + + if surrogate_wchar.is_err() + || wide_surrogate.next().is_some() + || super::encode_wide(&string[index..]) + .next() + .expect("failed decoding non-empty substring") + != surrogate_wchar + { + return false; + } + prefix = substring; + } + } + 
string.starts_with(prefix) +} |