diff options
Diffstat (limited to 'vendor/gix-path/src')
-rw-r--r-- | vendor/gix-path/src/convert.rs | 273 | ||||
-rw-r--r-- | vendor/gix-path/src/lib.rs | 68 | ||||
-rw-r--r-- | vendor/gix-path/src/realpath.rs | 90 | ||||
-rw-r--r-- | vendor/gix-path/src/spec.rs | 53 | ||||
-rw-r--r-- | vendor/gix-path/src/util.rs | 8 |
5 files changed, 492 insertions, 0 deletions
diff --git a/vendor/gix-path/src/convert.rs b/vendor/gix-path/src/convert.rs new file mode 100644 index 000000000..6a949529f --- /dev/null +++ b/vendor/gix-path/src/convert.rs @@ -0,0 +1,273 @@ +use std::{ + borrow::Cow, + ffi::{OsStr, OsString}, + path::{Path, PathBuf}, +}; + +use bstr::{BStr, BString}; + +#[derive(Debug)] +/// The error type returned by [`try_into_bstr()`] and other fallible conversions from or to bytes. +pub struct Utf8Error; + +impl std::fmt::Display for Utf8Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str("Could not convert to UTF8 or from UTF8 due to ill-formed input") + } +} + +impl std::error::Error for Utf8Error {} + +/// Like [`try_into_bstr()`], but takes `OsStr` as input for a lossless, but fallible, conversion. +pub fn os_str_into_bstr(path: &OsStr) -> Result<&BStr, Utf8Error> { + let path = try_into_bstr(Cow::Borrowed(path.as_ref()))?; + match path { + Cow::Borrowed(path) => Ok(path), + Cow::Owned(_) => unreachable!("borrowed cows stay borrowed"), + } +} + +/// Like [`try_into_bstr()`], but takes `OsString` as input for a lossless, but fallible, conversion. +pub fn os_string_into_bstring(path: OsString) -> Result<BString, Utf8Error> { + let path = try_into_bstr(Cow::Owned(path.into()))?; + // `try_into_bstr()` maps an owned input to an owned output, which is why the borrowed arm below is unreachable. + match path { + Cow::Borrowed(_path) => unreachable!("borrowed cows stay borrowed"), + Cow::Owned(path) => Ok(path), + } +} + +/// Convert the given path either into its raw bytes on unix and wasi, or into its UTF-8 encoded counterpart on all other platforms like windows. +/// +/// On windows, if the source Path contains ill-formed, lone surrogates, the UTF-8 conversion will fail +/// causing `Utf8Error` to be returned. 
+pub fn try_into_bstr<'a>(path: impl Into<Cow<'a, Path>>) -> Result<Cow<'a, BStr>, Utf8Error> { + let path = path.into(); + let path_str = match path { + Cow::Owned(path) => Cow::Owned({ + #[cfg(unix)] + let p: BString = { + use std::os::unix::ffi::OsStringExt; + path.into_os_string().into_vec().into() + }; + #[cfg(target_os = "wasi")] + let p: BString = { + use std::os::wasi::ffi::OsStringExt; + path.into_os_string().into_vec().into() + }; + #[cfg(not(any(unix, target_os = "wasi")))] + let p: BString = path.into_os_string().into_string().map_err(|_| Utf8Error)?.into(); + p + }), + Cow::Borrowed(path) => Cow::Borrowed({ + #[cfg(unix)] + let p: &BStr = { + use std::os::unix::ffi::OsStrExt; + path.as_os_str().as_bytes().into() + }; + #[cfg(target_os = "wasi")] + let p: &BStr = { + use std::os::wasi::ffi::OsStrExt; + path.as_os_str().as_bytes().into() + }; + #[cfg(not(any(unix, target_os = "wasi")))] + let p: &BStr = path.to_str().ok_or(Utf8Error)?.as_bytes().into(); + p + }), + }; + Ok(path_str) +} + +/// Similar to [`try_into_bstr()`] but **panics** if malformed surrogates are encountered on windows. +pub fn into_bstr<'a>(path: impl Into<Cow<'a, Path>>) -> Cow<'a, BStr> { + try_into_bstr(path).expect("prefix path doesn't contain ill-formed UTF-8") +} + +/// Given `input` bytes, produce a `Path` from them ignoring encoding entirely if on unix. +/// +/// On windows, the input is required to be valid UTF-8, which is guaranteed if we wrote it before. There are some potential +/// git versions and windows installations which produce malformed UTF-16 if certain emojis are in the path. It's as rare as +/// it sounds, but possible. 
+pub fn try_from_byte_slice(input: &[u8]) -> Result<&Path, Utf8Error> { + #[cfg(unix)] + let p = { + use std::os::unix::ffi::OsStrExt; + OsStr::from_bytes(input).as_ref() + }; + #[cfg(target_os = "wasi")] + let p: &Path = { + use std::os::wasi::ffi::OsStrExt; + OsStr::from_bytes(input).as_ref() + }; + #[cfg(not(any(unix, target_os = "wasi")))] + let p = Path::new(std::str::from_utf8(input).map_err(|_| Utf8Error)?); + Ok(p) +} + +/// Similar to [`try_from_byte_slice()`], but takes either borrowed or owned `input`. +pub fn try_from_bstr<'a>(input: impl Into<Cow<'a, BStr>>) -> Result<Cow<'a, Path>, Utf8Error> { + let input = input.into(); + match input { + Cow::Borrowed(input) => try_from_byte_slice(input).map(Cow::Borrowed), + Cow::Owned(input) => try_from_bstring(input).map(Cow::Owned), + } +} + +/// Similar to [`try_from_bstr()`], but **panics** if the `input` is not valid UTF-8 on windows. +pub fn from_bstr<'a>(input: impl Into<Cow<'a, BStr>>) -> Cow<'a, Path> { + try_from_bstr(input).expect("prefix path doesn't contain ill-formed UTF-8") +} + +/// Similar to [`try_from_bstr()`], but takes and produces owned data. +pub fn try_from_bstring(input: impl Into<BString>) -> Result<PathBuf, Utf8Error> { + let input = input.into(); + #[cfg(unix)] + let p = { + use std::os::unix::ffi::OsStringExt; + std::ffi::OsString::from_vec(input.into()).into() + }; + #[cfg(target_os = "wasi")] + let p: PathBuf = { + use std::os::wasi::ffi::OsStringExt; + std::ffi::OsString::from_vec(input.into()).into() + }; + #[cfg(not(any(unix, target_os = "wasi")))] + let p = { + use bstr::ByteVec; + PathBuf::from( + { + let v: Vec<_> = input.into(); + v + } + .into_string() + .map_err(|_| Utf8Error)?, + ) + }; + Ok(p) +} + +/// Similar to [`try_from_bstring()`], but will **panic** if there is ill-formed UTF-8 in the `input`, which can only happen on platforms other than unix and wasi. 
+pub fn from_bstring(input: impl Into<BString>) -> PathBuf { + try_from_bstring(input).expect("well-formed UTF-8 on windows") +} + +/// Similar to [`try_from_byte_slice()`], but will **panic** if there is ill-formed UTF-8 in the `input`, which can only happen on platforms other than unix and wasi. +pub fn from_byte_slice(input: &[u8]) -> &Path { + try_from_byte_slice(input).expect("well-formed UTF-8 on windows") +} + +/// Replace all occurrences of the `find` byte in `path` with the `replace` byte. +/// +/// Owned input is modified in place, borrowed input is only copied if it actually contains `find`. +fn replace<'a>(path: impl Into<Cow<'a, BStr>>, find: u8, replace: u8) -> Cow<'a, BStr> { + let path = path.into(); + match path { + Cow::Owned(mut path) => { + for b in path.iter_mut().filter(|b| **b == find) { + *b = replace; + } + path.into() + } + Cow::Borrowed(path) => { + if !path.contains(&find) { + // nothing to replace, hence no need to allocate + return path.into(); + } + let mut path = path.to_owned(); + for b in path.iter_mut().filter(|b| **b == find) { + *b = replace; + } + path.into() + } + } +} + +/// Assures the given bytes use the native path separator, which is the backslash on windows and the slash everywhere else. +pub fn to_native_separators<'a>(path: impl Into<Cow<'a, BStr>>) -> Cow<'a, BStr> { + #[cfg(not(windows))] + let p = to_unix_separators(path); + #[cfg(windows)] + let p = to_windows_separators(path); + p +} + +/// Convert slashes to backslashes on windows and leave separators untouched elsewhere before turning `path` into a `Path`, +/// but **panics** if `path` is not valid UTF-8 on windows. +pub fn to_native_path_on_windows<'a>(path: impl Into<Cow<'a, BStr>>) -> Cow<'a, std::path::Path> { + #[cfg(not(windows))] + { + crate::from_bstr(path) + } + #[cfg(windows)] + { + crate::from_bstr(to_windows_separators(path)) + } +} + +/// Replaces windows path separators with slashes, but only does so on windows. +pub fn to_unix_separators_on_windows<'a>(path: impl Into<Cow<'a, BStr>>) -> Cow<'a, BStr> { + #[cfg(windows)] + { + replace(path, b'\\', b'/') + } + #[cfg(not(windows))] + { + path.into() + } +} + +/// Replaces windows path separators with slashes, unconditionally. +/// +/// **Note** Do not use these and prefer the conditional versions of this method. 
+// TODO: use https://lib.rs/crates/path-slash to handle escapes +pub fn to_unix_separators<'a>(path: impl Into<Cow<'a, BStr>>) -> Cow<'a, BStr> { + replace(path, b'\\', b'/') +} + +/// Find slashes and replace them with backslashes, which typically resembles a windows path, unconditionally. +/// +/// **Note** Do not use these and prefer the conditional versions of this method. +// TODO: use https://lib.rs/crates/path-slash to handle escapes +pub fn to_windows_separators<'a>(path: impl Into<Cow<'a, BStr>>) -> Cow<'a, BStr> { + replace(path, b'/', b'\\') +} + +/// Resolve relative components virtually without accessing the file system, e.g. turn `a/./b/c/.././..` into `a`, +/// without keeping intermediate `..` and `/a/../b/..` becomes `/`. +/// If the input path was relative and ends up being the `current_dir`, `.` is returned instead of the full path to `current_dir`. +/// A path without any `..` component is returned unchanged. +/// +/// This is particularly useful when manipulating paths that are based on user input, and not resolving intermediate +/// symlinks keeps the path similar to what the user provided. If that's not desirable, use [`realpath()`][crate::realpath()] +/// instead. +/// +/// Note that we might access the `current_dir` if we run out of path components to pop off, which is expected to be absolute +/// as is typical for the return value of `std::env::current_dir()`. The `current_dir` is consulted at most once. +/// As a `current_dir` like `/c` can be exhausted by paths like `../../r`, `None` will be returned to indicate the inability +/// to produce a logically consistent path. 
+pub fn normalize<'a>(path: impl Into<Cow<'a, Path>>, current_dir: impl AsRef<Path>) -> Option<Cow<'a, Path>> { + use std::path::Component::ParentDir; + + let path = path.into(); + if !path.components().any(|c| matches!(c, ParentDir)) { + return Some(path); + } + let current_dir = current_dir.as_ref(); + let mut current_dir_opt = Some(current_dir); + let was_relative = path.is_relative(); + let components = path.components(); + let mut path = PathBuf::new(); + for component in components { + if let ParentDir = component { + let path_was_dot = path == Path::new("."); + if path.as_os_str().is_empty() || path_was_dot { + path.push(current_dir_opt.take()?); + } + if !path.pop() { + return None; + } + } else { + path.push(component) + } + } + + if (path.as_os_str().is_empty() || path == current_dir) && was_relative { + Cow::Borrowed(Path::new(".")) + } else { + path.into() + } + .into() +} diff --git a/vendor/gix-path/src/lib.rs b/vendor/gix-path/src/lib.rs new file mode 100644 index 000000000..70a9bc53f --- /dev/null +++ b/vendor/gix-path/src/lib.rs @@ -0,0 +1,68 @@ +//! This crate contains an assortment of utilities to deal with paths and their conversions. +//! +//! Generally `git` treats paths as bytes, but inherently assumes well-formed UTF-8 as encoding on windows. Internally, it expects +//! slashes to be used as path separators and paths in files must have slashes, with conversions being performed on windows accordingly. +//! +//! <details> +//! +//! ### Research +//! +//! * **windows** +//! - [`dirent.c`](https://github.com/git/git/blob/main/compat/win32/dirent.c#L31:L31) contains all implementation (seemingly) of opening directories and reading their entries, along with all path conversions (UTF-16 for windows). This is done on the fly so git can work with paths [in UTF-8](https://github.com/git/git/blob/main/compat/win32/dirent.c#L12:L12). +//! 
- mingw [is used for the conversion](https://github.com/git/git/blob/main/compat/mingw.h#L579:L579) and it appears they handle surrogates during the conversion, maybe some sort of non-strict UTF-8 converter? Actually it uses [WideCharToMultiByte](https://docs.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-widechartomultibyte) +//! under the hood which by now does fail if the UTF-8 would be invalid unicode, i.e. unicode pairs. +//! - `OsString` on windows already stores strings as WTF-8, which supports [surrogate pairs](https://unicodebook.readthedocs.io/unicode_encodings.html), +//! something that UTF-8 isn't allowed to do for security reasons, after all it's UTF-16 specific and exists only to extend +//! the encodable code-points. +//! - informative reading on [WTF-8](https://simonsapin.github.io/wtf-8/#motivation) which is the encoding used by Rust +//! internally that deals with surrogates and ill-formed surrogates (those that aren't in pairs). +//! * **unix** +//! - It uses [opendir](https://man7.org/linux/man-pages/man3/opendir.3.html) and [readdir](https://man7.org/linux/man-pages/man3/readdir.3.html) +//! respectively. There is no encoding specified, except that these paths are null-terminated. +//! +//! ### Learnings +//! +//! Surrogate pairs are a way to extend the encodable value range in UTF-16 encodings, used primarily on windows and in JavaScript. +//! For a long time these codepoints used for surrogates, always to be used in pairs, were not assigned, until…they were for rare +//! emojis and the like. The unicode standard does not require surrogates to happen in pairs, even though by now unpaired surrogates +//! in UTF-16 are considered ill-formed, which aren't supposed to be converted to UTF-8 for example. +//! +//! This is the reason we have to deal with `to_string_lossy()`, it's _just_ for that quirk. +//! +//! 
This also means the only platform ever eligible to see conversion errors is windows, and there it's only older pre-Vista +//! windows versions which incorrectly allow ill-formed UTF-16 strings. Newer versions don't perform such conversions anymore, for +//! example when going from UTF-16 to UTF-8, they will trigger an error. +//! +//! ### Conclusions +//! +//! Since [WideCharToMultiByte](https://docs.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-widechartomultibyte) by now is +//! fixed (Vista onward) to produce valid UTF-8, lone surrogate codepoints will cause failure, which `git` +//! [doesn't care about](https://github.com/git/git/blob/main/compat/win32/dirent.c#L12:L12). +//! +//! We will, though, which means from now on we can just convert to UTF-8 on windows and bubble up errors where necessary, +//! preventing potentially mismatched surrogate pairs from ever being saved on disk by gitoxide. +//! +//! Even though the error only exists on older windows versions, we will represent it in the type system through fallible function calls. +//! Callers may `.expect()` on the result to indicate they don't wish to handle this special and rare case. Note that servers should not +//! ever get into a code-path which does panic though. +//! </details> +#![deny(missing_docs, rust_2018_idioms)] +#![forbid(unsafe_code)] + +/// A dummy type to represent path specs and help finding all spots that take path specs once it is implemented. + +/// A preliminary version of a path-spec based on glances of the code. 
+#[derive(Clone, Debug)] +pub struct Spec(bstr::BString); + +mod convert; +pub use convert::*; + +mod util; +pub use util::is_absolute; + +mod spec; + +/// Resolve symbolic links in paths, along with the error type and the default symlink limit used for it. +pub mod realpath; +pub use realpath::function::{realpath, realpath_opts}; diff --git a/vendor/gix-path/src/realpath.rs b/vendor/gix-path/src/realpath.rs new file mode 100644 index 000000000..807cb74cc --- /dev/null +++ b/vendor/gix-path/src/realpath.rs @@ -0,0 +1,90 @@ +/// The error returned by [`realpath()`][super::realpath()]. +#[derive(Debug, thiserror::Error)] +#[allow(missing_docs)] +pub enum Error { + #[error("The maximum allowed number {} of symlinks in path is exceeded", .max_symlinks)] + MaxSymlinksExceeded { max_symlinks: u8 }, + #[error(transparent)] + ReadLink(std::io::Error), + #[error(transparent)] + CurrentWorkingDir(std::io::Error), + #[error("Empty is not a valid path")] + EmptyPath, + #[error("Ran out of path components while following parent component '..'")] + MissingParent, +} + +/// The default number of symlinks we may follow when resolving a path in [`realpath()`][crate::realpath()]. +pub const MAX_SYMLINKS: u8 = 32; + +pub(crate) mod function { + use std::path::{ + Component::{CurDir, Normal, ParentDir, Prefix, RootDir}, + Path, PathBuf, + }; + + use super::Error; + use crate::realpath::MAX_SYMLINKS; + + /// Check each component of `path` and see if it is a symlink. If so, resolve it. + /// Do not fail for non-existing components, but assume these are as is. + /// + /// If `path` is relative, the current working directory will be used to make it absolute. + /// At most [`MAX_SYMLINKS`] symbolic links are followed. + pub fn realpath(path: impl AsRef<Path>) -> Result<PathBuf, Error> { + // only a relative `path` needs the current working directory, an absolute one gets an empty placeholder instead + let cwd = path + .as_ref() + .is_relative() + .then(std::env::current_dir) + .unwrap_or_else(|| Ok(PathBuf::default())) + .map_err(Error::CurrentWorkingDir)?; + realpath_opts(path, cwd, MAX_SYMLINKS) + } + + /// The same as [`realpath()`], but allows to configure `cwd`, the directory a relative `path` is resolved against, and `max_symlinks`, the number of symbolic links we are going to follow at most. 
+ /// Limiting the number of followed symlinks serves to avoid running into cycles or doing unreasonable amounts of work. + pub fn realpath_opts(path: impl AsRef<Path>, cwd: impl AsRef<Path>, max_symlinks: u8) -> Result<PathBuf, Error> { + let path = path.as_ref(); + if path.as_os_str().is_empty() { + return Err(Error::EmptyPath); + } + + let mut real_path = PathBuf::new(); + // a relative `path` is resolved against `cwd` + if path.is_relative() { + real_path.push(cwd); + } + + let mut num_symlinks = 0; + // owns the path that `components` iterates over once a symlink was followed + let mut path_backing: PathBuf; + let mut components = path.components(); + while let Some(component) = components.next() { + match component { + part @ RootDir | part @ Prefix(_) => real_path.push(part), + CurDir => {} + ParentDir => { + if !real_path.pop() { + return Err(Error::MissingParent); + } + } + Normal(part) => { + real_path.push(part); + if real_path.is_symlink() { + num_symlinks += 1; + if num_symlinks > max_symlinks { + return Err(Error::MaxSymlinksExceeded { max_symlinks }); + } + let mut link_destination = std::fs::read_link(real_path.as_path()).map_err(Error::ReadLink)?; + if link_destination.is_absolute() { + // pushing absolute path to real_path resets it to the pushed absolute path + } else { + // a relative link target is relative to the directory containing the link, so drop the link's own name + assert!(real_path.pop(), "we just pushed a component"); + } + // keep resolving with the link target followed by the not yet visited components of the previous path + link_destination.extend(components); + path_backing = link_destination; + components = path_backing.components(); + } + } + } + } + Ok(real_path) + } +} diff --git a/vendor/gix-path/src/spec.rs b/vendor/gix-path/src/spec.rs new file mode 100644 index 000000000..0ff9e661c --- /dev/null +++ b/vendor/gix-path/src/spec.rs @@ -0,0 +1,53 @@ +use std::ffi::OsStr; + +use bstr::{BStr, ByteSlice, ByteVec}; + +use crate::Spec; + +impl std::convert::TryFrom<&OsStr> for Spec { + type Error = crate::Utf8Error; + + /// Convert `value` into a `Spec`, failing if `os_str_into_bstr()` fails and **panicking** if it does not pass `assert_valid_hack()`. + fn try_from(value: &OsStr) -> Result<Self, Self::Error> { + crate::os_str_into_bstr(value).map(|value| { + assert_valid_hack(value); + Spec(value.into()) + }) + } +} + +/// Panic if `input` contains `/../` or `/./`, or if it starts with `../`, `./` or `/`. +fn assert_valid_hack(input: &BStr) { + assert!(!input.contains_str(b"/../")); + 
assert!(!input.contains_str(b"/./")); + assert!(!input.starts_with_str(b"../")); + assert!(!input.starts_with_str(b"./")); + assert!(!input.starts_with_str(b"/")); +} + +impl Spec { + /// Parse `input` into a `Spec`. + /// + /// Note that the current implementation always returns `Some` and **panics** if `input` does not pass `assert_valid_hack()`. + // TODO: tests, actual implementation probably via `gix-pathspec` to make use of the crate after all. + pub fn from_bytes(input: &BStr) -> Option<Self> { + assert_valid_hack(input); + Spec(input.into()).into() + } + /// Return all paths described by this path spec, using slashes on all platforms. + pub fn items(&self) -> impl Iterator<Item = &BStr> { + std::iter::once(self.0.as_bstr()) + } + /// Adjust this path specification according to the given `prefix`, which may be empty to indicate we are at the work-tree root. + /// + /// A non-empty `prefix` is prepended to the spec, separated by a slash. **Panics** if `prefix` cannot be converted by `into_bstr()`. + // TODO: this is a hack, needs test and time to do according to spec. This is just a minimum version to have -something-. + pub fn apply_prefix(&mut self, prefix: &std::path::Path) -> &Self { + // many more things we can't handle. `Path` never ends with trailing path separator. + let prefix = crate::into_bstr(prefix); + if !prefix.is_empty() { + let mut prefix = crate::to_unix_separators_on_windows(prefix); + { + let path = prefix.to_mut(); + path.push_byte(b'/'); + path.extend_from_slice(&self.0); + } + self.0 = prefix.into_owned(); + } + self + } +} diff --git a/vendor/gix-path/src/util.rs b/vendor/gix-path/src/util.rs new file mode 100644 index 000000000..7920910d7 --- /dev/null +++ b/vendor/gix-path/src/util.rs @@ -0,0 +1,8 @@ +use std::path::Path; + +/// Return true if `path` is absolute, which depends on the platform but is always true if it starts with a `slash`, hence looks like +/// a linux path. The leading slash is only detected if `path` is valid UTF-8. +pub fn is_absolute(path: impl AsRef<Path>) -> bool { + let path = path.as_ref(); + path.is_absolute() || path.to_str().and_then(|s| s.chars().next()) == Some('/') +} |