summaryrefslogtreecommitdiffstats
path: root/vendor/gix-diff/src/blob/pipeline.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/gix-diff/src/blob/pipeline.rs')
-rw-r--r--vendor/gix-diff/src/blob/pipeline.rs538
1 files changed, 538 insertions, 0 deletions
diff --git a/vendor/gix-diff/src/blob/pipeline.rs b/vendor/gix-diff/src/blob/pipeline.rs
new file mode 100644
index 000000000..58dddd90b
--- /dev/null
+++ b/vendor/gix-diff/src/blob/pipeline.rs
@@ -0,0 +1,538 @@
+use std::{
+ io::{Read, Write},
+ path::{Path, PathBuf},
+ process::{Command, Stdio},
+};
+
+use bstr::{BStr, ByteSlice};
+use gix_filter::{
+ driver::apply::{Delay, MaybeDelayed},
+ pipeline::convert::{ToGitOutcome, ToWorktreeOutcome},
+};
+use gix_object::tree::EntryKind;
+
+use crate::blob::{Driver, Pipeline, ResourceKind};
+
+/// A way to access roots for different kinds of resources that are possibly located and accessible in a worktree.
+#[derive(Clone, Debug, Default)]
+pub struct WorktreeRoots {
+ /// A place where the source of a rewrite, rename or copy, or generally the previous version of resources, are located.
+ pub old_root: Option<PathBuf>,
+ /// A place where the destination of a rewrite, rename or copy, or generally the new version of resources, are located.
+ pub new_root: Option<PathBuf>,
+}
+
+impl WorktreeRoots {
+ /// Return the root path for the given `kind`
+ pub fn by_kind(&self, kind: ResourceKind) -> Option<&Path> {
+ match kind {
+ ResourceKind::OldOrSource => self.old_root.as_deref(),
+ ResourceKind::NewOrDestination => self.new_root.as_deref(),
+ }
+ }
+}
+
+/// Data as part of an [Outcome].
+#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)]
+pub enum Data {
+ /// The data to use for diffing was written into the buffer that was passed during the call to [`Pipeline::convert_to_diffable()`].
+ Buffer,
+ /// The size that the binary blob had at the given revision, without having applied filters, as it's either
+ /// considered binary or above the big-file threshold.
+ ///
+ /// In this state, the binary file cannot be diffed.
+ Binary {
+ /// The size of the object prior to performing any filtering or as it was found on disk.
+ ///
+ /// Note that technically, the size isn't always representative of the same 'state' of the
+ /// content, as once it can be the size of the blob in git, and once it's the size of file
+ /// in the worktree.
+ size: u64,
+ },
+}
+
+/// The outcome returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()).
+#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)]
+pub struct Outcome {
+ /// If available, an index into the `drivers` field to access more diff-related information of the driver for items
+ /// at the given path, as previously determined by git-attributes.
+ ///
+ /// Note that drivers are queried even if there is no object available.
+ pub driver_index: Option<usize>,
+ /// The data itself, suitable for diffing, and if the object or worktree item is present at all.
+ pub data: Option<Data>,
+}
+
+/// Options for use in a [`Pipeline`].
+#[derive(Default, Clone, Copy, PartialEq, Eq, Debug, Hash, Ord, PartialOrd)]
+pub struct Options {
+ /// The amount of bytes that an object has to reach before being treated as binary.
+ /// These objects will not be queried, nor will their data be processed in any way.
+ /// If `0`, no file is ever considered binary due to their size.
+ ///
+ /// Note that for files stored in `git`, what counts is their stored, decompressed size,
+ /// thus `git-lfs` files would typically not be considered binary unless one explicitly sets
+ /// them
+ pub large_file_threshold_bytes: u64,
+ /// Capabilities of the file system which affect how we read worktree files.
+ pub fs: gix_fs::Capabilities,
+}
+
+/// The specific way to convert a resource.
+#[derive(Default, Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
+pub enum Mode {
+ /// Always prepare the version of the resource as it would be in the work-tree, and
+ /// apply binary-to-text filters if present.
+ ///
+ /// This is typically free for resources in the worktree, and will apply filters to resources in the
+ /// object database.
+ #[default]
+ ToWorktreeAndBinaryToText,
+ /// Prepare the version of the resource as it would be in the work-tree if
+ /// binary-to-text filters are present (and apply them), or use the version in `git` otherwise.
+ ToGitUnlessBinaryToTextIsPresent,
+ /// Always prepare resources as they are stored in `git`.
+ ///
+ /// This is usually fastest, even though resources in the worktree needed to be converted files.
+ ToGit,
+}
+
+impl Mode {
+ fn to_worktree(self) -> bool {
+ matches!(
+ self,
+ Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToWorktreeAndBinaryToText
+ )
+ }
+
+ fn to_git(self) -> bool {
+ matches!(self, Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToGit)
+ }
+}
+
+///
+pub mod convert_to_diffable {
+ use bstr::BString;
+ use gix_object::tree::EntryKind;
+
+ /// The error returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()).
+ #[derive(Debug, thiserror::Error)]
+ #[allow(missing_docs)]
+ pub enum Error {
+ #[error("Entry at '{rela_path}' must be regular file or symlink, but was {actual:?}")]
+ InvalidEntryKind { rela_path: BString, actual: EntryKind },
+ #[error("Entry at '{rela_path}' could not be read as symbolic link")]
+ ReadLink { rela_path: BString, source: std::io::Error },
+ #[error("Entry at '{rela_path}' could not be opened for reading or read from")]
+ OpenOrRead { rela_path: BString, source: std::io::Error },
+ #[error("Entry at '{rela_path}' could not be copied from a filter process to a memory buffer")]
+ StreamCopy { rela_path: BString, source: std::io::Error },
+ #[error("Failed to run '{cmd}' for binary-to-text conversion of entry at {rela_path}")]
+ RunTextConvFilter {
+ rela_path: BString,
+ cmd: String,
+ source: std::io::Error,
+ },
+ #[error("Tempfile for binary-to-text conversion for entry at {rela_path} could not be created")]
+ CreateTempfile { rela_path: BString, source: std::io::Error },
+ #[error("Binary-to-text conversion '{cmd}' for entry at {rela_path} failed with: {stderr}")]
+ TextConvFilterFailed {
+ rela_path: BString,
+ cmd: String,
+ stderr: BString,
+ },
+ #[error(transparent)]
+ FindObject(#[from] gix_object::find::existing_object::Error),
+ #[error(transparent)]
+ ConvertToWorktree(#[from] gix_filter::pipeline::convert::to_worktree::Error),
+ #[error(transparent)]
+ ConvertToGit(#[from] gix_filter::pipeline::convert::to_git::Error),
+ }
+}
+
+/// Lifecycle
+impl Pipeline {
+ /// Create a new instance of a pipeline which produces blobs suitable for diffing. `roots` allow to read worktree files directly, otherwise
+ /// `worktree_filter` is used to transform object database data directly. `drivers` further configure individual paths.
+ /// `options` are used to further configure the way we act..
+ pub fn new(
+ roots: WorktreeRoots,
+ worktree_filter: gix_filter::Pipeline,
+ mut drivers: Vec<super::Driver>,
+ options: Options,
+ ) -> Self {
+ drivers.sort_by(|a, b| a.name.cmp(&b.name));
+ Pipeline {
+ roots,
+ worktree_filter,
+ drivers,
+ options,
+ attrs: {
+ let mut out = gix_filter::attributes::search::Outcome::default();
+ out.initialize_with_selection(&Default::default(), Some("diff"));
+ out
+ },
+ path: Default::default(),
+ }
+ }
+}
+
+/// Access
+impl Pipeline {
+ /// Return all drivers that this instance was initialized with.
+ pub fn drivers(&self) -> &[super::Driver] {
+ &self.drivers
+ }
+}
+
+/// Conversion
+impl Pipeline {
+ /// Convert the object at `id`, `mode`, `rela_path` and `kind`, providing access to `attributes` and `objects`.
+ /// The resulting diff-able data is written into `out`, assuming it's not too large. The returned [`Outcome`]
+ /// contains information on how to use `out`, or if it's filled at all.
+ ///
+ /// `attributes` must be returning the attributes at `rela_path`, and `objects` must be usable if `kind` is
+ /// a resource in the object database, i.e. has no worktree root available.
+ ///
+ /// If `id` [is null](gix_hash::ObjectId::is_null()) or the file in question doesn't exist in the worktree in case
+ /// [a root](WorktreeRoots) is present, then `out` will be left cleared and [Outcome::data] will be `None`.
+ ///
+ /// Note that `mode` is trusted, and we will not re-validate that the entry in the worktree actually is of that mode.
+ ///
+ /// Use `convert` to control what kind of the resource will be produced.
+ ///
+ /// ### About Tempfiles
+ ///
+ /// When querying from the object database and a binary and a [binary-to-text](Driver::binary_to_text_command) is set,
+ /// a temporary file will be created to serve as input for the converter program, containing the worktree-data that
+ /// exactly as it would be present in the worktree if checked out.
+ ///
+ /// As these files are ultimately named tempfiles, they will be leaked unless the [gix_tempfile] is configured with
+ /// a signal handler. If they leak, they would remain in the system's `$TMP` directory.
+ #[allow(clippy::too_many_arguments)]
+ pub fn convert_to_diffable(
+ &mut self,
+ id: &gix_hash::oid,
+ mode: EntryKind,
+ rela_path: &BStr,
+ kind: ResourceKind,
+ attributes: &mut dyn FnMut(&BStr, &mut gix_filter::attributes::search::Outcome),
+ objects: &dyn gix_object::FindObjectOrHeader,
+ convert: Mode,
+ out: &mut Vec<u8>,
+ ) -> Result<Outcome, convert_to_diffable::Error> {
+ let is_symlink = match mode {
+ EntryKind::Link if self.options.fs.symlink => true,
+ EntryKind::Blob | EntryKind::BlobExecutable => false,
+ _ => {
+ return Err(convert_to_diffable::Error::InvalidEntryKind {
+ rela_path: rela_path.to_owned(),
+ actual: mode,
+ })
+ }
+ };
+
+ out.clear();
+ attributes(rela_path, &mut self.attrs);
+ let attr = self.attrs.iter_selected().next().expect("pre-initialized with 'diff'");
+ let driver_index = attr
+ .assignment
+ .state
+ .as_bstr()
+ .and_then(|name| self.drivers.binary_search_by(|d| d.name.as_bstr().cmp(name)).ok());
+ let driver = driver_index.map(|idx| &self.drivers[idx]);
+ let mut is_binary = if let Some(driver) = driver {
+ driver
+ .is_binary
+ .map(|is_binary| is_binary && driver.binary_to_text_command.is_none())
+ } else {
+ attr.assignment.state.is_unset().then_some(true)
+ };
+ match self.roots.by_kind(kind) {
+ Some(root) => {
+ self.path.clear();
+ self.path.push(root);
+ self.path.push(gix_path::from_bstr(rela_path));
+ let data = if is_symlink {
+ let target = none_if_missing(std::fs::read_link(&self.path)).map_err(|err| {
+ convert_to_diffable::Error::ReadLink {
+ rela_path: rela_path.to_owned(),
+ source: err,
+ }
+ })?;
+ target.map(|target| {
+ out.extend_from_slice(gix_path::into_bstr(target).as_ref());
+ Data::Buffer
+ })
+ } else {
+ let need_size_only = is_binary == Some(true);
+ let size_in_bytes = (need_size_only
+ || (is_binary != Some(false) && self.options.large_file_threshold_bytes > 0))
+ .then(|| {
+ none_if_missing(self.path.metadata().map(|md| md.len())).map_err(|err| {
+ convert_to_diffable::Error::OpenOrRead {
+ rela_path: rela_path.to_owned(),
+ source: err,
+ }
+ })
+ })
+ .transpose()?;
+ match size_in_bytes {
+ Some(None) => None, // missing as identified by the size check
+ Some(Some(size)) if size > self.options.large_file_threshold_bytes || need_size_only => {
+ Some(Data::Binary { size })
+ }
+ _ => {
+ match driver
+ .filter(|_| convert.to_worktree())
+ .and_then(|d| d.prepare_binary_to_text_cmd(&self.path))
+ {
+ Some(cmd) => {
+ // Avoid letting the driver program fail if it doesn't exist.
+ if self.options.large_file_threshold_bytes == 0
+ && none_if_missing(std::fs::symlink_metadata(&self.path))
+ .map_err(|err| convert_to_diffable::Error::OpenOrRead {
+ rela_path: rela_path.to_owned(),
+ source: err,
+ })?
+ .is_none()
+ {
+ None
+ } else {
+ run_cmd(rela_path, cmd, out)?;
+ Some(Data::Buffer)
+ }
+ }
+ None => {
+ let file = none_if_missing(std::fs::File::open(&self.path)).map_err(|err| {
+ convert_to_diffable::Error::OpenOrRead {
+ rela_path: rela_path.to_owned(),
+ source: err,
+ }
+ })?;
+
+ match file {
+ Some(mut file) => {
+ if convert.to_git() {
+ let res = self.worktree_filter.convert_to_git(
+ file,
+ gix_path::from_bstr(rela_path).as_ref(),
+ attributes,
+ &mut |buf| objects.try_find(id, buf).map(|obj| obj.map(|_| ())),
+ )?;
+
+ match res {
+ ToGitOutcome::Unchanged(mut file) => {
+ file.read_to_end(out).map_err(|err| {
+ convert_to_diffable::Error::OpenOrRead {
+ rela_path: rela_path.to_owned(),
+ source: err,
+ }
+ })?;
+ }
+ ToGitOutcome::Process(mut stream) => {
+ stream.read_to_end(out).map_err(|err| {
+ convert_to_diffable::Error::OpenOrRead {
+ rela_path: rela_path.to_owned(),
+ source: err,
+ }
+ })?;
+ }
+ ToGitOutcome::Buffer(buf) => {
+ out.resize(buf.len(), 0);
+ out.copy_from_slice(buf);
+ }
+ }
+ } else {
+ file.read_to_end(out).map_err(|err| {
+ convert_to_diffable::Error::OpenOrRead {
+ rela_path: rela_path.to_owned(),
+ source: err,
+ }
+ })?;
+ }
+
+ Some(if is_binary.unwrap_or_else(|| is_binary_buf(out)) {
+ let size = out.len() as u64;
+ out.clear();
+ Data::Binary { size }
+ } else {
+ Data::Buffer
+ })
+ }
+ None => None,
+ }
+ }
+ }
+ }
+ }
+ };
+ Ok(Outcome { driver_index, data })
+ }
+ None => {
+ let data = if id.is_null() {
+ None
+ } else {
+ let header = objects
+ .try_header(id)
+ .map_err(gix_object::find::existing_object::Error::Find)?
+ .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?;
+ if is_binary.is_none()
+ && self.options.large_file_threshold_bytes > 0
+ && header.size > self.options.large_file_threshold_bytes
+ {
+ is_binary = Some(true);
+ };
+ let data = if is_binary == Some(true) {
+ Data::Binary { size: header.size }
+ } else {
+ objects
+ .try_find(id, out)
+ .map_err(gix_object::find::existing_object::Error::Find)?
+ .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?;
+ if matches!(mode, EntryKind::Blob | EntryKind::BlobExecutable)
+ && convert == Mode::ToWorktreeAndBinaryToText
+ || (convert == Mode::ToGitUnlessBinaryToTextIsPresent
+ && driver.map_or(false, |d| d.binary_to_text_command.is_some()))
+ {
+ let res =
+ self.worktree_filter
+ .convert_to_worktree(out, rela_path, attributes, Delay::Forbid)?;
+
+ let cmd_and_file = driver
+ .and_then(|d| {
+ d.binary_to_text_command.is_some().then(|| {
+ gix_tempfile::new(
+ std::env::temp_dir(),
+ gix_tempfile::ContainingDirectory::Exists,
+ gix_tempfile::AutoRemove::Tempfile,
+ )
+ .and_then(|mut tmp_file| {
+ self.path.clear();
+ tmp_file.with_mut(|tmp| self.path.push(tmp.path()))?;
+ Ok(tmp_file)
+ })
+ .map(|tmp_file| {
+ (
+ d.prepare_binary_to_text_cmd(&self.path)
+ .expect("always get cmd if command is set"),
+ tmp_file,
+ )
+ })
+ })
+ })
+ .transpose()
+ .map_err(|err| convert_to_diffable::Error::CreateTempfile {
+ source: err,
+ rela_path: rela_path.to_owned(),
+ })?;
+ match cmd_and_file {
+ Some((cmd, mut tmp_file)) => {
+ match res {
+ ToWorktreeOutcome::Unchanged(buf) | ToWorktreeOutcome::Buffer(buf) => {
+ tmp_file.write_all(buf)
+ }
+ ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => {
+ std::io::copy(&mut stream, &mut tmp_file).map(|_| ())
+ }
+ ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => {
+ unreachable!("we prohibit this")
+ }
+ }
+ .map_err(|err| {
+ convert_to_diffable::Error::CreateTempfile {
+ source: err,
+ rela_path: rela_path.to_owned(),
+ }
+ })?;
+ out.clear();
+ run_cmd(rela_path, cmd, out)?;
+ }
+ None => {
+ match res {
+ ToWorktreeOutcome::Unchanged(_) => {}
+ ToWorktreeOutcome::Buffer(src) => {
+ out.resize(src.len(), 0);
+ out.copy_from_slice(src);
+ }
+ ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => {
+ std::io::copy(&mut stream, out).map_err(|err| {
+ convert_to_diffable::Error::StreamCopy {
+ rela_path: rela_path.to_owned(),
+ source: err,
+ }
+ })?;
+ }
+ ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => {
+ unreachable!("we prohibit this")
+ }
+ };
+ }
+ }
+ }
+
+ if driver.map_or(true, |d| d.binary_to_text_command.is_none())
+ && is_binary.unwrap_or_else(|| is_binary_buf(out))
+ {
+ let size = out.len() as u64;
+ out.clear();
+ Data::Binary { size }
+ } else {
+ Data::Buffer
+ }
+ };
+ Some(data)
+ };
+ Ok(Outcome { driver_index, data })
+ }
+ }
+ }
+}
+
+fn is_binary_buf(buf: &[u8]) -> bool {
+ let buf = &buf[..buf.len().min(8000)];
+ buf.contains(&0)
+}
+
+fn none_if_missing<T>(res: std::io::Result<T>) -> std::io::Result<Option<T>> {
+ match res {
+ Ok(data) => Ok(Some(data)),
+ Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(None),
+ Err(err) => Err(err),
+ }
+}
+
+fn run_cmd(rela_path: &BStr, mut cmd: Command, out: &mut Vec<u8>) -> Result<(), convert_to_diffable::Error> {
+ gix_trace::debug!(cmd = ?cmd, "Running binary-to-text command");
+ let mut res = cmd
+ .output()
+ .map_err(|err| convert_to_diffable::Error::RunTextConvFilter {
+ rela_path: rela_path.to_owned(),
+ cmd: format!("{cmd:?}"),
+ source: err,
+ })?;
+ if !res.status.success() {
+ return Err(convert_to_diffable::Error::TextConvFilterFailed {
+ rela_path: rela_path.to_owned(),
+ cmd: format!("{cmd:?}"),
+ stderr: res.stderr.into(),
+ });
+ }
+ out.append(&mut res.stdout);
+ Ok(())
+}
+
+impl Driver {
+ /// Produce an invocable command pre-configured to produce the filtered output on stdout after reading `path`.
+ pub fn prepare_binary_to_text_cmd(&self, path: &Path) -> Option<std::process::Command> {
+ let command: &BStr = self.binary_to_text_command.as_ref()?.as_ref();
+ let cmd = gix_command::prepare(gix_path::from_bstr(command).into_owned())
+ .with_shell()
+ .stdin(Stdio::null())
+ .stdout(Stdio::piped())
+ .stderr(Stdio::piped())
+ .arg(path)
+ .into();
+ Some(cmd)
+ }
+}