use std::{
    io::{Read, Write},
    path::{Path, PathBuf},
    process::{Command, Stdio},
};

use bstr::{BStr, ByteSlice};
use gix_filter::{
    driver::apply::{Delay, MaybeDelayed},
    pipeline::convert::{ToGitOutcome, ToWorktreeOutcome},
};
use gix_object::tree::EntryKind;

use crate::blob::{Driver, Pipeline, ResourceKind};

/// A way to access roots for different kinds of resources that are possibly located and accessible in a worktree.
#[derive(Clone, Debug, Default)]
pub struct WorktreeRoots {
    /// A place where the source of a rewrite, rename or copy, or generally the previous version of resources, are located.
    pub old_root: Option<PathBuf>,
    /// A place where the destination of a rewrite, rename or copy, or generally the new version of resources, are located.
    pub new_root: Option<PathBuf>,
}

impl WorktreeRoots {
    /// Return the root path for the given `kind`, or `None` if no root was configured for it.
    pub fn by_kind(&self, kind: ResourceKind) -> Option<&Path> {
        match kind {
            ResourceKind::OldOrSource => self.old_root.as_deref(),
            ResourceKind::NewOrDestination => self.new_root.as_deref(),
        }
    }
}

/// Data as part of an [Outcome].
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)]
pub enum Data {
    /// The data to use for diffing was written into the buffer that was passed during the call to [`Pipeline::convert_to_diffable()`].
    Buffer,
    /// The size that the binary blob had at the given revision, without having applied filters, as it's either
    /// considered binary or above the big-file threshold.
    ///
    /// In this state, the binary file cannot be diffed.
    Binary {
        /// The size of the object prior to performing any filtering or as it was found on disk.
        ///
        /// Note that technically, the size isn't always representative of the same 'state' of the
        /// content, as once it can be the size of the blob in git, and once it's the size of file
        /// in the worktree.
        size: u64,
    },
}

/// The outcome returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()).
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)] pub struct Outcome { /// If available, an index into the `drivers` field to access more diff-related information of the driver for items /// at the given path, as previously determined by git-attributes. /// /// Note that drivers are queried even if there is no object available. pub driver_index: Option, /// The data itself, suitable for diffing, and if the object or worktree item is present at all. pub data: Option, } /// Options for use in a [`Pipeline`]. #[derive(Default, Clone, Copy, PartialEq, Eq, Debug, Hash, Ord, PartialOrd)] pub struct Options { /// The amount of bytes that an object has to reach before being treated as binary. /// These objects will not be queried, nor will their data be processed in any way. /// If `0`, no file is ever considered binary due to their size. /// /// Note that for files stored in `git`, what counts is their stored, decompressed size, /// thus `git-lfs` files would typically not be considered binary unless one explicitly sets /// them pub large_file_threshold_bytes: u64, /// Capabilities of the file system which affect how we read worktree files. pub fs: gix_fs::Capabilities, } /// The specific way to convert a resource. #[derive(Default, Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] pub enum Mode { /// Always prepare the version of the resource as it would be in the work-tree, and /// apply binary-to-text filters if present. /// /// This is typically free for resources in the worktree, and will apply filters to resources in the /// object database. #[default] ToWorktreeAndBinaryToText, /// Prepare the version of the resource as it would be in the work-tree if /// binary-to-text filters are present (and apply them), or use the version in `git` otherwise. ToGitUnlessBinaryToTextIsPresent, /// Always prepare resources as they are stored in `git`. /// /// This is usually fastest, even though resources in the worktree needed to be converted files. 
ToGit, } impl Mode { fn to_worktree(self) -> bool { matches!( self, Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToWorktreeAndBinaryToText ) } fn to_git(self) -> bool { matches!(self, Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToGit) } } /// pub mod convert_to_diffable { use bstr::BString; use gix_object::tree::EntryKind; /// The error returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()). #[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { #[error("Entry at '{rela_path}' must be regular file or symlink, but was {actual:?}")] InvalidEntryKind { rela_path: BString, actual: EntryKind }, #[error("Entry at '{rela_path}' could not be read as symbolic link")] ReadLink { rela_path: BString, source: std::io::Error }, #[error("Entry at '{rela_path}' could not be opened for reading or read from")] OpenOrRead { rela_path: BString, source: std::io::Error }, #[error("Entry at '{rela_path}' could not be copied from a filter process to a memory buffer")] StreamCopy { rela_path: BString, source: std::io::Error }, #[error("Failed to run '{cmd}' for binary-to-text conversion of entry at {rela_path}")] RunTextConvFilter { rela_path: BString, cmd: String, source: std::io::Error, }, #[error("Tempfile for binary-to-text conversion for entry at {rela_path} could not be created")] CreateTempfile { rela_path: BString, source: std::io::Error }, #[error("Binary-to-text conversion '{cmd}' for entry at {rela_path} failed with: {stderr}")] TextConvFilterFailed { rela_path: BString, cmd: String, stderr: BString, }, #[error(transparent)] FindObject(#[from] gix_object::find::existing_object::Error), #[error(transparent)] ConvertToWorktree(#[from] gix_filter::pipeline::convert::to_worktree::Error), #[error(transparent)] ConvertToGit(#[from] gix_filter::pipeline::convert::to_git::Error), } } /// Lifecycle impl Pipeline { /// Create a new instance of a pipeline which produces blobs suitable for diffing. 
`roots` allow to read worktree files directly, otherwise /// `worktree_filter` is used to transform object database data directly. `drivers` further configure individual paths. /// `options` are used to further configure the way we act.. pub fn new( roots: WorktreeRoots, worktree_filter: gix_filter::Pipeline, mut drivers: Vec, options: Options, ) -> Self { drivers.sort_by(|a, b| a.name.cmp(&b.name)); Pipeline { roots, worktree_filter, drivers, options, attrs: { let mut out = gix_filter::attributes::search::Outcome::default(); out.initialize_with_selection(&Default::default(), Some("diff")); out }, path: Default::default(), } } } /// Access impl Pipeline { /// Return all drivers that this instance was initialized with. pub fn drivers(&self) -> &[super::Driver] { &self.drivers } } /// Conversion impl Pipeline { /// Convert the object at `id`, `mode`, `rela_path` and `kind`, providing access to `attributes` and `objects`. /// The resulting diff-able data is written into `out`, assuming it's not too large. The returned [`Outcome`] /// contains information on how to use `out`, or if it's filled at all. /// /// `attributes` must be returning the attributes at `rela_path`, and `objects` must be usable if `kind` is /// a resource in the object database, i.e. has no worktree root available. /// /// If `id` [is null](gix_hash::ObjectId::is_null()) or the file in question doesn't exist in the worktree in case /// [a root](WorktreeRoots) is present, then `out` will be left cleared and [Outcome::data] will be `None`. /// /// Note that `mode` is trusted, and we will not re-validate that the entry in the worktree actually is of that mode. /// /// Use `convert` to control what kind of the resource will be produced. 
/// /// ### About Tempfiles /// /// When querying from the object database and a binary and a [binary-to-text](Driver::binary_to_text_command) is set, /// a temporary file will be created to serve as input for the converter program, containing the worktree-data that /// exactly as it would be present in the worktree if checked out. /// /// As these files are ultimately named tempfiles, they will be leaked unless the [gix_tempfile] is configured with /// a signal handler. If they leak, they would remain in the system's `$TMP` directory. #[allow(clippy::too_many_arguments)] pub fn convert_to_diffable( &mut self, id: &gix_hash::oid, mode: EntryKind, rela_path: &BStr, kind: ResourceKind, attributes: &mut dyn FnMut(&BStr, &mut gix_filter::attributes::search::Outcome), objects: &dyn gix_object::FindObjectOrHeader, convert: Mode, out: &mut Vec, ) -> Result { let is_symlink = match mode { EntryKind::Link if self.options.fs.symlink => true, EntryKind::Blob | EntryKind::BlobExecutable => false, _ => { return Err(convert_to_diffable::Error::InvalidEntryKind { rela_path: rela_path.to_owned(), actual: mode, }) } }; out.clear(); attributes(rela_path, &mut self.attrs); let attr = self.attrs.iter_selected().next().expect("pre-initialized with 'diff'"); let driver_index = attr .assignment .state .as_bstr() .and_then(|name| self.drivers.binary_search_by(|d| d.name.as_bstr().cmp(name)).ok()); let driver = driver_index.map(|idx| &self.drivers[idx]); let mut is_binary = if let Some(driver) = driver { driver .is_binary .map(|is_binary| is_binary && driver.binary_to_text_command.is_none()) } else { attr.assignment.state.is_unset().then_some(true) }; match self.roots.by_kind(kind) { Some(root) => { self.path.clear(); self.path.push(root); self.path.push(gix_path::from_bstr(rela_path)); let data = if is_symlink { let target = none_if_missing(std::fs::read_link(&self.path)).map_err(|err| { convert_to_diffable::Error::ReadLink { rela_path: rela_path.to_owned(), source: err, } })?; 
target.map(|target| { out.extend_from_slice(gix_path::into_bstr(target).as_ref()); Data::Buffer }) } else { let need_size_only = is_binary == Some(true); let size_in_bytes = (need_size_only || (is_binary != Some(false) && self.options.large_file_threshold_bytes > 0)) .then(|| { none_if_missing(self.path.metadata().map(|md| md.len())).map_err(|err| { convert_to_diffable::Error::OpenOrRead { rela_path: rela_path.to_owned(), source: err, } }) }) .transpose()?; match size_in_bytes { Some(None) => None, // missing as identified by the size check Some(Some(size)) if size > self.options.large_file_threshold_bytes || need_size_only => { Some(Data::Binary { size }) } _ => { match driver .filter(|_| convert.to_worktree()) .and_then(|d| d.prepare_binary_to_text_cmd(&self.path)) { Some(cmd) => { // Avoid letting the driver program fail if it doesn't exist. if self.options.large_file_threshold_bytes == 0 && none_if_missing(std::fs::symlink_metadata(&self.path)) .map_err(|err| convert_to_diffable::Error::OpenOrRead { rela_path: rela_path.to_owned(), source: err, })? 
.is_none() { None } else { run_cmd(rela_path, cmd, out)?; Some(Data::Buffer) } } None => { let file = none_if_missing(std::fs::File::open(&self.path)).map_err(|err| { convert_to_diffable::Error::OpenOrRead { rela_path: rela_path.to_owned(), source: err, } })?; match file { Some(mut file) => { if convert.to_git() { let res = self.worktree_filter.convert_to_git( file, gix_path::from_bstr(rela_path).as_ref(), attributes, &mut |buf| objects.try_find(id, buf).map(|obj| obj.map(|_| ())), )?; match res { ToGitOutcome::Unchanged(mut file) => { file.read_to_end(out).map_err(|err| { convert_to_diffable::Error::OpenOrRead { rela_path: rela_path.to_owned(), source: err, } })?; } ToGitOutcome::Process(mut stream) => { stream.read_to_end(out).map_err(|err| { convert_to_diffable::Error::OpenOrRead { rela_path: rela_path.to_owned(), source: err, } })?; } ToGitOutcome::Buffer(buf) => { out.resize(buf.len(), 0); out.copy_from_slice(buf); } } } else { file.read_to_end(out).map_err(|err| { convert_to_diffable::Error::OpenOrRead { rela_path: rela_path.to_owned(), source: err, } })?; } Some(if is_binary.unwrap_or_else(|| is_binary_buf(out)) { let size = out.len() as u64; out.clear(); Data::Binary { size } } else { Data::Buffer }) } None => None, } } } } } }; Ok(Outcome { driver_index, data }) } None => { let data = if id.is_null() { None } else { let header = objects .try_header(id) .map_err(gix_object::find::existing_object::Error::Find)? .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?; if is_binary.is_none() && self.options.large_file_threshold_bytes > 0 && header.size > self.options.large_file_threshold_bytes { is_binary = Some(true); }; let data = if is_binary == Some(true) { Data::Binary { size: header.size } } else { objects .try_find(id, out) .map_err(gix_object::find::existing_object::Error::Find)? 
.ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?; if matches!(mode, EntryKind::Blob | EntryKind::BlobExecutable) && convert == Mode::ToWorktreeAndBinaryToText || (convert == Mode::ToGitUnlessBinaryToTextIsPresent && driver.map_or(false, |d| d.binary_to_text_command.is_some())) { let res = self.worktree_filter .convert_to_worktree(out, rela_path, attributes, Delay::Forbid)?; let cmd_and_file = driver .and_then(|d| { d.binary_to_text_command.is_some().then(|| { gix_tempfile::new( std::env::temp_dir(), gix_tempfile::ContainingDirectory::Exists, gix_tempfile::AutoRemove::Tempfile, ) .and_then(|mut tmp_file| { self.path.clear(); tmp_file.with_mut(|tmp| self.path.push(tmp.path()))?; Ok(tmp_file) }) .map(|tmp_file| { ( d.prepare_binary_to_text_cmd(&self.path) .expect("always get cmd if command is set"), tmp_file, ) }) }) }) .transpose() .map_err(|err| convert_to_diffable::Error::CreateTempfile { source: err, rela_path: rela_path.to_owned(), })?; match cmd_and_file { Some((cmd, mut tmp_file)) => { match res { ToWorktreeOutcome::Unchanged(buf) | ToWorktreeOutcome::Buffer(buf) => { tmp_file.write_all(buf) } ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => { std::io::copy(&mut stream, &mut tmp_file).map(|_| ()) } ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => { unreachable!("we prohibit this") } } .map_err(|err| { convert_to_diffable::Error::CreateTempfile { source: err, rela_path: rela_path.to_owned(), } })?; out.clear(); run_cmd(rela_path, cmd, out)?; } None => { match res { ToWorktreeOutcome::Unchanged(_) => {} ToWorktreeOutcome::Buffer(src) => { out.resize(src.len(), 0); out.copy_from_slice(src); } ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => { std::io::copy(&mut stream, out).map_err(|err| { convert_to_diffable::Error::StreamCopy { rela_path: rela_path.to_owned(), source: err, } })?; } ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => { unreachable!("we prohibit this") } }; } 
} } if driver.map_or(true, |d| d.binary_to_text_command.is_none()) && is_binary.unwrap_or_else(|| is_binary_buf(out)) { let size = out.len() as u64; out.clear(); Data::Binary { size } } else { Data::Buffer } }; Some(data) }; Ok(Outcome { driver_index, data }) } } } } fn is_binary_buf(buf: &[u8]) -> bool { let buf = &buf[..buf.len().min(8000)]; buf.contains(&0) } fn none_if_missing(res: std::io::Result) -> std::io::Result> { match res { Ok(data) => Ok(Some(data)), Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(None), Err(err) => Err(err), } } fn run_cmd(rela_path: &BStr, mut cmd: Command, out: &mut Vec) -> Result<(), convert_to_diffable::Error> { gix_trace::debug!(cmd = ?cmd, "Running binary-to-text command"); let mut res = cmd .output() .map_err(|err| convert_to_diffable::Error::RunTextConvFilter { rela_path: rela_path.to_owned(), cmd: format!("{cmd:?}"), source: err, })?; if !res.status.success() { return Err(convert_to_diffable::Error::TextConvFilterFailed { rela_path: rela_path.to_owned(), cmd: format!("{cmd:?}"), stderr: res.stderr.into(), }); } out.append(&mut res.stdout); Ok(()) } impl Driver { /// Produce an invocable command pre-configured to produce the filtered output on stdout after reading `path`. pub fn prepare_binary_to_text_cmd(&self, path: &Path) -> Option { let command: &BStr = self.binary_to_text_command.as_ref()?.as_ref(); let cmd = gix_command::prepare(gix_path::from_bstr(command).into_owned()) .with_shell() .stdin(Stdio::null()) .stdout(Stdio::piped()) .stderr(Stdio::piped()) .arg(path) .into(); Some(cmd) } }