diff options
Diffstat (limited to 'vendor/gix/src/object/tree')
-rw-r--r-- | vendor/gix/src/object/tree/diff/change.rs | 111 | ||||
-rw-r--r-- | vendor/gix/src/object/tree/diff/for_each.rs | 235 | ||||
-rw-r--r-- | vendor/gix/src/object/tree/diff/mod.rs | 118 | ||||
-rw-r--r-- | vendor/gix/src/object/tree/diff/rewrites.rs | 108 | ||||
-rw-r--r-- | vendor/gix/src/object/tree/diff/tracked.rs | 491 | ||||
-rw-r--r-- | vendor/gix/src/object/tree/iter.rs | 53 | ||||
-rw-r--r-- | vendor/gix/src/object/tree/mod.rs | 158 | ||||
-rw-r--r-- | vendor/gix/src/object/tree/traverse.rs | 62 |
8 files changed, 1336 insertions, 0 deletions
diff --git a/vendor/gix/src/object/tree/diff/change.rs b/vendor/gix/src/object/tree/diff/change.rs new file mode 100644 index 000000000..e6826d6ed --- /dev/null +++ b/vendor/gix/src/object/tree/diff/change.rs @@ -0,0 +1,111 @@ +use crate::{bstr::BStr, Id}; + +/// Information about the diff performed to detect similarity of a [Rewrite][Event::Rewrite]. +#[derive(Debug, Default, Clone, Copy, Eq, PartialEq)] +pub struct DiffLineStats { + /// The amount of lines to remove from the source to get to the destination. + pub removals: u32, + /// The amount of lines to add to the source to get to the destination. + pub insertions: u32, + /// The amount of lines of the previous state, in the source. + pub before: u32, + /// The amount of lines of the new state, in the destination. + pub after: u32, +} + +/// An event emitted when finding differences between two trees. +#[derive(Debug, Clone, Copy)] +pub enum Event<'a, 'old, 'new> { + /// An entry was added, like the addition of a file or directory. + Addition { + /// The mode of the added entry. + entry_mode: gix_object::tree::EntryMode, + /// The object id of the added entry. + id: Id<'new>, + }, + /// An entry was deleted, like the deletion of a file or directory. + Deletion { + /// The mode of the deleted entry. + entry_mode: gix_object::tree::EntryMode, + /// The object id of the deleted entry. + id: Id<'old>, + }, + /// An entry was modified, e.g. changing the contents of a file adjusts its object id and turning + /// a file into a symbolic link adjusts its mode. + Modification { + /// The mode of the entry before the modification. + previous_entry_mode: gix_object::tree::EntryMode, + /// The object id of the entry before the modification. + previous_id: Id<'old>, + + /// The mode of the entry after the modification. + entry_mode: gix_object::tree::EntryMode, + /// The object id after the modification. + id: Id<'new>, + }, + /// Entries are considered rewritten if they are not trees and they, according to some understanding of identity, were renamed + /// or copied. + /// In case of renames, this means they originally appeared as [`Deletion`][Event::Deletion] signalling their source as well as an + /// [`Addition`][Event::Addition] acting as destination. + /// + /// In case of copies, the `copy` flag is true and typically represents a perfect copy of a source was made. + /// + /// This variant can only be encountered if [rewrite tracking][super::Platform::track_rewrites()] is enabled. + /// + /// Note that mode changes may have occurred as well, i.e. changes from executable to non-executable or vice-versa. + Rewrite { + /// The location of the source of the rename operation. + /// + /// It may be empty if neither [file names][super::Platform::track_filename()] nor [file paths][super::Platform::track_path()] + /// are tracked. + source_location: &'a BStr, + /// The mode of the entry before the rename. + source_entry_mode: gix_object::tree::EntryMode, + /// The object id of the entry before the rename. + /// + /// Note that this is the same as `id` if we require the [similarity to be 100%][super::Rewrites::percentage], but may + /// be different otherwise. + source_id: Id<'old>, + /// Information about the diff we performed to detect similarity and match the `source_id` with the current state at `id`. + /// It's `None` if `source_id` is equal to `id`, as identity made an actual diff computation unnecessary. + diff: Option<DiffLineStats>, + /// The mode of the entry after the rename. + /// It could differ but still be considered a rename as we are concerned only about content. + entry_mode: gix_object::tree::EntryMode, + /// The object id after the rename. + id: Id<'new>, + /// If true, this rewrite is created by copy, and `source_id` is pointing to its source. Otherwise it's a rename, and `source_id` + /// points to a deleted object, as renames are tracked as deletions and additions of the same or similar content. + copy: bool, + }, +} + +impl<'a, 'old, 'new> Event<'a, 'old, 'new> { + /// Produce a platform for performing a line-diff, or `None` if this is not a [`Modification`][Event::Modification] + /// or one of the entries to compare is not a blob. + pub fn diff( + &self, + ) -> Option<Result<crate::object::blob::diff::Platform<'old, 'new>, crate::object::blob::diff::init::Error>> { + match self { + Event::Modification { + previous_entry_mode, + previous_id, + entry_mode, + id, + } if entry_mode.is_blob() && previous_entry_mode.is_blob() => { + Some(crate::object::blob::diff::Platform::from_ids(previous_id, id)) + } + _ => None, + } + } + + /// Return the current mode of this instance. + pub fn entry_mode(&self) -> gix_object::tree::EntryMode { + match self { + Event::Addition { entry_mode, .. } + | Event::Deletion { entry_mode, .. } + | Event::Modification { entry_mode, .. } + | Event::Rewrite { entry_mode, .. } => *entry_mode, + } + } +} diff --git a/vendor/gix/src/object/tree/diff/for_each.rs b/vendor/gix/src/object/tree/diff/for_each.rs new file mode 100644 index 000000000..5cae4cf2f --- /dev/null +++ b/vendor/gix/src/object/tree/diff/for_each.rs @@ -0,0 +1,235 @@ +use gix_object::TreeRefIter; +use gix_odb::FindExt; + +use super::{change, Action, Change, Platform}; +use crate::{ + bstr::BStr, + ext::ObjectIdExt, + object::tree::{ + diff, + diff::{rewrites, tracked}, + }, + Repository, Tree, +}; + +/// The error return by methods on the [diff platform][Platform]. +#[derive(Debug, thiserror::Error)] +#[allow(missing_docs)] +pub enum Error { + #[error(transparent)] + Diff(#[from] gix_diff::tree::changes::Error), + #[error("The user-provided callback failed")] + ForEach(#[source] Box<dyn std::error::Error + Send + Sync + 'static>), + #[error("Could not find blob for similarity checking")] + FindExistingBlob(#[from] crate::object::find::existing::Error), + #[error("Could not configure diff algorithm prior to checking similarity")] + ConfigureDiffAlgorithm(#[from] crate::config::diff::algorithm::Error), + #[error("Could not traverse tree to obtain possible sources for copies")] + TraverseTreeForExhaustiveCopyDetection(#[from] gix_traverse::tree::breadthfirst::Error), +} + +/// +#[derive(Clone, Debug, Copy, PartialEq)] +pub struct Outcome { + /// Available only if [rewrite-tracking was enabled][Platform::track_rewrites()]. + pub rewrites: Option<rewrites::Outcome>, +} + +/// Add the item to compare to. +impl<'a, 'old> Platform<'a, 'old> { + /// Call `for_each` repeatedly with all changes that are needed to convert the source of the diff to the tree to `other`. + /// + /// `other` could also be created with the [`empty_tree()`][crate::Repository::empty_tree()] method to handle the first commit + /// in a repository - it doesn't have a parent, equivalent to compare 'nothing' to something. + pub fn for_each_to_obtain_tree<'new, E>( + &mut self, + other: &Tree<'new>, + for_each: impl FnMut(Change<'_, 'old, 'new>) -> Result<Action, E>, + ) -> Result<Outcome, Error> + where + E: std::error::Error + Sync + Send + 'static, + { + let repo = self.lhs.repo; + let mut delegate = Delegate { + src_tree: self.lhs, + other_repo: other.repo, + recorder: gix_diff::tree::Recorder::default().track_location(self.tracking), + visit: for_each, + tracked: self.rewrites.map(|r| tracked::State::new(r, self.tracking)), + err: None, + }; + match gix_diff::tree::Changes::from(TreeRefIter::from_bytes(&self.lhs.data)).needed_to_obtain( + TreeRefIter::from_bytes(&other.data), + &mut self.state, + |oid, buf| repo.objects.find_tree_iter(oid, buf), + &mut delegate, + ) { + Ok(()) => { + let outcome = Outcome { + rewrites: delegate.process_tracked_changes()?, + }; + match delegate.err { + Some(err) => Err(Error::ForEach(Box::new(err))), + None => Ok(outcome), + } + } + Err(gix_diff::tree::changes::Error::Cancelled) => delegate + .err + .map(|err| Err(Error::ForEach(Box::new(err)))) + .unwrap_or(Err(Error::Diff(gix_diff::tree::changes::Error::Cancelled))), + Err(err) => Err(err.into()), + } + } +} + +struct Delegate<'a, 'old, 'new, VisitFn, E> { + src_tree: &'a Tree<'old>, + other_repo: &'new Repository, + recorder: gix_diff::tree::Recorder, + visit: VisitFn, + tracked: Option<tracked::State>, + err: Option<E>, +} + +impl<'a, 'old, 'new, VisitFn, E> Delegate<'a, 'old, 'new, VisitFn, E> +where + VisitFn: for<'delegate> FnMut(Change<'delegate, 'old, 'new>) -> Result<Action, E>, + E: std::error::Error + Sync + Send + 'static, +{ + /// Call `visit` on an attached version of `change`. + fn emit_change( + change: gix_diff::tree::visit::Change, + location: &BStr, + visit: &mut VisitFn, + repo: &'old Repository, + other_repo: &'new Repository, + stored_err: &mut Option<E>, + ) -> gix_diff::tree::visit::Action { + use gix_diff::tree::visit::Change::*; + let event = match change { + Addition { entry_mode, oid } => change::Event::Addition { + entry_mode, + id: oid.attach(other_repo), + }, + Deletion { entry_mode, oid } => change::Event::Deletion { + entry_mode, + id: oid.attach(repo), + }, + Modification { + previous_entry_mode, + previous_oid, + entry_mode, + oid, + } => change::Event::Modification { + previous_entry_mode, + entry_mode, + previous_id: previous_oid.attach(repo), + id: oid.attach(other_repo), + }, + }; + match visit(Change { event, location }) { + Ok(Action::Cancel) => gix_diff::tree::visit::Action::Cancel, + Ok(Action::Continue) => gix_diff::tree::visit::Action::Continue, + Err(err) => { + *stored_err = Some(err); + gix_diff::tree::visit::Action::Cancel + } + } + } + + fn process_tracked_changes(&mut self) -> Result<Option<rewrites::Outcome>, Error> { + let tracked = match self.tracked.as_mut() { + Some(t) => t, + None => return Ok(None), + }; + + let outcome = tracked.emit( + |dest, source| match source { + Some(source) => { + let (oid, mode) = dest.change.oid_and_entry_mode(); + let change = diff::Change { + location: dest.location, + event: diff::change::Event::Rewrite { + source_location: source.location, + source_entry_mode: source.mode, + source_id: source.id.attach(self.src_tree.repo), + entry_mode: mode, + id: oid.to_owned().attach(self.other_repo), + diff: source.diff, + copy: match source.kind { + tracked::visit::Kind::RenameTarget => false, + tracked::visit::Kind::CopyDestination => true, + }, + }, + }; + match (self.visit)(change) { + Ok(Action::Cancel) => gix_diff::tree::visit::Action::Cancel, + Ok(Action::Continue) => gix_diff::tree::visit::Action::Continue, + Err(err) => { + self.err = Some(err); + gix_diff::tree::visit::Action::Cancel + } + } + } + None => Self::emit_change( + dest.change, + dest.location, + &mut self.visit, + self.src_tree.repo, + self.other_repo, + &mut self.err, + ), + }, + self.src_tree, + )?; + Ok(Some(outcome)) + } +} + +impl<'a, 'old, 'new, VisitFn, E> gix_diff::tree::Visit for Delegate<'a, 'old, 'new, VisitFn, E> +where + VisitFn: for<'delegate> FnMut(Change<'delegate, 'old, 'new>) -> Result<Action, E>, + E: std::error::Error + Sync + Send + 'static, +{ + fn pop_front_tracked_path_and_set_current(&mut self) { + self.recorder.pop_front_tracked_path_and_set_current() + } + + fn push_back_tracked_path_component(&mut self, component: &BStr) { + self.recorder.push_back_tracked_path_component(component) + } + + fn push_path_component(&mut self, component: &BStr) { + self.recorder.push_path_component(component) + } + + fn pop_path_component(&mut self) { + self.recorder.pop_path_component() + } + + fn visit(&mut self, change: gix_diff::tree::visit::Change) -> gix_diff::tree::visit::Action { + match self.tracked.as_mut() { + Some(tracked) => tracked + .try_push_change(change, self.recorder.path()) + .map(|change| { + Self::emit_change( + change, + self.recorder.path(), + &mut self.visit, + self.src_tree.repo, + self.other_repo, + &mut self.err, + ) + }) + .unwrap_or(gix_diff::tree::visit::Action::Continue), + None => Self::emit_change( + change, + self.recorder.path(), + &mut self.visit, + self.src_tree.repo, + self.other_repo, + &mut self.err, + ), + } + } +} diff --git a/vendor/gix/src/object/tree/diff/mod.rs b/vendor/gix/src/object/tree/diff/mod.rs new file mode 100644 index 000000000..5a3bf6ddf --- /dev/null +++ b/vendor/gix/src/object/tree/diff/mod.rs @@ -0,0 +1,118 @@ +use gix_diff::tree::recorder::Location; + +use crate::{bstr::BStr, Tree}; + +/// Returned by the `for_each` function to control flow. +#[derive(Clone, Copy, PartialOrd, PartialEq, Ord, Eq, Hash)] +pub enum Action { + /// Continue the traversal of changes. + Continue, + /// Stop the traversal of changes and stop calling this function. + Cancel, +} + +impl Default for Action { + fn default() -> Self { + Action::Continue + } +} + +/// Represents any possible change in order to turn one tree into another. +#[derive(Debug, Clone, Copy)] +pub struct Change<'a, 'old, 'new> { + /// The location of the file or directory described by `event`, if tracking was enabled. + /// + /// Otherwise this value is always an empty path. + pub location: &'a BStr, + /// The diff event itself to provide information about what would need to change. + pub event: change::Event<'a, 'old, 'new>, +} + +/// +pub mod change; + +/// Diffing +impl<'repo> Tree<'repo> { + /// Return a platform to see the changes needed to create other trees, for instance. + /// + /// # Performance + /// + /// It's highly recommended to set an object cache to avoid extracting the same object multiple times. + /// By default, similar to `git diff`, rename tracking will be enabled if it is not configured. + #[allow(clippy::result_large_err)] + pub fn changes<'a>(&'a self) -> Result<Platform<'a, 'repo>, rewrites::Error> { + Ok(Platform { + state: Default::default(), + lhs: self, + tracking: None, + rewrites: self.repo.config.diff_renames()?.unwrap_or_default().into(), + }) + } +} + +/// The diffing platform returned by [`Tree::changes()`]. +#[derive(Clone)] +pub struct Platform<'a, 'repo> { + state: gix_diff::tree::State, + lhs: &'a Tree<'repo>, + tracking: Option<Location>, + rewrites: Option<Rewrites>, +} + +/// A structure to capture how to perform rename and copy tracking +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct Rewrites { + /// If `Some(…)`, do also find copies. `None` is the default which does not try to detect copies at all. + /// + /// Note that this is an even more expensive operation than detecting renames as files. + pub copies: Option<rewrites::Copies>, + /// The percentage of similarity needed for files to be considered renamed, defaulting to `Some(0.5)`. + /// This field is similar to `git diff -M50%`. + /// + /// If `None`, files are only considered equal if their content matches 100%. + /// Note that values greater than 1.0 have no different effect than 1.0. + pub percentage: Option<f32>, + /// The amount of files to consider for fuzzy rename or copy tracking. Defaults to 1000, meaning that only 1000*1000 + /// combinations can be tested for fuzzy matches, i.e. the ones that try to find matches by comparing similarity. + /// If 0, there is no limit. + /// + /// If the limit would not be enough to test the entire set of combinations, the algorithm will trade in precision and not + /// run the fuzzy version of identity tests at all. That way results are never partial. + pub limit: usize, +} + +/// +pub mod rewrites; + +/// types to actually perform rename tracking. +pub(crate) mod tracked; + +/// Configuration +impl<'a, 'repo> Platform<'a, 'repo> { + /// Keep track of file-names, which makes the [`location`][Change::location] field usable with the filename of the changed item. + pub fn track_filename(&mut self) -> &mut Self { + self.tracking = Some(Location::FileName); + self + } + + /// Keep track of the entire path of a change, relative to the repository. + /// + /// This makes the [`location`][Change::location] field usable. + pub fn track_path(&mut self) -> &mut Self { + self.tracking = Some(Location::Path); + self + } + + /// Provide `None` to disable rewrite tracking entirely, or pass `Some(<configuration>)` to control to + /// what extend rename and copy tracking is performed. + /// + /// Note that by default, the git configuration determines rewrite tracking and git defaults are used + /// if nothing is configured, which turns rename tracking with 50% similarity on, while not tracking copies at all. + pub fn track_rewrites(&mut self, renames: Option<Rewrites>) -> &mut Self { + self.rewrites = renames; + self + } +} + +/// +pub mod for_each; diff --git a/vendor/gix/src/object/tree/diff/rewrites.rs b/vendor/gix/src/object/tree/diff/rewrites.rs new file mode 100644 index 000000000..304894d15 --- /dev/null +++ b/vendor/gix/src/object/tree/diff/rewrites.rs @@ -0,0 +1,108 @@ +use crate::{ + config::{cache::util::ApplyLeniency, tree::Diff}, + diff::rename::Tracking, + object::tree::diff::Rewrites, +}; + +/// From where to source copies +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum CopySource { + /// Find copies from the set of modified files only. + FromSetOfModifiedFiles, + /// Find copies from the set of changed files, as well as all files known to the source (i.e. previous state) of the tree. + /// + /// This can be an expensive operation as it scales exponentially with the total amount of files in the tree. + FromSetOfModifiedFilesAndSourceTree, +} + +/// How to determine copied files. +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct Copies { + /// The set of files to search when finding the source of copies. + pub source: CopySource, + /// Equivalent to [`Rewrites::percentage`], but used for copy tracking. + /// + /// Useful to have similarity-based rename tracking and cheaper copy tracking, which also is the default + /// as only identity plays a role. + pub percentage: Option<f32>, +} + +impl Default for Copies { + fn default() -> Self { + Copies { + source: CopySource::FromSetOfModifiedFiles, + percentage: Some(0.5), + } + } +} + +/// Information collected while handling rewrites of files which may be tracked. +#[derive(Default, Clone, Copy, Debug, PartialEq)] +pub struct Outcome { + /// The options used to guide the rewrite tracking. Either fully provided by the caller or retrieved from git configuration. + pub options: Rewrites, + /// The amount of similarity checks that have been conducted to find renamed files and potentially copies. + pub num_similarity_checks: usize, + /// Set to the amount of worst-case rename permutations we didn't search as our limit didn't allow it. + pub num_similarity_checks_skipped_for_rename_tracking_due_to_limit: usize, + /// Set to the amount of worst-case copy permutations we didn't search as our limit didn't allow it. + pub num_similarity_checks_skipped_for_copy_tracking_due_to_limit: usize, +} + +/// The error returned by [`Rewrites::try_from_config()]. +#[derive(Debug, thiserror::Error)] +#[allow(missing_docs)] +pub enum Error { + #[error(transparent)] + ConfigDiffRenames(#[from] crate::config::key::GenericError), + #[error(transparent)] + ConfigDiffRenameLimit(#[from] crate::config::unsigned_integer::Error), +} + +/// The default settings for rewrites according to the git configuration defaults. +impl Default for Rewrites { + fn default() -> Self { + Rewrites { + copies: None, + percentage: Some(0.5), + limit: 1000, + } + } +} + +impl Rewrites { + /// Create an instance by reading all relevant information from the `config`uration, while being `lenient` or not. + /// Returns `Ok(None)` if nothing is configured. + /// + /// Note that missing values will be defaulted similar to what git does. + #[allow(clippy::result_large_err)] + pub fn try_from_config(config: &gix_config::File<'static>, lenient: bool) -> Result<Option<Self>, Error> { + let key = "diff.renames"; + let copies = match config + .boolean_by_key(key) + .map(|value| Diff::RENAMES.try_into_renames(value, || config.string_by_key(key))) + .transpose() + .with_leniency(lenient)? + { + Some(renames) => match renames { + Tracking::Disabled => return Ok(None), + Tracking::Renames => None, + Tracking::RenamesAndCopies => Some(Copies::default()), + }, + None => return Ok(None), + }; + + let default = Self::default(); + Ok(Rewrites { + copies, + limit: config + .integer_by_key("diff.renameLimit") + .map(|value| Diff::RENAME_LIMIT.try_into_usize(value)) + .transpose() + .with_leniency(lenient)? + .unwrap_or(default.limit), + ..default + } + .into()) + } +} diff --git a/vendor/gix/src/object/tree/diff/tracked.rs b/vendor/gix/src/object/tree/diff/tracked.rs new file mode 100644 index 000000000..3bbe01624 --- /dev/null +++ b/vendor/gix/src/object/tree/diff/tracked.rs @@ -0,0 +1,491 @@ +use std::ops::Range; + +use gix_diff::tree::visit::Change; +use gix_object::tree::EntryMode; + +use crate::{ + bstr::BStr, + ext::ObjectIdExt, + object::tree::diff::{ + change::DiffLineStats, + rewrites::{CopySource, Outcome}, + Rewrites, + }, + Repository, Tree, +}; + +/// A set of tracked items allows to figure out their relations by figuring out their similarity. +pub struct Item { + /// The underlying raw change + change: Change, + /// That slice into the backing for paths. + location: Range<usize>, + /// If true, this item was already emitted, i.e. seen by the caller. + emitted: bool, +} + +impl Item { + fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr { + backing[self.location.clone()].as_ref() + } + fn entry_mode_compatible(&self, mode: EntryMode) -> bool { + use EntryMode::*; + matches!( + (mode, self.change.entry_mode()), + (Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link) + ) + } + + fn is_source_for_destination_of(&self, kind: visit::Kind, dest_item_mode: EntryMode) -> bool { + self.entry_mode_compatible(dest_item_mode) + && match kind { + visit::Kind::RenameTarget => !self.emitted && matches!(self.change, Change::Deletion { .. }), + visit::Kind::CopyDestination => { + matches!(self.change, Change::Modification { .. }) + } + } + } +} + +pub struct State { + items: Vec<Item>, + path_backing: Vec<u8>, + rewrites: Rewrites, + tracking: Option<gix_diff::tree::recorder::Location>, +} + +pub mod visit { + use crate::{bstr::BStr, object::tree::diff::change::DiffLineStats}; + + pub struct Source<'a> { + pub mode: gix_object::tree::EntryMode, + pub id: gix_hash::ObjectId, + pub kind: Kind, + pub location: &'a BStr, + pub diff: Option<DiffLineStats>, + } + + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + pub enum Kind { + RenameTarget, + CopyDestination, + } + + pub struct Destination<'a> { + pub change: gix_diff::tree::visit::Change, + pub location: &'a BStr, + } +} + +impl State { + pub(crate) fn new(renames: Rewrites, tracking: Option<gix_diff::tree::recorder::Location>) -> Self { + State { + items: vec![], + path_backing: vec![], + rewrites: renames, + tracking, + } + } +} + +/// build state and find matches. +impl State { + /// We may refuse the push if that information isn't needed for what we have to track. + pub fn try_push_change(&mut self, change: Change, location: &BStr) -> Option<Change> { + if !change.entry_mode().is_blob_or_symlink() { + return Some(change); + } + let keep = match (self.rewrites.copies, &change) { + (Some(_find_copies), _) => true, + (None, Change::Modification { .. }) => false, + (None, _) => true, + }; + + if !keep { + return Some(change); + } + + let start = self.path_backing.len(); + self.path_backing.extend_from_slice(location); + self.items.push(Item { + location: start..self.path_backing.len(), + change, + emitted: false, + }); + None + } + + /// Can only be called once effectively as it alters its own state. + /// + /// `cb(destination, source)` is called for each item, either with `Some(source)` if it's + /// the destination of a copy or rename, or with `None` for source if no relation to other + /// items in the tracked set exist. + pub fn emit( + &mut self, + mut cb: impl FnMut(visit::Destination<'_>, Option<visit::Source<'_>>) -> gix_diff::tree::visit::Action, + src_tree: &Tree<'_>, + ) -> Result<Outcome, crate::object::tree::diff::for_each::Error> { + fn by_id_and_location(a: &Item, b: &Item) -> std::cmp::Ordering { + a.change.oid().cmp(b.change.oid()).then_with(|| { + a.location + .start + .cmp(&b.location.start) + .then(a.location.end.cmp(&b.location.end)) + }) + } + self.items.sort_by(by_id_and_location); + + let mut out = Outcome { + options: self.rewrites, + ..Default::default() + }; + out = self.match_pairs_of_kind( + visit::Kind::RenameTarget, + &mut cb, + self.rewrites.percentage, + out, + src_tree.repo, + )?; + + if let Some(copies) = self.rewrites.copies { + out = self.match_pairs_of_kind( + visit::Kind::CopyDestination, + &mut cb, + copies.percentage, + out, + src_tree.repo, + )?; + + match copies.source { + CopySource::FromSetOfModifiedFiles => {} + CopySource::FromSetOfModifiedFilesAndSourceTree => { + src_tree + .traverse() + .breadthfirst(&mut tree_to_events::Delegate::new(self))?; + self.items.sort_by(by_id_and_location); + + out = self.match_pairs_of_kind( + visit::Kind::CopyDestination, + &mut cb, + copies.percentage, + out, + src_tree.repo, + )?; + } + } + } + + self.items + .sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing))); + for item in self.items.drain(..).filter(|item| !item.emitted) { + if cb( + visit::Destination { + location: item.location(&self.path_backing), + change: item.change, + }, + None, + ) == gix_diff::tree::visit::Action::Cancel + { + break; + } + } + Ok(out) + } + + fn match_pairs_of_kind( + &mut self, + kind: visit::Kind, + cb: &mut impl FnMut(visit::Destination<'_>, Option<visit::Source<'_>>) -> gix_diff::tree::visit::Action, + percentage: Option<f32>, + mut out: Outcome, + repo: &Repository, + ) -> Result<Outcome, crate::object::tree::diff::for_each::Error> { + // we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively. + let needs_second_pass = !needs_exact_match(percentage); + if self.match_pairs(cb, None /* by identity */, kind, repo, &mut out)? == gix_diff::tree::visit::Action::Cancel + { + return Ok(out); + } + if needs_second_pass { + let is_limited = if self.rewrites.limit == 0 { + false + } else if let Some(permutations) = permutations_over_limit(&self.items, self.rewrites.limit, kind) { + match kind { + visit::Kind::RenameTarget => { + out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations; + } + visit::Kind::CopyDestination => { + out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations; + } + } + true + } else { + false + }; + if !is_limited { + self.match_pairs(cb, self.rewrites.percentage, kind, repo, &mut out)?; + } + } + Ok(out) + } + + fn match_pairs( + &mut self, + cb: &mut impl FnMut(visit::Destination<'_>, Option<visit::Source<'_>>) -> gix_diff::tree::visit::Action, + percentage: Option<f32>, + kind: visit::Kind, + repo: &Repository, + stats: &mut Outcome, + ) -> Result<gix_diff::tree::visit::Action, crate::object::tree::diff::for_each::Error> { + // TODO(perf): reuse object data and interner state and interned tokens, make these available to `find_match()` + let mut dest_ofs = 0; + while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| { + (!item.emitted && matches!(item.change, Change::Addition { .. })).then_some((idx, item)) + }) { + dest_idx += dest_ofs; + dest_ofs = dest_idx + 1; + let src = + find_match(&self.items, dest, dest_idx, percentage, kind, repo, stats)?.map(|(src_idx, src, diff)| { + let (id, mode) = src.change.oid_and_entry_mode(); + let id = id.to_owned(); + let location = src.location(&self.path_backing); + ( + visit::Source { + mode, + id, + kind, + location, + diff, + }, + src_idx, + ) + }); + if src.is_none() { + continue; + } + let location = dest.location(&self.path_backing); + let change = dest.change.clone(); + let dest = visit::Destination { change, location }; + self.items[dest_idx].emitted = true; + if let Some(src_idx) = src.as_ref().map(|t| t.1) { + self.items[src_idx].emitted = true; + } + if cb(dest, src.map(|t| t.0)) == gix_diff::tree::visit::Action::Cancel { + return Ok(gix_diff::tree::visit::Action::Cancel); + } + } + Ok(gix_diff::tree::visit::Action::Continue) + } +} + +fn permutations_over_limit(items: &[Item], limit: usize, kind: visit::Kind) -> Option<usize> { + let (sources, destinations) = items + .iter() + .filter(|item| match kind { + visit::Kind::RenameTarget => !item.emitted, + visit::Kind::CopyDestination => true, + }) + .fold((0, 0), |(mut src, mut dest), item| { + match item.change { + Change::Addition { .. } => { + dest += 1; + } + Change::Deletion { .. } => { + if kind == visit::Kind::RenameTarget { + src += 1 + } + } + Change::Modification { .. } => { + if kind == visit::Kind::CopyDestination { + src += 1 + } + } + } + (src, dest) + }); + let permutations = sources * destinations; + (permutations > limit * limit).then_some(permutations) +} + +fn needs_exact_match(percentage: Option<f32>) -> bool { + percentage.map_or(true, |p| p >= 1.0) +} + +/// <src_idx, src, possibly diff stat> +type SourceTuple<'a> = (usize, &'a Item, Option<DiffLineStats>); + +/// Find `item` in our set of items ignoring `item_idx` to avoid finding ourselves, by similarity indicated by `percentage`. +/// The latter can be `None` or `Some(x)` where `x>=1` for identity, and anything else for similarity. +/// We also ignore emitted items entirely. +/// Use `kind` to indicate what kind of match we are looking for, which might be deletions matching an `item` addition, or +/// any non-deletion otherwise. +/// Note that we always try to find by identity first even if a percentage is given as it's much faster and may reduce the set +/// of items to be searched. +fn find_match<'a>( + items: &'a [Item], + item: &Item, + item_idx: usize, + percentage: Option<f32>, + kind: visit::Kind, + repo: &Repository, + stats: &mut Outcome, +) -> Result<Option<SourceTuple<'a>>, crate::object::tree::diff::for_each::Error> { + let (item_id, item_mode) = item.change.oid_and_entry_mode(); + if needs_exact_match(percentage) || item_mode == gix_object::tree::EntryMode::Link { + let first_idx = items.partition_point(|a| a.change.oid() < item_id); + let range = match items.get(first_idx..).map(|items| { + let end = items + .iter() + .position(|a| a.change.oid() != item_id) + .map(|idx| first_idx + idx) + .unwrap_or(items.len()); + first_idx..end + }) { + Some(range) => range, + None => return Ok(None), + }; + if range.is_empty() { + return Ok(None); + } + let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| { + src_idx += range.start; + (src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None)) + }); + if let Some(src) = res { + return Ok(Some(src)); + } + } else { + let new = item_id.to_owned().attach(repo).object()?; + let percentage = percentage.expect("it's set to something below 1.0 and we assured this"); + debug_assert!( + item.change.entry_mode().is_blob(), + "symlinks are matched exactly, and trees aren't used here" + ); + let algo = repo.config.diff_algorithm()?; + for (can_idx, src) in items + .iter() + .enumerate() + .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)) + { + let old = src.change.oid().to_owned().attach(repo).object()?; + // TODO: make sure we get attribute handling and binary skips and filters right here. There is crate::object::blob::diff::Platform + // which should have facilities for that one day, but we don't use it because we need newlines in our tokens. + let tokens = gix_diff::blob::intern::InternedInput::new( + gix_diff::blob::sources::byte_lines_with_terminator(&old.data), + gix_diff::blob::sources::byte_lines_with_terminator(&new.data), + ); + let counts = gix_diff::blob::diff( + algo, + &tokens, + gix_diff::blob::sink::Counter::new(diff::Statistics { + removed_bytes: 0, + input: &tokens, + }), + ); + let similarity = (old.data.len() - counts.wrapped) as f32 / old.data.len().max(new.data.len()) as f32; + stats.num_similarity_checks += 1; + if similarity >= percentage { + return Ok(Some(( + can_idx, + src, + DiffLineStats { + removals: counts.removals, + insertions: counts.insertions, + before: tokens.before.len().try_into().expect("interner handles only u32"), + after: tokens.after.len().try_into().expect("interner handles only u32"), + } + .into(), + ))); + } + } + } + Ok(None) +} + +mod diff { + use std::ops::Range; + + pub struct Statistics<'a, 'data> { + pub removed_bytes: usize, + pub input: &'a gix_diff::blob::intern::InternedInput<&'data [u8]>, + } + + impl<'a, 'data> gix_diff::blob::Sink for Statistics<'a, 'data> { + type Out = usize; + + fn process_change(&mut self, before: Range<u32>, _after: Range<u32>) { + self.removed_bytes = self.input.before[before.start as usize..before.end as usize] + .iter() + .map(|token| self.input.interner[*token].len()) + .sum(); + } + + fn finish(self) -> Self::Out { + self.removed_bytes + } + } +} + +mod tree_to_events { + use gix_diff::tree::visit::Change; + use gix_object::tree::EntryRef; + + use crate::bstr::BStr; + + pub struct Delegate<'a> { + parent: &'a mut super::State, + recorder: gix_traverse::tree::Recorder, + } + + impl<'a> Delegate<'a> { + pub fn new(parent: &'a mut super::State) -> Self { + let tracking = parent.tracking.map(|t| match t { + gix_diff::tree::recorder::Location::FileName => gix_traverse::tree::recorder::Location::FileName, + gix_diff::tree::recorder::Location::Path => gix_traverse::tree::recorder::Location::Path, + }); + Self { + parent, + recorder: gix_traverse::tree::Recorder::default().track_location(tracking), + } + } + } + + impl gix_traverse::tree::Visit for Delegate<'_> { + fn pop_front_tracked_path_and_set_current(&mut self) { + self.recorder.pop_front_tracked_path_and_set_current() + } + + fn push_back_tracked_path_component(&mut self, component: &BStr) { + self.recorder.push_back_tracked_path_component(component) + } + + fn push_path_component(&mut self, component: &BStr) { + self.recorder.push_path_component(component) + } + + fn pop_path_component(&mut self) { + self.recorder.pop_path_component(); + } + + fn visit_tree(&mut self, _entry: &EntryRef<'_>) -> gix_traverse::tree::visit::Action { + gix_traverse::tree::visit::Action::Continue + } + + fn visit_nontree(&mut self, entry: &EntryRef<'_>) -> gix_traverse::tree::visit::Action { + if entry.mode.is_blob() { + self.parent.try_push_change( + Change::Modification { + previous_entry_mode: entry.mode, + previous_oid: gix_hash::ObjectId::null(entry.oid.kind()), + entry_mode: entry.mode, + oid: entry.oid.to_owned(), + }, + self.recorder.path(), + ); + // make sure these aren't viable to be emitted anymore. + self.parent.items.last_mut().expect("just pushed").emitted = true; + } + gix_traverse::tree::visit::Action::Continue + } + } +} diff --git a/vendor/gix/src/object/tree/iter.rs b/vendor/gix/src/object/tree/iter.rs new file mode 100644 index 000000000..c841e2574 --- /dev/null +++ b/vendor/gix/src/object/tree/iter.rs @@ -0,0 +1,53 @@ +use super::Tree; +use crate::Repository; + +/// An entry within a tree +pub struct EntryRef<'repo, 'a> { + /// The actual entry ref we are wrapping. + pub inner: gix_object::tree::EntryRef<'a>, + + pub(crate) repo: &'repo Repository, +} + +impl<'repo, 'a> EntryRef<'repo, 'a> { + /// The kind of object to which [`id()`][Self::id()] is pointing. + pub fn mode(&self) -> gix_object::tree::EntryMode { + self.inner.mode + } + + /// The name of the file in the parent tree. + pub fn filename(&self) -> &gix_object::bstr::BStr { + self.inner.filename + } + + /// Return the entries id, connected to the underlying repository. + pub fn id(&self) -> crate::Id<'repo> { + crate::Id::from_id(self.inner.oid, self.repo) + } + + /// Return the entries id, without repository connection. + pub fn oid(&self) -> gix_hash::ObjectId { + self.inner.oid.to_owned() + } +} + +impl<'repo, 'a> std::fmt::Display for EntryRef<'repo, 'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:06o} {:>6} {}\t{}", + self.mode() as u32, + self.mode().as_str(), + self.id().shorten_or_id(), + self.filename() + ) + } +} + +impl<'repo> Tree<'repo> { + /// Return an iterator over tree entries to obtain information about files and directories this tree contains. + pub fn iter(&self) -> impl Iterator<Item = Result<EntryRef<'repo, '_>, gix_object::decode::Error>> { + let repo = self.repo; + gix_object::TreeRefIter::from_bytes(&self.data).map(move |e| e.map(|entry| EntryRef { inner: entry, repo })) + } +} diff --git a/vendor/gix/src/object/tree/mod.rs b/vendor/gix/src/object/tree/mod.rs new file mode 100644 index 000000000..db094bcb9 --- /dev/null +++ b/vendor/gix/src/object/tree/mod.rs @@ -0,0 +1,158 @@ +use gix_hash::ObjectId; +use gix_object::{bstr::BStr, TreeRefIter}; + +use crate::{object::find, Id, Tree}; + +/// Initialization +impl<'repo> Tree<'repo> { + /// Obtain a tree instance by handing in all components that it is made up of. + pub fn from_data(id: impl Into<ObjectId>, data: Vec<u8>, repo: &'repo crate::Repository) -> Self { + Tree { + id: id.into(), + data, + repo, + } + } +} + +/// Access +impl<'repo> Tree<'repo> { + /// Return this tree's identifier. + pub fn id(&self) -> Id<'repo> { + Id::from_id(self.id, self.repo) + } + + // TODO: tests. + /// Follow a sequence of `path` components starting from this instance, and look them up one by one until the last component + /// is looked up and its tree entry is returned. + /// + /// # Performance Notes + /// + /// Searching tree entries is currently done in sequence, which allows to the search to be allocation free. It would be possible + /// to re-use a vector and use a binary search instead, which might be able to improve performance over all. + /// However, a benchmark should be created first to have some data and see which trade-off to choose here. + /// + /// # Why is this consuming? + /// + /// The borrow checker shows pathological behaviour in loops that mutate a buffer, but also want to return from it. + /// Workarounds include keeping an index and doing a separate access to the memory, which seems hard to do here without + /// re-parsing the entries. + pub fn lookup_entry<I, P>(mut self, path: I) -> Result<Option<Entry<'repo>>, find::existing::Error> + where + I: IntoIterator<Item = P>, + P: PartialEq<BStr>, + { + let mut path = path.into_iter().peekable(); + while let Some(component) = path.next() { + match TreeRefIter::from_bytes(&self.data) + .filter_map(Result::ok) + .find(|entry| component.eq(entry.filename)) + { + Some(entry) => { + if path.peek().is_none() { + return Ok(Some(Entry { + inner: entry.into(), + repo: self.repo, + })); + } else { + let next_id = entry.oid.to_owned(); + let repo = self.repo; + drop(self); + self = match repo.find_object(next_id)?.try_into_tree() { + Ok(tree) => tree, + Err(_) => return Ok(None), + }; + } + } + None => return Ok(None), + } + } + Ok(None) + } + + /// Like [`lookup_entry()`][Self::lookup_entry()], but takes a `Path` directly via `relative_path`, a path relative to this tree. + /// + /// # Note + /// + /// If any path component contains illformed UTF-8 and thus can't be converted to bytes on platforms which can't do so natively, + /// the returned component will be empty which makes the lookup fail. + pub fn lookup_entry_by_path( + self, + relative_path: impl AsRef<std::path::Path>, + ) -> Result<Option<Entry<'repo>>, find::existing::Error> { + use crate::bstr::ByteSlice; + self.lookup_entry(relative_path.as_ref().components().map(|c: std::path::Component<'_>| { + gix_path::os_str_into_bstr(c.as_os_str()) + .unwrap_or_else(|_| "".into()) + .as_bytes() + })) + } +} + +/// +pub mod diff; + +/// +pub mod traverse; + +/// +mod iter; +pub use iter::EntryRef; + +impl<'r> std::fmt::Debug for Tree<'r> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Tree({})", self.id) + } +} + +/// An entry in a [`Tree`], similar to an entry in a directory. +#[derive(PartialEq, Debug, Clone)] +pub struct Entry<'repo> { + inner: gix_object::tree::Entry, + repo: &'repo crate::Repository, +} + +mod entry { + use crate::{bstr::BStr, ext::ObjectIdExt, object::tree::Entry}; + + /// Access + impl<'repo> Entry<'repo> { + /// The kind of object to which `oid` is pointing to. + pub fn mode(&self) -> gix_object::tree::EntryMode { + self.inner.mode + } + + /// The name of the file in the parent tree. + pub fn filename(&self) -> &BStr { + self.inner.filename.as_ref() + } + + /// Return the object id of the entry. + pub fn id(&self) -> crate::Id<'repo> { + self.inner.oid.attach(self.repo) + } + + /// Return the object this entry points to. + pub fn object(&self) -> Result<crate::Object<'repo>, crate::object::find::existing::Error> { + self.id().object() + } + + /// Return the plain object id of this entry, without access to the repository. + pub fn oid(&self) -> &gix_hash::oid { + &self.inner.oid + } + + /// Return the plain object id of this entry, without access to the repository. + pub fn object_id(&self) -> gix_hash::ObjectId { + self.inner.oid + } + } + + /// Consuming + impl Entry<'_> { + /// Return the contained object. + pub fn detach(self) -> gix_object::tree::Entry { + self.inner + } + } +} diff --git a/vendor/gix/src/object/tree/traverse.rs b/vendor/gix/src/object/tree/traverse.rs new file mode 100644 index 000000000..974df6b0d --- /dev/null +++ b/vendor/gix/src/object/tree/traverse.rs @@ -0,0 +1,62 @@ +use gix_odb::FindExt; + +use crate::Tree; + +/// Traversal +impl<'repo> Tree<'repo> { + /// Obtain a platform for initiating a variety of traversals. + pub fn traverse(&self) -> Platform<'_, 'repo> { + Platform { + root: self, + breadthfirst: BreadthFirstPresets { root: self }, + } + } +} + +/// An intermediate object to start traversing the parent tree from. +pub struct Platform<'a, 'repo> { + root: &'a Tree<'repo>, + /// Provides easy access to presets for common breadth-first traversal. + pub breadthfirst: BreadthFirstPresets<'a, 'repo>, +} + +/// Presets for common choices in breadth-first traversal. +#[derive(Copy, Clone)] +pub struct BreadthFirstPresets<'a, 'repo> { + root: &'a Tree<'repo>, +} + +impl<'a, 'repo> BreadthFirstPresets<'a, 'repo> { + /// Returns all entries and their file paths, recursively, as reachable from this tree. + pub fn files(&self) -> Result<Vec<gix_traverse::tree::recorder::Entry>, gix_traverse::tree::breadthfirst::Error> { + let mut recorder = gix_traverse::tree::Recorder::default(); + Platform { + root: self.root, + breadthfirst: *self, + } + .breadthfirst(&mut recorder)?; + Ok(recorder.records) + } +} + +impl<'a, 'repo> Platform<'a, 'repo> { + /// Start a breadth-first, recursive traversal using `delegate`, for which a [`Recorder`][gix_traverse::tree::Recorder] can be used to get started. + /// + /// # Note + /// + /// - Results are returned in sort order according to tree-entry sorting rules, one level at a time. + /// - for obtaining the direct children of the tree, use [.iter()][crate::Tree::iter()] instead. + pub fn breadthfirst<V>(&self, delegate: &mut V) -> Result<(), gix_traverse::tree::breadthfirst::Error> + where + V: gix_traverse::tree::Visit, + { + let root = gix_object::TreeRefIter::from_bytes(&self.root.data); + let state = gix_traverse::tree::breadthfirst::State::default(); + gix_traverse::tree::breadthfirst( + root, + state, + |oid, buf| self.root.repo.objects.find_tree_iter(oid, buf).ok(), + delegate, + ) + } +} |