//! ### Deviation //! //! Note that the algorithm implemented here is in many ways different from what `git` does. //! //! - it's less sophisticated and doesn't use any ranking of candidates. Instead, it picks the first possible match. //! - the set used for copy-detection is probably smaller by default. use std::ops::Range; use bstr::BStr; use gix_object::tree::{EntryKind, EntryMode}; use crate::{ blob::{platform::prepare_diff::Operation, DiffLineStats, ResourceKind}, rewrites::{CopySource, Outcome, Tracker}, Rewrites, }; /// The kind of a change. #[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)] pub enum ChangeKind { /// The change represents the *deletion* of an item. Deletion, /// The change represents the *modification* of an item. Modification, /// The change represents the *addition* of an item. Addition, } /// A trait providing all functionality to abstract over the concept of a change, as seen by the [`Tracker`]. pub trait Change: Clone { /// Return the hash of this change for identification. /// /// Note that this is the id of the object as stored in `git`, i.e. it must have gone through workspace /// conversions. fn id(&self) -> &gix_hash::oid; /// Return the kind of this change. fn kind(&self) -> ChangeKind; /// Return more information about the kind of entry affected by this change. fn entry_mode(&self) -> EntryMode; /// Return the id of the change along with its mode. fn id_and_entry_mode(&self) -> (&gix_hash::oid, EntryMode); } /// A set of tracked items allows to figure out their relations by figuring out their similarity. pub(crate) struct Item { /// The underlying raw change change: T, /// That slice into the backing for paths. path: Range, /// If true, this item was already emitted, i.e. seen by the caller. emitted: bool, } impl Item { fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr { backing[self.path.clone()].as_ref() } fn entry_mode_compatible(&self, mode: EntryMode) -> bool { use EntryKind::*; matches!( (mode.kind(), self.change.entry_mode().kind()), (Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link) ) } fn is_source_for_destination_of(&self, kind: visit::SourceKind, dest_item_mode: EntryMode) -> bool { self.entry_mode_compatible(dest_item_mode) && match kind { visit::SourceKind::Rename => !self.emitted && matches!(self.change.kind(), ChangeKind::Deletion), visit::SourceKind::Copy => { matches!(self.change.kind(), ChangeKind::Modification) } } } } /// A module with types used in the user-callback in [Tracker::emit()](crate::rewrites::Tracker::emit()). pub mod visit { use bstr::BStr; use gix_object::tree::EntryMode; use crate::blob::DiffLineStats; /// The source of a rewrite, rename or copy. #[derive(Debug, Clone, PartialEq, PartialOrd)] pub struct Source<'a> { /// The kind of entry. pub entry_mode: EntryMode, /// The hash of the state of the source as seen in the object database. pub id: gix_hash::ObjectId, /// Further specify what kind of source this is. pub kind: SourceKind, /// The repository-relative location of this entry. pub location: &'a BStr, /// If this is a rewrite, indicate how many lines would need to change to turn this source into the destination. pub diff: Option, } /// Further identify the kind of [Source]. #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] pub enum SourceKind { /// This is the source of an entry that was renamed, as `source` was renamed to `destination`. Rename, /// This is the source of a copy, as `source` was copied into `destination`. Copy, } /// A change along with a location. #[derive(Clone)] pub struct Destination<'a, T: Clone> { /// The change at the given `location`. pub change: T, /// The repository-relative location of this destination. pub location: &'a BStr, } } /// pub mod emit { /// The error returned by [Tracker::emit()](super::Tracker::emit()). #[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { #[error("Could not find blob for similarity checking")] FindExistingBlob(#[from] gix_object::find::existing_object::Error), #[error("Could not obtain exhaustive item set to use as possible sources for copy detection")] GetItemsForExhaustiveCopyDetection(#[source] Box), #[error(transparent)] SetResource(#[from] crate::blob::platform::set_resource::Error), #[error(transparent)] PrepareDiff(#[from] crate::blob::platform::prepare_diff::Error), } } /// Lifecycle impl Tracker { /// Create a new instance with `rewrites` configuration. pub fn new(rewrites: Rewrites) -> Self { Tracker { items: vec![], path_backing: vec![], rewrites, } } } /// build state and find matches. impl Tracker { /// We may refuse the push if that information isn't needed for what we have to track. pub fn try_push_change(&mut self, change: T, location: &BStr) -> Option { if !change.entry_mode().is_blob_or_symlink() { return Some(change); } let keep = match (self.rewrites.copies, change.kind()) { (Some(_find_copies), _) => true, (None, ChangeKind::Modification { .. }) => false, (None, _) => true, }; if !keep { return Some(change); } let start = self.path_backing.len(); self.path_backing.extend_from_slice(location); self.items.push(Item { path: start..self.path_backing.len(), change, emitted: false, }); None } /// Can only be called once effectively as it alters its own state to assure each item is only emitted once. /// /// `cb(destination, source)` is called for each item, either with `Some(source)` if it's /// the destination of a copy or rename, or with `None` for source if no relation to other /// items in the tracked set exist, which is like saying 'no rename or rewrite or copy' happened. /// /// `objects` is used to access blob data for similarity checks if required and is taken directly from the object database. /// Worktree filters and text conversions will be applied afterwards automatically. Note that object-caching *should not* /// be enabled as caching is implemented by `diff_cache`, after all, the blob that's actually diffed is going /// through conversion steps. /// /// `diff_cache` is a way to retain a cache of resources that are prepared for rapid diffing, and it also controls /// the diff-algorithm (provided no user-algorithm is set). /// Note that we control a few options of `diff_cache` to assure it will ignore external commands. /// Note that we do not control how the `diff_cache` converts resources, it's left to the caller to decide /// if it should look at what's stored in `git`, or in the working tree, along with all diff-specific conversions. /// /// `push_source_tree(push_fn: push(change, location))` is a function that is called when the entire tree of the source /// should be added as modifications by calling `push` repeatedly to use for perfect copy tracking. Note that `push` /// will panic if `change` is not a modification, and it's valid to not call `push` at all. pub fn emit( &mut self, mut cb: impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, diff_cache: &mut crate::blob::Platform, objects: &impl gix_object::FindObjectOrHeader, mut push_source_tree: PushSourceTreeFn, ) -> Result where PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>, E: std::error::Error + Send + Sync + 'static, { diff_cache.options.skip_internal_diff_if_external_is_configured = false; fn by_id_and_location(a: &Item, b: &Item) -> std::cmp::Ordering { a.change .id() .cmp(b.change.id()) .then_with(|| a.path.start.cmp(&b.path.start).then(a.path.end.cmp(&b.path.end))) } self.items.sort_by(by_id_and_location); let mut out = Outcome { options: self.rewrites, ..Default::default() }; self.match_pairs_of_kind( visit::SourceKind::Rename, &mut cb, self.rewrites.percentage, &mut out, diff_cache, objects, )?; if let Some(copies) = self.rewrites.copies { self.match_pairs_of_kind( visit::SourceKind::Copy, &mut cb, copies.percentage, &mut out, diff_cache, objects, )?; match copies.source { CopySource::FromSetOfModifiedFiles => {} CopySource::FromSetOfModifiedFilesAndAllSources => { push_source_tree(&mut |change, location| { assert!( self.try_push_change(change, location).is_none(), "we must accept every change" ); // make sure these aren't viable to be emitted anymore. self.items.last_mut().expect("just pushed").emitted = true; }) .map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?; self.items.sort_by(by_id_and_location); self.match_pairs_of_kind( visit::SourceKind::Copy, &mut cb, copies.percentage, &mut out, diff_cache, objects, )?; } } } self.items .sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing))); for item in self.items.drain(..).filter(|item| !item.emitted) { if cb( visit::Destination { location: item.location(&self.path_backing), change: item.change, }, None, ) == crate::tree::visit::Action::Cancel { break; } } Ok(out) } } impl Tracker { fn match_pairs_of_kind( &mut self, kind: visit::SourceKind, cb: &mut impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, percentage: Option, out: &mut Outcome, diff_cache: &mut crate::blob::Platform, objects: &impl gix_object::FindObjectOrHeader, ) -> Result<(), emit::Error> { // we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively. let needs_second_pass = !needs_exact_match(percentage); if self.match_pairs(cb, None /* by identity */, kind, out, diff_cache, objects)? == crate::tree::visit::Action::Cancel { return Ok(()); } if needs_second_pass { let is_limited = if self.rewrites.limit == 0 { false } else { let (num_src, num_dst) = estimate_involved_items(self.items.iter().map(|item| (item.emitted, item.change.kind())), kind); let permutations = num_src * num_dst; if permutations > self.rewrites.limit { match kind { visit::SourceKind::Rename => { out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations; } visit::SourceKind::Copy => { out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations; } } true } else { false } }; if !is_limited { self.match_pairs(cb, percentage, kind, out, diff_cache, objects)?; } } Ok(()) } fn match_pairs( &mut self, cb: &mut impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, percentage: Option, kind: visit::SourceKind, stats: &mut Outcome, diff_cache: &mut crate::blob::Platform, objects: &impl gix_object::FindObjectOrHeader, ) -> Result { let mut dest_ofs = 0; while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| { (!item.emitted && matches!(item.change.kind(), ChangeKind::Addition)).then_some((idx, item)) }) { dest_idx += dest_ofs; dest_ofs = dest_idx + 1; let src = find_match( &self.items, dest, dest_idx, percentage, kind, stats, objects, diff_cache, &self.path_backing, )? .map(|(src_idx, src, diff)| { let (id, entry_mode) = src.change.id_and_entry_mode(); let id = id.to_owned(); let location = src.location(&self.path_backing); ( visit::Source { entry_mode, id, kind, location, diff, }, src_idx, ) }); if src.is_none() { continue; } let location = dest.location(&self.path_backing); let change = dest.change.clone(); let dest = visit::Destination { change, location }; self.items[dest_idx].emitted = true; if let Some(src_idx) = src.as_ref().map(|t| t.1) { self.items[src_idx].emitted = true; } if cb(dest, src.map(|t| t.0)) == crate::tree::visit::Action::Cancel { return Ok(crate::tree::visit::Action::Cancel); } } Ok(crate::tree::visit::Action::Continue) } } /// Returns the amount of viable sources and destinations for `items` as eligible for the given `kind` of operation. fn estimate_involved_items( items: impl IntoIterator, kind: visit::SourceKind, ) -> (usize, usize) { items .into_iter() .filter(|(emitted, _)| match kind { visit::SourceKind::Rename => !*emitted, visit::SourceKind::Copy => true, }) .fold((0, 0), |(mut src, mut dest), (emitted, change_kind)| { match change_kind { ChangeKind::Addition => { if kind == visit::SourceKind::Rename || !emitted { dest += 1; } } ChangeKind::Deletion => { if kind == visit::SourceKind::Rename { src += 1 } } ChangeKind::Modification => { if kind == visit::SourceKind::Copy { src += 1 } } } (src, dest) }) } fn needs_exact_match(percentage: Option) -> bool { percentage.map_or(true, |p| p >= 1.0) } /// <`src_idx`, src, possibly diff stat> type SourceTuple<'a, T> = (usize, &'a Item, Option); /// Find `item` in our set of items ignoring `item_idx` to avoid finding ourselves, by similarity indicated by `percentage`. /// The latter can be `None` or `Some(x)` where `x>=1` for identity, and anything else for similarity. /// We also ignore emitted items entirely. /// Use `kind` to indicate what kind of match we are looking for, which might be deletions matching an `item` addition, or /// any non-deletion otherwise. /// Note that we always try to find by identity first even if a percentage is given as it's much faster and may reduce the set /// of items to be searched. #[allow(clippy::too_many_arguments)] fn find_match<'a, T: Change>( items: &'a [Item], item: &Item, item_idx: usize, percentage: Option, kind: visit::SourceKind, stats: &mut Outcome, objects: &impl gix_object::FindObjectOrHeader, diff_cache: &mut crate::blob::Platform, path_backing: &[u8], ) -> Result>, emit::Error> { let (item_id, item_mode) = item.change.id_and_entry_mode(); if needs_exact_match(percentage) || item_mode.is_link() { let first_idx = items.partition_point(|a| a.change.id() < item_id); let range = match items.get(first_idx..).map(|items| { let end = items .iter() .position(|a| a.change.id() != item_id) .map_or(items.len(), |idx| first_idx + idx); first_idx..end }) { Some(range) => range, None => return Ok(None), }; if range.is_empty() { return Ok(None); } let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| { src_idx += range.start; (src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None)) }); if let Some(src) = res { return Ok(Some(src)); } } else { let mut has_new = false; let percentage = percentage.expect("it's set to something below 1.0 and we assured this"); debug_assert_eq!( item.change.entry_mode().kind(), EntryKind::Blob, "symlinks are matched exactly, and trees aren't used here" ); for (can_idx, src) in items .iter() .enumerate() .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)) { if !has_new { diff_cache.set_resource( item_id.to_owned(), item_mode.kind(), item.location(path_backing), ResourceKind::NewOrDestination, objects, )?; has_new = true; } let (src_id, src_mode) = src.change.id_and_entry_mode(); diff_cache.set_resource( src_id.to_owned(), src_mode.kind(), src.location(path_backing), ResourceKind::OldOrSource, objects, )?; let prep = diff_cache.prepare_diff()?; stats.num_similarity_checks += 1; match prep.operation { Operation::InternalDiff { algorithm } => { let tokens = crate::blob::intern::InternedInput::new(prep.old.intern_source(), prep.new.intern_source()); let counts = crate::blob::diff( algorithm, &tokens, crate::blob::sink::Counter::new(diff::Statistics { removed_bytes: 0, input: &tokens, }), ); let old_data_len = prep.old.data.as_slice().unwrap_or_default().len(); let new_data_len = prep.new.data.as_slice().unwrap_or_default().len(); let similarity = (old_data_len - counts.wrapped) as f32 / old_data_len.max(new_data_len) as f32; if similarity >= percentage { return Ok(Some(( can_idx, src, DiffLineStats { removals: counts.removals, insertions: counts.insertions, before: tokens.before.len().try_into().expect("interner handles only u32"), after: tokens.after.len().try_into().expect("interner handles only u32"), similarity, } .into(), ))); } } Operation::ExternalCommand { .. } => { unreachable!("we have disabled this possibility with an option") } Operation::SourceOrDestinationIsBinary => { // TODO: figure out if git does more here } }; } } Ok(None) } mod diff { use std::ops::Range; pub struct Statistics<'a, 'data> { pub removed_bytes: usize, pub input: &'a crate::blob::intern::InternedInput<&'data [u8]>, } impl<'a, 'data> crate::blob::Sink for Statistics<'a, 'data> { type Out = usize; fn process_change(&mut self, before: Range, _after: Range) { self.removed_bytes = self.input.before[before.start as usize..before.end as usize] .iter() .map(|token| self.input.interner[*token].len()) .sum(); } fn finish(self) -> Self::Out { self.removed_bytes } } } #[cfg(test)] mod estimate_involved_items { use super::estimate_involved_items; use crate::rewrites::tracker::{visit::SourceKind, ChangeKind}; #[test] fn renames_count_unemitted_as_sources_and_destinations() { let items = [ (false, ChangeKind::Addition), (true, ChangeKind::Deletion), (true, ChangeKind::Deletion), ]; assert_eq!( estimate_involved_items(items, SourceKind::Rename), (0, 1), "here we only have one eligible source, hence nothing to do" ); assert_eq!( estimate_involved_items(items.into_iter().map(|t| (false, t.1)), SourceKind::Rename), (2, 1), "now we have more possibilities as renames count un-emitted deletions as source" ); } #[test] fn copies_do_not_count_additions_as_sources() { let items = [ (false, ChangeKind::Addition), (true, ChangeKind::Addition), (true, ChangeKind::Deletion), ]; assert_eq!( estimate_involved_items(items, SourceKind::Copy), (0, 1), "one addition as source, the other isn't counted as it's emitted, nor is it considered a copy-source.\ deletions don't count" ); } #[test] fn copies_count_modifications_as_sources() { let items = [ (false, ChangeKind::Addition), (true, ChangeKind::Modification), (false, ChangeKind::Modification), ]; assert_eq!( estimate_involved_items(items, SourceKind::Copy), (2, 1), "any modifications is a valid source, emitted or not" ); } }