diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:41:41 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:41:41 +0000 |
commit | 10ee2acdd26a7f1298c6f6d6b7af9b469fe29b87 (patch) | |
tree | bdffd5d80c26cf4a7a518281a204be1ace85b4c1 /vendor/similar/src/algorithms/myers.rs | |
parent | Releasing progress-linux version 1.70.0+dfsg1-9~progress7.99u1. (diff) | |
download | rustc-10ee2acdd26a7f1298c6f6d6b7af9b469fe29b87.tar.xz rustc-10ee2acdd26a7f1298c6f6d6b7af9b469fe29b87.zip |
Merging upstream version 1.70.0+dfsg2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/similar/src/algorithms/myers.rs')
-rw-r--r-- | vendor/similar/src/algorithms/myers.rs | 415 |
1 files changed, 415 insertions, 0 deletions
diff --git a/vendor/similar/src/algorithms/myers.rs b/vendor/similar/src/algorithms/myers.rs new file mode 100644 index 000000000..542c7ed75 --- /dev/null +++ b/vendor/similar/src/algorithms/myers.rs @@ -0,0 +1,415 @@ +//! Myers' diff algorithm. +//! +//! * time: `O((N+M)D)` +//! * space `O(N+M)` +//! +//! See [the original article by Eugene W. Myers](http://www.xmailserver.org/diff2.pdf) +//! describing it. +//! +//! The implementation of this algorithm is based on the implementation by +//! Brandon Williams. +//! +//! # Heuristics +//! +//! At present this implementation of Myers' does not implement any more advanced +//! heuristics that would solve some pathological cases. For instane passing two +//! large and completely distinct sequences to the algorithm will make it spin +//! without making reasonable progress. Currently the only protection in the +//! library against this is to pass a deadline to the diffing algorithm. +//! +//! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15). + +use std::ops::{Index, IndexMut, Range}; +use std::time::Instant; + +use crate::algorithms::utils::{common_prefix_len, common_suffix_len, is_empty_range}; +use crate::algorithms::DiffHook; + +/// Myers' diff algorithm. +/// +/// Diff `old`, between indices `old_range` and `new` between indices `new_range`. +pub fn diff<Old, New, D>( + d: &mut D, + old: &Old, + old_range: Range<usize>, + new: &New, + new_range: Range<usize>, +) -> Result<(), D::Error> +where + Old: Index<usize> + ?Sized, + New: Index<usize> + ?Sized, + D: DiffHook, + New::Output: PartialEq<Old::Output>, +{ + diff_deadline(d, old, old_range, new, new_range, None) +} + +/// Myers' diff algorithm with deadline. +/// +/// Diff `old`, between indices `old_range` and `new` between indices `new_range`. +/// +/// This diff is done with an optional deadline that defines the maximal +/// execution time permitted before it bails and falls back to an approximation. +pub fn diff_deadline<Old, New, D>( + d: &mut D, + old: &Old, + old_range: Range<usize>, + new: &New, + new_range: Range<usize>, + deadline: Option<Instant>, +) -> Result<(), D::Error> +where + Old: Index<usize> + ?Sized, + New: Index<usize> + ?Sized, + D: DiffHook, + New::Output: PartialEq<Old::Output>, +{ + let max_d = max_d(old_range.len(), new_range.len()); + let mut vb = V::new(max_d); + let mut vf = V::new(max_d); + conquer( + d, old, old_range, new, new_range, &mut vf, &mut vb, deadline, + )?; + d.finish() +} + +// A D-path is a path which starts at (0,0) that has exactly D non-diagonal +// edges. All D-paths consist of a (D - 1)-path followed by a non-diagonal edge +// and then a possibly empty sequence of diagonal edges called a snake. + +/// `V` contains the endpoints of the furthest reaching `D-paths`. For each +/// recorded endpoint `(x,y)` in diagonal `k`, we only need to retain `x` because +/// `y` can be computed from `x - k`. In other words, `V` is an array of integers +/// where `V[k]` contains the row index of the endpoint of the furthest reaching +/// path in diagonal `k`. +/// +/// We can't use a traditional Vec to represent `V` since we use `k` as an index +/// and it can take on negative values. So instead `V` is represented as a +/// light-weight wrapper around a Vec plus an `offset` which is the maximum value +/// `k` can take on in order to map negative `k`'s back to a value >= 0. +#[derive(Debug)] +struct V { + offset: isize, + v: Vec<usize>, // Look into initializing this to -1 and storing isize +} + +impl V { + fn new(max_d: usize) -> Self { + Self { + offset: max_d as isize, + v: vec![0; 2 * max_d], + } + } + + fn len(&self) -> usize { + self.v.len() + } +} + +impl Index<isize> for V { + type Output = usize; + + fn index(&self, index: isize) -> &Self::Output { + &self.v[(index + self.offset) as usize] + } +} + +impl IndexMut<isize> for V { + fn index_mut(&mut self, index: isize) -> &mut Self::Output { + &mut self.v[(index + self.offset) as usize] + } +} + +fn max_d(len1: usize, len2: usize) -> usize { + // XXX look into reducing the need to have the additional '+ 1' + (len1 + len2 + 1) / 2 + 1 +} + +#[inline(always)] +fn split_at(range: Range<usize>, at: usize) -> (Range<usize>, Range<usize>) { + (range.start..at, at..range.end) +} + +/// A `Snake` is a sequence of diagonal edges in the edit graph. Normally +/// a snake has a start end end point (and it is possible for a snake to have +/// a length of zero, meaning the start and end points are the same) however +/// we do not need the end point which is why it's not implemented here. +/// +/// The divide part of a divide-and-conquer strategy. A D-path has D+1 snakes +/// some of which may be empty. The divide step requires finding the ceil(D/2) + +/// 1 or middle snake of an optimal D-path. The idea for doing so is to +/// simultaneously run the basic algorithm in both the forward and reverse +/// directions until furthest reaching forward and reverse paths starting at +/// opposing corners 'overlap'. +fn find_middle_snake<Old, New>( + old: &Old, + old_range: Range<usize>, + new: &New, + new_range: Range<usize>, + vf: &mut V, + vb: &mut V, + deadline: Option<Instant>, +) -> Option<(usize, usize)> +where + Old: Index<usize> + ?Sized, + New: Index<usize> + ?Sized, + New::Output: PartialEq<Old::Output>, +{ + let n = old_range.len(); + let m = new_range.len(); + + // By Lemma 1 in the paper, the optimal edit script length is odd or even as + // `delta` is odd or even. + let delta = n as isize - m as isize; + let odd = delta & 1 == 1; + + // The initial point at (0, -1) + vf[1] = 0; + // The initial point at (N, M+1) + vb[1] = 0; + + // We only need to explore ceil(D/2) + 1 + let d_max = max_d(n, m); + assert!(vf.len() >= d_max); + assert!(vb.len() >= d_max); + + for d in 0..d_max as isize { + // are we running for too long? + if let Some(deadline) = deadline { + if Instant::now() > deadline { + break; + } + } + + // Forward path + for k in (-d..=d).rev().step_by(2) { + let mut x = if k == -d || (k != d && vf[k - 1] < vf[k + 1]) { + vf[k + 1] + } else { + vf[k - 1] + 1 + }; + let y = (x as isize - k) as usize; + + // The coordinate of the start of a snake + let (x0, y0) = (x, y); + // While these sequences are identical, keep moving through the + // graph with no cost + if x < old_range.len() && y < new_range.len() { + let advance = common_prefix_len( + old, + old_range.start + x..old_range.end, + new, + new_range.start + y..new_range.end, + ); + x += advance; + } + + // This is the new best x value + vf[k] = x; + + // Only check for connections from the forward search when N - M is + // odd and when there is a reciprocal k line coming from the other + // direction. + if odd && (k - delta).abs() <= (d - 1) { + // TODO optimize this so we don't have to compare against n + if vf[k] + vb[-(k - delta)] >= n { + // Return the snake + return Some((x0 + old_range.start, y0 + new_range.start)); + } + } + } + + // Backward path + for k in (-d..=d).rev().step_by(2) { + let mut x = if k == -d || (k != d && vb[k - 1] < vb[k + 1]) { + vb[k + 1] + } else { + vb[k - 1] + 1 + }; + let mut y = (x as isize - k) as usize; + + // The coordinate of the start of a snake + if x < n && y < m { + let advance = common_suffix_len( + old, + old_range.start..old_range.start + n - x, + new, + new_range.start..new_range.start + m - y, + ); + x += advance; + y += advance; + } + + // This is the new best x value + vb[k] = x; + + if !odd && (k - delta).abs() <= d { + // TODO optimize this so we don't have to compare against n + if vb[k] + vf[-(k - delta)] >= n { + // Return the snake + return Some((n - x + old_range.start, m - y + new_range.start)); + } + } + } + + // TODO: Maybe there's an opportunity to optimize and bail early? + } + + // deadline reached + None +} + +#[allow(clippy::too_many_arguments)] +fn conquer<Old, New, D>( + d: &mut D, + old: &Old, + mut old_range: Range<usize>, + new: &New, + mut new_range: Range<usize>, + vf: &mut V, + vb: &mut V, + deadline: Option<Instant>, +) -> Result<(), D::Error> +where + Old: Index<usize> + ?Sized, + New: Index<usize> + ?Sized, + D: DiffHook, + New::Output: PartialEq<Old::Output>, +{ + // Check for common prefix + let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone()); + if common_prefix_len > 0 { + d.equal(old_range.start, new_range.start, common_prefix_len)?; + } + old_range.start += common_prefix_len; + new_range.start += common_prefix_len; + + // Check for common suffix + let common_suffix_len = common_suffix_len(old, old_range.clone(), new, new_range.clone()); + let common_suffix = ( + old_range.end - common_suffix_len, + new_range.end - common_suffix_len, + ); + old_range.end -= common_suffix_len; + new_range.end -= common_suffix_len; + + if is_empty_range(&old_range) && is_empty_range(&new_range) { + // Do nothing + } else if is_empty_range(&new_range) { + d.delete(old_range.start, old_range.len(), new_range.start)?; + } else if is_empty_range(&old_range) { + d.insert(old_range.start, new_range.start, new_range.len())?; + } else if let Some((x_start, y_start)) = find_middle_snake( + old, + old_range.clone(), + new, + new_range.clone(), + vf, + vb, + deadline, + ) { + let (old_a, old_b) = split_at(old_range, x_start); + let (new_a, new_b) = split_at(new_range, y_start); + conquer(d, old, old_a, new, new_a, vf, vb, deadline)?; + conquer(d, old, old_b, new, new_b, vf, vb, deadline)?; + } else { + d.delete( + old_range.start, + old_range.end - old_range.start, + new_range.start, + )?; + d.insert( + old_range.start, + new_range.start, + new_range.end - new_range.start, + )?; + } + + if common_suffix_len > 0 { + d.equal(common_suffix.0, common_suffix.1, common_suffix_len)?; + } + + Ok(()) +} + +#[test] +fn test_find_middle_snake() { + let a = &b"ABCABBA"[..]; + let b = &b"CBABAC"[..]; + let max_d = max_d(a.len(), b.len()); + let mut vf = V::new(max_d); + let mut vb = V::new(max_d); + let (x_start, y_start) = + find_middle_snake(a, 0..a.len(), b, 0..b.len(), &mut vf, &mut vb, None).unwrap(); + assert_eq!(x_start, 4); + assert_eq!(y_start, 1); +} + +#[test] +fn test_diff() { + let a: &[usize] = &[0, 1, 2, 3, 4]; + let b: &[usize] = &[0, 1, 2, 9, 4]; + + let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); + diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); + insta::assert_debug_snapshot!(d.into_inner().ops()); +} + +#[test] +fn test_contiguous() { + let a: &[usize] = &[0, 1, 2, 3, 4, 4, 4, 5]; + let b: &[usize] = &[0, 1, 2, 8, 9, 4, 4, 7]; + + let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); + diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); + insta::assert_debug_snapshot!(d.into_inner().ops()); +} + +#[test] +fn test_pat() { + let a: &[usize] = &[0, 1, 3, 4, 5]; + let b: &[usize] = &[0, 1, 4, 5, 8, 9]; + + let mut d = crate::algorithms::Capture::new(); + diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); + insta::assert_debug_snapshot!(d.ops()); +} + +#[test] +fn test_deadline_reached() { + use std::ops::Index; + use std::time::Duration; + + let a = (0..100).collect::<Vec<_>>(); + let mut b = (0..100).collect::<Vec<_>>(); + b[10] = 99; + b[50] = 99; + b[25] = 99; + + struct SlowIndex<'a>(&'a [usize]); + + impl<'a> Index<usize> for SlowIndex<'a> { + type Output = usize; + + fn index(&self, index: usize) -> &Self::Output { + std::thread::sleep(Duration::from_millis(1)); + &self.0[index] + } + } + + let slow_a = SlowIndex(&a); + let slow_b = SlowIndex(&b); + + // don't give it enough time to do anything interesting + let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); + diff_deadline( + &mut d, + &slow_a, + 0..a.len(), + &slow_b, + 0..b.len(), + Some(Instant::now() + Duration::from_millis(50)), + ) + .unwrap(); + insta::assert_debug_snapshot!(d.into_inner().ops()); +} |