//! Myers' diff algorithm. //! //! * time: `O((N+M)D)` //! * space `O(N+M)` //! //! See [the original article by Eugene W. Myers](http://www.xmailserver.org/diff2.pdf) //! describing it. //! //! The implementation of this algorithm is based on the implementation by //! Brandon Williams. //! //! # Heuristics //! //! At present this implementation of Myers' does not implement any more advanced //! heuristics that would solve some pathological cases. For instance passing two //! large and completely distinct sequences to the algorithm will make it spin //! without making reasonable progress. Currently the only protection in the //! library against this is to pass a deadline to the diffing algorithm. //! //! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15). use std::ops::{Index, IndexMut, Range}; use std::time::Instant; use crate::algorithms::utils::{common_prefix_len, common_suffix_len, is_empty_range}; use crate::algorithms::DiffHook; /// Myers' diff algorithm. /// /// Diff `old`, between indices `old_range` and `new` between indices `new_range`. pub fn diff( d: &mut D, old: &Old, old_range: Range, new: &New, new_range: Range, ) -> Result<(), D::Error> where Old: Index + ?Sized, New: Index + ?Sized, D: DiffHook, New::Output: PartialEq, { diff_deadline(d, old, old_range, new, new_range, None) } /// Myers' diff algorithm with deadline. /// /// Diff `old`, between indices `old_range` and `new` between indices `new_range`. /// /// This diff is done with an optional deadline that defines the maximal /// execution time permitted before it bails and falls back to an approximation. pub fn diff_deadline( d: &mut D, old: &Old, old_range: Range, new: &New, new_range: Range, deadline: Option, ) -> Result<(), D::Error> where Old: Index + ?Sized, New: Index + ?Sized, D: DiffHook, New::Output: PartialEq, { let max_d = max_d(old_range.len(), new_range.len()); let mut vb = V::new(max_d); let mut vf = V::new(max_d); conquer( d, old, old_range, new, new_range, &mut vf, &mut vb, deadline, )?; d.finish() } // A D-path is a path which starts at (0,0) that has exactly D non-diagonal // edges. All D-paths consist of a (D - 1)-path followed by a non-diagonal edge // and then a possibly empty sequence of diagonal edges called a snake. /// `V` contains the endpoints of the furthest reaching `D-paths`. For each /// recorded endpoint `(x,y)` in diagonal `k`, we only need to retain `x` because /// `y` can be computed from `x - k`. In other words, `V` is an array of integers /// where `V[k]` contains the row index of the endpoint of the furthest reaching /// path in diagonal `k`. /// /// We can't use a traditional Vec to represent `V` since we use `k` as an index /// and it can take on negative values. So instead `V` is represented as a /// light-weight wrapper around a Vec plus an `offset` which is the maximum value /// `k` can take on in order to map negative `k`'s back to a value >= 0. #[derive(Debug)] struct V { offset: isize, v: Vec, // Look into initializing this to -1 and storing isize } impl V { fn new(max_d: usize) -> Self { Self { offset: max_d as isize, v: vec![0; 2 * max_d], } } fn len(&self) -> usize { self.v.len() } } impl Index for V { type Output = usize; fn index(&self, index: isize) -> &Self::Output { &self.v[(index + self.offset) as usize] } } impl IndexMut for V { fn index_mut(&mut self, index: isize) -> &mut Self::Output { &mut self.v[(index + self.offset) as usize] } } fn max_d(len1: usize, len2: usize) -> usize { // XXX look into reducing the need to have the additional '+ 1' (len1 + len2 + 1) / 2 + 1 } #[inline(always)] fn split_at(range: Range, at: usize) -> (Range, Range) { (range.start..at, at..range.end) } /// A `Snake` is a sequence of diagonal edges in the edit graph. Normally /// a snake has a start end end point (and it is possible for a snake to have /// a length of zero, meaning the start and end points are the same) however /// we do not need the end point which is why it's not implemented here. /// /// The divide part of a divide-and-conquer strategy. A D-path has D+1 snakes /// some of which may be empty. The divide step requires finding the ceil(D/2) + /// 1 or middle snake of an optimal D-path. The idea for doing so is to /// simultaneously run the basic algorithm in both the forward and reverse /// directions until furthest reaching forward and reverse paths starting at /// opposing corners 'overlap'. fn find_middle_snake( old: &Old, old_range: Range, new: &New, new_range: Range, vf: &mut V, vb: &mut V, deadline: Option, ) -> Option<(usize, usize)> where Old: Index + ?Sized, New: Index + ?Sized, New::Output: PartialEq, { let n = old_range.len(); let m = new_range.len(); // By Lemma 1 in the paper, the optimal edit script length is odd or even as // `delta` is odd or even. let delta = n as isize - m as isize; let odd = delta & 1 == 1; // The initial point at (0, -1) vf[1] = 0; // The initial point at (N, M+1) vb[1] = 0; // We only need to explore ceil(D/2) + 1 let d_max = max_d(n, m); assert!(vf.len() >= d_max); assert!(vb.len() >= d_max); for d in 0..d_max as isize { // are we running for too long? if let Some(deadline) = deadline { if Instant::now() > deadline { break; } } // Forward path for k in (-d..=d).rev().step_by(2) { let mut x = if k == -d || (k != d && vf[k - 1] < vf[k + 1]) { vf[k + 1] } else { vf[k - 1] + 1 }; let y = (x as isize - k) as usize; // The coordinate of the start of a snake let (x0, y0) = (x, y); // While these sequences are identical, keep moving through the // graph with no cost if x < old_range.len() && y < new_range.len() { let advance = common_prefix_len( old, old_range.start + x..old_range.end, new, new_range.start + y..new_range.end, ); x += advance; } // This is the new best x value vf[k] = x; // Only check for connections from the forward search when N - M is // odd and when there is a reciprocal k line coming from the other // direction. if odd && (k - delta).abs() <= (d - 1) { // TODO optimize this so we don't have to compare against n if vf[k] + vb[-(k - delta)] >= n { // Return the snake return Some((x0 + old_range.start, y0 + new_range.start)); } } } // Backward path for k in (-d..=d).rev().step_by(2) { let mut x = if k == -d || (k != d && vb[k - 1] < vb[k + 1]) { vb[k + 1] } else { vb[k - 1] + 1 }; let mut y = (x as isize - k) as usize; // The coordinate of the start of a snake if x < n && y < m { let advance = common_suffix_len( old, old_range.start..old_range.start + n - x, new, new_range.start..new_range.start + m - y, ); x += advance; y += advance; } // This is the new best x value vb[k] = x; if !odd && (k - delta).abs() <= d { // TODO optimize this so we don't have to compare against n if vb[k] + vf[-(k - delta)] >= n { // Return the snake return Some((n - x + old_range.start, m - y + new_range.start)); } } } // TODO: Maybe there's an opportunity to optimize and bail early? } // deadline reached None } #[allow(clippy::too_many_arguments)] fn conquer( d: &mut D, old: &Old, mut old_range: Range, new: &New, mut new_range: Range, vf: &mut V, vb: &mut V, deadline: Option, ) -> Result<(), D::Error> where Old: Index + ?Sized, New: Index + ?Sized, D: DiffHook, New::Output: PartialEq, { // Check for common prefix let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone()); if common_prefix_len > 0 { d.equal(old_range.start, new_range.start, common_prefix_len)?; } old_range.start += common_prefix_len; new_range.start += common_prefix_len; // Check for common suffix let common_suffix_len = common_suffix_len(old, old_range.clone(), new, new_range.clone()); let common_suffix = ( old_range.end - common_suffix_len, new_range.end - common_suffix_len, ); old_range.end -= common_suffix_len; new_range.end -= common_suffix_len; if is_empty_range(&old_range) && is_empty_range(&new_range) { // Do nothing } else if is_empty_range(&new_range) { d.delete(old_range.start, old_range.len(), new_range.start)?; } else if is_empty_range(&old_range) { d.insert(old_range.start, new_range.start, new_range.len())?; } else if let Some((x_start, y_start)) = find_middle_snake( old, old_range.clone(), new, new_range.clone(), vf, vb, deadline, ) { let (old_a, old_b) = split_at(old_range, x_start); let (new_a, new_b) = split_at(new_range, y_start); conquer(d, old, old_a, new, new_a, vf, vb, deadline)?; conquer(d, old, old_b, new, new_b, vf, vb, deadline)?; } else { d.delete( old_range.start, old_range.end - old_range.start, new_range.start, )?; d.insert( old_range.start, new_range.start, new_range.end - new_range.start, )?; } if common_suffix_len > 0 { d.equal(common_suffix.0, common_suffix.1, common_suffix_len)?; } Ok(()) } #[test] fn test_find_middle_snake() { let a = &b"ABCABBA"[..]; let b = &b"CBABAC"[..]; let max_d = max_d(a.len(), b.len()); let mut vf = V::new(max_d); let mut vb = V::new(max_d); let (x_start, y_start) = find_middle_snake(a, 0..a.len(), b, 0..b.len(), &mut vf, &mut vb, None).unwrap(); assert_eq!(x_start, 4); assert_eq!(y_start, 1); } #[test] fn test_diff() { let a: &[usize] = &[0, 1, 2, 3, 4]; let b: &[usize] = &[0, 1, 2, 9, 4]; let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); insta::assert_debug_snapshot!(d.into_inner().ops()); } #[test] fn test_contiguous() { let a: &[usize] = &[0, 1, 2, 3, 4, 4, 4, 5]; let b: &[usize] = &[0, 1, 2, 8, 9, 4, 4, 7]; let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); insta::assert_debug_snapshot!(d.into_inner().ops()); } #[test] fn test_pat() { let a: &[usize] = &[0, 1, 3, 4, 5]; let b: &[usize] = &[0, 1, 4, 5, 8, 9]; let mut d = crate::algorithms::Capture::new(); diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); insta::assert_debug_snapshot!(d.ops()); } #[test] fn test_deadline_reached() { use std::ops::Index; use std::time::Duration; let a = (0..100).collect::>(); let mut b = (0..100).collect::>(); b[10] = 99; b[50] = 99; b[25] = 99; struct SlowIndex<'a>(&'a [usize]); impl<'a> Index for SlowIndex<'a> { type Output = usize; fn index(&self, index: usize) -> &Self::Output { std::thread::sleep(Duration::from_millis(1)); &self.0[index] } } let slow_a = SlowIndex(&a); let slow_b = SlowIndex(&b); // don't give it enough time to do anything interesting let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); diff_deadline( &mut d, &slow_a, 0..a.len(), &slow_b, 0..b.len(), Some(Instant::now() + Duration::from_millis(50)), ) .unwrap(); insta::assert_debug_snapshot!(d.into_inner().ops()); }