From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:02:58 +0200 Subject: Adding upstream version 1.64.0+dfsg1. Signed-off-by: Daniel Baumann --- vendor/dissimilar/src/find.rs | 232 ++++++++++ vendor/dissimilar/src/lib.rs | 932 +++++++++++++++++++++++++++++++++++++++++ vendor/dissimilar/src/range.rs | 148 +++++++ vendor/dissimilar/src/tests.rs | 580 +++++++++++++++++++++++++ 4 files changed, 1892 insertions(+) create mode 100644 vendor/dissimilar/src/find.rs create mode 100644 vendor/dissimilar/src/lib.rs create mode 100644 vendor/dissimilar/src/range.rs create mode 100644 vendor/dissimilar/src/tests.rs (limited to 'vendor/dissimilar/src') diff --git a/vendor/dissimilar/src/find.rs b/vendor/dissimilar/src/find.rs new file mode 100644 index 000000000..90ca2c6c5 --- /dev/null +++ b/vendor/dissimilar/src/find.rs @@ -0,0 +1,232 @@ +// The strstr implementation in this file is extracted from the Rust standard +// library's str::find. The algorithm works for arbitrary &[u8] haystack and +// needle but is only exposed by the standard library on UTF-8 strings. +// +// https://github.com/rust-lang/rust/blob/1.40.0/src/libcore/str/pattern.rs +// +// --- +// +// This is the Two-Way search algorithm, which was introduced in the paper: +// Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675. +// +// Here's some background information. +// +// A *word* is a string of symbols. The *length* of a word should be a familiar +// notion, and here we denote it for any word x by |x|. (We also allow for the +// possibility of the *empty word*, a word of length zero.) +// +// If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be +// a *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == +// x[i+p]. For example, both 1 and 2 are periods for the string "aa". As another +// example, the only period of the string "abcd" is 4. 
+// +// We denote by period(x) the *smallest* period of x (provided that x is +// non-empty). This is always well-defined since every non-empty word x has at +// least one period, |x|. We sometimes call this *the period* of x. +// +// If u, v and x are words such that x = uv, where uv is the concatenation of u +// and v, then we say that (u, v) is a *factorization* of x. +// +// Let (u, v) be a factorization for a word x. Then if w is a non-empty word +// such that both of the following hold +// +// - either w is a suffix of u or u is a suffix of w +// - either w is a prefix of v or v is a prefix of w +// +// then w is said to be a *repetition* for the factorization (u, v). +// +// Just to unpack this, there are four possibilities here. Let w = "abc". Then +// we might have: +// +// - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde") +// - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab") +// - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi") +// - u is a suffix of w and v is a prefix of w. ex: ("bc", "a") +// +// Note that the word vu is a repetition for any factorization (u,v) of x = uv, +// so every factorization has at least one repetition. +// +// If x is a string and (u, v) is a factorization for x, then a *local period* +// for (u, v) is an integer r such that there is some word w such that |w| = r +// and w is a repetition for (u, v). +// +// We denote by local_period(u, v) the smallest local period of (u, v). We +// sometimes call this *the local period* of (u, v). Provided that x = uv is +// non-empty, this is well-defined (because each non-empty word has at least one +// factorization, as noted above). +// +// It can be proven that the following is an equivalent definition of a local +// period for a factorization (u, v): any positive integer r such that x[i] == +// x[i+r] for all i such that |u| - r <= i <= |u| - 1 and such that both x[i] +// and x[i+r] are defined. (i.e., i > 0 and i + r < |x|). 
//
// Using the above reformulation, it is easy to prove that
//
//     1 <= local_period(u, v) <= period(uv)
//
// A factorization (u, v) of x such that local_period(u,v) = period(x) is called
// a *critical factorization*.
//
// The algorithm hinges on the following theorem, which is stated without proof:
//
// **Critical Factorization Theorem** Any word x has at least one critical
// factorization (u, v) such that |u| < period(x).
//
// The purpose of maximal_suffix is to find such a critical factorization.
//
// If the period is short, compute another factorization x = u' v' to use for
// reverse search, chosen instead so that |v'| < period(x).

use std::cmp;
use std::usize;

/// Searches `haystack` for `needle` using the Two-Way algorithm described in
/// the comment at the top of this file.
///
/// Returns the byte offset at which a match of `needle` starts, or `None` when
/// the haystack is exhausted without a match. Candidate positions are only ever
/// advanced, so the reported match is the leftmost one.
///
/// # Panics
///
/// Panics if `needle` is empty.
pub fn find(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    assert!(!needle.is_empty());

    // crit_pos: critical factorization index
    let (crit_pos_false, period_false) = maximal_suffix(needle, false);
    let (crit_pos_true, period_true) = maximal_suffix(needle, true);
    let (crit_pos, mut period) = if crit_pos_false > crit_pos_true {
        (crit_pos_false, period_false)
    } else {
        (crit_pos_true, period_true)
    };

    // Byteset is an extension (not part of the two way algorithm); it is a
    // 64-bit "fingerprint" where each set bit j corresponds to a (byte & 63) ==
    // j present in the needle.
    let byteset;
    // Index into needle before which we have already matched.
    let mut memory;

    // A particularly readable explanation of what's going on here can be found
    // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
    // see the code for "Algorithm CP" on p. 323.
    //
    // What's going on is we have some critical factorization (u, v) of the
    // needle, and we want to determine whether u is a suffix of &v[..period].
    // If it is, we use "Algorithm CP1". Otherwise we use "Algorithm CP2", which
    // is optimized for when the period of the needle is large.
    let long_period = needle[..crit_pos] != needle[period..period + crit_pos];
    if long_period {
        // Long period case -- we have an approximation to the actual period,
        // and don't use memorization.
        //
        // Approximate the period by lower bound max(|u|, |v|) + 1.
        period = cmp::max(crit_pos, needle.len() - crit_pos) + 1;
        byteset = byteset_create(needle);
        // Dummy value to signify that the period is long.
        memory = usize::MAX;
    } else {
        // Short period case -- the period is exact.
        byteset = byteset_create(&needle[..period]);
        memory = 0;
    }

    // One of the main ideas of Two-Way is that we factorize the needle into two
    // halves, (u, v), and begin trying to find v in the haystack by scanning
    // left to right. If v matches, we try to match u by scanning right to left.
    // How far we can jump when we encounter a mismatch is all based on the fact
    // that (u, v) is a critical factorization for the needle.
    let mut position = 0;
    let needle_last = needle.len() - 1;
    'search: loop {
        // Check that we have room to search in. position + needle_last cannot
        // overflow if we assume slices are bounded by isize's range.
        // The `?` is the only exit for "no match": it returns None as soon as
        // the candidate window would extend past the end of the haystack.
        let tail_byte = *haystack.get(position + needle_last)?;

        // Quickly skip by large portions unrelated to our substring.
        if !byteset_contains(byteset, tail_byte) {
            position += needle.len();
            if !long_period {
                memory = 0;
            }
            continue 'search;
        }

        // See if the right part of the needle matches.
        let start = if long_period {
            crit_pos
        } else {
            cmp::max(crit_pos, memory)
        };
        for i in start..needle.len() {
            if needle[i] != haystack[position + i] {
                position += i - crit_pos + 1;
                if !long_period {
                    memory = 0;
                }
                continue 'search;
            }
        }

        // See if the left part of the needle matches.
        let start = if long_period { 0 } else { memory };
        for i in (start..crit_pos).rev() {
            if needle[i] != haystack[position + i] {
                position += period;
                if !long_period {
                    memory = needle.len() - period;
                }
                continue 'search;
            }
        }

        // We have found a match!
        return Some(position);
    }
}

// Build the 64-bit fingerprint of `bytes`: bit j is set when some byte b in
// `bytes` has (b & 0x3f) == j. Distinct bytes that agree in their low six bits
// share a bit, so the set can report false positives but never false negatives.
fn byteset_create(bytes: &[u8]) -> u64 {
    bytes.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a)
}

// Test whether the bit for `byte` (its low six bits) is set in `byteset`.
fn byteset_contains(byteset: u64, byte: u8) -> bool {
    (byteset >> ((byte & 0x3f) as usize)) & 1 != 0
}

// Compute the maximal suffix of `arr`.
//
// The maximal suffix is a possible critical factorization (u, v) of `arr`.
//
// Returns (`i`, `p`) where `i` is the starting index of v and `p` is the
// period of v.
//
// `order_greater` determines if lexical order is `<` or `>`. Both
// orders must be computed -- the ordering with the largest `i` gives
// a critical factorization.
//
// For long period cases, the resulting period is not exact (it is too short).
fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) {
    let mut left = 0; // Corresponds to i in the paper
    let mut right = 1; // Corresponds to j in the paper
    let mut offset = 0; // Corresponds to k in the paper, but starting at 0
                        // to match 0-based indexing.
    let mut period = 1; // Corresponds to p in the paper

    while let Some(&a) = arr.get(right + offset) {
        // `left` will be inbounds when `right` is.
        let b = arr[left + offset];
        if (a < b && !order_greater) || (a > b && order_greater) {
            // Suffix is smaller, period is entire prefix so far.
            right += offset + 1;
            offset = 0;
            period = right - left;
        } else if a == b {
            // Advance through repetition of the current period.
            if offset + 1 == period {
                right += offset + 1;
                offset = 0;
            } else {
                offset += 1;
            }
        } else {
            // Suffix is larger, start over from current location.
            left = right;
            right += 1;
            offset = 0;
            period = 1;
        }
    }
    (left, period)
}

// ---------------------------------------------------------------------------
// End of vendor/dissimilar/src/find.rs.
//
// The patch text that shared this line range with the code above is the header
// and the opening doc-comment lines of vendor/dissimilar/src/lib.rs. It is
// quoted verbatim below so that none of the patch content is dropped.
// ---------------------------------------------------------------------------
// diff --git a/vendor/dissimilar/src/lib.rs b/vendor/dissimilar/src/lib.rs
// new file mode 100644
// index 000000000..8ce9faad3
// --- /dev/null
// +++ b/vendor/dissimilar/src/lib.rs
// @@ -0,0 +1,932 @@
// +//! [![github]](https://github.com/dtolnay/dissimilar) [![crates-io]](https://crates.io/crates/dissimilar) [![docs-rs]](https://docs.rs/dissimilar)
// +//!
// +//! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github
// +//! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust
// +//! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logoColor=white&logo=data:image/svg+xml;base64,PHN2ZyByb2xlPSJpbWciIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgdmlld0JveD0iMCAwIDUxMiA1MTIiPjxwYXRoIGZpbGw9IiNmNWY1ZjUiIGQ9Ik00ODguNiAyNTAuMkwzOTIgMjE0VjEwNS41YzAtMTUtOS4zLTI4LjQtMjMuNC0zMy43bC0xMDAtMzcuNWMtOC4xLTMuMS0xNy4xLTMuMS0yNS4zIDBsLTEwMCAzNy41Yy0xNC4xIDUuMy0yMy40IDE4LjctMjMuNCAzMy43VjIxNGwtOTYuNiAzNi4yQzkuMyAyNTUuNSAwIDI2OC45IDAgMjgzLjlWMzk0YzAgMTMuNiA3LjcgMjYuMSAxOS45IDMyLjJsMTAwIDUwYzEwLjEgNS4xIDIyLjEgNS4xIDMyLjIgMGwxMDMuOS01MiAxMDMuOSA1MmMxMC4xIDUuMSAyMi4xIDUuMSAzMi4yIDBsMTAwLTUwYzEyLjItNi4xIDE5LjktMTguNiAxOS45LTMyLjJWMjgzLjljMC0xNS05LjMtMjguNC0yMy40LTMzLjd6TTM1OCAyMTQuOGwtODUgMzEuOXYtNjguMmw4NS0zN3Y3My4zek0xNTQgMTA0LjFsMTAyLTM4LjIgMTAyIDM4LjJ2LjZsLTEwMiA0MS40LTEwMi00MS40di0uNnptODQgMjkxLjFsLTg1IDQyLjV2LTc5LjFsODUtMzguOHY3NS40em0wLTExMmwtMTAyIDQxLjQtMTAyLTQxLjR2LS42bDEwMi0zOC4yIDEwMiAzOC4ydi42em0yNDAgMTEybC04NSA0Mi41di03OS4xbDg1LTM4Ljh2NzUuNHptMC0xMTJsLTEwMiA0MS40LTEwMi00MS40di0uNmwxMDItMzguMiAxMDIgMzguMnYuNnoiPjwvcGF0aD48L3N2Zz4K
// +//!
// +//!
+//! +//! ## Diff library with semantic cleanup, based on Google's diff-match-patch +//! +//! This library is a port of the Diff component of [Diff Match Patch] to Rust. +//! The diff implementation is based on [Myers' diff algorithm] but includes +//! some [semantic cleanups] to increase human readability by factoring out +//! commonalities which are likely to be coincidental. +//! +//! Diff Match Patch was originally built in 2006 to power Google Docs. +//! +//! # Interface +//! +//! Here is the entire API of the Rust implementation. It operates on borrowed +//! strings and the return value of the diff algorithm is a vector of chunks +//! pointing into slices of those input strings. +//! +//! ``` +//! pub enum Chunk<'a> { +//! Equal(&'a str), +//! Delete(&'a str), +//! Insert(&'a str), +//! } +//! +//! # const IGNORE: &str = stringify! { +//! pub fn diff(text1: &str, text2: &str) -> Vec<Chunk>; +//! # }; +//! ``` +//! +//! [Diff Match Patch]: https://github.com/google/diff-match-patch +//! [Myers' diff algorithm]: https://neil.fraser.name/writing/diff/myers.pdf +//! 
[semantic cleanups]: https://neil.fraser.name/writing/diff/ + +#![doc(html_root_url = "https://docs.rs/dissimilar/1.0.4")] +#![allow( + clippy::blocks_in_if_conditions, + clippy::cast_possible_wrap, + clippy::cast_sign_loss, + clippy::cloned_instead_of_copied, // https://github.com/rust-lang/rust-clippy/issues/7127 + clippy::collapsible_else_if, + clippy::comparison_chain, + clippy::match_same_arms, + clippy::module_name_repetitions, + clippy::must_use_candidate, + clippy::new_without_default, + clippy::octal_escapes, + clippy::shadow_unrelated, + clippy::similar_names, + clippy::too_many_lines, + clippy::unseparated_literal_suffix, + unused_parens, // false positive on Some(&(mut diff)) pattern +)] + +mod find; +mod range; + +#[cfg(test)] +mod tests; + +use crate::range::{bytes, str, Range}; +use std::cmp; +use std::collections::VecDeque; +use std::fmt::{self, Debug}; + +#[derive(Copy, Clone, PartialEq, Eq)] +pub enum Chunk<'a> { + Equal(&'a str), + Delete(&'a str), + Insert(&'a str), +} + +#[derive(Copy, Clone)] +enum Diff<'a, 'b> { + Equal(Range<'a>, Range<'b>), + Delete(Range<'a>), + Insert(Range<'b>), +} + +impl<'tmp, 'a: 'tmp, 'b: 'tmp> Diff<'a, 'b> { + fn text(&self) -> Range<'tmp> { + match *self { + Diff::Equal(range, _) | Diff::Delete(range) | Diff::Insert(range) => range, + } + } + + fn grow_left(&mut self, increment: usize) { + self.for_each(|range| { + range.offset -= increment; + range.len += increment; + }); + } + + fn grow_right(&mut self, increment: usize) { + self.for_each(|range| range.len += increment); + } + + fn shift_left(&mut self, increment: usize) { + self.for_each(|range| range.offset -= increment); + } + + fn shift_right(&mut self, increment: usize) { + self.for_each(|range| range.offset += increment); + } + + fn for_each(&mut self, f: impl Fn(&mut Range)) { + match self { + Diff::Equal(range1, range2) => { + f(range1); + f(range2); + } + Diff::Delete(range) => f(range), + Diff::Insert(range) => f(range), + } + } +} + +pub fn 
diff<'a>(text1: &'a str, text2: &'a str) -> Vec> { + let text1 = Range::new(text1, ..); + let text2 = Range::new(text2, ..); + let mut solution = main(text1, text2); + cleanup_char_boundary(&mut solution); + cleanup_semantic(&mut solution); + cleanup_merge(&mut solution); + solution.diffs.into_iter().map(Chunk::from).collect() +} + +struct Solution<'a, 'b> { + text1: Range<'a>, + text2: Range<'b>, + diffs: Vec>, + utf8: bool, +} + +fn main<'a, 'b>(mut text1: Range<'a>, mut text2: Range<'b>) -> Solution<'a, 'b> { + let whole1 = text1; + let whole2 = text2; + + // Trim off common prefix. + let common_prefix_len = common_prefix_bytes(text1, text2); + let common_prefix = Diff::Equal( + text1.substring(..common_prefix_len), + text2.substring(..common_prefix_len), + ); + text1 = text1.substring(common_prefix_len..); + text2 = text2.substring(common_prefix_len..); + + // Trim off common suffix. + let common_suffix_len = common_suffix_bytes(text1, text2); + let common_suffix = Diff::Equal( + text1.substring(text1.len - common_suffix_len..), + text2.substring(text2.len - common_suffix_len..), + ); + text1 = text1.substring(..text1.len - common_suffix_len); + text2 = text2.substring(..text2.len - common_suffix_len); + + // Compute the diff on the middle block. + let mut solution = Solution { + text1: whole1, + text2: whole2, + diffs: compute(text1, text2), + utf8: false, + }; + + // Restore the prefix and suffix. + if common_prefix_len > 0 { + solution.diffs.insert(0, common_prefix); + } + if common_suffix_len > 0 { + solution.diffs.push(common_suffix); + } + + cleanup_merge(&mut solution); + + solution +} + +// Find the differences between two texts. Assumes that the texts do not have +// any common prefix or suffix. 
+fn compute<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec> { + match (text1.is_empty(), text2.is_empty()) { + (true, true) => return Vec::new(), + (true, false) => return vec![Diff::Insert(text2)], + (false, true) => return vec![Diff::Delete(text1)], + (false, false) => {} + } + + // Check for entire shorter text inside the longer text. + if text1.len > text2.len { + if let Some(i) = text1.find(text2) { + return vec![ + Diff::Delete(text1.substring(..i)), + Diff::Equal(text1.substring(i..i + text2.len), text2), + Diff::Delete(text1.substring(i + text2.len..)), + ]; + } + } else { + if let Some(i) = text2.find(text1) { + return vec![ + Diff::Insert(text2.substring(..i)), + Diff::Equal(text1, text2.substring(i..i + text1.len)), + Diff::Insert(text2.substring(i + text1.len..)), + ]; + } + } + + if text1.len == 1 || text2.len == 1 { + // Single character string. + // After the previous check, the character can't be an equality. + return vec![Diff::Delete(text1), Diff::Insert(text2)]; + } + + bisect(text1, text2) +} + +// Find the 'middle snake' of a diff, split the problem in two and return the +// recursively constructed diff. +// +// See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. +fn bisect<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec> { + let max_d = (text1.len + text2.len + 1) / 2; + let v_offset = max_d; + let v_len = 2 * max_d; + let mut v1 = vec![-1isize; v_len]; + let mut v2 = vec![-1isize; v_len]; + v1[v_offset + 1] = 0; + v2[v_offset + 1] = 0; + let delta = text1.len as isize - text2.len as isize; + // If the total number of characters is odd, then the front path will + // collide with the reverse path. + let front = delta % 2 != 0; + // Offsets for start and end of k loop. + // Prevents mapping of space beyond the grid. + let mut k1start = 0; + let mut k1end = 0; + let mut k2start = 0; + let mut k2end = 0; + for d in 0..max_d as isize { + // Walk the front path one step. 
+ let mut k1 = -d + k1start; + while k1 <= d - k1end { + let k1_offset = (v_offset as isize + k1) as usize; + let mut x1 = if k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1]) { + v1[k1_offset + 1] + } else { + v1[k1_offset - 1] + 1 + } as usize; + let mut y1 = (x1 as isize - k1) as usize; + if let (Some(s1), Some(s2)) = (text1.get(x1..), text2.get(y1..)) { + let advance = common_prefix_bytes(s1, s2); + x1 += advance; + y1 += advance; + } + v1[k1_offset] = x1 as isize; + if x1 > text1.len { + // Ran off the right of the graph. + k1end += 2; + } else if y1 > text2.len { + // Ran off the bottom of the graph. + k1start += 2; + } else if front { + let k2_offset = v_offset as isize + delta - k1; + if k2_offset >= 0 && k2_offset < v_len as isize && v2[k2_offset as usize] != -1 { + // Mirror x2 onto top-left coordinate system. + let x2 = text1.len as isize - v2[k2_offset as usize]; + if x1 as isize >= x2 { + // Overlap detected. + return bisect_split(text1, text2, x1, y1); + } + } + } + k1 += 2; + } + + // Walk the reverse path one step. + let mut k2 = -d + k2start; + while k2 <= d - k2end { + let k2_offset = (v_offset as isize + k2) as usize; + let mut x2 = if k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1]) { + v2[k2_offset + 1] + } else { + v2[k2_offset - 1] + 1 + } as usize; + let mut y2 = (x2 as isize - k2) as usize; + if x2 < text1.len && y2 < text2.len { + let advance = common_suffix_bytes( + text1.substring(..text1.len - x2), + text2.substring(..text2.len - y2), + ); + x2 += advance; + y2 += advance; + } + v2[k2_offset] = x2 as isize; + if x2 > text1.len { + // Ran off the left of the graph. + k2end += 2; + } else if y2 > text2.len { + // Ran off the top of the graph. 
+ k2start += 2; + } else if !front { + let k1_offset = v_offset as isize + delta - k2; + if k1_offset >= 0 && k1_offset < v_len as isize && v1[k1_offset as usize] != -1 { + let x1 = v1[k1_offset as usize] as usize; + let y1 = v_offset + x1 - k1_offset as usize; + // Mirror x2 onto top-left coordinate system. + x2 = text1.len - x2; + if x1 >= x2 { + // Overlap detected. + return bisect_split(text1, text2, x1, y1); + } + } + } + k2 += 2; + } + } + // Number of diffs equals number of characters, no commonality at all. + vec![Diff::Delete(text1), Diff::Insert(text2)] +} + +// Given the location of the 'middle snake', split the diff in two parts and +// recurse. +fn bisect_split<'a, 'b>( + text1: Range<'a>, + text2: Range<'b>, + x: usize, + y: usize, +) -> Vec> { + let (text1a, text1b) = text1.split_at(x); + let (text2a, text2b) = text2.split_at(y); + + // Compute both diffs serially. + let mut diffs = main(text1a, text2a).diffs; + diffs.extend(main(text1b, text2b).diffs); + + diffs +} + +// Determine the length of the common prefix of two strings. +fn common_prefix(text1: Range, text2: Range) -> usize { + for ((i, ch1), ch2) in text1.char_indices().zip(text2.chars()) { + if ch1 != ch2 { + return i; + } + } + cmp::min(text1.len, text2.len) +} + +// Determine the length of the common suffix of two strings. 
+fn common_suffix(text1: Range, text2: Range) -> usize { + for ((i, ch1), ch2) in text1.char_indices().rev().zip(text2.chars().rev()) { + if ch1 != ch2 { + return text1.len - i - ch1.len_utf8(); + } + } + cmp::min(text1.len, text2.len) +} + +fn common_prefix_bytes(text1: Range, text2: Range) -> usize { + for (i, (b1, b2)) in text1.bytes().zip(text2.bytes()).enumerate() { + if b1 != b2 { + return i; + } + } + cmp::min(text1.len, text2.len) +} + +fn common_suffix_bytes(text1: Range, text2: Range) -> usize { + for (i, (b1, b2)) in text1.bytes().rev().zip(text2.bytes().rev()).enumerate() { + if b1 != b2 { + return i; + } + } + cmp::min(text1.len, text2.len) +} + +// Determine if the suffix of one string is the prefix of another. +// +// Returns the number of characters common to the end of the first string and +// the start of the second string. +fn common_overlap(mut text1: Range, mut text2: Range) -> usize { + // Eliminate the null case. + if text1.is_empty() || text2.is_empty() { + return 0; + } + // Truncate the longer string. + if text1.len > text2.len { + text1 = text1.substring(text1.len - text2.len..); + } else if text1.len < text2.len { + text2 = text2.substring(..text1.len); + } + // Quick check for the worst case. + if bytes(text1) == bytes(text2) { + return text1.len; + } + + // Start by looking for a single character match + // and increase length until no match is found. 
+ // Performance analysis: https://neil.fraser.name/news/2010/11/04/ + let mut best = 0; + let mut length = 1; + loop { + let pattern = text1.substring(text1.len - length..); + let found = match text2.find(pattern) { + Some(found) => found, + None => return best, + }; + length += found; + if found == 0 + || bytes(text1.substring(text1.len - length..)) == bytes(text2.substring(..length)) + { + best = length; + length += 1; + } + } +} + +fn cleanup_char_boundary(solution: &mut Solution) { + fn boundary_down(doc: &str, pos: usize) -> usize { + let mut adjust = 0; + while !doc.is_char_boundary(pos - adjust) { + adjust += 1; + } + adjust + } + + fn boundary_up(doc: &str, pos: usize) -> usize { + let mut adjust = 0; + while !doc.is_char_boundary(pos + adjust) { + adjust += 1; + } + adjust + } + + fn skip_overlap<'a>(prev: &Range<'a>, range: &mut Range<'a>) { + let prev_end = prev.offset + prev.len; + if prev_end > range.offset { + let delta = cmp::min(prev_end - range.offset, range.len); + range.offset += delta; + range.len -= delta; + } + } + + let mut read = 0; + let mut retain = 0; + let mut last_delete = Range::empty(); + let mut last_insert = Range::empty(); + while let Some(&(mut diff)) = solution.diffs.get(read) { + read += 1; + match &mut diff { + Diff::Equal(range1, range2) => { + let adjust = boundary_up(range1.doc, range1.offset); + // If the whole range is sub-character, skip it. 
+ if range1.len <= adjust { + continue; + } + range1.offset += adjust; + range1.len -= adjust; + range2.offset += adjust; + range2.len -= adjust; + let adjust = boundary_down(range1.doc, range1.offset + range1.len); + range1.len -= adjust; + range2.len -= adjust; + last_delete = Range::empty(); + last_insert = Range::empty(); + } + Diff::Delete(range) => { + skip_overlap(&last_delete, range); + if range.len == 0 { + continue; + } + let adjust = boundary_down(range.doc, range.offset); + range.offset -= adjust; + range.len += adjust; + let adjust = boundary_up(range.doc, range.offset + range.len); + range.len += adjust; + last_delete = *range; + } + Diff::Insert(range) => { + skip_overlap(&last_insert, range); + if range.len == 0 { + continue; + } + let adjust = boundary_down(range.doc, range.offset); + range.offset -= adjust; + range.len += adjust; + let adjust = boundary_up(range.doc, range.offset + range.len); + range.len += adjust; + last_insert = *range; + } + } + solution.diffs[retain] = diff; + retain += 1; + } + + solution.diffs.truncate(retain); + solution.utf8 = true; +} + +// Reduce the number of edits by eliminating semantically trivial equalities. +fn cleanup_semantic(solution: &mut Solution) { + let mut diffs = &mut solution.diffs; + if diffs.is_empty() { + return; + } + + let mut changes = false; + let mut equalities = VecDeque::new(); // Double-ended queue of equalities. + let mut last_equality = None; // Always equal to equalities.peek().text + let mut pointer = 0; + // Number of characters that changed prior to the equality. + let mut len_insertions1 = 0; + let mut len_deletions1 = 0; + // Number of characters that changed after the equality. 
+ let mut len_insertions2 = 0; + let mut len_deletions2 = 0; + while let Some(&this_diff) = diffs.get(pointer) { + match this_diff { + Diff::Equal(text1, text2) => { + equalities.push_back(pointer); + len_insertions1 = len_insertions2; + len_deletions1 = len_deletions2; + len_insertions2 = 0; + len_deletions2 = 0; + last_equality = Some((text1, text2)); + pointer += 1; + continue; + } + Diff::Delete(text) => len_deletions2 += text.len, + Diff::Insert(text) => len_insertions2 += text.len, + } + // Eliminate an equality that is smaller or equal to the edits on both + // sides of it. + if last_equality.map_or(false, |(last_equality, _)| { + last_equality.len <= cmp::max(len_insertions1, len_deletions1) + && last_equality.len <= cmp::max(len_insertions2, len_deletions2) + }) { + // Jump back to offending equality. + pointer = equalities.pop_back().unwrap(); + + // Replace equality with a delete. + diffs[pointer] = Diff::Delete(last_equality.unwrap().0); + // Insert a corresponding insert. + diffs.insert(pointer + 1, Diff::Insert(last_equality.unwrap().1)); + + len_insertions1 = 0; // Reset the counters. + len_insertions2 = 0; + len_deletions1 = 0; + len_deletions2 = 0; + last_equality = None; + changes = true; + + // Throw away the previous equality (it needs to be reevaluated). + equalities.pop_back(); + if let Some(back) = equalities.back() { + // There is a safe equality we can fall back to. + pointer = *back; + } else { + // There are no previous equalities, jump back to the start. + pointer = 0; + continue; + } + } + pointer += 1; + } + + // Normalize the diff. + if changes { + cleanup_merge(solution); + } + cleanup_semantic_lossless(solution); + diffs = &mut solution.diffs; + + // Find any overlaps between deletions and insertions. + // e.g: abcxxxxxxdef + // -> abcxxxdef + // e.g: xxxabcdefxxx + // -> defxxxabc + // Only extract an overlap if it is as big as the edit ahead or behind it. 
+ let mut pointer = 1; + while let Some(&this_diff) = diffs.get(pointer) { + let prev_diff = diffs[pointer - 1]; + if let (Diff::Delete(deletion), Diff::Insert(insertion)) = (prev_diff, this_diff) { + let overlap_len1 = common_overlap(deletion, insertion); + let overlap_len2 = common_overlap(insertion, deletion); + let overlap_min = cmp::min(deletion.len, insertion.len); + if overlap_len1 >= overlap_len2 && 2 * overlap_len1 >= overlap_min { + // Overlap found. Insert an equality and trim the surrounding edits. + diffs.insert( + pointer, + Diff::Equal( + deletion.substring(deletion.len - overlap_len1..deletion.len), + insertion.substring(..overlap_len1), + ), + ); + diffs[pointer - 1] = + Diff::Delete(deletion.substring(..deletion.len - overlap_len1)); + diffs[pointer + 1] = Diff::Insert(insertion.substring(overlap_len1..)); + } else if overlap_len1 < overlap_len2 && 2 * overlap_len2 >= overlap_min { + // Reverse overlap found. + // Insert an equality and swap and trim the surrounding edits. + diffs.insert( + pointer, + Diff::Equal( + deletion.substring(..overlap_len2), + insertion.substring(insertion.len - overlap_len2..insertion.len), + ), + ); + diffs[pointer - 1] = + Diff::Insert(insertion.substring(..insertion.len - overlap_len2)); + diffs[pointer + 1] = Diff::Delete(deletion.substring(overlap_len2..)); + } + pointer += 1; + } + pointer += 1; + } +} + +// Look for single edits surrounded on both sides by equalities which can be +// shifted sideways to align the edit to a word boundary. +// +// e.g: The cat came. -> The cat came. +fn cleanup_semantic_lossless(solution: &mut Solution) { + let diffs = &mut solution.diffs; + let mut pointer = 1; + while let Some(&next_diff) = diffs.get(pointer + 1) { + let prev_diff = diffs[pointer - 1]; + if let ( + Diff::Equal(mut prev_equal1, mut prev_equal2), + Diff::Equal(mut next_equal1, mut next_equal2), + ) = (prev_diff, next_diff) + { + // This is a single edit surrounded by equalities. 
+ let mut edit = diffs[pointer]; + + // First, shift the edit as far left as possible. + let common_offset = common_suffix(prev_equal1, edit.text()); + let original_prev_len = prev_equal1.len; + prev_equal1.len -= common_offset; + prev_equal2.len -= common_offset; + edit.shift_left(common_offset); + next_equal1.offset -= common_offset; + next_equal1.len += common_offset; + next_equal2.offset -= common_offset; + next_equal2.len += common_offset; + + // Second, step character by character right, looking for the best fit. + let mut best_prev_equal = (prev_equal1, prev_equal2); + let mut best_edit = edit; + let mut best_next_equal = (next_equal1, next_equal2); + let mut best_score = cleanup_semantic_score(prev_equal1, edit.text()) + + cleanup_semantic_score(edit.text(), next_equal1); + while !edit.text().is_empty() + && !next_equal1.is_empty() + && edit.text().chars().next().unwrap() == next_equal1.chars().next().unwrap() + { + let increment = edit.text().chars().next().unwrap().len_utf8(); + prev_equal1.len += increment; + prev_equal2.len += increment; + edit.shift_right(increment); + next_equal1.offset += increment; + next_equal1.len -= increment; + next_equal2.offset += increment; + next_equal2.len -= increment; + let score = cleanup_semantic_score(prev_equal1, edit.text()) + + cleanup_semantic_score(edit.text(), next_equal1); + // The >= encourages trailing rather than leading whitespace on edits. + if score >= best_score { + best_score = score; + best_prev_equal = (prev_equal1, prev_equal2); + best_edit = edit; + best_next_equal = (next_equal1, next_equal2); + } + } + + if original_prev_len != best_prev_equal.0.len { + // We have an improvement, save it back to the diff. 
+ if best_next_equal.0.is_empty() { + diffs.remove(pointer + 1); + } else { + diffs[pointer + 1] = Diff::Equal(best_next_equal.0, best_next_equal.1); + } + diffs[pointer] = best_edit; + if best_prev_equal.0.is_empty() { + diffs.remove(pointer - 1); + pointer -= 1; + } else { + diffs[pointer - 1] = Diff::Equal(best_prev_equal.0, best_prev_equal.1); + } + } + } + pointer += 1; + } +} + +// Given two strings, compute a score representing whether the internal boundary +// falls on logical boundaries. +// +// Scores range from 6 (best) to 0 (worst). +fn cleanup_semantic_score(one: Range, two: Range) -> usize { + if one.is_empty() || two.is_empty() { + // Edges are the best. + return 6; + } + + // Each port of this function behaves slightly differently due to subtle + // differences in each language's definition of things like 'whitespace'. + // Since this function's purpose is largely cosmetic, the choice has been + // made to use each language's native features rather than force total + // conformity. + let char1 = one.chars().next_back().unwrap(); + let char2 = two.chars().next().unwrap(); + let non_alphanumeric1 = !char1.is_ascii_alphanumeric(); + let non_alphanumeric2 = !char2.is_ascii_alphanumeric(); + let whitespace1 = non_alphanumeric1 && char1.is_ascii_whitespace(); + let whitespace2 = non_alphanumeric2 && char2.is_ascii_whitespace(); + let line_break1 = whitespace1 && char1.is_control(); + let line_break2 = whitespace2 && char2.is_control(); + let blank_line1 = line_break1 && (one.ends_with("\n\n") || one.ends_with("\n\r\n")); + let blank_line2 = line_break2 && (two.starts_with("\n\n") || two.starts_with("\r\n\r\n")); + + if blank_line1 || blank_line2 { + // Five points for blank lines. + 5 + } else if line_break1 || line_break2 { + // Four points for line breaks. + 4 + } else if non_alphanumeric1 && !whitespace1 && whitespace2 { + // Three points for end of sentences. + 3 + } else if whitespace1 || whitespace2 { + // Two points for whitespace. 
+ 2 + } else if non_alphanumeric1 || non_alphanumeric2 { + // One point for non-alphanumeric. + 1 + } else { + 0 + } +} + +// Reorder and merge like edit sections. Merge equalities. Any edit section can +// move as long as it doesn't cross an equality. +fn cleanup_merge(solution: &mut Solution) { + let diffs = &mut solution.diffs; + let common_prefix = if solution.utf8 { + common_prefix + } else { + common_prefix_bytes + }; + let common_suffix = if solution.utf8 { + common_suffix + } else { + common_suffix_bytes + }; + + loop { + if diffs.is_empty() { + return; + } + + diffs.push(Diff::Equal( + solution.text1.substring(solution.text1.len..), + solution.text2.substring(solution.text2.len..), + )); // Add a dummy entry at the end. + let mut pointer = 0; + let mut count_delete = 0; + let mut count_insert = 0; + let mut text_delete = Range::empty(); + let mut text_insert = Range::empty(); + while let Some(&this_diff) = diffs.get(pointer) { + match this_diff { + Diff::Insert(text) => { + count_insert += 1; + if text_insert.is_empty() { + text_insert = text; + } else { + text_insert.len += text.len; + } + } + Diff::Delete(text) => { + count_delete += 1; + if text_delete.is_empty() { + text_delete = text; + } else { + text_delete.len += text.len; + } + } + Diff::Equal(text, _) => { + let count_both = count_delete + count_insert; + if count_both > 1 { + let both_types = count_delete != 0 && count_insert != 0; + // Delete the offending records. + diffs.splice(pointer - count_both..pointer, None); + pointer -= count_both; + if both_types { + // Factor out any common prefix. 
+ let common_length = common_prefix(text_insert, text_delete); + if common_length != 0 { + if pointer > 0 { + match &mut diffs[pointer - 1] { + Diff::Equal(this_diff1, this_diff2) => { + this_diff1.len += common_length; + this_diff2.len += common_length; + } + _ => unreachable!( + "previous diff should have been an equality" + ), + } + } else { + diffs.insert( + pointer, + Diff::Equal( + text_delete.substring(..common_length), + text_insert.substring(..common_length), + ), + ); + pointer += 1; + } + text_insert = text_insert.substring(common_length..); + text_delete = text_delete.substring(common_length..); + } + // Factor out any common suffix. + let common_length = common_suffix(text_insert, text_delete); + if common_length != 0 { + diffs[pointer].grow_left(common_length); + text_insert.len -= common_length; + text_delete.len -= common_length; + } + } + // Insert the merged records. + if !text_delete.is_empty() { + diffs.insert(pointer, Diff::Delete(text_delete)); + pointer += 1; + } + if !text_insert.is_empty() { + diffs.insert(pointer, Diff::Insert(text_insert)); + pointer += 1; + } + } else if pointer > 0 { + if let Some(Diff::Equal(prev_equal1, prev_equal2)) = + diffs.get_mut(pointer - 1) + { + // Merge this equality with the previous one. + prev_equal1.len += text.len; + prev_equal2.len += text.len; + diffs.remove(pointer); + pointer -= 1; + } + } + count_insert = 0; + count_delete = 0; + text_delete = Range::empty(); + text_insert = Range::empty(); + } + } + pointer += 1; + } + if diffs.last().unwrap().text().is_empty() { + diffs.pop(); // Remove the dummy entry at the end. + } + + // Second pass: look for single edits surrounded on both sides by equalities + // which can be shifted sideways to eliminate an equality. + // e.g: ABAC -> ABAC + let mut changes = false; + let mut pointer = 1; + // Intentionally ignore the first and last element (don't need checking). 
+ while let Some(&next_diff) = diffs.get(pointer + 1) { + let prev_diff = diffs[pointer - 1]; + let this_diff = diffs[pointer]; + if let (Diff::Equal(prev_diff, _), Diff::Equal(next_diff, _)) = (prev_diff, next_diff) { + // This is a single edit surrounded by equalities. + if this_diff.text().ends_with(prev_diff) { + // Shift the edit over the previous equality. + diffs[pointer].shift_left(prev_diff.len); + diffs[pointer + 1].grow_left(prev_diff.len); + diffs.remove(pointer - 1); // Delete prev_diff. + changes = true; + } else if this_diff.text().starts_with(next_diff) { + // Shift the edit over the next equality. + diffs[pointer - 1].grow_right(next_diff.len); + diffs[pointer].shift_right(next_diff.len); + diffs.remove(pointer + 1); // Delete next_diff. + changes = true; + } + } + pointer += 1; + } + // If shifts were made, the diff needs reordering and another shift sweep. + if !changes { + return; + } + } +} + +impl Debug for Chunk<'_> { + fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + let (name, text) = match *self { + Chunk::Equal(text) => ("Equal", text), + Chunk::Delete(text) => ("Delete", text), + Chunk::Insert(text) => ("Insert", text), + }; + write!(formatter, "{}({:?})", name, text) + } +} + +impl Debug for Diff<'_, '_> { + fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + let (name, bytes) = match *self { + Diff::Equal(range, _) => ("Equal", bytes(range)), + Diff::Delete(range) => ("Delete", bytes(range)), + Diff::Insert(range) => ("Insert", bytes(range)), + }; + let text = String::from_utf8_lossy(bytes); + write!(formatter, "{}({:?})", name, text) + } +} + +impl<'a> From> for Chunk<'a> { + fn from(diff: Diff<'a, 'a>) -> Self { + match diff { + Diff::Equal(range, _) => Chunk::Equal(str(range)), + Diff::Delete(range) => Chunk::Delete(str(range)), + Diff::Insert(range) => Chunk::Insert(str(range)), + } + } +} diff --git a/vendor/dissimilar/src/range.rs b/vendor/dissimilar/src/range.rs new file mode 100644 index 
000000000..565a94c06 --- /dev/null +++ b/vendor/dissimilar/src/range.rs @@ -0,0 +1,148 @@ +use crate::find::find; +use std::fmt::Debug; +use std::ops::{self, RangeFrom, RangeFull, RangeTo}; +use std::str::{CharIndices, Chars}; + +#[derive(Copy, Clone)] +pub struct Range<'a> { + pub doc: &'a str, + pub offset: usize, + pub len: usize, +} + +impl<'a> Range<'a> { + pub fn empty() -> Self { + Range { + doc: "", + offset: 0, + len: 0, + } + } + + pub fn new(doc: &'a str, bounds: impl RangeBounds) -> Self { + let (offset, len) = bounds.index(doc.len()); + Range { doc, offset, len } + } + + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + pub fn substring(&self, bounds: impl RangeBounds) -> Self { + let (offset, len) = bounds.index(self.len); + Range { + doc: self.doc, + offset: self.offset + offset, + len, + } + } + + pub fn get(&self, bounds: impl RangeBounds) -> Option { + let (offset, len) = bounds.try_index(self.len)?; + Some(Range { + doc: self.doc, + offset: self.offset + offset, + len, + }) + } + + pub fn split_at(&self, mid: usize) -> (Self, Self) { + (self.substring(..mid), self.substring(mid..)) + } + + pub fn chars(&self) -> Chars<'a> { + str(*self).chars() + } + + pub fn char_indices(&self) -> CharIndices<'a> { + str(*self).char_indices() + } + + pub fn bytes(&self) -> impl Iterator + DoubleEndedIterator + ExactSizeIterator + 'a { + bytes(*self).iter().cloned() + } + + pub fn starts_with(&self, prefix: impl AsRef<[u8]>) -> bool { + bytes(*self).starts_with(prefix.as_ref()) + } + + pub fn ends_with(&self, suffix: impl AsRef<[u8]>) -> bool { + bytes(*self).ends_with(suffix.as_ref()) + } + + pub fn find(&self, needle: impl AsRef<[u8]>) -> Option { + find(bytes(*self), needle.as_ref()) + } +} + +pub fn str(range: Range) -> &str { + if cfg!(debug) + && range + .doc + .get(range.offset..range.offset + range.len) + .is_none() + { + eprintln!( + "doc={:?} offset={} len={}", + range.doc, range.offset, range.len + ); + } + &range.doc[range.offset..range.offset 
+ range.len] +} + +pub fn bytes(range: Range) -> &[u8] { + &range.doc.as_bytes()[range.offset..range.offset + range.len] +} + +impl AsRef<[u8]> for Range<'_> { + fn as_ref(&self) -> &[u8] { + bytes(*self) + } +} + +pub trait RangeBounds: Sized + Clone + Debug { + // Returns (offset, len). + fn try_index(self, len: usize) -> Option<(usize, usize)>; + fn index(self, len: usize) -> (usize, usize) { + match self.clone().try_index(len) { + Some(range) => range, + None => panic!("index out of range, index={:?}, len={}", self, len), + } + } +} + +impl RangeBounds for ops::Range { + fn try_index(self, len: usize) -> Option<(usize, usize)> { + if self.start <= self.end && self.end <= len { + Some((self.start, self.end - self.start)) + } else { + None + } + } +} + +impl RangeBounds for RangeFrom { + fn try_index(self, len: usize) -> Option<(usize, usize)> { + if self.start <= len { + Some((self.start, len - self.start)) + } else { + None + } + } +} + +impl RangeBounds for RangeTo { + fn try_index(self, len: usize) -> Option<(usize, usize)> { + if self.end <= len { + Some((0, self.end)) + } else { + None + } + } +} + +impl RangeBounds for RangeFull { + fn try_index(self, len: usize) -> Option<(usize, usize)> { + Some((0, len)) + } +} diff --git a/vendor/dissimilar/src/tests.rs b/vendor/dissimilar/src/tests.rs new file mode 100644 index 000000000..450d7f7e4 --- /dev/null +++ b/vendor/dissimilar/src/tests.rs @@ -0,0 +1,580 @@ +use super::*; + +macro_rules! diff_list { + () => { + Solution { + text1: Range::empty(), + text2: Range::empty(), + diffs: Vec::new(), + utf8: true, + } + }; + ($($kind:ident($text:literal)),+ $(,)?) => {{ + macro_rules! text1 { + (Insert, $s:literal) => { "" }; + (Delete, $s:literal) => { $s }; + (Equal, $s:literal) => { $s }; + } + macro_rules! 
text2 { + (Insert, $s:literal) => { $s }; + (Delete, $s:literal) => { "" }; + (Equal, $s:literal) => { $s }; + } + let text1 = concat!($(text1!($kind, $text)),*); + let text2 = concat!($(text2!($kind, $text)),*); + let (_i, _j) = (&mut 0, &mut 0); + macro_rules! range { + (Insert, $s:literal) => { + Diff::Insert(range(text2, _j, $s)) + }; + (Delete, $s:literal) => { + Diff::Delete(range(text1, _i, $s)) + }; + (Equal, $s:literal) => { + Diff::Equal(range(text1, _i, $s), range(text2, _j, $s)) + }; + } + Solution { + text1: Range::new(text1, ..), + text2: Range::new(text2, ..), + diffs: vec![$(range!($kind, $text)),*], + utf8: true, + } + }}; +} + +fn range<'a>(doc: &'a str, offset: &mut usize, text: &str) -> Range<'a> { + let range = Range { + doc, + offset: *offset, + len: text.len(), + }; + *offset += text.len(); + range +} + +macro_rules! assert_diffs { + ([$($kind:ident($text:literal)),* $(,)?], $solution:ident, $msg:expr $(,)?) => { + let expected = &[$(Chunk::$kind($text)),*]; + assert!( + same_diffs(expected, &$solution.diffs), + concat!($msg, "\nexpected={:#?}\nactual={:#?}"), + expected, $solution.diffs, + ); + }; +} + +fn same_diffs(expected: &[Chunk], actual: &[Diff]) -> bool { + expected.len() == actual.len() + && expected.iter().zip(actual).all(|pair| match pair { + (Chunk::Insert(expected), Diff::Insert(actual)) => *expected == str(*actual), + (Chunk::Delete(expected), Diff::Delete(actual)) => *expected == str(*actual), + (Chunk::Equal(expected), Diff::Equal(actual1, actual2)) => { + *expected == str(*actual1) && *expected == str(*actual2) + } + (_, _) => false, + }) +} + +#[test] +fn test_common_prefix() { + let text1 = Range::new("abc", ..); + let text2 = Range::new("xyz", ..); + assert_eq!(0, common_prefix_bytes(text1, text2), "Null case"); + + let text1 = Range::new("1234abcdef", ..); + let text2 = Range::new("1234xyz", ..); + assert_eq!(4, common_prefix_bytes(text1, text2), "Non-null case"); + + let text1 = Range::new("1234", ..); + let text2 = 
Range::new("1234xyz", ..); + assert_eq!(4, common_prefix_bytes(text1, text2), "Whole case"); +} + +#[test] +fn test_common_suffix() { + let text1 = Range::new("abc", ..); + let text2 = Range::new("xyz", ..); + assert_eq!(0, common_suffix(text1, text2), "Null case"); + assert_eq!(0, common_suffix_bytes(text1, text2), "Null case"); + + let text1 = Range::new("abcdef1234", ..); + let text2 = Range::new("xyz1234", ..); + assert_eq!(4, common_suffix(text1, text2), "Non-null case"); + assert_eq!(4, common_suffix_bytes(text1, text2), "Non-null case"); + + let text1 = Range::new("1234", ..); + let text2 = Range::new("xyz1234", ..); + assert_eq!(4, common_suffix(text1, text2), "Whole case"); + assert_eq!(4, common_suffix_bytes(text1, text2), "Whole case"); +} + +#[test] +fn test_common_overlap() { + let text1 = Range::empty(); + let text2 = Range::new("abcd", ..); + assert_eq!(0, common_overlap(text1, text2), "Null case"); + + let text1 = Range::new("abc", ..); + let text2 = Range::new("abcd", ..); + assert_eq!(3, common_overlap(text1, text2), "Whole case"); + + let text1 = Range::new("123456", ..); + let text2 = Range::new("abcd", ..); + assert_eq!(0, common_overlap(text1, text2), "No overlap"); + + let text1 = Range::new("123456xxx", ..); + let text2 = Range::new("xxxabcd", ..); + assert_eq!(3, common_overlap(text1, text2), "Overlap"); + + // Some overly clever languages (C#) may treat ligatures as equal to their + // component letters. E.g. 
U+FB01 == 'fi' + let text1 = Range::new("fi", ..); + let text2 = Range::new("\u{fb01}i", ..); + assert_eq!(0, common_overlap(text1, text2), "Unicode"); +} + +#[test] +fn test_cleanup_merge() { + let mut solution = diff_list![]; + cleanup_merge(&mut solution); + assert_diffs!([], solution, "Null case"); + + let mut solution = diff_list![Equal("a"), Delete("b"), Insert("c")]; + cleanup_merge(&mut solution); + assert_diffs!( + [Equal("a"), Delete("b"), Insert("c")], + solution, + "No change case", + ); + + let mut solution = diff_list![Equal("a"), Equal("b"), Equal("c")]; + cleanup_merge(&mut solution); + assert_diffs!([Equal("abc")], solution, "Merge equalities"); + + let mut solution = diff_list![Delete("a"), Delete("b"), Delete("c")]; + cleanup_merge(&mut solution); + assert_diffs!([Delete("abc")], solution, "Merge deletions"); + + let mut solution = diff_list![Insert("a"), Insert("b"), Insert("c")]; + cleanup_merge(&mut solution); + assert_diffs!([Insert("abc")], solution, "Merge insertions"); + + let mut solution = diff_list![ + Delete("a"), + Insert("b"), + Delete("c"), + Insert("d"), + Equal("e"), + Equal("f"), + ]; + cleanup_merge(&mut solution); + assert_diffs!( + [Delete("ac"), Insert("bd"), Equal("ef")], + solution, + "Merge interweave", + ); + + let mut solution = diff_list![Delete("a"), Insert("abc"), Delete("dc")]; + cleanup_merge(&mut solution); + assert_diffs!( + [Equal("a"), Delete("d"), Insert("b"), Equal("c")], + solution, + "Prefix and suffix detection", + ); + + let mut solution = diff_list![ + Equal("x"), + Delete("a"), + Insert("abc"), + Delete("dc"), + Equal("y"), + ]; + cleanup_merge(&mut solution); + assert_diffs!( + [Equal("xa"), Delete("d"), Insert("b"), Equal("cy")], + solution, + "Prefix and suffix detection with equalities", + ); + + let mut solution = diff_list![Equal("a"), Insert("ba"), Equal("c")]; + cleanup_merge(&mut solution); + assert_diffs!([Insert("ab"), Equal("ac")], solution, "Slide edit left"); + + let mut solution = 
diff_list![Equal("c"), Insert("ab"), Equal("a")]; + cleanup_merge(&mut solution); + assert_diffs!([Equal("ca"), Insert("ba")], solution, "Slide edit right"); + + let mut solution = diff_list![ + Equal("a"), + Delete("b"), + Equal("c"), + Delete("ac"), + Equal("x"), + ]; + cleanup_merge(&mut solution); + assert_diffs!( + [Delete("abc"), Equal("acx")], + solution, + "Slide edit left recursive", + ); + + let mut solution = diff_list![ + Equal("x"), + Delete("ca"), + Equal("c"), + Delete("b"), + Equal("a"), + ]; + cleanup_merge(&mut solution); + assert_diffs!( + [Equal("xca"), Delete("cba")], + solution, + "Slide edit right recursive", + ); + + let mut solution = diff_list![Delete("b"), Insert("ab"), Equal("c")]; + cleanup_merge(&mut solution); + assert_diffs!([Insert("a"), Equal("bc")], solution, "Empty range"); + + let mut solution = diff_list![Equal(""), Insert("a"), Equal("b")]; + cleanup_merge(&mut solution); + assert_diffs!([Insert("a"), Equal("b")], solution, "Empty equality"); +} + +#[test] +fn test_cleanup_semantic_lossless() { + let mut solution = diff_list![]; + cleanup_semantic_lossless(&mut solution); + assert_diffs!([], solution, "Null case"); + + let mut solution = diff_list![ + Equal("AAA\r\n\r\nBBB"), + Insert("\r\nDDD\r\n\r\nBBB"), + Equal("\r\nEEE"), + ]; + cleanup_semantic_lossless(&mut solution); + assert_diffs!( + [ + Equal("AAA\r\n\r\n"), + Insert("BBB\r\nDDD\r\n\r\n"), + Equal("BBB\r\nEEE"), + ], + solution, + "Blank lines", + ); + + let mut solution = diff_list![Equal("AAA\r\nBBB"), Insert(" DDD\r\nBBB"), Equal(" EEE")]; + cleanup_semantic_lossless(&mut solution); + assert_diffs!( + [Equal("AAA\r\n"), Insert("BBB DDD\r\n"), Equal("BBB EEE")], + solution, + "Line boundaries", + ); + + let mut solution = diff_list![Equal("The c"), Insert("ow and the c"), Equal("at.")]; + cleanup_semantic_lossless(&mut solution); + assert_diffs!( + [Equal("The "), Insert("cow and the "), Equal("cat.")], + solution, + "Word boundaries", + ); + + let mut solution = 
diff_list![Equal("The-c"), Insert("ow-and-the-c"), Equal("at.")]; + cleanup_semantic_lossless(&mut solution); + assert_diffs!( + [Equal("The-"), Insert("cow-and-the-"), Equal("cat.")], + solution, + "Alphanumeric boundaries", + ); + + let mut solution = diff_list![Equal("a"), Delete("a"), Equal("ax")]; + cleanup_semantic_lossless(&mut solution); + assert_diffs!([Delete("a"), Equal("aax")], solution, "Hitting the start"); + + let mut solution = diff_list![Equal("xa"), Delete("a"), Equal("a")]; + cleanup_semantic_lossless(&mut solution); + assert_diffs!([Equal("xaa"), Delete("a")], solution, "Hitting the end"); + + let mut solution = diff_list![Equal("The xxx. The "), Insert("zzz. The "), Equal("yyy.")]; + cleanup_semantic_lossless(&mut solution); + assert_diffs!( + [Equal("The xxx."), Insert(" The zzz."), Equal(" The yyy.")], + solution, + "Sentence boundaries", + ); +} + +#[test] +fn test_cleanup_semantic() { + let mut solution = diff_list![]; + cleanup_semantic(&mut solution); + assert_diffs!([], solution, "Null case"); + + let mut solution = diff_list![Delete("ab"), Insert("cd"), Equal("12"), Delete("e")]; + cleanup_semantic(&mut solution); + assert_diffs!( + [Delete("ab"), Insert("cd"), Equal("12"), Delete("e")], + solution, + "No elimination #1", + ); + + let mut solution = diff_list![Delete("abc"), Insert("ABC"), Equal("1234"), Delete("wxyz")]; + cleanup_semantic(&mut solution); + assert_diffs!( + [Delete("abc"), Insert("ABC"), Equal("1234"), Delete("wxyz")], + solution, + "No elimination #2", + ); + + let mut solution = diff_list![Delete("a"), Equal("b"), Delete("c")]; + cleanup_semantic(&mut solution); + assert_diffs!([Delete("abc"), Insert("b")], solution, "Simple elimination",); + + let mut solution = diff_list![ + Delete("ab"), + Equal("cd"), + Delete("e"), + Equal("f"), + Insert("g"), + ]; + cleanup_semantic(&mut solution); + assert_diffs!( + [Delete("abcdef"), Insert("cdfg")], + solution, + "Backpass elimination", + ); + + let mut solution = diff_list![ 
+ Insert("1"), + Equal("A"), + Delete("B"), + Insert("2"), + Equal("_"), + Insert("1"), + Equal("A"), + Delete("B"), + Insert("2"), + ]; + cleanup_semantic(&mut solution); + assert_diffs!( + [Delete("AB_AB"), Insert("1A2_1A2")], + solution, + "Multiple elimination", + ); + + let mut solution = diff_list![Equal("The c"), Delete("ow and the c"), Equal("at.")]; + cleanup_semantic(&mut solution); + assert_diffs!( + [Equal("The "), Delete("cow and the "), Equal("cat.")], + solution, + "Word boundaries", + ); + + let mut solution = diff_list![Delete("abcxx"), Insert("xxdef")]; + cleanup_semantic(&mut solution); + assert_diffs!( + [Delete("abcxx"), Insert("xxdef")], + solution, + "No overlap elimination", + ); + + let mut solution = diff_list![Delete("abcxxx"), Insert("xxxdef")]; + cleanup_semantic(&mut solution); + assert_diffs!( + [Delete("abc"), Equal("xxx"), Insert("def")], + solution, + "Overlap elimination", + ); + + let mut solution = diff_list![Delete("xxxabc"), Insert("defxxx")]; + cleanup_semantic(&mut solution); + assert_diffs!( + [Insert("def"), Equal("xxx"), Delete("abc")], + solution, + "Reverse overlap elimination", + ); + + let mut solution = diff_list![ + Delete("abcd1212"), + Insert("1212efghi"), + Equal("----"), + Delete("A3"), + Insert("3BC"), + ]; + cleanup_semantic(&mut solution); + assert_diffs!( + [ + Delete("abcd"), + Equal("1212"), + Insert("efghi"), + Equal("----"), + Delete("A"), + Equal("3"), + Insert("BC"), + ], + solution, + "Two overlap eliminations", + ); +} + +#[test] +fn test_bisect() { + let text1 = Range::new("cat", ..); + let text2 = Range::new("map", ..); + let solution = Solution { + text1, + text2, + diffs: bisect(text1, text2), + utf8: false, + }; + assert_diffs!( + [ + Delete("c"), + Insert("m"), + Equal("a"), + Delete("t"), + Insert("p"), + ], + solution, + "Normal", + ); +} + +#[test] +fn test_main() { + let solution = main(Range::empty(), Range::empty()); + assert_diffs!([], solution, "Null case"); + + let solution = 
main(Range::new("abc", ..), Range::new("abc", ..)); + assert_diffs!([Equal("abc")], solution, "Equality"); + + let solution = main(Range::new("abc", ..), Range::new("ab123c", ..)); + assert_diffs!( + [Equal("ab"), Insert("123"), Equal("c")], + solution, + "Simple insertion", + ); + + let solution = main(Range::new("a123bc", ..), Range::new("abc", ..)); + assert_diffs!( + [Equal("a"), Delete("123"), Equal("bc")], + solution, + "Simple deletion", + ); + + let solution = main(Range::new("abc", ..), Range::new("a123b456c", ..)); + assert_diffs!( + [ + Equal("a"), + Insert("123"), + Equal("b"), + Insert("456"), + Equal("c"), + ], + solution, + "Two insertions", + ); + + let solution = main(Range::new("a123b456c", ..), Range::new("abc", ..)); + assert_diffs!( + [ + Equal("a"), + Delete("123"), + Equal("b"), + Delete("456"), + Equal("c"), + ], + solution, + "Two deletions", + ); + + let solution = main(Range::new("a", ..), Range::new("b", ..)); + assert_diffs!([Delete("a"), Insert("b")], solution, "Simple case #1"); + + let solution = main( + Range::new("Apples are a fruit.", ..), + Range::new("Bananas are also fruit.", ..), + ); + assert_diffs!( + [ + Delete("Apple"), + Insert("Banana"), + Equal("s are a"), + Insert("lso"), + Equal(" fruit."), + ], + solution, + "Simple case #2", + ); + + let solution = main(Range::new("ax\t", ..), Range::new("\u{0680}x\000", ..)); + assert_diffs!( + [ + Delete("a"), + Insert("\u{0680}"), + Equal("x"), + Delete("\t"), + Insert("\000"), + ], + solution, + "Simple case #3", + ); + + let solution = main(Range::new("1ayb2", ..), Range::new("abxab", ..)); + assert_diffs!( + [ + Delete("1"), + Equal("a"), + Delete("y"), + Equal("b"), + Delete("2"), + Insert("xab"), + ], + solution, + "Overlap #1", + ); + + let solution = main(Range::new("abcy", ..), Range::new("xaxcxabc", ..)); + assert_diffs!( + [Insert("xaxcx"), Equal("abc"), Delete("y")], + solution, + "Overlap #2", + ); + + let solution = main( + 
Range::new("ABCDa=bcd=efghijklmnopqrsEFGHIJKLMNOefg", ..), + Range::new("a-bcd-efghijklmnopqrs", ..), + ); + assert_diffs!( + [ + Delete("ABCD"), + Equal("a"), + Delete("="), + Insert("-"), + Equal("bcd"), + Delete("="), + Insert("-"), + Equal("efghijklmnopqrs"), + Delete("EFGHIJKLMNOefg"), + ], + solution, + "Overlap #3", + ); + + let solution = main( + Range::new("a [[Pennsylvania]] and [[New", ..), + Range::new(" and [[Pennsylvania]]", ..), + ); + assert_diffs!( + [ + Insert(" "), + Equal("a"), + Insert("nd"), + Equal(" [[Pennsylvania]]"), + Delete(" and [[New"), + ], + solution, + "Large equality", + ); +} -- cgit v1.2.3