diff options
Diffstat (limited to 'vendor/imara-diff/src/lib.rs')
-rw-r--r-- | vendor/imara-diff/src/lib.rs | 267 |
1 files changed, 0 insertions, 267 deletions
diff --git a/vendor/imara-diff/src/lib.rs b/vendor/imara-diff/src/lib.rs deleted file mode 100644 index c20b750ca..000000000 --- a/vendor/imara-diff/src/lib.rs +++ /dev/null @@ -1,267 +0,0 @@ -//! Imara-diff is a solid (imara in swahili) diff library for rust. -//! Solid refers to the fact that imara-diff provides very good runtime performance even -//! in pathologic cases so that your application never appears to freeze while waiting on a diff. -//! The performance improvements are achieved using battle tested heuristics used in gnu-diff and git -//! that are known to yield fast runtime and performance. -//! -//! Imara-diff is also designed to be flexible so that it can be used with arbitrary collections and -//! not just lists and strings and even allows reusing large parts of the computation when -//! comparing the same file to multiple different files. -//! -//! Imara-diff provides two diff algorithms: -//! -//! * The linear-space variant of the well known [**myer** algorithm](http://www.xmailserver.org/diff2.pdf) -//! * The **Histogram** algorithm which variant of the patience diff algorithm. -//! -//! Myers algorithm has been enhanced with preprocessing and multiple heuristics to ensure fast runtime in pathological -//! cases to avoid quadratic time complexity and closely matches the behaviour of gnu-diff and git. -//! The Histogram algorithm was originally ported from git but has been heavily optimized. -//! The **Histogram algorithm outperforms Myers diff** by 10% - 100% across a **wide variety of workloads**. -//! -//! Imara-diffs algorithms have been benchmarked over a wide variety of real-world code. -//! For example while comparing multiple different linux kernel it performs up to 30 times better than the `similar` crate: -#![cfg_attr(doc, doc=concat!("<img width=\"600\" class=\"figure\" src=\"data:image/svg+xml;base64,", include_str!("../plots/linux_comparison.svg.base64"), "\"></img>"))] -//! -//! # Api Overview -//! -//! Imara-diff provides the [`UnifiedDiffBuilder`](crate::UnifiedDiffBuilder) for building -//! a human-redable diff similar to the output of `git diff` or `diff -u`. -//! This makes building a tool similar to gnu diff easy: -//! -//! ``` -//! use imara_diff::intern::InternedInput; -//! use imara_diff::{diff, Algorithm, UnifiedDiffBuilder}; -//! -//! let before = r#"fn foo() -> Bar { -//! let mut foo = 2; -//! foo *= 50; -//! println!("hello world") -//! }"#; -//! -//! let after = r#"// lorem ipsum -//! fn foo() -> Bar { -//! let mut foo = 2; -//! foo *= 50; -//! println!("hello world"); -//! println!("{foo}"); -//! } -//! // foo -//! "#; -//! -//! let input = InternedInput::new(before, after); -//! let diff = diff(Algorithm::Histogram, &input, UnifiedDiffBuilder::new(&input)); -//! assert_eq!( -//! diff, -//! r#"@@ -1,5 +1,8 @@ -//! +// lorem ipsum -//! fn foo() -> Bar { -//! let mut foo = 2; -//! foo *= 50; -//! - println!("hello world") -//! + println!("hello world"); -//! + println!("{foo}"); -//! } -//! +// foo -//! "# -//! ); -//! ``` -//! -//! If you want to process the diff in some way you can provide your own implementation of [`Sink`](crate::sink::Sink). -//! For closures [`Sink`](crate::sink::Sink) is already implemented, so simple [`Sink`]s can be easily added: -//! -//! ``` -//! use std::ops::Range; -//! -//! use imara_diff::intern::InternedInput; -//! use imara_diff::{diff, Algorithm, UnifiedDiffBuilder}; -//! -//! let before = r#"fn foo() -> Bar { -//! let mut foo = 2; -//! foo *= 50; -//! println!("hello world") -//! }"#; -//! -//! let after = r#"// lorem ipsum -//! fn foo() -> Bar { -//! let mut foo = 2; -//! foo *= 50; -//! println!("hello world"); -//! println!("{foo}"); -//! } -//! // foo -//! "#; -//! -//! let mut insertions = Vec::new(); -//! let mut removals = Vec::new(); -//! let mut replacements = Vec::new(); -//! -//! let input = InternedInput::new(before, after); -//! let sink = |before: Range<u32>, after: Range<u32>| { -//! let hunk_before: Vec<_> = input.before[before.start as usize..before.end as usize] -//! .iter() -//! .map(|&line| input.interner[line]) -//! .collect(); -//! let hunk_after: Vec<_> = input.after[after.start as usize..after.end as usize] -//! .iter() -//! .map(|&line| input.interner[line]) -//! .collect(); -//! if hunk_after.is_empty() { -//! removals.push(hunk_before) -//! } else if hunk_before.is_empty() { -//! insertions.push(hunk_after) -//! } else { -//! replacements.push((hunk_before, hunk_after)) -//! } -//! }; -//! let diff = diff(Algorithm::Histogram, &input, sink); -//! assert_eq!(&insertions, &[vec!["// lorem ipsum"], vec!["// foo"]]); -//! assert!(removals.is_empty()); -//! assert_eq!( -//! &replacements, -//! &[( -//! vec![" println!(\"hello world\")"], -//! vec![" println!(\"hello world\");", " println!(\"{foo}\");"] -//! )] -//! ); -//! ``` -//! -//! For `&str` and `&[u8]` imara-diff will compute a line diff by default. -//! To perform diffs of different tokenizations and collections you can implement the [`TokenSource`](crate::intern::TokenSource) trait. -//! For example the imara-diff provides an alternative tokenziser for line-diffs that includes the line terminator in the line: -//! -//! ``` -//! use imara_diff::intern::InternedInput; -//! use imara_diff::sink::Counter; -//! use imara_diff::sources::lines_with_terminator; -//! use imara_diff::{diff, Algorithm, UnifiedDiffBuilder}; -//! -//! let before = "foo"; -//! let after = "foo\n"; -//! -//! let input = InternedInput::new(before, after); -//! let changes = diff(Algorithm::Histogram, &input, Counter::default()); -//! assert_eq!(changes.insertions, 0); -//! assert_eq!(changes.removals, 0); -//! -//! let input = InternedInput::new(lines_with_terminator(before), lines_with_terminator(after)); -//! let changes = diff(Algorithm::Histogram, &input, Counter::default()); -//! assert_eq!(changes.insertions, 1); -//! assert_eq!(changes.removals, 1); -//! ``` - -use std::hash::Hash; - -#[cfg(feature = "unified_diff")] -pub use unified_diff::UnifiedDiffBuilder; - -use crate::intern::{InternedInput, Token, TokenSource}; -pub use crate::sink::Sink; -mod histogram; -pub mod intern; -mod myers; -pub mod sink; -pub mod sources; -#[cfg(feature = "unified_diff")] -mod unified_diff; -mod util; - -#[cfg(test)] -mod tests; - -/// `imara-diff` supports multiple different algorithms -/// for computing an edit sequence. -/// These algorithms have different performance and all produce different output. -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub enum Algorithm { - /// A variation of the [`patience` diff algorithm described by Bram Cohen's blog post](https://bramcohen.livejournal.com/73318.html) - /// that uses a histogram to find the least common LCS. - /// Just like the `patience` diff algorithm, this algorithm usually produces - /// more human readable output then myers algorithm. - /// However compared to the `patience` diff algorithm (which is slower then myers algorithm), - /// the Histogram algorithm performs much better. - /// - /// The implementation here was originally ported from `git` but has been significantly - /// modified to improve performance. - /// As a result it consistently **performs better then myers algorithm** (5%-100%) over - /// a wide variety of test data. For example a benchmark of diffing linux kernel commits is shown below: - #[cfg_attr(doc, doc=concat!("<img width=\"600\" class=\"figure\" src=\"data:image/svg+xml;base64,", include_str!("../plots/linux_speedup.svg.base64"), "\"></img>"))] - /// - /// For pathological subsequences that only contain highly repeating tokens (64+ occurrences) - /// the algorithm falls back on Myers algorithm (with heuristics) to avoid quadratic behavior. - /// - /// Compared to Myers algorithm, the Histogram diff algorithm is more focused on providing - /// human readable diffs instead of minimal diffs. In practice this means that the edit-sequences - /// produced by the histogram diff are often longer then those produced by Myers algorithm. - /// - /// The heuristic used by the histogram diff does not work well for inputs with small (often repeated) - /// tokens. For example **character diffs do not work well** as most (english) text is madeup of - /// a fairly small set of characters. The `Histogram` algorithm will automatically these cases and - /// fallback to Myers algorithm. However this detection has a nontrivial overhead, so - /// if its known upfront that the sort of tokens is very small `Myers` algorithm should - /// be used instead. - Histogram, - /// An implementation of the linear space variant of - /// [Myers `O((N+M)D)` algorithm](http://www.xmailserver.org/diff2.pdf). - /// The algorithm is enhanced with preprocessing that removes - /// tokens that don't occur in the other file at all. - /// Furthermore two heuristics to the middle snake search are implemented - /// that ensure reasonable runtime (mostly linear time complexity) even for large files. - /// - /// Due to the divide and conquer nature of the algorithm - /// the edit sequenced produced are still fairly small even when the middle snake - /// search is aborted by a heuristic. - /// However, the produced edit sequences are not guaranteed to be fully minimal. - /// If that property is vital to you, use the `MyersMinimal` algorithm instead. - /// - /// The implementation (including the preprocessing) are mostly - /// ported from `git` and `gnu-diff` where Myers algorithm is used - /// as the default diff algorithm. - /// Therefore the used heuristics have been heavily battle tested and - /// are known to behave well over a large variety of inputs - Myers, - /// Same as `Myers` but the early abort heuristics are disabled to guarantee - /// a minimal edit sequence. - /// This can mean significant slowdown in pathological cases. - MyersMinimal, -} - -impl Algorithm { - #[cfg(test)] - const ALL: [Self; 2] = [Algorithm::Histogram, Algorithm::Myers]; -} - -impl Default for Algorithm { - fn default() -> Self { - Algorithm::Histogram - } -} - -/// Computes an edit-script that transforms `input.before` into `input.after` using -/// the specified `algorithm` -/// The edit-script is passed to `sink.process_change` while it is produced. -pub fn diff<S: Sink, T: Eq + Hash>( - algorithm: Algorithm, - input: &InternedInput<T>, - sink: S, -) -> S::Out { - diff_with_tokens(algorithm, &input.before, &input.after, input.interner.num_tokens(), sink) -} - -/// Computes an edit-script that transforms `before` into `after` using -/// the specified `algorithm` -/// The edit-script is passed to `sink.process_change` while it is produced. -pub fn diff_with_tokens<S: Sink>( - algorithm: Algorithm, - before: &[Token], - after: &[Token], - num_tokens: u32, - sink: S, -) -> S::Out { - assert!(before.len() < i32::MAX as usize, "imara-diff only supports up to {} tokens", i32::MAX); - assert!(after.len() < i32::MAX as usize, "imara-diff only supports up to {} tokens", i32::MAX); - match algorithm { - Algorithm::Histogram => histogram::diff(before, after, num_tokens, sink), - Algorithm::Myers => myers::diff(before, after, num_tokens, sink, false), - Algorithm::MyersMinimal => myers::diff(before, after, num_tokens, sink, true), - } -} |