use std::collections::hash_map::Entry; use std::collections::HashMap; use std::fmt::Debug; use std::hash::{Hash, Hasher}; use std::ops::{Add, Index, Range}; /// Utility function to check if a range is empty that works on older rust versions #[inline(always)] #[allow(clippy::neg_cmp_op_on_partial_ord)] pub fn is_empty_range>(range: &Range) -> bool { !(range.start < range.end) } /// Represents an item in the vector returned by [`unique`]. /// /// It compares like the underlying item does it was created from but /// carries the index it was originally created from. pub struct UniqueItem<'a, Idx: ?Sized> { lookup: &'a Idx, index: usize, } impl<'a, Idx: ?Sized> UniqueItem<'a, Idx> where Idx: Index, { /// Returns the value. #[inline(always)] pub fn value(&self) -> &Idx::Output { &self.lookup[self.index] } /// Returns the original index. #[inline(always)] pub fn original_index(&self) -> usize { self.index } } impl<'a, Idx: Index + 'a> Debug for UniqueItem<'a, Idx> where Idx::Output: Debug, { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { f.debug_struct("UniqueItem") .field("value", &self.value()) .field("original_index", &self.original_index()) .finish() } } impl<'a, 'b, A, B> PartialEq> for UniqueItem<'b, B> where A: Index + 'b + ?Sized, B: Index + 'b + ?Sized, B::Output: PartialEq, { #[inline(always)] fn eq(&self, other: &UniqueItem<'a, A>) -> bool { self.value() == other.value() } } /// Returns only unique items in the sequence as vector. /// /// Each item is wrapped in a [`UniqueItem`] so that both the value and the /// index can be extracted. pub fn unique(lookup: &Idx, range: Range) -> Vec> where Idx: Index + ?Sized, Idx::Output: Hash + Eq, { let mut by_item = HashMap::new(); for index in range { match by_item.entry(&lookup[index]) { Entry::Vacant(entry) => { entry.insert(Some(index)); } Entry::Occupied(mut entry) => { let entry = entry.get_mut(); if entry.is_some() { *entry = None } } } } let mut rv = by_item .into_iter() .filter_map(|(_, x)| x) .map(|index| UniqueItem { lookup, index }) .collect::>(); rv.sort_by_key(|a| a.original_index()); rv } /// Given two lookups and ranges calculates the length of the common prefix. pub fn common_prefix_len( old: &Old, old_range: Range, new: &New, new_range: Range, ) -> usize where Old: Index + ?Sized, New: Index + ?Sized, New::Output: PartialEq, { if is_empty_range(&old_range) || is_empty_range(&new_range) { return 0; } new_range .zip(old_range) .take_while( #[inline(always)] |x| new[x.0] == old[x.1], ) .count() } /// Given two lookups and ranges calculates the length of common suffix. pub fn common_suffix_len( old: &Old, old_range: Range, new: &New, new_range: Range, ) -> usize where Old: Index + ?Sized, New: Index + ?Sized, New::Output: PartialEq, { if is_empty_range(&old_range) || is_empty_range(&new_range) { return 0; } new_range .rev() .zip(old_range.rev()) .take_while( #[inline(always)] |x| new[x.0] == old[x.1], ) .count() } struct OffsetLookup { offset: usize, vec: Vec, } impl Index for OffsetLookup { type Output = Int; #[inline(always)] fn index(&self, index: usize) -> &Self::Output { &self.vec[index - self.offset] } } /// A utility struct to convert distinct items to unique integers. /// /// This can be helpful on larger inputs to speed up the comparisons /// performed by doing a first pass where the data set gets reduced /// to (small) integers. /// /// The idea is that instead of passing two sequences to a diffling algorithm /// you first pass it via [`IdentifyDistinct`]: /// /// ```rust /// use similar::capture_diff; /// use similar::algorithms::{Algorithm, IdentifyDistinct}; /// /// let old = &["foo", "bar", "baz"][..]; /// let new = &["foo", "blah", "baz"][..]; /// let h = IdentifyDistinct::::new(old, 0..old.len(), new, 0..new.len()); /// let ops = capture_diff( /// Algorithm::Myers, /// h.old_lookup(), /// h.old_range(), /// h.new_lookup(), /// h.new_range(), /// ); /// ``` /// /// The indexes are the same as with the passed source ranges. pub struct IdentifyDistinct { old: OffsetLookup, new: OffsetLookup, } impl IdentifyDistinct where Int: Add + From + Default + Copy, { /// Creates an int hasher for two sequences. pub fn new( old: &Old, old_range: Range, new: &New, new_range: Range, ) -> Self where Old: Index + ?Sized, Old::Output: Eq + Hash, New: Index + ?Sized, New::Output: Eq + Hash + PartialEq, { enum Key<'old, 'new, Old: ?Sized, New: ?Sized> { Old(&'old Old), New(&'new New), } impl<'old, 'new, Old, New> Hash for Key<'old, 'new, Old, New> where Old: Hash + ?Sized, New: Hash + ?Sized, { fn hash(&self, state: &mut H) { match *self { Key::Old(val) => val.hash(state), Key::New(val) => val.hash(state), } } } impl<'old, 'new, Old, New> PartialEq for Key<'old, 'new, Old, New> where Old: Eq + ?Sized, New: Eq + PartialEq + ?Sized, { #[inline(always)] fn eq(&self, other: &Self) -> bool { match (self, other) { (Key::Old(a), Key::Old(b)) => a == b, (Key::New(a), Key::New(b)) => a == b, (Key::Old(a), Key::New(b)) | (Key::New(b), Key::Old(a)) => b == a, } } } impl<'old, 'new, Old, New> Eq for Key<'old, 'new, Old, New> where Old: Eq + ?Sized, New: Eq + PartialEq + ?Sized, { } let mut map = HashMap::new(); let mut old_seq = Vec::new(); let mut new_seq = Vec::new(); let mut next_id = Int::default(); let step = Int::from(1); let old_start = old_range.start; let new_start = new_range.start; for idx in old_range { let item = Key::Old(&old[idx]); let id = match map.entry(item) { Entry::Occupied(o) => *o.get(), Entry::Vacant(v) => { let id = next_id; next_id = next_id + step; *v.insert(id) } }; old_seq.push(id); } for idx in new_range { let item = Key::New(&new[idx]); let id = match map.entry(item) { Entry::Occupied(o) => *o.get(), Entry::Vacant(v) => { let id = next_id; next_id = next_id + step; *v.insert(id) } }; new_seq.push(id); } IdentifyDistinct { old: OffsetLookup { offset: old_start, vec: old_seq, }, new: OffsetLookup { offset: new_start, vec: new_seq, }, } } /// Returns a lookup for the old side. pub fn old_lookup(&self) -> &impl Index { &self.old } /// Returns a lookup for the new side. pub fn new_lookup(&self) -> &impl Index { &self.new } /// Convenience method to get back the old range. pub fn old_range(&self) -> Range { self.old.offset..self.old.offset + self.old.vec.len() } /// Convenience method to get back the new range. pub fn new_range(&self) -> Range { self.new.offset..self.new.offset + self.new.vec.len() } } #[test] fn test_unique() { let u = unique(&vec!['a', 'b', 'c', 'd', 'd', 'b'], 0..6) .into_iter() .map(|x| (*x.value(), x.original_index())) .collect::>(); assert_eq!(u, vec![('a', 0), ('c', 2)]); } #[test] fn test_int_hasher() { let ih = IdentifyDistinct::::new( &["", "foo", "bar", "baz"][..], 1..4, &["", "foo", "blah", "baz"][..], 1..4, ); assert_eq!(ih.old_lookup()[1], 0); assert_eq!(ih.old_lookup()[2], 1); assert_eq!(ih.old_lookup()[3], 2); assert_eq!(ih.new_lookup()[1], 0); assert_eq!(ih.new_lookup()[2], 3); assert_eq!(ih.new_lookup()[3], 2); assert_eq!(ih.old_range(), 1..4); assert_eq!(ih.new_range(), 1..4); } #[test] fn test_common_prefix_len() { assert_eq!( common_prefix_len("".as_bytes(), 0..0, "".as_bytes(), 0..0), 0 ); assert_eq!( common_prefix_len("foobarbaz".as_bytes(), 0..9, "foobarblah".as_bytes(), 0..10), 7 ); assert_eq!( common_prefix_len("foobarbaz".as_bytes(), 0..9, "blablabla".as_bytes(), 0..9), 0 ); assert_eq!( common_prefix_len("foobarbaz".as_bytes(), 3..9, "foobarblah".as_bytes(), 3..10), 4 ); } #[test] fn test_common_suffix_len() { assert_eq!( common_suffix_len("".as_bytes(), 0..0, "".as_bytes(), 0..0), 0 ); assert_eq!( common_suffix_len("1234".as_bytes(), 0..4, "X0001234".as_bytes(), 0..8), 4 ); assert_eq!( common_suffix_len("1234".as_bytes(), 0..4, "Xxxx".as_bytes(), 0..4), 0 ); assert_eq!( common_suffix_len("1234".as_bytes(), 2..4, "01234".as_bytes(), 2..5), 2 ); }