diff options
Diffstat (limited to 'vendor/similar/src/text/abstraction.rs')
-rw-r--r-- | vendor/similar/src/text/abstraction.rs | 450 |
1 file changed, 450 insertions, 0 deletions
diff --git a/vendor/similar/src/text/abstraction.rs b/vendor/similar/src/text/abstraction.rs new file mode 100644 index 0000000..99678ff --- /dev/null +++ b/vendor/similar/src/text/abstraction.rs @@ -0,0 +1,450 @@ +use std::borrow::Cow; +use std::hash::Hash; +use std::ops::Range; + +/// Reference to a [`DiffableStr`]. +/// +/// This type exists because while the library only really provides ways to +/// work with `&str` and `&[u8]` there are types that deref into those string +/// slices such as `String` and `Vec<u8>`. +/// +/// This trait is used in the library whenever it's nice to be able to pass +/// strings of different types in. +/// +/// Requires the `text` feature. +pub trait DiffableStrRef { + /// The type of the resolved [`DiffableStr`]. + type Output: DiffableStr + ?Sized; + + /// Resolves the reference. + fn as_diffable_str(&self) -> &Self::Output; +} + +impl<T: DiffableStr + ?Sized> DiffableStrRef for T { + type Output = T; + + fn as_diffable_str(&self) -> &T { + self + } +} + +impl DiffableStrRef for String { + type Output = str; + + fn as_diffable_str(&self) -> &str { + self.as_str() + } +} + +impl<'a, T: DiffableStr + ?Sized> DiffableStrRef for Cow<'a, T> { + type Output = T; + + fn as_diffable_str(&self) -> &T { + self + } +} + +/// All supported diffable strings. +/// +/// The text module can work with different types of strings depending +/// on how the crate is compiled. Out of the box `&str` is always supported +/// but with the `bytes` feature one can also work with `[u8]` slices for +/// as long as they are ASCII compatible. +/// +/// Requires the `text` feature. +pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned { + /// Splits the value into newlines with newlines attached. + fn tokenize_lines(&self) -> Vec<&Self>; + + /// Splits the value into newlines with newlines separated. + fn tokenize_lines_and_newlines(&self) -> Vec<&Self>; + + /// Tokenizes into words. 
+ fn tokenize_words(&self) -> Vec<&Self>; + + /// Tokenizes the input into characters. + fn tokenize_chars(&self) -> Vec<&Self>; + + /// Tokenizes into unicode words. + #[cfg(feature = "unicode")] + fn tokenize_unicode_words(&self) -> Vec<&Self>; + + /// Tokenizes into unicode graphemes. + #[cfg(feature = "unicode")] + fn tokenize_graphemes(&self) -> Vec<&Self>; + + /// Decodes the string (potentially) lossy. + fn as_str(&self) -> Option<&str>; + + /// Decodes the string (potentially) lossy. + fn to_string_lossy(&self) -> Cow<'_, str>; + + /// Checks if the string ends in a newline. + fn ends_with_newline(&self) -> bool; + + /// The length of the string. + fn len(&self) -> usize; + + /// Slices the string. + fn slice(&self, rng: Range<usize>) -> &Self; + + /// Returns the string as slice of raw bytes. + fn as_bytes(&self) -> &[u8]; + + /// Checks if the string is empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +impl DiffableStr for str { + fn tokenize_lines(&self) -> Vec<&Self> { + let mut iter = self.char_indices().peekable(); + let mut last_pos = 0; + let mut lines = vec![]; + + while let Some((idx, c)) = iter.next() { + if c == '\r' { + if iter.peek().map_or(false, |x| x.1 == '\n') { + lines.push(&self[last_pos..=idx + 1]); + iter.next(); + last_pos = idx + 2; + } else { + lines.push(&self[last_pos..=idx]); + last_pos = idx + 1; + } + } else if c == '\n' { + lines.push(&self[last_pos..=idx]); + last_pos = idx + 1; + } + } + + if last_pos < self.len() { + lines.push(&self[last_pos..]); + } + + lines + } + + fn tokenize_lines_and_newlines(&self) -> Vec<&Self> { + let mut rv = vec![]; + let mut iter = self.char_indices().peekable(); + + while let Some((idx, c)) = iter.next() { + let is_newline = c == '\r' || c == '\n'; + let start = idx; + let mut end = idx + c.len_utf8(); + while let Some(&(_, next_char)) = iter.peek() { + if (next_char == '\r' || next_char == '\n') != is_newline { + break; + } + iter.next(); + end += next_char.len_utf8(); + } + 
rv.push(&self[start..end]); + } + + rv + } + + fn tokenize_words(&self) -> Vec<&Self> { + let mut iter = self.char_indices().peekable(); + let mut rv = vec![]; + + while let Some((idx, c)) = iter.next() { + let is_whitespace = c.is_whitespace(); + let start = idx; + let mut end = idx + c.len_utf8(); + while let Some(&(_, next_char)) = iter.peek() { + if next_char.is_whitespace() != is_whitespace { + break; + } + iter.next(); + end += next_char.len_utf8(); + } + rv.push(&self[start..end]); + } + + rv + } + + fn tokenize_chars(&self) -> Vec<&Self> { + self.char_indices() + .map(move |(i, c)| &self[i..i + c.len_utf8()]) + .collect() + } + + #[cfg(feature = "unicode")] + fn tokenize_unicode_words(&self) -> Vec<&Self> { + unicode_segmentation::UnicodeSegmentation::split_word_bounds(self).collect() + } + + #[cfg(feature = "unicode")] + fn tokenize_graphemes(&self) -> Vec<&Self> { + unicode_segmentation::UnicodeSegmentation::graphemes(self, true).collect() + } + + fn as_str(&self) -> Option<&str> { + Some(self) + } + + fn to_string_lossy(&self) -> Cow<'_, str> { + Cow::Borrowed(self) + } + + fn ends_with_newline(&self) -> bool { + self.ends_with(&['\r', '\n'][..]) + } + + fn len(&self) -> usize { + str::len(self) + } + + fn slice(&self, rng: Range<usize>) -> &Self { + &self[rng] + } + + fn as_bytes(&self) -> &[u8] { + str::as_bytes(self) + } +} + +#[cfg(feature = "bytes")] +mod bytes_support { + use super::*; + + use bstr::ByteSlice; + + impl DiffableStrRef for Vec<u8> { + type Output = [u8]; + + fn as_diffable_str(&self) -> &[u8] { + self.as_slice() + } + } + + /// Allows viewing ASCII compatible byte slices as strings. + /// + /// Requires the `bytes` feature. 
+ impl DiffableStr for [u8] { + fn tokenize_lines(&self) -> Vec<&Self> { + let mut iter = self.char_indices().peekable(); + let mut last_pos = 0; + let mut lines = vec![]; + + while let Some((_, end, c)) = iter.next() { + if c == '\r' { + if iter.peek().map_or(false, |x| x.2 == '\n') { + lines.push(&self[last_pos..end + 1]); + iter.next(); + last_pos = end + 1; + } else { + lines.push(&self[last_pos..end]); + last_pos = end; + } + } else if c == '\n' { + lines.push(&self[last_pos..end]); + last_pos = end; + } + } + + if last_pos < self.len() { + lines.push(&self[last_pos..]); + } + + lines + } + + fn tokenize_lines_and_newlines(&self) -> Vec<&Self> { + let mut rv = vec![]; + let mut iter = self.char_indices().peekable(); + + while let Some((start, mut end, c)) = iter.next() { + let is_newline = c == '\r' || c == '\n'; + while let Some(&(_, new_end, next_char)) = iter.peek() { + if (next_char == '\r' || next_char == '\n') != is_newline { + break; + } + iter.next(); + end = new_end; + } + rv.push(&self[start..end]); + } + + rv + } + + fn tokenize_words(&self) -> Vec<&Self> { + let mut iter = self.char_indices().peekable(); + let mut rv = vec![]; + + while let Some((start, mut end, c)) = iter.next() { + let is_whitespace = c.is_whitespace(); + while let Some(&(_, new_end, next_char)) = iter.peek() { + if next_char.is_whitespace() != is_whitespace { + break; + } + iter.next(); + end = new_end; + } + rv.push(&self[start..end]); + } + + rv + } + + #[cfg(feature = "unicode")] + fn tokenize_unicode_words(&self) -> Vec<&Self> { + self.words_with_breaks().map(|x| x.as_bytes()).collect() + } + + #[cfg(feature = "unicode")] + fn tokenize_graphemes(&self) -> Vec<&Self> { + self.graphemes().map(|x| x.as_bytes()).collect() + } + + fn tokenize_chars(&self) -> Vec<&Self> { + self.char_indices() + .map(move |(start, end, _)| &self[start..end]) + .collect() + } + + fn as_str(&self) -> Option<&str> { + std::str::from_utf8(self).ok() + } + + fn to_string_lossy(&self) -> Cow<'_, str> { 
+ String::from_utf8_lossy(self) + } + + fn ends_with_newline(&self) -> bool { + if let Some(b'\r') | Some(b'\n') = self.last_byte() { + true + } else { + false + } + } + + fn len(&self) -> usize { + <[u8]>::len(self) + } + + fn slice(&self, rng: Range<usize>) -> &Self { + &self[rng] + } + + fn as_bytes(&self) -> &[u8] { + self + } + } +} + +#[test] +fn test_split_lines() { + assert_eq!( + DiffableStr::tokenize_lines("first\nsecond\rthird\r\nfourth\nlast"), + vec!["first\n", "second\r", "third\r\n", "fourth\n", "last"] + ); + assert_eq!(DiffableStr::tokenize_lines("\n\n"), vec!["\n", "\n"]); + assert_eq!(DiffableStr::tokenize_lines("\n"), vec!["\n"]); + assert!(DiffableStr::tokenize_lines("").is_empty()); +} + +#[test] +fn test_split_words() { + assert_eq!( + DiffableStr::tokenize_words("foo bar baz\n\n aha"), + ["foo", " ", "bar", " ", "baz", "\n\n ", "aha"] + ); +} + +#[test] +fn test_split_chars() { + assert_eq!( + DiffableStr::tokenize_chars("abcfö❄️"), + vec!["a", "b", "c", "f", "ö", "❄", "\u{fe0f}"] + ); +} + +#[test] +#[cfg(feature = "unicode")] +fn test_split_graphemes() { + assert_eq!( + DiffableStr::tokenize_graphemes("abcfö❄️"), + vec!["a", "b", "c", "f", "ö", "❄️"] + ); +} + +#[test] +#[cfg(feature = "bytes")] +fn test_split_lines_bytes() { + assert_eq!( + DiffableStr::tokenize_lines("first\nsecond\rthird\r\nfourth\nlast".as_bytes()), + vec![ + "first\n".as_bytes(), + "second\r".as_bytes(), + "third\r\n".as_bytes(), + "fourth\n".as_bytes(), + "last".as_bytes() + ] + ); + assert_eq!( + DiffableStr::tokenize_lines("\n\n".as_bytes()), + vec!["\n".as_bytes(), "\n".as_bytes()] + ); + assert_eq!( + DiffableStr::tokenize_lines("\n".as_bytes()), + vec!["\n".as_bytes()] + ); + assert!(DiffableStr::tokenize_lines("".as_bytes()).is_empty()); +} + +#[test] +#[cfg(feature = "bytes")] +fn test_split_words_bytes() { + assert_eq!( + DiffableStr::tokenize_words("foo bar baz\n\n aha".as_bytes()), + [ + &b"foo"[..], + &b" "[..], + &b"bar"[..], + &b" "[..], + &b"baz"[..], + 
&b"\n\n "[..], + &b"aha"[..] + ] + ); +} + +#[test] +#[cfg(feature = "bytes")] +fn test_split_chars_bytes() { + assert_eq!( + DiffableStr::tokenize_chars("abcfö❄️".as_bytes()), + vec![ + &b"a"[..], + &b"b"[..], + &b"c"[..], + &b"f"[..], + "ö".as_bytes(), + "❄".as_bytes(), + "\u{fe0f}".as_bytes() + ] + ); +} + +#[test] +#[cfg(all(feature = "bytes", feature = "unicode"))] +fn test_split_graphemes_bytes() { + assert_eq!( + DiffableStr::tokenize_graphemes("abcfö❄️".as_bytes()), + vec![ + &b"a"[..], + &b"b"[..], + &b"c"[..], + &b"f"[..], + "ö".as_bytes(), + "❄️".as_bytes() + ] + ); +} |