use std::borrow::Cow;
use std::hash::Hash;
use std::ops::Range;

/// Reference to a [`DiffableStr`].
///
/// This type exists because while the library only really provides ways to
/// work with `&str` and `&[u8]` there are types that deref into those string
/// slices such as `String` and `Vec<u8>`.
///
/// This trait is used in the library whenever it's nice to be able to pass
/// strings of different types in.
///
/// Requires the `text` feature.
pub trait DiffableStrRef {
    /// The type of the resolved [`DiffableStr`].
    type Output: DiffableStr + ?Sized;

    /// Resolves the reference.
    fn as_diffable_str(&self) -> &Self::Output;
}

// Every `DiffableStr` trivially resolves to itself.
impl<T: DiffableStr + ?Sized> DiffableStrRef for T {
    type Output = T;

    fn as_diffable_str(&self) -> &T {
        self
    }
}

// Owned strings resolve to their borrowed slice form.
impl DiffableStrRef for String {
    type Output = str;

    fn as_diffable_str(&self) -> &str {
        self.as_str()
    }
}

// A `Cow` resolves to the borrowed form regardless of which variant it holds.
impl<'a, T: DiffableStr + ?Sized> DiffableStrRef for Cow<'a, T> {
    type Output = T;

    fn as_diffable_str(&self) -> &T {
        self
    }
}

/// All supported diffable strings.
///
/// The text module can work with different types of strings depending
/// on how the crate is compiled.  Out of the box `&str` is always supported
/// but with the `bytes` feature one can also work with `[u8]` slices for
/// as long as they are ASCII compatible.
///
/// Requires the `text` feature.
pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned {
    /// Splits the value into newlines with newlines attached.
    fn tokenize_lines(&self) -> Vec<&Self>;

    /// Splits the value into newlines with newlines separated.
    fn tokenize_lines_and_newlines(&self) -> Vec<&Self>;

    /// Tokenizes into words.
    fn tokenize_words(&self) -> Vec<&Self>;

    /// Tokenizes the input into characters.
    fn tokenize_chars(&self) -> Vec<&Self>;

    /// Tokenizes into unicode words.
    #[cfg(feature = "unicode")]
    fn tokenize_unicode_words(&self) -> Vec<&Self>;

    /// Tokenizes into unicode graphemes.
    #[cfg(feature = "unicode")]
    fn tokenize_graphemes(&self) -> Vec<&Self>;

    /// Returns the value as `&str` if it's valid UTF-8, `None` otherwise.
    fn as_str(&self) -> Option<&str>;

    /// Decodes the string (potentially) lossy.
    fn to_string_lossy(&self) -> Cow<'_, str>;

    /// Checks if the string ends in a newline.
    fn ends_with_newline(&self) -> bool;

    /// The length of the string.
    fn len(&self) -> usize;

    /// Slices the string.
    fn slice(&self, rng: Range<usize>) -> &Self;

    /// Returns the string as slice of raw bytes.
    fn as_bytes(&self) -> &[u8];

    /// Checks if the string is empty.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}

impl DiffableStr for str {
    fn tokenize_lines(&self) -> Vec<&Self> {
        let mut iter = self.char_indices().peekable();
        let mut last_pos = 0;
        let mut lines = vec![];

        while let Some((idx, c)) = iter.next() {
            if c == '\r' {
                // Keep `\r\n` together as a single line terminator.
                if iter.peek().map_or(false, |x| x.1 == '\n') {
                    lines.push(&self[last_pos..=idx + 1]);
                    iter.next();
                    last_pos = idx + 2;
                } else {
                    lines.push(&self[last_pos..=idx]);
                    last_pos = idx + 1;
                }
            } else if c == '\n' {
                lines.push(&self[last_pos..=idx]);
                last_pos = idx + 1;
            }
        }

        // Trailing content without a final newline is still a line.
        if last_pos < self.len() {
            lines.push(&self[last_pos..]);
        }

        lines
    }

    fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
        let mut rv = vec![];
        let mut iter = self.char_indices().peekable();

        while let Some((idx, c)) = iter.next() {
            let is_newline = c == '\r' || c == '\n';
            let start = idx;
            let mut end = idx + c.len_utf8();
            // Extend the token while the newline-ness of characters matches.
            while let Some(&(_, next_char)) = iter.peek() {
                if (next_char == '\r' || next_char == '\n') != is_newline {
                    break;
                }
                iter.next();
                end += next_char.len_utf8();
            }
            rv.push(&self[start..end]);
        }

        rv
    }

    fn tokenize_words(&self) -> Vec<&Self> {
        let mut iter = self.char_indices().peekable();
        let mut rv = vec![];

        while let Some((idx, c)) = iter.next() {
            let is_whitespace = c.is_whitespace();
            let start = idx;
            let mut end = idx + c.len_utf8();
            // Group consecutive whitespace and non-whitespace runs.
            while let Some(&(_, next_char)) = iter.peek() {
                if next_char.is_whitespace() != is_whitespace {
                    break;
                }
                iter.next();
                end += next_char.len_utf8();
            }
            rv.push(&self[start..end]);
        }

        rv
    }

    fn tokenize_chars(&self) -> Vec<&Self> {
        self.char_indices()
            .map(move |(i, c)| &self[i..i + c.len_utf8()])
            .collect()
    }

    #[cfg(feature = "unicode")]
    fn tokenize_unicode_words(&self) -> Vec<&Self> {
        unicode_segmentation::UnicodeSegmentation::split_word_bounds(self).collect()
    }

    #[cfg(feature = "unicode")]
    fn tokenize_graphemes(&self) -> Vec<&Self> {
        unicode_segmentation::UnicodeSegmentation::graphemes(self, true).collect()
    }

    fn as_str(&self) -> Option<&str> {
        Some(self)
    }

    fn to_string_lossy(&self) -> Cow<'_, str> {
        Cow::Borrowed(self)
    }

    fn ends_with_newline(&self) -> bool {
        self.ends_with(&['\r', '\n'][..])
    }

    fn len(&self) -> usize {
        str::len(self)
    }

    fn slice(&self, rng: Range<usize>) -> &Self {
        &self[rng]
    }

    fn as_bytes(&self) -> &[u8] {
        str::as_bytes(self)
    }
}

#[cfg(feature = "bytes")]
mod bytes_support {
    use super::*;

    use bstr::ByteSlice;

    impl DiffableStrRef for Vec<u8> {
        type Output = [u8];

        fn as_diffable_str(&self) -> &[u8] {
            self.as_slice()
        }
    }

    /// Allows viewing ASCII compatible byte slices as strings.
    ///
    /// Requires the `bytes` feature.
    impl DiffableStr for [u8] {
        fn tokenize_lines(&self) -> Vec<&Self> {
            // bstr's char_indices yields (start, end, char) tuples, so ranges
            // use the byte end offsets directly instead of char widths.
            let mut iter = self.char_indices().peekable();
            let mut last_pos = 0;
            let mut lines = vec![];

            while let Some((_, end, c)) = iter.next() {
                if c == '\r' {
                    // Keep `\r\n` together as a single line terminator.
                    if iter.peek().map_or(false, |x| x.2 == '\n') {
                        lines.push(&self[last_pos..end + 1]);
                        iter.next();
                        last_pos = end + 1;
                    } else {
                        lines.push(&self[last_pos..end]);
                        last_pos = end;
                    }
                } else if c == '\n' {
                    lines.push(&self[last_pos..end]);
                    last_pos = end;
                }
            }

            // Trailing content without a final newline is still a line.
            if last_pos < self.len() {
                lines.push(&self[last_pos..]);
            }

            lines
        }

        fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
            let mut rv = vec![];
            let mut iter = self.char_indices().peekable();

            while let Some((start, mut end, c)) = iter.next() {
                let is_newline = c == '\r' || c == '\n';
                // Extend the token while the newline-ness of characters matches.
                while let Some(&(_, new_end, next_char)) = iter.peek() {
                    if (next_char == '\r' || next_char == '\n') != is_newline {
                        break;
                    }
                    iter.next();
                    end = new_end;
                }
                rv.push(&self[start..end]);
            }

            rv
        }

        fn tokenize_words(&self) -> Vec<&Self> {
            let mut iter = self.char_indices().peekable();
            let mut rv = vec![];

            while let Some((start, mut end, c)) = iter.next() {
                let is_whitespace = c.is_whitespace();
                // Group consecutive whitespace and non-whitespace runs.
                while let Some(&(_, new_end, next_char)) = iter.peek() {
                    if next_char.is_whitespace() != is_whitespace {
                        break;
                    }
                    iter.next();
                    end = new_end;
                }
                rv.push(&self[start..end]);
            }

            rv
        }

        #[cfg(feature = "unicode")]
        fn tokenize_unicode_words(&self) -> Vec<&Self> {
            self.words_with_breaks().map(|x| x.as_bytes()).collect()
        }

        #[cfg(feature = "unicode")]
        fn tokenize_graphemes(&self) -> Vec<&Self> {
            self.graphemes().map(|x| x.as_bytes()).collect()
        }

        fn tokenize_chars(&self) -> Vec<&Self> {
            self.char_indices()
                .map(move |(start, end, _)| &self[start..end])
                .collect()
        }

        fn as_str(&self) -> Option<&str> {
            std::str::from_utf8(self).ok()
        }

        fn to_string_lossy(&self) -> Cow<'_, str> {
            String::from_utf8_lossy(self)
        }

        fn ends_with_newline(&self) -> bool {
            matches!(self.last_byte(), Some(b'\r') | Some(b'\n'))
        }

        fn len(&self) -> usize {
            <[u8]>::len(self)
        }

        fn slice(&self, rng: Range<usize>) -> &Self {
            &self[rng]
        }

        fn as_bytes(&self) -> &[u8] {
            self
        }
    }
}

#[test]
fn test_split_lines() {
    assert_eq!(
        DiffableStr::tokenize_lines("first\nsecond\rthird\r\nfourth\nlast"),
        vec!["first\n", "second\r", "third\r\n", "fourth\n", "last"]
    );
    assert_eq!(DiffableStr::tokenize_lines("\n\n"), vec!["\n", "\n"]);
    assert_eq!(DiffableStr::tokenize_lines("\n"), vec!["\n"]);
    assert!(DiffableStr::tokenize_lines("").is_empty());
}

#[test]
fn test_split_words() {
    assert_eq!(
        DiffableStr::tokenize_words("foo    bar baz\n\n  aha"),
        ["foo", "    ", "bar", " ", "baz", "\n\n  ", "aha"]
    );
}

#[test]
fn test_split_chars() {
    assert_eq!(
        DiffableStr::tokenize_chars("abcfö❄️"),
        vec!["a", "b", "c", "f", "ö", "❄", "\u{fe0f}"]
    );
}

#[test]
#[cfg(feature = "unicode")]
fn test_split_graphemes() {
    assert_eq!(
        DiffableStr::tokenize_graphemes("abcfö❄️"),
        vec!["a", "b", "c", "f", "ö", "❄️"]
    );
}

#[test]
#[cfg(feature = "bytes")]
fn test_split_lines_bytes() {
    assert_eq!(
        DiffableStr::tokenize_lines("first\nsecond\rthird\r\nfourth\nlast".as_bytes()),
        vec![
            "first\n".as_bytes(),
            "second\r".as_bytes(),
            "third\r\n".as_bytes(),
            "fourth\n".as_bytes(),
            "last".as_bytes()
        ]
    );
    assert_eq!(
        DiffableStr::tokenize_lines("\n\n".as_bytes()),
        vec!["\n".as_bytes(), "\n".as_bytes()]
    );
    assert_eq!(
        DiffableStr::tokenize_lines("\n".as_bytes()),
        vec!["\n".as_bytes()]
    );
    assert!(DiffableStr::tokenize_lines("".as_bytes()).is_empty());
}

#[test]
#[cfg(feature = "bytes")]
fn test_split_words_bytes() {
    assert_eq!(
        DiffableStr::tokenize_words("foo    bar baz\n\n  aha".as_bytes()),
        [
            &b"foo"[..],
            &b"    "[..],
            &b"bar"[..],
            &b" "[..],
            &b"baz"[..],
            &b"\n\n  "[..],
            &b"aha"[..]
        ]
    );
}

#[test]
#[cfg(feature = "bytes")]
fn test_split_chars_bytes() {
    assert_eq!(
        DiffableStr::tokenize_chars("abcfö❄️".as_bytes()),
        vec![
            &b"a"[..],
            &b"b"[..],
            &b"c"[..],
            &b"f"[..],
            "ö".as_bytes(),
            "❄".as_bytes(),
            "\u{fe0f}".as_bytes()
        ]
    );
}

#[test]
#[cfg(all(feature = "bytes", feature = "unicode"))]
fn test_split_graphemes_bytes() {
    assert_eq!(
        DiffableStr::tokenize_graphemes("abcfö❄️".as_bytes()),
        vec![
            &b"a"[..],
            &b"b"[..],
            &b"c"[..],
            &b"f"[..],
            "ö".as_bytes(),
            "❄️".as_bytes()
        ]
    );
}