summary | refs | log | tree | commit | diff | stats
path: root/vendor/similar/src/text/abstraction.rs
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:47:55 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:47:55 +0000
commit: 2aadc03ef15cb5ca5cc2af8a7c08e070742f0ac4 (patch)
tree: 033cc839730fda84ff08db877037977be94e5e3a /vendor/similar/src/text/abstraction.rs
parent: Initial commit. (diff)
downloadcargo-upstream.tar.xz
cargo-upstream.zip
Adding upstream version 0.70.1+ds1. upstream/0.70.1+ds1 upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/similar/src/text/abstraction.rs')
-rw-r--r-- vendor/similar/src/text/abstraction.rs 450
1 files changed, 450 insertions, 0 deletions
diff --git a/vendor/similar/src/text/abstraction.rs b/vendor/similar/src/text/abstraction.rs
new file mode 100644
index 0000000..99678ff
--- /dev/null
+++ b/vendor/similar/src/text/abstraction.rs
@@ -0,0 +1,450 @@
+use std::borrow::Cow;
+use std::hash::Hash;
+use std::ops::Range;
+
+/// Reference to a [`DiffableStr`].
+///
+/// This trait exists because, while the library only really provides ways to
+/// work with `&str` and `&[u8]`, there are types that deref into those string
+/// slices, such as `String` and `Vec<u8>`.
+///
+/// It is used in the library wherever it is convenient to accept strings of
+/// different types.
+///
+/// Requires the `text` feature.
+pub trait DiffableStrRef {
+ /// The type of the resolved [`DiffableStr`].
+ ///
+ /// May be unsized (for example `str` or `[u8]`).
+ type Output: DiffableStr + ?Sized;
+
+ /// Resolves the reference, borrowing the underlying [`DiffableStr`] from `self`.
+ fn as_diffable_str(&self) -> &Self::Output;
+}
+
+/// Every [`DiffableStr`] acts as a reference to itself.
+impl<T> DiffableStrRef for T
+where
+    T: DiffableStr + ?Sized,
+{
+    type Output = T;
+
+    fn as_diffable_str(&self) -> &Self::Output {
+        self
+    }
+}
+
+/// An owned `String` resolves to the `str` slice it holds.
+impl DiffableStrRef for String {
+    type Output = str;
+
+    fn as_diffable_str(&self) -> &Self::Output {
+        &self[..]
+    }
+}
+
+/// A `Cow` resolves to the value it dereferences to, whether borrowed or owned.
+impl<'a, T> DiffableStrRef for Cow<'a, T>
+where
+    T: DiffableStr + ?Sized,
+{
+    type Output = T;
+
+    fn as_diffable_str(&self) -> &Self::Output {
+        // `&Cow<T>` -> `Cow<T>` -> `T` through `Deref`, then re-borrow.
+        &**self
+    }
+}
+
+/// All supported diffable strings.
+///
+/// The text module can work with different types of strings depending
+/// on how the crate is compiled. Out of the box `&str` is always supported,
+/// but with the `bytes` feature one can also work with `[u8]` slices, as
+/// long as they are ASCII compatible.
+///
+/// Requires the `text` feature.
+pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned {
+ /// Splits the value into lines, keeping each line terminator attached to
+ /// the line it ends.
+ ///
+ /// The implementations in this file treat `\n`, `\r\n` and a lone `\r` as
+ /// line terminators; an unterminated trailing remainder becomes the last
+ /// token.
+ fn tokenize_lines(&self) -> Vec<&Self>;
+
+ /// Splits the value into lines with the newlines as separate tokens.
+ ///
+ /// The implementations in this file emit alternating tokens: runs of
+ /// newline characters (`\r` / `\n`) and runs of everything else.
+ fn tokenize_lines_and_newlines(&self) -> Vec<&Self>;
+
+ /// Tokenizes into words.
+ ///
+ /// The implementations in this file emit alternating runs of whitespace
+ /// and non-whitespace characters, so the whitespace is kept as tokens.
+ fn tokenize_words(&self) -> Vec<&Self>;
+
+ /// Tokenizes the input into characters (one token per character).
+ fn tokenize_chars(&self) -> Vec<&Self>;
+
+ /// Tokenizes into unicode words.
+ ///
+ /// Requires the `unicode` feature.
+ #[cfg(feature = "unicode")]
+ fn tokenize_unicode_words(&self) -> Vec<&Self>;
+
+ /// Tokenizes into unicode graphemes.
+ ///
+ /// Requires the `unicode` feature.
+ #[cfg(feature = "unicode")]
+ fn tokenize_graphemes(&self) -> Vec<&Self>;
+
+ /// Returns the value as a `&str` without altering it, if that is possible.
+ ///
+ /// `str` always returns `Some`; the `[u8]` implementation returns `None`
+ /// when the bytes are not valid UTF-8.
+ fn as_str(&self) -> Option<&str>;
+
+ /// Decodes the string (potentially) lossy.
+ fn to_string_lossy(&self) -> Cow<'_, str>;
+
+ /// Checks if the string ends in a newline (`\r` or `\n`).
+ fn ends_with_newline(&self) -> bool;
+
+ /// The length of the string in bytes.
+ fn len(&self) -> usize;
+
+ /// Slices the string.
+ ///
+ /// The implementations in this file index the value directly with `rng`,
+ /// so the range is expressed in byte offsets.
+ fn slice(&self, rng: Range<usize>) -> &Self;
+
+ /// Returns the string as slice of raw bytes.
+ fn as_bytes(&self) -> &[u8];
+
+ /// Checks if the string is empty, i.e. whether [`len`](Self::len) is zero.
+ fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+}
+
+/// Tokenizers and accessors for UTF-8 string slices.
+impl DiffableStr for str {
+    /// Returns every line with its terminator (`\n`, `\r\n` or a lone `\r`) attached.
+    ///
+    /// Text after the final terminator, if any, is returned as the last token.
+    fn tokenize_lines(&self) -> Vec<&Self> {
+        let mut tokens = Vec::new();
+        let mut line_start = 0;
+        let mut chars = self.char_indices().peekable();
+
+        while let Some((pos, ch)) = chars.next() {
+            // Byte offset just past the terminator; any other character simply
+            // extends the current line.
+            let line_end = match ch {
+                '\r' => {
+                    let lf_follows = chars.peek().map(|next| next.1) == Some('\n');
+                    if lf_follows {
+                        // `\r\n` counts as a single terminator.
+                        chars.next();
+                        pos + 2
+                    } else {
+                        pos + 1
+                    }
+                }
+                '\n' => pos + 1,
+                _ => continue,
+            };
+            tokens.push(&self[line_start..line_end]);
+            line_start = line_end;
+        }
+
+        // Unterminated trailing line.
+        if line_start < self.len() {
+            tokens.push(&self[line_start..]);
+        }
+
+        tokens
+    }
+
+    /// Returns alternating runs of newline characters and non-newline characters.
+    fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
+        let is_newline = |ch: char| ch == '\r' || ch == '\n';
+        let mut tokens = Vec::new();
+        let mut chars = self.char_indices().peekable();
+
+        while let Some((start, first)) = chars.next() {
+            let newline_run = is_newline(first);
+            // The run ends where the first character of the other class starts,
+            // or at the end of the string.
+            let mut end = self.len();
+            while let Some(&(pos, ch)) = chars.peek() {
+                if is_newline(ch) != newline_run {
+                    end = pos;
+                    break;
+                }
+                chars.next();
+            }
+            tokens.push(&self[start..end]);
+        }
+
+        tokens
+    }
+
+    /// Returns alternating runs of whitespace and non-whitespace characters.
+    fn tokenize_words(&self) -> Vec<&Self> {
+        let mut tokens = Vec::new();
+        let mut chars = self.char_indices().peekable();
+
+        while let Some((start, first)) = chars.next() {
+            let whitespace_run = first.is_whitespace();
+            // The run ends where the first character of the other class starts,
+            // or at the end of the string.
+            let mut end = self.len();
+            while let Some(&(pos, ch)) = chars.peek() {
+                if ch.is_whitespace() != whitespace_run {
+                    end = pos;
+                    break;
+                }
+                chars.next();
+            }
+            tokens.push(&self[start..end]);
+        }
+
+        tokens
+    }
+
+    /// Returns one sub-slice per `char`.
+    fn tokenize_chars(&self) -> Vec<&Self> {
+        let mut tokens = Vec::new();
+        for (start, ch) in self.char_indices() {
+            tokens.push(&self[start..start + ch.len_utf8()]);
+        }
+        tokens
+    }
+
+    /// Splits at the word boundaries reported by `unicode_segmentation`.
+    #[cfg(feature = "unicode")]
+    fn tokenize_unicode_words(&self) -> Vec<&Self> {
+        use unicode_segmentation::UnicodeSegmentation;
+
+        self.split_word_bounds().collect()
+    }
+
+    /// Splits into the grapheme clusters reported by `unicode_segmentation`.
+    #[cfg(feature = "unicode")]
+    fn tokenize_graphemes(&self) -> Vec<&Self> {
+        use unicode_segmentation::UnicodeSegmentation;
+
+        // `true` asks for extended grapheme clusters.
+        self.graphemes(true).collect()
+    }
+
+    /// A `str` is always available as `&str`.
+    fn as_str(&self) -> Option<&str> {
+        Some(self)
+    }
+
+    /// Borrows `self`; nothing needs to be decoded.
+    fn to_string_lossy(&self) -> Cow<'_, str> {
+        Cow::from(self)
+    }
+
+    /// Checks whether the last character is `\r` or `\n`.
+    fn ends_with_newline(&self) -> bool {
+        self.ends_with('\r') || self.ends_with('\n')
+    }
+
+    /// Length in bytes, delegating to the inherent `str::len`.
+    fn len(&self) -> usize {
+        <str>::len(self)
+    }
+
+    /// Indexes `self` with the given byte range.
+    fn slice(&self, rng: Range<usize>) -> &Self {
+        &self[rng]
+    }
+
+    /// The UTF-8 bytes, delegating to the inherent `str::as_bytes`.
+    fn as_bytes(&self) -> &[u8] {
+        <str>::as_bytes(self)
+    }
+}
+
+/// Byte-slice support, only compiled with the `bytes` feature.
+#[cfg(feature = "bytes")]
+mod bytes_support {
+    use super::*;
+
+    use bstr::ByteSlice;
+
+    /// An owned byte vector resolves to the `[u8]` slice it holds.
+    impl DiffableStrRef for Vec<u8> {
+        type Output = [u8];
+
+        fn as_diffable_str(&self) -> &Self::Output {
+            &self[..]
+        }
+    }
+
+    /// Lets ASCII compatible byte slices be treated as strings.
+    ///
+    /// Requires the `bytes` feature.
+    impl DiffableStr for [u8] {
+        /// Returns every line with its terminator (`\n`, `\r\n` or a lone `\r`) attached.
+        ///
+        /// Bytes after the final terminator, if any, are returned as the last token.
+        fn tokenize_lines(&self) -> Vec<&Self> {
+            let mut tokens = Vec::new();
+            let mut line_start = 0;
+            // Items are `(start, end, char)` triples.
+            let mut chars = self.char_indices().peekable();
+
+            while let Some((_, char_end, ch)) = chars.next() {
+                // Byte offset just past the terminator; any other character simply
+                // extends the current line.
+                let line_end = match ch {
+                    '\r' => {
+                        let lf_follows = chars.peek().map(|next| next.2) == Some('\n');
+                        if lf_follows {
+                            // `\r\n` counts as a single terminator; `\n` is one byte.
+                            chars.next();
+                            char_end + 1
+                        } else {
+                            char_end
+                        }
+                    }
+                    '\n' => char_end,
+                    _ => continue,
+                };
+                tokens.push(&self[line_start..line_end]);
+                line_start = line_end;
+            }
+
+            // Unterminated trailing line.
+            if line_start < self.len() {
+                tokens.push(&self[line_start..]);
+            }
+
+            tokens
+        }
+
+        /// Returns alternating runs of newline characters and non-newline characters.
+        fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
+            let is_newline = |ch: char| ch == '\r' || ch == '\n';
+            let mut tokens = Vec::new();
+            let mut chars = self.char_indices().peekable();
+
+            while let Some((start, first_end, first)) = chars.next() {
+                let newline_run = is_newline(first);
+                let mut end = first_end;
+                // Absorb following characters for as long as they are of the same class.
+                loop {
+                    match chars.peek() {
+                        Some(&(_, next_end, ch)) if is_newline(ch) == newline_run => {
+                            end = next_end;
+                        }
+                        _ => break,
+                    }
+                    chars.next();
+                }
+                tokens.push(&self[start..end]);
+            }
+
+            tokens
+        }
+
+        /// Returns alternating runs of whitespace and non-whitespace characters.
+        fn tokenize_words(&self) -> Vec<&Self> {
+            let mut tokens = Vec::new();
+            let mut chars = self.char_indices().peekable();
+
+            while let Some((start, first_end, first)) = chars.next() {
+                let whitespace_run = first.is_whitespace();
+                let mut end = first_end;
+                // Absorb following characters for as long as they are of the same class.
+                loop {
+                    match chars.peek() {
+                        Some(&(_, next_end, ch)) if ch.is_whitespace() == whitespace_run => {
+                            end = next_end;
+                        }
+                        _ => break,
+                    }
+                    chars.next();
+                }
+                tokens.push(&self[start..end]);
+            }
+
+            tokens
+        }
+
+        /// Splits at the word boundaries reported by `bstr`.
+        #[cfg(feature = "unicode")]
+        fn tokenize_unicode_words(&self) -> Vec<&Self> {
+            self.words_with_breaks().map(str::as_bytes).collect()
+        }
+
+        /// Splits into the grapheme clusters reported by `bstr`.
+        #[cfg(feature = "unicode")]
+        fn tokenize_graphemes(&self) -> Vec<&Self> {
+            self.graphemes().map(str::as_bytes).collect()
+        }
+
+        /// Returns one sub-slice per character yielded by `char_indices`.
+        fn tokenize_chars(&self) -> Vec<&Self> {
+            let mut tokens = Vec::new();
+            for (start, end, _) in self.char_indices() {
+                tokens.push(&self[start..end]);
+            }
+            tokens
+        }
+
+        /// Returns the bytes as `&str`, or `None` when they are not valid UTF-8.
+        fn as_str(&self) -> Option<&str> {
+            match std::str::from_utf8(self) {
+                Ok(text) => Some(text),
+                Err(_) => None,
+            }
+        }
+
+        /// Decodes via `String::from_utf8_lossy`.
+        fn to_string_lossy(&self) -> Cow<'_, str> {
+            String::from_utf8_lossy(self)
+        }
+
+        /// Checks whether the last byte is `\r` or `\n`.
+        fn ends_with_newline(&self) -> bool {
+            self.last_byte()
+                .map_or(false, |byte| byte == b'\r' || byte == b'\n')
+        }
+
+        /// Number of bytes, delegating to the inherent slice `len`.
+        fn len(&self) -> usize {
+            <[u8]>::len(self)
+        }
+
+        /// Indexes `self` with the given byte range.
+        fn slice(&self, rng: Range<usize>) -> &Self {
+            &self[rng]
+        }
+
+        /// The slice itself.
+        fn as_bytes(&self) -> &[u8] {
+            self
+        }
+    }
+}
+
+/// Line tokenization of `str` keeps each terminator attached to its line.
+#[test]
+fn test_split_lines() {
+    // Mixed `\n`, `\r` and `\r\n` terminators plus an unterminated tail.
+    let mixed = DiffableStr::tokenize_lines("first\nsecond\rthird\r\nfourth\nlast");
+    let expected = vec!["first\n", "second\r", "third\r\n", "fourth\n", "last"];
+    assert_eq!(mixed, expected);
+
+    // Blank lines are tokens of their own.
+    let two_blank = DiffableStr::tokenize_lines("\n\n");
+    assert_eq!(two_blank, vec!["\n", "\n"]);
+
+    let one_blank = DiffableStr::tokenize_lines("\n");
+    assert_eq!(one_blank, vec!["\n"]);
+
+    // Empty input yields no tokens at all.
+    let nothing: Vec<&str> = DiffableStr::tokenize_lines("");
+    assert!(nothing.is_empty());
+}
+
+/// Word tokenization of `str` keeps whitespace runs as separate tokens.
+#[test]
+fn test_split_words() {
+    let tokens = DiffableStr::tokenize_words("foo bar baz\n\n aha");
+    // Newlines and spaces are both whitespace, so they share one run.
+    let expected = ["foo", " ", "bar", " ", "baz", "\n\n ", "aha"];
+    assert_eq!(tokens, expected);
+}
+
+/// Character tokenization of `str` splits a multi-codepoint emoji into its parts.
+#[test]
+fn test_split_chars() {
+    // The input ends with a snowflake (U+2744) followed by variation selector U+FE0F.
+    let tokens = DiffableStr::tokenize_chars("abcfö\u{2744}\u{fe0f}");
+    let expected = vec!["a", "b", "c", "f", "ö", "\u{2744}", "\u{fe0f}"];
+    assert_eq!(tokens, expected);
+}
+
+/// Grapheme tokenization of `str` keeps the emoji and its variation selector together.
+#[test]
+#[cfg(feature = "unicode")]
+fn test_split_graphemes() {
+    // The input ends with a snowflake (U+2744) followed by variation selector U+FE0F.
+    let tokens = DiffableStr::tokenize_graphemes("abcfö\u{2744}\u{fe0f}");
+    let expected = vec!["a", "b", "c", "f", "ö", "\u{2744}\u{fe0f}"];
+    assert_eq!(tokens, expected);
+}
+
+/// Line tokenization of `[u8]` keeps each terminator attached to its line.
+#[test]
+#[cfg(feature = "bytes")]
+fn test_split_lines_bytes() {
+    // Mixed `\n`, `\r` and `\r\n` terminators plus an unterminated tail.
+    let mixed: &[u8] = b"first\nsecond\rthird\r\nfourth\nlast";
+    let expected: Vec<&[u8]> = vec![
+        &b"first\n"[..],
+        &b"second\r"[..],
+        &b"third\r\n"[..],
+        &b"fourth\n"[..],
+        &b"last"[..],
+    ];
+    assert_eq!(DiffableStr::tokenize_lines(mixed), expected);
+
+    // Blank lines are tokens of their own.
+    let two_blank: &[u8] = b"\n\n";
+    assert_eq!(
+        DiffableStr::tokenize_lines(two_blank),
+        vec![&b"\n"[..], &b"\n"[..]]
+    );
+
+    let one_blank: &[u8] = b"\n";
+    assert_eq!(DiffableStr::tokenize_lines(one_blank), vec![&b"\n"[..]]);
+
+    // Empty input yields no tokens at all.
+    let empty: &[u8] = b"";
+    assert!(DiffableStr::tokenize_lines(empty).is_empty());
+}
+
+/// Word tokenization of `[u8]` keeps whitespace runs as separate tokens.
+#[test]
+#[cfg(feature = "bytes")]
+fn test_split_words_bytes() {
+    let input = "foo bar baz\n\n aha".as_bytes();
+    // Newlines and spaces are both whitespace, so they share one run.
+    let expected = [
+        "foo".as_bytes(),
+        " ".as_bytes(),
+        "bar".as_bytes(),
+        " ".as_bytes(),
+        "baz".as_bytes(),
+        "\n\n ".as_bytes(),
+        "aha".as_bytes(),
+    ];
+    assert_eq!(DiffableStr::tokenize_words(input), expected);
+}
+
+/// Character tokenization of `[u8]` splits a multi-codepoint emoji into its parts.
+#[test]
+#[cfg(feature = "bytes")]
+fn test_split_chars_bytes() {
+    // The input ends with a snowflake (U+2744) followed by variation selector U+FE0F.
+    let input = "abcfö\u{2744}\u{fe0f}".as_bytes();
+    let expected: Vec<&[u8]> = vec![
+        "a".as_bytes(),
+        "b".as_bytes(),
+        "c".as_bytes(),
+        "f".as_bytes(),
+        "ö".as_bytes(),
+        "\u{2744}".as_bytes(),
+        "\u{fe0f}".as_bytes(),
+    ];
+    assert_eq!(DiffableStr::tokenize_chars(input), expected);
+}
+
+/// Grapheme tokenization of `[u8]` keeps the emoji and its variation selector together.
+#[test]
+#[cfg(all(feature = "bytes", feature = "unicode"))]
+fn test_split_graphemes_bytes() {
+    // The input ends with a snowflake (U+2744) followed by variation selector U+FE0F.
+    let input = "abcfö\u{2744}\u{fe0f}".as_bytes();
+    let expected: Vec<&[u8]> = vec![
+        "a".as_bytes(),
+        "b".as_bytes(),
+        "c".as_bytes(),
+        "f".as_bytes(),
+        "ö".as_bytes(),
+        "\u{2744}\u{fe0f}".as_bytes(),
+    ];
+    assert_eq!(DiffableStr::tokenize_graphemes(input), expected);
+}