use std::mem::take;
use std::str::from_utf8_unchecked;

use crate::TokenSource;

/// Returns a [`TokenSource`](crate::intern::TokenSource) that uses
/// the lines in `data` as Tokens. The newline separator (`\r\n` or `\n`) is
/// not included in the emitted tokens.
/// This means that changing the newline separator from `\r\n` to `\n`
/// (or omitting it fully on the last line) is not detected by [`diff`](crate::diff).
pub fn lines(data: &str) -> Lines<'_, false> {
    Lines(ByteLines(data.as_bytes()))
}

/// Returns a [`TokenSource`](crate::intern::TokenSource) that uses
/// the lines in `data` as Tokens. The newline separator (`\r\n` or `\n`) is
/// included in the emitted tokens.
/// This means that changing the newline separator from `\r\n` to `\n`
/// (or omitting it fully on the last line) is detected by [`diff`](crate::diff).
pub fn lines_with_terminator(data: &str) -> Lines<'_, true> {
    Lines(ByteLines(data.as_bytes()))
}

/// Returns a [`TokenSource`](crate::intern::TokenSource) that uses
/// the lines in `data` as Tokens. A line is a continuous subslice of
/// `data` which does not contain `\n` (or `\r\n`).
/// The newline separator (`\r\n` or `\n`) is included in the emitted tokens.
/// This means that changing the newline separator from `\r\n` to `\n`
/// (or omitting it fully on the last line) is detected by [`diff`](crate::diff).
pub fn byte_lines_with_terminator(data: &[u8]) -> ByteLines<'_, true> {
    ByteLines(data)
}

/// Returns a [`TokenSource`](crate::intern::TokenSource) that uses
/// the lines in `data` as Tokens. The newline separator (`\r\n` or `\n`) is
/// not included in the emitted tokens.
/// This means that changing the newline separator from `\r\n` to `\n`
/// (or omitting it fully on the last line) is not detected by [`diff`](crate::diff).
pub fn byte_lines(data: &[u8]) -> ByteLines<'_, false> {
    ByteLines(data)
}

/// By default a line diff is produced for a string
impl<'a> TokenSource for &'a str {
    type Token = &'a str;
    type Tokenizer = Lines<'a, false>;

    fn tokenize(&self) -> Self::Tokenizer {
        lines(self)
    }

    fn estimate_tokens(&self) -> u32 {
        lines_with_terminator(self).estimate_tokens()
    }
}

/// By default a line diff is produced for bytes
impl<'a> TokenSource for &'a [u8] {
    type Token = Self;
    type Tokenizer = ByteLines<'a, false>;

    fn tokenize(&self) -> Self::Tokenizer {
        byte_lines(self)
    }

    fn estimate_tokens(&self) -> u32 {
        byte_lines(self).estimate_tokens()
    }
}

/// A [`TokenSource`](crate::intern::TokenSource) that returns the lines of a `str` as tokens.
/// See [`lines`](crate::sources::lines) and [`lines_with_terminator`](crate::sources::lines_with_terminator) for details
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct Lines<'a, const INCLUDE_LINE_TERMINATOR: bool>(ByteLines<'a, INCLUDE_LINE_TERMINATOR>);

impl<'a, const INCLUDE_LINE_TERMINATOR: bool> Iterator for Lines<'a, INCLUDE_LINE_TERMINATOR> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        // SAFETY: this struct may only contain valid utf8;
        // dividing valid utf8 bytes at ascii characters always produces valid utf-8
        self.0.next().map(|it| unsafe { from_utf8_unchecked(it) })
    }
}

/// By default a line diff is produced for a string
impl<'a, const INCLUDE_LINE_TERMINATOR: bool> TokenSource for Lines<'a, INCLUDE_LINE_TERMINATOR> {
    type Token = &'a str;
    type Tokenizer = Self;

    fn tokenize(&self) -> Self::Tokenizer {
        *self
    }

    fn estimate_tokens(&self) -> u32 {
        self.0.estimate_tokens()
    }
}

/// A [`TokenSource`](crate::intern::TokenSource) that returns the lines of a byte slice as tokens.
/// See [`byte_lines`](crate::sources::lines) and [`byte_lines_with_terminator`](crate::sources::byte_lines_with_terminator) for details #[derive(Clone, Copy, PartialEq, Eq)] pub struct ByteLines<'a, const INCLUDE_LINE_TERMINATOR: bool>(&'a [u8]); impl<'a, const INCLUDE_LINE_TERMINATOR: bool> Iterator for ByteLines<'a, INCLUDE_LINE_TERMINATOR> { type Item = &'a [u8]; fn next(&mut self) -> Option { let mut saw_carriage_return = false; let mut iter = self.0.iter().enumerate(); let line_len = loop { match iter.next() { Some((i, b'\n')) => break i + 1, None => { return (!self.0.is_empty()).then(|| take(&mut self.0)); } Some((_, &it)) => saw_carriage_return = it == b'\r', } }; let (mut line, rem) = self.0.split_at(line_len); self.0 = rem; if !INCLUDE_LINE_TERMINATOR { line = &line[..line_len - 1 - saw_carriage_return as usize]; } Some(line) } } /// By default a line diff is produced for a string impl<'a, const INCLUDE_LINE_TERMINATOR: bool> TokenSource for ByteLines<'a, INCLUDE_LINE_TERMINATOR> { type Token = &'a [u8]; type Tokenizer = Self; fn tokenize(&self) -> Self::Tokenizer { *self } fn estimate_tokens(&self) -> u32 { let len: usize = self.take(20).map(|line| line.len()).sum(); if len == 0 { 100 } else { (self.0.len() * 20 / len) as u32 } } }