From d8bbc7858622b6d9c278469aab701ca0b609cddf Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 15 May 2024 05:35:49 +0200 Subject: Merging upstream version 126.0. Signed-off-by: Daniel Baumann --- third_party/rust/textwrap/src/word_separators.rs | 481 +++++++++++++++++++++++ 1 file changed, 481 insertions(+) create mode 100644 third_party/rust/textwrap/src/word_separators.rs (limited to 'third_party/rust/textwrap/src/word_separators.rs') diff --git a/third_party/rust/textwrap/src/word_separators.rs b/third_party/rust/textwrap/src/word_separators.rs new file mode 100644 index 0000000000..e06e9b88aa --- /dev/null +++ b/third_party/rust/textwrap/src/word_separators.rs @@ -0,0 +1,481 @@ +//! Functionality for finding words. +//! +//! In order to wrap text, we need to know where the legal break +//! points are, i.e., where the words of the text are. This means that +//! we need to define what a "word" is. +//! +//! A simple approach is to simply split the text on whitespace, but +//! this does not work for East-Asian languages such as Chinese or +//! Japanese where there are no spaces between words. Breaking a long +//! sequence of emojis is another example where line breaks might be +//! wanted even if there are no whitespace to be found. +//! +//! The [`WordSeparator`] enum is responsible for determining where +//! there words are in a line of text. Please refer to the enum and +//! its variants for more information. + +#[cfg(feature = "unicode-linebreak")] +use crate::core::skip_ansi_escape_sequence; +use crate::core::Word; + +/// Describes where words occur in a line of text. +/// +/// The simplest approach is say that words are separated by one or +/// more ASCII spaces (`' '`). This works for Western languages +/// without emojis. A more complex approach is to use the Unicode line +/// breaking algorithm, which finds break points in non-ASCII text. +/// +/// The line breaks occur between words, please see +/// [`WordSplitter`](crate::WordSplitter) for options of how to handle +/// hyphenation of individual words. +/// +/// # Examples +/// +/// ``` +/// use textwrap::core::Word; +/// use textwrap::WordSeparator::AsciiSpace; +/// +/// let words = AsciiSpace.find_words("Hello World!").collect::>(); +/// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]); +/// ``` +#[derive(Clone, Copy)] +pub enum WordSeparator { + /// Find words by splitting on runs of `' '` characters. + /// + /// # Examples + /// + /// ``` + /// use textwrap::core::Word; + /// use textwrap::WordSeparator::AsciiSpace; + /// + /// let words = AsciiSpace.find_words("Hello World!").collect::>(); + /// assert_eq!(words, vec![Word::from("Hello "), + /// Word::from("World!")]); + /// ``` + AsciiSpace, + + /// Split `line` into words using Unicode break properties. + /// + /// This word separator uses the Unicode line breaking algorithm + /// described in [Unicode Standard Annex + /// #14](https://www.unicode.org/reports/tr14/) to find legal places + /// to break lines. There is a small difference in that the U+002D + /// (Hyphen-Minus) and U+00AD (Soft Hyphen) don’t create a line break: + /// to allow a line break at a hyphen, use + /// [`WordSplitter::HyphenSplitter`](crate::WordSplitter::HyphenSplitter). + /// Soft hyphens are not currently supported. + /// + /// # Examples + /// + /// Unlike [`WordSeparator::AsciiSpace`], the Unicode line + /// breaking algorithm will find line break opportunities between + /// some characters with no intervening whitespace: + /// + /// ``` + /// #[cfg(feature = "unicode-linebreak")] { + /// use textwrap::core::Word; + /// use textwrap::WordSeparator::UnicodeBreakProperties; + /// + /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: πŸ˜‚πŸ˜").collect::>(), + /// vec![Word::from("Emojis: "), + /// Word::from("πŸ˜‚"), + /// Word::from("😍")]); + /// + /// assert_eq!(UnicodeBreakProperties.find_words("CJK: δ½ ε₯½").collect::>(), + /// vec![Word::from("CJK: "), + /// Word::from("δ½ "), + /// Word::from("ε₯½")]); + /// } + /// ``` + /// + /// A U+2060 (Word Joiner) character can be inserted if you want to + /// manually override the defaults and keep the characters together: + /// + /// ``` + /// #[cfg(feature = "unicode-linebreak")] { + /// use textwrap::core::Word; + /// use textwrap::WordSeparator::UnicodeBreakProperties; + /// + /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: πŸ˜‚\u{2060}😍").collect::>(), + /// vec![Word::from("Emojis: "), + /// Word::from("πŸ˜‚\u{2060}😍")]); + /// } + /// ``` + /// + /// The Unicode line breaking algorithm will also automatically + /// suppress break breaks around certain punctuation characters:: + /// + /// ``` + /// #[cfg(feature = "unicode-linebreak")] { + /// use textwrap::core::Word; + /// use textwrap::WordSeparator::UnicodeBreakProperties; + /// + /// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::>(), + /// vec![Word::from("[ foo ] "), + /// Word::from("bar !")]); + /// } + /// ``` + #[cfg(feature = "unicode-linebreak")] + UnicodeBreakProperties, + + /// Find words using a custom word separator + Custom(fn(line: &str) -> Box> + '_>), +} + +impl PartialEq for WordSeparator { + /// Compare two word separators. + /// + /// ``` + /// use textwrap::WordSeparator; + /// + /// assert_eq!(WordSeparator::AsciiSpace, WordSeparator::AsciiSpace); + /// #[cfg(feature = "unicode-linebreak")] { + /// assert_eq!(WordSeparator::UnicodeBreakProperties, + /// WordSeparator::UnicodeBreakProperties); + /// } + /// ``` + /// + /// Note that `WordSeparator::Custom` values never compare equal: + /// + /// ``` + /// use textwrap::WordSeparator; + /// use textwrap::core::Word; + /// fn word_separator(line: &str) -> Box> + '_> { + /// Box::new(line.split_inclusive(' ').map(Word::from)) + /// } + /// assert_ne!(WordSeparator::Custom(word_separator), + /// WordSeparator::Custom(word_separator)); + /// ``` + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (WordSeparator::AsciiSpace, WordSeparator::AsciiSpace) => true, + #[cfg(feature = "unicode-linebreak")] + (WordSeparator::UnicodeBreakProperties, WordSeparator::UnicodeBreakProperties) => true, + (_, _) => false, + } + } +} + +impl std::fmt::Debug for WordSeparator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + WordSeparator::AsciiSpace => f.write_str("AsciiSpace"), + #[cfg(feature = "unicode-linebreak")] + WordSeparator::UnicodeBreakProperties => f.write_str("UnicodeBreakProperties"), + WordSeparator::Custom(_) => f.write_str("Custom(...)"), + } + } +} + +impl WordSeparator { + /// Create a new word separator. + /// + /// The best available algorithm is used by default, i.e., + /// [`WordSeparator::UnicodeBreakProperties`] if available, + /// otherwise [`WordSeparator::AsciiSpace`]. + pub const fn new() -> Self { + #[cfg(feature = "unicode-linebreak")] + { + WordSeparator::UnicodeBreakProperties + } + + #[cfg(not(feature = "unicode-linebreak"))] + { + WordSeparator::AsciiSpace + } + } + + // This function should really return impl Iterator, but + // this isn't possible until Rust supports higher-kinded types: + // https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md + /// Find all words in `line`. + pub fn find_words<'a>(&self, line: &'a str) -> Box> + 'a> { + match self { + WordSeparator::AsciiSpace => find_words_ascii_space(line), + #[cfg(feature = "unicode-linebreak")] + WordSeparator::UnicodeBreakProperties => find_words_unicode_break_properties(line), + WordSeparator::Custom(func) => func(line), + } + } +} + +fn find_words_ascii_space<'a>(line: &'a str) -> Box> + 'a> { + let mut start = 0; + let mut in_whitespace = false; + let mut char_indices = line.char_indices(); + + Box::new(std::iter::from_fn(move || { + for (idx, ch) in char_indices.by_ref() { + if in_whitespace && ch != ' ' { + let word = Word::from(&line[start..idx]); + start = idx; + in_whitespace = ch == ' '; + return Some(word); + } + + in_whitespace = ch == ' '; + } + + if start < line.len() { + let word = Word::from(&line[start..]); + start = line.len(); + return Some(word); + } + + None + })) +} + +// Strip all ANSI escape sequences from `text`. +#[cfg(feature = "unicode-linebreak")] +fn strip_ansi_escape_sequences(text: &str) -> String { + let mut result = String::with_capacity(text.len()); + + let mut chars = text.chars(); + while let Some(ch) = chars.next() { + if skip_ansi_escape_sequence(ch, &mut chars) { + continue; + } + result.push(ch); + } + + result +} + +/// Soft hyphen, also knows as a β€œshy hyphen”. Should show up as β€˜-’ +/// if a line is broken at this point, and otherwise be invisible. +/// Textwrap does not currently support breaking words at soft +/// hyphens. +#[cfg(feature = "unicode-linebreak")] +const SHY: char = '\u{00ad}'; + +/// Find words in line. ANSI escape sequences are ignored in `line`. +#[cfg(feature = "unicode-linebreak")] +fn find_words_unicode_break_properties<'a>( + line: &'a str, +) -> Box> + 'a> { + // Construct an iterator over (original index, stripped index) + // tuples. We find the Unicode linebreaks on a stripped string, + // but we need the original indices so we can form words based on + // the original string. + let mut last_stripped_idx = 0; + let mut char_indices = line.char_indices(); + let mut idx_map = std::iter::from_fn(move || match char_indices.next() { + Some((orig_idx, ch)) => { + let stripped_idx = last_stripped_idx; + if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) { + last_stripped_idx += ch.len_utf8(); + } + Some((orig_idx, stripped_idx)) + } + None => None, + }); + + let stripped = strip_ansi_escape_sequences(line); + let mut opportunities = unicode_linebreak::linebreaks(&stripped) + .filter(|(idx, _)| { + #[allow(clippy::match_like_matches_macro)] + match &stripped[..*idx].chars().next_back() { + // We suppress breaks at β€˜-’ since we want to control + // this via the WordSplitter. + Some('-') => false, + // Soft hyphens are currently not supported since we + // require all `Word` fragments to be continuous in + // the input string. + Some(SHY) => false, + // Other breaks should be fine! + _ => true, + } + }) + .collect::>() + .into_iter(); + + // Remove final break opportunity, we will add it below using + // &line[start..]; This ensures that we correctly include a + // trailing ANSI escape sequence. + opportunities.next_back(); + + let mut start = 0; + Box::new(std::iter::from_fn(move || { + for (idx, _) in opportunities.by_ref() { + if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx) { + let word = Word::from(&line[start..orig_idx]); + start = orig_idx; + return Some(word); + } + } + + if start < line.len() { + let word = Word::from(&line[start..]); + start = line.len(); + return Some(word); + } + + None + })) +} + +#[cfg(test)] +mod tests { + use super::WordSeparator::*; + use super::*; + + // Like assert_eq!, but the left expression is an iterator. + macro_rules! assert_iter_eq { + ($left:expr, $right:expr) => { + assert_eq!($left.collect::>(), $right); + }; + } + + fn to_words(words: Vec<&str>) -> Vec> { + words.into_iter().map(Word::from).collect() + } + + macro_rules! test_find_words { + ($ascii_name:ident, + $unicode_name:ident, + $([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => { + #[test] + fn $ascii_name() { + $( + let expected_words = to_words($ascii_words.to_vec()); + let actual_words = WordSeparator::AsciiSpace + .find_words($line) + .collect::>(); + assert_eq!(actual_words, expected_words, "Line: {:?}", $line); + )+ + } + + #[test] + #[cfg(feature = "unicode-linebreak")] + fn $unicode_name() { + $( + let expected_words = to_words($unicode_words.to_vec()); + let actual_words = WordSeparator::UnicodeBreakProperties + .find_words($line) + .collect::>(); + assert_eq!(actual_words, expected_words, "Line: {:?}", $line); + )+ + } + }; + } + + test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]); + + test_find_words!( + ascii_single_word, + unicode_single_word, + ["foo", ["foo"], ["foo"]] + ); + + test_find_words!( + ascii_two_words, + unicode_two_words, + ["foo bar", ["foo ", "bar"], ["foo ", "bar"]] + ); + + test_find_words!( + ascii_multiple_words, + unicode_multiple_words, + ["foo bar", ["foo ", "bar"], ["foo ", "bar"]], + ["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]] + ); + + test_find_words!( + ascii_only_whitespace, + unicode_only_whitespace, + [" ", [" "], [" "]], + [" ", [" "], [" "]] + ); + + test_find_words!( + ascii_inter_word_whitespace, + unicode_inter_word_whitespace, + ["foo bar", ["foo ", "bar"], ["foo ", "bar"]] + ); + + test_find_words!( + ascii_trailing_whitespace, + unicode_trailing_whitespace, + ["foo ", ["foo "], ["foo "]] + ); + + test_find_words!( + ascii_leading_whitespace, + unicode_leading_whitespace, + [" foo", [" ", "foo"], [" ", "foo"]] + ); + + test_find_words!( + ascii_multi_column_char, + unicode_multi_column_char, + ["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] // cowboy emoji 🀠 + ); + + test_find_words!( + ascii_hyphens, + unicode_hyphens, + ["foo-bar", ["foo-bar"], ["foo-bar"]], + ["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]], + ["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]], + ["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]] + ); + + test_find_words!( + ascii_newline, + unicode_newline, + ["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]] + ); + + test_find_words!( + ascii_tab, + unicode_tab, + ["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]] + ); + + test_find_words!( + ascii_non_breaking_space, + unicode_non_breaking_space, + ["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]] + ); + + #[test] + #[cfg(unix)] + fn find_words_colored_text() { + use termion::color::{Blue, Fg, Green, Reset}; + + let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset)); + let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset)); + assert_iter_eq!( + AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)), + vec![Word::from(&green_hello), Word::from(&blue_world)] + ); + + #[cfg(feature = "unicode-linebreak")] + assert_iter_eq!( + UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)), + vec![Word::from(&green_hello), Word::from(&blue_world)] + ); + } + + #[test] + fn find_words_color_inside_word() { + let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz"; + assert_iter_eq!(AsciiSpace.find_words(text), vec![Word::from(text)]); + + #[cfg(feature = "unicode-linebreak")] + assert_iter_eq!( + UnicodeBreakProperties.find_words(text), + vec![Word::from(text)] + ); + } + + #[test] + fn word_separator_new() { + #[cfg(feature = "unicode-linebreak")] + assert!(matches!(WordSeparator::new(), UnicodeBreakProperties)); + + #[cfg(not(feature = "unicode-linebreak"))] + assert!(matches!(WordSeparator::new(), AsciiSpace)); + } +} -- cgit v1.2.3