summaryrefslogtreecommitdiffstats
path: root/third_party/rust/textwrap/src/word_separators.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/textwrap/src/word_separators.rs')
-rw-r--r--third_party/rust/textwrap/src/word_separators.rs481
1 files changed, 481 insertions, 0 deletions
diff --git a/third_party/rust/textwrap/src/word_separators.rs b/third_party/rust/textwrap/src/word_separators.rs
new file mode 100644
index 0000000000..e06e9b88aa
--- /dev/null
+++ b/third_party/rust/textwrap/src/word_separators.rs
@@ -0,0 +1,481 @@
+//! Functionality for finding words.
+//!
+//! In order to wrap text, we need to know where the legal break
+//! points are, i.e., where the words of the text are. This means that
+//! we need to define what a "word" is.
+//!
+//! A simple approach is to simply split the text on whitespace, but
+//! this does not work for East-Asian languages such as Chinese or
+//! Japanese where there are no spaces between words. Breaking a long
+//! sequence of emojis is another example where line breaks might be
+//! wanted even if there are no whitespace to be found.
+//!
+//! The [`WordSeparator`] enum is responsible for determining where
+//! there words are in a line of text. Please refer to the enum and
+//! its variants for more information.
+
+#[cfg(feature = "unicode-linebreak")]
+use crate::core::skip_ansi_escape_sequence;
+use crate::core::Word;
+
+/// Describes where words occur in a line of text.
+///
+/// The simplest approach is say that words are separated by one or
+/// more ASCII spaces (`' '`). This works for Western languages
+/// without emojis. A more complex approach is to use the Unicode line
+/// breaking algorithm, which finds break points in non-ASCII text.
+///
+/// The line breaks occur between words, please see
+/// [`WordSplitter`](crate::WordSplitter) for options of how to handle
+/// hyphenation of individual words.
+///
+/// # Examples
+///
+/// ```
+/// use textwrap::core::Word;
+/// use textwrap::WordSeparator::AsciiSpace;
+///
+/// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
+/// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]);
+/// ```
+#[derive(Clone, Copy)]
+pub enum WordSeparator {
+ /// Find words by splitting on runs of `' '` characters.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use textwrap::core::Word;
+ /// use textwrap::WordSeparator::AsciiSpace;
+ ///
+ /// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
+ /// assert_eq!(words, vec![Word::from("Hello "),
+ /// Word::from("World!")]);
+ /// ```
+ AsciiSpace,
+
+ /// Split `line` into words using Unicode break properties.
+ ///
+ /// This word separator uses the Unicode line breaking algorithm
+ /// described in [Unicode Standard Annex
+ /// #14](https://www.unicode.org/reports/tr14/) to find legal places
+ /// to break lines. There is a small difference in that the U+002D
+ /// (Hyphen-Minus) and U+00AD (Soft Hyphen) don’t create a line break:
+ /// to allow a line break at a hyphen, use
+ /// [`WordSplitter::HyphenSplitter`](crate::WordSplitter::HyphenSplitter).
+ /// Soft hyphens are not currently supported.
+ ///
+ /// # Examples
+ ///
+ /// Unlike [`WordSeparator::AsciiSpace`], the Unicode line
+ /// breaking algorithm will find line break opportunities between
+ /// some characters with no intervening whitespace:
+ ///
+ /// ```
+ /// #[cfg(feature = "unicode-linebreak")] {
+ /// use textwrap::core::Word;
+ /// use textwrap::WordSeparator::UnicodeBreakProperties;
+ ///
+ /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: πŸ˜‚πŸ˜").collect::<Vec<_>>(),
+ /// vec![Word::from("Emojis: "),
+ /// Word::from("πŸ˜‚"),
+ /// Word::from("😍")]);
+ ///
+ /// assert_eq!(UnicodeBreakProperties.find_words("CJK: δ½ ε₯½").collect::<Vec<_>>(),
+ /// vec![Word::from("CJK: "),
+ /// Word::from("δ½ "),
+ /// Word::from("ε₯½")]);
+ /// }
+ /// ```
+ ///
+ /// A U+2060 (Word Joiner) character can be inserted if you want to
+ /// manually override the defaults and keep the characters together:
+ ///
+ /// ```
+ /// #[cfg(feature = "unicode-linebreak")] {
+ /// use textwrap::core::Word;
+ /// use textwrap::WordSeparator::UnicodeBreakProperties;
+ ///
+ /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: πŸ˜‚\u{2060}😍").collect::<Vec<_>>(),
+ /// vec![Word::from("Emojis: "),
+ /// Word::from("πŸ˜‚\u{2060}😍")]);
+ /// }
+ /// ```
+ ///
+ /// The Unicode line breaking algorithm will also automatically
+ /// suppress break breaks around certain punctuation characters::
+ ///
+ /// ```
+ /// #[cfg(feature = "unicode-linebreak")] {
+ /// use textwrap::core::Word;
+ /// use textwrap::WordSeparator::UnicodeBreakProperties;
+ ///
+ /// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(),
+ /// vec![Word::from("[ foo ] "),
+ /// Word::from("bar !")]);
+ /// }
+ /// ```
+ #[cfg(feature = "unicode-linebreak")]
+ UnicodeBreakProperties,
+
+ /// Find words using a custom word separator
+ Custom(fn(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_>),
+}
+
+impl PartialEq for WordSeparator {
+ /// Compare two word separators.
+ ///
+ /// ```
+ /// use textwrap::WordSeparator;
+ ///
+ /// assert_eq!(WordSeparator::AsciiSpace, WordSeparator::AsciiSpace);
+ /// #[cfg(feature = "unicode-linebreak")] {
+ /// assert_eq!(WordSeparator::UnicodeBreakProperties,
+ /// WordSeparator::UnicodeBreakProperties);
+ /// }
+ /// ```
+ ///
+ /// Note that `WordSeparator::Custom` values never compare equal:
+ ///
+ /// ```
+ /// use textwrap::WordSeparator;
+ /// use textwrap::core::Word;
+ /// fn word_separator(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_> {
+ /// Box::new(line.split_inclusive(' ').map(Word::from))
+ /// }
+ /// assert_ne!(WordSeparator::Custom(word_separator),
+ /// WordSeparator::Custom(word_separator));
+ /// ```
+ fn eq(&self, other: &Self) -> bool {
+ match (self, other) {
+ (WordSeparator::AsciiSpace, WordSeparator::AsciiSpace) => true,
+ #[cfg(feature = "unicode-linebreak")]
+ (WordSeparator::UnicodeBreakProperties, WordSeparator::UnicodeBreakProperties) => true,
+ (_, _) => false,
+ }
+ }
+}
+
+impl std::fmt::Debug for WordSeparator {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ WordSeparator::AsciiSpace => f.write_str("AsciiSpace"),
+ #[cfg(feature = "unicode-linebreak")]
+ WordSeparator::UnicodeBreakProperties => f.write_str("UnicodeBreakProperties"),
+ WordSeparator::Custom(_) => f.write_str("Custom(...)"),
+ }
+ }
+}
+
+impl WordSeparator {
+ /// Create a new word separator.
+ ///
+ /// The best available algorithm is used by default, i.e.,
+ /// [`WordSeparator::UnicodeBreakProperties`] if available,
+ /// otherwise [`WordSeparator::AsciiSpace`].
+ pub const fn new() -> Self {
+ #[cfg(feature = "unicode-linebreak")]
+ {
+ WordSeparator::UnicodeBreakProperties
+ }
+
+ #[cfg(not(feature = "unicode-linebreak"))]
+ {
+ WordSeparator::AsciiSpace
+ }
+ }
+
+ // This function should really return impl Iterator<Item = Word>, but
+ // this isn't possible until Rust supports higher-kinded types:
+ // https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md
+ /// Find all words in `line`.
+ pub fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
+ match self {
+ WordSeparator::AsciiSpace => find_words_ascii_space(line),
+ #[cfg(feature = "unicode-linebreak")]
+ WordSeparator::UnicodeBreakProperties => find_words_unicode_break_properties(line),
+ WordSeparator::Custom(func) => func(line),
+ }
+ }
+}
+
+fn find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
+ let mut start = 0;
+ let mut in_whitespace = false;
+ let mut char_indices = line.char_indices();
+
+ Box::new(std::iter::from_fn(move || {
+ for (idx, ch) in char_indices.by_ref() {
+ if in_whitespace && ch != ' ' {
+ let word = Word::from(&line[start..idx]);
+ start = idx;
+ in_whitespace = ch == ' ';
+ return Some(word);
+ }
+
+ in_whitespace = ch == ' ';
+ }
+
+ if start < line.len() {
+ let word = Word::from(&line[start..]);
+ start = line.len();
+ return Some(word);
+ }
+
+ None
+ }))
+}
+
+// Strip all ANSI escape sequences from `text`.
+#[cfg(feature = "unicode-linebreak")]
+fn strip_ansi_escape_sequences(text: &str) -> String {
+ let mut result = String::with_capacity(text.len());
+
+ let mut chars = text.chars();
+ while let Some(ch) = chars.next() {
+ if skip_ansi_escape_sequence(ch, &mut chars) {
+ continue;
+ }
+ result.push(ch);
+ }
+
+ result
+}
+
+/// Soft hyphen, also knows as a β€œshy hyphen”. Should show up as β€˜-’
+/// if a line is broken at this point, and otherwise be invisible.
+/// Textwrap does not currently support breaking words at soft
+/// hyphens.
+#[cfg(feature = "unicode-linebreak")]
+const SHY: char = '\u{00ad}';
+
+/// Find words in line. ANSI escape sequences are ignored in `line`.
+#[cfg(feature = "unicode-linebreak")]
+fn find_words_unicode_break_properties<'a>(
+ line: &'a str,
+) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
+ // Construct an iterator over (original index, stripped index)
+ // tuples. We find the Unicode linebreaks on a stripped string,
+ // but we need the original indices so we can form words based on
+ // the original string.
+ let mut last_stripped_idx = 0;
+ let mut char_indices = line.char_indices();
+ let mut idx_map = std::iter::from_fn(move || match char_indices.next() {
+ Some((orig_idx, ch)) => {
+ let stripped_idx = last_stripped_idx;
+ if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) {
+ last_stripped_idx += ch.len_utf8();
+ }
+ Some((orig_idx, stripped_idx))
+ }
+ None => None,
+ });
+
+ let stripped = strip_ansi_escape_sequences(line);
+ let mut opportunities = unicode_linebreak::linebreaks(&stripped)
+ .filter(|(idx, _)| {
+ #[allow(clippy::match_like_matches_macro)]
+ match &stripped[..*idx].chars().next_back() {
+ // We suppress breaks at β€˜-’ since we want to control
+ // this via the WordSplitter.
+ Some('-') => false,
+ // Soft hyphens are currently not supported since we
+ // require all `Word` fragments to be continuous in
+ // the input string.
+ Some(SHY) => false,
+ // Other breaks should be fine!
+ _ => true,
+ }
+ })
+ .collect::<Vec<_>>()
+ .into_iter();
+
+ // Remove final break opportunity, we will add it below using
+ // &line[start..]; This ensures that we correctly include a
+ // trailing ANSI escape sequence.
+ opportunities.next_back();
+
+ let mut start = 0;
+ Box::new(std::iter::from_fn(move || {
+ for (idx, _) in opportunities.by_ref() {
+ if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx) {
+ let word = Word::from(&line[start..orig_idx]);
+ start = orig_idx;
+ return Some(word);
+ }
+ }
+
+ if start < line.len() {
+ let word = Word::from(&line[start..]);
+ start = line.len();
+ return Some(word);
+ }
+
+ None
+ }))
+}
+
+#[cfg(test)]
+mod tests {
+ use super::WordSeparator::*;
+ use super::*;
+
+ // Like assert_eq!, but the left expression is an iterator.
+ macro_rules! assert_iter_eq {
+ ($left:expr, $right:expr) => {
+ assert_eq!($left.collect::<Vec<_>>(), $right);
+ };
+ }
+
+ fn to_words(words: Vec<&str>) -> Vec<Word<'_>> {
+ words.into_iter().map(Word::from).collect()
+ }
+
+ macro_rules! test_find_words {
+ ($ascii_name:ident,
+ $unicode_name:ident,
+ $([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => {
+ #[test]
+ fn $ascii_name() {
+ $(
+ let expected_words = to_words($ascii_words.to_vec());
+ let actual_words = WordSeparator::AsciiSpace
+ .find_words($line)
+ .collect::<Vec<_>>();
+ assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
+ )+
+ }
+
+ #[test]
+ #[cfg(feature = "unicode-linebreak")]
+ fn $unicode_name() {
+ $(
+ let expected_words = to_words($unicode_words.to_vec());
+ let actual_words = WordSeparator::UnicodeBreakProperties
+ .find_words($line)
+ .collect::<Vec<_>>();
+ assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
+ )+
+ }
+ };
+ }
+
+ test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]);
+
+ test_find_words!(
+ ascii_single_word,
+ unicode_single_word,
+ ["foo", ["foo"], ["foo"]]
+ );
+
+ test_find_words!(
+ ascii_two_words,
+ unicode_two_words,
+ ["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
+ );
+
+ test_find_words!(
+ ascii_multiple_words,
+ unicode_multiple_words,
+ ["foo bar", ["foo ", "bar"], ["foo ", "bar"]],
+ ["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]]
+ );
+
+ test_find_words!(
+ ascii_only_whitespace,
+ unicode_only_whitespace,
+ [" ", [" "], [" "]],
+ [" ", [" "], [" "]]
+ );
+
+ test_find_words!(
+ ascii_inter_word_whitespace,
+ unicode_inter_word_whitespace,
+ ["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
+ );
+
+ test_find_words!(
+ ascii_trailing_whitespace,
+ unicode_trailing_whitespace,
+ ["foo ", ["foo "], ["foo "]]
+ );
+
+ test_find_words!(
+ ascii_leading_whitespace,
+ unicode_leading_whitespace,
+ [" foo", [" ", "foo"], [" ", "foo"]]
+ );
+
+ test_find_words!(
+ ascii_multi_column_char,
+ unicode_multi_column_char,
+ ["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] // cowboy emoji 🀠
+ );
+
+ test_find_words!(
+ ascii_hyphens,
+ unicode_hyphens,
+ ["foo-bar", ["foo-bar"], ["foo-bar"]],
+ ["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]],
+ ["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]],
+ ["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]]
+ );
+
+ test_find_words!(
+ ascii_newline,
+ unicode_newline,
+ ["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]]
+ );
+
+ test_find_words!(
+ ascii_tab,
+ unicode_tab,
+ ["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]]
+ );
+
+ test_find_words!(
+ ascii_non_breaking_space,
+ unicode_non_breaking_space,
+ ["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]]
+ );
+
+ #[test]
+ #[cfg(unix)]
+ fn find_words_colored_text() {
+ use termion::color::{Blue, Fg, Green, Reset};
+
+ let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
+ let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
+ assert_iter_eq!(
+ AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)),
+ vec![Word::from(&green_hello), Word::from(&blue_world)]
+ );
+
+ #[cfg(feature = "unicode-linebreak")]
+ assert_iter_eq!(
+ UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)),
+ vec![Word::from(&green_hello), Word::from(&blue_world)]
+ );
+ }
+
+ #[test]
+ fn find_words_color_inside_word() {
+ let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";
+ assert_iter_eq!(AsciiSpace.find_words(text), vec![Word::from(text)]);
+
+ #[cfg(feature = "unicode-linebreak")]
+ assert_iter_eq!(
+ UnicodeBreakProperties.find_words(text),
+ vec![Word::from(text)]
+ );
+ }
+
+ #[test]
+ fn word_separator_new() {
+ #[cfg(feature = "unicode-linebreak")]
+ assert!(matches!(WordSeparator::new(), UnicodeBreakProperties));
+
+ #[cfg(not(feature = "unicode-linebreak"))]
+ assert!(matches!(WordSeparator::new(), AsciiSpace));
+ }
+}