diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch) | |
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/rust/textwrap/src | |
parent | Initial commit. (diff) | |
download | firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip |
Adding upstream version 115.7.0esr.upstream/115.7.0esrupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/textwrap/src')
-rw-r--r-- | third_party/rust/textwrap/src/core.rs | 433 | ||||
-rw-r--r-- | third_party/rust/textwrap/src/indentation.rs | 347 | ||||
-rw-r--r-- | third_party/rust/textwrap/src/lib.rs | 1847 | ||||
-rw-r--r-- | third_party/rust/textwrap/src/word_separators.rs | 428 | ||||
-rw-r--r-- | third_party/rust/textwrap/src/word_splitters.rs | 314 | ||||
-rw-r--r-- | third_party/rust/textwrap/src/wrap_algorithms.rs | 381 | ||||
-rw-r--r-- | third_party/rust/textwrap/src/wrap_algorithms/optimal_fit.rs | 433 |
7 files changed, 4183 insertions, 0 deletions
diff --git a/third_party/rust/textwrap/src/core.rs b/third_party/rust/textwrap/src/core.rs new file mode 100644 index 0000000000..0ab4ef8134 --- /dev/null +++ b/third_party/rust/textwrap/src/core.rs @@ -0,0 +1,433 @@ +//! Building blocks for advanced wrapping functionality. +//! +//! The functions and structs in this module can be used to implement +//! advanced wrapping functionality when the [`wrap`](super::wrap) and +//! [`fill`](super::fill) function don't do what you want. +//! +//! In general, you want to follow these steps when wrapping +//! something: +//! +//! 1. Split your input into [`Fragment`]s. These are abstract blocks +//! of text or content which can be wrapped into lines. See +//! [`WordSeparator`](crate::word_separators::WordSeparator) for +//! how to do this for text. +//! +//! 2. Potentially split your fragments into smaller pieces. This +//! allows you to implement things like hyphenation. If you use the +//! `Word` type, you can use [`WordSplitter`](crate::WordSplitter) +//! enum for this. +//! +//! 3. Potentially break apart fragments that are still too large to +//! fit on a single line. This is implemented in [`break_words`]. +//! +//! 4. Finally take your fragments and put them into lines. There are +//! two algorithms for this in the +//! [`wrap_algorithms`](crate::wrap_algorithms) module: +//! [`wrap_optimal_fit`](crate::wrap_algorithms::wrap_optimal_fit) +//! and [`wrap_first_fit`](crate::wrap_algorithms::wrap_first_fit). +//! The former produces better line breaks, the latter is faster. +//! +//! 5. Iterate through the slices returned by the wrapping functions +//! and construct your lines of output. +//! +//! Please [open an issue](https://github.com/mgeisler/textwrap/) if +//! the functionality here is not sufficient or if you have ideas for +//! improving it. We would love to hear from you! + +/// The CSI or “Control Sequence Introducer” introduces an ANSI escape +/// sequence. 
This is typically used for colored text and will be +/// ignored when computing the text width. +const CSI: (char, char) = ('\x1b', '['); +/// The final bytes of an ANSI escape sequence must be in this range. +const ANSI_FINAL_BYTE: std::ops::RangeInclusive<char> = '\x40'..='\x7e'; + +/// Skip ANSI escape sequences. The `ch` is the current `char`, the +/// `chars` provide the following characters. The `chars` will be +/// modified if `ch` is the start of an ANSI escape sequence. +#[inline] +pub(crate) fn skip_ansi_escape_sequence<I: Iterator<Item = char>>(ch: char, chars: &mut I) -> bool { + if ch == CSI.0 && chars.next() == Some(CSI.1) { + // We have found the start of an ANSI escape code, typically + // used for colored terminal text. We skip until we find a + // "final byte" in the range 0x40–0x7E. + for ch in chars { + if ANSI_FINAL_BYTE.contains(&ch) { + return true; + } + } + } + false +} + +#[cfg(feature = "unicode-width")] +#[inline] +fn ch_width(ch: char) -> usize { + unicode_width::UnicodeWidthChar::width(ch).unwrap_or(0) +} + +/// First character which [`ch_width`] will classify as double-width. +/// Please see [`display_width`]. +#[cfg(not(feature = "unicode-width"))] +const DOUBLE_WIDTH_CUTOFF: char = '\u{1100}'; + +#[cfg(not(feature = "unicode-width"))] +#[inline] +fn ch_width(ch: char) -> usize { + if ch < DOUBLE_WIDTH_CUTOFF { + 1 + } else { + 2 + } +} + +/// Compute the display width of `text` while skipping over ANSI +/// escape sequences. +/// +/// # Examples +/// +/// ``` +/// use textwrap::core::display_width; +/// +/// assert_eq!(display_width("Café Plain"), 10); +/// assert_eq!(display_width("\u{1b}[31mCafé Rouge\u{1b}[0m"), 10); +/// ``` +/// +/// **Note:** When the `unicode-width` Cargo feature is disabled, the +/// width of a `char` is determined by a crude approximation which +/// simply counts chars below U+1100 as 1 column wide, and all other +/// characters as 2 columns wide. 
With the feature enabled, this function +/// will correctly deal with [combining characters] in their +/// decomposed form (see [Unicode equivalence]). +/// +/// An example of a decomposed character is “é”, which can be +/// decomposed into: “e” followed by a combining acute accent: “◌́”. +/// Without the `unicode-width` Cargo feature, every `char` below +/// U+1100 has a width of 1. This includes the combining accent: +/// +/// ``` +/// use textwrap::core::display_width; +/// +/// assert_eq!(display_width("Cafe Plain"), 10); +/// #[cfg(feature = "unicode-width")] +/// assert_eq!(display_width("Cafe\u{301} Plain"), 10); +/// #[cfg(not(feature = "unicode-width"))] +/// assert_eq!(display_width("Cafe\u{301} Plain"), 11); +/// ``` +/// +/// ## Emojis and CJK Characters +/// +/// Characters such as emojis and [CJK characters] used in the +/// Chinese, Japanese, and Korean languages are seen as double-width, +/// even if the `unicode-width` feature is disabled: +/// +/// ``` +/// use textwrap::core::display_width; +/// +/// assert_eq!(display_width("😂😭🥺🤣✨😍🙏🥰😊🔥"), 20); +/// assert_eq!(display_width("你好"), 4); // “Nǐ hǎo” or “Hello” in Chinese +/// ``` +/// +/// # Limitations +/// +/// The displayed width of a string cannot always be computed from the +/// string alone. This is because the width depends on the rendering +/// engine used. This is particularly visible with [emoji modifier +/// sequences] where a base emoji is modified with, e.g., skin tone or +/// hair color modifiers. It is up to the rendering engine to detect +/// this and to produce a suitable emoji. +/// +/// A simple example is “❤️”, which consists of “❤” (U+2764: Black +/// Heart Symbol) followed by U+FE0F (Variation Selector-16). By +/// itself, “❤” is a black heart, but if you follow it with the +/// variant selector, you may get a wider red heart. +/// +/// A more complex example would be “👨🦰” which should depict a man +/// with red hair. 
Here the computed width is too large — and the +/// width differs depending on the use of the `unicode-width` feature: +/// +/// ``` +/// use textwrap::core::display_width; +/// +/// assert_eq!("👨🦰".chars().collect::<Vec<char>>(), ['\u{1f468}', '\u{200d}', '\u{1f9b0}']); +/// #[cfg(feature = "unicode-width")] +/// assert_eq!(display_width("👨🦰"), 4); +/// #[cfg(not(feature = "unicode-width"))] +/// assert_eq!(display_width("👨🦰"), 6); +/// ``` +/// +/// This happens because the grapheme consists of three code points: +/// “👨” (U+1F468: Man), Zero Width Joiner (U+200D), and “🦰” +/// (U+1F9B0: Red Hair). You can see them above in the test. With +/// `unicode-width` enabled, the ZWJ is correctly seen as having zero +/// width, without it is counted as a double-width character. +/// +/// ## Terminal Support +/// +/// Modern browsers typically do a great job at combining characters +/// as shown above, but terminals often struggle more. As an example, +/// Gnome Terminal version 3.38.1, shows “❤️” as a big red heart, but +/// shows "👨🦰" as “👨🦰”. +/// +/// [combining characters]: https://en.wikipedia.org/wiki/Combining_character +/// [Unicode equivalence]: https://en.wikipedia.org/wiki/Unicode_equivalence +/// [CJK characters]: https://en.wikipedia.org/wiki/CJK_characters +/// [emoji modifier sequences]: https://unicode.org/emoji/charts/full-emoji-modifiers.html +pub fn display_width(text: &str) -> usize { + let mut chars = text.chars(); + let mut width = 0; + while let Some(ch) = chars.next() { + if skip_ansi_escape_sequence(ch, &mut chars) { + continue; + } + width += ch_width(ch); + } + width +} + +/// A (text) fragment denotes the unit which we wrap into lines. +/// +/// Fragments represent an abstract _word_ plus the _whitespace_ +/// following the word. In case the word falls at the end of the line, +/// the whitespace is dropped and a so-called _penalty_ is inserted +/// instead (typically `"-"` if the word was hyphenated). 
+/// +/// For wrapping purposes, the precise content of the word, the +/// whitespace, and the penalty is irrelevant. All we need to know is +/// the displayed width of each part, which this trait provides. +pub trait Fragment: std::fmt::Debug { + /// Displayed width of word represented by this fragment. + fn width(&self) -> f64; + + /// Displayed width of the whitespace that must follow the word + /// when the word is not at the end of a line. + fn whitespace_width(&self) -> f64; + + /// Displayed width of the penalty that must be inserted if the + /// word falls at the end of a line. + fn penalty_width(&self) -> f64; +} + +/// A piece of wrappable text, including any trailing whitespace. +/// +/// A `Word` is an example of a [`Fragment`], so it has a width, +/// trailing whitespace, and potentially a penalty item. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct Word<'a> { + /// Word content. + pub word: &'a str, + /// Whitespace to insert if the word does not fall at the end of a line. + pub whitespace: &'a str, + /// Penalty string to insert if the word falls at the end of a line. + pub penalty: &'a str, + // Cached width in columns. + pub(crate) width: usize, +} + +impl std::ops::Deref for Word<'_> { + type Target = str; + + fn deref(&self) -> &Self::Target { + self.word + } +} + +impl<'a> Word<'a> { + /// Construct a `Word` from a string. + /// + /// A trailing stretch of `' '` is automatically taken to be the + /// whitespace part of the word. + pub fn from(word: &str) -> Word<'_> { + let trimmed = word.trim_end_matches(' '); + Word { + word: trimmed, + width: display_width(trimmed), + whitespace: &word[trimmed.len()..], + penalty: "", + } + } + + /// Break this word into smaller words with a width of at most + /// `line_width`. The whitespace and penalty from this `Word` is + /// added to the last piece. + /// + /// # Examples + /// + /// ``` + /// use textwrap::core::Word; + /// assert_eq!( + /// Word::from("Hello! 
").break_apart(3).collect::<Vec<_>>(), + /// vec![Word::from("Hel"), Word::from("lo! ")] + /// ); + /// ``` + pub fn break_apart<'b>(&'b self, line_width: usize) -> impl Iterator<Item = Word<'a>> + 'b { + let mut char_indices = self.word.char_indices(); + let mut offset = 0; + let mut width = 0; + + std::iter::from_fn(move || { + while let Some((idx, ch)) = char_indices.next() { + if skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) { + continue; + } + + if width > 0 && width + ch_width(ch) > line_width { + let word = Word { + word: &self.word[offset..idx], + width: width, + whitespace: "", + penalty: "", + }; + offset = idx; + width = ch_width(ch); + return Some(word); + } + + width += ch_width(ch); + } + + if offset < self.word.len() { + let word = Word { + word: &self.word[offset..], + width: width, + whitespace: self.whitespace, + penalty: self.penalty, + }; + offset = self.word.len(); + return Some(word); + } + + None + }) + } +} + +impl Fragment for Word<'_> { + #[inline] + fn width(&self) -> f64 { + self.width as f64 + } + + // We assume the whitespace consist of ' ' only. This allows us to + // compute the display width in constant time. + #[inline] + fn whitespace_width(&self) -> f64 { + self.whitespace.len() as f64 + } + + // We assume the penalty is `""` or `"-"`. This allows us to + // compute the display width in constant time. + #[inline] + fn penalty_width(&self) -> f64 { + self.penalty.len() as f64 + } +} + +/// Forcibly break words wider than `line_width` into smaller words. +/// +/// This simply calls [`Word::break_apart`] on words that are too +/// wide. This means that no extra `'-'` is inserted, the word is +/// simply broken into smaller pieces. 
+pub fn break_words<'a, I>(words: I, line_width: usize) -> Vec<Word<'a>> +where + I: IntoIterator<Item = Word<'a>>, +{ + let mut shortened_words = Vec::new(); + for word in words { + if word.width() > line_width as f64 { + shortened_words.extend(word.break_apart(line_width)); + } else { + shortened_words.push(word); + } + } + shortened_words +} + +#[cfg(test)] +mod tests { + use super::*; + + #[cfg(feature = "unicode-width")] + use unicode_width::UnicodeWidthChar; + + #[test] + fn skip_ansi_escape_sequence_works() { + let blue_text = "\u{1b}[34mHello\u{1b}[0m"; + let mut chars = blue_text.chars(); + let ch = chars.next().unwrap(); + assert!(skip_ansi_escape_sequence(ch, &mut chars)); + assert_eq!(chars.next(), Some('H')); + } + + #[test] + fn emojis_have_correct_width() { + use unic_emoji_char::is_emoji; + + // Emojis in the Basic Latin (ASCII) and Latin-1 Supplement + // blocks all have a width of 1 column. This includes + // characters such as '#' and '©'. + for ch in '\u{1}'..'\u{FF}' { + if is_emoji(ch) { + let desc = format!("{:?} U+{:04X}", ch, ch as u32); + + #[cfg(feature = "unicode-width")] + assert_eq!(ch.width().unwrap(), 1, "char: {}", desc); + + #[cfg(not(feature = "unicode-width"))] + assert_eq!(ch_width(ch), 1, "char: {}", desc); + } + } + + // Emojis in the remaining blocks of the Basic Multilingual + // Plane (BMP), in the Supplementary Multilingual Plane (SMP), + // and in the Supplementary Ideographic Plane (SIP), are all 1 + // or 2 columns wide when unicode-width is used, and always 2 + // columns wide otherwise. This includes all of our favorite + // emojis such as 😊. 
+ for ch in '\u{FF}'..'\u{2FFFF}' { + if is_emoji(ch) { + let desc = format!("{:?} U+{:04X}", ch, ch as u32); + + #[cfg(feature = "unicode-width")] + assert!(ch.width().unwrap() <= 2, "char: {}", desc); + + #[cfg(not(feature = "unicode-width"))] + assert_eq!(ch_width(ch), 2, "char: {}", desc); + } + } + + // The remaining planes contain almost no assigned code points + // and thus also no emojis. + } + + #[test] + fn display_width_works() { + assert_eq!("Café Plain".len(), 11); // “é” is two bytes + assert_eq!(display_width("Café Plain"), 10); + assert_eq!(display_width("\u{1b}[31mCafé Rouge\u{1b}[0m"), 10); + } + + #[test] + fn display_width_narrow_emojis() { + #[cfg(feature = "unicode-width")] + assert_eq!(display_width("⁉"), 1); + + // The ⁉ character is above DOUBLE_WIDTH_CUTOFF. + #[cfg(not(feature = "unicode-width"))] + assert_eq!(display_width("⁉"), 2); + } + + #[test] + fn display_width_narrow_emojis_variant_selector() { + #[cfg(feature = "unicode-width")] + assert_eq!(display_width("⁉\u{fe0f}"), 1); + + // The variant selector-16 is also counted. + #[cfg(not(feature = "unicode-width"))] + assert_eq!(display_width("⁉\u{fe0f}"), 4); + } + + #[test] + fn display_width_emojis() { + assert_eq!(display_width("😂😭🥺🤣✨😍🙏🥰😊🔥"), 20); + } +} diff --git a/third_party/rust/textwrap/src/indentation.rs b/third_party/rust/textwrap/src/indentation.rs new file mode 100644 index 0000000000..5d90c06156 --- /dev/null +++ b/third_party/rust/textwrap/src/indentation.rs @@ -0,0 +1,347 @@ +//! Functions related to adding and removing indentation from lines of +//! text. +//! +//! The functions here can be used to uniformly indent or dedent +//! (unindent) word wrapped lines of text. + +/// Indent each line by the given prefix. +/// +/// # Examples +/// +/// ``` +/// use textwrap::indent; +/// +/// assert_eq!(indent("First line.\nSecond line.\n", " "), +/// " First line.\n Second line.\n"); +/// ``` +/// +/// When indenting, trailing whitespace is stripped from the prefix. 
+/// This means that empty lines remain empty afterwards: +/// +/// ``` +/// use textwrap::indent; +/// +/// assert_eq!(indent("First line.\n\n\nSecond line.\n", " "), +/// " First line.\n\n\n Second line.\n"); +/// ``` +/// +/// Notice how `"\n\n\n"` remained as `"\n\n\n"`. +/// +/// This feature is useful when you want to indent text and have a +/// space between your prefix and the text. In this case, you _don't_ +/// want a trailing space on empty lines: +/// +/// ``` +/// use textwrap::indent; +/// +/// assert_eq!(indent("foo = 123\n\nprint(foo)\n", "# "), +/// "# foo = 123\n#\n# print(foo)\n"); +/// ``` +/// +/// Notice how `"\n\n"` became `"\n#\n"` instead of `"\n# \n"` which +/// would have trailing whitespace. +/// +/// Leading and trailing whitespace coming from the text itself is +/// kept unchanged: +/// +/// ``` +/// use textwrap::indent; +/// +/// assert_eq!(indent(" \t Foo ", "->"), "-> \t Foo "); +/// ``` +pub fn indent(s: &str, prefix: &str) -> String { + // We know we'll need more than s.len() bytes for the output, but + // without counting '\n' characters (which is somewhat slow), we + // don't know exactly how much. However, we can preemptively do + // the first doubling of the output size. + let mut result = String::with_capacity(2 * s.len()); + let trimmed_prefix = prefix.trim_end(); + for (idx, line) in s.split_terminator('\n').enumerate() { + if idx > 0 { + result.push('\n'); + } + if line.trim().is_empty() { + result.push_str(trimmed_prefix); + } else { + result.push_str(prefix); + } + result.push_str(line); + } + if s.ends_with('\n') { + // split_terminator will have eaten the final '\n'. + result.push('\n'); + } + result +} + +/// Removes common leading whitespace from each line. 
+/// +/// This function will look at each non-empty line and determine the +/// maximum amount of whitespace that can be removed from all lines: +/// +/// ``` +/// use textwrap::dedent; +/// +/// assert_eq!(dedent(" +/// 1st line +/// 2nd line +/// 3rd line +/// "), " +/// 1st line +/// 2nd line +/// 3rd line +/// "); +/// ``` +pub fn dedent(s: &str) -> String { + let mut prefix = ""; + let mut lines = s.lines(); + + // We first search for a non-empty line to find a prefix. + for line in &mut lines { + let mut whitespace_idx = line.len(); + for (idx, ch) in line.char_indices() { + if !ch.is_whitespace() { + whitespace_idx = idx; + break; + } + } + + // Check if the line had anything but whitespace + if whitespace_idx < line.len() { + prefix = &line[..whitespace_idx]; + break; + } + } + + // We then continue looking through the remaining lines to + // possibly shorten the prefix. + for line in &mut lines { + let mut whitespace_idx = line.len(); + for ((idx, a), b) in line.char_indices().zip(prefix.chars()) { + if a != b { + whitespace_idx = idx; + break; + } + } + + // Check if the line had anything but whitespace and if we + // have found a shorter prefix + if whitespace_idx < line.len() && whitespace_idx < prefix.len() { + prefix = &line[..whitespace_idx]; + } + } + + // We now go over the lines a second time to build the result. 
+ let mut result = String::new(); + for line in s.lines() { + if line.starts_with(&prefix) && line.chars().any(|c| !c.is_whitespace()) { + let (_, tail) = line.split_at(prefix.len()); + result.push_str(tail); + } + result.push('\n'); + } + + if result.ends_with('\n') && !s.ends_with('\n') { + let new_len = result.len() - 1; + result.truncate(new_len); + } + + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn indent_empty() { + assert_eq!(indent("\n", " "), "\n"); + } + + #[test] + #[rustfmt::skip] + fn indent_nonempty() { + let text = [ + " foo\n", + "bar\n", + " baz\n", + ].join(""); + let expected = [ + "// foo\n", + "// bar\n", + "// baz\n", + ].join(""); + assert_eq!(indent(&text, "// "), expected); + } + + #[test] + #[rustfmt::skip] + fn indent_empty_line() { + let text = [ + " foo", + "bar", + "", + " baz", + ].join("\n"); + let expected = [ + "// foo", + "// bar", + "//", + "// baz", + ].join("\n"); + assert_eq!(indent(&text, "// "), expected); + } + + #[test] + fn dedent_empty() { + assert_eq!(dedent(""), ""); + } + + #[test] + #[rustfmt::skip] + fn dedent_multi_line() { + let x = [ + " foo", + " bar", + " baz", + ].join("\n"); + let y = [ + " foo", + "bar", + " baz" + ].join("\n"); + assert_eq!(dedent(&x), y); + } + + #[test] + #[rustfmt::skip] + fn dedent_empty_line() { + let x = [ + " foo", + " bar", + " ", + " baz" + ].join("\n"); + let y = [ + " foo", + "bar", + "", + " baz" + ].join("\n"); + assert_eq!(dedent(&x), y); + } + + #[test] + #[rustfmt::skip] + fn dedent_blank_line() { + let x = [ + " foo", + "", + " bar", + " foo", + " bar", + " baz", + ].join("\n"); + let y = [ + "foo", + "", + " bar", + " foo", + " bar", + " baz", + ].join("\n"); + assert_eq!(dedent(&x), y); + } + + #[test] + #[rustfmt::skip] + fn dedent_whitespace_line() { + let x = [ + " foo", + " ", + " bar", + " foo", + " bar", + " baz", + ].join("\n"); + let y = [ + "foo", + "", + " bar", + " foo", + " bar", + " baz", + ].join("\n"); + assert_eq!(dedent(&x), y); 
+ } + + #[test] + #[rustfmt::skip] + fn dedent_mixed_whitespace() { + let x = [ + "\tfoo", + " bar", + ].join("\n"); + let y = [ + "\tfoo", + " bar", + ].join("\n"); + assert_eq!(dedent(&x), y); + } + + #[test] + #[rustfmt::skip] + fn dedent_tabbed_whitespace() { + let x = [ + "\t\tfoo", + "\t\t\tbar", + ].join("\n"); + let y = [ + "foo", + "\tbar", + ].join("\n"); + assert_eq!(dedent(&x), y); + } + + #[test] + #[rustfmt::skip] + fn dedent_mixed_tabbed_whitespace() { + let x = [ + "\t \tfoo", + "\t \t\tbar", + ].join("\n"); + let y = [ + "foo", + "\tbar", + ].join("\n"); + assert_eq!(dedent(&x), y); + } + + #[test] + #[rustfmt::skip] + fn dedent_mixed_tabbed_whitespace2() { + let x = [ + "\t \tfoo", + "\t \tbar", + ].join("\n"); + let y = [ + "\tfoo", + " \tbar", + ].join("\n"); + assert_eq!(dedent(&x), y); + } + + #[test] + #[rustfmt::skip] + fn dedent_preserve_no_terminating_newline() { + let x = [ + " foo", + " bar", + ].join("\n"); + let y = [ + "foo", + " bar", + ].join("\n"); + assert_eq!(dedent(&x), y); + } +} diff --git a/third_party/rust/textwrap/src/lib.rs b/third_party/rust/textwrap/src/lib.rs new file mode 100644 index 0000000000..e570eac2a8 --- /dev/null +++ b/third_party/rust/textwrap/src/lib.rs @@ -0,0 +1,1847 @@ +//! The textwrap library provides functions for word wrapping and +//! indenting text. +//! +//! # Wrapping Text +//! +//! Wrapping text can be very useful in command-line programs where +//! you want to format dynamic output nicely so it looks good in a +//! terminal. A quick example: +//! +//! ``` +//! # #[cfg(feature = "smawk")] { +//! let text = "textwrap: a small library for wrapping text."; +//! assert_eq!(textwrap::wrap(text, 18), +//! vec!["textwrap: a", +//! "small library for", +//! "wrapping text."]); +//! # } +//! ``` +//! +//! The [`wrap`] function returns the individual lines, use [`fill`] +//! if you want the lines joined with `'\n'` to form a `String`. +//! +//! If you enable the `hyphenation` Cargo feature, you can get +//! 
automatic hyphenation for a number of languages: +//! +//! ``` +//! #[cfg(feature = "hyphenation")] { +//! use hyphenation::{Language, Load, Standard}; +//! use textwrap::{wrap, Options, WordSplitter}; +//! +//! let text = "textwrap: a small library for wrapping text."; +//! let dictionary = Standard::from_embedded(Language::EnglishUS).unwrap(); +//! let options = Options::new(18).word_splitter(WordSplitter::Hyphenation(dictionary)); +//! assert_eq!(wrap(text, &options), +//! vec!["textwrap: a small", +//! "library for wrap-", +//! "ping text."]); +//! } +//! ``` +//! +//! See also the [`unfill`] and [`refill`] functions which allow you to +//! manipulate already wrapped text. +//! +//! ## Wrapping Strings at Compile Time +//! +//! If your strings are known at compile time, please take a look at +//! the procedural macros from the [textwrap-macros] crate. +//! +//! ## Displayed Width vs Byte Size +//! +//! To word wrap text, one must know the width of each word so one can +//! know when to break lines. This library will by default measure the +//! width of text using the _displayed width_, not the size in bytes. +//! The `unicode-width` Cargo feature controls this. +//! +//! This is important for non-ASCII text. ASCII characters such as `a` +//! and `!` are simple and take up one column each. This means that +//! the displayed width is equal to the string length in bytes. +//! However, non-ASCII characters and symbols take up more than one +//! byte when UTF-8 encoded: `é` is `0xc3 0xa9` (two bytes) and `⚙` is +//! `0xe2 0x9a 0x99` (three bytes) in UTF-8, respectively. +//! +//! This is why we take care to use the displayed width instead of the +//! byte count when computing line lengths. All functions in this +//! library handle Unicode characters like this when the +//! `unicode-width` Cargo feature is enabled (it is enabled by +//! default). +//! +//! # Indentation and Dedentation +//! +//! The textwrap library also offers functions for adding a prefix to +//! 
every line of a string and to remove leading whitespace. As an +//! example, the [`indent`] function allows you to turn lines of text +//! into a bullet list: +//! +//! ``` +//! let before = "\ +//! foo +//! bar +//! baz +//! "; +//! let after = "\ +//! * foo +//! * bar +//! * baz +//! "; +//! assert_eq!(textwrap::indent(before, "* "), after); +//! ``` +//! +//! Removing leading whitespace is done with [`dedent`]: +//! +//! ``` +//! let before = " +//! Some +//! indented +//! text +//! "; +//! let after = " +//! Some +//! indented +//! text +//! "; +//! assert_eq!(textwrap::dedent(before), after); +//! ``` +//! +//! # Cargo Features +//! +//! The textwrap library can be slimmed down as needed via a number of +//! Cargo features. This means you only pay for the features you +//! actually use. +//! +//! The full dependency graph, where dashed lines indicate optional +//! dependencies, is shown below: +//! +//! <img src="https://raw.githubusercontent.com/mgeisler/textwrap/master/images/textwrap-0.15.2.svg"> +//! +//! ## Default Features +//! +//! These features are enabled by default: +//! +//! * `unicode-linebreak`: enables finding words using the +//! [unicode-linebreak] crate, which implements the line breaking +//! algorithm described in [Unicode Standard Annex +//! #14](https://www.unicode.org/reports/tr14/). +//! +//! This feature can be disabled if you are happy to find words +//! separated by ASCII space characters only. People wrapping text +//! with emojis or East-Asian characters will most likely want +//! to enable this feature. See [`WordSeparator`] for details. +//! +//! * `unicode-width`: enables correct width computation of non-ASCII +//! characters via the [unicode-width] crate. Without this feature, +//! every [`char`] is 1 column wide, except for emojis which are 2 +//! columns wide. See the [`core::display_width`] function for +//! details. +//! +//! This feature can be disabled if you only need to wrap ASCII +//! 
text, or if the functions in [`core`] are used directly with +//! [`core::Fragment`]s for which the widths have been computed in +//! other ways. +//! +//! * `smawk`: enables linear-time wrapping of the whole paragraph via +//! the [smawk] crate. See the [`wrap_algorithms::wrap_optimal_fit`] +//! function for details on the optimal-fit algorithm. +//! +//! This feature can be disabled if you only ever intend to use +//! [`wrap_algorithms::wrap_first_fit`]. +//! +//! With Rust 1.59.0, the size impact of the above features on your +//! binary is as follows: +//! +//! | Configuration | Binary Size | Delta | +//! | :--- | ---: | ---: | +//! | quick-and-dirty implementation | 289 KB | — KB | +//! | textwrap without default features | 301 KB | 12 KB | +//! | textwrap with smawk | 317 KB | 28 KB | +//! | textwrap with unicode-width | 313 KB | 24 KB | +//! | textwrap with unicode-linebreak | 395 KB | 106 KB | +//! +//! The above sizes are the stripped sizes and the binary is compiled +//! in release mode with this profile: +//! +//! ```toml +//! [profile.release] +//! lto = true +//! codegen-units = 1 +//! ``` +//! +//! See the [binary-sizes demo] if you want to reproduce these +//! results. +//! +//! ## Optional Features +//! +//! These Cargo features enable new functionality: +//! +//! * `terminal_size`: enables automatic detection of the terminal +//! width via the [terminal_size] crate. See the +//! [`Options::with_termwidth`] constructor for details. +//! +//! * `hyphenation`: enables language-sensitive hyphenation via the +//! [hyphenation] crate. See the [`word_splitters::WordSplitter`] +//! trait for details. +//! +//! [unicode-linebreak]: https://docs.rs/unicode-linebreak/ +//! [unicode-width]: https://docs.rs/unicode-width/ +//! [smawk]: https://docs.rs/smawk/ +//! [binary-sizes demo]: https://github.com/mgeisler/textwrap/tree/master/examples/binary-sizes +//! [textwrap-macros]: https://docs.rs/textwrap-macros/ +//! 
[terminal_size]: https://docs.rs/terminal_size/ +//! [hyphenation]: https://docs.rs/hyphenation/ + +#![doc(html_root_url = "https://docs.rs/textwrap/0.15.2")] +#![forbid(unsafe_code)] // See https://github.com/mgeisler/textwrap/issues/210 +#![deny(missing_docs)] +#![deny(missing_debug_implementations)] +#![allow(clippy::redundant_field_names)] + +// Make `cargo test` execute the README doctests. +#[cfg(doctest)] +#[doc = include_str!("../README.md")] +mod readme_doctest {} + +use std::borrow::Cow; + +mod indentation; +pub use crate::indentation::{dedent, indent}; + +mod word_separators; +pub use word_separators::WordSeparator; + +pub mod word_splitters; +pub use word_splitters::WordSplitter; + +pub mod wrap_algorithms; +pub use wrap_algorithms::WrapAlgorithm; + +pub mod core; + +#[cfg(feature = "unicode-linebreak")] +macro_rules! DefaultWordSeparator { + () => { + WordSeparator::UnicodeBreakProperties + }; +} + +#[cfg(not(feature = "unicode-linebreak"))] +macro_rules! DefaultWordSeparator { + () => { + WordSeparator::AsciiSpace + }; +} + +/// Holds configuration options for wrapping and filling text. +#[derive(Debug, Clone)] +pub struct Options<'a> { + /// The width in columns at which the text will be wrapped. + pub width: usize, + /// Indentation used for the first line of output. See the + /// [`Options::initial_indent`] method. + pub initial_indent: &'a str, + /// Indentation used for subsequent lines of output. See the + /// [`Options::subsequent_indent`] method. + pub subsequent_indent: &'a str, + /// Allow long words to be broken if they cannot fit on a line. + /// When set to `false`, some lines may be longer than + /// `self.width`. See the [`Options::break_words`] method. + pub break_words: bool, + /// Wrapping algorithm to use, see the implementations of the + /// [`wrap_algorithms::WrapAlgorithm`] trait for details. 
+ pub wrap_algorithm: WrapAlgorithm, + /// The line breaking algorithm to use, see + /// [`word_separators::WordSeparator`] trait for an overview and + /// possible implementations. + pub word_separator: WordSeparator, + /// The method for splitting words. This can be used to prohibit + /// splitting words on hyphens, or it can be used to implement + /// language-aware machine hyphenation. + pub word_splitter: WordSplitter, +} + +impl<'a> From<&'a Options<'a>> for Options<'a> { + fn from(options: &'a Options<'a>) -> Self { + Self { + width: options.width, + initial_indent: options.initial_indent, + subsequent_indent: options.subsequent_indent, + break_words: options.break_words, + word_separator: options.word_separator, + wrap_algorithm: options.wrap_algorithm, + word_splitter: options.word_splitter.clone(), + } + } +} + +impl<'a> From<usize> for Options<'a> { + fn from(width: usize) -> Self { + Options::new(width) + } +} + +impl<'a> Options<'a> { + /// Creates a new [`Options`] with the specified width. 
Equivalent to + /// + /// ``` + /// # use textwrap::{Options, WordSplitter, WordSeparator, WrapAlgorithm}; + /// # let width = 80; + /// # let actual = Options::new(width); + /// # let expected = + /// Options { + /// width: width, + /// initial_indent: "", + /// subsequent_indent: "", + /// break_words: true, + /// #[cfg(feature = "unicode-linebreak")] + /// word_separator: WordSeparator::UnicodeBreakProperties, + /// #[cfg(not(feature = "unicode-linebreak"))] + /// word_separator: WordSeparator::AsciiSpace, + /// #[cfg(feature = "smawk")] + /// wrap_algorithm: WrapAlgorithm::new_optimal_fit(), + /// #[cfg(not(feature = "smawk"))] + /// wrap_algorithm: WrapAlgorithm::FirstFit, + /// word_splitter: WordSplitter::HyphenSplitter, + /// } + /// # ; + /// # assert_eq!(actual.width, expected.width); + /// # assert_eq!(actual.initial_indent, expected.initial_indent); + /// # assert_eq!(actual.subsequent_indent, expected.subsequent_indent); + /// # assert_eq!(actual.break_words, expected.break_words); + /// # assert_eq!(actual.word_splitter, expected.word_splitter); + /// ``` + /// + /// Note that the default word separator and wrap algorithms + /// changes based on the available Cargo features. The best + /// available algorithms are used by default. + pub const fn new(width: usize) -> Self { + Options { + width, + initial_indent: "", + subsequent_indent: "", + break_words: true, + word_separator: DefaultWordSeparator!(), + wrap_algorithm: WrapAlgorithm::new(), + word_splitter: WordSplitter::HyphenSplitter, + } + } + + /// Creates a new [`Options`] with `width` set to the current + /// terminal width. If the terminal width cannot be determined + /// (typically because the standard input and output is not + /// connected to a terminal), a width of 80 characters will be + /// used. Other settings use the same defaults as + /// [`Options::new`]. 
+ /// + /// Equivalent to: + /// + /// ```no_run + /// use textwrap::{termwidth, Options}; + /// + /// let options = Options::new(termwidth()); + /// ``` + /// + /// **Note:** Only available when the `terminal_size` feature is + /// enabled. + #[cfg(feature = "terminal_size")] + pub fn with_termwidth() -> Self { + Self::new(termwidth()) + } +} + +impl<'a> Options<'a> { + /// Change [`self.initial_indent`]. The initial indentation is + /// used on the very first line of output. + /// + /// # Examples + /// + /// Classic paragraph indentation can be achieved by specifying an + /// initial indentation and wrapping each paragraph by itself: + /// + /// ``` + /// use textwrap::{wrap, Options}; + /// + /// let options = Options::new(16).initial_indent(" "); + /// assert_eq!(wrap("This is a little example.", options), + /// vec![" This is a", + /// "little example."]); + /// ``` + /// + /// [`self.initial_indent`]: #structfield.initial_indent + pub fn initial_indent(self, indent: &'a str) -> Self { + Options { + initial_indent: indent, + ..self + } + } + + /// Change [`self.subsequent_indent`]. The subsequent indentation + /// is used on lines following the first line of output. 
+ /// + /// # Examples + /// + /// Combining initial and subsequent indentation lets you format a + /// single paragraph as a bullet list: + /// + /// ``` + /// use textwrap::{wrap, Options}; + /// + /// let options = Options::new(12) + /// .initial_indent("* ") + /// .subsequent_indent(" "); + /// #[cfg(feature = "smawk")] + /// assert_eq!(wrap("This is a little example.", options), + /// vec!["* This is", + /// " a little", + /// " example."]); + /// + /// // Without the `smawk` feature, the wrapping is a little different: + /// #[cfg(not(feature = "smawk"))] + /// assert_eq!(wrap("This is a little example.", options), + /// vec!["* This is a", + /// " little", + /// " example."]); + /// ``` + /// + /// [`self.subsequent_indent`]: #structfield.subsequent_indent + pub fn subsequent_indent(self, indent: &'a str) -> Self { + Options { + subsequent_indent: indent, + ..self + } + } + + /// Change [`self.break_words`]. This controls if words longer + /// than `self.width` can be broken, or if they will be left + /// sticking out into the right margin. + /// + /// # Examples + /// + /// ``` + /// use textwrap::{wrap, Options}; + /// + /// let options = Options::new(4).break_words(true); + /// assert_eq!(wrap("This is a little example.", options), + /// vec!["This", + /// "is a", + /// "litt", + /// "le", + /// "exam", + /// "ple."]); + /// ``` + /// + /// [`self.break_words`]: #structfield.break_words + pub fn break_words(self, setting: bool) -> Self { + Options { + break_words: setting, + ..self + } + } + + /// Change [`self.word_separator`]. + /// + /// See [`word_separators::WordSeparator`] for details on the choices. 
+ /// + /// [`self.word_separator`]: #structfield.word_separator + pub fn word_separator(self, word_separator: WordSeparator) -> Options<'a> { + Options { + width: self.width, + initial_indent: self.initial_indent, + subsequent_indent: self.subsequent_indent, + break_words: self.break_words, + word_separator: word_separator, + wrap_algorithm: self.wrap_algorithm, + word_splitter: self.word_splitter, + } + } + + /// Change [`self.wrap_algorithm`]. + /// + /// See the [`wrap_algorithms::WrapAlgorithm`] trait for details on + /// the choices. + /// + /// [`self.wrap_algorithm`]: #structfield.wrap_algorithm + pub fn wrap_algorithm(self, wrap_algorithm: WrapAlgorithm) -> Options<'a> { + Options { + width: self.width, + initial_indent: self.initial_indent, + subsequent_indent: self.subsequent_indent, + break_words: self.break_words, + word_separator: self.word_separator, + wrap_algorithm: wrap_algorithm, + word_splitter: self.word_splitter, + } + } + + /// Change [`self.word_splitter`]. The + /// [`word_splitters::WordSplitter`] is used to fit part of a word + /// into the current line when wrapping text. + /// + /// # Examples + /// + /// ``` + /// use textwrap::{Options, WordSplitter}; + /// let opt = Options::new(80); + /// assert_eq!(opt.word_splitter, WordSplitter::HyphenSplitter); + /// let opt = opt.word_splitter(WordSplitter::NoHyphenation); + /// assert_eq!(opt.word_splitter, WordSplitter::NoHyphenation); + /// ``` + /// + /// [`self.word_splitter`]: #structfield.word_splitter + pub fn word_splitter(self, word_splitter: WordSplitter) -> Options<'a> { + Options { + width: self.width, + initial_indent: self.initial_indent, + subsequent_indent: self.subsequent_indent, + break_words: self.break_words, + word_separator: self.word_separator, + wrap_algorithm: self.wrap_algorithm, + word_splitter, + } + } +} + +/// Return the current terminal width. 
+/// +/// If the terminal width cannot be determined (typically because the +/// standard output is not connected to a terminal), a default width +/// of 80 characters will be used. +/// +/// # Examples +/// +/// Create an [`Options`] for wrapping at the current terminal width +/// with a two column margin to the left and the right: +/// +/// ```no_run +/// use textwrap::{termwidth, Options}; +/// +/// let width = termwidth() - 4; // Two columns on each side. +/// let options = Options::new(width) +/// .initial_indent(" ") +/// .subsequent_indent(" "); +/// ``` +/// +/// **Note:** Only available when the `terminal_size` Cargo feature is +/// enabled. +#[cfg(feature = "terminal_size")] +pub fn termwidth() -> usize { + terminal_size::terminal_size().map_or(80, |(terminal_size::Width(w), _)| w.into()) +} + +/// Fill a line of text at a given width. +/// +/// The result is a [`String`], complete with newlines between each +/// line. Use the [`wrap`] function if you need access to the +/// individual lines. +/// +/// The easiest way to use this function is to pass an integer for +/// `width_or_options`: +/// +/// ``` +/// use textwrap::fill; +/// +/// assert_eq!( +/// fill("Memory safety without garbage collection.", 15), +/// "Memory safety\nwithout garbage\ncollection." +/// ); +/// ``` +/// +/// If you need to customize the wrapping, you can pass an [`Options`] +/// instead of an `usize`: +/// +/// ``` +/// use textwrap::{fill, Options}; +/// +/// let options = Options::new(15) +/// .initial_indent("- ") +/// .subsequent_indent(" "); +/// assert_eq!( +/// fill("Memory safety without garbage collection.", &options), +/// "- Memory safety\n without\n garbage\n collection." +/// ); +/// ``` +pub fn fill<'a, Opt>(text: &str, width_or_options: Opt) -> String +where + Opt: Into<Options<'a>>, +{ + // This will avoid reallocation in simple cases (no + // indentation, no hyphenation). 
+ let mut result = String::with_capacity(text.len()); + + for (i, line) in wrap(text, width_or_options).iter().enumerate() { + if i > 0 { + result.push('\n'); + } + result.push_str(line); + } + + result +} + +/// Unpack a paragraph of already-wrapped text. +/// +/// This function attempts to recover the original text from a single +/// paragraph of text produced by the [`fill`] function. This means +/// that it turns +/// +/// ```text +/// textwrap: a small +/// library for +/// wrapping text. +/// ``` +/// +/// back into +/// +/// ```text +/// textwrap: a small library for wrapping text. +/// ``` +/// +/// In addition, it will recognize a common prefix among the lines. +/// The prefix of the first line is returned in +/// [`Options::initial_indent`] and the prefix (if any) of the the +/// other lines is returned in [`Options::subsequent_indent`]. +/// +/// In addition to `' '`, the prefixes can consist of characters used +/// for unordered lists (`'-'`, `'+'`, and `'*'`) and block quotes +/// (`'>'`) in Markdown as well as characters often used for inline +/// comments (`'#'` and `'/'`). +/// +/// The text must come from a single wrapped paragraph. This means +/// that there can be no `"\n\n"` within the text. +/// +/// # Examples +/// +/// ``` +/// use textwrap::unfill; +/// +/// let (text, options) = unfill("\ +/// * This is an +/// example of +/// a list item. 
+/// "); +/// +/// assert_eq!(text, "This is an example of a list item.\n"); +/// assert_eq!(options.initial_indent, "* "); +/// assert_eq!(options.subsequent_indent, " "); +/// ``` +pub fn unfill(text: &str) -> (String, Options<'_>) { + let trimmed = text.trim_end_matches('\n'); + let prefix_chars: &[_] = &[' ', '-', '+', '*', '>', '#', '/']; + + let mut options = Options::new(0); + for (idx, line) in trimmed.split('\n').enumerate() { + options.width = std::cmp::max(options.width, core::display_width(line)); + let without_prefix = line.trim_start_matches(prefix_chars); + let prefix = &line[..line.len() - without_prefix.len()]; + + if idx == 0 { + options.initial_indent = prefix; + } else if idx == 1 { + options.subsequent_indent = prefix; + } else if idx > 1 { + for ((idx, x), y) in prefix.char_indices().zip(options.subsequent_indent.chars()) { + if x != y { + options.subsequent_indent = &prefix[..idx]; + break; + } + } + if prefix.len() < options.subsequent_indent.len() { + options.subsequent_indent = prefix; + } + } + } + + let mut unfilled = String::with_capacity(text.len()); + for (idx, line) in trimmed.split('\n').enumerate() { + if idx == 0 { + unfilled.push_str(&line[options.initial_indent.len()..]); + } else { + unfilled.push(' '); + unfilled.push_str(&line[options.subsequent_indent.len()..]); + } + } + + unfilled.push_str(&text[trimmed.len()..]); + (unfilled, options) +} + +/// Refill a paragraph of wrapped text with a new width. +/// +/// This function will first use the [`unfill`] function to remove +/// newlines from the text. Afterwards the text is filled again using +/// the [`fill`] function. +/// +/// The `new_width_or_options` argument specify the new width and can +/// specify other options as well — except for +/// [`Options::initial_indent`] and [`Options::subsequent_indent`], +/// which are deduced from `filled_text`. +/// +/// # Examples +/// +/// ``` +/// use textwrap::refill; +/// +/// // Some loosely wrapped text. 
The "> " prefix is recognized automatically. +/// let text = "\ +/// > Memory +/// > safety without garbage +/// > collection. +/// "; +/// +/// assert_eq!(refill(text, 20), "\ +/// > Memory safety +/// > without garbage +/// > collection. +/// "); +/// +/// assert_eq!(refill(text, 40), "\ +/// > Memory safety without garbage +/// > collection. +/// "); +/// +/// assert_eq!(refill(text, 60), "\ +/// > Memory safety without garbage collection. +/// "); +/// ``` +/// +/// You can also reshape bullet points: +/// +/// ``` +/// use textwrap::refill; +/// +/// let text = "\ +/// - This is my +/// list item. +/// "; +/// +/// assert_eq!(refill(text, 20), "\ +/// - This is my list +/// item. +/// "); +/// ``` +pub fn refill<'a, Opt>(filled_text: &str, new_width_or_options: Opt) -> String +where + Opt: Into<Options<'a>>, +{ + let trimmed = filled_text.trim_end_matches('\n'); + let (text, options) = unfill(trimmed); + let mut new_options = new_width_or_options.into(); + new_options.initial_indent = options.initial_indent; + new_options.subsequent_indent = options.subsequent_indent; + let mut refilled = fill(&text, new_options); + refilled.push_str(&filled_text[trimmed.len()..]); + refilled +} + +/// Wrap a line of text at a given width. +/// +/// The result is a vector of lines, each line is of type [`Cow<'_, +/// str>`](Cow), which means that the line will borrow from the input +/// `&str` if possible. The lines do not have trailing whitespace, +/// including a final `'\n'`. Please use the [`fill`] function if you +/// need a [`String`] instead. 
+/// +/// The easiest way to use this function is to pass an integer for +/// `width_or_options`: +/// +/// ``` +/// use textwrap::wrap; +/// +/// let lines = wrap("Memory safety without garbage collection.", 15); +/// assert_eq!(lines, &[ +/// "Memory safety", +/// "without garbage", +/// "collection.", +/// ]); +/// ``` +/// +/// If you need to customize the wrapping, you can pass an [`Options`] +/// instead of an `usize`: +/// +/// ``` +/// use textwrap::{wrap, Options}; +/// +/// let options = Options::new(15) +/// .initial_indent("- ") +/// .subsequent_indent(" "); +/// let lines = wrap("Memory safety without garbage collection.", &options); +/// assert_eq!(lines, &[ +/// "- Memory safety", +/// " without", +/// " garbage", +/// " collection.", +/// ]); +/// ``` +/// +/// # Optimal-Fit Wrapping +/// +/// By default, `wrap` will try to ensure an even right margin by +/// finding breaks which avoid short lines. We call this an +/// “optimal-fit algorithm” since the line breaks are computed by +/// considering all possible line breaks. The alternative is a +/// “first-fit algorithm” which simply accumulates words until they no +/// longer fit on the line. +/// +/// As an example, using the first-fit algorithm to wrap the famous +/// Hamlet quote “To be, or not to be: that is the question” in a +/// narrow column with room for only 10 characters looks like this: +/// +/// ``` +/// # use textwrap::{WrapAlgorithm::FirstFit, Options, wrap}; +/// # +/// # let lines = wrap("To be, or not to be: that is the question", +/// # Options::new(10).wrap_algorithm(FirstFit)); +/// # assert_eq!(lines.join("\n") + "\n", "\ +/// To be, or +/// not to be: +/// that is +/// the +/// question +/// # "); +/// ``` +/// +/// Notice how the second to last line is quite narrow because +/// “question” was too large to fit? The greedy first-fit algorithm +/// doesn’t look ahead, so it has no other option than to put +/// “question” onto its own line. 
+/// +/// With the optimal-fit wrapping algorithm, the previous lines are +/// shortened slightly in order to make the word “is” go into the +/// second last line: +/// +/// ``` +/// # #[cfg(feature = "smawk")] { +/// # use textwrap::{Options, WrapAlgorithm, wrap}; +/// # +/// # let lines = wrap( +/// # "To be, or not to be: that is the question", +/// # Options::new(10).wrap_algorithm(WrapAlgorithm::new_optimal_fit()) +/// # ); +/// # assert_eq!(lines.join("\n") + "\n", "\ +/// To be, +/// or not to +/// be: that +/// is the +/// question +/// # "); } +/// ``` +/// +/// Please see [`WrapAlgorithm`] for details on the choices. +/// +/// # Examples +/// +/// The returned iterator yields lines of type `Cow<'_, str>`. If +/// possible, the wrapped lines will borrow from the input string. As +/// an example, a hanging indentation, the first line can borrow from +/// the input, but the subsequent lines become owned strings: +/// +/// ``` +/// use std::borrow::Cow::{Borrowed, Owned}; +/// use textwrap::{wrap, Options}; +/// +/// let options = Options::new(15).subsequent_indent("...."); +/// let lines = wrap("Wrapping text all day long.", &options); +/// let annotated = lines +/// .iter() +/// .map(|line| match line { +/// Borrowed(text) => format!("[Borrowed] {}", text), +/// Owned(text) => format!("[Owned] {}", text), +/// }) +/// .collect::<Vec<_>>(); +/// assert_eq!( +/// annotated, +/// &[ +/// "[Borrowed] Wrapping text", +/// "[Owned] ....all day", +/// "[Owned] ....long.", +/// ] +/// ); +/// ``` +/// +/// ## Leading and Trailing Whitespace +/// +/// As a rule, leading whitespace (indentation) is preserved and +/// trailing whitespace is discarded. +/// +/// In more details, when wrapping words into lines, words are found +/// by splitting the input text on space characters. 
One or more +/// spaces (shown here as “␣”) are attached to the end of each word: +/// +/// ```text +/// "Foo␣␣␣bar␣baz" -> ["Foo␣␣␣", "bar␣", "baz"] +/// ``` +/// +/// These words are then put into lines. The interword whitespace is +/// preserved, unless the lines are wrapped so that the `"Foo␣␣␣"` +/// word falls at the end of a line: +/// +/// ``` +/// use textwrap::wrap; +/// +/// assert_eq!(wrap("Foo bar baz", 10), vec!["Foo bar", "baz"]); +/// assert_eq!(wrap("Foo bar baz", 8), vec!["Foo", "bar baz"]); +/// ``` +/// +/// Notice how the trailing whitespace is removed in both case: in the +/// first example, `"bar␣"` becomes `"bar"` and in the second case +/// `"Foo␣␣␣"` becomes `"Foo"`. +/// +/// Leading whitespace is preserved when the following word fits on +/// the first line. To understand this, consider how words are found +/// in a text with leading spaces: +/// +/// ```text +/// "␣␣foo␣bar" -> ["␣␣", "foo␣", "bar"] +/// ``` +/// +/// When put into lines, the indentation is preserved if `"foo"` fits +/// on the first line, otherwise you end up with an empty line: +/// +/// ``` +/// use textwrap::wrap; +/// +/// assert_eq!(wrap(" foo bar", 8), vec![" foo", "bar"]); +/// assert_eq!(wrap(" foo bar", 4), vec!["", "foo", "bar"]); +/// ``` +pub fn wrap<'a, Opt>(text: &str, width_or_options: Opt) -> Vec<Cow<'_, str>> +where + Opt: Into<Options<'a>>, +{ + let options = width_or_options.into(); + + let initial_width = options + .width + .saturating_sub(core::display_width(options.initial_indent)); + let subsequent_width = options + .width + .saturating_sub(core::display_width(options.subsequent_indent)); + + let mut lines = Vec::new(); + for line in text.split('\n') { + let words = options.word_separator.find_words(line); + let split_words = word_splitters::split_words(words, &options.word_splitter); + let broken_words = if options.break_words { + let mut broken_words = core::break_words(split_words, subsequent_width); + if !options.initial_indent.is_empty() { + 
// Without this, the first word will always go into + // the first line. However, since we break words based + // on the _second_ line width, it can be wrong to + // unconditionally put the first word onto the first + // line. An empty zero-width word fixed this. + broken_words.insert(0, core::Word::from("")); + } + broken_words + } else { + split_words.collect::<Vec<_>>() + }; + + let line_widths = [initial_width, subsequent_width]; + let wrapped_words = options.wrap_algorithm.wrap(&broken_words, &line_widths); + + let mut idx = 0; + for words in wrapped_words { + let last_word = match words.last() { + None => { + lines.push(Cow::from("")); + continue; + } + Some(word) => word, + }; + + // We assume here that all words are contiguous in `line`. + // That is, the sum of their lengths should add up to the + // length of `line`. + let len = words + .iter() + .map(|word| word.len() + word.whitespace.len()) + .sum::<usize>() + - last_word.whitespace.len(); + + // The result is owned if we have indentation, otherwise + // we can simply borrow an empty string. + let mut result = if lines.is_empty() && !options.initial_indent.is_empty() { + Cow::Owned(options.initial_indent.to_owned()) + } else if !lines.is_empty() && !options.subsequent_indent.is_empty() { + Cow::Owned(options.subsequent_indent.to_owned()) + } else { + // We can use an empty string here since string + // concatenation for `Cow` preserves a borrowed value + // when either side is empty. + Cow::from("") + }; + + result += &line[idx..idx + len]; + + if !last_word.penalty.is_empty() { + result.to_mut().push_str(last_word.penalty); + } + + lines.push(result); + + // Advance by the length of `result`, plus the length of + // `last_word.whitespace` -- even if we had a penalty, we + // need to skip over the whitespace. + idx += len + last_word.whitespace.len(); + } + } + + lines +} + +/// Wrap text into columns with a given total width. 
+/// +/// The `left_gap`, `middle_gap` and `right_gap` arguments specify the +/// strings to insert before, between, and after the columns. The +/// total width of all columns and all gaps is specified using the +/// `total_width_or_options` argument. This argument can simply be an +/// integer if you want to use default settings when wrapping, or it +/// can be a [`Options`] value if you want to customize the wrapping. +/// +/// If the columns are narrow, it is recommended to set +/// [`Options::break_words`] to `true` to prevent words from +/// protruding into the margins. +/// +/// The per-column width is computed like this: +/// +/// ``` +/// # let (left_gap, middle_gap, right_gap) = ("", "", ""); +/// # let columns = 2; +/// # let options = textwrap::Options::new(80); +/// let inner_width = options.width +/// - textwrap::core::display_width(left_gap) +/// - textwrap::core::display_width(right_gap) +/// - textwrap::core::display_width(middle_gap) * (columns - 1); +/// let column_width = inner_width / columns; +/// ``` +/// +/// The `text` is wrapped using [`wrap`] and the given `options` +/// argument, but the width is overwritten to the computed +/// `column_width`. +/// +/// # Panics +/// +/// Panics if `columns` is zero. +/// +/// # Examples +/// +/// ``` +/// use textwrap::wrap_columns; +/// +/// let text = "\ +/// This is an example text, which is wrapped into three columns. \ +/// Notice how the final column can be shorter than the others."; +/// +/// #[cfg(feature = "smawk")] +/// assert_eq!(wrap_columns(text, 3, 50, "| ", " | ", " |"), +/// vec!["| This is | into three | column can be |", +/// "| an example | columns. | shorter than |", +/// "| text, which | Notice how | the others. 
|", +/// "| is wrapped | the final | |"]); +/// +/// // Without the `smawk` feature, the middle column is a little more uneven: +/// #[cfg(not(feature = "smawk"))] +/// assert_eq!(wrap_columns(text, 3, 50, "| ", " | ", " |"), +/// vec!["| This is an | three | column can be |", +/// "| example text, | columns. | shorter than |", +/// "| which is | Notice how | the others. |", +/// "| wrapped into | the final | |"]); +pub fn wrap_columns<'a, Opt>( + text: &str, + columns: usize, + total_width_or_options: Opt, + left_gap: &str, + middle_gap: &str, + right_gap: &str, +) -> Vec<String> +where + Opt: Into<Options<'a>>, +{ + assert!(columns > 0); + + let mut options = total_width_or_options.into(); + + let inner_width = options + .width + .saturating_sub(core::display_width(left_gap)) + .saturating_sub(core::display_width(right_gap)) + .saturating_sub(core::display_width(middle_gap) * (columns - 1)); + + let column_width = std::cmp::max(inner_width / columns, 1); + options.width = column_width; + let last_column_padding = " ".repeat(inner_width % column_width); + let wrapped_lines = wrap(text, options); + let lines_per_column = + wrapped_lines.len() / columns + usize::from(wrapped_lines.len() % columns > 0); + let mut lines = Vec::new(); + for line_no in 0..lines_per_column { + let mut line = String::from(left_gap); + for column_no in 0..columns { + match wrapped_lines.get(line_no + column_no * lines_per_column) { + Some(column_line) => { + line.push_str(column_line); + line.push_str(&" ".repeat(column_width - core::display_width(column_line))); + } + None => { + line.push_str(&" ".repeat(column_width)); + } + } + if column_no == columns - 1 { + line.push_str(&last_column_padding); + } else { + line.push_str(middle_gap); + } + } + line.push_str(right_gap); + lines.push(line); + } + + lines +} + +/// Fill `text` in-place without reallocating the input string. 
+/// +/// This function works by modifying the input string: some `' '` +/// characters will be replaced by `'\n'` characters. The rest of the +/// text remains untouched. +/// +/// Since we can only replace existing whitespace in the input with +/// `'\n'`, we cannot do hyphenation nor can we split words longer +/// than the line width. We also need to use `AsciiSpace` as the word +/// separator since we need `' '` characters between words in order to +/// replace some of them with a `'\n'`. Indentation is also ruled out. +/// In other words, `fill_inplace(width)` behaves as if you had called +/// [`fill`] with these options: +/// +/// ``` +/// # use textwrap::{core, Options, WordSplitter, WordSeparator, WrapAlgorithm}; +/// # let width = 80; +/// Options { +/// width: width, +/// initial_indent: "", +/// subsequent_indent: "", +/// break_words: false, +/// word_separator: WordSeparator::AsciiSpace, +/// wrap_algorithm: WrapAlgorithm::FirstFit, +/// word_splitter: WordSplitter::NoHyphenation, +/// }; +/// ``` +/// +/// The wrap algorithm is [`WrapAlgorithm::FirstFit`] since this +/// is the fastest algorithm — and the main reason to use +/// `fill_inplace` is to get the string broken into newlines as fast +/// as possible. +/// +/// A last difference is that (unlike [`fill`]) `fill_inplace` can +/// leave trailing whitespace on lines. This is because we wrap by +/// inserting a `'\n'` at the final whitespace in the input string: +/// +/// ``` +/// let mut text = String::from("Hello World!"); +/// textwrap::fill_inplace(&mut text, 10); +/// assert_eq!(text, "Hello \nWorld!"); +/// ``` +/// +/// If we didn't do this, the word `World!` would end up being +/// indented. You can avoid this if you make sure that your input text +/// has no double spaces. +/// +/// # Performance +/// +/// In benchmarks, `fill_inplace` is about twice as fast as [`fill`]. 
+/// Please see the [`linear` +/// benchmark](https://github.com/mgeisler/textwrap/blob/master/benches/linear.rs) +/// for details. +pub fn fill_inplace(text: &mut String, width: usize) { + let mut indices = Vec::new(); + + let mut offset = 0; + for line in text.split('\n') { + let words = WordSeparator::AsciiSpace + .find_words(line) + .collect::<Vec<_>>(); + let wrapped_words = wrap_algorithms::wrap_first_fit(&words, &[width as f64]); + + let mut line_offset = offset; + for words in &wrapped_words[..wrapped_words.len() - 1] { + let line_len = words + .iter() + .map(|word| word.len() + word.whitespace.len()) + .sum::<usize>(); + + line_offset += line_len; + // We've advanced past all ' ' characters -- want to move + // one ' ' backwards and insert our '\n' there. + indices.push(line_offset - 1); + } + + // Advance past entire line, plus the '\n' which was removed + // by the split call above. + offset += line.len() + 1; + } + + let mut bytes = std::mem::take(text).into_bytes(); + for idx in indices { + bytes[idx] = b'\n'; + } + *text = String::from_utf8(bytes).unwrap(); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[cfg(feature = "hyphenation")] + use hyphenation::{Language, Load, Standard}; + + #[test] + fn options_agree_with_usize() { + let opt_usize = Options::from(42_usize); + let opt_options = Options::new(42); + + assert_eq!(opt_usize.width, opt_options.width); + assert_eq!(opt_usize.initial_indent, opt_options.initial_indent); + assert_eq!(opt_usize.subsequent_indent, opt_options.subsequent_indent); + assert_eq!(opt_usize.break_words, opt_options.break_words); + assert_eq!( + opt_usize.word_splitter.split_points("hello-world"), + opt_options.word_splitter.split_points("hello-world") + ); + } + + #[test] + fn no_wrap() { + assert_eq!(wrap("foo", 10), vec!["foo"]); + } + + #[test] + fn wrap_simple() { + assert_eq!(wrap("foo bar baz", 5), vec!["foo", "bar", "baz"]); + } + + #[test] + fn to_be_or_not() { + assert_eq!( + wrap( + "To be, or not to be, that 
is the question.", + Options::new(10).wrap_algorithm(WrapAlgorithm::FirstFit) + ), + vec!["To be, or", "not to be,", "that is", "the", "question."] + ); + } + + #[test] + fn multiple_words_on_first_line() { + assert_eq!(wrap("foo bar baz", 10), vec!["foo bar", "baz"]); + } + + #[test] + fn long_word() { + assert_eq!(wrap("foo", 0), vec!["f", "o", "o"]); + } + + #[test] + fn long_words() { + assert_eq!(wrap("foo bar", 0), vec!["f", "o", "o", "b", "a", "r"]); + } + + #[test] + fn max_width() { + assert_eq!(wrap("foo bar", usize::MAX), vec!["foo bar"]); + + let text = "Hello there! This is some English text. \ + It should not be wrapped given the extents below."; + assert_eq!(wrap(text, usize::MAX), vec![text]); + } + + #[test] + fn leading_whitespace() { + assert_eq!(wrap(" foo bar", 6), vec![" foo", "bar"]); + } + + #[test] + fn leading_whitespace_empty_first_line() { + // If there is no space for the first word, the first line + // will be empty. This is because the string is split into + // words like [" ", "foobar ", "baz"], which puts "foobar " on + // the second line. We never output trailing whitespace + assert_eq!(wrap(" foobar baz", 6), vec!["", "foobar", "baz"]); + } + + #[test] + fn trailing_whitespace() { + // Whitespace is only significant inside a line. After a line + // gets too long and is broken, the first word starts in + // column zero and is not indented. + assert_eq!(wrap("foo bar baz ", 5), vec!["foo", "bar", "baz"]); + } + + #[test] + fn issue_99() { + // We did not reset the in_whitespace flag correctly and did + // not handle single-character words after a line break. + assert_eq!( + wrap("aaabbbccc x yyyzzzwww", 9), + vec!["aaabbbccc", "x", "yyyzzzwww"] + ); + } + + #[test] + fn issue_129() { + // The dash is an em-dash which takes up four bytes. We used + // to panic since we tried to index into the character. 
+ let options = Options::new(1).word_separator(WordSeparator::AsciiSpace); + assert_eq!(wrap("x – x", options), vec!["x", "–", "x"]); + } + + #[test] + fn wide_character_handling() { + assert_eq!(wrap("Hello, World!", 15), vec!["Hello, World!"]); + assert_eq!( + wrap( + "Hello, World!", + Options::new(15).word_separator(WordSeparator::AsciiSpace) + ), + vec!["Hello,", "World!"] + ); + + // Wide characters are allowed to break if the + // unicode-linebreak feature is enabled. + #[cfg(feature = "unicode-linebreak")] + assert_eq!( + wrap( + "Hello, World!", + Options::new(15).word_separator(WordSeparator::UnicodeBreakProperties) + ), + vec!["Hello, W", "orld!"] + ); + } + + #[test] + fn empty_line_is_indented() { + // Previously, indentation was not applied to empty lines. + // However, this is somewhat inconsistent and undesirable if + // the indentation is something like a border ("| ") which you + // want to apply to all lines, empty or not. + let options = Options::new(10).initial_indent("!!!"); + assert_eq!(fill("", &options), "!!!"); + } + + #[test] + fn indent_single_line() { + let options = Options::new(10).initial_indent(">>>"); // No trailing space + assert_eq!(fill("foo", &options), ">>>foo"); + } + + #[test] + fn indent_first_emoji() { + let options = Options::new(10).initial_indent("👉👉"); + assert_eq!( + wrap("x x x x x x x x x x x x x", &options), + vec!["👉👉x x x", "x x x x x", "x x x x x"] + ); + } + + #[test] + fn indent_multiple_lines() { + let options = Options::new(6).initial_indent("* ").subsequent_indent(" "); + assert_eq!( + wrap("foo bar baz", &options), + vec!["* foo", " bar", " baz"] + ); + } + + #[test] + fn indent_break_words() { + let options = Options::new(5).initial_indent("* ").subsequent_indent(" "); + assert_eq!(wrap("foobarbaz", &options), vec!["* foo", " bar", " baz"]); + } + + #[test] + fn initial_indent_break_words() { + // This is a corner-case showing how the long word is broken + // according to the width of the subsequent 
lines. The first + // fragment of the word no longer fits on the first line, + // which ends up being pure indentation. + let options = Options::new(5).initial_indent("-->"); + assert_eq!(wrap("foobarbaz", &options), vec!["-->", "fooba", "rbaz"]); + } + + #[test] + fn hyphens() { + assert_eq!(wrap("foo-bar", 5), vec!["foo-", "bar"]); + } + + #[test] + fn trailing_hyphen() { + let options = Options::new(5).break_words(false); + assert_eq!(wrap("foobar-", &options), vec!["foobar-"]); + } + + #[test] + fn multiple_hyphens() { + assert_eq!(wrap("foo-bar-baz", 5), vec!["foo-", "bar-", "baz"]); + } + + #[test] + fn hyphens_flag() { + let options = Options::new(5).break_words(false); + assert_eq!( + wrap("The --foo-bar flag.", &options), + vec!["The", "--foo-", "bar", "flag."] + ); + } + + #[test] + fn repeated_hyphens() { + let options = Options::new(4).break_words(false); + assert_eq!(wrap("foo--bar", &options), vec!["foo--bar"]); + } + + #[test] + fn hyphens_alphanumeric() { + assert_eq!(wrap("Na2-CH4", 5), vec!["Na2-", "CH4"]); + } + + #[test] + fn hyphens_non_alphanumeric() { + let options = Options::new(5).break_words(false); + assert_eq!(wrap("foo(-)bar", &options), vec!["foo(-)bar"]); + } + + #[test] + fn multiple_splits() { + assert_eq!(wrap("foo-bar-baz", 9), vec!["foo-bar-", "baz"]); + } + + #[test] + fn forced_split() { + let options = Options::new(5).break_words(false); + assert_eq!(wrap("foobar-baz", &options), vec!["foobar-", "baz"]); + } + + #[test] + fn multiple_unbroken_words_issue_193() { + let options = Options::new(3).break_words(false); + assert_eq!( + wrap("small large tiny", &options), + vec!["small", "large", "tiny"] + ); + assert_eq!( + wrap("small large tiny", &options), + vec!["small", "large", "tiny"] + ); + } + + #[test] + fn very_narrow_lines_issue_193() { + let options = Options::new(1).break_words(false); + assert_eq!(wrap("fooo x y", &options), vec!["fooo", "x", "y"]); + assert_eq!(wrap("fooo x y", &options), vec!["fooo", "x", "y"]); + } 
+ + #[test] + fn simple_hyphens() { + let options = Options::new(8).word_splitter(WordSplitter::HyphenSplitter); + assert_eq!(wrap("foo bar-baz", &options), vec!["foo bar-", "baz"]); + } + + #[test] + fn no_hyphenation() { + let options = Options::new(8).word_splitter(WordSplitter::NoHyphenation); + assert_eq!(wrap("foo bar-baz", &options), vec!["foo", "bar-baz"]); + } + + #[test] + #[cfg(feature = "hyphenation")] + fn auto_hyphenation_double_hyphenation() { + let dictionary = Standard::from_embedded(Language::EnglishUS).unwrap(); + let options = Options::new(10); + assert_eq!( + wrap("Internationalization", &options), + vec!["Internatio", "nalization"] + ); + + let options = Options::new(10).word_splitter(WordSplitter::Hyphenation(dictionary)); + assert_eq!( + wrap("Internationalization", &options), + vec!["Interna-", "tionaliza-", "tion"] + ); + } + + #[test] + #[cfg(feature = "hyphenation")] + fn auto_hyphenation_issue_158() { + let dictionary = Standard::from_embedded(Language::EnglishUS).unwrap(); + let options = Options::new(10); + assert_eq!( + wrap("participation is the key to success", &options), + vec!["participat", "ion is", "the key to", "success"] + ); + + let options = Options::new(10).word_splitter(WordSplitter::Hyphenation(dictionary)); + assert_eq!( + wrap("participation is the key to success", &options), + vec!["partici-", "pation is", "the key to", "success"] + ); + } + + #[test] + #[cfg(feature = "hyphenation")] + fn split_len_hyphenation() { + // Test that hyphenation takes the width of the whitespace + // into account. + let dictionary = Standard::from_embedded(Language::EnglishUS).unwrap(); + let options = Options::new(15).word_splitter(WordSplitter::Hyphenation(dictionary)); + assert_eq!( + wrap("garbage collection", &options), + vec!["garbage col-", "lection"] + ); + } + + #[test] + #[cfg(feature = "hyphenation")] + fn borrowed_lines() { + // Lines that end with an extra hyphen are owned, the final + // line is borrowed. 
+ use std::borrow::Cow::{Borrowed, Owned}; + let dictionary = Standard::from_embedded(Language::EnglishUS).unwrap(); + let options = Options::new(10).word_splitter(WordSplitter::Hyphenation(dictionary)); + let lines = wrap("Internationalization", &options); + assert_eq!(lines, vec!["Interna-", "tionaliza-", "tion"]); + if let Borrowed(s) = lines[0] { + assert!(false, "should not have been borrowed: {:?}", s); + } + if let Borrowed(s) = lines[1] { + assert!(false, "should not have been borrowed: {:?}", s); + } + if let Owned(ref s) = lines[2] { + assert!(false, "should not have been owned: {:?}", s); + } + } + + #[test] + #[cfg(feature = "hyphenation")] + fn auto_hyphenation_with_hyphen() { + let dictionary = Standard::from_embedded(Language::EnglishUS).unwrap(); + let options = Options::new(8).break_words(false); + assert_eq!( + wrap("over-caffinated", &options), + vec!["over-", "caffinated"] + ); + + let options = options.word_splitter(WordSplitter::Hyphenation(dictionary)); + assert_eq!( + wrap("over-caffinated", &options), + vec!["over-", "caffi-", "nated"] + ); + } + + #[test] + fn break_words() { + assert_eq!(wrap("foobarbaz", 3), vec!["foo", "bar", "baz"]); + } + + #[test] + fn break_words_wide_characters() { + // Even the poor man's version of `ch_width` counts these + // characters as wide. 
+ let options = Options::new(5).word_separator(WordSeparator::AsciiSpace); + assert_eq!(wrap("Hello", options), vec!["He", "ll", "o"]); + } + + #[test] + fn break_words_zero_width() { + assert_eq!(wrap("foobar", 0), vec!["f", "o", "o", "b", "a", "r"]); + } + + #[test] + fn break_long_first_word() { + assert_eq!(wrap("testx y", 4), vec!["test", "x y"]); + } + + #[test] + fn break_words_line_breaks() { + assert_eq!(fill("ab\ncdefghijkl", 5), "ab\ncdefg\nhijkl"); + assert_eq!(fill("abcdefgh\nijkl", 5), "abcde\nfgh\nijkl"); + } + + #[test] + fn break_words_empty_lines() { + assert_eq!( + fill("foo\nbar", &Options::new(2).break_words(false)), + "foo\nbar" + ); + } + + #[test] + fn preserve_line_breaks() { + assert_eq!(fill("", 80), ""); + assert_eq!(fill("\n", 80), "\n"); + assert_eq!(fill("\n\n\n", 80), "\n\n\n"); + assert_eq!(fill("test\n", 80), "test\n"); + assert_eq!(fill("test\n\na\n\n", 80), "test\n\na\n\n"); + assert_eq!( + fill( + "1 3 5 7\n1 3 5 7", + Options::new(7).wrap_algorithm(WrapAlgorithm::FirstFit) + ), + "1 3 5 7\n1 3 5 7" + ); + assert_eq!( + fill( + "1 3 5 7\n1 3 5 7", + Options::new(5).wrap_algorithm(WrapAlgorithm::FirstFit) + ), + "1 3 5\n7\n1 3 5\n7" + ); + } + + #[test] + fn preserve_line_breaks_with_whitespace() { + assert_eq!(fill(" ", 80), ""); + assert_eq!(fill(" \n ", 80), "\n"); + assert_eq!(fill(" \n \n \n ", 80), "\n\n\n"); + } + + #[test] + fn non_breaking_space() { + let options = Options::new(5).break_words(false); + assert_eq!(fill("foo bar baz", &options), "foo bar baz"); + } + + #[test] + fn non_breaking_hyphen() { + let options = Options::new(5).break_words(false); + assert_eq!(fill("foo‑bar‑baz", &options), "foo‑bar‑baz"); + } + + #[test] + fn fill_simple() { + assert_eq!(fill("foo bar baz", 10), "foo bar\nbaz"); + } + + #[test] + fn fill_colored_text() { + // The words are much longer than 6 bytes, but they remain + // intact after filling the text. 
+ let green_hello = "\u{1b}[0m\u{1b}[32mHello\u{1b}[0m"; + let blue_world = "\u{1b}[0m\u{1b}[34mWorld!\u{1b}[0m"; + assert_eq!( + fill(&(String::from(green_hello) + " " + &blue_world), 6), + String::from(green_hello) + "\n" + &blue_world + ); + } + + #[test] + fn fill_unicode_boundary() { + // https://github.com/mgeisler/textwrap/issues/390 + fill("\u{1b}!Ͽ", 10); + } + + #[test] + fn fill_inplace_empty() { + let mut text = String::from(""); + fill_inplace(&mut text, 80); + assert_eq!(text, ""); + } + + #[test] + fn fill_inplace_simple() { + let mut text = String::from("foo bar baz"); + fill_inplace(&mut text, 10); + assert_eq!(text, "foo bar\nbaz"); + } + + #[test] + fn fill_inplace_multiple_lines() { + let mut text = String::from("Some text to wrap over multiple lines"); + fill_inplace(&mut text, 12); + assert_eq!(text, "Some text to\nwrap over\nmultiple\nlines"); + } + + #[test] + fn fill_inplace_long_word() { + let mut text = String::from("Internationalization is hard"); + fill_inplace(&mut text, 10); + assert_eq!(text, "Internationalization\nis hard"); + } + + #[test] + fn fill_inplace_no_hyphen_splitting() { + let mut text = String::from("A well-chosen example"); + fill_inplace(&mut text, 10); + assert_eq!(text, "A\nwell-chosen\nexample"); + } + + #[test] + fn fill_inplace_newlines() { + let mut text = String::from("foo bar\n\nbaz\n\n\n"); + fill_inplace(&mut text, 10); + assert_eq!(text, "foo bar\n\nbaz\n\n\n"); + } + + #[test] + fn fill_inplace_newlines_reset_line_width() { + let mut text = String::from("1 3 5\n1 3 5 7 9\n1 3 5 7 9 1 3"); + fill_inplace(&mut text, 10); + assert_eq!(text, "1 3 5\n1 3 5 7 9\n1 3 5 7 9\n1 3"); + } + + #[test] + fn fill_inplace_leading_whitespace() { + let mut text = String::from(" foo bar baz"); + fill_inplace(&mut text, 10); + assert_eq!(text, " foo bar\nbaz"); + } + + #[test] + fn fill_inplace_trailing_whitespace() { + let mut text = String::from("foo bar baz "); + fill_inplace(&mut text, 10); + assert_eq!(text, "foo 
bar\nbaz "); + } + + #[test] + fn fill_inplace_interior_whitespace() { + // To avoid an unwanted indentation of "baz", it is important + // to replace the final ' ' with '\n'. + let mut text = String::from("foo bar baz"); + fill_inplace(&mut text, 10); + assert_eq!(text, "foo bar \nbaz"); + } + + #[test] + fn unfill_simple() { + let (text, options) = unfill("foo\nbar"); + assert_eq!(text, "foo bar"); + assert_eq!(options.width, 3); + } + + #[test] + fn unfill_trailing_newlines() { + let (text, options) = unfill("foo\nbar\n\n\n"); + assert_eq!(text, "foo bar\n\n\n"); + assert_eq!(options.width, 3); + } + + #[test] + fn unfill_initial_indent() { + let (text, options) = unfill(" foo\nbar\nbaz"); + assert_eq!(text, "foo bar baz"); + assert_eq!(options.width, 5); + assert_eq!(options.initial_indent, " "); + } + + #[test] + fn unfill_differing_indents() { + let (text, options) = unfill(" foo\n bar\n baz"); + assert_eq!(text, "foo bar baz"); + assert_eq!(options.width, 7); + assert_eq!(options.initial_indent, " "); + assert_eq!(options.subsequent_indent, " "); + } + + #[test] + fn unfill_list_item() { + let (text, options) = unfill("* foo\n bar\n baz"); + assert_eq!(text, "foo bar baz"); + assert_eq!(options.width, 5); + assert_eq!(options.initial_indent, "* "); + assert_eq!(options.subsequent_indent, " "); + } + + #[test] + fn unfill_multiple_char_prefix() { + let (text, options) = unfill(" // foo bar\n // baz\n // quux"); + assert_eq!(text, "foo bar baz quux"); + assert_eq!(options.width, 14); + assert_eq!(options.initial_indent, " // "); + assert_eq!(options.subsequent_indent, " // "); + } + + #[test] + fn unfill_block_quote() { + let (text, options) = unfill("> foo\n> bar\n> baz"); + assert_eq!(text, "foo bar baz"); + assert_eq!(options.width, 5); + assert_eq!(options.initial_indent, "> "); + assert_eq!(options.subsequent_indent, "> "); + } + + #[test] + fn unfill_whitespace() { + assert_eq!(unfill("foo bar").0, "foo bar"); + } + + #[test] + fn 
wrap_columns_empty_text() { + assert_eq!(wrap_columns("", 1, 10, "| ", "", " |"), vec!["| |"]); + } + + #[test] + fn wrap_columns_single_column() { + assert_eq!( + wrap_columns("Foo", 3, 30, "| ", " | ", " |"), + vec!["| Foo | | |"] + ); + } + + #[test] + fn wrap_columns_uneven_columns() { + // The gaps take up a total of 5 columns, so the columns are + // (21 - 5)/4 = 4 columns wide: + assert_eq!( + wrap_columns("Foo Bar Baz Quux", 4, 21, "|", "|", "|"), + vec!["|Foo |Bar |Baz |Quux|"] + ); + // As the total width increases, the last column absorbs the + // excess width: + assert_eq!( + wrap_columns("Foo Bar Baz Quux", 4, 24, "|", "|", "|"), + vec!["|Foo |Bar |Baz |Quux |"] + ); + // Finally, when the width is 25, the columns can be resized + // to a width of (25 - 5)/4 = 5 columns: + assert_eq!( + wrap_columns("Foo Bar Baz Quux", 4, 25, "|", "|", "|"), + vec!["|Foo |Bar |Baz |Quux |"] + ); + } + + #[test] + #[cfg(feature = "unicode-width")] + fn wrap_columns_with_emojis() { + assert_eq!( + wrap_columns( + "Words and a few emojis 😍 wrapped in ⓶ columns", + 2, + 30, + "✨ ", + " ⚽ ", + " 👀" + ), + vec![ + "✨ Words ⚽ wrapped in 👀", + "✨ and a few ⚽ ⓶ columns 👀", + "✨ emojis 😍 ⚽ 👀" + ] + ); + } + + #[test] + fn wrap_columns_big_gaps() { + // The column width shrinks to 1 because the gaps take up all + // the space. + assert_eq!( + wrap_columns("xyz", 2, 10, "----> ", " !!! ", " <----"), + vec![ + "----> x !!! z <----", // + "----> y !!! <----" + ] + ); + } + + #[test] + #[should_panic] + fn wrap_columns_panic_with_zero_columns() { + wrap_columns("", 0, 10, "", "", ""); + } +} diff --git a/third_party/rust/textwrap/src/word_separators.rs b/third_party/rust/textwrap/src/word_separators.rs new file mode 100644 index 0000000000..25adf31b12 --- /dev/null +++ b/third_party/rust/textwrap/src/word_separators.rs @@ -0,0 +1,428 @@ +//! Functionality for finding words. +//! +//! In order to wrap text, we need to know where the legal break +//! 
points are, i.e., where the words of the text are. This means that +//! we need to define what a "word" is. +//! +//! A simple approach is to simply split the text on whitespace, but +//! this does not work for East-Asian languages such as Chinese or +//! Japanese where there are no spaces between words. Breaking a long +//! sequence of emojis is another example where line breaks might be +//! wanted even if there is no whitespace to be found. +//! +//! The [`WordSeparator`] enum is responsible for determining where +//! the words are in a line of text. Please refer to the enum and +//! its variants for more information. + +#[cfg(feature = "unicode-linebreak")] +use crate::core::skip_ansi_escape_sequence; +use crate::core::Word; + +/// Describes where words occur in a line of text. +/// +/// The simplest approach is to say that words are separated by one or +/// more ASCII spaces (`' '`). This works for Western languages +/// without emojis. A more complex approach is to use the Unicode line +/// breaking algorithm, which finds break points in non-ASCII text. +/// +/// The line breaks occur between words, please see +/// [`WordSplitter`](crate::WordSplitter) for options of how to handle +/// hyphenation of individual words. +/// +/// # Examples +/// +/// ``` +/// use textwrap::core::Word; +/// use textwrap::WordSeparator::AsciiSpace; +/// +/// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>(); +/// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]); +/// ``` +#[derive(Clone, Copy)] +pub enum WordSeparator { + /// Find words by splitting on runs of `' '` characters. 
+ /// + /// # Examples + /// + /// ``` + /// use textwrap::core::Word; + /// use textwrap::WordSeparator::AsciiSpace; + /// + /// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>(); + /// assert_eq!(words, vec![Word::from("Hello "), + /// Word::from("World!")]); + /// ``` + AsciiSpace, + + /// Split `line` into words using Unicode break properties. + /// + /// This word separator uses the Unicode line breaking algorithm + /// described in [Unicode Standard Annex + /// #14](https://www.unicode.org/reports/tr14/) to find legal places + /// to break lines. There is a small difference in that the U+002D + /// (Hyphen-Minus) and U+00AD (Soft Hyphen) don’t create a line break: + /// to allow a line break at a hyphen, use + /// [`WordSplitter::HyphenSplitter`](crate::WordSplitter::HyphenSplitter). + /// Soft hyphens are not currently supported. + /// + /// # Examples + /// + /// Unlike [`WordSeparator::AsciiSpace`], the Unicode line + /// breaking algorithm will find line break opportunities between + /// some characters with no intervening whitespace: + /// + /// ``` + /// #[cfg(feature = "unicode-linebreak")] { + /// use textwrap::core::Word; + /// use textwrap::WordSeparator::UnicodeBreakProperties; + /// + /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂😍").collect::<Vec<_>>(), + /// vec![Word::from("Emojis: "), + /// Word::from("😂"), + /// Word::from("😍")]); + /// + /// assert_eq!(UnicodeBreakProperties.find_words("CJK: 你好").collect::<Vec<_>>(), + /// vec![Word::from("CJK: "), + /// Word::from("你"), + /// Word::from("好")]); + /// } + /// ``` + /// + /// A U+2060 (Word Joiner) character can be inserted if you want to + /// manually override the defaults and keep the characters together: + /// + /// ``` + /// #[cfg(feature = "unicode-linebreak")] { + /// use textwrap::core::Word; + /// use textwrap::WordSeparator::UnicodeBreakProperties; + /// + /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂\u{2060}😍").collect::<Vec<_>>(), + 
/// vec![Word::from("Emojis: "), + /// Word::from("😂\u{2060}😍")]); + /// } + /// ``` + /// + /// The Unicode line breaking algorithm will also automatically + /// suppress line breaks around certain punctuation characters: + /// + /// ``` + /// #[cfg(feature = "unicode-linebreak")] { + /// use textwrap::core::Word; + /// use textwrap::WordSeparator::UnicodeBreakProperties; + /// + /// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(), + /// vec![Word::from("[ foo ] "), + /// Word::from("bar !")]); + /// } + /// ``` + #[cfg(feature = "unicode-linebreak")] + UnicodeBreakProperties, + + /// Find words using a custom word separator. + Custom(fn(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_>), +} + +impl std::fmt::Debug for WordSeparator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + WordSeparator::AsciiSpace => f.write_str("AsciiSpace"), + #[cfg(feature = "unicode-linebreak")] + WordSeparator::UnicodeBreakProperties => f.write_str("UnicodeBreakProperties"), + WordSeparator::Custom(_) => f.write_str("Custom(...)"), + } + } +} + +impl WordSeparator { + // This function should really return impl Iterator<Item = Word>, but + // this isn't possible until Rust supports higher-kinded types: + // https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md + /// Find all words in `line`. 
+ pub fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> { + match self { + WordSeparator::AsciiSpace => find_words_ascii_space(line), + #[cfg(feature = "unicode-linebreak")] + WordSeparator::UnicodeBreakProperties => find_words_unicode_break_properties(line), + WordSeparator::Custom(func) => func(line), + } + } +} + +fn find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> { + let mut start = 0; + let mut in_whitespace = false; + let mut char_indices = line.char_indices(); + + Box::new(std::iter::from_fn(move || { + // for (idx, ch) in char_indices does not work, gives this + // error: + // + // > cannot move out of `char_indices`, a captured variable in + // > an `FnMut` closure + #[allow(clippy::while_let_on_iterator)] + while let Some((idx, ch)) = char_indices.next() { + if in_whitespace && ch != ' ' { + let word = Word::from(&line[start..idx]); + start = idx; + in_whitespace = ch == ' '; + return Some(word); + } + + in_whitespace = ch == ' '; + } + + if start < line.len() { + let word = Word::from(&line[start..]); + start = line.len(); + return Some(word); + } + + None + })) +} + +// Strip all ANSI escape sequences from `text`. +#[cfg(feature = "unicode-linebreak")] +fn strip_ansi_escape_sequences(text: &str) -> String { + let mut result = String::with_capacity(text.len()); + + let mut chars = text.chars(); + while let Some(ch) = chars.next() { + if skip_ansi_escape_sequence(ch, &mut chars) { + continue; + } + result.push(ch); + } + + result +} + +/// Soft hyphen, also known as a “shy hyphen”. Should show up as ‘-’ +/// if a line is broken at this point, and otherwise be invisible. +/// Textwrap does not currently support breaking words at soft +/// hyphens. +#[cfg(feature = "unicode-linebreak")] +const SHY: char = '\u{00ad}'; + +/// Find words in line. ANSI escape sequences are ignored in `line`. 
+#[cfg(feature = "unicode-linebreak")] +fn find_words_unicode_break_properties<'a>( + line: &'a str, +) -> Box<dyn Iterator<Item = Word<'a>> + 'a> { + // Construct an iterator over (original index, stripped index) + // tuples. We find the Unicode linebreaks on a stripped string, + // but we need the original indices so we can form words based on + // the original string. + let mut last_stripped_idx = 0; + let mut char_indices = line.char_indices(); + let mut idx_map = std::iter::from_fn(move || match char_indices.next() { + Some((orig_idx, ch)) => { + let stripped_idx = last_stripped_idx; + if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) { + last_stripped_idx += ch.len_utf8(); + } + Some((orig_idx, stripped_idx)) + } + None => None, + }); + + let stripped = strip_ansi_escape_sequences(line); + let mut opportunities = unicode_linebreak::linebreaks(&stripped) + .filter(|(idx, _)| { + #[allow(clippy::match_like_matches_macro)] + match &stripped[..*idx].chars().next_back() { + // We suppress breaks at ‘-’ since we want to control + // this via the WordSplitter. + Some('-') => false, + // Soft hyphens are currently not supported since we + // require all `Word` fragments to be continuous in + // the input string. + Some(SHY) => false, + // Other breaks should be fine! + _ => true, + } + }) + .collect::<Vec<_>>() + .into_iter(); + + // Remove final break opportunity, we will add it below using + // &line[start..]; This ensures that we correctly include a + // trailing ANSI escape sequence. 
+ opportunities.next_back(); + + let mut start = 0; + Box::new(std::iter::from_fn(move || { + #[allow(clippy::while_let_on_iterator)] + while let Some((idx, _)) = opportunities.next() { + if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx) { + let word = Word::from(&line[start..orig_idx]); + start = orig_idx; + return Some(word); + } + } + + if start < line.len() { + let word = Word::from(&line[start..]); + start = line.len(); + return Some(word); + } + + None + })) +} + +#[cfg(test)] +mod tests { + use super::WordSeparator::*; + use super::*; + + // Like assert_eq!, but the left expression is an iterator. + macro_rules! assert_iter_eq { + ($left:expr, $right:expr) => { + assert_eq!($left.collect::<Vec<_>>(), $right); + }; + } + + fn to_words<'a>(words: Vec<&'a str>) -> Vec<Word<'a>> { + words.into_iter().map(|w: &str| Word::from(&w)).collect() + } + + macro_rules! test_find_words { + ($ascii_name:ident, + $unicode_name:ident, + $([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => { + #[test] + fn $ascii_name() { + $( + let expected_words = to_words($ascii_words.to_vec()); + let actual_words = WordSeparator::AsciiSpace + .find_words($line) + .collect::<Vec<_>>(); + assert_eq!(actual_words, expected_words, "Line: {:?}", $line); + )+ + } + + #[test] + #[cfg(feature = "unicode-linebreak")] + fn $unicode_name() { + $( + let expected_words = to_words($unicode_words.to_vec()); + let actual_words = WordSeparator::UnicodeBreakProperties + .find_words($line) + .collect::<Vec<_>>(); + assert_eq!(actual_words, expected_words, "Line: {:?}", $line); + )+ + } + }; + } + + test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]); + + test_find_words!( + ascii_single_word, + unicode_single_word, + ["foo", ["foo"], ["foo"]] + ); + + test_find_words!( + ascii_two_words, + unicode_two_words, + ["foo bar", ["foo ", "bar"], ["foo ", "bar"]] + ); + + test_find_words!( + ascii_multiple_words, + unicode_multiple_words, + ["foo bar", ["foo 
", "bar"], ["foo ", "bar"]], + ["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]] + ); + + test_find_words!( + ascii_only_whitespace, + unicode_only_whitespace, + [" ", [" "], [" "]], + [" ", [" "], [" "]] + ); + + test_find_words!( + ascii_inter_word_whitespace, + unicode_inter_word_whitespace, + ["foo bar", ["foo ", "bar"], ["foo ", "bar"]] + ); + + test_find_words!( + ascii_trailing_whitespace, + unicode_trailing_whitespace, + ["foo ", ["foo "], ["foo "]] + ); + + test_find_words!( + ascii_leading_whitespace, + unicode_leading_whitespace, + [" foo", [" ", "foo"], [" ", "foo"]] + ); + + test_find_words!( + ascii_multi_column_char, + unicode_multi_column_char, + ["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] // cowboy emoji 🤠 + ); + + test_find_words!( + ascii_hyphens, + unicode_hyphens, + ["foo-bar", ["foo-bar"], ["foo-bar"]], + ["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]], + ["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]], + ["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]] + ); + + test_find_words!( + ascii_newline, + unicode_newline, + ["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]] + ); + + test_find_words!( + ascii_tab, + unicode_tab, + ["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]] + ); + + test_find_words!( + ascii_non_breaking_space, + unicode_non_breaking_space, + ["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]] + ); + + #[test] + #[cfg(unix)] + fn find_words_colored_text() { + use termion::color::{Blue, Fg, Green, Reset}; + + let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset)); + let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset)); + assert_iter_eq!( + AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)), + vec![Word::from(&green_hello), Word::from(&blue_world)] + ); + + #[cfg(feature = "unicode-linebreak")] + assert_iter_eq!( + UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)), + vec![Word::from(&green_hello), Word::from(&blue_world)] + ); + } + + #[test] + fn 
find_words_color_inside_word() { + let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz"; + assert_iter_eq!(AsciiSpace.find_words(&text), vec![Word::from(text)]); + + #[cfg(feature = "unicode-linebreak")] + assert_iter_eq!( + UnicodeBreakProperties.find_words(&text), + vec![Word::from(text)] + ); + } +} diff --git a/third_party/rust/textwrap/src/word_splitters.rs b/third_party/rust/textwrap/src/word_splitters.rs new file mode 100644 index 0000000000..69e246f0b8 --- /dev/null +++ b/third_party/rust/textwrap/src/word_splitters.rs @@ -0,0 +1,314 @@ +//! Word splitting functionality. +//! +//! To wrap text into lines, long words sometimes need to be split +//! across lines. The [`WordSplitter`] enum defines this +//! functionality. + +use crate::core::{display_width, Word}; + +/// The `WordSplitter` enum describes where words can be split. +/// +/// If the textwrap crate has been compiled with the `hyphenation` +/// Cargo feature enabled, you will find a +/// [`WordSplitter::Hyphenation`] variant. Use this variant for +/// language-aware hyphenation: +/// +/// ``` +/// #[cfg(feature = "hyphenation")] { +/// use hyphenation::{Language, Load, Standard}; +/// use textwrap::{wrap, Options, WordSplitter}; +/// +/// let text = "Oxidation is the loss of electrons."; +/// let dictionary = Standard::from_embedded(Language::EnglishUS).unwrap(); +/// let options = Options::new(8).word_splitter(WordSplitter::Hyphenation(dictionary)); +/// assert_eq!(wrap(text, &options), vec!["Oxida-", +/// "tion is", +/// "the loss", +/// "of elec-", +/// "trons."]); +/// } +/// ``` +/// +/// Please see the documentation for the [hyphenation] crate for more +/// details. 
+/// +/// [hyphenation]: https://docs.rs/hyphenation/ +#[derive(Clone)] +pub enum WordSplitter { + /// Use this as an [`Options.word_splitter`] to avoid any kind of + /// hyphenation: + /// + /// ``` + /// use textwrap::{wrap, Options, WordSplitter}; + /// + /// let options = Options::new(8).word_splitter(WordSplitter::NoHyphenation); + /// assert_eq!(wrap("foo bar-baz", &options), + /// vec!["foo", "bar-baz"]); + /// ``` + /// + /// [`Options.word_splitter`]: super::Options::word_splitter + NoHyphenation, + + /// `HyphenSplitter` is the default `WordSplitter` used by + /// [`Options::new`](super::Options::new). It will split words on + /// existing hyphens in the word. + /// + /// It will only use hyphens that are surrounded by alphanumeric + /// characters, which prevents a word like `"--foo-bar"` from + /// being split into `"--"` and `"foo-bar"`. + /// + /// # Examples + /// + /// ``` + /// use textwrap::WordSplitter; + /// + /// assert_eq!(WordSplitter::HyphenSplitter.split_points("--foo-bar"), + /// vec![6]); + /// ``` + HyphenSplitter, + + /// Use a custom function as the word splitter. + /// + /// This variant lets you implement a custom word splitter using + /// your own function. + /// + /// # Examples + /// + /// ``` + /// use textwrap::WordSplitter; + /// + /// fn split_at_underscore(word: &str) -> Vec<usize> { + /// word.match_indices('_').map(|(idx, _)| idx + 1).collect() + /// } + /// + /// let word_splitter = WordSplitter::Custom(split_at_underscore); + /// assert_eq!(word_splitter.split_points("a_long_identifier"), + /// vec![2, 7]); + /// ``` + Custom(fn(word: &str) -> Vec<usize>), + + /// A hyphenation dictionary can be used to do language-specific + /// hyphenation using patterns from the [hyphenation] crate. + /// + /// **Note:** Only available when the `hyphenation` Cargo feature is + /// enabled. 
+ /// + /// [hyphenation]: https://docs.rs/hyphenation/ + #[cfg(feature = "hyphenation")] + Hyphenation(hyphenation::Standard), +} + +impl std::fmt::Debug for WordSplitter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + WordSplitter::NoHyphenation => f.write_str("NoHyphenation"), + WordSplitter::HyphenSplitter => f.write_str("HyphenSplitter"), + WordSplitter::Custom(_) => f.write_str("Custom(...)"), + #[cfg(feature = "hyphenation")] + WordSplitter::Hyphenation(dict) => write!(f, "Hyphenation({})", dict.language()), + } + } +} + +impl PartialEq<WordSplitter> for WordSplitter { + fn eq(&self, other: &WordSplitter) -> bool { + match (self, other) { + (WordSplitter::NoHyphenation, WordSplitter::NoHyphenation) => true, + (WordSplitter::HyphenSplitter, WordSplitter::HyphenSplitter) => true, + #[cfg(feature = "hyphenation")] + (WordSplitter::Hyphenation(this_dict), WordSplitter::Hyphenation(other_dict)) => { + this_dict.language() == other_dict.language() + } + (_, _) => false, + } + } +} + +impl WordSplitter { + /// Return all possible indices where `word` can be split. + /// + /// The indices are in the range `0..word.len()`. They point to + /// the index _after_ the split point, i.e., after `-` if + /// splitting on hyphens. This way, `word.split_at(idx)` will + /// break the word into two well-formed pieces. 
+ /// + /// # Examples + /// + /// ``` + /// use textwrap::WordSplitter; + /// assert_eq!(WordSplitter::NoHyphenation.split_points("cannot-be-split"), vec![]); + /// assert_eq!(WordSplitter::HyphenSplitter.split_points("can-be-split"), vec![4, 7]); + /// assert_eq!(WordSplitter::Custom(|word| vec![word.len()/2]).split_points("middle"), vec![3]); + /// ``` + pub fn split_points(&self, word: &str) -> Vec<usize> { + match self { + WordSplitter::NoHyphenation => Vec::new(), + WordSplitter::HyphenSplitter => { + let mut splits = Vec::new(); + + for (idx, _) in word.match_indices('-') { + // We only use hyphens that are surrounded by alphanumeric + // characters. This is to avoid splitting on repeated hyphens, + // such as those found in --foo-bar. + let prev = word[..idx].chars().next_back(); + let next = word[idx + 1..].chars().next(); + + if prev.filter(|ch| ch.is_alphanumeric()).is_some() + && next.filter(|ch| ch.is_alphanumeric()).is_some() + { + splits.push(idx + 1); // +1 due to width of '-'. + } + } + + splits + } + WordSplitter::Custom(splitter_func) => splitter_func(word), + #[cfg(feature = "hyphenation")] + WordSplitter::Hyphenation(dictionary) => { + use hyphenation::Hyphenator; + dictionary.hyphenate(word).breaks + } + } + } +} + +/// Split words into smaller words according to the split points given +/// by `word_splitter`. +/// +/// Note that we split all words, regardless of their length. This is +/// to more cleanly separate the business of splitting (including +/// automatic hyphenation) from the business of word wrapping. 
+pub fn split_words<'a, I>( + words: I, + word_splitter: &'a WordSplitter, +) -> impl Iterator<Item = Word<'a>> +where + I: IntoIterator<Item = Word<'a>>, +{ + words.into_iter().flat_map(move |word| { + let mut prev = 0; + let mut split_points = word_splitter.split_points(&word).into_iter(); + std::iter::from_fn(move || { + if let Some(idx) = split_points.next() { + let need_hyphen = !word[..idx].ends_with('-'); + let w = Word { + word: &word.word[prev..idx], + width: display_width(&word[prev..idx]), + whitespace: "", + penalty: if need_hyphen { "-" } else { "" }, + }; + prev = idx; + return Some(w); + } + + if prev < word.word.len() || prev == 0 { + let w = Word { + word: &word.word[prev..], + width: display_width(&word[prev..]), + whitespace: word.whitespace, + penalty: word.penalty, + }; + prev = word.word.len() + 1; + return Some(w); + } + + None + }) + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + // Like assert_eq!, but the left expression is an iterator. + macro_rules! assert_iter_eq { + ($left:expr, $right:expr) => { + assert_eq!($left.collect::<Vec<_>>(), $right); + }; + } + + #[test] + fn split_words_no_words() { + assert_iter_eq!(split_words(vec![], &WordSplitter::HyphenSplitter), vec![]); + } + + #[test] + fn split_words_empty_word() { + assert_iter_eq!( + split_words(vec![Word::from(" ")], &WordSplitter::HyphenSplitter), + vec![Word::from(" ")] + ); + } + + #[test] + fn split_words_single_word() { + assert_iter_eq!( + split_words(vec![Word::from("foobar")], &WordSplitter::HyphenSplitter), + vec![Word::from("foobar")] + ); + } + + #[test] + fn split_words_hyphen_splitter() { + assert_iter_eq!( + split_words(vec![Word::from("foo-bar")], &WordSplitter::HyphenSplitter), + vec![Word::from("foo-"), Word::from("bar")] + ); + } + + #[test] + fn split_words_no_hyphenation() { + assert_iter_eq!( + split_words(vec![Word::from("foo-bar")], &WordSplitter::NoHyphenation), + vec![Word::from("foo-bar")] + ); + } + + #[test] + fn split_words_adds_penalty() { + 
let fixed_split_point = |_: &str| vec![3]; + + assert_iter_eq!( + split_words( + vec![Word::from("foobar")].into_iter(), + &WordSplitter::Custom(fixed_split_point) + ), + vec![ + Word { + word: "foo", + width: 3, + whitespace: "", + penalty: "-" + }, + Word { + word: "bar", + width: 3, + whitespace: "", + penalty: "" + } + ] + ); + + assert_iter_eq!( + split_words( + vec![Word::from("fo-bar")].into_iter(), + &WordSplitter::Custom(fixed_split_point) + ), + vec![ + Word { + word: "fo-", + width: 3, + whitespace: "", + penalty: "" + }, + Word { + word: "bar", + width: 3, + whitespace: "", + penalty: "" + } + ] + ); + } +} diff --git a/third_party/rust/textwrap/src/wrap_algorithms.rs b/third_party/rust/textwrap/src/wrap_algorithms.rs new file mode 100644 index 0000000000..5ca49c3352 --- /dev/null +++ b/third_party/rust/textwrap/src/wrap_algorithms.rs @@ -0,0 +1,381 @@ +//! Word wrapping algorithms. +//! +//! After a text has been broken into words (or [`Fragment`]s), one +//! now has to decide how to break the fragments into lines. The +//! simplest algorithm for this is implemented by [`wrap_first_fit`]: +//! it uses no look-ahead and simply adds fragments to the line as +//! long as they fit. However, this can lead to poor line breaks if a +//! large fragment almost-but-not-quite fits on a line. When that +//! happens, the fragment is moved to the next line and it will leave +//! behind a large gap. A more advanced algorithm, implemented by +//! [`wrap_optimal_fit`], will take this into account. The optimal-fit +//! algorithm considers all possible line breaks and will attempt to +//! minimize the gaps left behind by overly short lines. +//! +//! While both algorithms run in linear time, the first-fit algorithm +//! is about 4 times faster than the optimal-fit algorithm. 
+ +#[cfg(feature = "smawk")] +mod optimal_fit; +#[cfg(feature = "smawk")] +pub use optimal_fit::{wrap_optimal_fit, OverflowError, Penalties}; + +use crate::core::{Fragment, Word}; + +/// Describes how to wrap words into lines. +/// +/// The simplest approach is to wrap words one word at a time and +/// accept the first way of wrapping which fit +/// ([`WrapAlgorithm::FirstFit`]). If the `smawk` Cargo feature is +/// enabled, a more complex algorithm is available which will look at +/// an entire paragraph at a time in order to find optimal line breaks +/// ([`WrapAlgorithm::OptimalFit`]). +#[derive(Clone, Copy)] +pub enum WrapAlgorithm { + /// Wrap words using a fast and simple algorithm. + /// + /// This algorithm uses no look-ahead when finding line breaks. + /// Implemented by [`wrap_first_fit`], please see that function for + /// details and examples. + FirstFit, + + /// Wrap words using an advanced algorithm with look-ahead. + /// + /// This wrapping algorithm considers the entire paragraph to find + /// optimal line breaks. When wrapping text, "penalties" are + /// assigned to line breaks based on the gaps left at the end of + /// lines. See [`Penalties`] for details. + /// + /// The underlying wrapping algorithm is implemented by + /// [`wrap_optimal_fit`], please see that function for examples. + /// + /// **Note:** Only available when the `smawk` Cargo feature is + /// enabled. + #[cfg(feature = "smawk")] + OptimalFit(Penalties), + + /// Custom wrapping function. + /// + /// Use this if you want to implement your own wrapping algorithm. + /// The function can freely decide how to turn a slice of + /// [`Word`]s into lines. 
+ /// + /// # Example + /// + /// ``` + /// use textwrap::core::Word; + /// use textwrap::{wrap, Options, WrapAlgorithm}; + /// + /// fn stair<'a, 'b>(words: &'b [Word<'a>], _: &'b [usize]) -> Vec<&'b [Word<'a>]> { + /// let mut lines = Vec::new(); + /// let mut step = 1; + /// let mut start_idx = 0; + /// while start_idx + step <= words.len() { + /// lines.push(&words[start_idx .. start_idx+step]); + /// start_idx += step; + /// step += 1; + /// } + /// lines + /// } + /// + /// let options = Options::new(10).wrap_algorithm(WrapAlgorithm::Custom(stair)); + /// assert_eq!(wrap("First, second, third, fourth, fifth, sixth", options), + /// vec!["First,", + /// "second, third,", + /// "fourth, fifth, sixth"]); + /// ``` + Custom(for<'a, 'b> fn(words: &'b [Word<'a>], line_widths: &'b [usize]) -> Vec<&'b [Word<'a>]>), +} + +impl std::fmt::Debug for WrapAlgorithm { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + WrapAlgorithm::FirstFit => f.write_str("FirstFit"), + #[cfg(feature = "smawk")] + WrapAlgorithm::OptimalFit(penalties) => write!(f, "OptimalFit({:?})", penalties), + WrapAlgorithm::Custom(_) => f.write_str("Custom(...)"), + } + } +} + +impl WrapAlgorithm { + /// Create new wrap algorithm. + /// + /// The best wrapping algorithm is used by default, i.e., + /// [`WrapAlgorithm::OptimalFit`] if available, otherwise + /// [`WrapAlgorithm::FirstFit`]. + pub const fn new() -> Self { + #[cfg(not(feature = "smawk"))] + { + WrapAlgorithm::FirstFit + } + + #[cfg(feature = "smawk")] + { + WrapAlgorithm::new_optimal_fit() + } + } + + /// New [`WrapAlgorithm::OptimalFit`] with default penalties. This + /// works well for monospace text. + /// + /// **Note:** Only available when the `smawk` Cargo feature is + /// enabled. + #[cfg(feature = "smawk")] + pub const fn new_optimal_fit() -> Self { + WrapAlgorithm::OptimalFit(Penalties::new()) + } + + /// Wrap words according to line widths. 
+ /// + /// The `line_widths` slice gives the target line width for each + /// line (the last slice element is repeated as necessary). This + /// can be used to implement hanging indentation. + #[inline] + pub fn wrap<'a, 'b>( + &self, + words: &'b [Word<'a>], + line_widths: &'b [usize], + ) -> Vec<&'b [Word<'a>]> { + // Every integer up to 2u64.pow(f64::MANTISSA_DIGITS) = 2**53 + // = 9_007_199_254_740_992 can be represented without loss by + // a f64. Larger line widths will be rounded to the nearest + // representable number. + let f64_line_widths = line_widths.iter().map(|w| *w as f64).collect::<Vec<_>>(); + + match self { + WrapAlgorithm::FirstFit => wrap_first_fit(words, &f64_line_widths), + + #[cfg(feature = "smawk")] + WrapAlgorithm::OptimalFit(penalties) => { + // The computation cannot overflow when the line + // widths are restricted to usize. + wrap_optimal_fit(words, &f64_line_widths, penalties).unwrap() + } + + WrapAlgorithm::Custom(func) => func(words, line_widths), + } + } +} + +impl Default for WrapAlgorithm { + fn default() -> Self { + WrapAlgorithm::new() + } +} + +/// Wrap abstract fragments into lines with a first-fit algorithm. +/// +/// The `line_widths` slice gives the target line width for each line +/// (the last slice element is repeated as necessary). This can be +/// used to implement hanging indentation. +/// +/// The fragments must already have been split into the desired +/// widths, this function will not (and cannot) attempt to split them +/// further when arranging them into lines. +/// +/// # First-Fit Algorithm +/// +/// This implements a simple “greedy” algorithm: accumulate fragments +/// one by one and when a fragment no longer fits, start a new line. +/// There is no look-ahead, we simply take the first fit of the fragments +/// we find.
+/// +/// While fast and predictable, this algorithm can produce poor line +/// breaks when a long fragment is moved to a new line, leaving behind +/// a large gap: +/// +/// ``` +/// use textwrap::core::Word; +/// use textwrap::wrap_algorithms::wrap_first_fit; +/// use textwrap::WordSeparator; +/// +/// // Helper to convert wrapped lines to a Vec<String>. +/// fn lines_to_strings(lines: Vec<&[Word<'_>]>) -> Vec<String> { +/// lines.iter().map(|line| { +/// line.iter().map(|word| &**word).collect::<Vec<_>>().join(" ") +/// }).collect::<Vec<_>>() +/// } +/// +/// let text = "These few words will unfortunately not wrap nicely."; +/// let words = WordSeparator::AsciiSpace.find_words(text).collect::<Vec<_>>(); +/// assert_eq!(lines_to_strings(wrap_first_fit(&words, &[15.0])), +/// vec!["These few words", +/// "will", // <-- short line +/// "unfortunately", +/// "not wrap", +/// "nicely."]); +/// +/// // We can avoid the short line if we look ahead: +/// #[cfg(feature = "smawk")] +/// use textwrap::wrap_algorithms::{wrap_optimal_fit, Penalties}; +/// #[cfg(feature = "smawk")] +/// assert_eq!(lines_to_strings(wrap_optimal_fit(&words, &[15.0], &Penalties::new()).unwrap()), +/// vec!["These few", +/// "words will", +/// "unfortunately", +/// "not wrap", +/// "nicely."]); +/// ``` +/// +/// The [`wrap_optimal_fit`] function was used above to get better +/// line breaks. It uses an advanced algorithm which tries to avoid +/// short lines. This function is about 4 times faster than +/// [`wrap_optimal_fit`]. +/// +/// # Examples +/// +/// Imagine you're building a house site and you have a number of +/// tasks you need to execute. Things like pour foundation, complete +/// framing, install plumbing, electric cabling, install insulation. +/// +/// The construction workers can only work during daytime, so they +/// need to pack up everything at night. 
Because they need to secure +/// their tools and move machines back to the garage, this process +/// takes much more time than the time it would take them to simply +/// switch to another task. +/// +/// You would like to make a list of tasks to execute every day based +/// on your estimates. You can model this with a program like this: +/// +/// ``` +/// use textwrap::core::{Fragment, Word}; +/// use textwrap::wrap_algorithms::wrap_first_fit; +/// +/// #[derive(Debug)] +/// struct Task<'a> { +/// name: &'a str, +/// hours: f64, // Time needed to complete task. +/// sweep: f64, // Time needed for a quick sweep after task during the day. +/// cleanup: f64, // Time needed for full cleanup if day ends with this task. +/// } +/// +/// impl Fragment for Task<'_> { +/// fn width(&self) -> f64 { self.hours } +/// fn whitespace_width(&self) -> f64 { self.sweep } +/// fn penalty_width(&self) -> f64 { self.cleanup } +/// } +/// +/// // The morning tasks +/// let tasks = vec![ +/// Task { name: "Foundation", hours: 4.0, sweep: 2.0, cleanup: 3.0 }, +/// Task { name: "Framing", hours: 3.0, sweep: 1.0, cleanup: 2.0 }, +/// Task { name: "Plumbing", hours: 2.0, sweep: 2.0, cleanup: 2.0 }, +/// Task { name: "Electrical", hours: 2.0, sweep: 1.0, cleanup: 2.0 }, +/// Task { name: "Insulation", hours: 2.0, sweep: 1.0, cleanup: 2.0 }, +/// Task { name: "Drywall", hours: 3.0, sweep: 1.0, cleanup: 2.0 }, +/// Task { name: "Floors", hours: 3.0, sweep: 1.0, cleanup: 2.0 }, +/// Task { name: "Countertops", hours: 1.0, sweep: 1.0, cleanup: 2.0 }, +/// Task { name: "Bathrooms", hours: 2.0, sweep: 1.0, cleanup: 2.0 }, +/// ]; +/// +/// // Fill tasks into days, taking `day_length` into account. The +/// // output shows the hours worked per day along with the names of +/// // the tasks for that day. +/// fn assign_days<'a>(tasks: &[Task<'a>], day_length: f64) -> Vec<(f64, Vec<&'a str>)> { +/// let mut days = Vec::new(); +/// // Assign tasks to days. 
The assignment is a vector of slices, +/// // with a slice per day. +/// let assigned_days: Vec<&[Task<'a>]> = wrap_first_fit(&tasks, &[day_length]); +/// for day in assigned_days.iter() { +/// let last = day.last().unwrap(); +/// let work_hours: f64 = day.iter().map(|t| t.hours + t.sweep).sum(); +/// let names = day.iter().map(|t| t.name).collect::<Vec<_>>(); +/// days.push((work_hours - last.sweep + last.cleanup, names)); +/// } +/// days +/// } +/// +/// // With a single crew working 8 hours a day: +/// assert_eq!( +/// assign_days(&tasks, 8.0), +/// [ +/// (7.0, vec!["Foundation"]), +/// (8.0, vec!["Framing", "Plumbing"]), +/// (7.0, vec!["Electrical", "Insulation"]), +/// (5.0, vec!["Drywall"]), +/// (7.0, vec!["Floors", "Countertops"]), +/// (4.0, vec!["Bathrooms"]), +/// ] +/// ); +/// +/// // With two crews working in shifts, 16 hours a day: +/// assert_eq!( +/// assign_days(&tasks, 16.0), +/// [ +/// (14.0, vec!["Foundation", "Framing", "Plumbing"]), +/// (15.0, vec!["Electrical", "Insulation", "Drywall", "Floors"]), +/// (6.0, vec!["Countertops", "Bathrooms"]), +/// ] +/// ); +/// ``` +/// +/// Apologies to anyone who actually knows how to build a house and +/// knows how long each step takes :-) +pub fn wrap_first_fit<'a, 'b, T: Fragment>( + fragments: &'a [T], + line_widths: &'b [f64], +) -> Vec<&'a [T]> { + // The final line width is used for all remaining lines. 
+ let default_line_width = line_widths.last().copied().unwrap_or(0.0); + let mut lines = Vec::new(); + let mut start = 0; + let mut width = 0.0; + + for (idx, fragment) in fragments.iter().enumerate() { + let line_width = line_widths + .get(lines.len()) + .copied() + .unwrap_or(default_line_width); + if width + fragment.width() + fragment.penalty_width() > line_width && idx > start { + lines.push(&fragments[start..idx]); + start = idx; + width = 0.0; + } + width += fragment.width() + fragment.whitespace_width(); + } + lines.push(&fragments[start..]); + lines +} + +#[cfg(test)] +mod tests { + use super::*; + + #[derive(Debug, PartialEq)] + struct Word(f64); + + #[rustfmt::skip] + impl Fragment for Word { + fn width(&self) -> f64 { self.0 } + fn whitespace_width(&self) -> f64 { 1.0 } + fn penalty_width(&self) -> f64 { 0.0 } + } + + #[test] + fn wrap_string_longer_than_f64() { + let words = vec![ + Word(1e307), + Word(2e307), + Word(3e307), + Word(4e307), + Word(5e307), + Word(6e307), + ]; + // Wrap at just under f64::MAX (~19e307). The tiny + // whitespace_widths disappear because of loss of precision. + assert_eq!( + wrap_first_fit(&words, &[15e307]), + &[ + vec![ + Word(1e307), + Word(2e307), + Word(3e307), + Word(4e307), + Word(5e307) + ], + vec![Word(6e307)] + ] + ); + } +} diff --git a/third_party/rust/textwrap/src/wrap_algorithms/optimal_fit.rs b/third_party/rust/textwrap/src/wrap_algorithms/optimal_fit.rs new file mode 100644 index 0000000000..0625e28851 --- /dev/null +++ b/third_party/rust/textwrap/src/wrap_algorithms/optimal_fit.rs @@ -0,0 +1,433 @@ +use std::cell::RefCell; + +use crate::core::Fragment; + +/// Penalties for +/// [`WrapAlgorithm::OptimalFit`](crate::WrapAlgorithm::OptimalFit) +/// and [`wrap_optimal_fit`]. +/// +/// This wrapping algorithm in [`wrap_optimal_fit`] considers the +/// entire paragraph to find optimal line breaks. When wrapping text, +/// "penalties" are assigned to line breaks based on the gaps left at +/// the end of lines. 
The penalties are given by this struct, with +/// [`Penalties::default`] assigning penalties that work well for +/// monospace text. +/// +/// If you are wrapping proportional text, you are advised to assign +/// your own penalties according to your font size. See the individual +/// penalties below for details. +/// +/// **Note:** Only available when the `smawk` Cargo feature is +/// enabled. +#[derive(Clone, Copy, Debug)] +pub struct Penalties { + /// Per-line penalty. This is added for every line, which makes it + /// expensive to output more lines than the minimum required. + pub nline_penalty: usize, + + /// Per-character cost for lines that overflow the target line width. + /// + /// With a default value of 50², every single character costs as + /// much as leaving a gap of 50 characters behind. This is because + /// we assign a cost of `gap * gap` to a short line. When + /// wrapping monospace text, we can overflow the line by 1 + /// character in extreme cases: + /// + /// ``` + /// use textwrap::core::Word; + /// use textwrap::wrap_algorithms::{wrap_optimal_fit, Penalties}; + /// + /// let short = "foo "; + /// let long = "x".repeat(50); + /// let length = (short.len() + long.len()) as f64; + /// let fragments = vec![Word::from(short), Word::from(&long)]; + /// let penalties = Penalties::new(); + /// + /// // Perfect fit, both words are on a single line with no overflow. + /// let wrapped = wrap_optimal_fit(&fragments, &[length], &penalties).unwrap(); + /// assert_eq!(wrapped, vec![&[Word::from(short), Word::from(&long)]]); + /// + /// // The words no longer fit, yet we get a single line back. While + /// // the cost of overflow (`1 * 2500`) is the same as the cost of the + /// // gap (`50 * 50 = 2500`), the tie is broken by `nline_penalty` + /// // which makes it cheaper to overflow than to use two lines.
+ /// let wrapped = wrap_optimal_fit(&fragments, &[length - 1.0], &penalties).unwrap(); + /// assert_eq!(wrapped, vec![&[Word::from(short), Word::from(&long)]]); + /// + /// // The cost of overflow would be 2 * 2500, whereas the cost of + /// // the gap is only `49 * 49 + nline_penalty = 2401 + 1000 = + /// // 3401`. We therefore get two lines. + /// let wrapped = wrap_optimal_fit(&fragments, &[length - 2.0], &penalties).unwrap(); + /// assert_eq!(wrapped, vec![&[Word::from(short)], + /// &[Word::from(&long)]]); + /// ``` + /// + /// This only happens if the overflowing word is 50 characters + /// long _and_ if the word overflows the line by exactly one + /// character. If it overflows by more than one character, the + /// overflow penalty will quickly outgrow the cost of the gap, as + /// seen above. + pub overflow_penalty: usize, + + /// When should a single word on the last line be considered + /// "too short"? + /// + /// If the last line of the text consists of a single word and if + /// this word is shorter than `1 / short_last_line_fraction` of + /// the line width, then the final line will be considered "short" + /// and `short_last_line_penalty` is added as an extra penalty. + /// + /// The effect of this is to avoid a final line consisting of a + /// single small word. For example, with a + /// `short_last_line_penalty` of 25 (the default), a gap of up to + /// 5 columns will be seen as more desirable than having a final + /// short line.
+ /// + /// ## Examples + /// + /// ``` + /// use textwrap::{wrap, wrap_algorithms, Options, WrapAlgorithm}; + /// + /// let text = "This is a demo of the short last line penalty."; + /// + /// // The first-fit algorithm leaves a single short word on the last line: + /// assert_eq!(wrap(text, Options::new(37).wrap_algorithm(WrapAlgorithm::FirstFit)), + /// vec!["This is a demo of the short last line", + /// "penalty."]); + /// + /// #[cfg(feature = "smawk")] { + /// let mut penalties = wrap_algorithms::Penalties::new(); + /// + /// // Since "penalty." is shorter than 25% of the line width, the + /// // optimal-fit algorithm adds a penalty of 25. This is enough + /// // to move "line " down: + /// assert_eq!(wrap(text, Options::new(37).wrap_algorithm(WrapAlgorithm::OptimalFit(penalties))), + /// vec!["This is a demo of the short last", + /// "line penalty."]); + /// + /// // We can change the meaning of "short" lines. Here, only words + /// // shorter than 1/10th of the line width will be considered short: + /// penalties.short_last_line_fraction = 10; + /// assert_eq!(wrap(text, Options::new(37).wrap_algorithm(WrapAlgorithm::OptimalFit(penalties))), + /// vec!["This is a demo of the short last line", + /// "penalty."]); + /// + /// // If desired, the penalty can also be disabled: + /// penalties.short_last_line_fraction = 4; + /// penalties.short_last_line_penalty = 0; + /// assert_eq!(wrap(text, Options::new(37).wrap_algorithm(WrapAlgorithm::OptimalFit(penalties))), + /// vec!["This is a demo of the short last line", + /// "penalty."]); + /// } + /// ``` + pub short_last_line_fraction: usize, + + /// Penalty for a last line with a single short word. + /// + /// Set this to zero if you do not want to penalize short last lines. + pub short_last_line_penalty: usize, + + /// Penalty for lines ending with a hyphen. + pub hyphen_penalty: usize, +} + +impl Penalties { + /// Default penalties for monospace text. + /// + /// The penalties here work well for monospace text. 
This is + /// because they expect the gaps at the end of lines to be roughly + /// in the range `0..100`. If the gaps are larger, the + /// `overflow_penalty` and `hyphen_penalty` become insignificant. + pub const fn new() -> Self { + Penalties { + nline_penalty: 1000, + overflow_penalty: 50 * 50, + short_last_line_fraction: 4, + short_last_line_penalty: 25, + hyphen_penalty: 25, + } + } +} + +impl Default for Penalties { + fn default() -> Self { + Self::new() + } +} + +/// Cache for line numbers. This is necessary to avoid a O(n**2) +/// behavior when computing line numbers in [`wrap_optimal_fit`]. +struct LineNumbers { + line_numbers: RefCell<Vec<usize>>, +} + +impl LineNumbers { + fn new(size: usize) -> Self { + let mut line_numbers = Vec::with_capacity(size); + line_numbers.push(0); + LineNumbers { + line_numbers: RefCell::new(line_numbers), + } + } + + fn get<T>(&self, i: usize, minima: &[(usize, T)]) -> usize { + while self.line_numbers.borrow_mut().len() < i + 1 { + let pos = self.line_numbers.borrow().len(); + let line_number = 1 + self.get(minima[pos].0, minima); + self.line_numbers.borrow_mut().push(line_number); + } + + self.line_numbers.borrow()[i] + } +} + +/// Overflow error during the [`wrap_optimal_fit`] computation. +#[derive(Debug, PartialEq, Eq)] +pub struct OverflowError; + +impl std::fmt::Display for OverflowError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "wrap_optimal_fit cost computation overflowed") + } +} + +impl std::error::Error for OverflowError {} + +/// Wrap abstract fragments into lines with an optimal-fit algorithm. +/// +/// The `line_widths` slice gives the target line width for each line +/// (the last slice element is repeated as necessary). This can be +/// used to implement hanging indentation. +/// +/// The fragments must already have been split into the desired +/// widths, this function will not (and cannot) attempt to split them +/// further when arranging them into lines. 
+/// +/// # Optimal-Fit Algorithm +/// +/// The algorithm considers all possible break points and picks the +/// breaks which minimize the gaps at the end of each line. More +/// precisely, the algorithm assigns a cost or penalty to each break +/// point, determined by `cost = gap * gap` where `gap = target_width - +/// line_width`. Shorter lines are thus penalized more heavily since +/// they leave behind a larger gap. +/// +/// We can illustrate this with the text “To be, or not to be: that is +/// the question”. We will be wrapping it in a narrow column with room +/// for only 10 characters. The [greedy +/// algorithm](super::wrap_first_fit) will produce these lines, each +/// annotated with the corresponding penalty: +/// +/// ```text +/// "To be, or" 1² = 1 +/// "not to be:" 0² = 0 +/// "that is" 3² = 9 +/// "the" 7² = 49 +/// "question" 2² = 4 +/// ``` +/// +/// We see that line four with “the” leaves a gap of 7 columns, which +/// gives it a penalty of 49. The sum of the penalties is 63. +/// +/// There are 10 words, which means that there are `2_u32.pow(9)` or +/// 512 different ways to typeset it. We can compute +/// the sum of the penalties for each possible line break and search +/// for the one with the lowest sum: +/// +/// ```text +/// "To be," 4² = 16 +/// "or not to" 1² = 1 +/// "be: that" 2² = 4 +/// "is the" 4² = 16 +/// "question" 2² = 4 +/// ``` +/// +/// The sum of the penalties is 41, which is better than what the +/// greedy algorithm produced. +/// +/// Searching through all possible combinations would normally be +/// prohibitively slow. However, it turns out that the problem can be +/// formulated as the task of finding column minima in a cost matrix. +/// This matrix has a special form (totally monotone) which lets us +/// use a [linear-time algorithm called +/// SMAWK](https://lib.rs/crates/smawk) to find the optimal break +/// points. +/// +/// This means that the time complexity remains O(_n_) where _n_ is +/// the number of words.
Compared to +/// [`wrap_first_fit`](super::wrap_first_fit), this function is about +/// 4 times slower. +/// +/// The optimization of per-line costs over the entire paragraph is +/// inspired by the line breaking algorithm used in TeX, as described +/// in the 1981 article [_Breaking Paragraphs into +/// Lines_](http://www.eprg.org/G53DOC/pdfs/knuth-plass-breaking.pdf) +/// by Knuth and Plass. The implementation here is based on [Python +/// code by David +/// Eppstein](https://github.com/jfinkels/PADS/blob/master/pads/wrap.py). +/// +/// # Errors +/// +/// In case of an overflow during the cost computation, an `Err` is +/// returned. Overflows happen when fragments or lines have infinite +/// widths (`f64::INFINITY`) or if the widths are so large that the +/// gaps at the end of lines have sizes larger than `f64::MAX.sqrt()` +/// (approximately 1e154): +/// +/// ``` +/// use textwrap::core::Fragment; +/// use textwrap::wrap_algorithms::{wrap_optimal_fit, OverflowError, Penalties}; +/// +/// #[derive(Debug, PartialEq)] +/// struct Word(f64); +/// +/// impl Fragment for Word { +/// fn width(&self) -> f64 { self.0 } +/// fn whitespace_width(&self) -> f64 { 1.0 } +/// fn penalty_width(&self) -> f64 { 0.0 } +/// } +/// +/// // Wrapping overflows because 1e155 * 1e155 = 1e310, which is +/// // larger than f64::MAX: +/// assert_eq!(wrap_optimal_fit(&[Word(0.0), Word(0.0)], &[1e155], &Penalties::default()), +/// Err(OverflowError)); +/// ``` +/// +/// When using fragment widths and line widths which fit inside a +/// `u64`, overflows cannot happen. This means that fragments derived +/// from a `&str` cannot cause overflows. +/// +/// **Note:** Only available when the `smawk` Cargo feature is +/// enabled. +pub fn wrap_optimal_fit<'a, 'b, T: Fragment>( + fragments: &'a [T], + line_widths: &'b [f64], + penalties: &'b Penalties, +) -> Result<Vec<&'a [T]>, OverflowError> { + // The final line width is used for all remaining lines.
+ let default_line_width = line_widths.last().copied().unwrap_or(0.0); + let mut widths = Vec::with_capacity(fragments.len() + 1); + let mut width = 0.0; + widths.push(width); + for fragment in fragments { + width += fragment.width() + fragment.whitespace_width(); + widths.push(width); + } + + let line_numbers = LineNumbers::new(fragments.len()); + + let minima = smawk::online_column_minima(0.0, widths.len(), |minima, i, j| { + // Line number for fragment `i`. + let line_number = line_numbers.get(i, minima); + let line_width = line_widths + .get(line_number) + .copied() + .unwrap_or(default_line_width); + let target_width = line_width.max(1.0); + + // Compute the width of a line spanning fragments[i..j] in + // constant time. We need to adjust widths[j] by subtracting + // the whitespace of fragment[j-1] and then add the penalty. + let line_width = widths[j] - widths[i] - fragments[j - 1].whitespace_width() + + fragments[j - 1].penalty_width(); + + // We compute cost of the line containing fragments[i..j]. We + // start with values[i].1, which is the optimal cost for + // breaking before fragments[i]. + // + // First, every extra line cost NLINE_PENALTY. + let mut cost = minima[i].1 + penalties.nline_penalty as f64; + + // Next, we add a penalty depending on the line length. + if line_width > target_width { + // Lines that overflow get a hefty penalty. + let overflow = line_width - target_width; + cost += overflow * penalties.overflow_penalty as f64; + } else if j < fragments.len() { + // Other lines (except for the last line) get a milder + // penalty which depend on the size of the gap. + let gap = target_width - line_width; + cost += gap * gap; + } else if i + 1 == j + && line_width < target_width / penalties.short_last_line_fraction as f64 + { + // The last line can have any size gap, but we do add a + // penalty if the line is very short (typically because it + // contains just a single word). 
+ cost += penalties.short_last_line_penalty as f64; + } + + // Finally, we discourage hyphens. + if fragments[j - 1].penalty_width() > 0.0 { + // TODO: this should use a penalty value from the fragment + // instead. + cost += penalties.hyphen_penalty as f64; + } + + cost + }); + + for (_, cost) in &minima { + if cost.is_infinite() { + return Err(OverflowError); + } + } + + let mut lines = Vec::with_capacity(line_numbers.get(fragments.len(), &minima)); + let mut pos = fragments.len(); + loop { + let prev = minima[pos].0; + lines.push(&fragments[prev..pos]); + pos = prev; + if pos == 0 { + break; + } + } + + lines.reverse(); + Ok(lines) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[derive(Debug, PartialEq)] + struct Word(f64); + + #[rustfmt::skip] + impl Fragment for Word { + fn width(&self) -> f64 { self.0 } + fn whitespace_width(&self) -> f64 { 1.0 } + fn penalty_width(&self) -> f64 { 0.0 } + } + + #[test] + fn wrap_fragments_with_infinite_widths() { + let words = vec![Word(f64::INFINITY)]; + assert_eq!( + wrap_optimal_fit(&words, &[0.0], &Penalties::default()), + Err(OverflowError) + ); + } + + #[test] + fn wrap_fragments_with_huge_widths() { + let words = vec![Word(1e200), Word(1e250), Word(1e300)]; + assert_eq!( + wrap_optimal_fit(&words, &[1e300], &Penalties::default()), + Err(OverflowError) + ); + } + + #[test] + fn wrap_fragments_with_large_widths() { + // The gaps will be of the sizes between 1e25 and 1e75. This + // makes the `gap * gap` cost fit comfortably in a f64. + let words = vec![Word(1e25), Word(1e50), Word(1e75)]; + assert_eq!( + wrap_optimal_fit(&words, &[1e100], &Penalties::default()), + Ok(vec![&vec![Word(1e25), Word(1e50), Word(1e75)][..]]) + ); + } +} |