//! Building blocks for advanced wrapping functionality. //! //! The functions and structs in this module can be used to implement //! advanced wrapping functionality when the [`wrap`](super::wrap) and //! [`fill`](super::fill) function don't do what you want. //! //! In general, you want to follow these steps when wrapping //! something: //! //! 1. Split your input into [`Fragment`]s. These are abstract blocks //! of text or content which can be wrapped into lines. See //! [`WordSeparator`](crate::word_separators::WordSeparator) for //! how to do this for text. //! //! 2. Potentially split your fragments into smaller pieces. This //! allows you to implement things like hyphenation. If you use the //! `Word` type, you can use [`WordSplitter`](crate::WordSplitter) //! enum for this. //! //! 3. Potentially break apart fragments that are still too large to //! fit on a single line. This is implemented in [`break_words`]. //! //! 4. Finally take your fragments and put them into lines. There are //! two algorithms for this in the //! [`wrap_algorithms`](crate::wrap_algorithms) module: //! [`wrap_optimal_fit`](crate::wrap_algorithms::wrap_optimal_fit) //! and [`wrap_first_fit`](crate::wrap_algorithms::wrap_first_fit). //! The former produces better line breaks, the latter is faster. //! //! 5. Iterate through the slices returned by the wrapping functions //! and construct your lines of output. //! //! Please [open an issue](https://github.com/mgeisler/textwrap/) if //! the functionality here is not sufficient or if you have ideas for //! improving it. We would love to hear from you! /// The CSI or “Control Sequence Introducer” introduces an ANSI escape /// sequence. This is typically used for colored text and will be /// ignored when computing the text width. const CSI: (char, char) = ('\x1b', '['); /// The final bytes of an ANSI escape sequence must be in this range. const ANSI_FINAL_BYTE: std::ops::RangeInclusive = '\x40'..='\x7e'; /// Skip ANSI escape sequences. The `ch` is the current `char`, the /// `chars` provide the following characters. The `chars` will be /// modified if `ch` is the start of an ANSI escape sequence. #[inline] pub(crate) fn skip_ansi_escape_sequence>(ch: char, chars: &mut I) -> bool { if ch == CSI.0 && chars.next() == Some(CSI.1) { // We have found the start of an ANSI escape code, typically // used for colored terminal text. We skip until we find a // "final byte" in the range 0x40–0x7E. for ch in chars { if ANSI_FINAL_BYTE.contains(&ch) { return true; } } } false } #[cfg(feature = "unicode-width")] #[inline] fn ch_width(ch: char) -> usize { unicode_width::UnicodeWidthChar::width(ch).unwrap_or(0) } /// First character which [`ch_width`] will classify as double-width. /// Please see [`display_width`]. #[cfg(not(feature = "unicode-width"))] const DOUBLE_WIDTH_CUTOFF: char = '\u{1100}'; #[cfg(not(feature = "unicode-width"))] #[inline] fn ch_width(ch: char) -> usize { if ch < DOUBLE_WIDTH_CUTOFF { 1 } else { 2 } } /// Compute the display width of `text` while skipping over ANSI /// escape sequences. /// /// # Examples /// /// ``` /// use textwrap::core::display_width; /// /// assert_eq!(display_width("Café Plain"), 10); /// assert_eq!(display_width("\u{1b}[31mCafé Rouge\u{1b}[0m"), 10); /// ``` /// /// **Note:** When the `unicode-width` Cargo feature is disabled, the /// width of a `char` is determined by a crude approximation which /// simply counts chars below U+1100 as 1 column wide, and all other /// characters as 2 columns wide. With the feature enabled, function /// will correctly deal with [combining characters] in their /// decomposed form (see [Unicode equivalence]). /// /// An example of a decomposed character is “é”, which can be /// decomposed into: “e” followed by a combining acute accent: “◌́”. /// Without the `unicode-width` Cargo feature, every `char` below /// U+1100 has a width of 1. This includes the combining accent: /// /// ``` /// use textwrap::core::display_width; /// /// assert_eq!(display_width("Cafe Plain"), 10); /// #[cfg(feature = "unicode-width")] /// assert_eq!(display_width("Cafe\u{301} Plain"), 10); /// #[cfg(not(feature = "unicode-width"))] /// assert_eq!(display_width("Cafe\u{301} Plain"), 11); /// ``` /// /// ## Emojis and CJK Characters /// /// Characters such as emojis and [CJK characters] used in the /// Chinese, Japanese, and Korean langauges are seen as double-width, /// even if the `unicode-width` feature is disabled: /// /// ``` /// use textwrap::core::display_width; /// /// assert_eq!(display_width("😂😭🥺🤣✨😍🙏🥰😊🔥"), 20); /// assert_eq!(display_width("你好"), 4); // “Nǐ hǎo” or “Hello” in Chinese /// ``` /// /// # Limitations /// /// The displayed width of a string cannot always be computed from the /// string alone. This is because the width depends on the rendering /// engine used. This is particularly visible with [emoji modifier /// sequences] where a base emoji is modified with, e.g., skin tone or /// hair color modifiers. It is up to the rendering engine to detect /// this and to produce a suitable emoji. /// /// A simple example is “❤️”, which consists of “❤” (U+2764: Black /// Heart Symbol) followed by U+FE0F (Variation Selector-16). By /// itself, “❤” is a black heart, but if you follow it with the /// variant selector, you may get a wider red heart. /// /// A more complex example would be “👨‍🦰” which should depict a man /// with red hair. Here the computed width is too large — and the /// width differs depending on the use of the `unicode-width` feature: /// /// ``` /// use textwrap::core::display_width; /// /// assert_eq!("👨‍🦰".chars().collect::>(), ['\u{1f468}', '\u{200d}', '\u{1f9b0}']); /// #[cfg(feature = "unicode-width")] /// assert_eq!(display_width("👨‍🦰"), 4); /// #[cfg(not(feature = "unicode-width"))] /// assert_eq!(display_width("👨‍🦰"), 6); /// ``` /// /// This happens because the grapheme consists of three code points: /// “👨” (U+1F468: Man), Zero Width Joiner (U+200D), and “🦰” /// (U+1F9B0: Red Hair). You can see them above in the test. With /// `unicode-width` enabled, the ZWJ is correctly seen as having zero /// width, without it is counted as a double-width character. /// /// ## Terminal Support /// /// Modern browsers typically do a great job at combining characters /// as shown above, but terminals often struggle more. As an example, /// Gnome Terminal version 3.38.1, shows “❤️” as a big red heart, but /// shows "👨‍🦰" as “👨🦰”. /// /// [combining characters]: https://en.wikipedia.org/wiki/Combining_character /// [Unicode equivalence]: https://en.wikipedia.org/wiki/Unicode_equivalence /// [CJK characters]: https://en.wikipedia.org/wiki/CJK_characters /// [emoji modifier sequences]: https://unicode.org/emoji/charts/full-emoji-modifiers.html pub fn display_width(text: &str) -> usize { let mut chars = text.chars(); let mut width = 0; while let Some(ch) = chars.next() { if skip_ansi_escape_sequence(ch, &mut chars) { continue; } width += ch_width(ch); } width } /// A (text) fragment denotes the unit which we wrap into lines. /// /// Fragments represent an abstract _word_ plus the _whitespace_ /// following the word. In case the word falls at the end of the line, /// the whitespace is dropped and a so-called _penalty_ is inserted /// instead (typically `"-"` if the word was hyphenated). /// /// For wrapping purposes, the precise content of the word, the /// whitespace, and the penalty is irrelevant. All we need to know is /// the displayed width of each part, which this trait provides. pub trait Fragment: std::fmt::Debug { /// Displayed width of word represented by this fragment. fn width(&self) -> f64; /// Displayed width of the whitespace that must follow the word /// when the word is not at the end of a line. fn whitespace_width(&self) -> f64; /// Displayed width of the penalty that must be inserted if the /// word falls at the end of a line. fn penalty_width(&self) -> f64; } /// A piece of wrappable text, including any trailing whitespace. /// /// A `Word` is an example of a [`Fragment`], so it has a width, /// trailing whitespace, and potentially a penalty item. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct Word<'a> { /// Word content. pub word: &'a str, /// Whitespace to insert if the word does not fall at the end of a line. pub whitespace: &'a str, /// Penalty string to insert if the word falls at the end of a line. pub penalty: &'a str, // Cached width in columns. pub(crate) width: usize, } impl std::ops::Deref for Word<'_> { type Target = str; fn deref(&self) -> &Self::Target { self.word } } impl<'a> Word<'a> { /// Construct a `Word` from a string. /// /// A trailing stretch of `' '` is automatically taken to be the /// whitespace part of the word. pub fn from(word: &str) -> Word<'_> { let trimmed = word.trim_end_matches(' '); Word { word: trimmed, width: display_width(trimmed), whitespace: &word[trimmed.len()..], penalty: "", } } /// Break this word into smaller words with a width of at most /// `line_width`. The whitespace and penalty from this `Word` is /// added to the last piece. /// /// # Examples /// /// ``` /// use textwrap::core::Word; /// assert_eq!( /// Word::from("Hello! ").break_apart(3).collect::>(), /// vec![Word::from("Hel"), Word::from("lo! ")] /// ); /// ``` pub fn break_apart<'b>(&'b self, line_width: usize) -> impl Iterator> + 'b { let mut char_indices = self.word.char_indices(); let mut offset = 0; let mut width = 0; std::iter::from_fn(move || { while let Some((idx, ch)) = char_indices.next() { if skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) { continue; } if width > 0 && width + ch_width(ch) > line_width { let word = Word { word: &self.word[offset..idx], width: width, whitespace: "", penalty: "", }; offset = idx; width = ch_width(ch); return Some(word); } width += ch_width(ch); } if offset < self.word.len() { let word = Word { word: &self.word[offset..], width: width, whitespace: self.whitespace, penalty: self.penalty, }; offset = self.word.len(); return Some(word); } None }) } } impl Fragment for Word<'_> { #[inline] fn width(&self) -> f64 { self.width as f64 } // We assume the whitespace consist of ' ' only. This allows us to // compute the display width in constant time. #[inline] fn whitespace_width(&self) -> f64 { self.whitespace.len() as f64 } // We assume the penalty is `""` or `"-"`. This allows us to // compute the display width in constant time. #[inline] fn penalty_width(&self) -> f64 { self.penalty.len() as f64 } } /// Forcibly break words wider than `line_width` into smaller words. /// /// This simply calls [`Word::break_apart`] on words that are too /// wide. This means that no extra `'-'` is inserted, the word is /// simply broken into smaller pieces. pub fn break_words<'a, I>(words: I, line_width: usize) -> Vec> where I: IntoIterator>, { let mut shortened_words = Vec::new(); for word in words { if word.width() > line_width as f64 { shortened_words.extend(word.break_apart(line_width)); } else { shortened_words.push(word); } } shortened_words } #[cfg(test)] mod tests { use super::*; #[cfg(feature = "unicode-width")] use unicode_width::UnicodeWidthChar; #[test] fn skip_ansi_escape_sequence_works() { let blue_text = "\u{1b}[34mHello\u{1b}[0m"; let mut chars = blue_text.chars(); let ch = chars.next().unwrap(); assert!(skip_ansi_escape_sequence(ch, &mut chars)); assert_eq!(chars.next(), Some('H')); } #[test] fn emojis_have_correct_width() { use unic_emoji_char::is_emoji; // Emojis in the Basic Latin (ASCII) and Latin-1 Supplement // blocks all have a width of 1 column. This includes // characters such as '#' and '©'. for ch in '\u{1}'..'\u{FF}' { if is_emoji(ch) { let desc = format!("{:?} U+{:04X}", ch, ch as u32); #[cfg(feature = "unicode-width")] assert_eq!(ch.width().unwrap(), 1, "char: {}", desc); #[cfg(not(feature = "unicode-width"))] assert_eq!(ch_width(ch), 1, "char: {}", desc); } } // Emojis in the remaining blocks of the Basic Multilingual // Plane (BMP), in the Supplementary Multilingual Plane (SMP), // and in the Supplementary Ideographic Plane (SIP), are all 1 // or 2 columns wide when unicode-width is used, and always 2 // columns wide otherwise. This includes all of our favorite // emojis such as 😊. for ch in '\u{FF}'..'\u{2FFFF}' { if is_emoji(ch) { let desc = format!("{:?} U+{:04X}", ch, ch as u32); #[cfg(feature = "unicode-width")] assert!(ch.width().unwrap() <= 2, "char: {}", desc); #[cfg(not(feature = "unicode-width"))] assert_eq!(ch_width(ch), 2, "char: {}", desc); } } // The remaining planes contain almost no assigned code points // and thus also no emojis. } #[test] fn display_width_works() { assert_eq!("Café Plain".len(), 11); // “é” is two bytes assert_eq!(display_width("Café Plain"), 10); assert_eq!(display_width("\u{1b}[31mCafé Rouge\u{1b}[0m"), 10); } #[test] fn display_width_narrow_emojis() { #[cfg(feature = "unicode-width")] assert_eq!(display_width("⁉"), 1); // The ⁉ character is above DOUBLE_WIDTH_CUTOFF. #[cfg(not(feature = "unicode-width"))] assert_eq!(display_width("⁉"), 2); } #[test] fn display_width_narrow_emojis_variant_selector() { #[cfg(feature = "unicode-width")] assert_eq!(display_width("⁉\u{fe0f}"), 1); // The variant selector-16 is also counted. #[cfg(not(feature = "unicode-width"))] assert_eq!(display_width("⁉\u{fe0f}"), 4); } #[test] fn display_width_emojis() { assert_eq!(display_width("😂😭🥺🤣✨😍🙏🥰😊🔥"), 20); } }