//! Word splitting functionality. //! //! To wrap text into lines, long words sometimes need to be split //! across lines. The [`WordSplitter`] enum defines this //! functionality. use crate::core::{display_width, Word}; /// The `WordSplitter` enum describes where words can be split. /// /// If the textwrap crate has been compiled with the `hyphenation` /// Cargo feature enabled, you will find a /// [`WordSplitter::Hyphenation`] variant. Use this struct for /// language-aware hyphenation: /// /// ``` /// #[cfg(feature = "hyphenation")] { /// use hyphenation::{Language, Load, Standard}; /// use textwrap::{wrap, Options, WordSplitter}; /// /// let text = "Oxidation is the loss of electrons."; /// let dictionary = Standard::from_embedded(Language::EnglishUS).unwrap(); /// let options = Options::new(8).word_splitter(WordSplitter::Hyphenation(dictionary)); /// assert_eq!(wrap(text, &options), vec!["Oxida-", /// "tion is", /// "the loss", /// "of elec-", /// "trons."]); /// } /// ``` /// /// Please see the documentation for the [hyphenation] crate for more /// details. /// /// [hyphenation]: https://docs.rs/hyphenation/ #[derive(Clone)] pub enum WordSplitter { /// Use this as a [`Options.word_splitter`] to avoid any kind of /// hyphenation: /// /// ``` /// use textwrap::{wrap, Options, WordSplitter}; /// /// let options = Options::new(8).word_splitter(WordSplitter::NoHyphenation); /// assert_eq!(wrap("foo bar-baz", &options), /// vec!["foo", "bar-baz"]); /// ``` /// /// [`Options.word_splitter`]: super::Options::word_splitter NoHyphenation, /// `HyphenSplitter` is the default `WordSplitter` used by /// [`Options::new`](super::Options::new). It will split words on /// existing hyphens in the word. /// /// It will only use hyphens that are surrounded by alphanumeric /// characters, which prevents a word like `"--foo-bar"` from /// being split into `"--"` and `"foo-bar"`. /// /// # Examples /// /// ``` /// use textwrap::WordSplitter; /// /// assert_eq!(WordSplitter::HyphenSplitter.split_points("--foo-bar"), /// vec![6]); /// ``` HyphenSplitter, /// Use a custom function as the word splitter. /// /// This varian lets you implement a custom word splitter using /// your own function. /// /// # Examples /// /// ``` /// use textwrap::WordSplitter; /// /// fn split_at_underscore(word: &str) -> Vec { /// word.match_indices('_').map(|(idx, _)| idx + 1).collect() /// } /// /// let word_splitter = WordSplitter::Custom(split_at_underscore); /// assert_eq!(word_splitter.split_points("a_long_identifier"), /// vec![2, 7]); /// ``` Custom(fn(word: &str) -> Vec), /// A hyphenation dictionary can be used to do language-specific /// hyphenation using patterns from the [hyphenation] crate. /// /// **Note:** Only available when the `hyphenation` Cargo feature is /// enabled. /// /// [hyphenation]: https://docs.rs/hyphenation/ #[cfg(feature = "hyphenation")] Hyphenation(hyphenation::Standard), } impl std::fmt::Debug for WordSplitter { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { WordSplitter::NoHyphenation => f.write_str("NoHyphenation"), WordSplitter::HyphenSplitter => f.write_str("HyphenSplitter"), WordSplitter::Custom(_) => f.write_str("Custom(...)"), #[cfg(feature = "hyphenation")] WordSplitter::Hyphenation(dict) => write!(f, "Hyphenation({})", dict.language()), } } } impl PartialEq for WordSplitter { fn eq(&self, other: &WordSplitter) -> bool { match (self, other) { (WordSplitter::NoHyphenation, WordSplitter::NoHyphenation) => true, (WordSplitter::HyphenSplitter, WordSplitter::HyphenSplitter) => true, #[cfg(feature = "hyphenation")] (WordSplitter::Hyphenation(this_dict), WordSplitter::Hyphenation(other_dict)) => { this_dict.language() == other_dict.language() } (_, _) => false, } } } impl WordSplitter { /// Return all possible indices where `word` can be split. /// /// The indices are in the range `0..word.len()`. They point to /// the index _after_ the split point, i.e., after `-` if /// splitting on hyphens. This way, `word.split_at(idx)` will /// break the word into two well-formed pieces. /// /// # Examples /// /// ``` /// use textwrap::WordSplitter; /// assert_eq!(WordSplitter::NoHyphenation.split_points("cannot-be-split"), vec![]); /// assert_eq!(WordSplitter::HyphenSplitter.split_points("can-be-split"), vec![4, 7]); /// assert_eq!(WordSplitter::Custom(|word| vec![word.len()/2]).split_points("middle"), vec![3]); /// ``` pub fn split_points(&self, word: &str) -> Vec { match self { WordSplitter::NoHyphenation => Vec::new(), WordSplitter::HyphenSplitter => { let mut splits = Vec::new(); for (idx, _) in word.match_indices('-') { // We only use hyphens that are surrounded by alphanumeric // characters. This is to avoid splitting on repeated hyphens, // such as those found in --foo-bar. let prev = word[..idx].chars().next_back(); let next = word[idx + 1..].chars().next(); if prev.filter(|ch| ch.is_alphanumeric()).is_some() && next.filter(|ch| ch.is_alphanumeric()).is_some() { splits.push(idx + 1); // +1 due to width of '-'. } } splits } WordSplitter::Custom(splitter_func) => splitter_func(word), #[cfg(feature = "hyphenation")] WordSplitter::Hyphenation(dictionary) => { use hyphenation::Hyphenator; dictionary.hyphenate(word).breaks } } } } /// Split words into smaller words according to the split points given /// by `word_splitter`. /// /// Note that we split all words, regardless of their length. This is /// to more cleanly separate the business of splitting (including /// automatic hyphenation) from the business of word wrapping. pub fn split_words<'a, I>( words: I, word_splitter: &'a WordSplitter, ) -> impl Iterator> where I: IntoIterator>, { words.into_iter().flat_map(move |word| { let mut prev = 0; let mut split_points = word_splitter.split_points(&word).into_iter(); std::iter::from_fn(move || { if let Some(idx) = split_points.next() { let need_hyphen = !word[..idx].ends_with('-'); let w = Word { word: &word.word[prev..idx], width: display_width(&word[prev..idx]), whitespace: "", penalty: if need_hyphen { "-" } else { "" }, }; prev = idx; return Some(w); } if prev < word.word.len() || prev == 0 { let w = Word { word: &word.word[prev..], width: display_width(&word[prev..]), whitespace: word.whitespace, penalty: word.penalty, }; prev = word.word.len() + 1; return Some(w); } None }) }) } #[cfg(test)] mod tests { use super::*; // Like assert_eq!, but the left expression is an iterator. macro_rules! assert_iter_eq { ($left:expr, $right:expr) => { assert_eq!($left.collect::>(), $right); }; } #[test] fn split_words_no_words() { assert_iter_eq!(split_words(vec![], &WordSplitter::HyphenSplitter), vec![]); } #[test] fn split_words_empty_word() { assert_iter_eq!( split_words(vec![Word::from(" ")], &WordSplitter::HyphenSplitter), vec![Word::from(" ")] ); } #[test] fn split_words_single_word() { assert_iter_eq!( split_words(vec![Word::from("foobar")], &WordSplitter::HyphenSplitter), vec![Word::from("foobar")] ); } #[test] fn split_words_hyphen_splitter() { assert_iter_eq!( split_words(vec![Word::from("foo-bar")], &WordSplitter::HyphenSplitter), vec![Word::from("foo-"), Word::from("bar")] ); } #[test] fn split_words_no_hyphenation() { assert_iter_eq!( split_words(vec![Word::from("foo-bar")], &WordSplitter::NoHyphenation), vec![Word::from("foo-bar")] ); } #[test] fn split_words_adds_penalty() { let fixed_split_point = |_: &str| vec![3]; assert_iter_eq!( split_words( vec![Word::from("foobar")].into_iter(), &WordSplitter::Custom(fixed_split_point) ), vec![ Word { word: "foo", width: 3, whitespace: "", penalty: "-" }, Word { word: "bar", width: 3, whitespace: "", penalty: "" } ] ); assert_iter_eq!( split_words( vec![Word::from("fo-bar")].into_iter(), &WordSplitter::Custom(fixed_split_point) ), vec![ Word { word: "fo-", width: 3, whitespace: "", penalty: "" }, Word { word: "bar", width: 3, whitespace: "", penalty: "" } ] ); } }