diff options
Diffstat (limited to '')
-rw-r--r-- | third_party/rust/unicode-segmentation/src/grapheme.rs | 801 |
1 files changed, 801 insertions, 0 deletions
diff --git a/third_party/rust/unicode-segmentation/src/grapheme.rs b/third_party/rust/unicode-segmentation/src/grapheme.rs new file mode 100644 index 0000000000..eb554c900e --- /dev/null +++ b/third_party/rust/unicode-segmentation/src/grapheme.rs @@ -0,0 +1,801 @@ +// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use core::cmp; + +use crate::tables::grapheme::GraphemeCat; + +/// External iterator for grapheme clusters and byte offsets. +/// +/// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`] +/// trait. See its documentation for more. +/// +/// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices +/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +#[derive(Clone)] +pub struct GraphemeIndices<'a> { + start_offset: usize, + iter: Graphemes<'a>, +} + +impl<'a> GraphemeIndices<'a> { + #[inline] + /// View the underlying data (the part yet to be iterated) as a slice of the original string. + /// + /// ```rust + /// # use unicode_segmentation::UnicodeSegmentation; + /// let mut iter = "abc".grapheme_indices(true); + /// assert_eq!(iter.as_str(), "abc"); + /// iter.next(); + /// assert_eq!(iter.as_str(), "bc"); + /// iter.next(); + /// iter.next(); + /// assert_eq!(iter.as_str(), ""); + /// ``` + pub fn as_str(&self) -> &'a str { + self.iter.as_str() + } +} + +impl<'a> Iterator for GraphemeIndices<'a> { + type Item = (usize, &'a str); + + #[inline] + fn next(&mut self) -> Option<(usize, &'a str)> { + self.iter + .next() + .map(|s| (s.as_ptr() as usize - self.start_offset, s)) + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + self.iter.size_hint() + } +} + +impl<'a> DoubleEndedIterator for GraphemeIndices<'a> { + #[inline] + fn next_back(&mut self) -> Option<(usize, &'a str)> { + self.iter + .next_back() + .map(|s| (s.as_ptr() as usize - self.start_offset, s)) + } +} + +/// External iterator for a string's +/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries). +/// +/// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its +/// documentation for more. +/// +/// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes +/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +#[derive(Clone, Debug)] +pub struct Graphemes<'a> { + string: &'a str, + cursor: GraphemeCursor, + cursor_back: GraphemeCursor, +} + +impl<'a> Graphemes<'a> { + #[inline] + /// View the underlying data (the part yet to be iterated) as a slice of the original string. + /// + /// ```rust + /// # use unicode_segmentation::UnicodeSegmentation; + /// let mut iter = "abc".graphemes(true); + /// assert_eq!(iter.as_str(), "abc"); + /// iter.next(); + /// assert_eq!(iter.as_str(), "bc"); + /// iter.next(); + /// iter.next(); + /// assert_eq!(iter.as_str(), ""); + /// ``` + pub fn as_str(&self) -> &'a str { + &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()] + } +} + +impl<'a> Iterator for Graphemes<'a> { + type Item = &'a str; + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor(); + (cmp::min(slen, 1), Some(slen)) + } + + #[inline] + fn next(&mut self) -> Option<&'a str> { + let start = self.cursor.cur_cursor(); + if start == self.cursor_back.cur_cursor() { + return None; + } + let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap(); + Some(&self.string[start..next]) + } +} + +impl<'a> DoubleEndedIterator for Graphemes<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + let end = self.cursor_back.cur_cursor(); + if end == self.cursor.cur_cursor() { + return None; + } + let prev = self + .cursor_back + .prev_boundary(self.string, 0) + .unwrap() + .unwrap(); + Some(&self.string[prev..end]) + } +} + +#[inline] +pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> { + let len = s.len(); + Graphemes { + string: s, + cursor: GraphemeCursor::new(0, len, is_extended), + cursor_back: GraphemeCursor::new(len, len, is_extended), + } +} + +#[inline] +pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> { + GraphemeIndices { + start_offset: s.as_ptr() as usize, + iter: new_graphemes(s, is_extended), + } +} + +// maybe unify with PairResult? +// An enum describing information about a potential boundary. +#[derive(PartialEq, Eq, Clone, Debug)] +enum GraphemeState { + // No information is known. + Unknown, + // It is known to not be a boundary. + NotBreak, + // It is known to be a boundary. + Break, + // The codepoint after is a Regional Indicator Symbol, so a boundary iff + // it is preceded by an even number of RIS codepoints. (GB12, GB13) + Regional, + // The codepoint after is Extended_Pictographic, + // so whether it's a boundary depends on pre-context according to GB11. + Emoji, +} + +/// Cursor-based segmenter for grapheme clusters. +/// +/// This allows working with ropes and other datastructures where the string is not contiguous or +/// fully known at initialization time. +#[derive(Clone, Debug)] +pub struct GraphemeCursor { + // Current cursor position. + offset: usize, + // Total length of the string. + len: usize, + // A config flag indicating whether this cursor computes legacy or extended + // grapheme cluster boundaries (enables GB9a and GB9b if set). + is_extended: bool, + // Information about the potential boundary at `offset` + state: GraphemeState, + // Category of codepoint immediately preceding cursor, if known. + cat_before: Option<GraphemeCat>, + // Category of codepoint immediately after cursor, if known. + cat_after: Option<GraphemeCat>, + // If set, at least one more codepoint immediately preceding this offset + // is needed to resolve whether there's a boundary at `offset`. + pre_context_offset: Option<usize>, + // The number of RIS codepoints preceding `offset`. If `pre_context_offset` + // is set, then counts the number of RIS between that and `offset`, otherwise + // is an accurate count relative to the string. + ris_count: Option<usize>, + // Set if a call to `prev_boundary` or `next_boundary` was suspended due + // to needing more input. + resuming: bool, + // Cached grapheme category and associated scalar value range. + grapheme_cat_cache: (u32, u32, GraphemeCat), +} + +/// An error return indicating that not enough content was available in the +/// provided chunk to satisfy the query, and that more content must be provided. +#[derive(PartialEq, Eq, Debug)] +pub enum GraphemeIncomplete { + /// More pre-context is needed. The caller should call `provide_context` + /// with a chunk ending at the offset given, then retry the query. This + /// will only be returned if the `chunk_start` parameter is nonzero. + PreContext(usize), + + /// When requesting `prev_boundary`, the cursor is moving past the beginning + /// of the current chunk, so the chunk before that is requested. This will + /// only be returned if the `chunk_start` parameter is nonzero. + PrevChunk, + + /// When requesting `next_boundary`, the cursor is moving past the end of the + /// current chunk, so the chunk after that is requested. This will only be + /// returned if the chunk ends before the `len` parameter provided on + /// creation of the cursor. + NextChunk, // requesting chunk following the one given + + /// An error returned when the chunk given does not contain the cursor position. + InvalidOffset, +} + +// An enum describing the result from lookup of a pair of categories. +#[derive(PartialEq, Eq)] +enum PairResult { + NotBreak, // definitely not a break + Break, // definitely a break + Extended, // a break iff not in extended mode + Regional, // a break if preceded by an even number of RIS + Emoji, // a break if preceded by emoji base and (Extend)* +} + +#[inline] +fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult { + use self::PairResult::*; + use crate::tables::grapheme::GraphemeCat::*; + match (before, after) { + (GC_CR, GC_LF) => NotBreak, // GB3 + (GC_Control, _) => Break, // GB4 + (GC_CR, _) => Break, // GB4 + (GC_LF, _) => Break, // GB4 + (_, GC_Control) => Break, // GB5 + (_, GC_CR) => Break, // GB5 + (_, GC_LF) => Break, // GB5 + (GC_L, GC_L) => NotBreak, // GB6 + (GC_L, GC_V) => NotBreak, // GB6 + (GC_L, GC_LV) => NotBreak, // GB6 + (GC_L, GC_LVT) => NotBreak, // GB6 + (GC_LV, GC_V) => NotBreak, // GB7 + (GC_LV, GC_T) => NotBreak, // GB7 + (GC_V, GC_V) => NotBreak, // GB7 + (GC_V, GC_T) => NotBreak, // GB7 + (GC_LVT, GC_T) => NotBreak, // GB8 + (GC_T, GC_T) => NotBreak, // GB8 + (_, GC_Extend) => NotBreak, // GB9 + (_, GC_ZWJ) => NotBreak, // GB9 + (_, GC_SpacingMark) => Extended, // GB9a + (GC_Prepend, _) => Extended, // GB9b + (GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11 + (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13 + (_, _) => Break, // GB999 + } +} + +impl GraphemeCursor { + /// Create a new cursor. The string and initial offset are given at creation + /// time, but the contents of the string are not. The `is_extended` parameter + /// controls whether extended grapheme clusters are selected. + /// + /// The `offset` parameter must be on a codepoint boundary. + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// let s = "हिन्दी"; + /// let mut legacy = GraphemeCursor::new(0, s.len(), false); + /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len()))); + /// let mut extended = GraphemeCursor::new(0, s.len(), true); + /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len()))); + /// ``` + pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor { + let state = if offset == 0 || offset == len { + GraphemeState::Break + } else { + GraphemeState::Unknown + }; + GraphemeCursor { + offset: offset, + len: len, + state: state, + is_extended: is_extended, + cat_before: None, + cat_after: None, + pre_context_offset: None, + ris_count: None, + resuming: false, + grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control), + } + } + + fn grapheme_category(&mut self, ch: char) -> GraphemeCat { + use crate::tables::grapheme as gr; + use crate::tables::grapheme::GraphemeCat::*; + + if ch <= '\u{7e}' { + // Special-case optimization for ascii, except U+007F. This + // improves performance even for many primarily non-ascii texts, + // due to use of punctuation and white space characters from the + // ascii range. + if ch >= '\u{20}' { + GC_Any + } else if ch == '\n' { + GC_LF + } else if ch == '\r' { + GC_CR + } else { + GC_Control + } + } else { + // If this char isn't within the cached range, update the cache to the + // range that includes it. + if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 { + self.grapheme_cat_cache = gr::grapheme_category(ch); + } + self.grapheme_cat_cache.2 + } + } + + // Not sure I'm gonna keep this, the advantage over new() seems thin. + + /// Set the cursor to a new location in the same string. + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// let s = "abcd"; + /// let mut cursor = GraphemeCursor::new(0, s.len(), false); + /// assert_eq!(cursor.cur_cursor(), 0); + /// cursor.set_cursor(2); + /// assert_eq!(cursor.cur_cursor(), 2); + /// ``` + pub fn set_cursor(&mut self, offset: usize) { + if offset != self.offset { + self.offset = offset; + self.state = if offset == 0 || offset == self.len { + GraphemeState::Break + } else { + GraphemeState::Unknown + }; + // reset state derived from text around cursor + self.cat_before = None; + self.cat_after = None; + self.ris_count = None; + } + } + + #[inline] + /// The current offset of the cursor. Equal to the last value provided to + /// `new()` or `set_cursor()`, or returned from `next_boundary()` or + /// `prev_boundary()`. + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes. + /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; + /// let mut cursor = GraphemeCursor::new(4, flags.len(), false); + /// assert_eq!(cursor.cur_cursor(), 4); + /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8))); + /// assert_eq!(cursor.cur_cursor(), 8); + /// ``` + pub fn cur_cursor(&self) -> usize { + self.offset + } + + /// Provide additional pre-context when it is needed to decide a boundary. + /// The end of the chunk must coincide with the value given in the + /// `GraphemeIncomplete::PreContext` request. + /// + /// ```rust + /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete}; + /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; + /// let mut cursor = GraphemeCursor::new(8, flags.len(), false); + /// // Not enough pre-context to decide if there's a boundary between the two flags. + /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8))); + /// // Provide one more Regional Indicator Symbol of pre-context + /// cursor.provide_context(&flags[4..8], 4); + /// // Still not enough context to decide. + /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4))); + /// // Provide additional requested context. + /// cursor.provide_context(&flags[0..4], 0); + /// // That's enough to decide (it always is when context goes to the start of the string) + /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true)); + /// ``` + pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) { + use crate::tables::grapheme as gr; + assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap()); + self.pre_context_offset = None; + if self.is_extended && chunk_start + chunk.len() == self.offset { + let ch = chunk.chars().rev().next().unwrap(); + if self.grapheme_category(ch) == gr::GC_Prepend { + self.decide(false); // GB9b + return; + } + } + match self.state { + GraphemeState::Regional => self.handle_regional(chunk, chunk_start), + GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start), + _ => { + if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start { + let ch = chunk.chars().rev().next().unwrap(); + self.cat_before = Some(self.grapheme_category(ch)); + } + } + } + } + + #[inline] + fn decide(&mut self, is_break: bool) { + self.state = if is_break { + GraphemeState::Break + } else { + GraphemeState::NotBreak + }; + } + + #[inline] + fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> { + self.decide(is_break); + Ok(is_break) + } + + #[inline] + fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> { + if self.state == GraphemeState::Break { + Ok(true) + } else if self.state == GraphemeState::NotBreak { + Ok(false) + } else if let Some(pre_context_offset) = self.pre_context_offset { + Err(GraphemeIncomplete::PreContext(pre_context_offset)) + } else { + unreachable!("inconsistent state"); + } + } + + #[inline] + fn handle_regional(&mut self, chunk: &str, chunk_start: usize) { + use crate::tables::grapheme as gr; + let mut ris_count = self.ris_count.unwrap_or(0); + for ch in chunk.chars().rev() { + if self.grapheme_category(ch) != gr::GC_Regional_Indicator { + self.ris_count = Some(ris_count); + self.decide((ris_count % 2) == 0); + return; + } + ris_count += 1; + } + self.ris_count = Some(ris_count); + if chunk_start == 0 { + self.decide((ris_count % 2) == 0); + return; + } + self.pre_context_offset = Some(chunk_start); + self.state = GraphemeState::Regional; + } + + #[inline] + fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) { + use crate::tables::grapheme as gr; + let mut iter = chunk.chars().rev(); + if let Some(ch) = iter.next() { + if self.grapheme_category(ch) != gr::GC_ZWJ { + self.decide(true); + return; + } + } + for ch in iter { + match self.grapheme_category(ch) { + gr::GC_Extend => (), + gr::GC_Extended_Pictographic => { + self.decide(false); + return; + } + _ => { + self.decide(true); + return; + } + } + } + if chunk_start == 0 { + self.decide(true); + return; + } + self.pre_context_offset = Some(chunk_start); + self.state = GraphemeState::Emoji; + } + + #[inline] + /// Determine whether the current cursor location is a grapheme cluster boundary. + /// Only a part of the string need be supplied. If `chunk_start` is nonzero or + /// the length of `chunk` is not equal to `len` on creation, then this method + /// may return `GraphemeIncomplete::PreContext`. The caller should then + /// call `provide_context` with the requested chunk, then retry calling this + /// method. + /// + /// For partial chunks, if the cursor is not at the beginning or end of the + /// string, the chunk should contain at least the codepoint following the cursor. + /// If the string is nonempty, the chunk must be nonempty. + /// + /// All calls should have consistent chunk contents (ie, if a chunk provides + /// content for a given slice, all further chunks covering that slice must have + /// the same content for it). + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; + /// let mut cursor = GraphemeCursor::new(8, flags.len(), false); + /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true)); + /// cursor.set_cursor(12); + /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false)); + /// ``` + pub fn is_boundary( + &mut self, + chunk: &str, + chunk_start: usize, + ) -> Result<bool, GraphemeIncomplete> { + use crate::tables::grapheme as gr; + if self.state == GraphemeState::Break { + return Ok(true); + } + if self.state == GraphemeState::NotBreak { + return Ok(false); + } + if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() { + if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() { + return Err(GraphemeIncomplete::InvalidOffset); + } + } + if let Some(pre_context_offset) = self.pre_context_offset { + return Err(GraphemeIncomplete::PreContext(pre_context_offset)); + } + let offset_in_chunk = self.offset - chunk_start; + if self.cat_after.is_none() { + let ch = chunk[offset_in_chunk..].chars().next().unwrap(); + self.cat_after = Some(self.grapheme_category(ch)); + } + if self.offset == chunk_start { + let mut need_pre_context = true; + match self.cat_after.unwrap() { + gr::GC_Regional_Indicator => self.state = GraphemeState::Regional, + gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji, + _ => need_pre_context = self.cat_before.is_none(), + } + if need_pre_context { + self.pre_context_offset = Some(chunk_start); + return Err(GraphemeIncomplete::PreContext(chunk_start)); + } + } + if self.cat_before.is_none() { + let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap(); + self.cat_before = Some(self.grapheme_category(ch)); + } + match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) { + PairResult::NotBreak => return self.decision(false), + PairResult::Break => return self.decision(true), + PairResult::Extended => { + let is_extended = self.is_extended; + return self.decision(!is_extended); + } + PairResult::Regional => { + if let Some(ris_count) = self.ris_count { + return self.decision((ris_count % 2) == 0); + } + self.handle_regional(&chunk[..offset_in_chunk], chunk_start); + self.is_boundary_result() + } + PairResult::Emoji => { + self.handle_emoji(&chunk[..offset_in_chunk], chunk_start); + self.is_boundary_result() + } + } + } + + #[inline] + /// Find the next boundary after the current cursor position. Only a part of + /// the string need be supplied. If the chunk is incomplete, then this + /// method might return `GraphemeIncomplete::PreContext` or + /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should + /// call `provide_context` with the requested chunk, then retry. In the + /// latter case, the caller should provide the chunk following the one + /// given, then retry. + /// + /// See `is_boundary` for expectations on the provided chunk. + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; + /// let mut cursor = GraphemeCursor::new(4, flags.len(), false); + /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8))); + /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16))); + /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None)); + /// ``` + /// + /// And an example that uses partial strings: + /// + /// ```rust + /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete}; + /// let s = "abcd"; + /// let mut cursor = GraphemeCursor::new(0, s.len(), false); + /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1))); + /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk)); + /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2))); + /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3))); + /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4))); + /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None)); + /// ``` + pub fn next_boundary( + &mut self, + chunk: &str, + chunk_start: usize, + ) -> Result<Option<usize>, GraphemeIncomplete> { + if self.offset == self.len { + return Ok(None); + } + let mut iter = chunk[self.offset - chunk_start..].chars(); + let mut ch = iter.next().unwrap(); + loop { + if self.resuming { + if self.cat_after.is_none() { + self.cat_after = Some(self.grapheme_category(ch)); + } + } else { + self.offset += ch.len_utf8(); + self.state = GraphemeState::Unknown; + self.cat_before = self.cat_after.take(); + if self.cat_before.is_none() { + self.cat_before = Some(self.grapheme_category(ch)); + } + if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator { + self.ris_count = self.ris_count.map(|c| c + 1); + } else { + self.ris_count = Some(0); + } + if let Some(next_ch) = iter.next() { + ch = next_ch; + self.cat_after = Some(self.grapheme_category(ch)); + } else if self.offset == self.len { + self.decide(true); + } else { + self.resuming = true; + return Err(GraphemeIncomplete::NextChunk); + } + } + self.resuming = true; + if self.is_boundary(chunk, chunk_start)? { + self.resuming = false; + return Ok(Some(self.offset)); + } + self.resuming = false; + } + } + + /// Find the previous boundary after the current cursor position. Only a part + /// of the string need be supplied. If the chunk is incomplete, then this + /// method might return `GraphemeIncomplete::PreContext` or + /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should + /// call `provide_context` with the requested chunk, then retry. In the + /// latter case, the caller should provide the chunk preceding the one + /// given, then retry. + /// + /// See `is_boundary` for expectations on the provided chunk. + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; + /// let mut cursor = GraphemeCursor::new(12, flags.len(), false); + /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8))); + /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0))); + /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None)); + /// ``` + /// + /// And an example that uses partial strings (note the exact return is not + /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily): + /// + /// ```rust + /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete}; + /// let s = "abcd"; + /// let mut cursor = GraphemeCursor::new(4, s.len(), false); + /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3))); + /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk)); + /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2))); + /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1))); + /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0))); + /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None)); + /// ``` + pub fn prev_boundary( + &mut self, + chunk: &str, + chunk_start: usize, + ) -> Result<Option<usize>, GraphemeIncomplete> { + if self.offset == 0 { + return Ok(None); + } + if self.offset == chunk_start { + return Err(GraphemeIncomplete::PrevChunk); + } + let mut iter = chunk[..self.offset - chunk_start].chars().rev(); + let mut ch = iter.next().unwrap(); + loop { + if self.offset == chunk_start { + self.resuming = true; + return Err(GraphemeIncomplete::PrevChunk); + } + if self.resuming { + self.cat_before = Some(self.grapheme_category(ch)); + } else { + self.offset -= ch.len_utf8(); + self.cat_after = self.cat_before.take(); + self.state = GraphemeState::Unknown; + if let Some(ris_count) = self.ris_count { + self.ris_count = if ris_count > 0 { + Some(ris_count - 1) + } else { + None + }; + } + if let Some(prev_ch) = iter.next() { + ch = prev_ch; + self.cat_before = Some(self.grapheme_category(ch)); + } else if self.offset == 0 { + self.decide(true); + } else { + self.resuming = true; + self.cat_after = Some(self.grapheme_category(ch)); + return Err(GraphemeIncomplete::PrevChunk); + } + } + self.resuming = true; + if self.is_boundary(chunk, chunk_start)? { + self.resuming = false; + return Ok(Some(self.offset)); + } + self.resuming = false; + } + } +} + +#[test] +fn test_grapheme_cursor_ris_precontext() { + let s = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}"; + let mut c = GraphemeCursor::new(8, s.len(), true); + assert_eq!( + c.is_boundary(&s[4..], 4), + Err(GraphemeIncomplete::PreContext(4)) + ); + c.provide_context(&s[..4], 0); + assert_eq!(c.is_boundary(&s[4..], 4), Ok(true)); +} + +#[test] +fn test_grapheme_cursor_chunk_start_require_precontext() { + let s = "\r\n"; + let mut c = GraphemeCursor::new(1, s.len(), true); + assert_eq!( + c.is_boundary(&s[1..], 1), + Err(GraphemeIncomplete::PreContext(1)) + ); + c.provide_context(&s[..1], 0); + assert_eq!(c.is_boundary(&s[1..], 1), Ok(false)); +} + +#[test] +fn test_grapheme_cursor_prev_boundary() { + let s = "abcd"; + let mut c = GraphemeCursor::new(3, s.len(), true); + assert_eq!( + c.prev_boundary(&s[2..], 2), + Err(GraphemeIncomplete::PrevChunk) + ); + assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(2))); +} + +#[test] +fn test_grapheme_cursor_prev_boundary_chunk_start() { + let s = "abcd"; + let mut c = GraphemeCursor::new(2, s.len(), true); + assert_eq!( + c.prev_boundary(&s[2..], 2), + Err(GraphemeIncomplete::PrevChunk) + ); + assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(1))); +} |