summaryrefslogtreecommitdiffstats
path: root/third_party/rust/unicode-segmentation/src/sentence.rs
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 19:33:14 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 19:33:14 +0000
commit: 36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree: 105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/rust/unicode-segmentation/src/sentence.rs
parent: Initial commit. (diff)
downloadfirefox-esr-upstream.tar.xz
firefox-esr-upstream.zip
Adding upstream version 115.7.0esr.upstream/115.7.0esrupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/unicode-segmentation/src/sentence.rs')
-rw-r--r-- third_party/rust/unicode-segmentation/src/sentence.rs 415
1 files changed, 415 insertions, 0 deletions
diff --git a/third_party/rust/unicode-segmentation/src/sentence.rs b/third_party/rust/unicode-segmentation/src/sentence.rs
new file mode 100644
index 0000000000..78d87b4072
--- /dev/null
+++ b/third_party/rust/unicode-segmentation/src/sentence.rs
@@ -0,0 +1,415 @@
+// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use core::cmp;
+use core::iter::Filter;
+
+// All of the logic for forward iteration over sentences
+mod fwd {
+ use crate::tables::sentence::SentenceCat;
+ use core::cmp;
+
+ // One entry of the parsed-text window kept by `SentenceBreaksState`; each variant stands for a piece of the rule table at:
+ // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
+ #[derive(Clone, Copy, PartialEq, Eq)]
+ enum StatePart {
+ Sot, // start of text: fills every slot of `INITIAL_STATE`
+ Eot, // end of text: appended by `SentenceBreaksState::end`
+ Other, // any category that has no dedicated variant below
+ CR, // a character of category SC_CR
+ LF, // a character of category SC_LF
+ Sep, // a character of category SC_Sep
+ ATerm, // a character of category SC_ATerm
+ UpperLower, // a character of category SC_Upper or SC_Lower
+ ClosePlus, // one or more consecutive SC_Close characters (runs are collapsed)
+ SpPlus, // one or more consecutive SC_Sp characters (runs are collapsed)
+ STerm, // a character of category SC_STerm
+ }
+
+ #[derive(Clone, PartialEq, Eq)]
+ struct SentenceBreaksState(pub [StatePart; 4]); // sliding window of the four most recent parts; index 3 is the newest, index 0 the oldest
+
+ // Window contents before any input has been read: every slot holds the
+ // start-of-text marker.
+ const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([StatePart::Sot; 4]);
+
+ #[derive(Clone)]
+ pub struct SentenceBreaks<'a> {
+ pub string: &'a str, // the text being segmented; yielded break offsets index into it
+ pos: usize, // byte offset of the next unread character in `string`
+ state: SentenceBreaksState, // window of parts for the text that precedes `pos`
+ }
+
+ impl SentenceBreaksState {
+     // Produce the window that results from reading one more character of
+     // category `cat`. A Close that directly follows a run of Close, or an Sp
+     // that directly follows a run of Sp, is absorbed into the existing slot,
+     // so the window comes back unchanged in those two cases.
+     fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
+         let window = self.0;
+         match (window[3], cat) {
+             (StatePart::ClosePlus, SentenceCat::SC_Close)
+             | (StatePart::SpPlus, SentenceCat::SC_Sp) => return SentenceBreaksState(window),
+             _ => {}
+         }
+
+         // Categories without a dedicated part all map to `Other`.
+         let incoming = match cat {
+             SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower,
+             SentenceCat::SC_ATerm => StatePart::ATerm,
+             SentenceCat::SC_STerm => StatePart::STerm,
+             SentenceCat::SC_Close => StatePart::ClosePlus,
+             SentenceCat::SC_Sp => StatePart::SpPlus,
+             SentenceCat::SC_CR => StatePart::CR,
+             SentenceCat::SC_LF => StatePart::LF,
+             SentenceCat::SC_Sep => StatePart::Sep,
+             _ => StatePart::Other,
+         };
+
+         // Drop the oldest slot and append the new part as the newest one.
+         SentenceBreaksState([window[1], window[2], window[3], incoming])
+     }
+
+     // Shift the window once more and mark the newest slot as end of text.
+     fn end(&self) -> SentenceBreaksState {
+         let [_, second, third, newest] = self.0;
+         SentenceBreaksState([second, third, newest, StatePart::Eot])
+     }
+
+     // True when the newest slot of the window is `part`.
+     fn match1(&self, part: StatePart) -> bool {
+         self.0[3] == part
+     }
+
+     // True when the two newest slots of the window are `part1` followed by
+     // `part2`.
+     fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
+         self.0[2] == part1 && self.0[3] == part2
+     }
+ }
+
+ // https://unicode.org/reports/tr29/#SB8
+ // True when the window ends in `ATerm Close* Sp*` and the text in `ahead`
+ // reaches a Lower character before any OLetter, Upper, Sep, CR, LF, STerm
+ // or ATerm character.
+ // TODO cache this, it is currently quadratic
+ fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
+     use crate::tables::sentence as se;
+
+     let window = state.0;
+     // Walk back from the newest slot over an optional Sp+ and then an
+     // optional Close+.
+     let mut cursor = 3;
+     if window[cursor] == StatePart::SpPlus {
+         cursor -= 1;
+     }
+     if window[cursor] == StatePart::ClosePlus {
+         cursor -= 1;
+     }
+     if window[cursor] != StatePart::ATerm {
+         return false;
+     }
+
+     // ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
+     for ch in ahead.chars() {
+         match se::sentence_category(ch).2 {
+             se::SC_Lower => return true,
+             se::SC_OLetter
+             | se::SC_Upper
+             | se::SC_Sep
+             | se::SC_CR
+             | se::SC_LF
+             | se::SC_STerm
+             | se::SC_ATerm => return false,
+             _ => {}
+         }
+     }
+
+     // Ran out of text without meeting a Lower character.
+     false
+ }
+
+ // https://unicode.org/reports/tr29/#SB8a
+ // True when the window ends in `SATerm Close* Sp*`, where SATerm is either
+ // STerm or ATerm.
+ fn match_sb8a(state: &SentenceBreaksState) -> bool {
+     let window = state.0;
+     // Step back over an optional Sp+ and then an optional Close+.
+     let mut cursor = 3;
+     if window[cursor] == StatePart::SpPlus {
+         cursor -= 1;
+     }
+     if window[cursor] == StatePart::ClosePlus {
+         cursor -= 1;
+     }
+     match window[cursor] {
+         StatePart::STerm | StatePart::ATerm => true,
+         _ => false,
+     }
+ }
+
+ // https://unicode.org/reports/tr29/#SB9
+ // True when the window ends in `SATerm Close*`, where SATerm is either
+ // STerm or ATerm.
+ fn match_sb9(state: &SentenceBreaksState) -> bool {
+     let window = state.0;
+     // Look through a trailing Close+ run, if there is one.
+     let candidate = match window[3] {
+         StatePart::ClosePlus => window[2],
+         newest => newest,
+     };
+     match candidate {
+         StatePart::STerm | StatePart::ATerm => true,
+         _ => false,
+     }
+ }
+
+ // https://unicode.org/reports/tr29/#SB11
+ // True when the window ends in `SATerm Close* Sp* ParaSep?`, where SATerm
+ // is STerm or ATerm and ParaSep is Sep, CR or LF.
+ fn match_sb11(state: &SentenceBreaksState) -> bool {
+     let window = state.0;
+     let mut cursor = 3;
+
+     // Optional single paragraph separator.
+     match window[cursor] {
+         StatePart::Sep | StatePart::CR | StatePart::LF => cursor -= 1,
+         _ => {}
+     }
+     // Optional Sp+ run, then optional Close+ run.
+     if window[cursor] == StatePart::SpPlus {
+         cursor -= 1;
+     }
+     if window[cursor] == StatePart::ClosePlus {
+         cursor -= 1;
+     }
+
+     match window[cursor] {
+         StatePart::STerm | StatePart::ATerm => true,
+         _ => false,
+     }
+ }
+
+ impl<'a> Iterator for SentenceBreaks<'a> {
+ // Returns the index of the character which follows a break: byte offset 0 first (SB1), then each interior break, and finally `string.len()` (SB2). An empty string yields nothing.
+ type Item = usize;
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ let slen = self.string.len();
+ // A sentence could be one character. NOTE(review): both bounds come from the whole string, not from the text after `pos`, so the hint does not shrink as breaks are consumed.
+ (cmp::min(slen, 2), Some(slen + 1))
+ }
+
+ #[inline]
+ fn next(&mut self) -> Option<usize> {
+ use crate::tables::sentence as se;
+
+ for next_char in self.string[self.pos..].chars() {
+ let position_before = self.pos; // byte offset where `next_char` starts; this is the candidate break
+ let state_before = self.state.clone(); // window before `next_char` is folded in; every rule below tests this
+
+ let next_cat = se::sentence_category(next_char).2; // only the category (third tuple field) is used
+
+ self.pos += next_char.len_utf8(); // advance first, so an early `return` leaves the iterator past `next_char`
+ self.state = self.state.next(next_cat);
+
+ match next_cat {
+ // SB1 https://unicode.org/reports/tr29/#SB1 (break at start of text; only the very first character sees Sot as the newest part)
+ _ if state_before.match1(StatePart::Sot) => return Some(position_before),
+
+ // SB2 is handled when inner iterator (chars) is finished
+
+ // SB3 https://unicode.org/reports/tr29/#SB3 (never break between CR and LF)
+ SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,
+
+ // SB4 https://unicode.org/reports/tr29/#SB4 (always break after a paragraph separator)
+ _ if state_before.match1(StatePart::Sep)
+ || state_before.match1(StatePart::CR)
+ || state_before.match1(StatePart::LF) =>
+ {
+ return Some(position_before)
+ }
+
+ // SB5 https://unicode.org/reports/tr29/#SB5 (restore the old window so Extend/Format characters stay invisible to later rules)
+ SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before,
+
+ // SB6 https://unicode.org/reports/tr29/#SB6 (no break between ATerm and a following Numeric)
+ SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,
+
+ // SB7 https://unicode.org/reports/tr29/#SB7 (no break in `(Upper | Lower) ATerm` followed by Upper)
+ SentenceCat::SC_Upper
+ if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
+ {
+ continue
+ }
+
+ // SB8 https://unicode.org/reports/tr29/#SB8 (the lookahead text starts at `next_char` itself)
+ _ if match_sb8(&state_before, &self.string[position_before..]) => continue,
+
+ // SB8a https://unicode.org/reports/tr29/#SB8a (no break before SContinue or SATerm after `SATerm Close* Sp*`)
+ SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm
+ if match_sb8a(&state_before) =>
+ {
+ continue
+ }
+
+ // SB9 https://unicode.org/reports/tr29/#SB9 (no break before Close, Sp or ParaSep after `SATerm Close*`)
+ SentenceCat::SC_Close
+ | SentenceCat::SC_Sp
+ | SentenceCat::SC_Sep
+ | SentenceCat::SC_CR
+ | SentenceCat::SC_LF
+ if match_sb9(&state_before) =>
+ {
+ continue
+ }
+
+ // SB10 https://unicode.org/reports/tr29/#SB10 (reuses `match_sb8a`: both rules need `SATerm Close* Sp*` before the character)
+ SentenceCat::SC_Sp
+ | SentenceCat::SC_Sep
+ | SentenceCat::SC_CR
+ | SentenceCat::SC_LF
+ if match_sb8a(&state_before) =>
+ {
+ continue
+ }
+
+ // SB11 https://unicode.org/reports/tr29/#SB11 (break after `SATerm Close* Sp* ParaSep?` when no earlier rule applied)
+ _ if match_sb11(&state_before) => return Some(position_before),
+
+ // SB998 https://unicode.org/reports/tr29/#SB998 (otherwise, do not break)
+ _ => continue,
+ }
+ }
+
+ // SB2 https://unicode.org/reports/tr29/#SB2 (reached only once the remaining text is exhausted)
+ if self.state.match1(StatePart::Sot) { // no character was ever consumed: empty input has no breaks
+ None
+ } else if self.state.match1(StatePart::Eot) { // the end-of-text break was already yielded
+ None
+ } else {
+ self.state = self.state.end(); // mark Eot so later calls return None
+ Some(self.pos)
+ }
+ }
+ }
+
+ // Create a break iterator over `source`, positioned at byte 0 with a window
+ // that holds only start-of-text markers.
+ pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
+     let string = source;
+     SentenceBreaks {
+         string,
+         pos: 0,
+         state: INITIAL_STATE,
+     }
+ }
+}
+
+/// An iterator over the substrings of a string which, after splitting the string on
+/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
+/// contain any characters with the
+/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
+/// property, or with
+/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
+///
+/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
+/// trait. See its documentation for more.
+///
+/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
+/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
+#[derive(Clone)]
+pub struct UnicodeSentences<'a> {
+ inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>, // every sentence segment, filtered through a plain `fn` pointer so the field type can be written out
+}
+
+/// External iterator for a string's
+/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
+///
+/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
+/// trait. See its documentation for more.
+///
+/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
+/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
+#[derive(Clone)]
+pub struct USentenceBounds<'a> {
+ iter: fwd::SentenceBreaks<'a>, // source of break offsets; also holds the string that gets sliced
+ sentence_start: Option<usize>, // byte offset where the next sentence begins; `None` until the first break has been read
+}
+
+/// External iterator for sentence boundaries and byte offsets.
+///
+/// This struct is created by the [`split_sentence_bound_indices`] method on the
+/// [`UnicodeSegmentation`] trait. See its documentation for more.
+///
+/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
+/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
+#[derive(Clone)]
+pub struct USentenceBoundIndices<'a> {
+ start_offset: usize, // address of the source string's first byte, stored as an integer
+ iter: USentenceBounds<'a>, // yields the sentence slices whose offsets are reported
+}
+
+// Builds a `USentenceBounds` over `source`. No break is read up front: the
+// start of the first sentence is fetched on the first call to `next`.
+#[inline]
+pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
+    let iter = fwd::new_sentence_breaks(source);
+    USentenceBounds {
+        iter,
+        sentence_start: None,
+    }
+}
+
+// Builds a `USentenceBoundIndices` over `source`.
+#[inline]
+pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
+    // Remember the address of the first byte so that each item's offset can
+    // later be recovered by pointer subtraction.
+    let start_offset = source.as_ptr() as usize;
+    let iter = new_sentence_bounds(source);
+    USentenceBoundIndices { start_offset, iter }
+}
+
+// Builds a `UnicodeSentences` over `s`: the sentence segments of `s`, minus
+// those that contain no alphanumeric character.
+#[inline]
+pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
+    use super::UnicodeSegmentation;
+    use crate::tables::util::is_alphanumeric;
+
+    // Predicate: does the segment hold at least one alphanumeric character?
+    fn has_alphanumeric(segment: &&str) -> bool {
+        segment.chars().any(|ch| is_alphanumeric(ch))
+    }
+    // Coerce the fn item to a plain `fn` pointer, matching the type named in
+    // the `inner` field of `UnicodeSentences`.
+    let keep: fn(&&str) -> bool = has_alphanumeric;
+
+    let inner = s.split_sentence_bounds().filter(keep);
+    UnicodeSentences { inner }
+}
+
+impl<'a> Iterator for UnicodeSentences<'a> {
+ type Item = &'a str;
+
+ #[inline]
+ fn next(&mut self) -> Option<&'a str> {
+ self.inner.next() // delegate to the filtered bounds iterator; segments rejected by the predicate are skipped there
+ }
+}
+
+impl<'a> Iterator for USentenceBounds<'a> {
+    type Item = &'a str;
+
+    /// Derives the bounds from the underlying break iterator: N break offsets
+    /// delimit N - 1 sentences, so one is subtracted from each bound.
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (lower, upper) = self.iter.size_hint();
+        // The inner lower bound is 0 for an empty string, so a plain `lower - 1`
+        // would underflow (panic in debug builds, `usize::MAX` in release).
+        // `saturating_sub` keeps a zero bound at zero.
+        (lower.saturating_sub(1), upper.map(|u| u.saturating_sub(1)))
+    }
+
+    /// Returns the next sentence, i.e. the text between two consecutive break
+    /// offsets, or `None` once the break iterator is exhausted.
+    #[inline]
+    fn next(&mut self) -> Option<&'a str> {
+        // On the first call the opening break has not been read yet.
+        let start_pos = match self.sentence_start {
+            Some(pos) => pos,
+            None => self.iter.next()?,
+        };
+        // Store the start before asking for the closing break, so it is kept
+        // even when that request returns `None`.
+        self.sentence_start = Some(start_pos);
+
+        let break_pos = self.iter.next()?;
+        // The end of this sentence is the start of the next one.
+        self.sentence_start = Some(break_pos);
+        Some(&self.iter.string[start_pos..break_pos])
+    }
+}
+
+impl<'a> Iterator for USentenceBoundIndices<'a> {
+    type Item = (usize, &'a str);
+
+    // Yields each sentence together with its byte offset in the source string.
+    #[inline]
+    fn next(&mut self) -> Option<(usize, &'a str)> {
+        let sentence = self.iter.next()?;
+        // The offset is the slice's address minus the stored address of the
+        // source string's first byte.
+        let offset = sentence.as_ptr() as usize - self.start_offset;
+        Some((offset, sentence))
+    }
+
+    // One item per sentence, so the wrapped iterator's hint applies as is.
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.iter.size_hint()
+    }
+}