summaryrefslogtreecommitdiffstats
path: root/third_party/rust/icu_segmenter/src/line.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/icu_segmenter/src/line.rs')
-rw-r--r--third_party/rust/icu_segmenter/src/line.rs1641
1 files changed, 1641 insertions, 0 deletions
diff --git a/third_party/rust/icu_segmenter/src/line.rs b/third_party/rust/icu_segmenter/src/line.rs
new file mode 100644
index 0000000000..f93e31b13d
--- /dev/null
+++ b/third_party/rust/icu_segmenter/src/line.rs
@@ -0,0 +1,1641 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use crate::complex::*;
+use crate::indices::*;
+use crate::provider::*;
+use crate::symbols::*;
+use crate::SegmenterError;
+use alloc::string::String;
+use alloc::vec;
+use alloc::vec::Vec;
+use core::char;
+use core::str::CharIndices;
+use icu_provider::prelude::*;
+use utf8_iter::Utf8CharIndices;
+
+/// An enum specifies the strictness of line-breaking rules. It can be passed as
+/// an argument when creating a line segmenter.
+///
+/// Each enum value has the same meaning with respect to the `line-break`
+/// property values in the CSS Text spec. See the details in
+/// <https://drafts.csswg.org/css-text-3/#line-break-property>.
+#[non_exhaustive]
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub enum LineBreakStrictness {
+ /// Breaks text using the least restrictive set of line-breaking rules.
+ /// Typically used for short lines, such as in newspapers.
+ /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-loose>
+ Loose,
+
+ /// Breaks text using the most common set of line-breaking rules.
+ /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-normal>
+ Normal,
+
+ /// Breaks text using the most stringent set of line-breaking rules.
+ /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-strict>
+ ///
+ /// This is the default behaviour of the Unicode Line Breaking Algorithm,
+ /// resolving class [CJ](https://www.unicode.org/reports/tr14/#CJ) to
+ /// [NS](https://www.unicode.org/reports/tr14/#NS);
+ /// see rule [LB1](https://www.unicode.org/reports/tr14/#LB1).
+ Strict,
+
+ /// Breaks text assuming there is a soft wrap opportunity around every
+ /// typographic character unit, disregarding any prohibition against line
+ /// breaks. See more details in
+ /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-anywhere>.
+ Anywhere,
+}
+
+/// An enum specifies the line break opportunities between letters. It can be
+/// passed as an argument when creating a line segmenter.
+///
+/// Each enum value has the same meaning with respect to the `word-break`
+/// property values in the CSS Text spec. See the details in
+/// <https://drafts.csswg.org/css-text-3/#word-break-property>
+#[non_exhaustive]
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub enum LineBreakWordOption {
+ /// Words break according to their customary rules. See the details in
+ /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-normal>.
+ Normal,
+
+ /// Breaking is allowed within "words".
+ /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-break-all>
+ BreakAll,
+
+ /// Breaking is forbidden within "word".
+ /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-keep-all>
+ KeepAll,
+}
+
+/// Options to tailor line-breaking behavior.
+#[non_exhaustive]
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub struct LineBreakOptions {
+ /// Strictness of line-breaking rules. See [`LineBreakStrictness`].
+ pub strictness: LineBreakStrictness,
+
+ /// Line break opportunities between letters. See [`LineBreakWordOption`].
+ pub word_option: LineBreakWordOption,
+
+ /// Use `true` as a hint to the line segmenter that the writing
+ /// system is Chinese or Japanese. This allows more break opportunities when
+ /// `LineBreakStrictness` is `Normal` or `Loose`. See
+ /// <https://drafts.csswg.org/css-text-3/#line-break-property> for details.
+ ///
+ /// This option has no effect in Latin-1 mode.
+ pub ja_zh: bool,
+}
+
+impl Default for LineBreakOptions {
+ fn default() -> Self {
+ Self {
+ strictness: LineBreakStrictness::Strict,
+ word_option: LineBreakWordOption::Normal,
+ ja_zh: false,
+ }
+ }
+}
+
+/// Line break iterator for an `str` (a UTF-8 string).
+///
+/// For examples of use, see [`LineSegmenter`].
+pub type LineBreakIteratorUtf8<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeUtf8>;
+
+/// Line break iterator for a potentially invalid UTF-8 string.
+///
+/// For examples of use, see [`LineSegmenter`].
+pub type LineBreakIteratorPotentiallyIllFormedUtf8<'l, 's> =
+ LineBreakIterator<'l, 's, LineBreakTypePotentiallyIllFormedUtf8>;
+
+/// Line break iterator for a Latin-1 (8-bit) string.
+///
+/// For examples of use, see [`LineSegmenter`].
+pub type LineBreakIteratorLatin1<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeLatin1>;
+
+/// Line break iterator for a UTF-16 string.
+///
+/// For examples of use, see [`LineSegmenter`].
+pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeUtf16>;
+
+/// Supports loading line break data, and creating line break iterators for different string
+/// encodings.
+///
+/// The segmenter returns mandatory breaks (as defined by [definition LD7][LD7] of
+/// Unicode Standard Annex #14, _Unicode Line Breaking Algorithm_) as well as
+/// line break opportunities ([definition LD3][LD3]).
+/// It does not distinguish them. Callers requiring that distinction can check
+/// the Line_Break property of the code point preceding the break against those
+/// listed in rules [LB4][LB4] and [LB5][LB5], special-casing the end of text
+/// according to [LB3][LB3].
+///
+/// For consistency with the grapheme, word, and sentence segmenters, there is
+/// always a breakpoint returned at index 0, but this breakpoint is not a
+/// meaningful line break opportunity.
+///
+/// [LD3]: https://www.unicode.org/reports/tr14/#LD3
+/// [LD7]: https://www.unicode.org/reports/tr14/#LD7
+/// [LB3]: https://www.unicode.org/reports/tr14/#LB3
+/// [LB4]: https://www.unicode.org/reports/tr14/#LB4
+/// [LB5]: https://www.unicode.org/reports/tr14/#LB5
+///
+/// ```rust
+/// # use icu_segmenter::LineSegmenter;
+/// #
+/// # let segmenter = LineSegmenter::new_auto();
+/// #
+/// let text = "Summary\r\nThis annex…";
+/// let breakpoints: Vec<usize> = segmenter.segment_str(text).collect();
+/// // 9 and 22 are mandatory breaks, 14 is a line break opportunity.
+/// assert_eq!(&breakpoints, &[0, 9, 14, 22]);
+/// ```
+///
+/// # Examples
+///
+/// Segment a string with default options:
+///
+/// ```rust
+/// use icu_segmenter::LineSegmenter;
+///
+/// let segmenter = LineSegmenter::new_auto();
+///
+/// let breakpoints: Vec<usize> =
+/// segmenter.segment_str("Hello World").collect();
+/// assert_eq!(&breakpoints, &[0, 6, 11]);
+/// ```
+///
+/// Segment a string with CSS option overrides:
+///
+/// ```rust
+/// use icu_segmenter::{
+/// LineBreakOptions, LineBreakStrictness, LineBreakWordOption,
+/// LineSegmenter,
+/// };
+///
+/// let mut options = LineBreakOptions::default();
+/// options.strictness = LineBreakStrictness::Strict;
+/// options.word_option = LineBreakWordOption::BreakAll;
+/// options.ja_zh = false;
+/// let segmenter = LineSegmenter::new_auto_with_options(options);
+///
+/// let breakpoints: Vec<usize> =
+/// segmenter.segment_str("Hello World").collect();
+/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11]);
+/// ```
+///
+/// Segment a Latin1 byte string:
+///
+/// ```rust
+/// use icu_segmenter::LineSegmenter;
+///
+/// let segmenter = LineSegmenter::new_auto();
+///
+/// let breakpoints: Vec<usize> =
+/// segmenter.segment_latin1(b"Hello World").collect();
+/// assert_eq!(&breakpoints, &[0, 6, 11]);
+/// ```
+///
+/// Separate mandatory breaks from the break opportunities:
+///
+/// ```rust
+/// use icu::properties::{maps, LineBreak};
+/// use icu_segmenter::LineSegmenter;
+///
+/// # let segmenter = LineSegmenter::new_auto();
+/// #
+/// let text = "Summary\r\nThis annex…";
+///
+/// let mandatory_breaks: Vec<usize> = segmenter
+/// .segment_str(text)
+/// .into_iter()
+/// .filter(|&i| {
+/// text[..i].chars().next_back().map_or(false, |c| {
+/// matches!(
+/// maps::line_break().get(c),
+/// LineBreak::MandatoryBreak
+/// | LineBreak::CarriageReturn
+/// | LineBreak::LineFeed
+/// | LineBreak::NextLine
+/// ) || i == text.len()
+/// })
+/// })
+/// .collect();
+/// assert_eq!(&mandatory_breaks, &[9, 22]);
+/// ```
+#[derive(Debug)]
+pub struct LineSegmenter {
+ options: LineBreakOptions,
+ payload: DataPayload<LineBreakDataV1Marker>,
+ complex: ComplexPayloads,
+}
+
+impl LineSegmenter {
+ /// Constructs a [`LineSegmenter`] with an invariant locale and the best available compiled data for
+ /// complex scripts (Khmer, Lao, Myanmar, and Thai).
+ ///
+ /// The current behavior, which is subject to change, is to use the LSTM model when available.
+ ///
+ /// See also [`Self::new_auto_with_options`].
+ ///
+ /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ #[cfg(feature = "compiled_data")]
+ #[cfg(feature = "auto")]
+ pub fn new_auto() -> Self {
+ Self::new_auto_with_options(Default::default())
+ }
+
+ #[cfg(feature = "auto")]
+ icu_provider::gen_any_buffer_data_constructors!(
+ locale: skip,
+ options: skip,
+ error: SegmenterError,
+ #[cfg(skip)]
+ functions: [
+ new_auto,
+ try_new_auto_with_any_provider,
+ try_new_auto_with_buffer_provider,
+ try_new_auto_unstable,
+ Self,
+ ]
+ );
+
+ #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
+ #[cfg(feature = "auto")]
+ pub fn try_new_auto_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
+ where
+ D: DataProvider<LineBreakDataV1Marker>
+ + DataProvider<LstmForWordLineAutoV1Marker>
+ + DataProvider<GraphemeClusterBreakDataV1Marker>
+ + ?Sized,
+ {
+ Self::try_new_auto_with_options_unstable(provider, Default::default())
+ }
+
+ /// Constructs a [`LineSegmenter`] with an invariant locale and compiled LSTM data for
+ /// complex scripts (Khmer, Lao, Myanmar, and Thai).
+ ///
+ /// The LSTM, or Long Term Short Memory, is a machine learning model. It is smaller than
+ /// the full dictionary but more expensive during segmentation (inference).
+ ///
+ /// See also [`Self::new_lstm_with_options`].
+ ///
+ /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ #[cfg(feature = "compiled_data")]
+ #[cfg(feature = "lstm")]
+ pub fn new_lstm() -> Self {
+ Self::new_lstm_with_options(Default::default())
+ }
+
+ #[cfg(feature = "lstm")]
+ icu_provider::gen_any_buffer_data_constructors!(
+ locale: skip,
+ options: skip,
+ error: SegmenterError,
+ #[cfg(skip)]
+ functions: [
+ new_lstm,
+ try_new_lstm_with_any_provider,
+ try_new_lstm_with_buffer_provider,
+ try_new_lstm_unstable,
+ Self,
+ ]
+ );
+
+ #[cfg(feature = "lstm")]
+ #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
+ pub fn try_new_lstm_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
+ where
+ D: DataProvider<LineBreakDataV1Marker>
+ + DataProvider<LstmForWordLineAutoV1Marker>
+ + DataProvider<GraphemeClusterBreakDataV1Marker>
+ + ?Sized,
+ {
+ Self::try_new_lstm_with_options_unstable(provider, Default::default())
+ }
+
+ /// Constructs a [`LineSegmenter`] with an invariant locale and compiled dictionary data for
+ /// complex scripts (Khmer, Lao, Myanmar, and Thai).
+ ///
+ /// The dictionary model uses a list of words to determine appropriate breakpoints. It is
+ /// faster than the LSTM model but requires more data.
+ ///
+ /// See also [`Self::new_dictionary_with_options`].
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ #[cfg(feature = "compiled_data")]
+ pub fn new_dictionary() -> Self {
+ Self::new_dictionary_with_options(Default::default())
+ }
+
+ icu_provider::gen_any_buffer_data_constructors!(
+ locale: skip,
+ options: skip,
+ error: SegmenterError,
+ #[cfg(skip)]
+ functions: [
+ new_dictionary,
+ try_new_dictionary_with_any_provider,
+ try_new_dictionary_with_buffer_provider,
+ try_new_dictionary_unstable,
+ Self,
+ ]
+ );
+
+ #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
+ pub fn try_new_dictionary_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
+ where
+ D: DataProvider<LineBreakDataV1Marker>
+ + DataProvider<DictionaryForWordLineExtendedV1Marker>
+ + DataProvider<GraphemeClusterBreakDataV1Marker>
+ + ?Sized,
+ {
+ Self::try_new_dictionary_with_options_unstable(provider, Default::default())
+ }
+
+ /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
+ /// the best available compiled data for complex scripts (Khmer, Lao, Myanmar, and Thai).
+ ///
+ /// The current behavior, which is subject to change, is to use the LSTM model when available.
+ ///
+ /// See also [`Self::new_auto`].
+ ///
+ /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ #[cfg(feature = "auto")]
+ #[cfg(feature = "compiled_data")]
+ pub fn new_auto_with_options(options: LineBreakOptions) -> Self {
+ Self::new_lstm_with_options(options)
+ }
+
+ #[cfg(feature = "auto")]
+ icu_provider::gen_any_buffer_data_constructors!(
+ locale: skip,
+ options: LineBreakOptions,
+ error: SegmenterError,
+ #[cfg(skip)]
+ functions: [
+ new_auto_with_options,
+ try_new_auto_with_options_with_any_provider,
+ try_new_auto_with_options_with_buffer_provider,
+ try_new_auto_with_options_unstable,
+ Self,
+ ]
+ );
+
+ #[cfg(feature = "auto")]
+ #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_auto_with_options)]
+ pub fn try_new_auto_with_options_unstable<D>(
+ provider: &D,
+ options: LineBreakOptions,
+ ) -> Result<Self, SegmenterError>
+ where
+ D: DataProvider<LineBreakDataV1Marker>
+ + DataProvider<LstmForWordLineAutoV1Marker>
+ + DataProvider<GraphemeClusterBreakDataV1Marker>
+ + ?Sized,
+ {
+ Self::try_new_lstm_with_options_unstable(provider, options)
+ }
+
+ /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
+ /// compiled LSTM data for complex scripts (Khmer, Lao, Myanmar, and Thai).
+ ///
+ /// The LSTM, or Long Term Short Memory, is a machine learning model. It is smaller than
+ /// the full dictionary but more expensive during segmentation (inference).
+ ///
+ /// See also [`Self::new_dictionary`].
+ ///
+ /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ #[cfg(feature = "lstm")]
+ #[cfg(feature = "compiled_data")]
+ pub fn new_lstm_with_options(options: LineBreakOptions) -> Self {
+ Self {
+ options,
+ payload: DataPayload::from_static_ref(
+ crate::provider::Baked::SINGLETON_SEGMENTER_LINE_V1,
+ ),
+ complex: ComplexPayloads::new_lstm(),
+ }
+ }
+
+ #[cfg(feature = "lstm")]
+ icu_provider::gen_any_buffer_data_constructors!(
+ locale: skip,
+ options: LineBreakOptions,
+ error: SegmenterError,
+ #[cfg(skip)]
+ functions: [
+ try_new_lstm_with_options,
+ try_new_lstm_with_options_with_any_provider,
+ try_new_lstm_with_options_with_buffer_provider,
+ try_new_lstm_with_options_unstable,
+ Self,
+ ]
+ );
+
+ #[cfg(feature = "lstm")]
+ #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_lstm_with_options)]
+ pub fn try_new_lstm_with_options_unstable<D>(
+ provider: &D,
+ options: LineBreakOptions,
+ ) -> Result<Self, SegmenterError>
+ where
+ D: DataProvider<LineBreakDataV1Marker>
+ + DataProvider<LstmForWordLineAutoV1Marker>
+ + DataProvider<GraphemeClusterBreakDataV1Marker>
+ + ?Sized,
+ {
+ Ok(Self {
+ options,
+ payload: provider.load(Default::default())?.take_payload()?,
+ complex: ComplexPayloads::try_new_lstm(provider)?,
+ })
+ }
+
+ /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
+ /// compiled dictionary data for complex scripts (Khmer, Lao, Myanmar, and Thai).
+ ///
+ /// The dictionary model uses a list of words to determine appropriate breakpoints. It is
+ /// faster than the LSTM model but requires more data.
+ ///
+ /// See also [`Self::new_dictionary`].
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ #[cfg(feature = "compiled_data")]
+ pub fn new_dictionary_with_options(options: LineBreakOptions) -> Self {
+ Self {
+ options,
+ payload: DataPayload::from_static_ref(
+ crate::provider::Baked::SINGLETON_SEGMENTER_LINE_V1,
+ ),
+ // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK
+ // characters [1]. Southeast Asian languages however require complex context analysis
+ // [2].
+ //
+ // [1]: https://www.unicode.org/reports/tr14/#ID
+ // [2]: https://www.unicode.org/reports/tr14/#SA
+ complex: ComplexPayloads::new_southeast_asian(),
+ }
+ }
+
+ icu_provider::gen_any_buffer_data_constructors!(
+ locale: skip,
+ options: LineBreakOptions,
+ error: SegmenterError,
+ #[cfg(skip)]
+ functions: [
+ new_dictionary_with_options,
+ try_new_dictionary_with_options_with_any_provider,
+ try_new_dictionary_with_options_with_buffer_provider,
+ try_new_dictionary_with_options_unstable,
+ Self,
+ ]
+ );
+
+ #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary_with_options)]
+ pub fn try_new_dictionary_with_options_unstable<D>(
+ provider: &D,
+ options: LineBreakOptions,
+ ) -> Result<Self, SegmenterError>
+ where
+ D: DataProvider<LineBreakDataV1Marker>
+ + DataProvider<DictionaryForWordLineExtendedV1Marker>
+ + DataProvider<GraphemeClusterBreakDataV1Marker>
+ + ?Sized,
+ {
+ Ok(Self {
+ options,
+ payload: provider.load(Default::default())?.take_payload()?,
+ // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK
+ // characters [1]. Southeast Asian languages however require complex context analysis
+ // [2].
+ //
+ // [1]: https://www.unicode.org/reports/tr14/#ID
+ // [2]: https://www.unicode.org/reports/tr14/#SA
+ complex: ComplexPayloads::try_new_southeast_asian(provider)?,
+ })
+ }
+
+ /// Creates a line break iterator for an `str` (a UTF-8 string).
+ ///
+ /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
+ pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> LineBreakIteratorUtf8<'l, 's> {
+ LineBreakIterator {
+ iter: input.char_indices(),
+ len: input.len(),
+ current_pos_data: None,
+ result_cache: Vec::new(),
+ data: self.payload.get(),
+ options: &self.options,
+ complex: &self.complex,
+ }
+ }
+ /// Creates a line break iterator for a potentially ill-formed UTF8 string
+ ///
+ /// Invalid characters are treated as REPLACEMENT CHARACTER
+ ///
+ /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
+ pub fn segment_utf8<'l, 's>(
+ &'l self,
+ input: &'s [u8],
+ ) -> LineBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
+ LineBreakIterator {
+ iter: Utf8CharIndices::new(input),
+ len: input.len(),
+ current_pos_data: None,
+ result_cache: Vec::new(),
+ data: self.payload.get(),
+ options: &self.options,
+ complex: &self.complex,
+ }
+ }
+ /// Creates a line break iterator for a Latin-1 (8-bit) string.
+ ///
+ /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
+ pub fn segment_latin1<'l, 's>(&'l self, input: &'s [u8]) -> LineBreakIteratorLatin1<'l, 's> {
+ LineBreakIterator {
+ iter: Latin1Indices::new(input),
+ len: input.len(),
+ current_pos_data: None,
+ result_cache: Vec::new(),
+ data: self.payload.get(),
+ options: &self.options,
+ complex: &self.complex,
+ }
+ }
+
+ /// Creates a line break iterator for a UTF-16 string.
+ ///
+ /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
+ pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> LineBreakIteratorUtf16<'l, 's> {
+ LineBreakIterator {
+ iter: Utf16Indices::new(input),
+ len: input.len(),
+ current_pos_data: None,
+ result_cache: Vec::new(),
+ data: self.payload.get(),
+ options: &self.options,
+ complex: &self.complex,
+ }
+ }
+}
+
+fn get_linebreak_property_utf32_with_rule(
+ property_table: &RuleBreakPropertyTable<'_>,
+ codepoint: u32,
+ strictness: LineBreakStrictness,
+ word_option: LineBreakWordOption,
+) -> u8 {
+ // Note: Default value is 0 == UNKNOWN
+ let prop = property_table.0.get32(codepoint);
+
+ if word_option == LineBreakWordOption::BreakAll
+ || strictness == LineBreakStrictness::Loose
+ || strictness == LineBreakStrictness::Normal
+ {
+ return match prop {
+ CJ => ID, // All CJ's General_Category is Other_Letter (Lo).
+ _ => prop,
+ };
+ }
+
+ // CJ is treated as NS by default, yielding strict line breaking.
+ // https://www.unicode.org/reports/tr14/#CJ
+ prop
+}
+
+#[inline]
+fn get_linebreak_property_latin1(property_table: &RuleBreakPropertyTable<'_>, codepoint: u8) -> u8 {
+ // Note: Default value is 0 == UNKNOWN
+ property_table.0.get32(codepoint as u32)
+}
+
+#[inline]
+fn get_linebreak_property_with_rule(
+ property_table: &RuleBreakPropertyTable<'_>,
+ codepoint: char,
+ linebreak_rule: LineBreakStrictness,
+ wordbreak_rule: LineBreakWordOption,
+) -> u8 {
+ get_linebreak_property_utf32_with_rule(
+ property_table,
+ codepoint as u32,
+ linebreak_rule,
+ wordbreak_rule,
+ )
+}
+
+#[inline]
+fn is_break_utf32_by_normal(codepoint: u32, ja_zh: bool) -> bool {
+ match codepoint {
+ 0x301C => ja_zh,
+ 0x30A0 => ja_zh,
+ _ => false,
+ }
+}
+
+#[inline]
+fn is_break_utf32_by_loose(
+ right_codepoint: u32,
+ left_prop: u8,
+ right_prop: u8,
+ ja_zh: bool,
+) -> Option<bool> {
+ // breaks before hyphens
+ if right_prop == BA {
+ if left_prop == ID && (right_codepoint == 0x2010 || right_codepoint == 0x2013) {
+ return Some(true);
+ }
+ } else if right_prop == NS {
+ // breaks before certain CJK hyphen-like characters
+ if right_codepoint == 0x301C || right_codepoint == 0x30A0 {
+ return Some(ja_zh);
+ }
+
+ // breaks before iteration marks
+ if right_codepoint == 0x3005
+ || right_codepoint == 0x303B
+ || right_codepoint == 0x309D
+ || right_codepoint == 0x309E
+ || right_codepoint == 0x30FD
+ || right_codepoint == 0x30FE
+ {
+ return Some(true);
+ }
+
+ // breaks before certain centered punctuation marks:
+ if right_codepoint == 0x30FB
+ || right_codepoint == 0xFF1A
+ || right_codepoint == 0xFF1B
+ || right_codepoint == 0xFF65
+ || right_codepoint == 0x203C
+ || (0x2047..=0x2049).contains(&right_codepoint)
+ {
+ return Some(ja_zh);
+ }
+ } else if right_prop == IN {
+ // breaks between inseparable characters such as U+2025, U+2026 i.e. characters with the Unicode Line Break property IN
+ return Some(true);
+ } else if right_prop == EX {
+ // breaks before certain centered punctuation marks:
+ if right_codepoint == 0xFF01 || right_codepoint == 0xFF1F {
+ return Some(ja_zh);
+ }
+ }
+
+ // breaks before suffixes:
+ // Characters with the Unicode Line Break property PO and the East Asian Width property
+ if right_prop == PO_EAW {
+ return Some(ja_zh);
+ }
+ // breaks after prefixes:
+ // Characters with the Unicode Line Break property PR and the East Asian Width property
+ if left_prop == PR_EAW {
+ return Some(ja_zh);
+ }
+ None
+}
+
+#[inline]
+fn is_break_from_table(
+ break_state_table: &RuleBreakStateTable<'_>,
+ property_count: u8,
+ left: u8,
+ right: u8,
+) -> bool {
+ let rule = get_break_state_from_table(break_state_table, property_count, left, right);
+ if rule == KEEP_RULE {
+ return false;
+ }
+ if rule >= 0 {
+ // need additional next characters to get break rule.
+ return false;
+ }
+ true
+}
+
+#[inline]
+fn is_non_break_by_keepall(left: u8, right: u8) -> bool {
+ // typographic letter units shouldn't be break
+ (left == AI
+ || left == AL
+ || left == ID
+ || left == NU
+ || left == HY
+ || left == H2
+ || left == H3
+ || left == JL
+ || left == JV
+ || left == JT
+ || left == CJ)
+ && (right == AI
+ || right == AL
+ || right == ID
+ || right == NU
+ || right == HY
+ || right == H2
+ || right == H3
+ || right == JL
+ || right == JV
+ || right == JT
+ || right == CJ)
+}
+
+#[inline]
+fn get_break_state_from_table(
+ break_state_table: &RuleBreakStateTable<'_>,
+ property_count: u8,
+ left: u8,
+ right: u8,
+) -> i8 {
+ let idx = (left as usize) * (property_count as usize) + (right as usize);
+ // We use unwrap_or to fall back to the base case and prevent panics on bad data.
+ break_state_table.0.get(idx).unwrap_or(KEEP_RULE)
+}
+
+#[inline]
+fn use_complex_breaking_utf32(property_table: &RuleBreakPropertyTable<'_>, codepoint: u32) -> bool {
+ let line_break_property = get_linebreak_property_utf32_with_rule(
+ property_table,
+ codepoint,
+ LineBreakStrictness::Strict,
+ LineBreakWordOption::Normal,
+ );
+
+ line_break_property == SA
+}
+
+/*
+#[inline]
+fn use_complex_breaking_utf32(codepoint: u32) -> bool {
+ // Thai, Lao and Khmer
+ (codepoint >= 0xe01 && codepoint <= 0xeff) || (codepoint >= 0x1780 && codepoint <= 0x17ff)
+}
+*/
+
+/// A trait allowing for LineBreakIterator to be generalized to multiple string iteration methods.
+///
+/// This is implemented by ICU4X for several common string types.
+pub trait LineBreakType<'l, 's> {
+ /// The iterator over characters.
+ type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone;
+
+ /// The character type.
+ type CharType: Copy + Into<u32>;
+
+ fn use_complex_breaking(iterator: &LineBreakIterator<'l, 's, Self>, c: Self::CharType) -> bool;
+
+ fn get_linebreak_property_with_rule(
+ iterator: &LineBreakIterator<'l, 's, Self>,
+ c: Self::CharType,
+ ) -> u8;
+
+ fn get_current_position_character_len(iterator: &LineBreakIterator<'l, 's, Self>) -> usize;
+
+ fn handle_complex_language(
+ iterator: &mut LineBreakIterator<'l, 's, Self>,
+ left_codepoint: Self::CharType,
+ ) -> Option<usize>;
+}
+
+/// Implements the [`Iterator`] trait over the line break opportunities of the given string.
+///
+/// Lifetimes:
+///
+/// - `'l` = lifetime of the [`LineSegmenter`] object from which this iterator was created
+/// - `'s` = lifetime of the string being segmented
+///
+/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
+/// _after_ the break (for a break at the end of text, this index is the length
+/// of the [`str`] or array of code units).
+///
+/// For examples of use, see [`LineSegmenter`].
+#[derive(Debug)]
+pub struct LineBreakIterator<'l, 's, Y: LineBreakType<'l, 's> + ?Sized> {
+ iter: Y::IterAttr,
+ len: usize,
+ current_pos_data: Option<(usize, Y::CharType)>,
+ result_cache: Vec<usize>,
+ data: &'l RuleBreakDataV1<'l>,
+ options: &'l LineBreakOptions,
+ complex: &'l ComplexPayloads,
+}
+
+impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y> {
+ type Item = usize;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ match self.check_eof() {
+ StringBoundaryPosType::Start => return Some(0),
+ StringBoundaryPosType::End => return None,
+ _ => (),
+ }
+
+ // If we have break point cache by previous run, return this result
+ if let Some(&first_pos) = self.result_cache.first() {
+ let mut i = 0;
+ loop {
+ if i == first_pos {
+ self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
+ return self.get_current_position();
+ }
+ i += Y::get_current_position_character_len(self);
+ self.advance_iter();
+ if self.is_eof() {
+ self.result_cache.clear();
+ return Some(self.len);
+ }
+ }
+ }
+
+ loop {
+ debug_assert!(!self.is_eof());
+ let left_codepoint = self.get_current_codepoint()?;
+ let mut left_prop = self.get_linebreak_property(left_codepoint);
+ self.advance_iter();
+
+ let Some(right_codepoint) = self.get_current_codepoint() else {
+ return Some(self.len);
+ };
+ let right_prop = self.get_linebreak_property(right_codepoint);
+
+ // CSS word-break property handling
+ match self.options.word_option {
+ LineBreakWordOption::BreakAll => {
+ left_prop = match left_prop {
+ AL => ID,
+ NU => ID,
+ SA => ID,
+ _ => left_prop,
+ };
+ }
+ LineBreakWordOption::KeepAll => {
+ if is_non_break_by_keepall(left_prop, right_prop) {
+ continue;
+ }
+ }
+ _ => (),
+ }
+
+ // CSS line-break property handling
+ match self.options.strictness {
+ LineBreakStrictness::Normal => {
+ if self.is_break_by_normal(right_codepoint) {
+ return self.get_current_position();
+ }
+ }
+ LineBreakStrictness::Loose => {
+ if let Some(breakable) = is_break_utf32_by_loose(
+ right_codepoint.into(),
+ left_prop,
+ right_prop,
+ self.options.ja_zh,
+ ) {
+ if breakable {
+ return self.get_current_position();
+ }
+ continue;
+ }
+ }
+ LineBreakStrictness::Anywhere => {
+ return self.get_current_position();
+ }
+ _ => (),
+ };
+
+ // UAX14 doesn't have Thai etc, so use another way.
+ if self.options.word_option != LineBreakWordOption::BreakAll
+ && Y::use_complex_breaking(self, left_codepoint)
+ && Y::use_complex_breaking(self, right_codepoint)
+ {
+ let result = Y::handle_complex_language(self, left_codepoint);
+ if result.is_some() {
+ return result;
+ }
+ // I may have to fetch text until non-SA character?.
+ }
+
+ // If break_state is equals or grater than 0, it is alias of property.
+ let mut break_state = self.get_break_state_from_table(left_prop, right_prop);
+ if break_state >= 0_i8 {
+ let mut previous_iter = self.iter.clone();
+ let mut previous_pos_data = self.current_pos_data;
+
+ loop {
+ self.advance_iter();
+
+ let Some(prop) = self.get_current_linebreak_property() else {
+ // Reached EOF. But we are analyzing multiple characters now, so next break may be previous point.
+ let break_state = self
+ .get_break_state_from_table(break_state as u8, self.data.eot_property);
+ if break_state == NOT_MATCH_RULE {
+ self.iter = previous_iter;
+ self.current_pos_data = previous_pos_data;
+ return self.get_current_position();
+ }
+ // EOF
+ return Some(self.len);
+ };
+
+ break_state = self.get_break_state_from_table(break_state as u8, prop);
+ if break_state < 0 {
+ break;
+ }
+
+ previous_iter = self.iter.clone();
+ previous_pos_data = self.current_pos_data;
+ }
+ if break_state == KEEP_RULE {
+ continue;
+ }
+ if break_state == NOT_MATCH_RULE {
+ self.iter = previous_iter;
+ self.current_pos_data = previous_pos_data;
+ return self.get_current_position();
+ }
+ return self.get_current_position();
+ }
+
+ if self.is_break_from_table(left_prop, right_prop) {
+ return self.get_current_position();
+ }
+ }
+ }
+}
+
+enum StringBoundaryPosType {
+ Start,
+ Middle,
+ End,
+}
+
+impl<'l, 's, Y: LineBreakType<'l, 's>> LineBreakIterator<'l, 's, Y> {
+ fn advance_iter(&mut self) {
+ self.current_pos_data = self.iter.next();
+ }
+
+ fn is_eof(&self) -> bool {
+ self.current_pos_data.is_none()
+ }
+
+ #[inline]
+ fn check_eof(&mut self) -> StringBoundaryPosType {
+ if self.is_eof() {
+ self.advance_iter();
+ if self.is_eof() {
+ if self.len == 0 {
+ // Empty string. Since `self.current_pos_data` is always going to be empty,
+ // we never read `self.len` except for here, so we can use it to mark that
+ // we have already returned the single empty-string breakpoint.
+ self.len = 1;
+ StringBoundaryPosType::Start
+ } else {
+ StringBoundaryPosType::End
+ }
+ } else {
+ StringBoundaryPosType::Start
+ }
+ } else {
+ StringBoundaryPosType::Middle
+ }
+ }
+
+ fn get_current_position(&self) -> Option<usize> {
+ self.current_pos_data.map(|(pos, _)| pos)
+ }
+
+ fn get_current_codepoint(&self) -> Option<Y::CharType> {
+ self.current_pos_data.map(|(_, codepoint)| codepoint)
+ }
+
+ fn get_linebreak_property(&self, codepoint: Y::CharType) -> u8 {
+ Y::get_linebreak_property_with_rule(self, codepoint)
+ }
+
+ fn get_current_linebreak_property(&self) -> Option<u8> {
+ self.get_current_codepoint()
+ .map(|c| self.get_linebreak_property(c))
+ }
+
+ fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool {
+ is_break_utf32_by_normal(codepoint.into(), self.options.ja_zh)
+ }
+
+ fn get_break_state_from_table(&self, left: u8, right: u8) -> i8 {
+ get_break_state_from_table(
+ &self.data.break_state_table,
+ self.data.property_count,
+ left,
+ right,
+ )
+ }
+
+ fn is_break_from_table(&self, left: u8, right: u8) -> bool {
+ is_break_from_table(
+ &self.data.break_state_table,
+ self.data.property_count,
+ left,
+ right,
+ )
+ }
+}
+
+#[derive(Debug)]
+pub struct LineBreakTypeUtf8;
+
+impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeUtf8 {
+ type IterAttr = CharIndices<'s>;
+ type CharType = char;
+
+ fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
+ get_linebreak_property_with_rule(
+ &iterator.data.property_table,
+ c,
+ iterator.options.strictness,
+ iterator.options.word_option,
+ )
+ }
+
+ #[inline]
+ fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
+ use_complex_breaking_utf32(&iterator.data.property_table, c as u32)
+ }
+
+ fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize {
+ iterator.get_current_codepoint().map_or(0, |c| c.len_utf8())
+ }
+
+ fn handle_complex_language(
+ iter: &mut LineBreakIterator<'l, 's, Self>,
+ left_codepoint: char,
+ ) -> Option<usize> {
+ handle_complex_language_utf8(iter, left_codepoint)
+ }
+}
+
+#[derive(Debug)]
+pub struct LineBreakTypePotentiallyIllFormedUtf8;
+
+impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypePotentiallyIllFormedUtf8 {
+ type IterAttr = Utf8CharIndices<'s>;
+ type CharType = char;
+
+ fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
+ get_linebreak_property_with_rule(
+ &iterator.data.property_table,
+ c,
+ iterator.options.strictness,
+ iterator.options.word_option,
+ )
+ }
+
+ #[inline]
+ fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
+ use_complex_breaking_utf32(&iterator.data.property_table, c as u32)
+ }
+
+ fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize {
+ iterator.get_current_codepoint().map_or(0, |c| c.len_utf8())
+ }
+
+ fn handle_complex_language(
+ iter: &mut LineBreakIterator<'l, 's, Self>,
+ left_codepoint: char,
+ ) -> Option<usize> {
+ handle_complex_language_utf8(iter, left_codepoint)
+ }
+}
+/// handle_complex_language impl for UTF8 iterators
+fn handle_complex_language_utf8<'l, 's, T>(
+ iter: &mut LineBreakIterator<'l, 's, T>,
+ left_codepoint: char,
+) -> Option<usize>
+where
+ T: LineBreakType<'l, 's, CharType = char>,
+{
+ // word segmenter doesn't define break rules for some languages such as Thai.
+ let start_iter = iter.iter.clone();
+ let start_point = iter.current_pos_data;
+ let mut s = String::new();
+ s.push(left_codepoint);
+ loop {
+ debug_assert!(!iter.is_eof());
+ s.push(iter.get_current_codepoint()?);
+ iter.advance_iter();
+ if let Some(current_codepoint) = iter.get_current_codepoint() {
+ if !T::use_complex_breaking(iter, current_codepoint) {
+ break;
+ }
+ } else {
+ // EOF
+ break;
+ }
+ }
+
+ // Restore iterator to move to head of complex string
+ iter.iter = start_iter;
+ iter.current_pos_data = start_point;
+ let breaks = complex_language_segment_str(iter.complex, &s);
+ iter.result_cache = breaks;
+ let first_pos = *iter.result_cache.first()?;
+ let mut i = left_codepoint.len_utf8();
+ loop {
+ if i == first_pos {
+ // Re-calculate breaking offset
+ iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
+ return iter.get_current_position();
+ }
+ debug_assert!(
+ i < first_pos,
+ "we should always arrive at first_pos: near index {:?}",
+ iter.get_current_position()
+ );
+ i += T::get_current_position_character_len(iter);
+ iter.advance_iter();
+ if iter.is_eof() {
+ iter.result_cache.clear();
+ return Some(iter.len);
+ }
+ }
+}
+
+#[derive(Debug)]
+pub struct LineBreakTypeLatin1;
+
+impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeLatin1 {
+ type IterAttr = Latin1Indices<'s>;
+ type CharType = u8;
+
+ fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u8) -> u8 {
+ // No CJ on Latin1
+ get_linebreak_property_latin1(&iterator.data.property_table, c)
+ }
+
+ #[inline]
+ fn use_complex_breaking(_iterator: &LineBreakIterator<Self>, _c: u8) -> bool {
+ false
+ }
+
+ fn get_current_position_character_len(_: &LineBreakIterator<Self>) -> usize {
+ unreachable!()
+ }
+
+ fn handle_complex_language(
+ _: &mut LineBreakIterator<Self>,
+ _: Self::CharType,
+ ) -> Option<usize> {
+ unreachable!()
+ }
+}
+
+#[derive(Debug)]
+pub struct LineBreakTypeUtf16;
+
+impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeUtf16 {
+ type IterAttr = Utf16Indices<'s>;
+ type CharType = u32;
+
+ fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u32) -> u8 {
+ get_linebreak_property_utf32_with_rule(
+ &iterator.data.property_table,
+ c,
+ iterator.options.strictness,
+ iterator.options.word_option,
+ )
+ }
+
+ #[inline]
+ fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: u32) -> bool {
+ use_complex_breaking_utf32(&iterator.data.property_table, c)
+ }
+
+ fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize {
+ match iterator.get_current_codepoint() {
+ None => 0,
+ Some(ch) if ch >= 0x10000 => 2,
+ _ => 1,
+ }
+ }
+
+ fn handle_complex_language(
+ iterator: &mut LineBreakIterator<Self>,
+ left_codepoint: Self::CharType,
+ ) -> Option<usize> {
+ // word segmenter doesn't define break rules for some languages such as Thai.
+ let start_iter = iterator.iter.clone();
+ let start_point = iterator.current_pos_data;
+ let mut s = vec![left_codepoint as u16];
+ loop {
+ debug_assert!(!iterator.is_eof());
+ s.push(iterator.get_current_codepoint()? as u16);
+ iterator.advance_iter();
+ if let Some(current_codepoint) = iterator.get_current_codepoint() {
+ if !Self::use_complex_breaking(iterator, current_codepoint) {
+ break;
+ }
+ } else {
+ // EOF
+ break;
+ }
+ }
+
+ // Restore iterator to move to head of complex string
+ iterator.iter = start_iter;
+ iterator.current_pos_data = start_point;
+ let breaks = complex_language_segment_utf16(iterator.complex, &s);
+ iterator.result_cache = breaks;
+ // result_cache vector is utf-16 index that is in BMP.
+ let first_pos = *iterator.result_cache.first()?;
+ let mut i = 1;
+ loop {
+ if i == first_pos {
+ // Re-calculate breaking offset
+ iterator.result_cache = iterator
+ .result_cache
+ .iter()
+ .skip(1)
+ .map(|r| r - i)
+ .collect();
+ return iterator.get_current_position();
+ }
+ debug_assert!(
+ i < first_pos,
+ "we should always arrive at first_pos: near index {:?}",
+ iterator.get_current_position()
+ );
+ i += 1;
+ iterator.advance_iter();
+ if iterator.is_eof() {
+ iterator.result_cache.clear();
+ return Some(iterator.len);
+ }
+ }
+ }
+}
+
+#[cfg(test)]
+#[cfg(feature = "serde")]
+mod tests {
+ use super::*;
+ use crate::LineSegmenter;
+
+ #[test]
+ fn linebreak_property() {
+ let payload = DataProvider::<LineBreakDataV1Marker>::load(
+ &crate::provider::Baked,
+ Default::default(),
+ )
+ .expect("Loading should succeed!")
+ .take_payload()
+ .expect("Data should be present!");
+
+ let get_linebreak_property = |codepoint| {
+ get_linebreak_property_with_rule(
+ &payload.get().property_table,
+ codepoint,
+ LineBreakStrictness::Strict,
+ LineBreakWordOption::Normal,
+ )
+ };
+
+ assert_eq!(get_linebreak_property('\u{0020}'), SP);
+ assert_eq!(get_linebreak_property('\u{0022}'), QU);
+ assert_eq!(get_linebreak_property('('), OP_OP30);
+ assert_eq!(get_linebreak_property('\u{0030}'), NU);
+ assert_eq!(get_linebreak_property('['), OP_OP30);
+ assert_eq!(get_linebreak_property('\u{1f3fb}'), EM);
+ assert_eq!(get_linebreak_property('\u{20000}'), ID);
+ assert_eq!(get_linebreak_property('\u{e0020}'), CM);
+ assert_eq!(get_linebreak_property('\u{3041}'), CJ);
+ assert_eq!(get_linebreak_property('\u{0025}'), PO);
+ assert_eq!(get_linebreak_property('\u{00A7}'), AI);
+ assert_eq!(get_linebreak_property('\u{50005}'), XX);
+ assert_eq!(get_linebreak_property('\u{17D6}'), NS);
+ assert_eq!(get_linebreak_property('\u{2014}'), B2);
+ }
+
+ #[test]
+ #[allow(clippy::bool_assert_comparison)] // clearer when we're testing bools directly
+ fn break_rule() {
+ let payload = DataProvider::<LineBreakDataV1Marker>::load(
+ &crate::provider::Baked,
+ Default::default(),
+ )
+ .expect("Loading should succeed!")
+ .take_payload()
+ .expect("Data should be present!");
+ let lb_data: &RuleBreakDataV1 = payload.get();
+
+ let is_break = |left, right| {
+ is_break_from_table(
+ &lb_data.break_state_table,
+ lb_data.property_count,
+ left,
+ right,
+ )
+ };
+
+ // LB4
+ assert_eq!(is_break(BK, AL), true);
+ // LB5
+ assert_eq!(is_break(CR, LF), false);
+ assert_eq!(is_break(CR, AL), true);
+ assert_eq!(is_break(LF, AL), true);
+ assert_eq!(is_break(NL, AL), true);
+ // LB6
+ assert_eq!(is_break(AL, BK), false);
+ assert_eq!(is_break(AL, CR), false);
+ assert_eq!(is_break(AL, LF), false);
+ assert_eq!(is_break(AL, NL), false);
+ // LB7
+ assert_eq!(is_break(AL, SP), false);
+ assert_eq!(is_break(AL, ZW), false);
+ // LB8
+ // LB8a
+ assert_eq!(is_break(ZWJ, AL), false);
+ // LB9
+ assert_eq!(is_break(AL, ZWJ), false);
+ assert_eq!(is_break(AL, CM), false);
+ assert_eq!(is_break(ID, ZWJ), false);
+ // LB10
+ assert_eq!(is_break(ZWJ, SP), false);
+ assert_eq!(is_break(SP, CM), true);
+ // LB11
+ assert_eq!(is_break(AL, WJ), false);
+ assert_eq!(is_break(WJ, AL), false);
+ // LB12
+ assert_eq!(is_break(GL, AL), false);
+ // LB12a
+ assert_eq!(is_break(AL, GL), false);
+ assert_eq!(is_break(SP, GL), true);
+ // LB13
+ assert_eq!(is_break(AL, CL), false);
+ assert_eq!(is_break(AL, CP), false);
+ assert_eq!(is_break(AL, EX), false);
+ assert_eq!(is_break(AL, IS), false);
+ assert_eq!(is_break(AL, SY), false);
+ // LB18
+ assert_eq!(is_break(SP, AL), true);
+ // LB19
+ assert_eq!(is_break(AL, QU), false);
+ assert_eq!(is_break(QU, AL), false);
+ // LB20
+ assert_eq!(is_break(AL, CB), true);
+ assert_eq!(is_break(CB, AL), true);
+ // LB20
+ assert_eq!(is_break(AL, BA), false);
+ assert_eq!(is_break(AL, HY), false);
+ assert_eq!(is_break(AL, NS), false);
+ // LB21
+ assert_eq!(is_break(AL, BA), false);
+ assert_eq!(is_break(BB, AL), false);
+ assert_eq!(is_break(ID, BA), false);
+ assert_eq!(is_break(ID, NS), false);
+ // LB21a
+ // LB21b
+ assert_eq!(is_break(SY, HL), false);
+ // LB22
+ assert_eq!(is_break(AL, IN), false);
+ // LB 23
+ assert_eq!(is_break(AL, NU), false);
+ assert_eq!(is_break(HL, NU), false);
+ // LB 23a
+ assert_eq!(is_break(PR, ID), false);
+ assert_eq!(is_break(PR, EB), false);
+ assert_eq!(is_break(PR, EM), false);
+ assert_eq!(is_break(ID, PO), false);
+ assert_eq!(is_break(EB, PO), false);
+ assert_eq!(is_break(EM, PO), false);
+ // LB26
+ assert_eq!(is_break(JL, JL), false);
+ assert_eq!(is_break(JL, JV), false);
+ assert_eq!(is_break(JL, H2), false);
+ // LB27
+ assert_eq!(is_break(JL, IN), false);
+ assert_eq!(is_break(JL, PO), false);
+ assert_eq!(is_break(PR, JL), false);
+ // LB28
+ assert_eq!(is_break(AL, AL), false);
+ assert_eq!(is_break(HL, AL), false);
+ // LB29
+ assert_eq!(is_break(IS, AL), false);
+ assert_eq!(is_break(IS, HL), false);
+ // LB30b
+ assert_eq!(is_break(EB, EM), false);
+ // LB31
+ assert_eq!(is_break(ID, ID), true);
+ }
+
+ #[test]
+ fn linebreak() {
+ let segmenter = LineSegmenter::try_new_dictionary_unstable(&crate::provider::Baked)
+ .expect("Data exists");
+
+ let mut iter = segmenter.segment_str("hello world");
+ assert_eq!(Some(0), iter.next());
+ assert_eq!(Some(6), iter.next());
+ assert_eq!(Some(11), iter.next());
+ assert_eq!(None, iter.next());
+
+ iter = segmenter.segment_str("$10 $10");
+ assert_eq!(Some(0), iter.next());
+ assert_eq!(Some(4), iter.next());
+ assert_eq!(Some(7), iter.next());
+ assert_eq!(None, iter.next());
+
+ // LB10
+
+ // LB14
+ iter = segmenter.segment_str("[ abc def");
+ assert_eq!(Some(0), iter.next());
+ assert_eq!(Some(7), iter.next());
+ assert_eq!(Some(10), iter.next());
+ assert_eq!(None, iter.next());
+
+ let input: [u8; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
+ let mut iter_u8 = segmenter.segment_latin1(&input);
+ assert_eq!(Some(0), iter_u8.next());
+ assert_eq!(Some(7), iter_u8.next());
+ assert_eq!(Some(10), iter_u8.next());
+ assert_eq!(None, iter_u8.next());
+
+ let input: [u16; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
+ let mut iter_u16 = segmenter.segment_utf16(&input);
+ assert_eq!(Some(0), iter_u16.next());
+ assert_eq!(Some(7), iter_u16.next());
+ assert_eq!(Some(10), iter_u16.next());
+ assert_eq!(None, iter_u16.next());
+
+ // LB15
+ iter = segmenter.segment_str("abc\u{0022} (def");
+ assert_eq!(Some(0), iter.next());
+ assert_eq!(Some(10), iter.next());
+ assert_eq!(None, iter.next());
+
+ let input: [u8; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
+ let mut iter_u8 = segmenter.segment_latin1(&input);
+ assert_eq!(Some(0), iter_u8.next());
+ assert_eq!(Some(10), iter_u8.next());
+ assert_eq!(None, iter_u8.next());
+
+ let input: [u16; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
+ let mut iter_u16 = segmenter.segment_utf16(&input);
+ assert_eq!(Some(0), iter_u16.next());
+ assert_eq!(Some(10), iter_u16.next());
+ assert_eq!(None, iter_u16.next());
+
+ // LB16
+ iter = segmenter.segment_str("\u{0029}\u{203C}");
+ assert_eq!(Some(0), iter.next());
+ assert_eq!(Some(4), iter.next());
+ assert_eq!(None, iter.next());
+ iter = segmenter.segment_str("\u{0029} \u{203C}");
+ assert_eq!(Some(0), iter.next());
+ assert_eq!(Some(6), iter.next());
+ assert_eq!(None, iter.next());
+
+ let input: [u16; 4] = [0x29, 0x20, 0x20, 0x203c];
+ let mut iter_u16 = segmenter.segment_utf16(&input);
+ assert_eq!(Some(0), iter_u16.next());
+ assert_eq!(Some(4), iter_u16.next());
+ assert_eq!(None, iter_u16.next());
+
+ // LB17
+ iter = segmenter.segment_str("\u{2014}\u{2014}aa");
+ assert_eq!(Some(0), iter.next());
+ assert_eq!(Some(6), iter.next());
+ assert_eq!(Some(8), iter.next());
+ assert_eq!(None, iter.next());
+ iter = segmenter.segment_str("\u{2014} \u{2014}aa");
+ assert_eq!(Some(0), iter.next());
+ assert_eq!(Some(8), iter.next());
+ assert_eq!(Some(10), iter.next());
+ assert_eq!(None, iter.next());
+
+ iter = segmenter.segment_str("\u{2014}\u{2014} \u{2014}\u{2014}123 abc");
+ assert_eq!(Some(0), iter.next());
+ assert_eq!(Some(14), iter.next());
+ assert_eq!(Some(18), iter.next());
+ assert_eq!(Some(21), iter.next());
+ assert_eq!(None, iter.next());
+
+ // LB25
+ let mut iter = segmenter.segment_str("(0,1)+(2,3)");
+ assert_eq!(Some(0), iter.next());
+ assert_eq!(Some(11), iter.next());
+ assert_eq!(None, iter.next());
+ let input: [u16; 11] = [
+ 0x28, 0x30, 0x2C, 0x31, 0x29, 0x2B, 0x28, 0x32, 0x2C, 0x33, 0x29,
+ ];
+ let mut iter_u16 = segmenter.segment_utf16(&input);
+ assert_eq!(Some(0), iter_u16.next());
+ assert_eq!(Some(11), iter_u16.next());
+ assert_eq!(None, iter_u16.next());
+
+ let input: [u16; 13] = [
+ 0x2014, 0x2014, 0x20, 0x20, 0x2014, 0x2014, 0x31, 0x32, 0x33, 0x20, 0x61, 0x62, 0x63,
+ ];
+ let mut iter_u16 = segmenter.segment_utf16(&input);
+ assert_eq!(Some(0), iter_u16.next());
+ assert_eq!(Some(6), iter_u16.next());
+ assert_eq!(Some(10), iter_u16.next());
+ assert_eq!(Some(13), iter_u16.next());
+ assert_eq!(None, iter_u16.next());
+
+ iter = segmenter.segment_str("\u{1F3FB} \u{1F3FB}");
+ assert_eq!(Some(0), iter.next());
+ assert_eq!(Some(5), iter.next());
+ assert_eq!(Some(9), iter.next());
+ assert_eq!(None, iter.next());
+ }
+
+ #[test]
+ #[cfg(feature = "lstm")]
+ fn thai_line_break() {
+ const TEST_STR: &str = "ภาษาไทยภาษาไทย";
+
+ let segmenter = LineSegmenter::new_lstm();
+ let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
+ assert_eq!(breaks, [0, 12, 21, 33, TEST_STR.len()], "Thai test");
+
+ let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
+ let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
+ assert_eq!(breaks, [0, 4, 7, 11, utf16.len()], "Thai test");
+
+ let utf16: [u16; 4] = [0x0e20, 0x0e32, 0x0e29, 0x0e32];
+ let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
+ assert_eq!(breaks, [0, 4], "Thai test");
+ }
+
+ #[test]
+ #[cfg(feature = "lstm")]
+ fn burmese_line_break() {
+ // "Burmese Language" in Burmese
+ const TEST_STR: &str = "မြန်မာဘာသာစကား";
+
+ let segmenter = LineSegmenter::new_lstm();
+ let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
+ // LSTM model breaks more characters, but it is better to return [30].
+ assert_eq!(breaks, [0, 12, 18, 30, TEST_STR.len()], "Burmese test");
+
+ let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
+ let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
+ // LSTM model breaks more characters, but it is better to return [10].
+ assert_eq!(breaks, [0, 4, 6, 10, utf16.len()], "Burmese utf-16 test");
+ }
+
+ #[test]
+ #[cfg(feature = "lstm")]
+ fn khmer_line_break() {
+ const TEST_STR: &str = "សេចក្ដីប្រកាសជាសកលស្ដីពីសិទ្ធិមនុស្ស";
+
+ let segmenter = LineSegmenter::new_lstm();
+ let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
+ // Note: This small sample matches the ICU dictionary segmenter
+ assert_eq!(breaks, [0, 39, 48, 54, 72, TEST_STR.len()], "Khmer test");
+
+ let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
+ let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
+ assert_eq!(
+ breaks,
+ [0, 13, 16, 18, 24, utf16.len()],
+ "Khmer utf-16 test"
+ );
+ }
+
+ #[test]
+ #[cfg(feature = "lstm")]
+ fn lao_line_break() {
+ const TEST_STR: &str = "ກ່ຽວກັບສິດຂອງມະນຸດ";
+
+ let segmenter = LineSegmenter::new_lstm();
+ let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
+ // Note: LSTM finds a break at '12' that the dictionary does not find
+ assert_eq!(breaks, [0, 12, 21, 30, 39, TEST_STR.len()], "Lao test");
+
+ let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
+ let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
+ assert_eq!(breaks, [0, 4, 7, 10, 13, utf16.len()], "Lao utf-16 test");
+ }
+
+ #[test]
+ fn empty_string() {
+ let segmenter = LineSegmenter::new_auto();
+ let breaks: Vec<usize> = segmenter.segment_str("").collect();
+ assert_eq!(breaks, [0]);
+ }
+}