diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
commit | 26a029d407be480d791972afb5975cf62c9360a6 (patch) | |
tree | f435a8308119effd964b339f76abb83a57c29483 /third_party/rust/icu_segmenter/src | |
parent | Initial commit. (diff) | |
download | firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz firefox-26a029d407be480d791972afb5975cf62c9360a6.zip |
Adding upstream version 124.0.1.upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/icu_segmenter/src')
17 files changed, 5946 insertions, 0 deletions
diff --git a/third_party/rust/icu_segmenter/src/complex/dictionary.rs b/third_party/rust/icu_segmenter/src/complex/dictionary.rs new file mode 100644 index 0000000000..90360ee2b0 --- /dev/null +++ b/third_party/rust/icu_segmenter/src/complex/dictionary.rs @@ -0,0 +1,268 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::grapheme::*; +use crate::indices::Utf16Indices; +use crate::provider::*; +use core::str::CharIndices; +use icu_collections::char16trie::{Char16Trie, TrieResult}; + +/// A trait for dictionary based iterator +trait DictionaryType<'l, 's> { + /// The iterator over characters. + type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone; + + /// The character type. + type CharType: Copy + Into<u32>; + + fn to_char(c: Self::CharType) -> char; + fn char_len(c: Self::CharType) -> usize; +} + +struct DictionaryBreakIterator< + 'l, + 's, + Y: DictionaryType<'l, 's> + ?Sized, + X: Iterator<Item = usize> + ?Sized, +> { + trie: Char16Trie<'l>, + iter: Y::IterAttr, + len: usize, + grapheme_iter: X, + // TODO transform value for byte trie +} + +/// Implement the [`Iterator`] trait over the segmenter break opportunities of the given string. +/// Please see the [module-level documentation](crate) for its usages. 
+/// +/// Lifetimes: +/// - `'l` = lifetime of the segmenter object from which this iterator was created +/// - `'s` = lifetime of the string being segmented +/// +/// [`Iterator`]: core::iter::Iterator +impl<'l, 's, Y: DictionaryType<'l, 's> + ?Sized, X: Iterator<Item = usize> + ?Sized> Iterator + for DictionaryBreakIterator<'l, 's, Y, X> +{ + type Item = usize; + + fn next(&mut self) -> Option<Self::Item> { + let mut trie_iter = self.trie.iter(); + let mut intermediate_length = 0; + let mut not_match = false; + let mut previous_match = None; + let mut last_grapheme_offset = 0; + + while let Some(next) = self.iter.next() { + let ch = Y::to_char(next.1); + match trie_iter.next(ch) { + TrieResult::FinalValue(_) => { + return Some(next.0 + Y::char_len(next.1)); + } + TrieResult::Intermediate(_) => { + // Dictionary has to match with grapheme cluster segment. + // If not, we ignore it. + while last_grapheme_offset < next.0 + Y::char_len(next.1) { + if let Some(offset) = self.grapheme_iter.next() { + last_grapheme_offset = offset; + continue; + } + last_grapheme_offset = self.len; + break; + } + if last_grapheme_offset != next.0 + Y::char_len(next.1) { + continue; + } + + intermediate_length = next.0 + Y::char_len(next.1); + previous_match = Some(self.iter.clone()); + } + TrieResult::NoMatch => { + if intermediate_length > 0 { + if let Some(previous_match) = previous_match { + // Rewind previous match point + self.iter = previous_match; + } + return Some(intermediate_length); + } + // Not found + return Some(next.0 + Y::char_len(next.1)); + } + TrieResult::NoValue => { + // Prefix string is matched + not_match = true; + } + } + } + + if intermediate_length > 0 { + Some(intermediate_length) + } else if not_match { + // no match by scanning text + Some(self.len) + } else { + None + } + } +} + +impl<'l, 's> DictionaryType<'l, 's> for u32 { + type IterAttr = Utf16Indices<'s>; + type CharType = u32; + + fn to_char(c: u32) -> char { + 
char::from_u32(c).unwrap_or(char::REPLACEMENT_CHARACTER) + } + + fn char_len(c: u32) -> usize { + if c >= 0x10000 { + 2 + } else { + 1 + } + } +} + +impl<'l, 's> DictionaryType<'l, 's> for char { + type IterAttr = CharIndices<'s>; + type CharType = char; + + fn to_char(c: char) -> char { + c + } + + fn char_len(c: char) -> usize { + c.len_utf8() + } +} + +pub(super) struct DictionarySegmenter<'l> { + dict: &'l UCharDictionaryBreakDataV1<'l>, + grapheme: &'l RuleBreakDataV1<'l>, +} + +impl<'l> DictionarySegmenter<'l> { + pub(super) fn new( + dict: &'l UCharDictionaryBreakDataV1<'l>, + grapheme: &'l RuleBreakDataV1<'l>, + ) -> Self { + // TODO: no way to verify trie data + Self { dict, grapheme } + } + + /// Create a dictionary based break iterator for an `str` (a UTF-8 string). + pub(super) fn segment_str(&'l self, input: &'l str) -> impl Iterator<Item = usize> + 'l { + let grapheme_iter = GraphemeClusterSegmenter::new_and_segment_str(input, self.grapheme); + DictionaryBreakIterator::<char, GraphemeClusterBreakIteratorUtf8> { + trie: Char16Trie::new(self.dict.trie_data.clone()), + iter: input.char_indices(), + len: input.len(), + grapheme_iter, + } + } + + /// Create a dictionary based break iterator for a UTF-16 string. 
+ pub(super) fn segment_utf16(&'l self, input: &'l [u16]) -> impl Iterator<Item = usize> + 'l { + let grapheme_iter = GraphemeClusterSegmenter::new_and_segment_utf16(input, self.grapheme); + DictionaryBreakIterator::<u32, GraphemeClusterBreakIteratorUtf16> { + trie: Char16Trie::new(self.dict.trie_data.clone()), + iter: Utf16Indices::new(input), + len: input.len(), + grapheme_iter, + } + } +} + +#[cfg(test)] +#[cfg(feature = "serde")] +mod tests { + use super::*; + use crate::{LineSegmenter, WordSegmenter}; + use icu_provider::prelude::*; + + #[test] + fn burmese_dictionary_test() { + let segmenter = LineSegmenter::new_dictionary(); + // From css/css-text/word-break/word-break-normal-my-000.html + let s = "မြန်မာစာမြန်မာစာမြန်မာစာ"; + let result: Vec<usize> = segmenter.segment_str(s).collect(); + assert_eq!(result, vec![0, 18, 24, 42, 48, 66, 72]); + + let s_utf16: Vec<u16> = s.encode_utf16().collect(); + let result: Vec<usize> = segmenter.segment_utf16(&s_utf16).collect(); + assert_eq!(result, vec![0, 6, 8, 14, 16, 22, 24]); + } + + #[test] + fn cj_dictionary_test() { + let dict_payload: DataPayload<DictionaryForWordOnlyAutoV1Marker> = crate::provider::Baked + .load(DataRequest { + locale: &icu_locid::locale!("ja").into(), + metadata: Default::default(), + }) + .unwrap() + .take_payload() + .unwrap(); + let word_segmenter = WordSegmenter::new_dictionary(); + let dict_segmenter = DictionarySegmenter::new( + dict_payload.get(), + crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1, + ); + + // Match case + let s = "龟山岛龟山岛"; + let result: Vec<usize> = dict_segmenter.segment_str(s).collect(); + assert_eq!(result, vec![9, 18]); + + let result: Vec<usize> = word_segmenter.segment_str(s).collect(); + assert_eq!(result, vec![0, 9, 18]); + + let s_utf16: Vec<u16> = s.encode_utf16().collect(); + let result: Vec<usize> = dict_segmenter.segment_utf16(&s_utf16).collect(); + assert_eq!(result, vec![3, 6]); + + let result: Vec<usize> = 
word_segmenter.segment_utf16(&s_utf16).collect(); + assert_eq!(result, vec![0, 3, 6]); + + // Match case, then no match case + let s = "エディターエディ"; + let result: Vec<usize> = dict_segmenter.segment_str(s).collect(); + assert_eq!(result, vec![15, 24]); + + // TODO(#3236): Why is WordSegmenter not returning the middle segment? + let result: Vec<usize> = word_segmenter.segment_str(s).collect(); + assert_eq!(result, vec![0, 24]); + + let s_utf16: Vec<u16> = s.encode_utf16().collect(); + let result: Vec<usize> = dict_segmenter.segment_utf16(&s_utf16).collect(); + assert_eq!(result, vec![5, 8]); + + // TODO(#3236): Why is WordSegmenter not returning the middle segment? + let result: Vec<usize> = word_segmenter.segment_utf16(&s_utf16).collect(); + assert_eq!(result, vec![0, 8]); + } + + #[test] + fn khmer_dictionary_test() { + let segmenter = LineSegmenter::new_dictionary(); + let s = "ភាសាខ្មែរភាសាខ្មែរភាសាខ្មែរ"; + let result: Vec<usize> = segmenter.segment_str(s).collect(); + assert_eq!(result, vec![0, 27, 54, 81]); + + let s_utf16: Vec<u16> = s.encode_utf16().collect(); + let result: Vec<usize> = segmenter.segment_utf16(&s_utf16).collect(); + assert_eq!(result, vec![0, 9, 18, 27]); + } + + #[test] + fn lao_dictionary_test() { + let segmenter = LineSegmenter::new_dictionary(); + let s = "ພາສາລາວພາສາລາວພາສາລາວ"; + let r: Vec<usize> = segmenter.segment_str(s).collect(); + assert_eq!(r, vec![0, 12, 21, 33, 42, 54, 63]); + + let s_utf16: Vec<u16> = s.encode_utf16().collect(); + let r: Vec<usize> = segmenter.segment_utf16(&s_utf16).collect(); + assert_eq!(r, vec![0, 4, 7, 11, 14, 18, 21]); + } +} diff --git a/third_party/rust/icu_segmenter/src/complex/language.rs b/third_party/rust/icu_segmenter/src/complex/language.rs new file mode 100644 index 0000000000..327eea5e20 --- /dev/null +++ b/third_party/rust/icu_segmenter/src/complex/language.rs @@ -0,0 +1,161 @@ +// This file is part of ICU4X. 
For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +#[derive(PartialEq, Debug, Copy, Clone)] +pub(super) enum Language { + Burmese, + ChineseOrJapanese, + Khmer, + Lao, + Thai, + Unknown, +} + +// TODO: Use data provider +fn get_language(codepoint: u32) -> Language { + match codepoint { + 0xe01..=0xe7f => Language::Thai, + 0x0E80..=0x0EFF => Language::Lao, + 0x1000..=0x109f => Language::Burmese, + 0x1780..=0x17FF => Language::Khmer, + 0x19E0..=0x19FF => Language::Khmer, + 0x2E80..=0x2EFF => Language::ChineseOrJapanese, + 0x2F00..=0x2FDF => Language::ChineseOrJapanese, + 0x3040..=0x30FF => Language::ChineseOrJapanese, + 0x31F0..=0x31FF => Language::ChineseOrJapanese, + 0x32D0..=0x32FE => Language::ChineseOrJapanese, + 0x3400..=0x4DBF => Language::ChineseOrJapanese, + 0x4E00..=0x9FFF => Language::ChineseOrJapanese, + 0xa9e0..=0xa9ff => Language::Burmese, + 0xaa60..=0xaa7f => Language::Burmese, + 0xF900..=0xFAFF => Language::ChineseOrJapanese, + 0xFF66..=0xFF9D => Language::ChineseOrJapanese, + 0x16FE2..=0x16FE3 => Language::ChineseOrJapanese, + 0x16FF0..=0x16FF1 => Language::ChineseOrJapanese, + 0x1AFF0..=0x1B16F => Language::ChineseOrJapanese, + 0x1F200 => Language::ChineseOrJapanese, + 0x20000..=0x2FA1F => Language::ChineseOrJapanese, + 0x30000..=0x3134F => Language::ChineseOrJapanese, + _ => Language::Unknown, + } +} + +/// This struct is an iterator that returns the string per language from the +/// given string. 
+pub(super) struct LanguageIterator<'s> { + rest: &'s str, +} + +impl<'s> LanguageIterator<'s> { + pub(super) fn new(input: &'s str) -> Self { + Self { rest: input } + } +} + +impl<'s> Iterator for LanguageIterator<'s> { + type Item = (&'s str, Language); + + fn next(&mut self) -> Option<Self::Item> { + let mut indices = self.rest.char_indices(); + let lang = get_language(indices.next()?.1 as u32); + match indices.find(|&(_, ch)| get_language(ch as u32) != lang) { + Some((i, _)) => { + let (result, rest) = self.rest.split_at(i); + self.rest = rest; + Some((result, lang)) + } + None => Some((core::mem::take(&mut self.rest), lang)), + } + } +} + +pub(super) struct LanguageIteratorUtf16<'s> { + rest: &'s [u16], +} + +impl<'s> LanguageIteratorUtf16<'s> { + pub(super) fn new(input: &'s [u16]) -> Self { + Self { rest: input } + } +} + +impl<'s> Iterator for LanguageIteratorUtf16<'s> { + type Item = (&'s [u16], Language); + + fn next(&mut self) -> Option<Self::Item> { + let lang = get_language(*self.rest.first()? 
as u32); + match self + .rest + .iter() + .position(|&ch| get_language(ch as u32) != lang) + { + Some(i) => { + let (result, rest) = self.rest.split_at(i); + self.rest = rest; + Some((result, lang)) + } + None => Some((core::mem::take(&mut self.rest), lang)), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_thai_only() { + let s = "ภาษาไทยภาษาไทย"; + let utf16: Vec<u16> = s.encode_utf16().collect(); + let mut iter = LanguageIteratorUtf16::new(&utf16); + assert_eq!( + iter.next(), + Some((utf16.as_slice(), Language::Thai)), + "Thai language only with UTF-16" + ); + let mut iter = LanguageIterator::new(s); + assert_eq!( + iter.next(), + Some((s, Language::Thai)), + "Thai language only with UTF-8" + ); + assert_eq!(iter.next(), None, "Iterator for UTF-8 is finished"); + } + + #[test] + fn test_combine() { + const TEST_STR_THAI: &str = "ภาษาไทยภาษาไทย"; + const TEST_STR_BURMESE: &str = "ဗမာနွယ်ဘာသာစကားမျာ"; + let s = format!("{TEST_STR_THAI}{TEST_STR_BURMESE}"); + let utf16: Vec<u16> = s.encode_utf16().collect(); + let thai_utf16: Vec<u16> = TEST_STR_THAI.encode_utf16().collect(); + let burmese_utf16: Vec<u16> = TEST_STR_BURMESE.encode_utf16().collect(); + + let mut iter = LanguageIteratorUtf16::new(&utf16); + assert_eq!( + iter.next(), + Some((thai_utf16.as_slice(), Language::Thai)), + "Thai language with UTF-16 at first" + ); + assert_eq!( + iter.next(), + Some((burmese_utf16.as_slice(), Language::Burmese)), + "Burmese language with UTF-16 at second" + ); + assert_eq!(iter.next(), None, "Iterator for UTF-16 is finished"); + + let mut iter = LanguageIterator::new(&s); + assert_eq!( + iter.next(), + Some((TEST_STR_THAI, Language::Thai)), + "Thai language with UTF-8 at first" + ); + assert_eq!( + iter.next(), + Some((TEST_STR_BURMESE, Language::Burmese)), + "Burmese language with UTF-8 at second" + ); + assert_eq!(iter.next(), None, "Iterator for UTF-8 is finished"); + } +} diff --git 
a/third_party/rust/icu_segmenter/src/complex/lstm/matrix.rs b/third_party/rust/icu_segmenter/src/complex/lstm/matrix.rs new file mode 100644 index 0000000000..3cf5ce2e3c --- /dev/null +++ b/third_party/rust/icu_segmenter/src/complex/lstm/matrix.rs @@ -0,0 +1,540 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use alloc::vec; +use alloc::vec::Vec; +use core::ops::Range; +#[allow(unused_imports)] +use core_maths::*; +use zerovec::ule::AsULE; +use zerovec::ZeroSlice; + +/// A `D`-dimensional, heap-allocated matrix. +/// +/// This matrix implementation supports slicing matrices into tightly-packed +/// submatrices. For example, indexing into a matrix of size 5x4x3 returns a +/// matrix of size 4x3. For more information, see [`MatrixOwned::submatrix`]. +#[derive(Debug, Clone)] +pub(super) struct MatrixOwned<const D: usize> { + data: Vec<f32>, + dims: [usize; D], +} + +impl<const D: usize> MatrixOwned<D> { + pub(super) fn as_borrowed(&self) -> MatrixBorrowed<D> { + MatrixBorrowed { + data: &self.data, + dims: self.dims, + } + } + + pub(super) fn new_zero(dims: [usize; D]) -> Self { + let total_len = dims.iter().product::<usize>(); + MatrixOwned { + data: vec![0.0; total_len], + dims, + } + } + + /// Returns the tighly packed submatrix at _index_, or `None` if _index_ is out of range. + /// + /// For example, if the matrix is 5x4x3, this function returns a matrix sized 4x3. If the + /// matrix is 4x3, then this function returns a linear matrix of length 3. + /// + /// The type parameter `M` should be `D - 1`. + #[inline] + pub(super) fn submatrix<const M: usize>(&self, index: usize) -> Option<MatrixBorrowed<M>> { + // This assertion is based on const generics; it should always succeed and be elided. 
+ assert_eq!(M, D - 1); + let (range, dims) = self.as_borrowed().submatrix_range(index); + let data = &self.data.get(range)?; + Some(MatrixBorrowed { data, dims }) + } + + pub(super) fn as_mut(&mut self) -> MatrixBorrowedMut<D> { + MatrixBorrowedMut { + data: &mut self.data, + dims: self.dims, + } + } + + /// A mutable version of [`Self::submatrix`]. + #[inline] + pub(super) fn submatrix_mut<const M: usize>( + &mut self, + index: usize, + ) -> Option<MatrixBorrowedMut<M>> { + // This assertion is based on const generics; it should always succeed and be elided. + assert_eq!(M, D - 1); + let (range, dims) = self.as_borrowed().submatrix_range(index); + let data = self.data.get_mut(range)?; + Some(MatrixBorrowedMut { data, dims }) + } +} + +/// A `D`-dimensional, borrowed matrix. +#[derive(Debug, Clone, Copy)] +pub(super) struct MatrixBorrowed<'a, const D: usize> { + data: &'a [f32], + dims: [usize; D], +} + +impl<'a, const D: usize> MatrixBorrowed<'a, D> { + #[cfg(debug_assertions)] + pub(super) fn debug_assert_dims(&self, dims: [usize; D]) { + debug_assert_eq!(dims, self.dims); + let expected_len = dims.iter().product::<usize>(); + debug_assert_eq!(expected_len, self.data.len()); + } + + pub(super) fn as_slice(&self) -> &'a [f32] { + self.data + } + + /// See [`MatrixOwned::submatrix`]. + #[inline] + pub(super) fn submatrix<const M: usize>(&self, index: usize) -> Option<MatrixBorrowed<'a, M>> { + // This assertion is based on const generics; it should always succeed and be elided. + assert_eq!(M, D - 1); + let (range, dims) = self.submatrix_range(index); + let data = &self.data.get(range)?; + Some(MatrixBorrowed { data, dims }) + } + + #[inline] + fn submatrix_range<const M: usize>(&self, index: usize) -> (Range<usize>, [usize; M]) { + // This assertion is based on const generics; it should always succeed and be elided. 
+ assert_eq!(M, D - 1); + // The above assertion guarantees that the following line will succeed + #[allow(clippy::indexing_slicing, clippy::unwrap_used)] + let sub_dims: [usize; M] = self.dims[1..].try_into().unwrap(); + let n = sub_dims.iter().product::<usize>(); + (n * index..n * (index + 1), sub_dims) + } +} + +macro_rules! impl_basic_dim { + ($t1:path, $t2:path, $t3:path) => { + impl<'a> $t1 { + #[allow(dead_code)] + pub(super) fn dim(&self) -> usize { + let [dim] = self.dims; + dim + } + } + impl<'a> $t2 { + #[allow(dead_code)] + pub(super) fn dim(&self) -> (usize, usize) { + let [d0, d1] = self.dims; + (d0, d1) + } + } + impl<'a> $t3 { + #[allow(dead_code)] + pub(super) fn dim(&self) -> (usize, usize, usize) { + let [d0, d1, d2] = self.dims; + (d0, d1, d2) + } + } + }; +} + +impl_basic_dim!(MatrixOwned<1>, MatrixOwned<2>, MatrixOwned<3>); +impl_basic_dim!( + MatrixBorrowed<'a, 1>, + MatrixBorrowed<'a, 2>, + MatrixBorrowed<'a, 3> +); +impl_basic_dim!( + MatrixBorrowedMut<'a, 1>, + MatrixBorrowedMut<'a, 2>, + MatrixBorrowedMut<'a, 3> +); +impl_basic_dim!(MatrixZero<'a, 1>, MatrixZero<'a, 2>, MatrixZero<'a, 3>); + +/// A `D`-dimensional, mutably borrowed matrix. 
+pub(super) struct MatrixBorrowedMut<'a, const D: usize> { + pub(super) data: &'a mut [f32], + pub(super) dims: [usize; D], +} + +impl<'a, const D: usize> MatrixBorrowedMut<'a, D> { + pub(super) fn as_borrowed(&self) -> MatrixBorrowed<D> { + MatrixBorrowed { + data: self.data, + dims: self.dims, + } + } + + pub(super) fn as_mut_slice(&mut self) -> &mut [f32] { + self.data + } + + pub(super) fn copy_submatrix<const M: usize>(&mut self, from: usize, to: usize) { + let (range_from, _) = self.as_borrowed().submatrix_range::<M>(from); + let (range_to, _) = self.as_borrowed().submatrix_range::<M>(to); + if let (Some(_), Some(_)) = ( + self.data.get(range_from.clone()), + self.data.get(range_to.clone()), + ) { + // This function is panicky, but we just validated the ranges + self.data.copy_within(range_from, range_to.start); + } + } + + #[must_use] + pub(super) fn add(&mut self, other: MatrixZero<'_, D>) -> Option<()> { + debug_assert_eq!(self.dims, other.dims); + // TODO: Vectorize? + for i in 0..self.data.len() { + *self.data.get_mut(i)? += other.data.get(i)?; + } + Some(()) + } + + #[allow(dead_code)] // maybe needed for more complicated bies calculations + /// Mutates this matrix by applying a softmax transformation. 
+ pub(super) fn softmax_transform(&mut self) { + for v in self.data.iter_mut() { + *v = v.exp(); + } + let sm = 1.0 / self.data.iter().sum::<f32>(); + for v in self.data.iter_mut() { + *v *= sm; + } + } + + pub(super) fn sigmoid_transform(&mut self) { + for x in &mut self.data.iter_mut() { + *x = 1.0 / (1.0 + (-*x).exp()); + } + } + + pub(super) fn tanh_transform(&mut self) { + for x in &mut self.data.iter_mut() { + *x = x.tanh(); + } + } + + pub(super) fn convolve( + &mut self, + i: MatrixBorrowed<'_, D>, + c: MatrixBorrowed<'_, D>, + f: MatrixBorrowed<'_, D>, + ) { + let i = i.as_slice(); + let c = c.as_slice(); + let f = f.as_slice(); + let len = self.data.len(); + if len != i.len() || len != c.len() || len != f.len() { + debug_assert!(false, "LSTM matrices not the correct dimensions"); + return; + } + for idx in 0..len { + // Safety: The lengths are all the same (checked above) + unsafe { + *self.data.get_unchecked_mut(idx) = i.get_unchecked(idx) * c.get_unchecked(idx) + + self.data.get_unchecked(idx) * f.get_unchecked(idx) + } + } + } + + pub(super) fn mul_tanh(&mut self, o: MatrixBorrowed<'_, D>, c: MatrixBorrowed<'_, D>) { + let o = o.as_slice(); + let c = c.as_slice(); + let len = self.data.len(); + if len != o.len() || len != c.len() { + debug_assert!(false, "LSTM matrices not the correct dimensions"); + return; + } + for idx in 0..len { + // Safety: The lengths are all the same (checked above) + unsafe { + *self.data.get_unchecked_mut(idx) = + o.get_unchecked(idx) * c.get_unchecked(idx).tanh(); + } + } + } +} + +impl<'a> MatrixBorrowed<'a, 1> { + #[allow(dead_code)] // could be useful + pub(super) fn dot_1d(&self, other: MatrixZero<1>) -> f32 { + debug_assert_eq!(self.dims, other.dims); + unrolled_dot_1(self.data, other.data) + } +} + +impl<'a> MatrixBorrowedMut<'a, 1> { + /// Calculate the dot product of a and b, adding the result to self. 
+ /// + /// Note: For better dot product efficiency, if `b` is MxN, then `a` should be N; + /// this is the opposite of standard practice. + pub(super) fn add_dot_2d(&mut self, a: MatrixBorrowed<1>, b: MatrixZero<2>) { + let m = a.dim(); + let n = self.as_borrowed().dim(); + debug_assert_eq!( + m, + b.dim().1, + "dims: {:?}/{:?}/{:?}", + self.as_borrowed().dim(), + a.dim(), + b.dim() + ); + debug_assert_eq!( + n, + b.dim().0, + "dims: {:?}/{:?}/{:?}", + self.as_borrowed().dim(), + a.dim(), + b.dim() + ); + for i in 0..n { + if let (Some(dest), Some(b_sub)) = (self.as_mut_slice().get_mut(i), b.submatrix::<1>(i)) + { + *dest += unrolled_dot_1(a.data, b_sub.data); + } else { + debug_assert!(false, "unreachable: dims checked above"); + } + } + } +} + +impl<'a> MatrixBorrowedMut<'a, 2> { + /// Calculate the dot product of a and b, adding the result to self. + /// + /// Self should be _MxN_; `a`, _O_; and `b`, _MxNxO_. + pub(super) fn add_dot_3d_1(&mut self, a: MatrixBorrowed<1>, b: MatrixZero<3>) { + let m = a.dim(); + let n = self.as_borrowed().dim().0 * self.as_borrowed().dim().1; + debug_assert_eq!( + m, + b.dim().2, + "dims: {:?}/{:?}/{:?}", + self.as_borrowed().dim(), + a.dim(), + b.dim() + ); + debug_assert_eq!( + n, + b.dim().0 * b.dim().1, + "dims: {:?}/{:?}/{:?}", + self.as_borrowed().dim(), + a.dim(), + b.dim() + ); + // Note: The following two loops are equivalent, but the second has more opportunity for + // vectorization since it allows the vectorization to span submatrices. + // for i in 0..b.dim().0 { + // self.submatrix_mut::<1>(i).add_dot_2d(a, b.submatrix(i)); + // } + let lhs = a.as_slice(); + for i in 0..n { + if let (Some(dest), Some(rhs)) = ( + self.as_mut_slice().get_mut(i), + b.as_slice().get_subslice(i * m..(i + 1) * m), + ) { + *dest += unrolled_dot_1(lhs, rhs); + } else { + debug_assert!(false, "unreachable: dims checked above"); + } + } + } + + /// Calculate the dot product of a and b, adding the result to self. 
+ /// + /// Self should be _MxN_; `a`, _O_; and `b`, _MxNxO_. + pub(super) fn add_dot_3d_2(&mut self, a: MatrixZero<1>, b: MatrixZero<3>) { + let m = a.dim(); + let n = self.as_borrowed().dim().0 * self.as_borrowed().dim().1; + debug_assert_eq!( + m, + b.dim().2, + "dims: {:?}/{:?}/{:?}", + self.as_borrowed().dim(), + a.dim(), + b.dim() + ); + debug_assert_eq!( + n, + b.dim().0 * b.dim().1, + "dims: {:?}/{:?}/{:?}", + self.as_borrowed().dim(), + a.dim(), + b.dim() + ); + // Note: The following two loops are equivalent, but the second has more opportunity for + // vectorization since it allows the vectorization to span submatrices. + // for i in 0..b.dim().0 { + // self.submatrix_mut::<1>(i).add_dot_2d(a, b.submatrix(i)); + // } + let lhs = a.as_slice(); + for i in 0..n { + if let (Some(dest), Some(rhs)) = ( + self.as_mut_slice().get_mut(i), + b.as_slice().get_subslice(i * m..(i + 1) * m), + ) { + *dest += unrolled_dot_2(lhs, rhs); + } else { + debug_assert!(false, "unreachable: dims checked above"); + } + } + } +} + +/// A `D`-dimensional matrix borrowed from a [`ZeroSlice`]. 
+#[derive(Debug, Clone, Copy)] +pub(super) struct MatrixZero<'a, const D: usize> { + data: &'a ZeroSlice<f32>, + dims: [usize; D], +} + +impl<'a> From<&'a crate::provider::LstmMatrix1<'a>> for MatrixZero<'a, 1> { + fn from(other: &'a crate::provider::LstmMatrix1<'a>) -> Self { + Self { + data: &other.data, + dims: other.dims.map(|x| x as usize), + } + } +} + +impl<'a> From<&'a crate::provider::LstmMatrix2<'a>> for MatrixZero<'a, 2> { + fn from(other: &'a crate::provider::LstmMatrix2<'a>) -> Self { + Self { + data: &other.data, + dims: other.dims.map(|x| x as usize), + } + } +} + +impl<'a> From<&'a crate::provider::LstmMatrix3<'a>> for MatrixZero<'a, 3> { + fn from(other: &'a crate::provider::LstmMatrix3<'a>) -> Self { + Self { + data: &other.data, + dims: other.dims.map(|x| x as usize), + } + } +} + +impl<'a, const D: usize> MatrixZero<'a, D> { + #[allow(clippy::wrong_self_convention)] // same convention as slice::to_vec + pub(super) fn to_owned(&self) -> MatrixOwned<D> { + MatrixOwned { + data: self.data.iter().collect(), + dims: self.dims, + } + } + + pub(super) fn as_slice(&self) -> &ZeroSlice<f32> { + self.data + } + + #[cfg(debug_assertions)] + pub(super) fn debug_assert_dims(&self, dims: [usize; D]) { + debug_assert_eq!(dims, self.dims); + let expected_len = dims.iter().product::<usize>(); + debug_assert_eq!(expected_len, self.data.len()); + } + + /// See [`MatrixOwned::submatrix`]. + #[inline] + pub(super) fn submatrix<const M: usize>(&self, index: usize) -> Option<MatrixZero<'a, M>> { + // This assertion is based on const generics; it should always succeed and be elided. + assert_eq!(M, D - 1); + let (range, dims) = self.submatrix_range(index); + let data = &self.data.get_subslice(range)?; + Some(MatrixZero { data, dims }) + } + + #[inline] + fn submatrix_range<const M: usize>(&self, index: usize) -> (Range<usize>, [usize; M]) { + // This assertion is based on const generics; it should always succeed and be elided. 
+ assert_eq!(M, D - 1); + // The above assertion guarantees that the following line will succeed + #[allow(clippy::indexing_slicing, clippy::unwrap_used)] + let sub_dims: [usize; M] = self.dims[1..].try_into().unwrap(); + let n = sub_dims.iter().product::<usize>(); + (n * index..n * (index + 1), sub_dims) + } +} + +macro_rules! f32c { + ($ule:expr) => { + f32::from_unaligned($ule) + }; +} + +/// Compute the dot product of an aligned and an unaligned f32 slice. +/// +/// `xs` and `ys` must be the same length +/// +/// (Based on ndarray 0.15.6) +fn unrolled_dot_1(xs: &[f32], ys: &ZeroSlice<f32>) -> f32 { + debug_assert_eq!(xs.len(), ys.len()); + // eightfold unrolled so that floating point can be vectorized + // (even with strict floating point accuracy semantics) + let mut p = (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); + let xit = xs.chunks_exact(8); + let yit = ys.as_ule_slice().chunks_exact(8); + let sum = xit + .remainder() + .iter() + .zip(yit.remainder().iter()) + .map(|(x, y)| x * f32c!(*y)) + .sum::<f32>(); + for (xx, yy) in xit.zip(yit) { + // TODO: Use array_chunks once stable to avoid the unwrap. + // <https://github.com/rust-lang/rust/issues/74985> + #[allow(clippy::unwrap_used)] + let [x0, x1, x2, x3, x4, x5, x6, x7] = *<&[f32; 8]>::try_from(xx).unwrap(); + #[allow(clippy::unwrap_used)] + let [y0, y1, y2, y3, y4, y5, y6, y7] = *<&[<f32 as AsULE>::ULE; 8]>::try_from(yy).unwrap(); + p.0 += x0 * f32c!(y0); + p.1 += x1 * f32c!(y1); + p.2 += x2 * f32c!(y2); + p.3 += x3 * f32c!(y3); + p.4 += x4 * f32c!(y4); + p.5 += x5 * f32c!(y5); + p.6 += x6 * f32c!(y6); + p.7 += x7 * f32c!(y7); + } + sum + (p.0 + p.4) + (p.1 + p.5) + (p.2 + p.6) + (p.3 + p.7) +} + +/// Compute the dot product of two unaligned f32 slices. 
+/// +/// `xs` and `ys` must be the same length +/// +/// (Based on ndarray 0.15.6) +fn unrolled_dot_2(xs: &ZeroSlice<f32>, ys: &ZeroSlice<f32>) -> f32 { + debug_assert_eq!(xs.len(), ys.len()); + // eightfold unrolled so that floating point can be vectorized + // (even with strict floating point accuracy semantics) + let mut p = (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); + let xit = xs.as_ule_slice().chunks_exact(8); + let yit = ys.as_ule_slice().chunks_exact(8); + let sum = xit + .remainder() + .iter() + .zip(yit.remainder().iter()) + .map(|(x, y)| f32c!(*x) * f32c!(*y)) + .sum::<f32>(); + for (xx, yy) in xit.zip(yit) { + // TODO: Use array_chunks once stable to avoid the unwrap. + // <https://github.com/rust-lang/rust/issues/74985> + #[allow(clippy::unwrap_used)] + let [x0, x1, x2, x3, x4, x5, x6, x7] = *<&[<f32 as AsULE>::ULE; 8]>::try_from(xx).unwrap(); + #[allow(clippy::unwrap_used)] + let [y0, y1, y2, y3, y4, y5, y6, y7] = *<&[<f32 as AsULE>::ULE; 8]>::try_from(yy).unwrap(); + p.0 += f32c!(x0) * f32c!(y0); + p.1 += f32c!(x1) * f32c!(y1); + p.2 += f32c!(x2) * f32c!(y2); + p.3 += f32c!(x3) * f32c!(y3); + p.4 += f32c!(x4) * f32c!(y4); + p.5 += f32c!(x5) * f32c!(y5); + p.6 += f32c!(x6) * f32c!(y6); + p.7 += f32c!(x7) * f32c!(y7); + } + sum + (p.0 + p.4) + (p.1 + p.5) + (p.2 + p.6) + (p.3 + p.7) +} diff --git a/third_party/rust/icu_segmenter/src/complex/lstm/mod.rs b/third_party/rust/icu_segmenter/src/complex/lstm/mod.rs new file mode 100644 index 0000000000..8718cbd3da --- /dev/null +++ b/third_party/rust/icu_segmenter/src/complex/lstm/mod.rs @@ -0,0 +1,402 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 

use crate::grapheme::GraphemeClusterSegmenter;
use crate::provider::*;
use alloc::vec::Vec;
use core::char::{decode_utf16, REPLACEMENT_CHARACTER};
use zerovec::{maps::ZeroMapBorrowed, ule::UnvalidatedStr};

mod matrix;
use matrix::*;

// A word break iterator using an LSTM model. The input string has to be in a
// single language (mixed-language text is split before reaching this code).

/// Adapts the per-element BIES decisions of [`BiesIterator`] into UTF-8 byte
/// offsets of word boundaries in `input`.
struct LstmSegmenterIterator<'s> {
    input: &'s str,
    // Byte offset just past the characters consumed so far; always a char boundary.
    pos_utf8: usize,
    bies: BiesIterator<'s>,
}

impl Iterator for LstmSegmenterIterator<'_> {
    type Item = usize;

    fn next(&mut self) -> Option<Self::Item> {
        #[allow(clippy::indexing_slicing)] // pos_utf8 in range
        loop {
            // `true` means the model tagged this element as the End of a word.
            let is_e = self.bies.next()?;
            // Advance past the next character of the input.
            self.pos_utf8 += self.input[self.pos_utf8..].chars().next()?.len_utf8();
            // A boundary is reported on an E tag, or unconditionally at end of input.
            if is_e || self.bies.len() == 0 {
                return Some(self.pos_utf8);
            }
        }
    }
}

/// UTF-16 counterpart of [`LstmSegmenterIterator`].
struct LstmSegmenterIteratorUtf16<'s> {
    bies: BiesIterator<'s>,
    // NOTE(review): `pos` counts model elements (one per decoded scalar), not
    // UTF-16 code units — confirm callers expect this for supplementary-plane text.
    pos: usize,
}

impl Iterator for LstmSegmenterIteratorUtf16<'_> {
    type Item = usize;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            self.pos += 1;
            if self.bies.next()? || self.bies.len() == 0 {
                return Some(self.pos);
            }
        }
    }
}

/// An LSTM-based word segmenter borrowing its weights from a data payload.
pub(super) struct LstmSegmenter<'l> {
    /// Maps a grapheme cluster (or single code point) to its embedding row id.
    dic: ZeroMapBorrowed<'l, UnvalidatedStr, u16>,
    embedding: MatrixZero<'l, 2>,
    // Forward-direction LSTM weights (input / recurrent / bias — see `compute_hc`).
    fw_w: MatrixZero<'l, 3>,
    fw_u: MatrixZero<'l, 3>,
    fw_b: MatrixZero<'l, 2>,
    // Backward-direction LSTM weights.
    bw_w: MatrixZero<'l, 3>,
    bw_u: MatrixZero<'l, 3>,
    bw_b: MatrixZero<'l, 2>,
    // Output layer, split into the forward and backward halves of `time_w`.
    timew_fw: MatrixZero<'l, 2>,
    timew_bw: MatrixZero<'l, 2>,
    time_b: MatrixZero<'l, 1>,
    /// `Some` iff the model operates on grapheme clusters rather than code points.
    grapheme: Option<&'l RuleBreakDataV1<'l>>,
}

impl<'l> LstmSegmenter<'l> {
    /// Constructs a segmenter from an LSTM payload and grapheme-cluster rules.
    /// The grapheme data is only retained for models trained on grapheme clusters.
    pub(super) fn new(lstm: &'l LstmDataV1<'l>, grapheme: &'l RuleBreakDataV1<'l>) -> Self {
        let LstmDataV1::Float32(lstm) = lstm;
        let time_w = MatrixZero::from(&lstm.time_w);
        #[allow(clippy::unwrap_used)] // shape (2, 4, hunits)
        let timew_fw = time_w.submatrix(0).unwrap();
        #[allow(clippy::unwrap_used)] // shape (2, 4, hunits)
        let timew_bw = time_w.submatrix(1).unwrap();
        Self {
            dic: lstm.dic.as_borrowed(),
            embedding: MatrixZero::from(&lstm.embedding),
            fw_w: MatrixZero::from(&lstm.fw_w),
            fw_u: MatrixZero::from(&lstm.fw_u),
            fw_b: MatrixZero::from(&lstm.fw_b),
            bw_w: MatrixZero::from(&lstm.bw_w),
            bw_u: MatrixZero::from(&lstm.bw_u),
            bw_b: MatrixZero::from(&lstm.bw_b),
            timew_fw,
            timew_bw,
            time_b: MatrixZero::from(&lstm.time_b),
            grapheme: (lstm.model == ModelType::GraphemeClusters).then_some(grapheme),
        }
    }

    /// Create an LSTM based break iterator for an `str` (a UTF-8 string).
    pub(super) fn segment_str(&'l self, input: &'l str) -> impl Iterator<Item = usize> + 'l {
        self.segment_str_p(input)
    }

    // For unit testing as we cannot inspect the opaque type's bies
    fn segment_str_p(&'l self, input: &'l str) -> LstmSegmenterIterator<'l> {
        // Map the input to embedding ids. Unknown elements map to `dic.len()`,
        // the out-of-vocabulary row of the embedding matrix (which has
        // `dic.len() + 1` rows — see the shape comments in `BiesIterator`).
        let input_seq = if let Some(grapheme) = self.grapheme {
            GraphemeClusterSegmenter::new_and_segment_str(input, grapheme)
                .collect::<Vec<usize>>()
                .windows(2)
                .map(|chunk| {
                    let range = if let [first, second, ..] = chunk {
                        *first..*second
                    } else {
                        unreachable!()
                    };
                    let grapheme_cluster = if let Some(grapheme_cluster) = input.get(range) {
                        grapheme_cluster
                    } else {
                        return self.dic.len() as u16;
                    };

                    self.dic
                        .get_copied(UnvalidatedStr::from_str(grapheme_cluster))
                        .unwrap_or_else(|| self.dic.len() as u16)
                })
                .collect()
        } else {
            input
                .chars()
                .map(|c| {
                    self.dic
                        .get_copied(UnvalidatedStr::from_str(c.encode_utf8(&mut [0; 4])))
                        .unwrap_or_else(|| self.dic.len() as u16)
                })
                .collect()
        };
        LstmSegmenterIterator {
            input,
            pos_utf8: 0,
            bies: BiesIterator::new(self, input_seq),
        }
    }

    /// Create an LSTM based break iterator for a UTF-16 string.
    pub(super) fn segment_utf16(&'l self, input: &[u16]) -> impl Iterator<Item = usize> + 'l {
        let input_seq = if let Some(grapheme) = self.grapheme {
            GraphemeClusterSegmenter::new_and_segment_utf16(input, grapheme)
                .collect::<Vec<usize>>()
                .windows(2)
                .map(|chunk| {
                    let range = if let [first, second, ..] = chunk {
                        *first..*second
                    } else {
                        unreachable!()
                    };
                    let grapheme_cluster = if let Some(grapheme_cluster) = input.get(range) {
                        grapheme_cluster
                    } else {
                        return self.dic.len() as u16;
                    };

                    // Compare the UTF-8 dictionary key byte-wise against the
                    // UTF-8 encoding of the decoded UTF-16 cluster, avoiding an
                    // intermediate String allocation.
                    self.dic
                        .get_copied_by(|key| {
                            key.as_bytes().iter().copied().cmp(
                                decode_utf16(grapheme_cluster.iter().copied()).flat_map(|c| {
                                    let mut buf = [0; 4];
                                    let len = c
                                        .unwrap_or(REPLACEMENT_CHARACTER)
                                        .encode_utf8(&mut buf)
                                        .len();
                                    buf.into_iter().take(len)
                                }),
                            )
                        })
                        .unwrap_or_else(|| self.dic.len() as u16)
                })
                .collect()
        } else {
            decode_utf16(input.iter().copied())
                .map(|c| c.unwrap_or(REPLACEMENT_CHARACTER))
                .map(|c| {
                    self.dic
                        .get_copied(UnvalidatedStr::from_str(c.encode_utf8(&mut [0; 4])))
                        .unwrap_or_else(|| self.dic.len() as u16)
                })
                .collect()
        };
        LstmSegmenterIteratorUtf16 {
            bies: BiesIterator::new(self, input_seq),
            pos: 0,
        }
    }
}

/// Iterates over the model's BIES decisions, yielding `true` for elements
/// tagged `E` (end of word) and `false` otherwise.
struct BiesIterator<'l> {
    segmenter: &'l LstmSegmenter<'l>,
    input_seq: core::iter::Enumerate<alloc::vec::IntoIter<u16>>,
    // Precomputed backward-pass hidden states, one row per input element.
    h_bw: MatrixOwned<2>,
    // Forward-pass hidden and cell state, updated incrementally in `next()`.
    curr_fw: MatrixOwned<1>,
    c_fw: MatrixOwned<1>,
}

impl<'l> BiesIterator<'l> {
    // input_seq is a sequence of id numbers that represent grapheme clusters or code points in the input line. These ids are used later
    // in the embedding layer of the model.
    /// Runs the entire backward LSTM pass eagerly, storing one hidden-state row
    /// per element in `h_bw`; the forward pass is then computed lazily, one
    /// step per call to `next()`.
    fn new(segmenter: &'l LstmSegmenter<'l>, input_seq: Vec<u16>) -> Self {
        let hunits = segmenter.fw_u.dim().1;

        // Backward LSTM
        let mut c_bw = MatrixOwned::<1>::new_zero([hunits]);
        let mut h_bw = MatrixOwned::<2>::new_zero([input_seq.len(), hunits]);
        for (i, &g_id) in input_seq.iter().enumerate().rev() {
            // Seed step i's hidden state with step i+1's (the previous step in
            // backward order) before updating it in place.
            if i + 1 < input_seq.len() {
                h_bw.as_mut().copy_submatrix::<1>(i + 1, i);
            }
            #[allow(clippy::unwrap_used)]
            compute_hc(
                segmenter.embedding.submatrix::<1>(g_id as usize).unwrap(), /* shape (dict.len() + 1, hunit), g_id is at most dict.len() */
                h_bw.submatrix_mut(i).unwrap(), // shape (input_seq.len(), hunits)
                c_bw.as_mut(),
                segmenter.bw_w,
                segmenter.bw_u,
                segmenter.bw_b,
            );
        }

        Self {
            input_seq: input_seq.into_iter().enumerate(),
            h_bw,
            c_fw: MatrixOwned::<1>::new_zero([hunits]),
            curr_fw: MatrixOwned::<1>::new_zero([hunits]),
            segmenter,
        }
    }
}

impl ExactSizeIterator for BiesIterator<'_> {
    // Number of elements not yet consumed; used by the callers to detect end of input.
    fn len(&self) -> usize {
        self.input_seq.len()
    }
}

impl Iterator for BiesIterator<'_> {
    type Item = bool;

    fn next(&mut self) -> Option<Self::Item> {
        let (i, g_id) = self.input_seq.next()?;

        // One forward-LSTM step for this element, updating curr_fw/c_fw in place.
        #[allow(clippy::unwrap_used)]
        compute_hc(
            self.segmenter
                .embedding
                .submatrix::<1>(g_id as usize)
                .unwrap(), // shape (dict.len() + 1, hunit), g_id is at most dict.len()
            self.curr_fw.as_mut(),
            self.c_fw.as_mut(),
            self.segmenter.fw_w,
            self.segmenter.fw_u,
            self.segmenter.fw_b,
        );

        #[allow(clippy::unwrap_used)] // shape (input_seq.len(), hunits)
        let curr_bw = self.h_bw.submatrix::<1>(i).unwrap();
        // Combine forward and (precomputed) backward states through the output
        // layer to obtain the four B/I/E/S scores.
        let mut weights = [0.0; 4];
        let mut curr_est = MatrixBorrowedMut {
            data: &mut weights,
            dims: [4],
        };
        curr_est.add_dot_2d(self.curr_fw.as_borrowed(), self.segmenter.timew_fw);
        curr_est.add_dot_2d(curr_bw, self.segmenter.timew_bw);
        #[allow(clippy::unwrap_used)] // both shape (4)
        curr_est.add(self.segmenter.time_b).unwrap();
        // For correct BIES weight calculation we'd now have to apply softmax, however
        // we're only doing a naive argmax, so a monotonic function doesn't make a difference.

        // `true` iff `E` (index 2) has the strictly largest score.
        Some(weights[2] > weights[0] && weights[2] > weights[1] && weights[2] > weights[3])
    }
}

/// `compute_hc` implements the evaluation of one LSTM cell for one time step.
///
/// * `x_t` — embedding of the current element
/// * `h_tm1` / `c_tm1` — hidden and cell state of the previous step, updated in place
/// * `w` / `u` / `b` — input weights, recurrent weights and bias, each stacked
///   as four gate slices along the first dimension
fn compute_hc<'a>(
    x_t: MatrixZero<'a, 1>,
    mut h_tm1: MatrixBorrowedMut<'a, 1>,
    mut c_tm1: MatrixBorrowedMut<'a, 1>,
    w: MatrixZero<'a, 3>,
    u: MatrixZero<'a, 3>,
    b: MatrixZero<'a, 2>,
) {
    #[cfg(debug_assertions)]
    {
        let hunits = h_tm1.dim();
        let embedd_dim = x_t.dim();
        c_tm1.as_borrowed().debug_assert_dims([hunits]);
        w.debug_assert_dims([4, hunits, embedd_dim]);
        u.debug_assert_dims([4, hunits, hunits]);
        b.debug_assert_dims([4, hunits]);
    }

    // s_t = b + w·x_t + u·h_{t-1}, for all four gates at once.
    let mut s_t = b.to_owned();

    s_t.as_mut().add_dot_3d_2(x_t, w);
    s_t.as_mut().add_dot_3d_1(h_tm1.as_borrowed(), u);

    // Gate activations: sigmoid for slices 0, 1 and 3, tanh for slice 2.
    #[allow(clippy::unwrap_used)] // first dimension is 4
    s_t.submatrix_mut::<1>(0).unwrap().sigmoid_transform();
    #[allow(clippy::unwrap_used)] // first dimension is 4
    s_t.submatrix_mut::<1>(1).unwrap().sigmoid_transform();
    #[allow(clippy::unwrap_used)] // first dimension is 4
    s_t.submatrix_mut::<1>(2).unwrap().tanh_transform();
    #[allow(clippy::unwrap_used)] // first dimension is 4
    s_t.submatrix_mut::<1>(3).unwrap().sigmoid_transform();

    // c_t = gate(0) * slice(2) + gate(1) * c_{t-1} (combined in `convolve`).
    #[allow(clippy::unwrap_used)] // first dimension is 4
    c_tm1.convolve(
        s_t.as_borrowed().submatrix(0).unwrap(),
        s_t.as_borrowed().submatrix(2).unwrap(),
        s_t.as_borrowed().submatrix(1).unwrap(),
    );

    // h_t = gate(3) * tanh(c_t).
    #[allow(clippy::unwrap_used)] // first dimension is 4
    h_tm1.mul_tanh(s_t.as_borrowed().submatrix(3).unwrap(), c_tm1.as_borrowed());
}

#[cfg(test)]
mod tests {
    use super::*;
    use icu_locid::locale;
    use icu_provider::prelude::*;
    use serde::Deserialize;
    use std::fs::File;
    use std::io::BufReader;

    /// `TestCase` is a struct used to store a single test case.
    /// Each test case has the attributes `unseg` (the unsegmented line),
    /// `expected_bies` (the BIES sequence the model is expected to produce),
    /// and `true_bies` (the BIES sequence representing the true segmentation).
    #[derive(PartialEq, Debug, Deserialize)]
    struct TestCase {
        unseg: String,
        expected_bies: String,
        true_bies: String,
    }

    /// `TestTextData` is a struct to store a vector of `TestCase` that represents a test text.
    #[derive(PartialEq, Debug, Deserialize)]
    struct TestTextData {
        testcases: Vec<TestCase>,
    }

    #[derive(Debug)]
    struct TestText {
        data: TestTextData,
    }

    /// Loads a JSON test file into `TestTextData`; panics on missing file or bad JSON.
    fn load_test_text(filename: &str) -> TestTextData {
        let file = File::open(filename).expect("File should be present");
        let reader = BufReader::new(file);
        serde_json::from_reader(reader).expect("JSON syntax error")
    }

    #[test]
    fn segment_file_by_lstm() {
        // Build a Thai LSTM segmenter from the baked provider.
        let lstm: DataPayload<LstmForWordLineAutoV1Marker> = crate::provider::Baked
            .load(DataRequest {
                locale: &locale!("th").into(),
                metadata: Default::default(),
            })
            .unwrap()
            .take_payload()
            .unwrap();
        let lstm = LstmSegmenter::new(
            lstm.get(),
            crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
        );

        // Importing the test data
        let test_text_data = load_test_text(&format!(
            "tests/testdata/test_text_{}.json",
            if lstm.grapheme.is_some() {
                "grapheme"
            } else {
                "codepoints"
            }
        ));
        let test_text = TestText {
            data: test_text_data,
        };

        // Testing: the iterator only exposes E-vs-not-E, so non-E tags in the
        // expectation are normalized to '?' before comparing.
        for test_case in &test_text.data.testcases {
            let lstm_output = lstm
                .segment_str_p(&test_case.unseg)
                .bies
                .map(|is_e| if is_e { 'e' } else { '?' })
                .collect::<String>();
            println!("Test case : {}", test_case.unseg);
            println!("Expected bies : {}", test_case.expected_bies);
            println!("Estimated bies : {lstm_output}");
            println!("True bies : {}", test_case.true_bies);
            println!("****************************************************");
            assert_eq!(
                test_case.expected_bies.replace(['b', 'i', 's'], "?"),
                lstm_output
            );
        }
    }
}
diff --git a/third_party/rust/icu_segmenter/src/complex/mod.rs b/third_party/rust/icu_segmenter/src/complex/mod.rs
new file mode 100644
index 0000000000..65f49a92f0
--- /dev/null
+++ b/third_party/rust/icu_segmenter/src/complex/mod.rs
@@ -0,0 +1,440 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use crate::provider::*;
use alloc::vec::Vec;
use icu_locid::{locale, Locale};
use icu_provider::prelude::*;

mod dictionary;
use dictionary::*;
mod language;
use language::*;
#[cfg(feature = "lstm")]
mod lstm;
#[cfg(feature = "lstm")]
use lstm::*;

// A per-language engine: `Ok` holds a dictionary payload, `Err` holds an LSTM
// payload (or `Infallible` when the "lstm" feature is disabled, making the
// `Err` arm statically unreachable).
#[cfg(not(feature = "lstm"))]
type DictOrLstm = Result<DataPayload<UCharDictionaryBreakDataV1Marker>, core::convert::Infallible>;
#[cfg(not(feature = "lstm"))]
type DictOrLstmBorrowed<'a> =
    Result<&'a DataPayload<UCharDictionaryBreakDataV1Marker>, &'a core::convert::Infallible>;

#[cfg(feature = "lstm")]
type DictOrLstm =
    Result<DataPayload<UCharDictionaryBreakDataV1Marker>, DataPayload<LstmDataV1Marker>>;
#[cfg(feature = "lstm")]
type DictOrLstmBorrowed<'a> =
    Result<&'a DataPayload<UCharDictionaryBreakDataV1Marker>, &'a DataPayload<LstmDataV1Marker>>;

/// Holds the grapheme data plus one optional segmentation engine per
/// complex-script language (Burmese, Khmer, Lao, Thai, Japanese/Chinese).
#[derive(Debug)]
pub(crate) struct ComplexPayloads {
    grapheme: DataPayload<GraphemeClusterBreakDataV1Marker>,
    my: Option<DictOrLstm>,
    km: Option<DictOrLstm>,
    lo: Option<DictOrLstm>,
    th: Option<DictOrLstm>,
    ja: Option<DataPayload<UCharDictionaryBreakDataV1Marker>>,
}

impl
ComplexPayloads {
    /// Returns the engine for `language`, or `None` if no model was loaded.
    /// A missing model for a known language is surfaced via a custom `DataError`
    /// before returning `None`.
    fn select(&self, language: Language) -> Option<DictOrLstmBorrowed> {
        const ERR: DataError = DataError::custom("No segmentation model for language");
        match language {
            // NOTE(review): `with_display_context` appears to be used for its
            // reporting side effect; the returned error value is discarded.
            Language::Burmese => self.my.as_ref().map(Result::as_ref).or_else(|| {
                ERR.with_display_context("my");
                None
            }),
            Language::Khmer => self.km.as_ref().map(Result::as_ref).or_else(|| {
                ERR.with_display_context("km");
                None
            }),
            Language::Lao => self.lo.as_ref().map(Result::as_ref).or_else(|| {
                ERR.with_display_context("lo");
                None
            }),
            Language::Thai => self.th.as_ref().map(Result::as_ref).or_else(|| {
                ERR.with_display_context("th");
                None
            }),
            Language::ChineseOrJapanese => self.ja.as_ref().map(Ok).or_else(|| {
                ERR.with_display_context("ja");
                None
            }),
            Language::Unknown => None,
        }
    }

    /// LSTM models for my/km/lo/th from compiled data; no Japanese dictionary.
    #[cfg(feature = "lstm")]
    #[cfg(feature = "compiled_data")]
    pub(crate) fn new_lstm() -> Self {
        #[allow(clippy::unwrap_used)]
        // try_load is infallible if the provider only returns `MissingLocale`.
        Self {
            grapheme: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
            ),
            my: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("my"))
                .unwrap()
                .map(DataPayload::cast)
                .map(Err),
            km: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("km"))
                .unwrap()
                .map(DataPayload::cast)
                .map(Err),
            lo: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("lo"))
                .unwrap()
                .map(DataPayload::cast)
                .map(Err),
            th: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("th"))
                .unwrap()
                .map(DataPayload::cast)
                .map(Err),
            ja: None,
        }
    }

    /// Same as [`Self::new_lstm`] but loading from a caller-supplied provider.
    #[cfg(feature = "lstm")]
    pub(crate) fn try_new_lstm<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<GraphemeClusterBreakDataV1Marker>
            + DataProvider<LstmForWordLineAutoV1Marker>
            + ?Sized,
    {
        Ok(Self {
            grapheme: provider.load(Default::default())?.take_payload()?,
            my: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("my"))?
                .map(DataPayload::cast)
                .map(Err),
            km: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("km"))?
                .map(DataPayload::cast)
                .map(Err),
            lo: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("lo"))?
                .map(DataPayload::cast)
                .map(Err),
            th: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("th"))?
                .map(DataPayload::cast)
                .map(Err),
            ja: None,
        })
    }

    /// Dictionary models for my/km/lo/th plus the Japanese dictionary, from compiled data.
    #[cfg(feature = "compiled_data")]
    pub(crate) fn new_dict() -> Self {
        #[allow(clippy::unwrap_used)]
        // try_load is infallible if the provider only returns `MissingLocale`.
        Self {
            grapheme: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
            ),
            my: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
                &crate::provider::Baked,
                locale!("my"),
            )
            .unwrap()
            .map(DataPayload::cast)
            .map(Ok),
            km: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
                &crate::provider::Baked,
                locale!("km"),
            )
            .unwrap()
            .map(DataPayload::cast)
            .map(Ok),
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
                &crate::provider::Baked,
                locale!("lo"),
            )
            .unwrap()
            .map(DataPayload::cast)
            .map(Ok),
            th: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
                &crate::provider::Baked,
                locale!("th"),
            )
            .unwrap()
            .map(DataPayload::cast)
            .map(Ok),
            ja: try_load::<DictionaryForWordOnlyAutoV1Marker, _>(
                &crate::provider::Baked,
                locale!("ja"),
            )
            .unwrap()
            .map(DataPayload::cast),
        }
    }

    /// Same as [`Self::new_dict`] but loading from a caller-supplied provider.
    pub(crate) fn try_new_dict<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<GraphemeClusterBreakDataV1Marker>
            + DataProvider<DictionaryForWordLineExtendedV1Marker>
            + DataProvider<DictionaryForWordOnlyAutoV1Marker>
            + ?Sized,
    {
        Ok(Self {
            grapheme: provider.load(Default::default())?.take_payload()?,
            my: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, locale!("my"))?
                .map(DataPayload::cast)
                .map(Ok),
            km: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, locale!("km"))?
                .map(DataPayload::cast)
                .map(Ok),
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, locale!("lo"))?
                .map(DataPayload::cast)
                .map(Ok),
            th: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, locale!("th"))?
                .map(DataPayload::cast)
                .map(Ok),
            ja: try_load::<DictionaryForWordOnlyAutoV1Marker, D>(provider, locale!("ja"))?
                .map(DataPayload::cast),
        })
    }

    /// LSTM models for my/km/lo/th plus the Japanese dictionary, from compiled data.
    #[cfg(feature = "auto")] // Use by WordSegmenter with "auto" enabled.
    #[cfg(feature = "compiled_data")]
    pub(crate) fn new_auto() -> Self {
        #[allow(clippy::unwrap_used)]
        // try_load is infallible if the provider only returns `MissingLocale`.
        Self {
            grapheme: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
            ),
            my: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("my"))
                .unwrap()
                .map(DataPayload::cast)
                .map(Err),
            km: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("km"))
                .unwrap()
                .map(DataPayload::cast)
                .map(Err),
            lo: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("lo"))
                .unwrap()
                .map(DataPayload::cast)
                .map(Err),
            th: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("th"))
                .unwrap()
                .map(DataPayload::cast)
                .map(Err),
            ja: try_load::<DictionaryForWordOnlyAutoV1Marker, _>(
                &crate::provider::Baked,
                locale!("ja"),
            )
            .unwrap()
            .map(DataPayload::cast),
        }
    }

    /// Same as [`Self::new_auto`] but loading from a caller-supplied provider.
    #[cfg(feature = "auto")] // Use by WordSegmenter with "auto" enabled.
    pub(crate) fn try_new_auto<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<GraphemeClusterBreakDataV1Marker>
            + DataProvider<LstmForWordLineAutoV1Marker>
            + DataProvider<DictionaryForWordOnlyAutoV1Marker>
            + ?Sized,
    {
        Ok(Self {
            grapheme: provider.load(Default::default())?.take_payload()?,
            my: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("my"))?
                .map(DataPayload::cast)
                .map(Err),
            km: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("km"))?
                .map(DataPayload::cast)
                .map(Err),
            lo: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("lo"))?
                .map(DataPayload::cast)
                .map(Err),
            th: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("th"))?
                .map(DataPayload::cast)
                .map(Err),
            ja: try_load::<DictionaryForWordOnlyAutoV1Marker, D>(provider, locale!("ja"))?
                .map(DataPayload::cast),
        })
    }

    /// Dictionary models for my/km/lo/th only (no Japanese), from compiled data.
    #[cfg(feature = "compiled_data")]
    pub(crate) fn new_southeast_asian() -> Self {
        #[allow(clippy::unwrap_used)]
        // try_load is infallible if the provider only returns `MissingLocale`.
        Self {
            grapheme: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
            ),
            my: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
                &crate::provider::Baked,
                locale!("my"),
            )
            .unwrap()
            .map(DataPayload::cast)
            .map(Ok),
            km: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
                &crate::provider::Baked,
                locale!("km"),
            )
            .unwrap()
            .map(DataPayload::cast)
            .map(Ok),
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
                &crate::provider::Baked,
                locale!("lo"),
            )
            .unwrap()
            .map(DataPayload::cast)
            .map(Ok),
            th: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
                &crate::provider::Baked,
                locale!("th"),
            )
            .unwrap()
            .map(DataPayload::cast)
            .map(Ok),
            ja: None,
        }
    }

    /// Same as [`Self::new_southeast_asian`] but loading from a caller-supplied provider.
    pub(crate) fn try_new_southeast_asian<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<DictionaryForWordLineExtendedV1Marker>
            + DataProvider<GraphemeClusterBreakDataV1Marker>
            + ?Sized,
    {
        Ok(Self {
            grapheme: provider.load(Default::default())?.take_payload()?,
            my: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, locale!("my"))?
                .map(DataPayload::cast)
                .map(Ok),
            km: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, locale!("km"))?
                .map(DataPayload::cast)
                .map(Ok),
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, locale!("lo"))?
                .map(DataPayload::cast)
                .map(Ok),
            th: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, locale!("th"))?
                .map(DataPayload::cast)
                .map(Ok),
            ja: None,
        })
    }
}

/// Loads `M` for `locale`, translating `MissingLocale` into `Ok(None)` so that
/// absent per-language models are not treated as hard errors.
fn try_load<M: KeyedDataMarker, P: DataProvider<M> + ?Sized>(
    provider: &P,
    locale: Locale,
) -> Result<Option<DataPayload<M>>, DataError> {
    match provider.load(DataRequest {
        locale: &DataLocale::from(locale),
        metadata: {
            // Silent: a missing locale is an expected, recoverable outcome here.
            let mut m = DataRequestMetadata::default();
            m.silent = true;
            m
        },
    }) {
        Ok(response) => Ok(Some(response.take_payload()?)),
        Err(DataError {
            kind: DataErrorKind::MissingLocale,
            ..
        }) => Ok(None),
        Err(e) => Err(e),
    }
}

/// Return UTF-16 segment offset array using dictionary or lstm segmenter.
pub(crate) fn complex_language_segment_utf16(
    payloads: &ComplexPayloads,
    input: &[u16],
) -> Vec<usize> {
    let mut result = Vec::new();
    let mut offset = 0;
    // Split the input into maximal same-language runs and segment each run with
    // whichever engine is available, rebasing the offsets to the full input.
    for (slice, lang) in LanguageIteratorUtf16::new(input) {
        match payloads.select(lang) {
            Some(Ok(dict)) => {
                result.extend(
                    DictionarySegmenter::new(dict.get(), payloads.grapheme.get())
                        .segment_utf16(slice)
                        .map(|n| offset + n),
                );
            }
            #[cfg(feature = "lstm")]
            Some(Err(lstm)) => {
                result.extend(
                    LstmSegmenter::new(lstm.get(), payloads.grapheme.get())
                        .segment_utf16(slice)
                        .map(|n| offset + n),
                );
            }
            #[cfg(not(feature = "lstm"))]
            Some(Err(_infallible)) => {} // should be refutable
            None => {
                // No model: treat the entire run as a single segment.
                result.push(offset + slice.len());
            }
        }
        offset += slice.len();
    }
    result
}

/// Return UTF-8 segment offset array using dictionary or lstm segmenter.
pub(crate) fn complex_language_segment_str(payloads: &ComplexPayloads, input: &str) -> Vec<usize> {
    let mut result = Vec::new();
    let mut offset = 0;
    // Mirrors `complex_language_segment_utf16`, but over UTF-8 byte offsets.
    for (slice, lang) in LanguageIterator::new(input) {
        match payloads.select(lang) {
            Some(Ok(dict)) => {
                result.extend(
                    DictionarySegmenter::new(dict.get(), payloads.grapheme.get())
                        .segment_str(slice)
                        .map(|n| offset + n),
                );
            }
            #[cfg(feature = "lstm")]
            Some(Err(lstm)) => {
                result.extend(
                    LstmSegmenter::new(lstm.get(), payloads.grapheme.get())
                        .segment_str(slice)
                        .map(|n| offset + n),
                );
            }
            #[cfg(not(feature = "lstm"))]
            Some(Err(_infallible)) => {} // should be refutable
            None => {
                // No model: treat the entire run as a single segment.
                result.push(offset + slice.len());
            }
        }
        offset += slice.len();
    }
    result
}

#[cfg(test)]
#[cfg(feature = "serde")]
mod tests {
    use super::*;

    #[test]
    fn thai_word_break() {
        // Same logical boundaries reported as UTF-8 byte offsets and UTF-16
        // code-unit offsets (each Thai character is 3 UTF-8 bytes / 1 unit).
        const TEST_STR: &str = "ภาษาไทยภาษาไทย";
        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();

        let lstm = ComplexPayloads::new_lstm();
        let dict = ComplexPayloads::new_dict();

        assert_eq!(
            complex_language_segment_str(&lstm, TEST_STR),
            [12, 21, 33, 42]
        );
        assert_eq!(
            complex_language_segment_utf16(&lstm, &utf16),
            [4, 7, 11, 14]
        );

        assert_eq!(
            complex_language_segment_str(&dict, TEST_STR),
            [12, 21, 33, 42]
        );
        assert_eq!(
            complex_language_segment_utf16(&dict, &utf16),
            [4, 7, 11, 14]
        );
    }
}
diff --git a/third_party/rust/icu_segmenter/src/error.rs b/third_party/rust/icu_segmenter/src/error.rs
new file mode 100644
index 0000000000..b0f79ec85f
--- /dev/null
+++ b/third_party/rust/icu_segmenter/src/error.rs
@@ -0,0 +1,27 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use core::fmt::Debug;
use displaydoc::Display;
use icu_provider::DataError;

#[cfg(feature = "std")]
impl std::error::Error for SegmenterError {}

/// A list of error outcomes for various operations in this module.
///
/// Re-exported as [`Error`](crate::Error).
#[derive(Display, Debug, Copy, Clone, PartialEq)]
#[non_exhaustive]
pub enum SegmenterError {
    /// An error originating inside of the [data provider](icu_provider).
    #[displaydoc("{0}")]
    Data(DataError),
}

impl From<DataError> for SegmenterError {
    fn from(e: DataError) -> Self {
        Self::Data(e)
    }
}
diff --git a/third_party/rust/icu_segmenter/src/grapheme.rs b/third_party/rust/icu_segmenter/src/grapheme.rs
new file mode 100644
index 0000000000..9cfe0349bc
--- /dev/null
+++ b/third_party/rust/icu_segmenter/src/grapheme.rs
@@ -0,0 +1,270 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use alloc::vec::Vec;
use icu_provider::prelude::*;

use crate::indices::{Latin1Indices, Utf16Indices};
use crate::iterator_helpers::derive_usize_iterator_with_type;
use crate::rule_segmenter::*;
use crate::{provider::*, SegmenterError};
use utf8_iter::Utf8CharIndices;

/// Implements the [`Iterator`] trait over the grapheme cluster boundaries of the given string.
///
/// Lifetimes:
///
/// - `'l` = lifetime of the segmenter object from which this iterator was created
/// - `'s` = lifetime of the string being segmented
///
/// The [`Iterator::Item`] is a [`usize`] representing the index of a code unit
/// _after_ the boundary (for a boundary at the end of text, this index is the length
/// of the [`str`] or array of code units).
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
#[derive(Debug)]
pub struct GraphemeClusterBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized>(
    RuleBreakIterator<'l, 's, Y>,
);

// Forwards the `Iterator<Item = usize>` implementation to the wrapped
// `RuleBreakIterator`.
derive_usize_iterator_with_type!(GraphemeClusterBreakIterator);

/// Grapheme cluster break iterator for an `str` (a UTF-8 string).
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
pub type GraphemeClusterBreakIteratorUtf8<'l, 's> =
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeUtf8>;

/// Grapheme cluster break iterator for a potentially invalid UTF-8 string.
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
pub type GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8<'l, 's> =
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypePotentiallyIllFormedUtf8>;

/// Grapheme cluster break iterator for a Latin-1 (8-bit) string.
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
pub type GraphemeClusterBreakIteratorLatin1<'l, 's> =
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeLatin1>;

/// Grapheme cluster break iterator for a UTF-16 string.
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
pub type GraphemeClusterBreakIteratorUtf16<'l, 's> =
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeUtf16>;

/// Segments a string into grapheme clusters.
///
/// Supports loading grapheme cluster break data, and creating grapheme cluster break iterators for
/// different string encodings.
///
/// # Examples
///
/// Segment a string:
///
/// ```rust
/// use icu_segmenter::GraphemeClusterSegmenter;
/// let segmenter = GraphemeClusterSegmenter::new();
///
/// let breakpoints: Vec<usize> = segmenter.segment_str("Hello 🗺").collect();
/// // World Map (U+1F5FA) is encoded in four bytes in UTF-8.
/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 10]);
/// ```
///
/// Segment a Latin1 byte string:
///
/// ```rust
/// use icu_segmenter::GraphemeClusterSegmenter;
/// let segmenter = GraphemeClusterSegmenter::new();
///
/// let breakpoints: Vec<usize> =
///     segmenter.segment_latin1(b"Hello World").collect();
/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);
/// ```
///
/// Successive boundaries can be used to retrieve the grapheme clusters.
/// In particular, the first boundary is always 0, and the last one is the
/// length of the segmented text in code units.
///
/// ```rust
/// # use icu_segmenter::GraphemeClusterSegmenter;
/// # let segmenter =
/// #     GraphemeClusterSegmenter::new();
/// use itertools::Itertools;
/// let text = "मांजर";
/// let grapheme_clusters: Vec<&str> = segmenter
///     .segment_str(text)
///     .tuple_windows()
///     .map(|(i, j)| &text[i..j])
///     .collect();
/// assert_eq!(&grapheme_clusters, &["मां", "ज", "र"]);
/// ```
///
/// This segmenter applies all rules provided to the constructor.
/// Thus, if the data supplied by the provider comprises all
/// [grapheme cluster boundary rules][Rules] from Unicode Standard Annex #29,
/// _Unicode Text Segmentation_, which is the case of default data
/// (both test data and data produced by `icu_datagen`), the `segment_*`
/// functions return extended grapheme cluster boundaries, as opposed to
/// legacy grapheme cluster boundaries.  See [_Section 3, Grapheme Cluster
/// Boundaries_][GC], and [_Table 1a, Sample Grapheme Clusters_][Sample_GC],
/// in Unicode Standard Annex #29, _Unicode Text Segmentation_.
///
/// [Rules]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
/// [GC]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
/// [Sample_GC]: https://www.unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters
///
/// ```rust
/// use icu_segmenter::GraphemeClusterSegmenter;
/// let segmenter =
///     GraphemeClusterSegmenter::new();
///
/// // நி (TAMIL LETTER NA, TAMIL VOWEL SIGN I) is an extended grapheme cluster,
/// // but not a legacy grapheme cluster.
/// let ni = "நி";
/// let egc_boundaries: Vec<usize> = segmenter.segment_str(ni).collect();
/// assert_eq!(&egc_boundaries, &[0, ni.len()]);
/// ```
#[derive(Debug)]
pub struct GraphemeClusterSegmenter {
    payload: DataPayload<GraphemeClusterBreakDataV1Marker>,
}

#[cfg(feature = "compiled_data")]
impl Default for GraphemeClusterSegmenter {
    fn default() -> Self {
        Self::new()
    }
}

impl GraphemeClusterSegmenter {
    /// Constructs a [`GraphemeClusterSegmenter`] with an invariant locale from compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub fn new() -> Self {
        Self {
            payload: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
            ),
        }
    }

    // Generates the `try_new_with_any_provider` / `try_new_with_buffer_provider`
    // constructors that forward to `try_new_unstable`.
    icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: SegmenterError,
        #[cfg(skip)]
        functions: [
            new,
            try_new_with_any_provider,
            try_new_with_buffer_provider,
            try_new_unstable,
            Self,
        ]);

    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
    where
        D: DataProvider<GraphemeClusterBreakDataV1Marker> + ?Sized,
    {
        let payload = provider.load(Default::default())?.take_payload()?;
        Ok(Self { payload })
    }

    /// Creates a grapheme cluster break iterator for an `str` (a UTF-8 string).
    pub fn segment_str<'l, 's>(
        &'l self,
        input: &'s str,
    ) -> GraphemeClusterBreakIteratorUtf8<'l, 's> {
        GraphemeClusterSegmenter::new_and_segment_str(input, self.payload.get())
    }

    /// Creates a grapheme cluster break iterator from grapheme cluster rule payload.
    ///
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
    // pub(crate) so the complex-language segmenters can reuse it with a borrowed payload.
    pub(crate) fn new_and_segment_str<'l, 's>(
        input: &'s str,
        payload: &'l RuleBreakDataV1<'l>,
    ) -> GraphemeClusterBreakIteratorUtf8<'l, 's> {
        GraphemeClusterBreakIterator(RuleBreakIterator {
            iter: input.char_indices(),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data: payload,
            complex: None,
            boundary_property: 0,
        })
    }

    /// Creates a grapheme cluster break iterator for a potentially ill-formed UTF8 string
    ///
    /// Invalid characters are treated as REPLACEMENT CHARACTER
    ///
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
    pub fn segment_utf8<'l, 's>(
        &'l self,
        input: &'s [u8],
    ) -> GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
        GraphemeClusterBreakIterator(RuleBreakIterator {
            iter: Utf8CharIndices::new(input),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data: self.payload.get(),
            complex: None,
            boundary_property: 0,
        })
    }
    /// Creates a grapheme cluster break iterator for a Latin-1 (8-bit) string.
    ///
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
    pub fn segment_latin1<'l, 's>(
        &'l self,
        input: &'s [u8],
    ) -> GraphemeClusterBreakIteratorLatin1<'l, 's> {
        GraphemeClusterBreakIterator(RuleBreakIterator {
            iter: Latin1Indices::new(input),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data: self.payload.get(),
            complex: None,
            boundary_property: 0,
        })
    }

    /// Creates a grapheme cluster break iterator for a UTF-16 string.
    ///
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
    pub fn segment_utf16<'l, 's>(
        &'l self,
        input: &'s [u16],
    ) -> GraphemeClusterBreakIteratorUtf16<'l, 's> {
        GraphemeClusterSegmenter::new_and_segment_utf16(input, self.payload.get())
    }

    /// Creates a grapheme cluster break iterator from grapheme cluster rule payload.
    pub(crate) fn new_and_segment_utf16<'l, 's>(
        input: &'s [u16],
        payload: &'l RuleBreakDataV1<'l>,
    ) -> GraphemeClusterBreakIteratorUtf16<'l, 's> {
        GraphemeClusterBreakIterator(RuleBreakIterator {
            iter: Utf16Indices::new(input),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data: payload,
            complex: None,
            boundary_property: 0,
        })
    }
}

#[test]
fn empty_string() {
    let segmenter = GraphemeClusterSegmenter::new();
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}
diff --git a/third_party/rust/icu_segmenter/src/indices.rs b/third_party/rust/icu_segmenter/src/indices.rs
new file mode 100644
index 0000000000..2ea6b81fc6
--- /dev/null
+++ b/third_party/rust/icu_segmenter/src/indices.rs
@@ -0,0 +1,129 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

/// Similar to [`core::str::CharIndices`] for Latin-1 strings, represented as `[u8]`.
///
/// Contrary to [`core::str::CharIndices`], the second element of the
/// [`Iterator::Item`] is a [`u8`], representing a Unicode scalar value in the
/// range U+0000–U+00FF.
+#[derive(Clone, Debug)] +pub struct Latin1Indices<'a> { + front_offset: usize, + iter: &'a [u8], +} + +impl<'a> Latin1Indices<'a> { + pub fn new(input: &'a [u8]) -> Self { + Self { + front_offset: 0, + iter: input, + } + } +} + +impl<'a> Iterator for Latin1Indices<'a> { + type Item = (usize, u8); + + #[inline] + fn next(&mut self) -> Option<(usize, u8)> { + self.iter.get(self.front_offset).map(|ch| { + self.front_offset += 1; + (self.front_offset - 1, *ch) + }) + } +} + +/// Similar to [`core::str::CharIndices`] for UTF-16 strings, represented as `[u16]`. +/// +/// Contrary to [`core::str::CharIndices`], the second element of the +/// [`Iterator::Item`] is a Unicode code point represented by a [`u32`], +/// rather than a Unicode scalar value represented by a [`char`], because this +/// iterator preserves unpaired surrogates. +#[derive(Clone, Debug)] +pub struct Utf16Indices<'a> { + front_offset: usize, + iter: &'a [u16], +} + +impl<'a> Utf16Indices<'a> { + pub fn new(input: &'a [u16]) -> Self { + Self { + front_offset: 0, + iter: input, + } + } +} + +impl<'a> Iterator for Utf16Indices<'a> { + type Item = (usize, u32); + + #[inline] + fn next(&mut self) -> Option<(usize, u32)> { + let (index, ch) = self.iter.get(self.front_offset).map(|ch| { + self.front_offset += 1; + (self.front_offset - 1, *ch) + })?; + + let mut ch = ch as u32; + if (ch & 0xfc00) != 0xd800 { + return Some((index, ch)); + } + + if let Some(next) = self.iter.get(self.front_offset) { + let next = *next as u32; + if (next & 0xfc00) == 0xdc00 { + // Combine low and high surrogates to UTF-32 code point. 
+ ch = ((ch & 0x3ff) << 10) + (next & 0x3ff) + 0x10000; + self.front_offset += 1; + } + } + Some((index, ch)) + } +} + +#[cfg(test)] +mod tests { + use crate::indices::*; + + #[test] + fn latin1_indices() { + let latin1 = [0x30, 0x31, 0x32]; + let mut indices = Latin1Indices::new(&latin1); + let n = indices.next().unwrap(); + assert_eq!(n.0, 0); + assert_eq!(n.1, 0x30); + let n = indices.next().unwrap(); + assert_eq!(n.0, 1); + assert_eq!(n.1, 0x31); + let n = indices.next().unwrap(); + assert_eq!(n.0, 2); + assert_eq!(n.1, 0x32); + let n = indices.next(); + assert_eq!(n, None); + } + + #[test] + fn utf16_indices() { + let utf16 = [0xd83d, 0xde03, 0x0020, 0xd83c, 0xdf00, 0xd800, 0x0020]; + let mut indices = Utf16Indices::new(&utf16); + let n = indices.next().unwrap(); + assert_eq!(n.0, 0); + assert_eq!(n.1, 0x1f603); + let n = indices.next().unwrap(); + assert_eq!(n.0, 2); + assert_eq!(n.1, 0x20); + let n = indices.next().unwrap(); + assert_eq!(n.0, 3); + assert_eq!(n.1, 0x1f300); + // This is invalid surrogate pair. + let n = indices.next().unwrap(); + assert_eq!(n.0, 5); + assert_eq!(n.1, 0xd800); + let n = indices.next().unwrap(); + assert_eq!(n.0, 6); + assert_eq!(n.1, 0x0020); + let n = indices.next(); + assert_eq!(n, None); + } +} diff --git a/third_party/rust/icu_segmenter/src/iterator_helpers.rs b/third_party/rust/icu_segmenter/src/iterator_helpers.rs new file mode 100644 index 0000000000..593a4702ca --- /dev/null +++ b/third_party/rust/icu_segmenter/src/iterator_helpers.rs @@ -0,0 +1,19 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Macros and utilities to help implement the various iterator types. + +macro_rules! 
derive_usize_iterator_with_type { + ($ty:tt) => { + impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for $ty<'l, 's, Y> { + type Item = usize; + #[inline] + fn next(&mut self) -> Option<Self::Item> { + self.0.next() + } + } + }; +} + +pub(crate) use derive_usize_iterator_with_type; diff --git a/third_party/rust/icu_segmenter/src/lib.rs b/third_party/rust/icu_segmenter/src/lib.rs new file mode 100644 index 0000000000..b286c4e312 --- /dev/null +++ b/third_party/rust/icu_segmenter/src/lib.rs @@ -0,0 +1,174 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Segment strings by lines, graphemes, words, and sentences. +//! +//! This module is published as its own crate ([`icu_segmenter`](https://docs.rs/icu_segmenter/latest/icu_segmenter/)) +//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project. +//! +//! This module contains segmenter implementation for the following rules. +//! +//! - Line segmenter that is compatible with [Unicode Standard Annex #14][UAX14], _Unicode Line +//! Breaking Algorithm_, with options to tailor line-breaking behavior for CSS [`line-break`] and +//! [`word-break`] properties. +//! - Grapheme cluster segmenter, word segmenter, and sentence segmenter that are compatible with +//! [Unicode Standard Annex #29][UAX29], _Unicode Text Segmentation_. +//! +//! [UAX14]: https://www.unicode.org/reports/tr14/ +//! [UAX29]: https://www.unicode.org/reports/tr29/ +//! [`line-break`]: https://drafts.csswg.org/css-text-3/#line-break-property +//! [`word-break`]: https://drafts.csswg.org/css-text-3/#word-break-property +//! +//! # Examples +//! +//! ## Line Break +//! +//! Find line break opportunities: +//! +//!```rust +//! use icu::segmenter::LineSegmenter; +//! +//! let segmenter = LineSegmenter::new_auto(); +//! +//! 
let breakpoints: Vec<usize> = segmenter +//! .segment_str("Hello World. Xin chào thế giới!") +//! .collect(); +//! assert_eq!(&breakpoints, &[0, 6, 13, 17, 23, 29, 36]); +//! ``` +//! +//! See [`LineSegmenter`] for more examples. +//! +//! ## Grapheme Cluster Break +//! +//! Find all grapheme cluster boundaries: +//! +//!```rust +//! use icu::segmenter::GraphemeClusterSegmenter; +//! +//! let segmenter = GraphemeClusterSegmenter::new(); +//! +//! let breakpoints: Vec<usize> = segmenter +//! .segment_str("Hello World. Xin chào thế giới!") +//! .collect(); +//! assert_eq!( +//! &breakpoints, +//! &[ +//! 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, +//! 19, 21, 22, 23, 24, 25, 28, 29, 30, 31, 34, 35, 36 +//! ] +//! ); +//! ``` +//! +//! See [`GraphemeClusterSegmenter`] for more examples. +//! +//! ## Word Break +//! +//! Find all word boundaries: +//! +//!```rust +//! use icu::segmenter::WordSegmenter; +//! +//! let segmenter = WordSegmenter::new_auto(); +//! +//! let breakpoints: Vec<usize> = segmenter +//! .segment_str("Hello World. Xin chào thế giới!") +//! .collect(); +//! assert_eq!( +//! &breakpoints, +//! &[0, 5, 6, 11, 12, 13, 16, 17, 22, 23, 28, 29, 35, 36] +//! ); +//! ``` +//! +//! See [`WordSegmenter`] for more examples. +//! +//! ## Sentence Break +//! +//! Segment the string into sentences: +//! +//!```rust +//! use icu::segmenter::SentenceSegmenter; +//! +//! let segmenter = SentenceSegmenter::new(); +//! +//! let breakpoints: Vec<usize> = segmenter +//! .segment_str("Hello World. Xin chào thế giới!") +//! .collect(); +//! assert_eq!(&breakpoints, &[0, 13, 36]); +//! ``` +//! +//! See [`SentenceSegmenter`] for more examples. 
+ +// https://github.com/unicode-org/icu4x/blob/main/docs/process/boilerplate.md#library-annotations +#![cfg_attr(not(any(test, feature = "std")), no_std)] +#![cfg_attr( + not(test), + deny( + clippy::indexing_slicing, + clippy::unwrap_used, + clippy::expect_used, + clippy::panic, + clippy::exhaustive_structs, + clippy::exhaustive_enums, + missing_debug_implementations, + ) +)] +#![warn(missing_docs)] + +extern crate alloc; + +mod complex; +mod error; +mod indices; +mod iterator_helpers; +mod rule_segmenter; + +mod grapheme; +mod line; +mod sentence; +mod word; + +pub mod provider; + +// icu_datagen uses symbols, but we don't want to expose this implementation detail to the users. +#[doc(hidden)] +pub mod symbols; + +// Main Segmenter and BreakIterator public types +pub use crate::grapheme::GraphemeClusterBreakIterator; +pub use crate::grapheme::GraphemeClusterSegmenter; +pub use crate::line::LineBreakIterator; +pub use crate::line::LineSegmenter; +pub use crate::sentence::SentenceBreakIterator; +pub use crate::sentence::SentenceSegmenter; +pub use crate::word::WordBreakIterator; +pub use crate::word::WordSegmenter; + +// Options structs and enums +pub use crate::line::LineBreakOptions; +pub use crate::line::LineBreakStrictness; +pub use crate::line::LineBreakWordOption; +pub use crate::word::WordType; + +// Typedefs +pub use crate::grapheme::GraphemeClusterBreakIteratorLatin1; +pub use crate::grapheme::GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8; +pub use crate::grapheme::GraphemeClusterBreakIteratorUtf16; +pub use crate::grapheme::GraphemeClusterBreakIteratorUtf8; +pub use crate::line::LineBreakIteratorLatin1; +pub use crate::line::LineBreakIteratorPotentiallyIllFormedUtf8; +pub use crate::line::LineBreakIteratorUtf16; +pub use crate::line::LineBreakIteratorUtf8; +pub use crate::sentence::SentenceBreakIteratorLatin1; +pub use crate::sentence::SentenceBreakIteratorPotentiallyIllFormedUtf8; +pub use crate::sentence::SentenceBreakIteratorUtf16; +pub use 
crate::sentence::SentenceBreakIteratorUtf8; +pub use crate::word::WordBreakIteratorLatin1; +pub use crate::word::WordBreakIteratorPotentiallyIllFormedUtf8; +pub use crate::word::WordBreakIteratorUtf16; +pub use crate::word::WordBreakIteratorUtf8; + +pub use error::SegmenterError; + +#[doc(no_inline)] +pub use SegmenterError as Error; diff --git a/third_party/rust/icu_segmenter/src/line.rs b/third_party/rust/icu_segmenter/src/line.rs new file mode 100644 index 0000000000..f93e31b13d --- /dev/null +++ b/third_party/rust/icu_segmenter/src/line.rs @@ -0,0 +1,1641 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::complex::*; +use crate::indices::*; +use crate::provider::*; +use crate::symbols::*; +use crate::SegmenterError; +use alloc::string::String; +use alloc::vec; +use alloc::vec::Vec; +use core::char; +use core::str::CharIndices; +use icu_provider::prelude::*; +use utf8_iter::Utf8CharIndices; + +/// An enum specifies the strictness of line-breaking rules. It can be passed as +/// an argument when creating a line segmenter. +/// +/// Each enum value has the same meaning with respect to the `line-break` +/// property values in the CSS Text spec. See the details in +/// <https://drafts.csswg.org/css-text-3/#line-break-property>. +#[non_exhaustive] +#[derive(Copy, Clone, PartialEq, Eq, Debug)] +pub enum LineBreakStrictness { + /// Breaks text using the least restrictive set of line-breaking rules. + /// Typically used for short lines, such as in newspapers. + /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-loose> + Loose, + + /// Breaks text using the most common set of line-breaking rules. + /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-normal> + Normal, + + /// Breaks text using the most stringent set of line-breaking rules. 
+ /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-strict> + /// + /// This is the default behaviour of the Unicode Line Breaking Algorithm, + /// resolving class [CJ](https://www.unicode.org/reports/tr14/#CJ) to + /// [NS](https://www.unicode.org/reports/tr14/#NS); + /// see rule [LB1](https://www.unicode.org/reports/tr14/#LB1). + Strict, + + /// Breaks text assuming there is a soft wrap opportunity around every + /// typographic character unit, disregarding any prohibition against line + /// breaks. See more details in + /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-anywhere>. + Anywhere, +} + +/// An enum specifies the line break opportunities between letters. It can be +/// passed as an argument when creating a line segmenter. +/// +/// Each enum value has the same meaning with respect to the `word-break` +/// property values in the CSS Text spec. See the details in +/// <https://drafts.csswg.org/css-text-3/#word-break-property> +#[non_exhaustive] +#[derive(Copy, Clone, PartialEq, Eq, Debug)] +pub enum LineBreakWordOption { + /// Words break according to their customary rules. See the details in + /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-normal>. + Normal, + + /// Breaking is allowed within "words". + /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-break-all> + BreakAll, + + /// Breaking is forbidden within "word". + /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-keep-all> + KeepAll, +} + +/// Options to tailor line-breaking behavior. +#[non_exhaustive] +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct LineBreakOptions { + /// Strictness of line-breaking rules. See [`LineBreakStrictness`]. + pub strictness: LineBreakStrictness, + + /// Line break opportunities between letters. See [`LineBreakWordOption`]. + pub word_option: LineBreakWordOption, + + /// Use `true` as a hint to the line segmenter that the writing + /// system is Chinese or Japanese. 
This allows more break opportunities when + /// `LineBreakStrictness` is `Normal` or `Loose`. See + /// <https://drafts.csswg.org/css-text-3/#line-break-property> for details. + /// + /// This option has no effect in Latin-1 mode. + pub ja_zh: bool, +} + +impl Default for LineBreakOptions { + fn default() -> Self { + Self { + strictness: LineBreakStrictness::Strict, + word_option: LineBreakWordOption::Normal, + ja_zh: false, + } + } +} + +/// Line break iterator for an `str` (a UTF-8 string). +/// +/// For examples of use, see [`LineSegmenter`]. +pub type LineBreakIteratorUtf8<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeUtf8>; + +/// Line break iterator for a potentially invalid UTF-8 string. +/// +/// For examples of use, see [`LineSegmenter`]. +pub type LineBreakIteratorPotentiallyIllFormedUtf8<'l, 's> = + LineBreakIterator<'l, 's, LineBreakTypePotentiallyIllFormedUtf8>; + +/// Line break iterator for a Latin-1 (8-bit) string. +/// +/// For examples of use, see [`LineSegmenter`]. +pub type LineBreakIteratorLatin1<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeLatin1>; + +/// Line break iterator for a UTF-16 string. +/// +/// For examples of use, see [`LineSegmenter`]. +pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeUtf16>; + +/// Supports loading line break data, and creating line break iterators for different string +/// encodings. +/// +/// The segmenter returns mandatory breaks (as defined by [definition LD7][LD7] of +/// Unicode Standard Annex #14, _Unicode Line Breaking Algorithm_) as well as +/// line break opportunities ([definition LD3][LD3]). +/// It does not distinguish them. Callers requiring that distinction can check +/// the Line_Break property of the code point preceding the break against those +/// listed in rules [LB4][LB4] and [LB5][LB5], special-casing the end of text +/// according to [LB3][LB3]. 
+/// +/// For consistency with the grapheme, word, and sentence segmenters, there is +/// always a breakpoint returned at index 0, but this breakpoint is not a +/// meaningful line break opportunity. +/// +/// [LD3]: https://www.unicode.org/reports/tr14/#LD3 +/// [LD7]: https://www.unicode.org/reports/tr14/#LD7 +/// [LB3]: https://www.unicode.org/reports/tr14/#LB3 +/// [LB4]: https://www.unicode.org/reports/tr14/#LB4 +/// [LB5]: https://www.unicode.org/reports/tr14/#LB5 +/// +/// ```rust +/// # use icu_segmenter::LineSegmenter; +/// # +/// # let segmenter = LineSegmenter::new_auto(); +/// # +/// let text = "Summary\r\nThis annex…"; +/// let breakpoints: Vec<usize> = segmenter.segment_str(text).collect(); +/// // 9 and 22 are mandatory breaks, 14 is a line break opportunity. +/// assert_eq!(&breakpoints, &[0, 9, 14, 22]); +/// ``` +/// +/// # Examples +/// +/// Segment a string with default options: +/// +/// ```rust +/// use icu_segmenter::LineSegmenter; +/// +/// let segmenter = LineSegmenter::new_auto(); +/// +/// let breakpoints: Vec<usize> = +/// segmenter.segment_str("Hello World").collect(); +/// assert_eq!(&breakpoints, &[0, 6, 11]); +/// ``` +/// +/// Segment a string with CSS option overrides: +/// +/// ```rust +/// use icu_segmenter::{ +/// LineBreakOptions, LineBreakStrictness, LineBreakWordOption, +/// LineSegmenter, +/// }; +/// +/// let mut options = LineBreakOptions::default(); +/// options.strictness = LineBreakStrictness::Strict; +/// options.word_option = LineBreakWordOption::BreakAll; +/// options.ja_zh = false; +/// let segmenter = LineSegmenter::new_auto_with_options(options); +/// +/// let breakpoints: Vec<usize> = +/// segmenter.segment_str("Hello World").collect(); +/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11]); +/// ``` +/// +/// Segment a Latin1 byte string: +/// +/// ```rust +/// use icu_segmenter::LineSegmenter; +/// +/// let segmenter = LineSegmenter::new_auto(); +/// +/// let breakpoints: Vec<usize> = +/// 
segmenter.segment_latin1(b"Hello World").collect(); +/// assert_eq!(&breakpoints, &[0, 6, 11]); +/// ``` +/// +/// Separate mandatory breaks from the break opportunities: +/// +/// ```rust +/// use icu::properties::{maps, LineBreak}; +/// use icu_segmenter::LineSegmenter; +/// +/// # let segmenter = LineSegmenter::new_auto(); +/// # +/// let text = "Summary\r\nThis annex…"; +/// +/// let mandatory_breaks: Vec<usize> = segmenter +/// .segment_str(text) +/// .into_iter() +/// .filter(|&i| { +/// text[..i].chars().next_back().map_or(false, |c| { +/// matches!( +/// maps::line_break().get(c), +/// LineBreak::MandatoryBreak +/// | LineBreak::CarriageReturn +/// | LineBreak::LineFeed +/// | LineBreak::NextLine +/// ) || i == text.len() +/// }) +/// }) +/// .collect(); +/// assert_eq!(&mandatory_breaks, &[9, 22]); +/// ``` +#[derive(Debug)] +pub struct LineSegmenter { + options: LineBreakOptions, + payload: DataPayload<LineBreakDataV1Marker>, + complex: ComplexPayloads, +} + +impl LineSegmenter { + /// Constructs a [`LineSegmenter`] with an invariant locale and the best available compiled data for + /// complex scripts (Khmer, Lao, Myanmar, and Thai). + /// + /// The current behavior, which is subject to change, is to use the LSTM model when available. + /// + /// See also [`Self::new_auto_with_options`]. 
+ /// + /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + #[cfg(feature = "auto")] + pub fn new_auto() -> Self { + Self::new_auto_with_options(Default::default()) + } + + #[cfg(feature = "auto")] + icu_provider::gen_any_buffer_data_constructors!( + locale: skip, + options: skip, + error: SegmenterError, + #[cfg(skip)] + functions: [ + new_auto, + try_new_auto_with_any_provider, + try_new_auto_with_buffer_provider, + try_new_auto_unstable, + Self, + ] + ); + + #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_auto)] + #[cfg(feature = "auto")] + pub fn try_new_auto_unstable<D>(provider: &D) -> Result<Self, SegmenterError> + where + D: DataProvider<LineBreakDataV1Marker> + + DataProvider<LstmForWordLineAutoV1Marker> + + DataProvider<GraphemeClusterBreakDataV1Marker> + + ?Sized, + { + Self::try_new_auto_with_options_unstable(provider, Default::default()) + } + + /// Constructs a [`LineSegmenter`] with an invariant locale and compiled LSTM data for + /// complex scripts (Khmer, Lao, Myanmar, and Thai). + /// + /// The LSTM, or Long Term Short Memory, is a machine learning model. It is smaller than + /// the full dictionary but more expensive during segmentation (inference). + /// + /// See also [`Self::new_lstm_with_options`]. 
+ /// + /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + #[cfg(feature = "lstm")] + pub fn new_lstm() -> Self { + Self::new_lstm_with_options(Default::default()) + } + + #[cfg(feature = "lstm")] + icu_provider::gen_any_buffer_data_constructors!( + locale: skip, + options: skip, + error: SegmenterError, + #[cfg(skip)] + functions: [ + new_lstm, + try_new_lstm_with_any_provider, + try_new_lstm_with_buffer_provider, + try_new_lstm_unstable, + Self, + ] + ); + + #[cfg(feature = "lstm")] + #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)] + pub fn try_new_lstm_unstable<D>(provider: &D) -> Result<Self, SegmenterError> + where + D: DataProvider<LineBreakDataV1Marker> + + DataProvider<LstmForWordLineAutoV1Marker> + + DataProvider<GraphemeClusterBreakDataV1Marker> + + ?Sized, + { + Self::try_new_lstm_with_options_unstable(provider, Default::default()) + } + + /// Constructs a [`LineSegmenter`] with an invariant locale and compiled dictionary data for + /// complex scripts (Khmer, Lao, Myanmar, and Thai). + /// + /// The dictionary model uses a list of words to determine appropriate breakpoints. It is + /// faster than the LSTM model but requires more data. + /// + /// See also [`Self::new_dictionary_with_options`]. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + pub fn new_dictionary() -> Self { + Self::new_dictionary_with_options(Default::default()) + } + + icu_provider::gen_any_buffer_data_constructors!( + locale: skip, + options: skip, + error: SegmenterError, + #[cfg(skip)] + functions: [ + new_dictionary, + try_new_dictionary_with_any_provider, + try_new_dictionary_with_buffer_provider, + try_new_dictionary_unstable, + Self, + ] + ); + + #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)] + pub fn try_new_dictionary_unstable<D>(provider: &D) -> Result<Self, SegmenterError> + where + D: DataProvider<LineBreakDataV1Marker> + + DataProvider<DictionaryForWordLineExtendedV1Marker> + + DataProvider<GraphemeClusterBreakDataV1Marker> + + ?Sized, + { + Self::try_new_dictionary_with_options_unstable(provider, Default::default()) + } + + /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and + /// the best available compiled data for complex scripts (Khmer, Lao, Myanmar, and Thai). + /// + /// The current behavior, which is subject to change, is to use the LSTM model when available. + /// + /// See also [`Self::new_auto`]. 
+ /// + /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "auto")] + #[cfg(feature = "compiled_data")] + pub fn new_auto_with_options(options: LineBreakOptions) -> Self { + Self::new_lstm_with_options(options) + } + + #[cfg(feature = "auto")] + icu_provider::gen_any_buffer_data_constructors!( + locale: skip, + options: LineBreakOptions, + error: SegmenterError, + #[cfg(skip)] + functions: [ + new_auto_with_options, + try_new_auto_with_options_with_any_provider, + try_new_auto_with_options_with_buffer_provider, + try_new_auto_with_options_unstable, + Self, + ] + ); + + #[cfg(feature = "auto")] + #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_auto_with_options)] + pub fn try_new_auto_with_options_unstable<D>( + provider: &D, + options: LineBreakOptions, + ) -> Result<Self, SegmenterError> + where + D: DataProvider<LineBreakDataV1Marker> + + DataProvider<LstmForWordLineAutoV1Marker> + + DataProvider<GraphemeClusterBreakDataV1Marker> + + ?Sized, + { + Self::try_new_lstm_with_options_unstable(provider, options) + } + + /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and + /// compiled LSTM data for complex scripts (Khmer, Lao, Myanmar, and Thai). + /// + /// The LSTM, or Long Term Short Memory, is a machine learning model. It is smaller than + /// the full dictionary but more expensive during segmentation (inference). + /// + /// See also [`Self::new_dictionary`]. 
+ /// + /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "lstm")] + #[cfg(feature = "compiled_data")] + pub fn new_lstm_with_options(options: LineBreakOptions) -> Self { + Self { + options, + payload: DataPayload::from_static_ref( + crate::provider::Baked::SINGLETON_SEGMENTER_LINE_V1, + ), + complex: ComplexPayloads::new_lstm(), + } + } + + #[cfg(feature = "lstm")] + icu_provider::gen_any_buffer_data_constructors!( + locale: skip, + options: LineBreakOptions, + error: SegmenterError, + #[cfg(skip)] + functions: [ + try_new_lstm_with_options, + try_new_lstm_with_options_with_any_provider, + try_new_lstm_with_options_with_buffer_provider, + try_new_lstm_with_options_unstable, + Self, + ] + ); + + #[cfg(feature = "lstm")] + #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_lstm_with_options)] + pub fn try_new_lstm_with_options_unstable<D>( + provider: &D, + options: LineBreakOptions, + ) -> Result<Self, SegmenterError> + where + D: DataProvider<LineBreakDataV1Marker> + + DataProvider<LstmForWordLineAutoV1Marker> + + DataProvider<GraphemeClusterBreakDataV1Marker> + + ?Sized, + { + Ok(Self { + options, + payload: provider.load(Default::default())?.take_payload()?, + complex: ComplexPayloads::try_new_lstm(provider)?, + }) + } + + /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and + /// compiled dictionary data for complex scripts (Khmer, Lao, Myanmar, and Thai). + /// + /// The dictionary model uses a list of words to determine appropriate breakpoints. It is + /// faster than the LSTM model but requires more data. + /// + /// See also [`Self::new_dictionary`]. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + pub fn new_dictionary_with_options(options: LineBreakOptions) -> Self { + Self { + options, + payload: DataPayload::from_static_ref( + crate::provider::Baked::SINGLETON_SEGMENTER_LINE_V1, + ), + // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK + // characters [1]. Southeast Asian languages however require complex context analysis + // [2]. + // + // [1]: https://www.unicode.org/reports/tr14/#ID + // [2]: https://www.unicode.org/reports/tr14/#SA + complex: ComplexPayloads::new_southeast_asian(), + } + } + + icu_provider::gen_any_buffer_data_constructors!( + locale: skip, + options: LineBreakOptions, + error: SegmenterError, + #[cfg(skip)] + functions: [ + new_dictionary_with_options, + try_new_dictionary_with_options_with_any_provider, + try_new_dictionary_with_options_with_buffer_provider, + try_new_dictionary_with_options_unstable, + Self, + ] + ); + + #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary_with_options)] + pub fn try_new_dictionary_with_options_unstable<D>( + provider: &D, + options: LineBreakOptions, + ) -> Result<Self, SegmenterError> + where + D: DataProvider<LineBreakDataV1Marker> + + DataProvider<DictionaryForWordLineExtendedV1Marker> + + DataProvider<GraphemeClusterBreakDataV1Marker> + + ?Sized, + { + Ok(Self { + options, + payload: provider.load(Default::default())?.take_payload()?, + // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK + // characters [1]. Southeast Asian languages however require complex context analysis + // [2]. 
+ // + // [1]: https://www.unicode.org/reports/tr14/#ID + // [2]: https://www.unicode.org/reports/tr14/#SA + complex: ComplexPayloads::try_new_southeast_asian(provider)?, + }) + } + + /// Creates a line break iterator for an `str` (a UTF-8 string). + /// + /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. + pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> LineBreakIteratorUtf8<'l, 's> { + LineBreakIterator { + iter: input.char_indices(), + len: input.len(), + current_pos_data: None, + result_cache: Vec::new(), + data: self.payload.get(), + options: &self.options, + complex: &self.complex, + } + } + /// Creates a line break iterator for a potentially ill-formed UTF8 string + /// + /// Invalid characters are treated as REPLACEMENT CHARACTER + /// + /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. + pub fn segment_utf8<'l, 's>( + &'l self, + input: &'s [u8], + ) -> LineBreakIteratorPotentiallyIllFormedUtf8<'l, 's> { + LineBreakIterator { + iter: Utf8CharIndices::new(input), + len: input.len(), + current_pos_data: None, + result_cache: Vec::new(), + data: self.payload.get(), + options: &self.options, + complex: &self.complex, + } + } + /// Creates a line break iterator for a Latin-1 (8-bit) string. + /// + /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. + pub fn segment_latin1<'l, 's>(&'l self, input: &'s [u8]) -> LineBreakIteratorLatin1<'l, 's> { + LineBreakIterator { + iter: Latin1Indices::new(input), + len: input.len(), + current_pos_data: None, + result_cache: Vec::new(), + data: self.payload.get(), + options: &self.options, + complex: &self.complex, + } + } + + /// Creates a line break iterator for a UTF-16 string. + /// + /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. 
+ pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> LineBreakIteratorUtf16<'l, 's> { + LineBreakIterator { + iter: Utf16Indices::new(input), + len: input.len(), + current_pos_data: None, + result_cache: Vec::new(), + data: self.payload.get(), + options: &self.options, + complex: &self.complex, + } + } +} + +fn get_linebreak_property_utf32_with_rule( + property_table: &RuleBreakPropertyTable<'_>, + codepoint: u32, + strictness: LineBreakStrictness, + word_option: LineBreakWordOption, +) -> u8 { + // Note: Default value is 0 == UNKNOWN + let prop = property_table.0.get32(codepoint); + + if word_option == LineBreakWordOption::BreakAll + || strictness == LineBreakStrictness::Loose + || strictness == LineBreakStrictness::Normal + { + return match prop { + CJ => ID, // All CJ's General_Category is Other_Letter (Lo). + _ => prop, + }; + } + + // CJ is treated as NS by default, yielding strict line breaking. + // https://www.unicode.org/reports/tr14/#CJ + prop +} + +#[inline] +fn get_linebreak_property_latin1(property_table: &RuleBreakPropertyTable<'_>, codepoint: u8) -> u8 { + // Note: Default value is 0 == UNKNOWN + property_table.0.get32(codepoint as u32) +} + +#[inline] +fn get_linebreak_property_with_rule( + property_table: &RuleBreakPropertyTable<'_>, + codepoint: char, + linebreak_rule: LineBreakStrictness, + wordbreak_rule: LineBreakWordOption, +) -> u8 { + get_linebreak_property_utf32_with_rule( + property_table, + codepoint as u32, + linebreak_rule, + wordbreak_rule, + ) +} + +#[inline] +fn is_break_utf32_by_normal(codepoint: u32, ja_zh: bool) -> bool { + match codepoint { + 0x301C => ja_zh, + 0x30A0 => ja_zh, + _ => false, + } +} + +#[inline] +fn is_break_utf32_by_loose( + right_codepoint: u32, + left_prop: u8, + right_prop: u8, + ja_zh: bool, +) -> Option<bool> { + // breaks before hyphens + if right_prop == BA { + if left_prop == ID && (right_codepoint == 0x2010 || right_codepoint == 0x2013) { + return Some(true); + } + } else if right_prop == NS { + 
// breaks before certain CJK hyphen-like characters + if right_codepoint == 0x301C || right_codepoint == 0x30A0 { + return Some(ja_zh); + } + + // breaks before iteration marks + if right_codepoint == 0x3005 + || right_codepoint == 0x303B + || right_codepoint == 0x309D + || right_codepoint == 0x309E + || right_codepoint == 0x30FD + || right_codepoint == 0x30FE + { + return Some(true); + } + + // breaks before certain centered punctuation marks: + if right_codepoint == 0x30FB + || right_codepoint == 0xFF1A + || right_codepoint == 0xFF1B + || right_codepoint == 0xFF65 + || right_codepoint == 0x203C + || (0x2047..=0x2049).contains(&right_codepoint) + { + return Some(ja_zh); + } + } else if right_prop == IN { + // breaks between inseparable characters such as U+2025, U+2026 i.e. characters with the Unicode Line Break property IN + return Some(true); + } else if right_prop == EX { + // breaks before certain centered punctuation marks: + if right_codepoint == 0xFF01 || right_codepoint == 0xFF1F { + return Some(ja_zh); + } + } + + // breaks before suffixes: + // Characters with the Unicode Line Break property PO and the East Asian Width property + if right_prop == PO_EAW { + return Some(ja_zh); + } + // breaks after prefixes: + // Characters with the Unicode Line Break property PR and the East Asian Width property + if left_prop == PR_EAW { + return Some(ja_zh); + } + None +} + +#[inline] +fn is_break_from_table( + break_state_table: &RuleBreakStateTable<'_>, + property_count: u8, + left: u8, + right: u8, +) -> bool { + let rule = get_break_state_from_table(break_state_table, property_count, left, right); + if rule == KEEP_RULE { + return false; + } + if rule >= 0 { + // need additional next characters to get break rule. 
+ return false; + } + true +} + +#[inline] +fn is_non_break_by_keepall(left: u8, right: u8) -> bool { + // typographic letter units shouldn't be break + (left == AI + || left == AL + || left == ID + || left == NU + || left == HY + || left == H2 + || left == H3 + || left == JL + || left == JV + || left == JT + || left == CJ) + && (right == AI + || right == AL + || right == ID + || right == NU + || right == HY + || right == H2 + || right == H3 + || right == JL + || right == JV + || right == JT + || right == CJ) +} + +#[inline] +fn get_break_state_from_table( + break_state_table: &RuleBreakStateTable<'_>, + property_count: u8, + left: u8, + right: u8, +) -> i8 { + let idx = (left as usize) * (property_count as usize) + (right as usize); + // We use unwrap_or to fall back to the base case and prevent panics on bad data. + break_state_table.0.get(idx).unwrap_or(KEEP_RULE) +} + +#[inline] +fn use_complex_breaking_utf32(property_table: &RuleBreakPropertyTable<'_>, codepoint: u32) -> bool { + let line_break_property = get_linebreak_property_utf32_with_rule( + property_table, + codepoint, + LineBreakStrictness::Strict, + LineBreakWordOption::Normal, + ); + + line_break_property == SA +} + +/* +#[inline] +fn use_complex_breaking_utf32(codepoint: u32) -> bool { + // Thai, Lao and Khmer + (codepoint >= 0xe01 && codepoint <= 0xeff) || (codepoint >= 0x1780 && codepoint <= 0x17ff) +} +*/ + +/// A trait allowing for LineBreakIterator to be generalized to multiple string iteration methods. +/// +/// This is implemented by ICU4X for several common string types. +pub trait LineBreakType<'l, 's> { + /// The iterator over characters. + type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone; + + /// The character type. 
+ type CharType: Copy + Into<u32>; + + fn use_complex_breaking(iterator: &LineBreakIterator<'l, 's, Self>, c: Self::CharType) -> bool; + + fn get_linebreak_property_with_rule( + iterator: &LineBreakIterator<'l, 's, Self>, + c: Self::CharType, + ) -> u8; + + fn get_current_position_character_len(iterator: &LineBreakIterator<'l, 's, Self>) -> usize; + + fn handle_complex_language( + iterator: &mut LineBreakIterator<'l, 's, Self>, + left_codepoint: Self::CharType, + ) -> Option<usize>; +} + +/// Implements the [`Iterator`] trait over the line break opportunities of the given string. +/// +/// Lifetimes: +/// +/// - `'l` = lifetime of the [`LineSegmenter`] object from which this iterator was created +/// - `'s` = lifetime of the string being segmented +/// +/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit +/// _after_ the break (for a break at the end of text, this index is the length +/// of the [`str`] or array of code units). +/// +/// For examples of use, see [`LineSegmenter`]. 
#[derive(Debug)]
pub struct LineBreakIterator<'l, 's, Y: LineBreakType<'l, 's> + ?Sized> {
    // Source iterator yielding (code-unit index, character) pairs.
    iter: Y::IterAttr,
    // Input length in code units; also doubles as an "already emitted the
    // empty-string breakpoint" marker (see `check_eof`).
    len: usize,
    // The (position, character) pair the iterator currently stands on, or None at EOF.
    current_pos_data: Option<(usize, Y::CharType)>,
    // Break offsets precomputed by the complex-language segmenter, relative to
    // the current position; drained by subsequent `next()` calls.
    result_cache: Vec<usize>,
    // Compiled rule/property tables.
    data: &'l RuleBreakDataV1<'l>,
    options: &'l LineBreakOptions,
    // Dictionary/LSTM payloads for languages UAX #14 rules don't cover.
    complex: &'l ComplexPayloads,
}

impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y> {
    type Item = usize;

    fn next(&mut self) -> Option<Self::Item> {
        // Emit the mandatory breakpoint at index 0 first; stop at end of text.
        match self.check_eof() {
            StringBoundaryPosType::Start => return Some(0),
            StringBoundaryPosType::End => return None,
            _ => (),
        }

        // If we have break point cache by previous run (complex-language
        // segmentation), walk forward to the first cached offset and return it.
        if let Some(&first_pos) = self.result_cache.first() {
            let mut i = 0;
            loop {
                if i == first_pos {
                    // Consume the first cached break and rebase the rest on it.
                    self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
                    return self.get_current_position();
                }
                i += Y::get_current_position_character_len(self);
                self.advance_iter();
                if self.is_eof() {
                    self.result_cache.clear();
                    return Some(self.len);
                }
            }
        }

        // Main rule engine: examine (left, right) character pairs until a break
        // opportunity is found.
        loop {
            debug_assert!(!self.is_eof());
            let left_codepoint = self.get_current_codepoint()?;
            let mut left_prop = self.get_linebreak_property(left_codepoint);
            self.advance_iter();

            // End of text is always a break opportunity.
            let Some(right_codepoint) = self.get_current_codepoint() else {
                return Some(self.len);
            };
            let right_prop = self.get_linebreak_property(right_codepoint);

            // CSS word-break property handling
            match self.options.word_option {
                LineBreakWordOption::BreakAll => {
                    // break-all: treat letters/numerals/SEA text as ideographs (breakable).
                    left_prop = match left_prop {
                        AL => ID,
                        NU => ID,
                        SA => ID,
                        _ => left_prop,
                    };
                }
                LineBreakWordOption::KeepAll => {
                    if is_non_break_by_keepall(left_prop, right_prop) {
                        continue;
                    }
                }
                _ => (),
            }

            // CSS line-break property handling
            match self.options.strictness {
                LineBreakStrictness::Normal => {
                    if self.is_break_by_normal(right_codepoint) {
                        return self.get_current_position();
                    }
                }
                LineBreakStrictness::Loose => {
                    // Loose rules may decide the boundary outright; None falls
                    // through to the state table.
                    if let Some(breakable) = is_break_utf32_by_loose(
                        right_codepoint.into(),
                        left_prop,
                        right_prop,
                        self.options.ja_zh,
                    ) {
                        if breakable {
                            return self.get_current_position();
                        }
                        continue;
                    }
                }
                LineBreakStrictness::Anywhere => {
                    return self.get_current_position();
                }
                _ => (),
            };

            // UAX14 doesn't have Thai etc, so use another way.
            if self.options.word_option != LineBreakWordOption::BreakAll
                && Y::use_complex_breaking(self, left_codepoint)
                && Y::use_complex_breaking(self, right_codepoint)
            {
                let result = Y::handle_complex_language(self, left_codepoint);
                if result.is_some() {
                    return result;
                }
                // I may have to fetch text until non-SA character?.
            }

            // If break_state is equal or greater than 0, it is an alias of a property:
            // the rule needs more right-hand context, so keep consuming characters
            // while remembering where to back up to.
            let mut break_state = self.get_break_state_from_table(left_prop, right_prop);
            if break_state >= 0_i8 {
                let mut previous_iter = self.iter.clone();
                let mut previous_pos_data = self.current_pos_data;

                loop {
                    self.advance_iter();

                    let Some(prop) = self.get_current_linebreak_property() else {
                        // Reached EOF. But we are analyzing multiple characters
                        // now, so the next break may be the previous point.
                        let break_state = self
                            .get_break_state_from_table(break_state as u8, self.data.eot_property);
                        if break_state == NOT_MATCH_RULE {
                            // Back up to the last safe position and break there.
                            self.iter = previous_iter;
                            self.current_pos_data = previous_pos_data;
                            return self.get_current_position();
                        }
                        // EOF
                        return Some(self.len);
                    };

                    break_state = self.get_break_state_from_table(break_state as u8, prop);
                    if break_state < 0 {
                        break;
                    }

                    previous_iter = self.iter.clone();
                    previous_pos_data = self.current_pos_data;
                }
                if break_state == KEEP_RULE {
                    continue;
                }
                if break_state == NOT_MATCH_RULE {
                    // The multi-character rule failed; break at the backed-up position.
                    self.iter = previous_iter;
                    self.current_pos_data = previous_pos_data;
                    return self.get_current_position();
                }
                return self.get_current_position();
            }

            // Simple two-character case: consult the table directly.
            if self.is_break_from_table(left_prop, right_prop) {
                return self.get_current_position();
            }
        }
    }
}

// Where the iterator currently stands relative to the text.
enum StringBoundaryPosType {
    Start,
    Middle,
    End,
}

impl<'l, 's, Y: LineBreakType<'l, 's>> LineBreakIterator<'l, 's, Y> {
    // Moves to the next (position, character) pair, or None at EOF.
    fn advance_iter(&mut self) {
        self.current_pos_data = self.iter.next();
    }

    fn is_eof(&self) -> bool {
        self.current_pos_data.is_none()
    }

    // Classifies the current state: Start (emit the index-0 breakpoint),
    // End (iteration finished), or Middle (continue with the rule engine).
    #[inline]
    fn check_eof(&mut self) -> StringBoundaryPosType {
        if self.is_eof() {
            self.advance_iter();
            if self.is_eof() {
                if self.len == 0 {
                    // Empty string. Since `self.current_pos_data` is always going to be empty,
                    // we never read `self.len` except for here, so we can use it to mark that
                    // we have already returned the single empty-string breakpoint.
                    self.len = 1;
                    StringBoundaryPosType::Start
                } else {
                    StringBoundaryPosType::End
                }
            } else {
                StringBoundaryPosType::Start
            }
        } else {
            StringBoundaryPosType::Middle
        }
    }

    // Code-unit index of the current character, if any.
    fn get_current_position(&self) -> Option<usize> {
        self.current_pos_data.map(|(pos, _)| pos)
    }

    fn get_current_codepoint(&self) -> Option<Y::CharType> {
        self.current_pos_data.map(|(_, codepoint)| codepoint)
    }

    fn get_linebreak_property(&self, codepoint: Y::CharType) -> u8 {
        Y::get_linebreak_property_with_rule(self, codepoint)
    }

    fn get_current_linebreak_property(&self) -> Option<u8> {
        self.get_current_codepoint()
            .map(|c| self.get_linebreak_property(c))
    }

    fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool {
        is_break_utf32_by_normal(codepoint.into(), self.options.ja_zh)
    }

    fn get_break_state_from_table(&self, left: u8, right: u8) -> i8 {
        get_break_state_from_table(
            &self.data.break_state_table,
            self.data.property_count,
            left,
            right,
        )
    }

    fn is_break_from_table(&self, left: u8, right: u8) -> bool {
        is_break_from_table(
            &self.data.break_state_table,
            self.data.property_count,
            left,
            right,
        )
    }
}

// Line breaking over `&str` (guaranteed well-formed UTF-8).
#[derive(Debug)]
pub struct LineBreakTypeUtf8;

impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeUtf8 {
    type IterAttr = CharIndices<'s>;
    type CharType = char;

    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
        get_linebreak_property_with_rule(
            &iterator.data.property_table,
            c,
            iterator.options.strictness,
            iterator.options.word_option,
        )
    }

    #[inline]
    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
        use_complex_breaking_utf32(&iterator.data.property_table, c as u32)
    }

    fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize {
        iterator.get_current_codepoint().map_or(0, |c| c.len_utf8())
    }

    fn handle_complex_language(
        iter: &mut LineBreakIterator<'l, 's, Self>,
        left_codepoint: char,
    ) -> Option<usize> {
        handle_complex_language_utf8(iter, left_codepoint)
    }
}

// Line breaking over potentially ill-formed UTF-8 bytes.
#[derive(Debug)]
pub struct LineBreakTypePotentiallyIllFormedUtf8;

impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypePotentiallyIllFormedUtf8 {
    type IterAttr = Utf8CharIndices<'s>;
    type CharType = char;

    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
        get_linebreak_property_with_rule(
            &iterator.data.property_table,
            c,
            iterator.options.strictness,
            iterator.options.word_option,
        )
    }

    #[inline]
    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
        use_complex_breaking_utf32(&iterator.data.property_table, c as u32)
    }

    fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize {
        iterator.get_current_codepoint().map_or(0, |c| c.len_utf8())
    }

    fn handle_complex_language(
        iter: &mut LineBreakIterator<'l, 's, Self>,
        left_codepoint: char,
    ) -> Option<usize> {
        handle_complex_language_utf8(iter, left_codepoint)
    }
}
/// handle_complex_language impl for UTF8 iterators
///
/// Collects the run of complex-language (SA) characters starting at
/// `left_codepoint`, segments it with the dictionary/LSTM payloads, caches the
/// resulting break offsets in `iter.result_cache`, and returns the first one.
fn handle_complex_language_utf8<'l, 's, T>(
    iter: &mut LineBreakIterator<'l, 's, T>,
    left_codepoint: char,
) -> Option<usize>
where
    T: LineBreakType<'l, 's, CharType = char>,
{
    // word segmenter doesn't define break rules for some languages such as Thai.
    let start_iter = iter.iter.clone();
    let start_point = iter.current_pos_data;
    let mut s = String::new();
    s.push(left_codepoint);
    // Accumulate the contiguous run of characters needing complex breaking.
    loop {
        debug_assert!(!iter.is_eof());
        s.push(iter.get_current_codepoint()?);
        iter.advance_iter();
        if let Some(current_codepoint) = iter.get_current_codepoint() {
            if !T::use_complex_breaking(iter, current_codepoint) {
                break;
            }
        } else {
            // EOF
            break;
        }
    }

    // Restore iterator to move to head of complex string
    iter.iter = start_iter;
    iter.current_pos_data = start_point;
    let breaks = complex_language_segment_str(iter.complex, &s);
    iter.result_cache = breaks;
    let first_pos = *iter.result_cache.first()?;
    // `i` tracks UTF-8 offset within the collected run, starting past the left
    // character (which the caller has already consumed).
    let mut i = left_codepoint.len_utf8();
    loop {
        if i == first_pos {
            // Re-calculate breaking offset
            iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
            return iter.get_current_position();
        }
        debug_assert!(
            i < first_pos,
            "we should always arrive at first_pos: near index {:?}",
            iter.get_current_position()
        );
        i += T::get_current_position_character_len(iter);
        iter.advance_iter();
        if iter.is_eof() {
            iter.result_cache.clear();
            return Some(iter.len);
        }
    }
}

// Line breaking over Latin-1 bytes; Latin-1 never needs complex breaking.
#[derive(Debug)]
pub struct LineBreakTypeLatin1;

impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeLatin1 {
    type IterAttr = Latin1Indices<'s>;
    type CharType = u8;

    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u8) -> u8 {
        // No CJ on Latin1
        get_linebreak_property_latin1(&iterator.data.property_table, c)
    }

    #[inline]
    fn use_complex_breaking(_iterator: &LineBreakIterator<Self>, _c: u8) -> bool {
        false
    }

    fn get_current_position_character_len(_: &LineBreakIterator<Self>) -> usize {
        // Only called from the complex-language/result-cache paths, which
        // `use_complex_breaking == false` makes unreachable for Latin-1.
        unreachable!()
    }

    fn handle_complex_language(
        _: &mut LineBreakIterator<Self>,
        _: Self::CharType,
    ) -> Option<usize> {
        unreachable!()
    }
}

// Line breaking over UTF-16 code units; characters surface as u32 scalar values.
#[derive(Debug)]
pub struct LineBreakTypeUtf16;

impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeUtf16 {
    type IterAttr = Utf16Indices<'s>;
    type CharType = u32;

    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u32) -> u8 {
        get_linebreak_property_utf32_with_rule(
            &iterator.data.property_table,
            c,
            iterator.options.strictness,
            iterator.options.word_option,
        )
    }

    #[inline]
    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: u32) -> bool {
        use_complex_breaking_utf32(&iterator.data.property_table, c)
    }

    fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize {
        // Supplementary-plane characters occupy a surrogate pair (2 code units).
        match iterator.get_current_codepoint() {
            None => 0,
            Some(ch) if ch >= 0x10000 => 2,
            _ => 1,
        }
    }

    fn handle_complex_language(
        iterator: &mut LineBreakIterator<Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize> {
        // word segmenter doesn't define break rules for some languages such as Thai.
        let start_iter = iterator.iter.clone();
        let start_point = iterator.current_pos_data;
        // Collect the SA run as u16 code units (complex scripts here are BMP-only).
        let mut s = vec![left_codepoint as u16];
        loop {
            debug_assert!(!iterator.is_eof());
            s.push(iterator.get_current_codepoint()? as u16);
            iterator.advance_iter();
            if let Some(current_codepoint) = iterator.get_current_codepoint() {
                if !Self::use_complex_breaking(iterator, current_codepoint) {
                    break;
                }
            } else {
                // EOF
                break;
            }
        }

        // Restore iterator to move to head of complex string
        iterator.iter = start_iter;
        iterator.current_pos_data = start_point;
        let breaks = complex_language_segment_utf16(iterator.complex, &s);
        iterator.result_cache = breaks;
        // result_cache vector is utf-16 index that is in BMP.
+ let first_pos = *iterator.result_cache.first()?; + let mut i = 1; + loop { + if i == first_pos { + // Re-calculate breaking offset + iterator.result_cache = iterator + .result_cache + .iter() + .skip(1) + .map(|r| r - i) + .collect(); + return iterator.get_current_position(); + } + debug_assert!( + i < first_pos, + "we should always arrive at first_pos: near index {:?}", + iterator.get_current_position() + ); + i += 1; + iterator.advance_iter(); + if iterator.is_eof() { + iterator.result_cache.clear(); + return Some(iterator.len); + } + } + } +} + +#[cfg(test)] +#[cfg(feature = "serde")] +mod tests { + use super::*; + use crate::LineSegmenter; + + #[test] + fn linebreak_property() { + let payload = DataProvider::<LineBreakDataV1Marker>::load( + &crate::provider::Baked, + Default::default(), + ) + .expect("Loading should succeed!") + .take_payload() + .expect("Data should be present!"); + + let get_linebreak_property = |codepoint| { + get_linebreak_property_with_rule( + &payload.get().property_table, + codepoint, + LineBreakStrictness::Strict, + LineBreakWordOption::Normal, + ) + }; + + assert_eq!(get_linebreak_property('\u{0020}'), SP); + assert_eq!(get_linebreak_property('\u{0022}'), QU); + assert_eq!(get_linebreak_property('('), OP_OP30); + assert_eq!(get_linebreak_property('\u{0030}'), NU); + assert_eq!(get_linebreak_property('['), OP_OP30); + assert_eq!(get_linebreak_property('\u{1f3fb}'), EM); + assert_eq!(get_linebreak_property('\u{20000}'), ID); + assert_eq!(get_linebreak_property('\u{e0020}'), CM); + assert_eq!(get_linebreak_property('\u{3041}'), CJ); + assert_eq!(get_linebreak_property('\u{0025}'), PO); + assert_eq!(get_linebreak_property('\u{00A7}'), AI); + assert_eq!(get_linebreak_property('\u{50005}'), XX); + assert_eq!(get_linebreak_property('\u{17D6}'), NS); + assert_eq!(get_linebreak_property('\u{2014}'), B2); + } + + #[test] + #[allow(clippy::bool_assert_comparison)] // clearer when we're testing bools directly + fn break_rule() { + let payload = 
DataProvider::<LineBreakDataV1Marker>::load( + &crate::provider::Baked, + Default::default(), + ) + .expect("Loading should succeed!") + .take_payload() + .expect("Data should be present!"); + let lb_data: &RuleBreakDataV1 = payload.get(); + + let is_break = |left, right| { + is_break_from_table( + &lb_data.break_state_table, + lb_data.property_count, + left, + right, + ) + }; + + // LB4 + assert_eq!(is_break(BK, AL), true); + // LB5 + assert_eq!(is_break(CR, LF), false); + assert_eq!(is_break(CR, AL), true); + assert_eq!(is_break(LF, AL), true); + assert_eq!(is_break(NL, AL), true); + // LB6 + assert_eq!(is_break(AL, BK), false); + assert_eq!(is_break(AL, CR), false); + assert_eq!(is_break(AL, LF), false); + assert_eq!(is_break(AL, NL), false); + // LB7 + assert_eq!(is_break(AL, SP), false); + assert_eq!(is_break(AL, ZW), false); + // LB8 + // LB8a + assert_eq!(is_break(ZWJ, AL), false); + // LB9 + assert_eq!(is_break(AL, ZWJ), false); + assert_eq!(is_break(AL, CM), false); + assert_eq!(is_break(ID, ZWJ), false); + // LB10 + assert_eq!(is_break(ZWJ, SP), false); + assert_eq!(is_break(SP, CM), true); + // LB11 + assert_eq!(is_break(AL, WJ), false); + assert_eq!(is_break(WJ, AL), false); + // LB12 + assert_eq!(is_break(GL, AL), false); + // LB12a + assert_eq!(is_break(AL, GL), false); + assert_eq!(is_break(SP, GL), true); + // LB13 + assert_eq!(is_break(AL, CL), false); + assert_eq!(is_break(AL, CP), false); + assert_eq!(is_break(AL, EX), false); + assert_eq!(is_break(AL, IS), false); + assert_eq!(is_break(AL, SY), false); + // LB18 + assert_eq!(is_break(SP, AL), true); + // LB19 + assert_eq!(is_break(AL, QU), false); + assert_eq!(is_break(QU, AL), false); + // LB20 + assert_eq!(is_break(AL, CB), true); + assert_eq!(is_break(CB, AL), true); + // LB20 + assert_eq!(is_break(AL, BA), false); + assert_eq!(is_break(AL, HY), false); + assert_eq!(is_break(AL, NS), false); + // LB21 + assert_eq!(is_break(AL, BA), false); + assert_eq!(is_break(BB, AL), false); + 
assert_eq!(is_break(ID, BA), false); + assert_eq!(is_break(ID, NS), false); + // LB21a + // LB21b + assert_eq!(is_break(SY, HL), false); + // LB22 + assert_eq!(is_break(AL, IN), false); + // LB 23 + assert_eq!(is_break(AL, NU), false); + assert_eq!(is_break(HL, NU), false); + // LB 23a + assert_eq!(is_break(PR, ID), false); + assert_eq!(is_break(PR, EB), false); + assert_eq!(is_break(PR, EM), false); + assert_eq!(is_break(ID, PO), false); + assert_eq!(is_break(EB, PO), false); + assert_eq!(is_break(EM, PO), false); + // LB26 + assert_eq!(is_break(JL, JL), false); + assert_eq!(is_break(JL, JV), false); + assert_eq!(is_break(JL, H2), false); + // LB27 + assert_eq!(is_break(JL, IN), false); + assert_eq!(is_break(JL, PO), false); + assert_eq!(is_break(PR, JL), false); + // LB28 + assert_eq!(is_break(AL, AL), false); + assert_eq!(is_break(HL, AL), false); + // LB29 + assert_eq!(is_break(IS, AL), false); + assert_eq!(is_break(IS, HL), false); + // LB30b + assert_eq!(is_break(EB, EM), false); + // LB31 + assert_eq!(is_break(ID, ID), true); + } + + #[test] + fn linebreak() { + let segmenter = LineSegmenter::try_new_dictionary_unstable(&crate::provider::Baked) + .expect("Data exists"); + + let mut iter = segmenter.segment_str("hello world"); + assert_eq!(Some(0), iter.next()); + assert_eq!(Some(6), iter.next()); + assert_eq!(Some(11), iter.next()); + assert_eq!(None, iter.next()); + + iter = segmenter.segment_str("$10 $10"); + assert_eq!(Some(0), iter.next()); + assert_eq!(Some(4), iter.next()); + assert_eq!(Some(7), iter.next()); + assert_eq!(None, iter.next()); + + // LB10 + + // LB14 + iter = segmenter.segment_str("[ abc def"); + assert_eq!(Some(0), iter.next()); + assert_eq!(Some(7), iter.next()); + assert_eq!(Some(10), iter.next()); + assert_eq!(None, iter.next()); + + let input: [u8; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66]; + let mut iter_u8 = segmenter.segment_latin1(&input); + assert_eq!(Some(0), iter_u8.next()); + assert_eq!(Some(7), 
iter_u8.next()); + assert_eq!(Some(10), iter_u8.next()); + assert_eq!(None, iter_u8.next()); + + let input: [u16; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66]; + let mut iter_u16 = segmenter.segment_utf16(&input); + assert_eq!(Some(0), iter_u16.next()); + assert_eq!(Some(7), iter_u16.next()); + assert_eq!(Some(10), iter_u16.next()); + assert_eq!(None, iter_u16.next()); + + // LB15 + iter = segmenter.segment_str("abc\u{0022} (def"); + assert_eq!(Some(0), iter.next()); + assert_eq!(Some(10), iter.next()); + assert_eq!(None, iter.next()); + + let input: [u8; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66]; + let mut iter_u8 = segmenter.segment_latin1(&input); + assert_eq!(Some(0), iter_u8.next()); + assert_eq!(Some(10), iter_u8.next()); + assert_eq!(None, iter_u8.next()); + + let input: [u16; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66]; + let mut iter_u16 = segmenter.segment_utf16(&input); + assert_eq!(Some(0), iter_u16.next()); + assert_eq!(Some(10), iter_u16.next()); + assert_eq!(None, iter_u16.next()); + + // LB16 + iter = segmenter.segment_str("\u{0029}\u{203C}"); + assert_eq!(Some(0), iter.next()); + assert_eq!(Some(4), iter.next()); + assert_eq!(None, iter.next()); + iter = segmenter.segment_str("\u{0029} \u{203C}"); + assert_eq!(Some(0), iter.next()); + assert_eq!(Some(6), iter.next()); + assert_eq!(None, iter.next()); + + let input: [u16; 4] = [0x29, 0x20, 0x20, 0x203c]; + let mut iter_u16 = segmenter.segment_utf16(&input); + assert_eq!(Some(0), iter_u16.next()); + assert_eq!(Some(4), iter_u16.next()); + assert_eq!(None, iter_u16.next()); + + // LB17 + iter = segmenter.segment_str("\u{2014}\u{2014}aa"); + assert_eq!(Some(0), iter.next()); + assert_eq!(Some(6), iter.next()); + assert_eq!(Some(8), iter.next()); + assert_eq!(None, iter.next()); + iter = segmenter.segment_str("\u{2014} \u{2014}aa"); + assert_eq!(Some(0), iter.next()); + assert_eq!(Some(8), iter.next()); + assert_eq!(Some(10), 
iter.next()); + assert_eq!(None, iter.next()); + + iter = segmenter.segment_str("\u{2014}\u{2014} \u{2014}\u{2014}123 abc"); + assert_eq!(Some(0), iter.next()); + assert_eq!(Some(14), iter.next()); + assert_eq!(Some(18), iter.next()); + assert_eq!(Some(21), iter.next()); + assert_eq!(None, iter.next()); + + // LB25 + let mut iter = segmenter.segment_str("(0,1)+(2,3)"); + assert_eq!(Some(0), iter.next()); + assert_eq!(Some(11), iter.next()); + assert_eq!(None, iter.next()); + let input: [u16; 11] = [ + 0x28, 0x30, 0x2C, 0x31, 0x29, 0x2B, 0x28, 0x32, 0x2C, 0x33, 0x29, + ]; + let mut iter_u16 = segmenter.segment_utf16(&input); + assert_eq!(Some(0), iter_u16.next()); + assert_eq!(Some(11), iter_u16.next()); + assert_eq!(None, iter_u16.next()); + + let input: [u16; 13] = [ + 0x2014, 0x2014, 0x20, 0x20, 0x2014, 0x2014, 0x31, 0x32, 0x33, 0x20, 0x61, 0x62, 0x63, + ]; + let mut iter_u16 = segmenter.segment_utf16(&input); + assert_eq!(Some(0), iter_u16.next()); + assert_eq!(Some(6), iter_u16.next()); + assert_eq!(Some(10), iter_u16.next()); + assert_eq!(Some(13), iter_u16.next()); + assert_eq!(None, iter_u16.next()); + + iter = segmenter.segment_str("\u{1F3FB} \u{1F3FB}"); + assert_eq!(Some(0), iter.next()); + assert_eq!(Some(5), iter.next()); + assert_eq!(Some(9), iter.next()); + assert_eq!(None, iter.next()); + } + + #[test] + #[cfg(feature = "lstm")] + fn thai_line_break() { + const TEST_STR: &str = "ภาษาไทยภาษาไทย"; + + let segmenter = LineSegmenter::new_lstm(); + let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect(); + assert_eq!(breaks, [0, 12, 21, 33, TEST_STR.len()], "Thai test"); + + let utf16: Vec<u16> = TEST_STR.encode_utf16().collect(); + let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect(); + assert_eq!(breaks, [0, 4, 7, 11, utf16.len()], "Thai test"); + + let utf16: [u16; 4] = [0x0e20, 0x0e32, 0x0e29, 0x0e32]; + let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect(); + assert_eq!(breaks, [0, 4], "Thai test"); + } + + 
#[test] + #[cfg(feature = "lstm")] + fn burmese_line_break() { + // "Burmese Language" in Burmese + const TEST_STR: &str = "မြန်မာဘာသာစကား"; + + let segmenter = LineSegmenter::new_lstm(); + let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect(); + // LSTM model breaks more characters, but it is better to return [30]. + assert_eq!(breaks, [0, 12, 18, 30, TEST_STR.len()], "Burmese test"); + + let utf16: Vec<u16> = TEST_STR.encode_utf16().collect(); + let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect(); + // LSTM model breaks more characters, but it is better to return [10]. + assert_eq!(breaks, [0, 4, 6, 10, utf16.len()], "Burmese utf-16 test"); + } + + #[test] + #[cfg(feature = "lstm")] + fn khmer_line_break() { + const TEST_STR: &str = "សេចក្ដីប្រកាសជាសកលស្ដីពីសិទ្ធិមនុស្ស"; + + let segmenter = LineSegmenter::new_lstm(); + let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect(); + // Note: This small sample matches the ICU dictionary segmenter + assert_eq!(breaks, [0, 39, 48, 54, 72, TEST_STR.len()], "Khmer test"); + + let utf16: Vec<u16> = TEST_STR.encode_utf16().collect(); + let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect(); + assert_eq!( + breaks, + [0, 13, 16, 18, 24, utf16.len()], + "Khmer utf-16 test" + ); + } + + #[test] + #[cfg(feature = "lstm")] + fn lao_line_break() { + const TEST_STR: &str = "ກ່ຽວກັບສິດຂອງມະນຸດ"; + + let segmenter = LineSegmenter::new_lstm(); + let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect(); + // Note: LSTM finds a break at '12' that the dictionary does not find + assert_eq!(breaks, [0, 12, 21, 30, 39, TEST_STR.len()], "Lao test"); + + let utf16: Vec<u16> = TEST_STR.encode_utf16().collect(); + let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect(); + assert_eq!(breaks, [0, 4, 7, 10, 13, utf16.len()], "Lao utf-16 test"); + } + + #[test] + fn empty_string() { + let segmenter = LineSegmenter::new_auto(); + let breaks: Vec<usize> = 
segmenter.segment_str("").collect(); + assert_eq!(breaks, [0]); + } +} diff --git a/third_party/rust/icu_segmenter/src/provider/lstm.rs b/third_party/rust/icu_segmenter/src/provider/lstm.rs new file mode 100644 index 0000000000..6a85680e4c --- /dev/null +++ b/third_party/rust/icu_segmenter/src/provider/lstm.rs @@ -0,0 +1,358 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Data provider struct definitions for the lstm + +// Provider structs must be stable +#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)] + +use icu_provider::prelude::*; +use zerovec::{ule::UnvalidatedStr, ZeroMap, ZeroVec}; + +// We do this instead of const generics because ZeroFrom and Yokeable derives, as well as serde +// don't support them +macro_rules! lstm_matrix { + ($name:ident, $generic:literal) => { + /// The struct that stores a LSTM's matrix. + /// + /// <div class="stab unstable"> + /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, + /// including in SemVer minor releases. While the serde representation of data structs is guaranteed + /// to be stable, their Rust representation might not be. Use with caution. + /// </div> + #[derive(PartialEq, Debug, Clone, zerofrom::ZeroFrom, yoke::Yokeable)] + #[cfg_attr(feature = "datagen", derive(serde::Serialize))] + pub struct $name<'data> { + // Invariant: dims.product() == data.len() + #[allow(missing_docs)] + pub(crate) dims: [u16; $generic], + #[allow(missing_docs)] + pub(crate) data: ZeroVec<'data, f32>, + } + + impl<'data> $name<'data> { + #[cfg(any(feature = "serde", feature = "datagen"))] + /// Creates a LstmMatrix with the given dimensions. Fails if the dimensions don't match the data. 
+ pub fn from_parts( + dims: [u16; $generic], + data: ZeroVec<'data, f32>, + ) -> Result<Self, DataError> { + if dims.iter().map(|&i| i as usize).product::<usize>() != data.len() { + Err(DataError::custom("Dimension mismatch")) + } else { + Ok(Self { dims, data }) + } + } + + #[doc(hidden)] // databake + pub const fn from_parts_unchecked( + dims: [u16; $generic], + data: ZeroVec<'data, f32>, + ) -> Self { + Self { dims, data } + } + } + + #[cfg(feature = "serde")] + impl<'de: 'data, 'data> serde::Deserialize<'de> for $name<'data> { + fn deserialize<S>(deserializer: S) -> Result<Self, S::Error> + where + S: serde::de::Deserializer<'de>, + { + #[derive(serde::Deserialize)] + struct Raw<'data> { + dims: [u16; $generic], + #[serde(borrow)] + data: ZeroVec<'data, f32>, + } + + let raw = Raw::deserialize(deserializer)?; + + use serde::de::Error; + Self::from_parts(raw.dims, raw.data) + .map_err(|_| S::Error::custom("Dimension mismatch")) + } + } + + #[cfg(feature = "datagen")] + impl databake::Bake for $name<'_> { + fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream { + let dims = self.dims.bake(env); + let data = self.data.bake(env); + databake::quote! { + icu_segmenter::provider::$name::from_parts_unchecked(#dims, #data) + } + } + } + }; +} + +lstm_matrix!(LstmMatrix1, 1); +lstm_matrix!(LstmMatrix2, 2); +lstm_matrix!(LstmMatrix3, 3); + +#[derive(PartialEq, Debug, Clone, Copy)] +#[cfg_attr( + feature = "datagen", + derive(serde::Serialize,databake::Bake), + databake(path = icu_segmenter::provider), +)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +/// The type of LSTM model +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. 
+/// </div> +pub enum ModelType { + /// A model working on code points + Codepoints, + /// A model working on grapheme clusters + GraphemeClusters, +} + +/// The struct that stores a LSTM model. +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +#[derive(PartialEq, Debug, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] +#[cfg_attr(feature = "datagen", derive(serde::Serialize))] +#[yoke(prove_covariance_manually)] +pub struct LstmDataFloat32<'data> { + /// Type of the model + pub(crate) model: ModelType, + /// The grapheme cluster dictionary used to train the model + pub(crate) dic: ZeroMap<'data, UnvalidatedStr, u16>, + /// The embedding layer. Shape (dic.len + 1, e) + pub(crate) embedding: LstmMatrix2<'data>, + /// The forward layer's first matrix. Shape (h, 4, e) + pub(crate) fw_w: LstmMatrix3<'data>, + /// The forward layer's second matrix. Shape (h, 4, h) + pub(crate) fw_u: LstmMatrix3<'data>, + /// The forward layer's bias. Shape (h, 4) + pub(crate) fw_b: LstmMatrix2<'data>, + /// The backward layer's first matrix. Shape (h, 4, e) + pub(crate) bw_w: LstmMatrix3<'data>, + /// The backward layer's second matrix. Shape (h, 4, h) + pub(crate) bw_u: LstmMatrix3<'data>, + /// The backward layer's bias. Shape (h, 4) + pub(crate) bw_b: LstmMatrix2<'data>, + /// The output layer's weights. Shape (2, 4, h) + pub(crate) time_w: LstmMatrix3<'data>, + /// The output layer's bias. 
Shape (4)
+ pub(crate) time_b: LstmMatrix1<'data>,
+}
+
+impl<'data> LstmDataFloat32<'data> {
+ #[doc(hidden)] // databake
+ #[allow(clippy::too_many_arguments)] // constructor
+ pub const fn from_parts_unchecked(
+ model: ModelType,
+ dic: ZeroMap<'data, UnvalidatedStr, u16>,
+ embedding: LstmMatrix2<'data>,
+ fw_w: LstmMatrix3<'data>,
+ fw_u: LstmMatrix3<'data>,
+ fw_b: LstmMatrix2<'data>,
+ bw_w: LstmMatrix3<'data>,
+ bw_u: LstmMatrix3<'data>,
+ bw_b: LstmMatrix2<'data>,
+ time_w: LstmMatrix3<'data>,
+ time_b: LstmMatrix1<'data>,
+ ) -> Self {
+ Self {
+ model,
+ dic,
+ embedding,
+ fw_w,
+ fw_u,
+ fw_b,
+ bw_w,
+ bw_u,
+ bw_b,
+ time_w,
+ time_b,
+ }
+ }
+
+ #[cfg(any(feature = "serde", feature = "datagen"))]
+ /// Creates a LstmDataFloat32 with the given data. Fails if the matrix dimensions are inconsistent.
+ #[allow(clippy::too_many_arguments)] // constructor
+ pub fn try_from_parts(
+ model: ModelType,
+ dic: ZeroMap<'data, UnvalidatedStr, u16>,
+ embedding: LstmMatrix2<'data>,
+ fw_w: LstmMatrix3<'data>,
+ fw_u: LstmMatrix3<'data>,
+ fw_b: LstmMatrix2<'data>,
+ bw_w: LstmMatrix3<'data>,
+ bw_u: LstmMatrix3<'data>,
+ bw_b: LstmMatrix2<'data>,
+ time_w: LstmMatrix3<'data>,
+ time_b: LstmMatrix1<'data>,
+ ) -> Result<Self, DataError> {
+ let dic_len = u16::try_from(dic.len())
+ .map_err(|_| DataError::custom("Dictionary does not fit in u16"))?;
+
+ let num_classes = embedding.dims[0];
+ let embedd_dim = embedding.dims[1];
+ let hunits = fw_u.dims[2];
+ if num_classes - 1 != dic_len
+ || fw_w.dims != [4, hunits, embedd_dim]
+ || fw_u.dims != [4, hunits, hunits]
+ || fw_b.dims != [4, hunits]
+ || bw_w.dims != [4, hunits, embedd_dim]
+ || bw_u.dims != [4, hunits, hunits]
+ || bw_b.dims != [4, hunits]
+ || time_w.dims != [2, 4, hunits]
+ || time_b.dims != [4]
+ {
+ return Err(DataError::custom("LSTM dimension mismatch"));
+ }
+
+ #[cfg(debug_assertions)]
+ if !dic.iter_copied_values().all(|(_, g)| g < dic_len) {
+ return Err(DataError::custom("Invalid cluster 
id")); + } + + Ok(Self { + model, + dic, + embedding, + fw_w, + fw_u, + fw_b, + bw_w, + bw_u, + bw_b, + time_w, + time_b, + }) + } +} + +#[cfg(feature = "serde")] +impl<'de: 'data, 'data> serde::Deserialize<'de> for LstmDataFloat32<'data> { + fn deserialize<S>(deserializer: S) -> Result<Self, S::Error> + where + S: serde::de::Deserializer<'de>, + { + #[derive(serde::Deserialize)] + struct Raw<'data> { + model: ModelType, + #[cfg_attr(feature = "serde", serde(borrow))] + dic: ZeroMap<'data, UnvalidatedStr, u16>, + #[cfg_attr(feature = "serde", serde(borrow))] + embedding: LstmMatrix2<'data>, + #[cfg_attr(feature = "serde", serde(borrow))] + fw_w: LstmMatrix3<'data>, + #[cfg_attr(feature = "serde", serde(borrow))] + fw_u: LstmMatrix3<'data>, + #[cfg_attr(feature = "serde", serde(borrow))] + fw_b: LstmMatrix2<'data>, + #[cfg_attr(feature = "serde", serde(borrow))] + bw_w: LstmMatrix3<'data>, + #[cfg_attr(feature = "serde", serde(borrow))] + bw_u: LstmMatrix3<'data>, + #[cfg_attr(feature = "serde", serde(borrow))] + bw_b: LstmMatrix2<'data>, + #[cfg_attr(feature = "serde", serde(borrow))] + time_w: LstmMatrix3<'data>, + #[cfg_attr(feature = "serde", serde(borrow))] + time_b: LstmMatrix1<'data>, + } + + let raw = Raw::deserialize(deserializer)?; + + use serde::de::Error; + Self::try_from_parts( + raw.model, + raw.dic, + raw.embedding, + raw.fw_w, + raw.fw_u, + raw.fw_b, + raw.bw_w, + raw.bw_u, + raw.bw_b, + raw.time_w, + raw.time_b, + ) + .map_err(|_| S::Error::custom("Invalid dimensions")) + } +} + +#[cfg(feature = "datagen")] +impl databake::Bake for LstmDataFloat32<'_> { + fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream { + let model = self.model.bake(env); + let dic = self.dic.bake(env); + let embedding = self.embedding.bake(env); + let fw_w = self.fw_w.bake(env); + let fw_u = self.fw_u.bake(env); + let fw_b = self.fw_b.bake(env); + let bw_w = self.bw_w.bake(env); + let bw_u = self.bw_u.bake(env); + let bw_b = self.bw_b.bake(env); + let time_w = 
self.time_w.bake(env); + let time_b = self.time_b.bake(env); + databake::quote! { + icu_segmenter::provider::LstmDataFloat32::from_parts_unchecked( + #model, + #dic, + #embedding, + #fw_w, + #fw_u, + #fw_b, + #bw_w, + #bw_u, + #bw_b, + #time_w, + #time_b, + ) + } + } +} + +/// The data to power the LSTM segmentation model. +/// +/// This data enum is extensible: more backends may be added in the future. +/// Old data can be used with newer code but not vice versa. +/// +/// Examples of possible future extensions: +/// +/// 1. Variant to store data in 16 instead of 32 bits +/// 2. Minor changes to the LSTM model, such as different forward/backward matrix sizes +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +#[icu_provider::data_struct(LstmForWordLineAutoV1Marker = "segmenter/lstm/wl_auto@1")] +#[derive(Debug, PartialEq, Clone)] +#[cfg_attr( + feature = "datagen", + derive(serde::Serialize, databake::Bake), + databake(path = icu_segmenter::provider), +)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +#[yoke(prove_covariance_manually)] +#[non_exhaustive] +pub enum LstmDataV1<'data> { + /// The data as matrices of zerovec f32 values. 
+ Float32(#[cfg_attr(feature = "serde", serde(borrow))] LstmDataFloat32<'data>), + // new variants should go BELOW existing ones + // Serde serializes based on variant name and index in the enum + // https://docs.rs/serde/latest/serde/trait.Serializer.html#tymethod.serialize_unit_variant +} + +pub(crate) struct LstmDataV1Marker; + +impl DataMarker for LstmDataV1Marker { + type Yokeable = LstmDataV1<'static>; +} diff --git a/third_party/rust/icu_segmenter/src/provider/mod.rs b/third_party/rust/icu_segmenter/src/provider/mod.rs new file mode 100644 index 0000000000..75f0d4d1e7 --- /dev/null +++ b/third_party/rust/icu_segmenter/src/provider/mod.rs @@ -0,0 +1,202 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component. +//! +//! <div class="stab unstable"> +//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +//! including in SemVer minor releases. While the serde representation of data structs is guaranteed +//! to be stable, their Rust representation might not be. Use with caution. +//! </div> +//! +//! Read more about data providers: [`icu_provider`] + +// Provider structs must be stable +#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)] + +mod lstm; +pub use lstm::*; + +// Re-export this from the provider module because it is needed by datagen +#[cfg(feature = "datagen")] +pub use crate::rule_segmenter::RuleStatusType; + +use icu_collections::codepointtrie::CodePointTrie; +use icu_provider::prelude::*; +use zerovec::ZeroVec; + +#[cfg(feature = "compiled_data")] +#[derive(Debug)] +/// Baked data +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. 
In particular, the `DataProvider` implementations are only +/// guaranteed to match with this version's `*_unstable` providers. Use with caution. +/// </div> +pub struct Baked; + +#[cfg(feature = "compiled_data")] +const _: () = { + pub mod icu { + pub use crate as segmenter; + pub use icu_collections as collections; + } + icu_segmenter_data::make_provider!(Baked); + icu_segmenter_data::impl_segmenter_dictionary_w_auto_v1!(Baked); + icu_segmenter_data::impl_segmenter_dictionary_wl_ext_v1!(Baked); + icu_segmenter_data::impl_segmenter_grapheme_v1!(Baked); + icu_segmenter_data::impl_segmenter_line_v1!(Baked); + #[cfg(feature = "lstm")] + icu_segmenter_data::impl_segmenter_lstm_wl_auto_v1!(Baked); + icu_segmenter_data::impl_segmenter_sentence_v1!(Baked); + icu_segmenter_data::impl_segmenter_word_v1!(Baked); +}; + +#[cfg(feature = "datagen")] +/// The latest minimum set of keys required by this component. +pub const KEYS: &[DataKey] = &[ + DictionaryForWordLineExtendedV1Marker::KEY, + DictionaryForWordOnlyAutoV1Marker::KEY, + GraphemeClusterBreakDataV1Marker::KEY, + LineBreakDataV1Marker::KEY, + LstmForWordLineAutoV1Marker::KEY, + SentenceBreakDataV1Marker::KEY, + WordBreakDataV1Marker::KEY, +]; + +/// Pre-processed Unicode data in the form of tables to be used for rule-based breaking. +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. 
+/// </div> +#[icu_provider::data_struct( + marker(LineBreakDataV1Marker, "segmenter/line@1", singleton), + marker(WordBreakDataV1Marker, "segmenter/word@1", singleton), + marker(GraphemeClusterBreakDataV1Marker, "segmenter/grapheme@1", singleton), + marker(SentenceBreakDataV1Marker, "segmenter/sentence@1", singleton) +)] +#[derive(Debug, PartialEq, Clone)] +#[cfg_attr( + feature = "datagen", + derive(serde::Serialize,databake::Bake), + databake(path = icu_segmenter::provider), +)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +pub struct RuleBreakDataV1<'data> { + /// Property table for rule-based breaking. + #[cfg_attr(feature = "serde", serde(borrow))] + pub property_table: RuleBreakPropertyTable<'data>, + + /// Break state table for rule-based breaking. + #[cfg_attr(feature = "serde", serde(borrow))] + pub break_state_table: RuleBreakStateTable<'data>, + + /// Rule status table for rule-based breaking. + #[cfg_attr(feature = "serde", serde(borrow))] + pub rule_status_table: RuleStatusTable<'data>, + + /// Number of properties; should be the square root of the length of [`Self::break_state_table`]. + pub property_count: u8, + + /// The index of the last simple state for [`Self::break_state_table`]. (A simple state has no + /// `left` nor `right` in SegmenterProperty). + pub last_codepoint_property: i8, + + /// The index of SOT (start of text) state for [`Self::break_state_table`]. + pub sot_property: u8, + + /// The index of EOT (end of text) state [`Self::break_state_table`]. + pub eot_property: u8, + + /// The index of "SA" state (or 127 if the complex language isn't handled) for + /// [`Self::break_state_table`]. + pub complex_property: u8, +} + +/// Property table for rule-based breaking. +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. 
While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] +#[cfg_attr( + feature = "datagen", + derive(serde::Serialize,databake::Bake), + databake(path = icu_segmenter::provider), +)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +pub struct RuleBreakPropertyTable<'data>( + #[cfg_attr(feature = "serde", serde(borrow))] pub CodePointTrie<'data, u8>, +); + +/// Break state table for rule-based breaking. +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] +#[cfg_attr( + feature = "datagen", + derive(serde::Serialize,databake::Bake), + databake(path = icu_segmenter::provider), +)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +pub struct RuleBreakStateTable<'data>( + #[cfg_attr(feature = "serde", serde(borrow))] pub ZeroVec<'data, i8>, +); + +/// Rules status data for rule_status and is_word_like of word segmenter. +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. 
+/// </div> +#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] +#[cfg_attr( + feature = "datagen", + derive(serde::Serialize,databake::Bake), + databake(path = icu_segmenter::provider), +)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +pub struct RuleStatusTable<'data>( + #[cfg_attr(feature = "serde", serde(borrow))] pub ZeroVec<'data, u8>, +); + +/// char16trie data for dictionary break +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +#[icu_provider::data_struct( + DictionaryForWordOnlyAutoV1Marker = "segmenter/dictionary/w_auto@1", + DictionaryForWordLineExtendedV1Marker = "segmenter/dictionary/wl_ext@1" +)] +#[derive(Debug, PartialEq, Clone)] +#[cfg_attr( + feature = "datagen", + derive(serde::Serialize,databake::Bake), + databake(path = icu_segmenter::provider), +)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +pub struct UCharDictionaryBreakDataV1<'data> { + /// Dictionary data of char16trie. + #[cfg_attr(feature = "serde", serde(borrow))] + pub trie_data: ZeroVec<'data, u16>, +} + +pub(crate) struct UCharDictionaryBreakDataV1Marker; + +impl DataMarker for UCharDictionaryBreakDataV1Marker { + type Yokeable = UCharDictionaryBreakDataV1<'static>; +} diff --git a/third_party/rust/icu_segmenter/src/rule_segmenter.rs b/third_party/rust/icu_segmenter/src/rule_segmenter.rs new file mode 100644 index 0000000000..740138e4ca --- /dev/null +++ b/third_party/rust/icu_segmenter/src/rule_segmenter.rs @@ -0,0 +1,349 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use crate::complex::ComplexPayloads; +use crate::indices::{Latin1Indices, Utf16Indices}; +use crate::provider::RuleBreakDataV1; +use crate::symbols::*; +use core::str::CharIndices; +use utf8_iter::Utf8CharIndices; + +/// The category tag that is returned by +/// [`WordBreakIterator::word_type()`][crate::WordBreakIterator::word_type()]. +#[non_exhaustive] +#[derive(Copy, Clone, PartialEq, Debug)] +#[repr(u8)] +pub enum RuleStatusType { + /// No category tag + None = 0, + /// Number category tag + Number = 1, + /// Letter category tag, including CJK. + Letter = 2, +} + +/// A trait allowing for RuleBreakIterator to be generalized to multiple string +/// encoding methods and granularity such as grapheme cluster, word, etc. +pub trait RuleBreakType<'l, 's> { + /// The iterator over characters. + type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone + core::fmt::Debug; + + /// The character type. + type CharType: Copy + Into<u32> + core::fmt::Debug; + + fn get_current_position_character_len(iter: &RuleBreakIterator<'l, 's, Self>) -> usize; + + fn handle_complex_language( + iter: &mut RuleBreakIterator<'l, 's, Self>, + left_codepoint: Self::CharType, + ) -> Option<usize>; +} + +/// Implements the [`Iterator`] trait over the segmenter boundaries of the given string. +/// +/// Lifetimes: +/// +/// - `'l` = lifetime of the segmenter object from which this iterator was created +/// - `'s` = lifetime of the string being segmented +/// +/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit +/// _after_ the boundary (for a boundary at the end of text, this index is the length +/// of the [`str`] or array of code units). 
+#[derive(Debug)] +pub struct RuleBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> { + pub(crate) iter: Y::IterAttr, + pub(crate) len: usize, + pub(crate) current_pos_data: Option<(usize, Y::CharType)>, + pub(crate) result_cache: alloc::vec::Vec<usize>, + pub(crate) data: &'l RuleBreakDataV1<'l>, + pub(crate) complex: Option<&'l ComplexPayloads>, + pub(crate) boundary_property: u8, +} + +impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'l, 's, Y> { + type Item = usize; + + fn next(&mut self) -> Option<Self::Item> { + // If we have break point cache by previous run, return this result + if let Some(&first_result) = self.result_cache.first() { + let mut i = 0; + loop { + if i == first_result { + self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect(); + return self.get_current_position(); + } + i += Y::get_current_position_character_len(self); + self.advance_iter(); + if self.is_eof() { + self.result_cache.clear(); + return Some(self.len); + } + } + } + + if self.is_eof() { + self.advance_iter(); + if self.is_eof() && self.len == 0 { + // Empty string. Since `self.current_pos_data` is always going to be empty, + // we never read `self.len` except for here, so we can use it to mark that + // we have already returned the single empty-string breakpoint. 
+ self.len = 1;
+ return Some(0);
+ }
+ // SOT x anything
+ let right_prop = self.get_current_break_property()?;
+ if self.is_break_from_table(self.data.sot_property, right_prop) {
+ self.boundary_property = 0; // SOT is special type
+ return self.get_current_position();
+ }
+ }
+
+ loop {
+ debug_assert!(!self.is_eof());
+ let left_codepoint = self.get_current_codepoint()?;
+ let left_prop = self.get_break_property(left_codepoint);
+ self.advance_iter();
+
+ let Some(right_prop) = self.get_current_break_property() else {
+ self.boundary_property = left_prop;
+ return Some(self.len);
+ };
+
+ // Some segmenter rules don't have language-specific rules; we have to use the LSTM (or dictionary) segmenter.
+ // If the property is marked as SA, use it
+ if right_prop == self.data.complex_property {
+ if left_prop != self.data.complex_property {
+ // break before SA
+ self.boundary_property = left_prop;
+ return self.get_current_position();
+ }
+ let break_offset = Y::handle_complex_language(self, left_codepoint);
+ if break_offset.is_some() {
+ return break_offset;
+ }
+ }
+
+ // If break_state is greater than or equal to 0, it is an alias of a property.
+ let mut break_state = self.get_break_state_from_table(left_prop, right_prop);
+
+ if break_state >= 0 {
+ // This isn't simple rule set. We need marker to restore iterator to previous position.
+ let mut previous_iter = self.iter.clone();
+ let mut previous_pos_data = self.current_pos_data;
+ let mut previous_left_prop = left_prop;
+
+ break_state &= !INTERMEDIATE_MATCH_RULE;
+ loop {
+ self.advance_iter();
+
+ let Some(prop) = self.get_current_break_property() else {
+ // Reached EOF. But we are analyzing multiple characters now, so next break may be previous point. 
+ self.boundary_property = break_state as u8; + if self + .get_break_state_from_table(break_state as u8, self.data.eot_property) + == NOT_MATCH_RULE + { + self.boundary_property = previous_left_prop; + self.iter = previous_iter; + self.current_pos_data = previous_pos_data; + return self.get_current_position(); + } + // EOF + return Some(self.len); + }; + + let previous_break_state = break_state; + break_state = self.get_break_state_from_table(break_state as u8, prop); + if break_state < 0 { + break; + } + if previous_break_state >= 0 + && previous_break_state <= self.data.last_codepoint_property + { + // Move marker + previous_iter = self.iter.clone(); + previous_pos_data = self.current_pos_data; + previous_left_prop = break_state as u8; + } + if (break_state & INTERMEDIATE_MATCH_RULE) != 0 { + break_state -= INTERMEDIATE_MATCH_RULE; + previous_iter = self.iter.clone(); + previous_pos_data = self.current_pos_data; + previous_left_prop = break_state as u8; + } + } + if break_state == KEEP_RULE { + continue; + } + if break_state == NOT_MATCH_RULE { + self.boundary_property = previous_left_prop; + self.iter = previous_iter; + self.current_pos_data = previous_pos_data; + return self.get_current_position(); + } + return self.get_current_position(); + } + + if self.is_break_from_table(left_prop, right_prop) { + self.boundary_property = left_prop; + return self.get_current_position(); + } + } + } +} + +impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> RuleBreakIterator<'l, 's, Y> { + pub(crate) fn advance_iter(&mut self) { + self.current_pos_data = self.iter.next(); + } + + pub(crate) fn is_eof(&self) -> bool { + self.current_pos_data.is_none() + } + + pub(crate) fn get_current_break_property(&self) -> Option<u8> { + self.get_current_codepoint() + .map(|c| self.get_break_property(c)) + } + + pub(crate) fn get_current_position(&self) -> Option<usize> { + self.current_pos_data.map(|(pos, _)| pos) + } + + pub(crate) fn get_current_codepoint(&self) -> Option<Y::CharType> { + 
self.current_pos_data.map(|(_, codepoint)| codepoint) + } + + fn get_break_property(&self, codepoint: Y::CharType) -> u8 { + // Note: Default value is 0 == UNKNOWN + self.data.property_table.0.get32(codepoint.into()) + } + + fn get_break_state_from_table(&self, left: u8, right: u8) -> i8 { + let idx = left as usize * self.data.property_count as usize + right as usize; + // We use unwrap_or to fall back to the base case and prevent panics on bad data. + self.data.break_state_table.0.get(idx).unwrap_or(KEEP_RULE) + } + + fn is_break_from_table(&self, left: u8, right: u8) -> bool { + let rule = self.get_break_state_from_table(left, right); + if rule == KEEP_RULE { + return false; + } + if rule >= 0 { + // need additional next characters to get break rule. + return false; + } + true + } + + /// Return the status value of break boundary. + /// If segmenter isn't word, always return RuleStatusType::None + pub fn rule_status(&self) -> RuleStatusType { + if self.result_cache.first().is_some() { + // Dictionary type (CJ and East Asian) is letter. 
+ return RuleStatusType::Letter; + } + if self.boundary_property == 0 { + // break position is SOT / Any + return RuleStatusType::None; + } + match self + .data + .rule_status_table + .0 + .get((self.boundary_property - 1) as usize) + { + Some(1) => RuleStatusType::Number, + Some(2) => RuleStatusType::Letter, + _ => RuleStatusType::None, + } + } + + /// Return true when break boundary is word-like such as letter/number/CJK + /// If segmenter isn't word, return false + pub fn is_word_like(&self) -> bool { + self.rule_status() != RuleStatusType::None + } +} + +#[derive(Debug)] +pub struct RuleBreakTypeUtf8; + +impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypeUtf8 { + type IterAttr = CharIndices<'s>; + type CharType = char; + + fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize { + iter.get_current_codepoint().map_or(0, |c| c.len_utf8()) + } + + fn handle_complex_language( + _: &mut RuleBreakIterator<Self>, + _: Self::CharType, + ) -> Option<usize> { + unreachable!() + } +} + +#[derive(Debug)] +pub struct RuleBreakTypePotentiallyIllFormedUtf8; + +impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypePotentiallyIllFormedUtf8 { + type IterAttr = Utf8CharIndices<'s>; + type CharType = char; + + fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize { + iter.get_current_codepoint().map_or(0, |c| c.len_utf8()) + } + + fn handle_complex_language( + _: &mut RuleBreakIterator<Self>, + _: Self::CharType, + ) -> Option<usize> { + unreachable!() + } +} + +#[derive(Debug)] +pub struct RuleBreakTypeLatin1; + +impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypeLatin1 { + type IterAttr = Latin1Indices<'s>; + type CharType = u8; + + fn get_current_position_character_len(_: &RuleBreakIterator<Self>) -> usize { + unreachable!() + } + + fn handle_complex_language( + _: &mut RuleBreakIterator<Self>, + _: Self::CharType, + ) -> Option<usize> { + unreachable!() + } +} + +#[derive(Debug)] +pub struct RuleBreakTypeUtf16; + +impl<'l, 's> 
RuleBreakType<'l, 's> for RuleBreakTypeUtf16 { + type IterAttr = Utf16Indices<'s>; + type CharType = u32; + + fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize { + match iter.get_current_codepoint() { + None => 0, + Some(ch) if ch >= 0x10000 => 2, + _ => 1, + } + } + + fn handle_complex_language( + _: &mut RuleBreakIterator<Self>, + _: Self::CharType, + ) -> Option<usize> { + unreachable!() + } +} diff --git a/third_party/rust/icu_segmenter/src/sentence.rs b/third_party/rust/icu_segmenter/src/sentence.rs new file mode 100644 index 0000000000..05173f9eb5 --- /dev/null +++ b/third_party/rust/icu_segmenter/src/sentence.rs @@ -0,0 +1,220 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use alloc::vec::Vec; +use icu_provider::prelude::*; + +use crate::indices::{Latin1Indices, Utf16Indices}; +use crate::iterator_helpers::derive_usize_iterator_with_type; +use crate::rule_segmenter::*; +use crate::{provider::*, SegmenterError}; +use utf8_iter::Utf8CharIndices; + +/// Implements the [`Iterator`] trait over the sentence boundaries of the given string. +/// +/// Lifetimes: +/// +/// - `'l` = lifetime of the segmenter object from which this iterator was created +/// - `'s` = lifetime of the string being segmented +/// +/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit +/// _after_ the boundary (for a boundary at the end of text, this index is the length +/// of the [`str`] or array of code units). +/// +/// For examples of use, see [`SentenceSegmenter`]. +#[derive(Debug)] +pub struct SentenceBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized>( + RuleBreakIterator<'l, 's, Y>, +); + +derive_usize_iterator_with_type!(SentenceBreakIterator); + +/// Sentence break iterator for an `str` (a UTF-8 string). +/// +/// For examples of use, see [`SentenceSegmenter`]. 
+pub type SentenceBreakIteratorUtf8<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeUtf8>; + +/// Sentence break iterator for a potentially invalid UTF-8 string. +/// +/// For examples of use, see [`SentenceSegmenter`]. +pub type SentenceBreakIteratorPotentiallyIllFormedUtf8<'l, 's> = + SentenceBreakIterator<'l, 's, RuleBreakTypePotentiallyIllFormedUtf8>; + +/// Sentence break iterator for a Latin-1 (8-bit) string. +/// +/// For examples of use, see [`SentenceSegmenter`]. +pub type SentenceBreakIteratorLatin1<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeLatin1>; + +/// Sentence break iterator for a UTF-16 string. +/// +/// For examples of use, see [`SentenceSegmenter`]. +pub type SentenceBreakIteratorUtf16<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeUtf16>; + +/// Supports loading sentence break data, and creating sentence break iterators for different string +/// encodings. +/// +/// # Examples +/// +/// Segment a string: +/// +/// ```rust +/// use icu_segmenter::SentenceSegmenter; +/// let segmenter = SentenceSegmenter::new(); +/// +/// let breakpoints: Vec<usize> = +/// segmenter.segment_str("Hello World").collect(); +/// assert_eq!(&breakpoints, &[0, 11]); +/// ``` +/// +/// Segment a Latin1 byte string: +/// +/// ```rust +/// use icu_segmenter::SentenceSegmenter; +/// let segmenter = SentenceSegmenter::new(); +/// +/// let breakpoints: Vec<usize> = +/// segmenter.segment_latin1(b"Hello World").collect(); +/// assert_eq!(&breakpoints, &[0, 11]); +/// ``` +/// +/// Successive boundaries can be used to retrieve the sentences. +/// In particular, the first boundary is always 0, and the last one is the +/// length of the segmented text in code units. +/// +/// ```rust +/// # use icu_segmenter::SentenceSegmenter; +/// # let segmenter = SentenceSegmenter::new(); +/// use itertools::Itertools; +/// let text = "Ceci tuera cela. 
Le livre tuera l’édifice."; +/// let sentences: Vec<&str> = segmenter +/// .segment_str(text) +/// .tuple_windows() +/// .map(|(i, j)| &text[i..j]) +/// .collect(); +/// assert_eq!( +/// &sentences, +/// &["Ceci tuera cela. ", "Le livre tuera l’édifice."] +/// ); +/// ``` +#[derive(Debug)] +pub struct SentenceSegmenter { + payload: DataPayload<SentenceBreakDataV1Marker>, +} + +#[cfg(feature = "compiled_data")] +impl Default for SentenceSegmenter { + fn default() -> Self { + Self::new() + } +} + +impl SentenceSegmenter { + /// Constructs a [`SentenceSegmenter`] with an invariant locale and compiled data. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + pub fn new() -> Self { + Self { + payload: DataPayload::from_static_ref( + crate::provider::Baked::SINGLETON_SEGMENTER_SENTENCE_V1, + ), + } + } + + icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: SegmenterError, + #[cfg(skip)] + functions: [ + new, + try_new_with_any_provider, + try_new_with_buffer_provider, + try_new_unstable, + Self, + ] + ); + + #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)] + pub fn try_new_unstable<D>(provider: &D) -> Result<Self, SegmenterError> + where + D: DataProvider<SentenceBreakDataV1Marker> + ?Sized, + { + let payload = provider.load(Default::default())?.take_payload()?; + Ok(Self { payload }) + } + + /// Creates a sentence break iterator for an `str` (a UTF-8 string). + /// + /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. 
+ pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> SentenceBreakIteratorUtf8<'l, 's> { + SentenceBreakIterator(RuleBreakIterator { + iter: input.char_indices(), + len: input.len(), + current_pos_data: None, + result_cache: Vec::new(), + data: self.payload.get(), + complex: None, + boundary_property: 0, + }) + } + /// Creates a sentence break iterator for a potentially ill-formed UTF8 string + /// + /// Invalid characters are treated as REPLACEMENT CHARACTER + /// + /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. + pub fn segment_utf8<'l, 's>( + &'l self, + input: &'s [u8], + ) -> SentenceBreakIteratorPotentiallyIllFormedUtf8<'l, 's> { + SentenceBreakIterator(RuleBreakIterator { + iter: Utf8CharIndices::new(input), + len: input.len(), + current_pos_data: None, + result_cache: Vec::new(), + data: self.payload.get(), + complex: None, + boundary_property: 0, + }) + } + /// Creates a sentence break iterator for a Latin-1 (8-bit) string. + /// + /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. + pub fn segment_latin1<'l, 's>( + &'l self, + input: &'s [u8], + ) -> SentenceBreakIteratorLatin1<'l, 's> { + SentenceBreakIterator(RuleBreakIterator { + iter: Latin1Indices::new(input), + len: input.len(), + current_pos_data: None, + result_cache: Vec::new(), + data: self.payload.get(), + complex: None, + boundary_property: 0, + }) + } + + /// Creates a sentence break iterator for a UTF-16 string. + /// + /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. 
+ pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> SentenceBreakIteratorUtf16<'l, 's> { + SentenceBreakIterator(RuleBreakIterator { + iter: Utf16Indices::new(input), + len: input.len(), + current_pos_data: None, + result_cache: Vec::new(), + data: self.payload.get(), + complex: None, + boundary_property: 0, + }) + } +} + +#[cfg(all(test, feature = "serde"))] +#[test] +fn empty_string() { + let segmenter = SentenceSegmenter::new(); + let breaks: Vec<usize> = segmenter.segment_str("").collect(); + assert_eq!(breaks, [0]); +} diff --git a/third_party/rust/icu_segmenter/src/symbols.rs b/third_party/rust/icu_segmenter/src/symbols.rs new file mode 100644 index 0000000000..b2c9a2450f --- /dev/null +++ b/third_party/rust/icu_segmenter/src/symbols.rs @@ -0,0 +1,141 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +// TODO(#1637): The numeric values of these symbols are generated by the old transformation code +// (aka build.rs). We should move these symbols into RuleBreakDataV1, and remove this file. + +// Used by line.rs. 
// Line-break property classes. The names mirror the UAX #14 line break
// classes (AI, AL, B2, BA, ...) with a few data-driven refinements (e.g.
// ID_CN, OP_EA, PO_EAW). The numeric values come from the old transformation
// code (build.rs) and must stay in sync with the generated break data — do
// not renumber or reorder.
#[allow(dead_code)]
pub const UNKNOWN: u8 = 0;
#[allow(dead_code)]
pub const AI: u8 = 1;
#[allow(dead_code)]
pub const AL: u8 = 2;
#[allow(dead_code)]
pub const B2: u8 = 3;
#[allow(dead_code)]
pub const BA: u8 = 4;
#[allow(dead_code)]
pub const BB: u8 = 5;
#[allow(dead_code)]
pub const BK: u8 = 6;
#[allow(dead_code)]
pub const CB: u8 = 7;
#[allow(dead_code)]
pub const CJ: u8 = 8;
#[allow(dead_code)]
pub const CL: u8 = 9;
#[allow(dead_code)]
pub const CM: u8 = 10;
#[allow(dead_code)]
pub const CP: u8 = 11;
#[allow(dead_code)]
pub const CR: u8 = 12;
#[allow(dead_code)]
pub const EB: u8 = 13;
#[allow(dead_code)]
pub const EM: u8 = 14;
#[allow(dead_code)]
pub const EX: u8 = 15;
#[allow(dead_code)]
pub const GL: u8 = 16;
#[allow(dead_code)]
pub const H2: u8 = 17;
#[allow(dead_code)]
pub const H3: u8 = 18;
#[allow(dead_code)]
pub const HL: u8 = 19;
#[allow(dead_code)]
pub const HY: u8 = 20;
#[allow(dead_code)]
pub const ID: u8 = 21;
#[allow(dead_code)]
pub const ID_CN: u8 = 22;
#[allow(dead_code)]
pub const IN: u8 = 23;
#[allow(dead_code)]
pub const IS: u8 = 24;
#[allow(dead_code)]
pub const JL: u8 = 25;
#[allow(dead_code)]
pub const JT: u8 = 26;
#[allow(dead_code)]
pub const JV: u8 = 27;
#[allow(dead_code)]
pub const LF: u8 = 28;
#[allow(dead_code)]
pub const NL: u8 = 29;
#[allow(dead_code)]
pub const NS: u8 = 30;
#[allow(dead_code)]
pub const NU: u8 = 31;
#[allow(dead_code)]
pub const OP_EA: u8 = 32;
#[allow(dead_code)]
pub const OP_OP30: u8 = 33;
#[allow(dead_code)]
pub const PO: u8 = 34;
#[allow(dead_code)]
pub const PO_EAW: u8 = 35;
#[allow(dead_code)]
pub const PR: u8 = 36;
#[allow(dead_code)]
pub const PR_EAW: u8 = 37;
#[allow(dead_code)]
pub const QU: u8 = 38;
#[allow(dead_code)]
pub const RI: u8 = 39;
#[allow(dead_code)]
pub const SA: u8 = 40;
#[allow(dead_code)]
pub const SG: u8 = 41;
#[allow(dead_code)]
pub const SP: u8 = 42;
#[allow(dead_code)]
pub const SY: u8 = 43;
#[allow(dead_code)]
pub const WJ: u8 = 44;
#[allow(dead_code)]
pub const XX: u8 = 45;
#[allow(dead_code)]
pub const ZW: u8 = 46;
#[allow(dead_code)]
pub const ZWJ: u8 = 47;
// Synthetic states beyond the plain break classes: these appear to be
// intermediate states for matching multi-character rules (e.g. the LB25_*
// states track number sequences for line-break rule LB25) — see the rule
// tables generated by build.rs.
#[allow(dead_code)]
pub const OP_SP: u8 = 48;
#[allow(dead_code)]
pub const QU_SP: u8 = 49;
#[allow(dead_code)]
pub const CL_CP_SP: u8 = 50;
#[allow(dead_code)]
pub const B2_SP: u8 = 51;
#[allow(dead_code)]
pub const HL_HY: u8 = 52;
#[allow(dead_code)]
pub const LB25_HY: u8 = 53;
#[allow(dead_code)]
pub const LB25_OP: u8 = 54;
#[allow(dead_code)]
pub const LB25_NU_IS: u8 = 55;
#[allow(dead_code)]
pub const LB25_NU_SY: u8 = 56;
#[allow(dead_code)]
pub const LB25_NU_CL: u8 = 57;
#[allow(dead_code)]
pub const LB25_NU_CP: u8 = 58;
#[allow(dead_code)]
pub const RI_RI: u8 = 59;
// Start-of-text / end-of-text sentinels.
#[allow(dead_code)]
pub const SOT: u8 = 60;
#[allow(dead_code)]
pub const EOT: u8 = 61;

// Used by all segmenters. Negative values are rule outcomes rather than
// property classes, so they cannot collide with the u8 classes above.
pub const BREAK_RULE: i8 = -128;
pub const UNKNOWN_RULE: i8 = -127;
pub const NOT_MATCH_RULE: i8 = -2;
pub const KEEP_RULE: i8 = -1;
// This is a mask bit chosen sufficiently large than all other concrete states.
// If a break state contains this bit, we have to look ahead one more character.
pub const INTERMEDIATE_MATCH_RULE: i8 = 64;
diff --git a/third_party/rust/icu_segmenter/src/word.rs b/third_party/rust/icu_segmenter/src/word.rs
new file mode 100644
index 0000000000..de4af16543
--- /dev/null
+++ b/third_party/rust/icu_segmenter/src/word.rs
@@ -0,0 +1,605 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use crate::complex::*;
use crate::indices::{Latin1Indices, Utf16Indices};
use crate::iterator_helpers::derive_usize_iterator_with_type;
use crate::provider::*;
use crate::rule_segmenter::*;
use crate::SegmenterError;
use alloc::string::String;
use alloc::vec;
use alloc::vec::Vec;
use core::str::CharIndices;
use icu_provider::prelude::*;
use utf8_iter::Utf8CharIndices;

/// Implements the [`Iterator`] trait over the word boundaries of the given string.
///
/// Lifetimes:
///
/// - `'l` = lifetime of the segmenter object from which this iterator was created
/// - `'s` = lifetime of the string being segmented
///
/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
/// _after_ the boundary (for a boundary at the end of text, this index is the length
/// of the [`str`] or array of code units).
///
/// For examples of use, see [`WordSegmenter`].
#[derive(Debug)]
pub struct WordBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized>(
    RuleBreakIterator<'l, 's, Y>,
);

// Forwards the `Iterator<Item = usize>` implementation to the wrapped
// `RuleBreakIterator`.
derive_usize_iterator_with_type!(WordBreakIterator);

/// The word type tag that is returned by [`WordBreakIterator::word_type()`].
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Debug)]
#[repr(u8)]
pub enum WordType {
    /// No category tag.
    None = 0,
    /// Number category tag.
    Number = 1,
    /// Letter category tag, including CJK.
    Letter = 2,
}

impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> WordBreakIterator<'l, 's, Y> {
    /// Returns the word type of the segment preceding the current boundary.
    #[inline]
    pub fn word_type(&self) -> WordType {
        // Translate the internal rule status into the public, non-exhaustive enum.
        match self.0.rule_status() {
            RuleStatusType::None => WordType::None,
            RuleStatusType::Number => WordType::Number,
            RuleStatusType::Letter => WordType::Letter,
        }
    }

    /// Returns `true` when the segment preceding the current boundary is word-like,
    /// such as letter, number, or CJK.
    #[inline]
    pub fn is_word_like(&self) -> bool {
        self.0.is_word_like()
    }
}

/// Word break iterator for an `str` (a UTF-8 string).
///
/// For examples of use, see [`WordSegmenter`].
pub type WordBreakIteratorUtf8<'l, 's> = WordBreakIterator<'l, 's, WordBreakTypeUtf8>;

/// Word break iterator for a potentially invalid UTF-8 string.
///
/// For examples of use, see [`WordSegmenter`].
pub type WordBreakIteratorPotentiallyIllFormedUtf8<'l, 's> =
    WordBreakIterator<'l, 's, WordBreakTypePotentiallyIllFormedUtf8>;

/// Word break iterator for a Latin-1 (8-bit) string.
///
/// For examples of use, see [`WordSegmenter`].
pub type WordBreakIteratorLatin1<'l, 's> = WordBreakIterator<'l, 's, RuleBreakTypeLatin1>;

/// Word break iterator for a UTF-16 string.
///
/// For examples of use, see [`WordSegmenter`].
pub type WordBreakIteratorUtf16<'l, 's> = WordBreakIterator<'l, 's, WordBreakTypeUtf16>;

/// Supports loading word break data, and creating word break iterators for different string
/// encodings.
///
/// # Examples
///
/// Segment a string:
///
/// ```rust
/// use icu_segmenter::WordSegmenter;
/// let segmenter = WordSegmenter::new_auto();
///
/// let breakpoints: Vec<usize> =
///     segmenter.segment_str("Hello World").collect();
/// assert_eq!(&breakpoints, &[0, 5, 6, 11]);
/// ```
///
/// Segment a Latin1 byte string:
///
/// ```rust
/// use icu_segmenter::WordSegmenter;
/// let segmenter = WordSegmenter::new_auto();
///
/// let breakpoints: Vec<usize> =
///     segmenter.segment_latin1(b"Hello World").collect();
/// assert_eq!(&breakpoints, &[0, 5, 6, 11]);
/// ```
///
/// Successive boundaries can be used to retrieve the segments.
/// In particular, the first boundary is always 0, and the last one is the
/// length of the segmented text in code units.
///
/// ```rust
/// # use icu_segmenter::WordSegmenter;
/// # let segmenter = WordSegmenter::new_auto();
/// use itertools::Itertools;
/// let text = "Mark’d ye his words?";
/// let segments: Vec<&str> = segmenter
///     .segment_str(text)
///     .tuple_windows()
///     .map(|(i, j)| &text[i..j])
///     .collect();
/// assert_eq!(
///     &segments,
///     &["Mark’d", " ", "ye", " ", "his", " ", "words", "?"]
/// );
/// ```
///
/// Not all segments delimited by word boundaries are words; some are interword
/// segments such as spaces and punctuation.
/// The [`WordBreakIterator::word_type()`] of a boundary can be used to
/// classify the preceding segment.
/// ```rust
/// # use itertools::Itertools;
/// # use icu_segmenter::{WordType, WordSegmenter};
/// # let segmenter = WordSegmenter::new_auto();
/// # let text = "Mark’d ye his words?";
/// let words: Vec<&str> = {
///     let mut it = segmenter.segment_str(text);
///     std::iter::from_fn(move || it.next().map(|i| (i, it.word_type())))
///         .tuple_windows()
///         .filter(|(_, (_, status))| *status == WordType::Letter)
///         .map(|((i, _), (j, _))| &text[i..j])
///         .collect()
/// };
/// assert_eq!(&words, &["Mark’d", "ye", "his", "words"]);
/// ```
#[derive(Debug)]
pub struct WordSegmenter {
    // Rule-based word break data.
    payload: DataPayload<WordBreakDataV1Marker>,
    // Dictionary and/or LSTM payloads used for complex-script runs; which
    // models are loaded depends on the constructor used (auto/lstm/dictionary).
    complex: ComplexPayloads,
}

impl WordSegmenter {
    /// Constructs a [`WordSegmenter`] with an invariant locale and the best available compiled data for
    /// complex scripts (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
    ///
    /// The current behavior, which is subject to change, is to use the LSTM model when available
    /// and the dictionary model for Chinese and Japanese.
    ///
    /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    ///
    /// # Examples
    ///
    /// Behavior with complex scripts:
    ///
    /// ```
    /// use icu::segmenter::WordSegmenter;
    ///
    /// let th_str = "ทุกสองสัปดาห์";
    /// let ja_str = "こんにちは世界";
    ///
    /// let segmenter = WordSegmenter::new_auto();
    ///
    /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
    /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
    ///
    /// assert_eq!(th_bps, [0, 9, 18, 39]);
    /// assert_eq!(ja_bps, [0, 15, 21]);
    /// ```
    #[cfg(feature = "compiled_data")]
    #[cfg(feature = "auto")]
    pub fn new_auto() -> Self {
        Self {
            payload: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_SEGMENTER_WORD_V1,
            ),
            complex: ComplexPayloads::new_auto(),
        }
    }

    // Generates the with_any_provider / with_buffer_provider constructor
    // variants listed below from try_new_auto_unstable.
    #[cfg(feature = "auto")]
    icu_provider::gen_any_buffer_data_constructors!(
        locale: skip,
        options: skip,
        error: SegmenterError,
        #[cfg(skip)]
        functions: [
            try_new_auto,
            try_new_auto_with_any_provider,
            try_new_auto_with_buffer_provider,
            try_new_auto_unstable,
            Self
        ]
    );

    #[cfg(feature = "auto")]
    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
    pub fn try_new_auto_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
    where
        D: DataProvider<WordBreakDataV1Marker>
            + DataProvider<DictionaryForWordOnlyAutoV1Marker>
            + DataProvider<LstmForWordLineAutoV1Marker>
            + DataProvider<GraphemeClusterBreakDataV1Marker>
            + ?Sized,
    {
        Ok(Self {
            payload: provider.load(Default::default())?.take_payload()?,
            complex: ComplexPayloads::try_new_auto(provider)?,
        })
    }

    /// Constructs a [`WordSegmenter`] with an invariant locale and compiled LSTM data for
    /// complex scripts (Burmese, Khmer, Lao, and Thai).
    ///
    /// The LSTM, or Long Term Short Memory, is a machine learning model. It is smaller than
    /// the full dictionary but more expensive during segmentation (inference).
    ///
    /// Warning: there is not currently an LSTM model for Chinese or Japanese, so the [`WordSegmenter`]
    /// created by this function will have unexpected behavior in spans of those scripts.
    ///
    /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    ///
    /// # Examples
    ///
    /// Behavior with complex scripts:
    ///
    /// ```
    /// use icu::segmenter::WordSegmenter;
    ///
    /// let th_str = "ทุกสองสัปดาห์";
    /// let ja_str = "こんにちは世界";
    ///
    /// let segmenter = WordSegmenter::new_lstm();
    ///
    /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
    /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
    ///
    /// assert_eq!(th_bps, [0, 9, 18, 39]);
    ///
    /// // Note: We aren't able to find a suitable breakpoint in Chinese/Japanese.
    /// assert_eq!(ja_bps, [0, 21]);
    /// ```
    #[cfg(feature = "compiled_data")]
    #[cfg(feature = "lstm")]
    pub fn new_lstm() -> Self {
        Self {
            payload: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_SEGMENTER_WORD_V1,
            ),
            complex: ComplexPayloads::new_lstm(),
        }
    }

    // Generates the with_any_provider / with_buffer_provider constructor
    // variants listed below from try_new_lstm_unstable.
    #[cfg(feature = "lstm")]
    icu_provider::gen_any_buffer_data_constructors!(
        locale: skip,
        options: skip,
        error: SegmenterError,
        #[cfg(skip)]
        functions: [
            new_lstm,
            try_new_lstm_with_any_provider,
            try_new_lstm_with_buffer_provider,
            try_new_lstm_unstable,
            Self
        ]
    );

    #[cfg(feature = "lstm")]
    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
    pub fn try_new_lstm_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
    where
        D: DataProvider<WordBreakDataV1Marker>
            + DataProvider<LstmForWordLineAutoV1Marker>
            + DataProvider<GraphemeClusterBreakDataV1Marker>
            + ?Sized,
    {
        Ok(Self {
            payload: provider.load(Default::default())?.take_payload()?,
            complex: ComplexPayloads::try_new_lstm(provider)?,
        })
    }

    /// Construct a [`WordSegmenter`] with an invariant locale and compiled dictionary data for
    /// complex scripts (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
    ///
    /// The dictionary model uses a list of words to determine appropriate breakpoints. It is
    /// faster than the LSTM model but requires more data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    ///
    /// # Examples
    ///
    /// Behavior with complex scripts:
    ///
    /// ```
    /// use icu::segmenter::WordSegmenter;
    ///
    /// let th_str = "ทุกสองสัปดาห์";
    /// let ja_str = "こんにちは世界";
    ///
    /// let segmenter = WordSegmenter::new_dictionary();
    ///
    /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
    /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
    ///
    /// assert_eq!(th_bps, [0, 9, 18, 39]);
    /// assert_eq!(ja_bps, [0, 15, 21]);
    /// ```
    #[cfg(feature = "compiled_data")]
    pub fn new_dictionary() -> Self {
        Self {
            payload: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_SEGMENTER_WORD_V1,
            ),
            complex: ComplexPayloads::new_dict(),
        }
    }

    // Generates the with_any_provider / with_buffer_provider constructor
    // variants listed below from try_new_dictionary_unstable.
    icu_provider::gen_any_buffer_data_constructors!(
        locale: skip,
        options: skip,
        error: SegmenterError,
        #[cfg(skip)]
        functions: [
            new_dictionary,
            try_new_dictionary_with_any_provider,
            try_new_dictionary_with_buffer_provider,
            try_new_dictionary_unstable,
            Self
        ]
    );

    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
    pub fn try_new_dictionary_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
    where
        D: DataProvider<WordBreakDataV1Marker>
            + DataProvider<DictionaryForWordOnlyAutoV1Marker>
            + DataProvider<DictionaryForWordLineExtendedV1Marker>
            + DataProvider<GraphemeClusterBreakDataV1Marker>
            + ?Sized,
    {
        Ok(Self {
            payload:
                provider.load(Default::default())?.take_payload()?,
            complex: ComplexPayloads::try_new_dict(provider)?,
        })
    }

    /// Creates a word break iterator for an `str` (a UTF-8 string).
    ///
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
    pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> WordBreakIteratorUtf8<'l, 's> {
        WordBreakIterator(RuleBreakIterator {
            iter: input.char_indices(),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data: self.payload.get(),
            // Unlike sentence segmentation, word breaking delegates
            // complex-script runs to the dictionary/LSTM payloads.
            complex: Some(&self.complex),
            boundary_property: 0,
        })
    }

    /// Creates a word break iterator for a potentially ill-formed UTF8 string
    ///
    /// Invalid characters are treated as REPLACEMENT CHARACTER
    ///
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
    pub fn segment_utf8<'l, 's>(
        &'l self,
        input: &'s [u8],
    ) -> WordBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
        WordBreakIterator(RuleBreakIterator {
            iter: Utf8CharIndices::new(input),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data: self.payload.get(),
            complex: Some(&self.complex),
            boundary_property: 0,
        })
    }

    /// Creates a word break iterator for a Latin-1 (8-bit) string.
    ///
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
    pub fn segment_latin1<'l, 's>(&'l self, input: &'s [u8]) -> WordBreakIteratorLatin1<'l, 's> {
        WordBreakIterator(RuleBreakIterator {
            iter: Latin1Indices::new(input),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data: self.payload.get(),
            complex: Some(&self.complex),
            boundary_property: 0,
        })
    }

    /// Creates a word break iterator for a UTF-16 string.
    ///
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
    pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> WordBreakIteratorUtf16<'l, 's> {
        WordBreakIterator(RuleBreakIterator {
            iter: Utf16Indices::new(input),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data: self.payload.get(),
            complex: Some(&self.complex),
            boundary_property: 0,
        })
    }
}

/// Marker type selecting the well-formed UTF-8 code path of [`RuleBreakIterator`].
#[derive(Debug)]
pub struct WordBreakTypeUtf8;

impl<'l, 's> RuleBreakType<'l, 's> for WordBreakTypeUtf8 {
    type IterAttr = CharIndices<'s>;
    type CharType = char;

    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
        // Length in UTF-8 bytes of the code point at the current position;
        // 0 when there is no current code point.
        iter.get_current_codepoint().map_or(0, |c| c.len_utf8())
    }

    fn handle_complex_language(
        iter: &mut RuleBreakIterator<'l, 's, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize> {
        handle_complex_language_utf8(iter, left_codepoint)
    }
}

/// Marker type selecting the lossy (REPLACEMENT CHARACTER) UTF-8 code path.
#[derive(Debug)]
pub struct WordBreakTypePotentiallyIllFormedUtf8;

impl<'l, 's> RuleBreakType<'l, 's> for WordBreakTypePotentiallyIllFormedUtf8 {
    type IterAttr = Utf8CharIndices<'s>;
    type CharType = char;

    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
        iter.get_current_codepoint().map_or(0, |c| c.len_utf8())
    }

    fn handle_complex_language(
        iter: &mut RuleBreakIterator<'l, 's, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize> {
        handle_complex_language_utf8(iter, left_codepoint)
    }
}

/// handle_complex_language impl for UTF8 iterators
///
/// Shared by the well-formed and potentially-ill-formed UTF-8 break types,
/// which both iterate over `char`s with byte indices.
fn handle_complex_language_utf8<'l, 's, T>(
    iter: &mut RuleBreakIterator<'l, 's, T>,
    left_codepoint: T::CharType,
) -> Option<usize>
where
    T: RuleBreakType<'l, 's, CharType = char>,
{
    // word segmenter doesn't define break rules for some languages such as Thai.
    // Save the iterator state so we can rewind after buffering the whole
    // run of complex-script characters.
    let start_iter = iter.iter.clone();
    let start_point = iter.current_pos_data;
    let mut s = String::new();
    s.push(left_codepoint);
    // Accumulate characters while they carry the "complex" break property.
    loop {
        debug_assert!(!iter.is_eof());
        s.push(iter.get_current_codepoint()?);
        iter.advance_iter();
        if let Some(current_break_property) = iter.get_current_break_property() {
            if current_break_property != iter.data.complex_property {
                break;
            }
        } else {
            // EOF
            break;
        }
    }

    // Restore iterator to move to head of complex string
    iter.iter = start_iter;
    iter.current_pos_data = start_point;
    // Run the dictionary/LSTM segmenter over the buffered run and cache the
    // resulting breakpoints (byte offsets into `s`).
    #[allow(clippy::unwrap_used)] // iter.complex present for word segmenter
    let breaks = complex_language_segment_str(iter.complex.unwrap(), &s);
    iter.result_cache = breaks;
    let first_pos = *iter.result_cache.first()?;
    // Walk forward until the first cached breakpoint, then re-base the rest
    // of the cache relative to it so later calls can consume it directly.
    let mut i = left_codepoint.len_utf8();
    loop {
        if i == first_pos {
            // Re-calculate breaking offset
            iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
            return iter.get_current_position();
        }
        debug_assert!(
            i < first_pos,
            "we should always arrive at first_pos: near index {:?}",
            iter.get_current_position()
        );
        i += T::get_current_position_character_len(iter);
        iter.advance_iter();
        if iter.is_eof() {
            iter.result_cache.clear();
            return Some(iter.len);
        }
    }
}

/// Marker type selecting the UTF-16 code path of [`RuleBreakIterator`].
#[derive(Debug)]
pub struct WordBreakTypeUtf16;

impl<'l, 's> RuleBreakType<'l, 's> for WordBreakTypeUtf16 {
    type IterAttr = Utf16Indices<'s>;
    type CharType = u32;

    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
        // Number of UTF-16 code units: 2 for supplementary-plane code points.
        match iter.get_current_codepoint() {
            None => 0,
            Some(ch) if ch >= 0x10000 => 2,
            _ => 1,
        }
    }

    fn handle_complex_language(
        iter: &mut RuleBreakIterator<Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize> {
        // word segmenter doesn't define break rules for some languages such as Thai.
        // Same save/segment/rewind/re-base scheme as handle_complex_language_utf8,
        // but operating on UTF-16 code units.
        let start_iter = iter.iter.clone();
        let start_point = iter.current_pos_data;
        // NOTE(review): code points are pushed as a single truncated u16, so
        // supplementary-plane characters would be mangled here — presumably the
        // complex scripts reaching this path are BMP-only; confirm upstream.
        let mut s = vec![left_codepoint as u16];
        loop {
            debug_assert!(!iter.is_eof());
            s.push(iter.get_current_codepoint()? as u16);
            iter.advance_iter();
            if let Some(current_break_property) = iter.get_current_break_property() {
                if current_break_property != iter.data.complex_property {
                    break;
                }
            } else {
                // EOF
                break;
            }
        }

        // Restore iterator to move to head of complex string
        iter.iter = start_iter;
        iter.current_pos_data = start_point;
        #[allow(clippy::unwrap_used)] // iter.complex present for word segmenter
        let breaks = complex_language_segment_utf16(iter.complex.unwrap(), &s);
        iter.result_cache = breaks;
        // result_cache vector is utf-16 index that is in BMP.
        let first_pos = *iter.result_cache.first()?;
        let mut i = 1;
        loop {
            if i == first_pos {
                // Re-calculate breaking offset
                iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
                return iter.get_current_position();
            }
            debug_assert!(
                i < first_pos,
                "we should always arrive at first_pos: near index {:?}",
                iter.get_current_position()
            );
            i += 1;
            iter.advance_iter();
            if iter.is_eof() {
                iter.result_cache.clear();
                return Some(iter.len);
            }
        }
    }
}

// Smoke test: an empty string still yields the single mandatory breakpoint at 0.
#[cfg(all(test, feature = "serde"))]
#[test]
fn empty_string() {
    let segmenter = WordSegmenter::new_auto();
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}