diff options
Diffstat (limited to 'intl/icu_capi/src/segmenter_word.rs')
-rw-r--r-- | intl/icu_capi/src/segmenter_word.rs | 213 |
1 files changed, 213 insertions, 0 deletions
diff --git a/intl/icu_capi/src/segmenter_word.rs b/intl/icu_capi/src/segmenter_word.rs new file mode 100644 index 0000000000..226bcdd334 --- /dev/null +++ b/intl/icu_capi/src/segmenter_word.rs @@ -0,0 +1,213 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +#[diplomat::bridge] +pub mod ffi { + use crate::errors::ffi::ICU4XError; + use crate::provider::ffi::ICU4XDataProvider; + use alloc::boxed::Box; + use core::convert::TryFrom; + use icu_segmenter::{ + WordBreakIteratorLatin1, WordBreakIteratorPotentiallyIllFormedUtf8, WordBreakIteratorUtf16, + WordSegmenter, WordType, + }; + + #[diplomat::enum_convert(WordType, needs_wildcard)] + #[diplomat::rust_link(icu::segmenter::WordType, Enum)] + pub enum ICU4XSegmenterWordType { + None = 0, + Number = 1, + Letter = 2, + } + + #[diplomat::opaque] + /// An ICU4X word-break segmenter, capable of finding word breakpoints in strings. + #[diplomat::rust_link(icu::segmenter::WordSegmenter, Struct)] + pub struct ICU4XWordSegmenter(WordSegmenter); + + #[diplomat::opaque] + #[diplomat::rust_link(icu::segmenter::WordBreakIterator, Struct)] + #[diplomat::rust_link( + icu::segmenter::WordBreakIteratorPotentiallyIllFormedUtf8, + Typedef, + hidden + )] + pub struct ICU4XWordBreakIteratorUtf8<'a>(WordBreakIteratorPotentiallyIllFormedUtf8<'a, 'a>); + + #[diplomat::opaque] + #[diplomat::rust_link(icu::segmenter::WordBreakIterator, Struct)] + #[diplomat::rust_link(icu::segmenter::WordBreakIteratorUtf16, Typedef, hidden)] + pub struct ICU4XWordBreakIteratorUtf16<'a>(WordBreakIteratorUtf16<'a, 'a>); + + #[diplomat::opaque] + #[diplomat::rust_link(icu::segmenter::WordBreakIterator, Struct)] + #[diplomat::rust_link(icu::segmenter::WordBreakIteratorLatin1, Typedef, hidden)] + pub struct ICU4XWordBreakIteratorLatin1<'a>(WordBreakIteratorLatin1<'a, 'a>); + + impl ICU4XWordSegmenter { + /// Construct an [`ICU4XWordSegmenter`] with automatically selecting the best available LSTM + /// or dictionary payload data. + /// + /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese, + /// Khmer, Lao, and Thai. + #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_auto, FnInStruct)] + pub fn create_auto( + provider: &ICU4XDataProvider, + ) -> Result<Box<ICU4XWordSegmenter>, ICU4XError> { + Ok(Box::new(ICU4XWordSegmenter(call_constructor!( + WordSegmenter::new_auto [r => Ok(r)], + WordSegmenter::try_new_auto_with_any_provider, + WordSegmenter::try_new_auto_with_buffer_provider, + provider + )?))) + } + + /// Construct an [`ICU4XWordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and + /// Thai. + /// + /// Warning: [`ICU4XWordSegmenter`] created by this function doesn't handle Chinese or + /// Japanese. + #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_lstm, FnInStruct)] + pub fn create_lstm( + provider: &ICU4XDataProvider, + ) -> Result<Box<ICU4XWordSegmenter>, ICU4XError> { + Ok(Box::new(ICU4XWordSegmenter(call_constructor!( + WordSegmenter::new_lstm [r => Ok(r)], + WordSegmenter::try_new_lstm_with_any_provider, + WordSegmenter::try_new_lstm_with_buffer_provider, + provider, + )?))) + } + + /// Construct an [`ICU4XWordSegmenter`] with dictionary payload data for Chinese, Japanese, + /// Burmese, Khmer, Lao, and Thai. + #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_dictionary, FnInStruct)] + pub fn create_dictionary( + provider: &ICU4XDataProvider, + ) -> Result<Box<ICU4XWordSegmenter>, ICU4XError> { + Ok(Box::new(ICU4XWordSegmenter(call_constructor!( + WordSegmenter::new_dictionary [r => Ok(r)], + WordSegmenter::try_new_dictionary_with_any_provider, + WordSegmenter::try_new_dictionary_with_buffer_provider, + provider, + )?))) + } + + /// Segments a (potentially ill-formed) UTF-8 string. + #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_utf8, FnInStruct)] + #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_str, FnInStruct, hidden)] + pub fn segment_utf8<'a>(&'a self, input: &'a str) -> Box<ICU4XWordBreakIteratorUtf8<'a>> { + let input = input.as_bytes(); // #2520 + Box::new(ICU4XWordBreakIteratorUtf8(self.0.segment_utf8(input))) + } + + /// Segments a UTF-16 string. + #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_utf16, FnInStruct)] + pub fn segment_utf16<'a>( + &'a self, + input: &'a [u16], + ) -> Box<ICU4XWordBreakIteratorUtf16<'a>> { + Box::new(ICU4XWordBreakIteratorUtf16(self.0.segment_utf16(input))) + } + + /// Segments a Latin-1 string. + #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_latin1, FnInStruct)] + pub fn segment_latin1<'a>( + &'a self, + input: &'a [u8], + ) -> Box<ICU4XWordBreakIteratorLatin1<'a>> { + Box::new(ICU4XWordBreakIteratorLatin1(self.0.segment_latin1(input))) + } + } + + impl<'a> ICU4XWordBreakIteratorUtf8<'a> { + /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is + /// out of range of a 32-bit signed integer. + #[allow(clippy::should_implement_trait)] + #[diplomat::rust_link(icu::segmenter::WordBreakIterator::next, FnInStruct)] + #[diplomat::rust_link( + icu::segmenter::WordBreakIterator::Item, + AssociatedTypeInStruct, + hidden + )] + pub fn next(&mut self) -> i32 { + self.0 + .next() + .and_then(|u| i32::try_from(u).ok()) + .unwrap_or(-1) + } + + /// Return the status value of break boundary. + #[diplomat::rust_link(icu::segmenter::WordBreakIterator::word_type, FnInStruct)] + pub fn word_type(&self) -> ICU4XSegmenterWordType { + self.0.word_type().into() + } + + /// Return true when break boundary is word-like such as letter/number/CJK + #[diplomat::rust_link(icu::segmenter::WordBreakIterator::is_word_like, FnInStruct)] + pub fn is_word_like(&self) -> bool { + self.0.is_word_like() + } + } + + impl<'a> ICU4XWordBreakIteratorUtf16<'a> { + /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is + /// out of range of a 32-bit signed integer. + #[allow(clippy::should_implement_trait)] + #[diplomat::rust_link(icu::segmenter::WordBreakIterator::next, FnInStruct)] + #[diplomat::rust_link( + icu::segmenter::WordBreakIterator::Item, + AssociatedTypeInStruct, + hidden + )] + pub fn next(&mut self) -> i32 { + self.0 + .next() + .and_then(|u| i32::try_from(u).ok()) + .unwrap_or(-1) + } + + /// Return the status value of break boundary. + #[diplomat::rust_link(icu::segmenter::WordBreakIterator::word_type, FnInStruct)] + pub fn word_type(&self) -> ICU4XSegmenterWordType { + self.0.word_type().into() + } + + /// Return true when break boundary is word-like such as letter/number/CJK + #[diplomat::rust_link(icu::segmenter::WordBreakIterator::is_word_like, FnInStruct)] + pub fn is_word_like(&self) -> bool { + self.0.is_word_like() + } + } + + impl<'a> ICU4XWordBreakIteratorLatin1<'a> { + /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is + /// out of range of a 32-bit signed integer. + #[allow(clippy::should_implement_trait)] + #[diplomat::rust_link(icu::segmenter::WordBreakIterator::next, FnInStruct)] + #[diplomat::rust_link( + icu::segmenter::WordBreakIterator::Item, + AssociatedTypeInStruct, + hidden + )] + pub fn next(&mut self) -> i32 { + self.0 + .next() + .and_then(|u| i32::try_from(u).ok()) + .unwrap_or(-1) + } + + /// Return the status value of break boundary. + #[diplomat::rust_link(icu::segmenter::WordBreakIterator::word_type, FnInStruct)] + pub fn word_type(&self) -> ICU4XSegmenterWordType { + self.0.word_type().into() + } + + /// Return true when break boundary is word-like such as letter/number/CJK + #[diplomat::rust_link(icu::segmenter::WordBreakIterator::is_word_like, FnInStruct)] + pub fn is_word_like(&self) -> bool { + self.0.is_word_like() + } + } +} |