diff options
Diffstat (limited to 'intl/lwbrk/Segmenter.cpp')
-rw-r--r-- | intl/lwbrk/Segmenter.cpp | 545 |
1 files changed, 545 insertions, 0 deletions
diff --git a/intl/lwbrk/Segmenter.cpp b/intl/lwbrk/Segmenter.cpp new file mode 100644 index 0000000000..8cfd179366 --- /dev/null +++ b/intl/lwbrk/Segmenter.cpp @@ -0,0 +1,545 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* Classes to iterate over grapheme, word, sentence, or line. */ + +#include "mozilla/intl/Segmenter.h" + +#include "mozilla/intl/LineBreaker.h" +#include "mozilla/intl/WordBreaker.h" +#include "mozilla/intl/UnicodeProperties.h" +#include "mozilla/StaticPrefs_intl.h" +#include "nsUnicodeProperties.h" +#include "nsCharTraits.h" + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +# include "ICU4XDataProvider.h" +# include "ICU4XGraphemeClusterSegmenter.h" +# include "ICU4XLineSegmenter.h" +# include "ICU4XSentenceSegmenter.h" +# include "ICU4XWordSegmenter.h" +# include "mozilla/ClearOnShutdown.h" +# include "mozilla/intl/ICU4XGeckoDataProvider.h" +# include "nsThreadUtils.h" + +# include <mutex> +#endif + +using namespace mozilla::unicode; + +namespace mozilla::intl { + +SegmentIteratorUtf16::SegmentIteratorUtf16(Span<const char16_t> aText) + : mText(aText) {} + +Maybe<uint32_t> SegmentIteratorUtf16::Seek(uint32_t aPos) { + if (mPos < aPos) { + mPos = aPos; + } + return Next(); +} + +LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span<const char16_t> aText, + const LineBreakOptions& aOptions) + : SegmentIteratorUtf16(aText), mOptions(aOptions) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (!StaticPrefs::intl_icu4x_segmenter_enabled()) { + return; + } + auto result = + capi::ICU4XLineSegmenter_create_auto(mozilla::intl::GetDataProvider()); + MOZ_RELEASE_ASSERT(result.is_ok); + mSegmenter = result.ok; + mIterator = capi::ICU4XLineSegmenter_segment_utf16( + mSegmenter, (const uint16_t*)mText.Elements(), mText.Length()); +#endif +} + +LineBreakIteratorUtf16::~LineBreakIteratorUtf16() { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + capi::ICU4XLineBreakIteratorUtf16_destroy(mIterator); + } + if (mSegmenter) { + capi::ICU4XLineSegmenter_destroy(mSegmenter); + } +#endif +} + +Maybe<uint32_t> LineBreakIteratorUtf16::Next() { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + const int32_t nextPos = capi::ICU4XLineBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + if (!nextPos) { + return Next(); + } + mPos = nextPos; + return Some(mPos); + } +#endif + const int32_t nextPos = + LineBreaker::Next(mText.Elements(), mText.Length(), mPos); + if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) { + return Nothing(); + } + mPos = nextPos; + return Some(mPos); +} + +Maybe<uint32_t> LineBreakIteratorUtf16::Seek(uint32_t aPos) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + if (mPos >= aPos) { + return Next(); + } + + while (mPos < aPos) { + const int32_t nextPos = capi::ICU4XLineBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + mPos = static_cast<uint32_t>(nextPos); + } + + if (aPos < mPos) { + return Some(mPos); + } + + return Next(); + } +#endif + return SegmentIteratorUtf16::Seek(aPos); +} + +WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span<const char16_t> aText) + : SegmentIteratorUtf16(aText) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (!StaticPrefs::intl_icu4x_segmenter_enabled()) { + return; + } + auto result = + capi::ICU4XWordSegmenter_create_auto(mozilla::intl::GetDataProvider()); + MOZ_RELEASE_ASSERT(result.is_ok); + mSegmenter = result.ok; + mIterator = capi::ICU4XWordSegmenter_segment_utf16( + mSegmenter, (const uint16_t*)mText.Elements(), mText.Length()); +#endif +} + +WordBreakIteratorUtf16::~WordBreakIteratorUtf16() { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + capi::ICU4XWordBreakIteratorUtf16_destroy(mIterator); + } + if (mSegmenter) { + capi::ICU4XWordSegmenter_destroy(mSegmenter); + } +#endif +} + +Maybe<uint32_t> WordBreakIteratorUtf16::Next() { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + const int32_t nextPos = capi::ICU4XWordBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + if (!nextPos) { + return Next(); + } + mPos = nextPos; + return Some(mPos); + } +#endif + const int32_t nextPos = + WordBreaker::Next(mText.Elements(), mText.Length(), mPos); + if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) { + return Nothing(); + } + mPos = nextPos; + return Some(mPos); +} + +Maybe<uint32_t> WordBreakIteratorUtf16::Seek(uint32_t aPos) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + if (mPos >= aPos) { + return Next(); + } + + while (mPos < aPos) { + const int32_t nextPos = capi::ICU4XWordBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + mPos = static_cast<uint32_t>(nextPos); + } + + if (aPos < mPos) { + return Some(mPos); + } + + return Next(); + } +#endif + return SegmentIteratorUtf16::Seek(aPos); +} + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +capi::ICU4XGraphemeClusterSegmenter* + GraphemeClusterBreakIteratorUtf16::sSegmenter = nullptr; +#endif + +GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16( + Span<const char16_t> aText) + : SegmentIteratorUtf16(aText) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (!StaticPrefs::intl_icu4x_segmenter_enabled()) { + return; + } + static std::once_flag sOnce; + + std::call_once(sOnce, [] { + auto result = capi::ICU4XGraphemeClusterSegmenter_create( + mozilla::intl::GetDataProvider()); + MOZ_RELEASE_ASSERT(result.is_ok); + sSegmenter = result.ok; + + NS_DispatchToMainThread( + NS_NewRunnableFunction("GraphemeClusterBreakIteratorUtf16", [] { + RunOnShutdown([] { + capi::ICU4XGraphemeClusterSegmenter_destroy(sSegmenter); + sSegmenter = nullptr; + }); + })); + }); + + MOZ_RELEASE_ASSERT(sSegmenter); + mIterator = capi::ICU4XGraphemeClusterSegmenter_segment_utf16( + sSegmenter, (const uint16_t*)mText.Elements(), mText.Length()); +#endif +} + +GraphemeClusterBreakIteratorUtf16::~GraphemeClusterBreakIteratorUtf16() { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + capi::ICU4XGraphemeClusterBreakIteratorUtf16_destroy(mIterator); + } +#endif +} + +enum HSType { + HST_NONE = U_HST_NOT_APPLICABLE, + HST_L = U_HST_LEADING_JAMO, + HST_V = U_HST_VOWEL_JAMO, + HST_T = U_HST_TRAILING_JAMO, + HST_LV = U_HST_LV_SYLLABLE, + HST_LVT = U_HST_LVT_SYLLABLE +}; + +static HSType GetHangulSyllableType(uint32_t aCh) { + return HSType(UnicodeProperties::GetIntPropertyValue( + aCh, UnicodeProperties::IntProperty::HangulSyllableType)); +} + +Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() { + const auto len = mText.Length(); +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + const int32_t nextPos = + capi::ICU4XGraphemeClusterBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + if (!nextPos) { + return Next(); + } + mPos = nextPos; + return Some(mPos); + } +#endif + if (mPos >= len) { + // The iterator has already reached the end. + return Nothing(); + } + + uint32_t ch = mText[mPos++]; + + if (mPos < len && NS_IS_SURROGATE_PAIR(ch, mText[mPos])) { + ch = SURROGATE_TO_UCS4(ch, mText[mPos++]); + } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) || + (ch >= 0xac00 && ch <= 0xd7ff)) { + // Handle conjoining Jamo that make Hangul syllables + HSType hangulState = GetHangulSyllableType(ch); + while (mPos < len) { + ch = mText[mPos]; + HSType hangulType = GetHangulSyllableType(ch); + switch (hangulType) { + case HST_L: + case HST_LV: + case HST_LVT: + if (hangulState == HST_L) { + hangulState = hangulType; + mPos++; + continue; + } + break; + case HST_V: + if ((hangulState != HST_NONE) && (hangulState != HST_T) && + (hangulState != HST_LVT)) { + hangulState = hangulType; + mPos++; + continue; + } + break; + case HST_T: + if (hangulState != HST_NONE && hangulState != HST_L) { + hangulState = hangulType; + mPos++; + continue; + } + break; + default: + break; + } + break; + } + } + + const uint32_t kVS16 = 0xfe0f; + const uint32_t kZWJ = 0x200d; + // UTF-16 surrogate values for Fitzpatrick type modifiers + const uint32_t kFitzpatrickHigh = 0xD83C; + const uint32_t kFitzpatrickLowFirst = 0xDFFB; + const uint32_t kFitzpatrickLowLast = 0xDFFF; + + // Checking the emoji-presentation property of the base character is a bit + // expensive, so we do it lazily. + enum class EmojiStatus : uint8_t { + No, + Yes, + Unknown, + } baseIsEmojiStatus = EmojiStatus::Unknown; + + // Remember the base character and the position of the next, in case we need + // to evaluate its emoji status. + uint32_t baseCh = ch; + uint32_t afterBase = mPos; + + auto isFitzpatrickModifierAt = [&](uint32_t aPos) -> bool { + return aPos + 1 < len && mText[aPos] == kFitzpatrickHigh && + mText[aPos + 1] >= kFitzpatrickLowFirst && + mText[aPos + 1] <= kFitzpatrickLowLast; + }; + + auto baseIsEmoji = [&]() -> bool { + if (baseIsEmojiStatus == EmojiStatus::Unknown) { + auto basePresentation = GetEmojiPresentation(baseCh); + baseIsEmojiStatus = + basePresentation == EmojiDefault || + (basePresentation == TextDefault && + ((afterBase < len && mText[afterBase] == kVS16) || + isFitzpatrickModifierAt(afterBase))) + ? EmojiStatus::Yes + : EmojiStatus::No; + } + return baseIsEmojiStatus == EmojiStatus::Yes; + }; + + bool prevWasZwj = false; + + while (mPos < len) { + ch = mText[mPos]; + size_t chLen = 1; + + // Check for surrogate pairs; note that isolated surrogates will just + // be treated as generic (non-cluster-extending) characters here, + // which is fine for cluster-iterating purposes + if (mPos < len - 1 && NS_IS_SURROGATE_PAIR(ch, mText[mPos + 1])) { + ch = SURROGATE_TO_UCS4(ch, mText[mPos + 1]); + chLen = 2; + } + + bool extendCluster = + IsClusterExtender(ch) || + (prevWasZwj && baseIsEmoji() && + ((GetEmojiPresentation(ch) == EmojiDefault) || + (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < len && + mText[mPos + chLen] == kVS16))); + if (!extendCluster) { + break; + } + + prevWasZwj = (ch == kZWJ); + mPos += chLen; + } + + MOZ_ASSERT(mPos <= len, "Next() has overshot the string!"); + return Some(mPos); +} + +Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Seek(uint32_t aPos) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + if (mPos >= aPos) { + return Next(); + } + + while (mPos < aPos) { + const int32_t nextPos = + capi::ICU4XGraphemeClusterBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + mPos = static_cast<uint32_t>(nextPos); + } + + if (aPos < mPos) { + return Some(mPos); + } + + return Next(); + } +#endif + return SegmentIteratorUtf16::Seek(aPos); +} + +GraphemeClusterBreakReverseIteratorUtf16:: + GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText) + : SegmentIteratorUtf16(aText) { + mPos = mText.Length(); +} + +Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Next() { + if (mPos == 0) { + return Nothing(); + } + + uint32_t ch; + do { + ch = mText[--mPos]; + + if (mPos > 0 && NS_IS_SURROGATE_PAIR(mText[mPos - 1], ch)) { + ch = SURROGATE_TO_UCS4(mText[--mPos], ch); + } + + if (!IsClusterExtender(ch)) { + break; + } + } while (mPos > 0); + + // XXX May need to handle conjoining Jamo + + return Some(mPos); +} + +Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Seek(uint32_t aPos) { + if (mPos > aPos) { + mPos = aPos; + } + return Next(); +} + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +SentenceBreakIteratorUtf16::SentenceBreakIteratorUtf16( + Span<const char16_t> aText) + : SegmentIteratorUtf16(aText) { + auto result = + capi::ICU4XSentenceSegmenter_create(mozilla::intl::GetDataProvider()); + MOZ_RELEASE_ASSERT(result.is_ok); + mSegmenter = result.ok; + mIterator = capi::ICU4XSentenceSegmenter_segment_utf16( + mSegmenter, (const uint16_t*)mText.Elements(), mText.Length()); +} + +SentenceBreakIteratorUtf16::~SentenceBreakIteratorUtf16() { + if (mIterator) { + capi::ICU4XSentenceBreakIteratorUtf16_destroy(mIterator); + } + if (mSegmenter) { + capi::ICU4XSentenceSegmenter_destroy(mSegmenter); + } +} + +Maybe<uint32_t> SentenceBreakIteratorUtf16::Seek(uint32_t aPos) { + if (!mIterator) { + return Nothing(); + } + + if (mPos >= aPos) { + return Next(); + } + + while (mPos < aPos) { + const int32_t nextPos = + capi::ICU4XSentenceBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + mPos = static_cast<uint32_t>(nextPos); + } + + if (aPos < mPos) { + return Some(mPos); + } + + return Next(); +} + +Maybe<uint32_t> SentenceBreakIteratorUtf16::Next() { + if (!mIterator) { + return Nothing(); + } + + const int32_t nextPos = capi::ICU4XSentenceBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + if (!nextPos) { + return Next(); + } + mPos = nextPos; + return Some(mPos); +} +#endif + +Result<UniquePtr<Segmenter>, ICUError> Segmenter::TryCreate( + Span<const char> aLocale, const SegmenterOptions& aOptions) { +#if !defined(MOZ_ICU4X) || !defined(JS_HAS_INTL_API) + if (aOptions.mGranularity == SegmenterGranularity::Sentence) { + // Grapheme and Sentence iterator are not yet implemented. + return Err(ICUError::InternalError); + } +#endif + return MakeUnique<Segmenter>(aLocale, aOptions); +} + +UniquePtr<SegmentIteratorUtf16> Segmenter::Segment( + Span<const char16_t> aText) const { + switch (mOptions.mGranularity) { + case SegmenterGranularity::Grapheme: + return MakeUnique<GraphemeClusterBreakIteratorUtf16>(aText); + case SegmenterGranularity::Sentence: +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (StaticPrefs::intl_icu4x_segmenter_enabled()) { + return MakeUnique<SentenceBreakIteratorUtf16>(aText); + } +#endif + MOZ_ASSERT_UNREACHABLE("Unimplemented yet!"); + return nullptr; + case SegmenterGranularity::Word: + return MakeUnique<WordBreakIteratorUtf16>(aText); + case SegmenterGranularity::Line: + return MakeUnique<LineBreakIteratorUtf16>(aText); + } + MOZ_ASSERT_UNREACHABLE("All granularities must be handled!"); + return nullptr; +} + +} // namespace mozilla::intl |