/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */

/* Classes to iterate over grapheme, word, sentence, or line. */

#include "mozilla/intl/Segmenter.h"

#include "mozilla/intl/LineBreaker.h"
#include "mozilla/intl/WordBreaker.h"
#include "mozilla/intl/UnicodeProperties.h"
#include "nsUnicodeProperties.h"
#include "nsCharTraits.h"

using namespace mozilla::unicode;

namespace mozilla::intl {

// NOTE(review): the template argument lists in this file (Span<...>,
// Maybe<...>, UniquePtr<...>, Result<...>, MakeUnique<...>) were reconstructed
// from how the values are used below; confirm them against the declarations
// in mozilla/intl/Segmenter.h.

// Stores the UTF-16 text that the iterator walks over.
SegmentIteratorUtf16::SegmentIteratorUtf16(Span<const char16_t> aText)
    : mText(aText) {}

// Moves the current position forward to aPos (the position is left untouched
// when it is already at or beyond aPos) and then returns whatever Next()
// reports from there.
Maybe<uint32_t> SegmentIteratorUtf16::Seek(uint32_t aPos) {
  if (mPos < aPos) {
    mPos = aPos;
  }
  return Next();
}

// Line-break iterator; keeps a copy of the options next to the text.
LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span<const char16_t> aText,
                                               const LineBreakOptions& aOptions)
    : SegmentIteratorUtf16(aText), mOptions(aOptions) {}

// Asks LineBreaker for the next break after the current position. Returns
// Nothing() when LineBreaker reports NS_LINEBREAKER_NEED_MORE_TEXT; otherwise
// the position is updated to the break and returned.
Maybe<uint32_t> LineBreakIteratorUtf16::Next() {
  const int32_t nextPos =
      LineBreaker::Next(mText.Elements(), mText.Length(), mPos);
  if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) {
    return Nothing();
  }
  mPos = nextPos;
  return Some(mPos);
}

// Word-break iterator over aText.
WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span<const char16_t> aText)
    : SegmentIteratorUtf16(aText) {}

// Asks WordBreaker for the next break after the current position. Returns
// Nothing() when WordBreaker reports NS_WORDBREAKER_NEED_MORE_TEXT; otherwise
// the position is updated to the break and returned.
Maybe<uint32_t> WordBreakIteratorUtf16::Next() {
  const int32_t nextPos =
      WordBreaker::Next(mText.Elements(), mText.Length(), mPos);
  if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) {
    return Nothing();
  }
  mPos = nextPos;
  return Some(mPos);
}

// Forward grapheme-cluster iterator over aText.
GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16(
    Span<const char16_t> aText)
    : SegmentIteratorUtf16(aText) {}

// Hangul syllable types, using the same numeric values as ICU's
// UHangulSyllableType so that a property value can be cast directly.
enum HSType {
  HST_NONE = U_HST_NOT_APPLICABLE,
  HST_L = U_HST_LEADING_JAMO,
  HST_V = U_HST_VOWEL_JAMO,
  HST_T = U_HST_TRAILING_JAMO,
  HST_LV = U_HST_LV_SYLLABLE,
  HST_LVT = U_HST_LVT_SYLLABLE
};

// Looks up the Hangul_Syllable_Type property of aCh.
static HSType GetHangulSyllableType(uint32_t aCh) {
  return HSType(UnicodeProperties::GetIntPropertyValue(
      aCh, UnicodeProperties::IntProperty::HangulSyllableType));
}

// Advances past one grapheme cluster and returns the offset just after it, or
// Nothing() when the iterator is already at (or past) the end of the text.
//
// The cluster is built in three steps:
//   1. consume the base code unit (and its low surrogate, if it forms a pair);
//   2. if the base is in one of the Hangul ranges, consume the conjoining Jamo
//      that continue the same syllable;
//   3. consume any following cluster extenders, plus emoji joined to an emoji
//      base through ZWJ.
Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() {
  const auto len = mText.Length();
  if (mPos >= len) {
    // The iterator has already reached the end.
    return Nothing();
  }

  uint32_t ch = mText[mPos++];

  if (mPos < len && NS_IS_SURROGATE_PAIR(ch, mText[mPos])) {
    ch = SURROGATE_TO_UCS4(ch, mText[mPos++]);
  } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) ||
             (ch >= 0xac00 && ch <= 0xd7ff)) {
    // Handle conjoining Jamo that make Hangul syllables.
    // The ranges above are U+1100..U+11FF, U+A960..U+A97F and
    // U+AC00..U+D7FF. hangulState tracks the type of the last unit accepted
    // into the syllable; each following unit is accepted only if it may
    // follow that type.
    HSType hangulState = GetHangulSyllableType(ch);
    while (mPos < len) {
      ch = mText[mPos];
      HSType hangulType = GetHangulSyllableType(ch);
      switch (hangulType) {
        case HST_L:
        case HST_LV:
        case HST_LVT:
          // L, LV and LVT may only follow a leading consonant.
          if (hangulState == HST_L) {
            hangulState = hangulType;
            mPos++;
            continue;
          }
          break;
        case HST_V:
          // A vowel may follow L, V or LV.
          if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
              (hangulState != HST_LVT)) {
            hangulState = hangulType;
            mPos++;
            continue;
          }
          break;
        case HST_T:
          // A trailing consonant may follow V, T, LV or LVT.
          if (hangulState != HST_NONE && hangulState != HST_L) {
            hangulState = hangulType;
            mPos++;
            continue;
          }
          break;
        default:
          break;
      }
      // Reached only when the unit was not accepted: the syllable ends here.
      break;
    }
  }

  const uint32_t kVS16 = 0xfe0f;
  const uint32_t kZWJ = 0x200d;
  // UTF-16 surrogate values for Fitzpatrick type modifiers
  const uint32_t kFitzpatrickHigh = 0xD83C;
  const uint32_t kFitzpatrickLowFirst = 0xDFFB;
  const uint32_t kFitzpatrickLowLast = 0xDFFF;

  // Checking the emoji-presentation property of the base character is a bit
  // expensive, so we do it lazily.
  enum class EmojiStatus : uint8_t {
    No,
    Yes,
    Unknown,
  } baseIsEmojiStatus = EmojiStatus::Unknown;

  // Remember the base character and the position of the next, in case we need
  // to evaluate its emoji status.
  uint32_t baseCh = ch;
  uint32_t afterBase = mPos;

  // True when the two code units starting at aPos are a surrogate pair in the
  // Fitzpatrick modifier range defined by the constants above.
  auto isFitzpatrickModifierAt = [&](uint32_t aPos) -> bool {
    return aPos + 1 < len && mText[aPos] == kFitzpatrickHigh &&
           mText[aPos + 1] >= kFitzpatrickLowFirst &&
           mText[aPos + 1] <= kFitzpatrickLowLast;
  };

  // Computes (once) and returns whether the base character counts as an
  // emoji: either it has default emoji presentation, or it is text-default
  // but immediately followed by VS16 or a Fitzpatrick modifier.
  auto baseIsEmoji = [&]() -> bool {
    if (baseIsEmojiStatus == EmojiStatus::Unknown) {
      auto basePresentation = GetEmojiPresentation(baseCh);
      baseIsEmojiStatus =
          basePresentation == EmojiDefault ||
                  (basePresentation == TextDefault &&
                   ((afterBase < len && mText[afterBase] == kVS16) ||
                    isFitzpatrickModifierAt(afterBase)))
              ? EmojiStatus::Yes
              : EmojiStatus::No;
    }
    return baseIsEmojiStatus == EmojiStatus::Yes;
  };

  bool prevWasZwj = false;

  while (mPos < len) {
    ch = mText[mPos];
    size_t chLen = 1;

    // Check for surrogate pairs; note that isolated surrogates will just
    // be treated as generic (non-cluster-extending) characters here,
    // which is fine for cluster-iterating purposes
    if (mPos < len - 1 && NS_IS_SURROGATE_PAIR(ch, mText[mPos + 1])) {
      ch = SURROGATE_TO_UCS4(ch, mText[mPos + 1]);
      chLen = 2;
    }

    // The character joins the cluster if it is a cluster extender, or if it
    // directly follows a ZWJ, the base is an emoji, and the character itself
    // is an emoji (emoji-default, or text-default followed by VS16).
    bool extendCluster =
        IsClusterExtender(ch) ||
        (prevWasZwj && baseIsEmoji() &&
         ((GetEmojiPresentation(ch) == EmojiDefault) ||
          (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < len &&
           mText[mPos + chLen] == kVS16)));
    if (!extendCluster) {
      break;
    }

    prevWasZwj = (ch == kZWJ);
    mPos += chLen;
  }

  MOZ_ASSERT(mPos <= len, "Next() has overshot the string!");
  return Some(mPos);
}

// Reverse grapheme-cluster iterator; iteration starts at the end of aText.
GraphemeClusterBreakReverseIteratorUtf16::
    GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText)
    : SegmentIteratorUtf16(aText) {
  mPos = mText.Length();
}

// Steps backwards over one cluster and returns the offset of its first code
// unit, or Nothing() when the iterator is already at offset 0. Cluster
// extenders are skipped until a non-extending character (or the start of the
// text) is reached; surrogate pairs are stepped over as a single character.
Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Next() {
  if (mPos == 0) {
    return Nothing();
  }

  uint32_t ch;
  do {
    ch = mText[--mPos];

    if (mPos > 0 && NS_IS_SURROGATE_PAIR(mText[mPos - 1], ch)) {
      ch = SURROGATE_TO_UCS4(mText[--mPos], ch);
    }

    if (!IsClusterExtender(ch)) {
      break;
    }
  } while (mPos > 0);

  // XXX May need to handle conjoining Jamo

  return Some(mPos);
}

// Moves the current position backward to aPos (the position is left untouched
// when it is already at or before aPos) and then returns whatever Next()
// reports from there.
Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Seek(uint32_t aPos) {
  if (mPos > aPos) {
    mPos = aPos;
  }
  return Next();
}

// Creates a Segmenter for the given locale and options. Fails with
// ICUError::InternalError when sentence granularity is requested.
Result<UniquePtr<Segmenter>, ICUError> Segmenter::TryCreate(
    Span<const char> aLocale, const SegmenterOptions& aOptions) {
  if (aOptions.mGranularity == SegmenterGranularity::Sentence) {
    // The sentence iterator is not yet implemented.
    return Err(ICUError::InternalError);
  }
  return MakeUnique<Segmenter>(aLocale, aOptions);
}

// Returns a new iterator over aText matching the configured granularity.
// Sentence granularity is rejected by TryCreate(), so reaching that case here
// is asserted as unreachable and yields nullptr.
UniquePtr<SegmentIteratorUtf16> Segmenter::Segment(
    Span<const char16_t> aText) const {
  switch (mOptions.mGranularity) {
    case SegmenterGranularity::Grapheme:
      return MakeUnique<GraphemeClusterBreakIteratorUtf16>(aText);
    case SegmenterGranularity::Sentence:
      MOZ_ASSERT_UNREACHABLE("Unimplemented yet!");
      return nullptr;
    case SegmenterGranularity::Word:
      return MakeUnique<WordBreakIteratorUtf16>(aText);
    case SegmenterGranularity::Line:
      return MakeUnique<LineBreakIteratorUtf16>(aText);
  }
  MOZ_ASSERT_UNREACHABLE("All granularities must be handled!");
  return nullptr;
}

}  // namespace mozilla::intl