1 files changed, 260 insertions, 0 deletions
diff --git a/intl/lwbrk/Segmenter.cpp b/intl/lwbrk/Segmenter.cpp
new file mode 100644
index 0000000000..53d87336a3
--- /dev/null
+++ b/intl/lwbrk/Segmenter.cpp
@@ -0,0 +1,260 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* Classes to iterate over grapheme, word, sentence, or line. */
+
+#include "mozilla/intl/Segmenter.h"
+
+#include "mozilla/intl/LineBreaker.h"
+#include "mozilla/intl/WordBreaker.h"
+#include "mozilla/intl/UnicodeProperties.h"
+#include "nsUnicodeProperties.h"
+#include "nsCharTraits.h"
+
+using namespace mozilla::unicode;
+
+namespace mozilla::intl {
+
+SegmentIteratorUtf16::SegmentIteratorUtf16(Span<const char16_t> aText)
+    : mText(aText) {}
+
+Maybe<uint32_t> SegmentIteratorUtf16::Seek(uint32_t aPos) {
+  if (mPos < aPos) {
+    mPos = aPos;
+  }
+  return Next();
+}
+
+LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span<const char16_t> aText,
+                                               const LineBreakOptions& aOptions)
+    : SegmentIteratorUtf16(aText), mOptions(aOptions) {}
+
+Maybe<uint32_t> LineBreakIteratorUtf16::Next() {
+  const int32_t nextPos =
+      LineBreaker::Next(mText.Elements(), mText.Length(), mPos);
+  if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) {
+    return Nothing();
+  }
+  mPos = nextPos;
+  return Some(mPos);
+}
+
+WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span<const char16_t> aText)
+    : SegmentIteratorUtf16(aText) {}
+
+Maybe<uint32_t> WordBreakIteratorUtf16::Next() {
+  const int32_t nextPos =
+      WordBreaker::Next(mText.Elements(), mText.Length(), mPos);
+  if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) {
+    return Nothing();
+  }
+  mPos = nextPos;
+  return Some(mPos);
+}
+
+GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16(
+    Span<const char16_t> aText)
+    : SegmentIteratorUtf16(aText) {}
+
+enum HSType {
+  HST_NONE = U_HST_NOT_APPLICABLE,
+  HST_L = U_HST_LEADING_JAMO,
+  HST_V = U_HST_VOWEL_JAMO,
+  HST_T = U_HST_TRAILING_JAMO,
+  HST_LV = U_HST_LV_SYLLABLE,
+  HST_LVT = U_HST_LVT_SYLLABLE
+};
+
+static HSType GetHangulSyllableType(uint32_t aCh) {
+  return HSType(UnicodeProperties::GetIntPropertyValue(
+      aCh, UnicodeProperties::IntProperty::HangulSyllableType));
+}
+
+Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() {
+  const auto len = mText.Length();
+  if (mPos >= len) {
+    // The iterator has already reached the end.
+    return Nothing();
+  }
+
+  uint32_t ch = mText[mPos++];
+
+  if (mPos < len && NS_IS_SURROGATE_PAIR(ch, mText[mPos])) {
+    ch = SURROGATE_TO_UCS4(ch, mText[mPos++]);
+  } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) ||
+             (ch >= 0xac00 && ch <= 0xd7ff)) {
+    // Handle conjoining Jamo that make Hangul syllables
+    HSType hangulState = GetHangulSyllableType(ch);
+    while (mPos < len) {
+      ch = mText[mPos];
+      HSType hangulType = GetHangulSyllableType(ch);
+      switch (hangulType) {
+        case HST_L:
+        case HST_LV:
+        case HST_LVT:
+          if (hangulState == HST_L) {
+            hangulState = hangulType;
+            mPos++;
+            continue;
+          }
+          break;
+        case HST_V:
+          if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
+              (hangulState != HST_LVT)) {
+            hangulState = hangulType;
+            mPos++;
+            continue;
+          }
+          break;
+        case HST_T:
+          if (hangulState != HST_NONE && hangulState != HST_L) {
+            hangulState = hangulType;
+            mPos++;
+            continue;
+          }
+          break;
+        default:
+          break;
+      }
+      break;
+    }
+  }
+
+  const uint32_t kVS16 = 0xfe0f;
+  const uint32_t kZWJ = 0x200d;
+  // UTF-16 surrogate values for Fitzpatrick type modifiers
+  const uint32_t kFitzpatrickHigh = 0xD83C;
+  const uint32_t kFitzpatrickLowFirst = 0xDFFB;
+  const uint32_t kFitzpatrickLowLast = 0xDFFF;
+
+  // Checking the emoji-presentation property of the base character is a bit
+  // expensive, so we do it lazily.
+  enum class EmojiStatus : uint8_t {
+    No,
+    Yes,
+    Unknown,
+  } baseIsEmojiStatus = EmojiStatus::Unknown;
+
+  // Remember the base character and the position of the next, in case we need
+  // to evaluate its emoji status.
+  uint32_t baseCh = ch;
+  uint32_t afterBase = mPos;
+
+  auto isFitzpatrickModifierAt = [&](uint32_t aPos) -> bool {
+    return aPos + 1 < len && mText[aPos] == kFitzpatrickHigh &&
+           mText[aPos + 1] >= kFitzpatrickLowFirst &&
+           mText[aPos + 1] <= kFitzpatrickLowLast;
+  };
+
+  auto baseIsEmoji = [&]() -> bool {
+    if (baseIsEmojiStatus == EmojiStatus::Unknown) {
+      auto basePresentation = GetEmojiPresentation(baseCh);
+      baseIsEmojiStatus =
+          basePresentation == EmojiDefault ||
+                  (basePresentation == TextDefault &&
+                   ((afterBase < len && mText[afterBase] == kVS16) ||
+                    isFitzpatrickModifierAt(afterBase)))
+              ? EmojiStatus::Yes
+              : EmojiStatus::No;
+    }
+    return baseIsEmojiStatus == EmojiStatus::Yes;
+  };
+
+  bool prevWasZwj = false;
+
+  while (mPos < len) {
+    ch = mText[mPos];
+    size_t chLen = 1;
+
+    // Check for surrogate pairs; note that isolated surrogates will just
+    // be treated as generic (non-cluster-extending) characters here,
+    // which is fine for cluster-iterating purposes
+    if (mPos < len - 1 && NS_IS_SURROGATE_PAIR(ch, mText[mPos + 1])) {
+      ch = SURROGATE_TO_UCS4(ch, mText[mPos + 1]);
+      chLen = 2;
+    }
+
+    bool extendCluster =
+        IsClusterExtender(ch) ||
+        (prevWasZwj && baseIsEmoji() &&
+         ((GetEmojiPresentation(ch) == EmojiDefault) ||
+          (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < len &&
+           mText[mPos + chLen] == kVS16)));
+    if (!extendCluster) {
+      break;
+    }
+
+    prevWasZwj = (ch == kZWJ);
+    mPos += chLen;
+  }
+
+  MOZ_ASSERT(mPos <= len, "Next() has overshot the string!");
+  return Some(mPos);
+}
+
+GraphemeClusterBreakReverseIteratorUtf16::
+    GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText)
+    : SegmentIteratorUtf16(aText) {
+  mPos = mText.Length();
+}
+
+Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Next() {
+  if (mPos == 0) {
+    return Nothing();
+  }
+
+  uint32_t ch;
+  do {
+    ch = mText[--mPos];
+
+    if (mPos > 0 && NS_IS_SURROGATE_PAIR(mText[mPos - 1], ch)) {
+      ch = SURROGATE_TO_UCS4(mText[--mPos], ch);
+    }
+
+    if (!IsClusterExtender(ch)) {
+      break;
+    }
+  } while (mPos > 0);
+
+  // XXX May need to handle conjoining Jamo
+
+  return Some(mPos);
+}
+
+Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Seek(uint32_t aPos) {
+  if (mPos > aPos) {
+    mPos = aPos;
+  }
+  return Next();
+}
+
+Result<UniquePtr<Segmenter>, ICUError> Segmenter::TryCreate(
+    Span<const char> aLocale, const SegmenterOptions& aOptions) {
+  if (aOptions.mGranularity == SegmenterGranularity::Sentence) {
+    // Grapheme and Sentence iterator are not yet implemented.
+    return Err(ICUError::InternalError);
+  }
+  return MakeUnique<Segmenter>(aLocale, aOptions);
+}
+
+UniquePtr<SegmentIteratorUtf16> Segmenter::Segment(
+    Span<const char16_t> aText) const {
+  switch (mOptions.mGranularity) {
+    case SegmenterGranularity::Grapheme:
+      return MakeUnique<GraphemeClusterBreakIteratorUtf16>(aText);
+    case SegmenterGranularity::Sentence:
+      MOZ_ASSERT_UNREACHABLE("Unimplemented yet!");
+      return nullptr;
+    case SegmenterGranularity::Word:
+      return MakeUnique<WordBreakIteratorUtf16>(aText);
+    case SegmenterGranularity::Line:
+      return MakeUnique<LineBreakIteratorUtf16>(aText);
+  }
+  MOZ_ASSERT_UNREACHABLE("All granularities must be handled!");
+  return nullptr;
+}
+
+}  // namespace mozilla::intl