summaryrefslogtreecommitdiffstats
path: root/intl/lwbrk/Segmenter.h
diff options
context:
space:
mode:
Diffstat (limited to 'intl/lwbrk/Segmenter.h')
-rw-r--r--intl/lwbrk/Segmenter.h231
1 files changed, 231 insertions, 0 deletions
diff --git a/intl/lwbrk/Segmenter.h b/intl/lwbrk/Segmenter.h
new file mode 100644
index 0000000000..a3233dc8ed
--- /dev/null
+++ b/intl/lwbrk/Segmenter.h
@@ -0,0 +1,231 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* Classes to iterate over grapheme, word, sentence, or line. */
+
+#ifndef intl_components_Segmenter_h_
+#define intl_components_Segmenter_h_
+
+#include "mozilla/intl/ICUError.h"
+#include "mozilla/Maybe.h"
+#include "mozilla/Result.h"
+#include "mozilla/Span.h"
+#include "mozilla/UniquePtr.h"
+
+#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
+namespace capi {
+struct ICU4XLineSegmenter;
+struct ICU4XLineBreakIteratorUtf16;
+struct ICU4XWordSegmenter;
+struct ICU4XWordBreakIteratorUtf16;
+struct ICU4XGraphemeClusterSegmenter;
+struct ICU4XGraphemeClusterBreakIteratorUtf16;
+struct ICU4XSentenceSegmenter;
+struct ICU4XSentenceBreakIteratorUtf16;
+} // namespace capi
+#endif
+
+namespace mozilla::intl {
+
+enum class SegmenterGranularity : uint8_t {
+ Grapheme,
+ Word,
+ Sentence,
+ Line,
+};
+
+struct SegmenterOptions final {
+ SegmenterGranularity mGranularity = SegmenterGranularity::Grapheme;
+};
+
+/**
+ * Interface of segment iterators. Subclass this class to implement iterator for
+ * UTF-16 text.
+ */
+class SegmentIteratorUtf16 {
+ public:
+ virtual ~SegmentIteratorUtf16() = default;
+
+ // Disable copy or move semantics. Move semantic could be enabled in the
+ // future if needed.
+ SegmentIteratorUtf16(SegmentIteratorUtf16&&) = delete;
+ SegmentIteratorUtf16& operator=(SegmentIteratorUtf16&&) = delete;
+ SegmentIteratorUtf16(const SegmentIteratorUtf16&) = delete;
+ SegmentIteratorUtf16& operator=(const SegmentIteratorUtf16&) = delete;
+
+ /**
+ * Advance the iterator to the next break position.
+ *
+ * @return the break position. If there's no further break position, return
+ * Nothing().
+ */
+ virtual Maybe<uint32_t> Next() = 0;
+
+ /**
+ * Advance the iterator to the first break position following the specified
+ * position aPos.
+ *
+ * Note: if this iterator's current position is already >= aPos, this method
+ * behaves the same as Next().
+ */
+ virtual Maybe<uint32_t> Seek(uint32_t aPos);
+
+ protected:
+ explicit SegmentIteratorUtf16(Span<const char16_t> aText);
+
+ // The text to iterate over.
+ Span<const char16_t> mText;
+
+ // The current break position within mText.
+ uint32_t mPos = 0;
+};
+
+// Each enum value has the same meaning with respect to the `word-break`
+// property values in the CSS Text spec. See the details in
+// https://drafts.csswg.org/css-text-3/#word-break-property
+enum class WordBreakRule : uint8_t {
+ Normal = 0,
+ BreakAll,
+ KeepAll,
+};
+
+// Each enum value has the same meaning with respect to the `line-break`
+// property values in the CSS Text spec. See the details in
+// https://drafts.csswg.org/css-text-3/#line-break-property.
+enum class LineBreakRule : uint8_t {
+ Auto = 0,
+ Loose,
+ Normal,
+ Strict,
+ Anywhere,
+};
+
+// Extra options for line break iterator.
+struct LineBreakOptions final {
+ WordBreakRule mWordBreakRule = WordBreakRule::Normal;
+ LineBreakRule mLineBreakRule = LineBreakRule::Auto;
+ bool mScriptIsChineseOrJapanese = false;
+};
+
+/**
+ * Line break iterator for UTF-16 text.
+ */
+class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
+ public:
+ explicit LineBreakIteratorUtf16(Span<const char16_t> aText,
+ const LineBreakOptions& aOptions = {});
+ ~LineBreakIteratorUtf16() override;
+
+ Maybe<uint32_t> Next() override;
+ Maybe<uint32_t> Seek(uint32_t aPos) override;
+
+ private:
+ LineBreakOptions mOptions;
+
+#ifdef MOZ_ICU4X
+ capi::ICU4XLineSegmenter* mSegmenter = nullptr;
+ capi::ICU4XLineBreakIteratorUtf16* mIterator = nullptr;
+#endif
+};
+
+/**
+ * Word break iterator for UTF-16 text.
+ */
+class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
+ public:
+ explicit WordBreakIteratorUtf16(Span<const char16_t> aText);
+ ~WordBreakIteratorUtf16() override;
+
+ Maybe<uint32_t> Next() override;
+ Maybe<uint32_t> Seek(uint32_t aPos) override;
+
+#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
+ private:
+ capi::ICU4XWordSegmenter* mSegmenter = nullptr;
+ capi::ICU4XWordBreakIteratorUtf16* mIterator = nullptr;
+#endif
+};
+
+/**
+ * Grapheme cluster break iterator for UTF-16 text.
+ */
+class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
+ public:
+ explicit GraphemeClusterBreakIteratorUtf16(Span<const char16_t> aText);
+ ~GraphemeClusterBreakIteratorUtf16() override;
+
+ Maybe<uint32_t> Next() override;
+ Maybe<uint32_t> Seek(uint32_t aPos) override;
+
+#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
+ private:
+ static capi::ICU4XGraphemeClusterSegmenter* sSegmenter;
+ capi::ICU4XGraphemeClusterBreakIteratorUtf16* mIterator = nullptr;
+#endif
+};
+
+/**
+ * Grapheme cluster break reverse iterator for UTF-16 text.
+ *
+ * Note: The reverse iterator doesn't handle conjoining Jamo and emoji. Use it
+ * at your own risk.
+ */
+class GraphemeClusterBreakReverseIteratorUtf16 final
+ : public SegmentIteratorUtf16 {
+ public:
+ explicit GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText);
+
+ Maybe<uint32_t> Next() override;
+ Maybe<uint32_t> Seek(uint32_t aPos) override;
+};
+
+#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
+/**
+ * Sentence break iterator for UTF-16 text.
+ */
+class SentenceBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
+ public:
+ explicit SentenceBreakIteratorUtf16(Span<const char16_t> aText);
+ ~SentenceBreakIteratorUtf16() override;
+
+ Maybe<uint32_t> Next() override;
+ Maybe<uint32_t> Seek(uint32_t aPos) override;
+
+ private:
+ capi::ICU4XSentenceSegmenter* mSegmenter = nullptr;
+ capi::ICU4XSentenceBreakIteratorUtf16* mIterator = nullptr;
+};
+#endif
+
+/**
+ * This component is a Mozilla-focused API for working with segmenters in
+ * internationalization code.
+ *
+ * This is a factor class. Calling Segment() to create an iterator over a text
+ * of given granularity.
+ */
+class Segmenter final {
+ public:
+ // NOTE: aLocale is a no-op currently.
+ static Result<UniquePtr<Segmenter>, ICUError> TryCreate(
+ Span<const char> aLocale, const SegmenterOptions& aOptions);
+
+ explicit Segmenter(Span<const char> aLocale, const SegmenterOptions& aOptions)
+ : mOptions(aOptions) {}
+
+ // Creates an iterator over aText of a given granularity in mOptions.
+ UniquePtr<SegmentIteratorUtf16> Segment(Span<const char16_t> aText) const;
+
+ // TODO: Implement an iterator for Latin1 text.
+ // UniquePtr<SegmentIteratorLatin1> Segment(Span<const uint8_t> aText) const;
+
+ private:
+ SegmenterOptions mOptions;
+};
+
+} // namespace mozilla::intl
+
+#endif