/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim: set ts=8 sts=2 et sw=2 tw=80: */ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this file, * You can obtain one at http://mozilla.org/MPL/2.0/. */ /* Classes to iterate over grapheme, word, sentence, or line. */ #ifndef intl_components_Segmenter_h_ #define intl_components_Segmenter_h_ #include "mozilla/intl/ICUError.h" #include "mozilla/Maybe.h" #include "mozilla/Result.h" #include "mozilla/Span.h" #include "mozilla/UniquePtr.h" #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) namespace capi { struct ICU4XLineSegmenter; struct ICU4XLineBreakIteratorUtf16; struct ICU4XWordSegmenter; struct ICU4XWordBreakIteratorUtf16; struct ICU4XGraphemeClusterSegmenter; struct ICU4XGraphemeClusterBreakIteratorUtf16; struct ICU4XSentenceSegmenter; struct ICU4XSentenceBreakIteratorUtf16; } // namespace capi #endif namespace mozilla::intl { enum class SegmenterGranularity : uint8_t { Grapheme, Word, Sentence, Line, }; struct SegmenterOptions final { SegmenterGranularity mGranularity = SegmenterGranularity::Grapheme; }; /** * Interface of segment iterators. Subclass this class to implement iterator for * UTF-16 text. */ class SegmentIteratorUtf16 { public: virtual ~SegmentIteratorUtf16() = default; // Disable copy or move semantics. Move semantic could be enabled in the // future if needed. SegmentIteratorUtf16(SegmentIteratorUtf16&&) = delete; SegmentIteratorUtf16& operator=(SegmentIteratorUtf16&&) = delete; SegmentIteratorUtf16(const SegmentIteratorUtf16&) = delete; SegmentIteratorUtf16& operator=(const SegmentIteratorUtf16&) = delete; /** * Advance the iterator to the next break position. * * @return the break position. If there's no further break position, return * Nothing(). */ virtual Maybe Next() = 0; /** * Advance the iterator to the first break position following the specified * position aPos. * * Note: if this iterator's current position is already >= aPos, this method * behaves the same as Next(). */ virtual Maybe Seek(uint32_t aPos); protected: explicit SegmentIteratorUtf16(Span aText); // The text to iterate over. Span mText; // The current break position within mText. uint32_t mPos = 0; }; // Each enum value has the same meaning with respect to the `word-break` // property values in the CSS Text spec. See the details in // https://drafts.csswg.org/css-text-3/#word-break-property enum class WordBreakRule : uint8_t { Normal = 0, BreakAll, KeepAll, }; // Each enum value has the same meaning with respect to the `line-break` // property values in the CSS Text spec. See the details in // https://drafts.csswg.org/css-text-3/#line-break-property. enum class LineBreakRule : uint8_t { Auto = 0, Loose, Normal, Strict, Anywhere, }; // Extra options for line break iterator. struct LineBreakOptions final { WordBreakRule mWordBreakRule = WordBreakRule::Normal; LineBreakRule mLineBreakRule = LineBreakRule::Auto; bool mScriptIsChineseOrJapanese = false; }; /** * Line break iterator for UTF-16 text. */ class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 { public: explicit LineBreakIteratorUtf16(Span aText, const LineBreakOptions& aOptions = {}); ~LineBreakIteratorUtf16() override; Maybe Next() override; Maybe Seek(uint32_t aPos) override; private: LineBreakOptions mOptions; #ifdef MOZ_ICU4X capi::ICU4XLineSegmenter* mSegmenter = nullptr; capi::ICU4XLineBreakIteratorUtf16* mIterator = nullptr; #endif }; /** * Word break iterator for UTF-16 text. */ class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 { public: explicit WordBreakIteratorUtf16(Span aText); ~WordBreakIteratorUtf16() override; Maybe Next() override; Maybe Seek(uint32_t aPos) override; #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) private: capi::ICU4XWordSegmenter* mSegmenter = nullptr; capi::ICU4XWordBreakIteratorUtf16* mIterator = nullptr; #endif }; /** * Grapheme cluster break iterator for UTF-16 text. */ class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 { public: explicit GraphemeClusterBreakIteratorUtf16(Span aText); ~GraphemeClusterBreakIteratorUtf16() override; Maybe Next() override; Maybe Seek(uint32_t aPos) override; #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) private: static capi::ICU4XGraphemeClusterSegmenter* sSegmenter; capi::ICU4XGraphemeClusterBreakIteratorUtf16* mIterator = nullptr; #endif }; /** * Grapheme cluster break reverse iterator for UTF-16 text. * * Note: The reverse iterator doesn't handle conjoining Jamo and emoji. Use it * at your own risk. */ class GraphemeClusterBreakReverseIteratorUtf16 final : public SegmentIteratorUtf16 { public: explicit GraphemeClusterBreakReverseIteratorUtf16(Span aText); Maybe Next() override; Maybe Seek(uint32_t aPos) override; }; #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) /** * Sentence break iterator for UTF-16 text. */ class SentenceBreakIteratorUtf16 final : public SegmentIteratorUtf16 { public: explicit SentenceBreakIteratorUtf16(Span aText); ~SentenceBreakIteratorUtf16() override; Maybe Next() override; Maybe Seek(uint32_t aPos) override; private: capi::ICU4XSentenceSegmenter* mSegmenter = nullptr; capi::ICU4XSentenceBreakIteratorUtf16* mIterator = nullptr; }; #endif /** * This component is a Mozilla-focused API for working with segmenters in * internationalization code. * * This is a factor class. Calling Segment() to create an iterator over a text * of given granularity. */ class Segmenter final { public: // NOTE: aLocale is a no-op currently. static Result, ICUError> TryCreate( Span aLocale, const SegmenterOptions& aOptions); explicit Segmenter(Span aLocale, const SegmenterOptions& aOptions) : mOptions(aOptions) {} // Creates an iterator over aText of a given granularity in mOptions. UniquePtr Segment(Span aText) const; // TODO: Implement an iterator for Latin1 text. // UniquePtr Segment(Span aText) const; private: SegmenterOptions mOptions; }; } // namespace mozilla::intl #endif