/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim: set ts=8 sts=2 et sw=2 tw=80: */ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this file, * You can obtain one at http://mozilla.org/MPL/2.0/. */ /* Classes to iterate over grapheme, word, sentence, or line. */ #ifndef intl_components_Segmenter_h_ #define intl_components_Segmenter_h_ #include "mozilla/intl/ICUError.h" #include "mozilla/Maybe.h" #include "mozilla/Result.h" #include "mozilla/Span.h" #include "mozilla/UniquePtr.h" namespace mozilla::intl { enum class SegmenterGranularity : uint8_t { Grapheme, Word, Sentence, Line, }; struct SegmenterOptions final { SegmenterGranularity mGranularity = SegmenterGranularity::Grapheme; }; /** * Interface of segment iterators. Subclass this class to implement iterator for * UTF-16 text. */ class SegmentIteratorUtf16 { public: virtual ~SegmentIteratorUtf16() = default; // Disable copy or move semantics. Move semantic could be enabled in the // future if needed. SegmentIteratorUtf16(SegmentIteratorUtf16&&) = delete; SegmentIteratorUtf16& operator=(SegmentIteratorUtf16&&) = delete; SegmentIteratorUtf16(const SegmentIteratorUtf16&) = delete; SegmentIteratorUtf16& operator=(const SegmentIteratorUtf16&) = delete; /** * Advance the iterator to the next break position. * * @return the break position. If there's no further break position, return * Nothing(). */ virtual Maybe Next() = 0; /** * Advance the iterator to the first break position following the specified * position aPos. * * Note: if this iterator's current position is already >= aPos, this method * behaves the same as Next(). */ virtual Maybe Seek(uint32_t aPos); protected: explicit SegmentIteratorUtf16(Span aText); // The text to iterate over. Span mText; // The current break position within mText. uint32_t mPos = 0; }; // Each enum value has the same meaning with respect to the `word-break` // property values in the CSS Text spec. See the details in // https://drafts.csswg.org/css-text-3/#word-break-property enum class WordBreakRule : uint8_t { Normal = 0, BreakAll, KeepAll, }; // Each enum value has the same meaning with respect to the `line-break` // property values in the CSS Text spec. See the details in // https://drafts.csswg.org/css-text-3/#line-break-property. enum class LineBreakRule : uint8_t { Auto = 0, Loose, Normal, Strict, Anywhere, }; // Extra options for line break iterator. struct LineBreakOptions final { WordBreakRule mWordBreakRule = WordBreakRule::Normal; LineBreakRule mLineBreakRule = LineBreakRule::Auto; bool mScriptIsChineseOrJapanese = false; }; /** * Line break iterator for UTF-16 text. */ class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 { public: explicit LineBreakIteratorUtf16(Span aText, const LineBreakOptions& aOptions = {}); Maybe Next() override; private: LineBreakOptions mOptions; }; /** * Word break iterator for UTF-16 text. */ class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 { public: explicit WordBreakIteratorUtf16(Span aText); Maybe Next() override; }; /** * Grapheme cluster break iterator for UTF-16 text. */ class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 { public: explicit GraphemeClusterBreakIteratorUtf16(Span aText); Maybe Next() override; }; /** * Grapheme cluster break reverse iterator for UTF-16 text. * * Note: The reverse iterator doesn't handle conjoining Jamo and emoji. Use it * at your own risk. */ class GraphemeClusterBreakReverseIteratorUtf16 final : public SegmentIteratorUtf16 { public: explicit GraphemeClusterBreakReverseIteratorUtf16(Span aText); Maybe Next() override; Maybe Seek(uint32_t aPos) override; }; /** * This component is a Mozilla-focused API for working with segmenters in * internationalization code. * * This is a factor class. Calling Segment() to create an iterator over a text * of given granularity. */ class Segmenter final { public: // NOTE: aLocale is a no-op currently. static Result, ICUError> TryCreate( Span aLocale, const SegmenterOptions& aOptions); explicit Segmenter(Span aLocale, const SegmenterOptions& aOptions) : mOptions(aOptions) {} // Creates an iterator over aText of a given granularity in mOptions. UniquePtr Segment(Span aText) const; // TODO: Implement an iterator for Latin1 text. // UniquePtr Segment(Span aText) const; private: SegmenterOptions mOptions; }; } // namespace mozilla::intl #endif