diff options
Diffstat (limited to 'intl/lwbrk/Segmenter.h')
-rw-r--r-- | intl/lwbrk/Segmenter.h | 231 |
1 files changed, 231 insertions, 0 deletions
diff --git a/intl/lwbrk/Segmenter.h b/intl/lwbrk/Segmenter.h new file mode 100644 index 0000000000..a3233dc8ed --- /dev/null +++ b/intl/lwbrk/Segmenter.h @@ -0,0 +1,231 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* Classes to iterate over grapheme, word, sentence, or line. */ + +#ifndef intl_components_Segmenter_h_ +#define intl_components_Segmenter_h_ + +#include "mozilla/intl/ICUError.h" +#include "mozilla/Maybe.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" +#include "mozilla/UniquePtr.h" + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +namespace capi { +struct ICU4XLineSegmenter; +struct ICU4XLineBreakIteratorUtf16; +struct ICU4XWordSegmenter; +struct ICU4XWordBreakIteratorUtf16; +struct ICU4XGraphemeClusterSegmenter; +struct ICU4XGraphemeClusterBreakIteratorUtf16; +struct ICU4XSentenceSegmenter; +struct ICU4XSentenceBreakIteratorUtf16; +} // namespace capi +#endif + +namespace mozilla::intl { + +enum class SegmenterGranularity : uint8_t { + Grapheme, + Word, + Sentence, + Line, +}; + +struct SegmenterOptions final { + SegmenterGranularity mGranularity = SegmenterGranularity::Grapheme; +}; + +/** + * Interface of segment iterators. Subclass this class to implement iterator for + * UTF-16 text. + */ +class SegmentIteratorUtf16 { + public: + virtual ~SegmentIteratorUtf16() = default; + + // Disable copy or move semantics. Move semantic could be enabled in the + // future if needed. + SegmentIteratorUtf16(SegmentIteratorUtf16&&) = delete; + SegmentIteratorUtf16& operator=(SegmentIteratorUtf16&&) = delete; + SegmentIteratorUtf16(const SegmentIteratorUtf16&) = delete; + SegmentIteratorUtf16& operator=(const SegmentIteratorUtf16&) = delete; + + /** + * Advance the iterator to the next break position. + * + * @return the break position. If there's no further break position, return + * Nothing(). + */ + virtual Maybe<uint32_t> Next() = 0; + + /** + * Advance the iterator to the first break position following the specified + * position aPos. + * + * Note: if this iterator's current position is already >= aPos, this method + * behaves the same as Next(). + */ + virtual Maybe<uint32_t> Seek(uint32_t aPos); + + protected: + explicit SegmentIteratorUtf16(Span<const char16_t> aText); + + // The text to iterate over. + Span<const char16_t> mText; + + // The current break position within mText. + uint32_t mPos = 0; +}; + +// Each enum value has the same meaning with respect to the `word-break` +// property values in the CSS Text spec. See the details in +// https://drafts.csswg.org/css-text-3/#word-break-property +enum class WordBreakRule : uint8_t { + Normal = 0, + BreakAll, + KeepAll, +}; + +// Each enum value has the same meaning with respect to the `line-break` +// property values in the CSS Text spec. See the details in +// https://drafts.csswg.org/css-text-3/#line-break-property. +enum class LineBreakRule : uint8_t { + Auto = 0, + Loose, + Normal, + Strict, + Anywhere, +}; + +// Extra options for line break iterator. +struct LineBreakOptions final { + WordBreakRule mWordBreakRule = WordBreakRule::Normal; + LineBreakRule mLineBreakRule = LineBreakRule::Auto; + bool mScriptIsChineseOrJapanese = false; +}; + +/** + * Line break iterator for UTF-16 text. + */ +class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 { + public: + explicit LineBreakIteratorUtf16(Span<const char16_t> aText, + const LineBreakOptions& aOptions = {}); + ~LineBreakIteratorUtf16() override; + + Maybe<uint32_t> Next() override; + Maybe<uint32_t> Seek(uint32_t aPos) override; + + private: + LineBreakOptions mOptions; + +#ifdef MOZ_ICU4X + capi::ICU4XLineSegmenter* mSegmenter = nullptr; + capi::ICU4XLineBreakIteratorUtf16* mIterator = nullptr; +#endif +}; + +/** + * Word break iterator for UTF-16 text. + */ +class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 { + public: + explicit WordBreakIteratorUtf16(Span<const char16_t> aText); + ~WordBreakIteratorUtf16() override; + + Maybe<uint32_t> Next() override; + Maybe<uint32_t> Seek(uint32_t aPos) override; + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + private: + capi::ICU4XWordSegmenter* mSegmenter = nullptr; + capi::ICU4XWordBreakIteratorUtf16* mIterator = nullptr; +#endif +}; + +/** + * Grapheme cluster break iterator for UTF-16 text. + */ +class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 { + public: + explicit GraphemeClusterBreakIteratorUtf16(Span<const char16_t> aText); + ~GraphemeClusterBreakIteratorUtf16() override; + + Maybe<uint32_t> Next() override; + Maybe<uint32_t> Seek(uint32_t aPos) override; + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + private: + static capi::ICU4XGraphemeClusterSegmenter* sSegmenter; + capi::ICU4XGraphemeClusterBreakIteratorUtf16* mIterator = nullptr; +#endif +}; + +/** + * Grapheme cluster break reverse iterator for UTF-16 text. + * + * Note: The reverse iterator doesn't handle conjoining Jamo and emoji. Use it + * at your own risk. + */ +class GraphemeClusterBreakReverseIteratorUtf16 final + : public SegmentIteratorUtf16 { + public: + explicit GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText); + + Maybe<uint32_t> Next() override; + Maybe<uint32_t> Seek(uint32_t aPos) override; +}; + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +/** + * Sentence break iterator for UTF-16 text. + */ +class SentenceBreakIteratorUtf16 final : public SegmentIteratorUtf16 { + public: + explicit SentenceBreakIteratorUtf16(Span<const char16_t> aText); + ~SentenceBreakIteratorUtf16() override; + + Maybe<uint32_t> Next() override; + Maybe<uint32_t> Seek(uint32_t aPos) override; + + private: + capi::ICU4XSentenceSegmenter* mSegmenter = nullptr; + capi::ICU4XSentenceBreakIteratorUtf16* mIterator = nullptr; +}; +#endif + +/** + * This component is a Mozilla-focused API for working with segmenters in + * internationalization code. + * + * This is a factor class. Calling Segment() to create an iterator over a text + * of given granularity. + */ +class Segmenter final { + public: + // NOTE: aLocale is a no-op currently. + static Result<UniquePtr<Segmenter>, ICUError> TryCreate( + Span<const char> aLocale, const SegmenterOptions& aOptions); + + explicit Segmenter(Span<const char> aLocale, const SegmenterOptions& aOptions) + : mOptions(aOptions) {} + + // Creates an iterator over aText of a given granularity in mOptions. + UniquePtr<SegmentIteratorUtf16> Segment(Span<const char16_t> aText) const; + + // TODO: Implement an iterator for Latin1 text. + // UniquePtr<SegmentIteratorLatin1> Segment(Span<const uint8_t> aText) const; + + private: + SegmenterOptions mOptions; +}; + +} // namespace mozilla::intl + +#endif |