summaryrefslogtreecommitdiffstats
path: root/js/src/builtin/intl/Segmenter.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'js/src/builtin/intl/Segmenter.cpp')
-rw-r--r--js/src/builtin/intl/Segmenter.cpp988
1 files changed, 988 insertions, 0 deletions
diff --git a/js/src/builtin/intl/Segmenter.cpp b/js/src/builtin/intl/Segmenter.cpp
new file mode 100644
index 0000000000..b7c82bb135
--- /dev/null
+++ b/js/src/builtin/intl/Segmenter.cpp
@@ -0,0 +1,988 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * vim: set ts=8 sts=2 et sw=2 tw=80:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* Intl.Segmenter implementation. */
+
+#include "builtin/intl/Segmenter.h"
+
+#include "mozilla/Assertions.h"
+#include "mozilla/IntegerTypeTraits.h"
+#include "mozilla/Range.h"
+#include "mozilla/UniquePtr.h"
+
+#if defined(MOZ_ICU4X)
+# include "mozilla/intl/ICU4XGeckoDataProvider.h"
+# include "ICU4XGraphemeClusterSegmenter.h"
+# include "ICU4XSentenceSegmenter.h"
+# include "ICU4XWordSegmenter.h"
+#endif
+
+#include "jspubtd.h"
+#include "NamespaceImports.h"
+
+#include "builtin/Array.h"
+#include "builtin/intl/CommonFunctions.h"
+#include "gc/AllocKind.h"
+#include "gc/GCContext.h"
+#include "js/CallArgs.h"
+#include "js/PropertyDescriptor.h"
+#include "js/PropertySpec.h"
+#include "js/RootingAPI.h"
+#include "js/StableStringChars.h"
+#include "js/TypeDecls.h"
+#include "js/Value.h"
+#include "util/Unicode.h"
+#include "vm/ArrayObject.h"
+#include "vm/GlobalObject.h"
+#include "vm/JSContext.h"
+#include "vm/PlainObject.h"
+#include "vm/WellKnownAtom.h"
+
+#include "vm/JSObject-inl.h"
+#include "vm/NativeObject-inl.h"
+
+using namespace js;
+
+const JSClassOps SegmenterObject::classOps_ = {
+ nullptr, // addProperty
+ nullptr, // delProperty
+ nullptr, // enumerate
+ nullptr, // newEnumerate
+ nullptr, // resolve
+ nullptr, // mayResolve
+ SegmenterObject::finalize, // finalize
+ nullptr, // call
+ nullptr, // construct
+ nullptr, // trace
+};
+
+const JSClass SegmenterObject::class_ = {
+ "Intl.Segmenter",
+ JSCLASS_HAS_RESERVED_SLOTS(SegmenterObject::SLOT_COUNT) |
+ JSCLASS_HAS_CACHED_PROTO(JSProto_Segmenter) |
+ JSCLASS_FOREGROUND_FINALIZE,
+ &SegmenterObject::classOps_,
+ &SegmenterObject::classSpec_,
+};
+
+const JSClass& SegmenterObject::protoClass_ = PlainObject::class_;
+
+static bool segmenter_toSource(JSContext* cx, unsigned argc, Value* vp) {
+ CallArgs args = CallArgsFromVp(argc, vp);
+ args.rval().setString(cx->names().Segmenter);
+ return true;
+}
+
+static const JSFunctionSpec segmenter_static_methods[] = {
+ JS_SELF_HOSTED_FN("supportedLocalesOf", "Intl_Segmenter_supportedLocalesOf",
+ 1, 0),
+ JS_FS_END,
+};
+
+static const JSFunctionSpec segmenter_methods[] = {
+ JS_SELF_HOSTED_FN("resolvedOptions", "Intl_Segmenter_resolvedOptions", 0,
+ 0),
+ JS_SELF_HOSTED_FN("segment", "Intl_Segmenter_segment", 1, 0),
+ JS_FN("toSource", segmenter_toSource, 0, 0),
+ JS_FS_END,
+};
+
+static const JSPropertySpec segmenter_properties[] = {
+ JS_STRING_SYM_PS(toStringTag, "Intl.Segmenter", JSPROP_READONLY),
+ JS_PS_END,
+};
+
+static bool Segmenter(JSContext* cx, unsigned argc, Value* vp);
+
+const ClassSpec SegmenterObject::classSpec_ = {
+ GenericCreateConstructor<Segmenter, 0, gc::AllocKind::FUNCTION>,
+ GenericCreatePrototype<SegmenterObject>,
+ segmenter_static_methods,
+ nullptr,
+ segmenter_methods,
+ segmenter_properties,
+ nullptr,
+ ClassSpec::DontDefineConstructor,
+};
+
+/**
+ * Intl.Segmenter ([ locales [ , options ]])
+ */
+static bool Segmenter(JSContext* cx, unsigned argc, Value* vp) {
+ CallArgs args = CallArgsFromVp(argc, vp);
+
+ // Step 1.
+ if (!ThrowIfNotConstructing(cx, args, "Intl.Segmenter")) {
+ return false;
+ }
+
+ // Steps 2-3 (Inlined 9.1.14, OrdinaryCreateFromConstructor).
+ Rooted<JSObject*> proto(cx);
+ if (!GetPrototypeFromBuiltinConstructor(cx, args, JSProto_Segmenter,
+ &proto)) {
+ return false;
+ }
+
+ Rooted<SegmenterObject*> segmenter(cx);
+ segmenter = NewObjectWithClassProto<SegmenterObject>(cx, proto);
+ if (!segmenter) {
+ return false;
+ }
+
+ HandleValue locales = args.get(0);
+ HandleValue options = args.get(1);
+
+ // Steps 4-13.
+ if (!intl::InitializeObject(cx, segmenter, cx->names().InitializeSegmenter,
+ locales, options)) {
+ return false;
+ }
+
+ // Step 14.
+ args.rval().setObject(*segmenter);
+ return true;
+}
+
+const JSClassOps SegmentsObject::classOps_ = {
+ nullptr, // addProperty
+ nullptr, // delProperty
+ nullptr, // enumerate
+ nullptr, // newEnumerate
+ nullptr, // resolve
+ nullptr, // mayResolve
+ SegmentsObject::finalize, // finalize
+ nullptr, // call
+ nullptr, // construct
+ nullptr, // trace
+};
+
+const JSClass SegmentsObject::class_ = {
+ "Intl.Segments",
+ JSCLASS_HAS_RESERVED_SLOTS(SegmentsObject::SLOT_COUNT) |
+ JSCLASS_FOREGROUND_FINALIZE,
+ &SegmentsObject::classOps_,
+};
+
+static const JSFunctionSpec segments_methods[] = {
+ JS_SELF_HOSTED_FN("containing", "Intl_Segments_containing", 1, 0),
+ JS_SELF_HOSTED_SYM_FN(iterator, "Intl_Segments_iterator", 0, 0),
+ JS_FS_END,
+};
+
+bool GlobalObject::initSegmentsProto(JSContext* cx,
+ Handle<GlobalObject*> global) {
+ Rooted<JSObject*> proto(
+ cx, GlobalObject::createBlankPrototype<PlainObject>(cx, global));
+ if (!proto) {
+ return false;
+ }
+
+ if (!JS_DefineFunctions(cx, proto, segments_methods)) {
+ return false;
+ }
+
+ global->initBuiltinProto(ProtoKind::SegmentsProto, proto);
+ return true;
+}
+
+const JSClassOps SegmentIteratorObject::classOps_ = {
+ nullptr, // addProperty
+ nullptr, // delProperty
+ nullptr, // enumerate
+ nullptr, // newEnumerate
+ nullptr, // resolve
+ nullptr, // mayResolve
+ SegmentIteratorObject::finalize, // finalize
+ nullptr, // call
+ nullptr, // construct
+ nullptr, // trace
+};
+
+const JSClass SegmentIteratorObject::class_ = {
+ "Intl.SegmentIterator",
+ JSCLASS_HAS_RESERVED_SLOTS(SegmentIteratorObject::SLOT_COUNT) |
+ JSCLASS_FOREGROUND_FINALIZE,
+ &SegmentIteratorObject::classOps_,
+};
+
+static const JSFunctionSpec segment_iterator_methods[] = {
+ JS_SELF_HOSTED_FN("next", "Intl_SegmentIterator_next", 0, 0),
+ JS_FS_END,
+};
+
+static const JSPropertySpec segment_iterator_properties[] = {
+ JS_STRING_SYM_PS(toStringTag, "Segmenter String Iterator", JSPROP_READONLY),
+ JS_PS_END,
+};
+
+bool GlobalObject::initSegmentIteratorProto(JSContext* cx,
+ Handle<GlobalObject*> global) {
+ Rooted<JSObject*> iteratorProto(
+ cx, GlobalObject::getOrCreateIteratorPrototype(cx, global));
+ if (!iteratorProto) {
+ return false;
+ }
+
+ Rooted<JSObject*> proto(
+ cx, GlobalObject::createBlankPrototypeInheriting<PlainObject>(
+ cx, iteratorProto));
+ if (!proto) {
+ return false;
+ }
+
+ if (!JS_DefineFunctions(cx, proto, segment_iterator_methods)) {
+ return false;
+ }
+
+ if (!JS_DefineProperties(cx, proto, segment_iterator_properties)) {
+ return false;
+ }
+
+ global->initBuiltinProto(ProtoKind::SegmentIteratorProto, proto);
+ return true;
+}
+
+struct Boundaries {
+ // Start index of this segmentation boundary.
+ int32_t startIndex = 0;
+
+ // End index of this segmentation boundary.
+ int32_t endIndex = 0;
+
+ // |true| if the segment is word-like. (Only used for word segmentation.)
+ bool isWordLike = false;
+};
+
+/**
+ * Find the segmentation boundary for the string character whose position is
+ * |index|. The end position of the last segment boundary is |previousIndex|.
+ */
+template <class T>
+static Boundaries FindBoundaryFrom(const T& iter, int32_t previousIndex,
+ int32_t index) {
+ MOZ_ASSERT(previousIndex <= index,
+ "previous index must not exceed the search index");
+
+ int32_t previous = previousIndex;
+ while (true) {
+ // Find the next possible break index.
+ int32_t next = iter.next();
+
+ // If |next| is larger than the search index, we've found our segment end
+ // index.
+ if (next > index) {
+ return {previous, next, iter.isWordLike()};
+ }
+
+ // Otherwise store |next| as the start index of the next segment,
+ previous = next;
+ }
+}
+
+// TODO: Consider switching to the ICU4X C++ headers when the C++ headers
+// are in better shape: https://github.com/rust-diplomat/diplomat/issues/280
+
+template <typename Interface>
+class SegmenterBreakIteratorType {
+ typename Interface::BreakIterator* impl_;
+
+ public:
+ explicit SegmenterBreakIteratorType(void* impl)
+ : impl_(static_cast<typename Interface::BreakIterator*>(impl)) {
+ MOZ_ASSERT(impl);
+ }
+
+ int32_t next() const { return Interface::next(impl_); }
+
+ bool isWordLike() const { return Interface::isWordLike(impl_); }
+};
+
+#if defined(MOZ_ICU4X)
+// Each SegmenterBreakIterator interface contains the following definitions:
+//
+// - BreakIterator: Type of the ICU4X break iterator.
+// - Segmenter: Type of the ICU4X segmenter.
+// - Char: Character type, either `JS::Latin1Char` or `char16_t`.
+// - create: Static method to create a new instance of `BreakIterator`.
+// - destroy: Static method to destroy an instance of `BreakIterator`.
+// - next: Static method to fetch the next break iteration index.
+// - isWordLike: Static method to determine if the current segment is word-like.
+//
+//
+// Each Segmenter interface contains the following definitions:
+//
+// - Segmenter: Type of the ICU4X segmenter.
+// - BreakIteratorLatin1: SegmenterBreakIterator interface to Latin1 strings.
+// - BreakIteratorTwoByte: SegmenterBreakIterator interface to TwoByte strings.
+// - create: Static method to create a new instance of `Segmenter`.
+// - destroy: Static method to destroy an instance of `Segmenter`.
+
+struct GraphemeClusterSegmenterBreakIteratorLatin1 {
+ using BreakIterator = capi::ICU4XGraphemeClusterBreakIteratorLatin1;
+ using Segmenter = capi::ICU4XGraphemeClusterSegmenter;
+ using Char = JS::Latin1Char;
+
+ static constexpr auto& create =
+ capi::ICU4XGraphemeClusterSegmenter_segment_latin1;
+ static constexpr auto& destroy =
+ capi::ICU4XGraphemeClusterBreakIteratorLatin1_destroy;
+ static constexpr auto& next =
+ capi::ICU4XGraphemeClusterBreakIteratorLatin1_next;
+
+ static bool isWordLike(const BreakIterator*) { return false; }
+};
+
+struct GraphemeClusterSegmenterBreakIteratorTwoByte {
+ using BreakIterator = capi::ICU4XGraphemeClusterBreakIteratorUtf16;
+ using Segmenter = capi::ICU4XGraphemeClusterSegmenter;
+ using Char = char16_t;
+
+ static constexpr auto& create =
+ capi::ICU4XGraphemeClusterSegmenter_segment_utf16;
+ static constexpr auto& destroy =
+ capi::ICU4XGraphemeClusterBreakIteratorUtf16_destroy;
+ static constexpr auto& next =
+ capi::ICU4XGraphemeClusterBreakIteratorUtf16_next;
+
+ static bool isWordLike(const BreakIterator*) { return false; }
+};
+
+struct GraphemeClusterSegmenter {
+ using Segmenter = capi::ICU4XGraphemeClusterSegmenter;
+ using BreakIteratorLatin1 =
+ SegmenterBreakIteratorType<GraphemeClusterSegmenterBreakIteratorLatin1>;
+ using BreakIteratorTwoByte =
+ SegmenterBreakIteratorType<GraphemeClusterSegmenterBreakIteratorTwoByte>;
+
+ static constexpr auto& create = capi::ICU4XGraphemeClusterSegmenter_create;
+ static constexpr auto& destroy = capi::ICU4XGraphemeClusterSegmenter_destroy;
+};
+
+struct WordSegmenterBreakIteratorLatin1 {
+ using BreakIterator = capi::ICU4XWordBreakIteratorLatin1;
+ using Segmenter = capi::ICU4XWordSegmenter;
+ using Char = JS::Latin1Char;
+
+ static constexpr auto& create = capi::ICU4XWordSegmenter_segment_latin1;
+ static constexpr auto& destroy = capi::ICU4XWordBreakIteratorLatin1_destroy;
+ static constexpr auto& next = capi::ICU4XWordBreakIteratorLatin1_next;
+ static constexpr auto& isWordLike =
+ capi::ICU4XWordBreakIteratorLatin1_is_word_like;
+};
+
+struct WordSegmenterBreakIteratorTwoByte {
+ using BreakIterator = capi::ICU4XWordBreakIteratorUtf16;
+ using Segmenter = capi::ICU4XWordSegmenter;
+ using Char = char16_t;
+
+ static constexpr auto& create = capi::ICU4XWordSegmenter_segment_utf16;
+ static constexpr auto& destroy = capi::ICU4XWordBreakIteratorUtf16_destroy;
+ static constexpr auto& next = capi::ICU4XWordBreakIteratorUtf16_next;
+ static constexpr auto& isWordLike =
+ capi::ICU4XWordBreakIteratorUtf16_is_word_like;
+};
+
+struct WordSegmenter {
+ using Segmenter = capi::ICU4XWordSegmenter;
+ using BreakIteratorLatin1 =
+ SegmenterBreakIteratorType<WordSegmenterBreakIteratorLatin1>;
+ using BreakIteratorTwoByte =
+ SegmenterBreakIteratorType<WordSegmenterBreakIteratorTwoByte>;
+
+ static constexpr auto& create = capi::ICU4XWordSegmenter_create_auto;
+ static constexpr auto& destroy = capi::ICU4XWordSegmenter_destroy;
+};
+
+struct SentenceSegmenterBreakIteratorLatin1 {
+ using BreakIterator = capi::ICU4XSentenceBreakIteratorLatin1;
+ using Segmenter = capi::ICU4XSentenceSegmenter;
+ using Char = JS::Latin1Char;
+
+ static constexpr auto& create = capi::ICU4XSentenceSegmenter_segment_latin1;
+ static constexpr auto& destroy =
+ capi::ICU4XSentenceBreakIteratorLatin1_destroy;
+ static constexpr auto& next = capi::ICU4XSentenceBreakIteratorLatin1_next;
+
+ static bool isWordLike(const BreakIterator*) { return false; }
+};
+
+struct SentenceSegmenterBreakIteratorTwoByte {
+ using BreakIterator = capi::ICU4XSentenceBreakIteratorUtf16;
+ using Segmenter = capi::ICU4XSentenceSegmenter;
+ using Char = char16_t;
+
+ static constexpr auto& create = capi::ICU4XSentenceSegmenter_segment_utf16;
+ static constexpr auto& destroy =
+ capi::ICU4XSentenceBreakIteratorUtf16_destroy;
+ static constexpr auto& next = capi::ICU4XSentenceBreakIteratorUtf16_next;
+
+ static bool isWordLike(const BreakIterator*) { return false; }
+};
+
+struct SentenceSegmenter {
+ using Segmenter = capi::ICU4XSentenceSegmenter;
+ using BreakIteratorLatin1 =
+ SegmenterBreakIteratorType<SentenceSegmenterBreakIteratorLatin1>;
+ using BreakIteratorTwoByte =
+ SegmenterBreakIteratorType<SentenceSegmenterBreakIteratorTwoByte>;
+
+ static constexpr auto& create = capi::ICU4XSentenceSegmenter_create;
+ static constexpr auto& destroy = capi::ICU4XSentenceSegmenter_destroy;
+};
+#endif
+
+/**
+ * Create a new ICU4X segmenter instance.
+ */
+template <typename Interface>
+static typename Interface::Segmenter* CreateSegmenter(JSContext* cx) {
+ auto result = Interface::create(mozilla::intl::GetDataProvider());
+ if (!result.is_ok) {
+ intl::ReportInternalError(cx);
+ return nullptr;
+ }
+ return result.ok;
+}
+
+static bool EnsureInternalsResolved(JSContext* cx,
+ Handle<SegmenterObject*> segmenter) {
+ if (segmenter->getLocale()) {
+ return true;
+ }
+
+ Rooted<JS::Value> value(cx);
+
+ Rooted<JSObject*> internals(cx, intl::GetInternalsObject(cx, segmenter));
+ if (!internals) {
+ return false;
+ }
+
+ if (!GetProperty(cx, internals, internals, cx->names().locale, &value)) {
+ return false;
+ }
+ Rooted<JSString*> locale(cx, value.toString());
+
+ if (!GetProperty(cx, internals, internals, cx->names().granularity, &value)) {
+ return false;
+ }
+
+ SegmenterGranularity granularity;
+ {
+ JSLinearString* linear = value.toString()->ensureLinear(cx);
+ if (!linear) {
+ return false;
+ }
+
+ if (StringEqualsLiteral(linear, "grapheme")) {
+ granularity = SegmenterGranularity::Grapheme;
+ } else if (StringEqualsLiteral(linear, "word")) {
+ granularity = SegmenterGranularity::Word;
+ } else {
+ MOZ_ASSERT(StringEqualsLiteral(linear, "sentence"));
+ granularity = SegmenterGranularity::Sentence;
+ }
+ }
+
+#if defined(MOZ_ICU4X)
+ switch (granularity) {
+ case SegmenterGranularity::Grapheme: {
+ auto* seg = CreateSegmenter<GraphemeClusterSegmenter>(cx);
+ if (!seg) {
+ return false;
+ }
+ segmenter->setSegmenter(seg);
+ break;
+ }
+ case SegmenterGranularity::Word: {
+ auto* seg = CreateSegmenter<WordSegmenter>(cx);
+ if (!seg) {
+ return false;
+ }
+ segmenter->setSegmenter(seg);
+ break;
+ }
+ case SegmenterGranularity::Sentence: {
+ auto* seg = CreateSegmenter<SentenceSegmenter>(cx);
+ if (!seg) {
+ return false;
+ }
+ segmenter->setSegmenter(seg);
+ break;
+ }
+ }
+#endif
+
+ segmenter->setLocale(locale);
+ segmenter->setGranularity(granularity);
+
+ return true;
+}
+
+/**
+ * Destroy an ICU4X segmenter instance.
+ */
+template <typename Interface>
+static void DestroySegmenter(void* seg) {
+ auto* segmenter = static_cast<typename Interface::Segmenter*>(seg);
+ Interface::destroy(segmenter);
+}
+
+void SegmenterObject::finalize(JS::GCContext* gcx, JSObject* obj) {
+ MOZ_ASSERT(gcx->onMainThread());
+
+ auto& segmenter = obj->as<SegmenterObject>();
+ if (void* seg = segmenter.getSegmenter()) {
+#if defined(MOZ_ICU4X)
+ switch (segmenter.getGranularity()) {
+ case SegmenterGranularity::Grapheme: {
+ DestroySegmenter<GraphemeClusterSegmenter>(seg);
+ break;
+ }
+ case SegmenterGranularity::Word: {
+ DestroySegmenter<WordSegmenter>(seg);
+ break;
+ }
+ case SegmenterGranularity::Sentence: {
+ DestroySegmenter<SentenceSegmenter>(seg);
+ break;
+ }
+ }
+#else
+ MOZ_CRASH("ICU4X disabled");
+#endif
+ }
+}
+
+/**
+ * Destroy an ICU4X break iterator instance.
+ */
+template <typename Interface>
+static void DestroyBreakIterator(void* brk) {
+ auto* breakIterator = static_cast<typename Interface::BreakIterator*>(brk);
+ Interface::destroy(breakIterator);
+}
+
+/**
+ * Destroy the ICU4X break iterator attached to |segments|.
+ */
+template <typename T>
+static void DestroyBreakIterator(const T* segments) {
+#if defined(MOZ_ICU4X)
+ void* brk = segments->getBreakIterator();
+ MOZ_ASSERT(brk);
+
+ bool isLatin1 = segments->getString()->hasLatin1Chars();
+
+ switch (segments->getGranularity()) {
+ case SegmenterGranularity::Grapheme: {
+ if (isLatin1) {
+ DestroyBreakIterator<GraphemeClusterSegmenterBreakIteratorLatin1>(brk);
+ } else {
+ DestroyBreakIterator<GraphemeClusterSegmenterBreakIteratorTwoByte>(brk);
+ }
+ break;
+ }
+ case SegmenterGranularity::Word: {
+ if (isLatin1) {
+ DestroyBreakIterator<WordSegmenterBreakIteratorLatin1>(brk);
+ } else {
+ DestroyBreakIterator<WordSegmenterBreakIteratorTwoByte>(brk);
+ }
+ break;
+ }
+ case SegmenterGranularity::Sentence: {
+ if (isLatin1) {
+ DestroyBreakIterator<SentenceSegmenterBreakIteratorLatin1>(brk);
+ } else {
+ DestroyBreakIterator<SentenceSegmenterBreakIteratorTwoByte>(brk);
+ }
+ break;
+ }
+ }
+#else
+ MOZ_CRASH("ICU4X disabled");
+#endif
+}
+
+void SegmentsObject::finalize(JS::GCContext* gcx, JSObject* obj) {
+ MOZ_ASSERT(gcx->onMainThread());
+
+ auto* segments = &obj->as<SegmentsObject>();
+ bool isLatin1 = segments->getString()->hasLatin1Chars();
+
+ if (void* chars = segments->getStringChars()) {
+ size_t length = segments->getString()->length();
+ if (isLatin1) {
+ intl::RemoveICUCellMemory(gcx, segments, length * sizeof(JS::Latin1Char));
+ } else {
+ intl::RemoveICUCellMemory(gcx, segments, length * sizeof(char16_t));
+ }
+ js_free(chars);
+ }
+
+ if (segments->getBreakIterator()) {
+ DestroyBreakIterator(segments);
+ }
+}
+
+void SegmentIteratorObject::finalize(JS::GCContext* gcx, JSObject* obj) {
+ MOZ_ASSERT(gcx->onMainThread());
+
+ auto* iterator = &obj->as<SegmentIteratorObject>();
+ bool isLatin1 = iterator->getString()->hasLatin1Chars();
+
+ if (void* chars = iterator->getStringChars()) {
+ size_t length = iterator->getString()->length();
+ if (isLatin1) {
+ intl::RemoveICUCellMemory(gcx, iterator, length * sizeof(JS::Latin1Char));
+ } else {
+ intl::RemoveICUCellMemory(gcx, iterator, length * sizeof(char16_t));
+ }
+ js_free(chars);
+ }
+
+ if (iterator->getBreakIterator()) {
+ DestroyBreakIterator(iterator);
+ }
+}
+
+template <typename Iterator, typename T>
+static Boundaries FindBoundaryFrom(Handle<T*> segments, int32_t index) {
+ MOZ_ASSERT(0 <= index && uint32_t(index) < segments->getString()->length());
+
+ Iterator iter(segments->getBreakIterator());
+ return FindBoundaryFrom(iter, segments->getIndex(), index);
+}
+
+template <typename T>
+static Boundaries GraphemeBoundaries(Handle<T*> segments, int32_t index) {
+#if defined(MOZ_ICU4X)
+ if (segments->getString()->hasLatin1Chars()) {
+ return FindBoundaryFrom<GraphemeClusterSegmenter::BreakIteratorLatin1>(
+ segments, index);
+ }
+ return FindBoundaryFrom<GraphemeClusterSegmenter::BreakIteratorTwoByte>(
+ segments, index);
+#else
+ MOZ_CRASH("ICU4X disabled");
+#endif
+}
+
+template <typename T>
+static Boundaries WordBoundaries(Handle<T*> segments, int32_t index) {
+#if defined(MOZ_ICU4X)
+ if (segments->getString()->hasLatin1Chars()) {
+ return FindBoundaryFrom<WordSegmenter::BreakIteratorLatin1>(segments,
+ index);
+ }
+ return FindBoundaryFrom<WordSegmenter::BreakIteratorTwoByte>(segments, index);
+#else
+ MOZ_CRASH("ICU4X disabled");
+#endif
+}
+
+template <typename T>
+static Boundaries SentenceBoundaries(Handle<T*> segments, int32_t index) {
+#if defined(MOZ_ICU4X)
+ if (segments->getString()->hasLatin1Chars()) {
+ return FindBoundaryFrom<SentenceSegmenter::BreakIteratorLatin1>(segments,
+ index);
+ }
+ return FindBoundaryFrom<SentenceSegmenter::BreakIteratorTwoByte>(segments,
+ index);
+#else
+ MOZ_CRASH("ICU4X disabled");
+#endif
+}
+
+/**
+ * Ensure the string characters have been copied into |segments| in preparation
+ * for passing the string characters to ICU4X.
+ */
+template <typename T>
+static bool EnsureStringChars(JSContext* cx, Handle<T*> segments) {
+ if (segments->hasStringChars()) {
+ return true;
+ }
+
+ Rooted<JSLinearString*> string(cx, segments->getString()->ensureLinear(cx));
+ if (!string) {
+ return false;
+ }
+
+ size_t length = string->length();
+
+ JS::AutoCheckCannotGC nogc;
+ if (string->hasLatin1Chars()) {
+ auto chars = DuplicateString(cx, string->latin1Chars(nogc), length);
+ if (!chars) {
+ return false;
+ }
+ segments->setLatin1Chars(chars.release());
+
+ intl::AddICUCellMemory(segments, length * sizeof(JS::Latin1Char));
+ } else {
+ auto chars = DuplicateString(cx, string->twoByteChars(nogc), length);
+ if (!chars) {
+ return false;
+ }
+ segments->setTwoByteChars(chars.release());
+
+ intl::AddICUCellMemory(segments, length * sizeof(char16_t));
+ }
+ return true;
+}
+
+/**
+ * Create a new ICU4X break iterator instance.
+ */
+template <typename Interface, typename T>
+static auto* CreateBreakIterator(Handle<T*> segments) {
+ void* segmenter = segments->getSegmenter()->getSegmenter();
+ MOZ_ASSERT(segmenter);
+
+ void* chars = segments->getStringChars();
+ MOZ_ASSERT(chars);
+
+ size_t length = segments->getString()->length();
+
+ using Unsigned = typename mozilla::UnsignedStdintTypeForSize<sizeof(
+ typename Interface::Char)>::Type;
+
+ auto* seg = static_cast<const typename Interface::Segmenter*>(segmenter);
+ auto* ch = static_cast<const Unsigned*>(chars);
+ return Interface::create(seg, ch, length);
+}
+
+/**
+ * Ensure |segments| has a break iterator whose current segment index is at most
+ * |index|.
+ */
+template <typename T>
+static bool EnsureBreakIterator(JSContext* cx, Handle<T*> segments,
+ int32_t index) {
+ if (segments->getBreakIterator()) {
+ // Reuse the break iterator if its current segment index is at most |index|.
+ if (index >= segments->getIndex()) {
+ return true;
+ }
+
+ // Reverse iteration not supported. Destroy the previous break iterator and
+ // start from fresh.
+ DestroyBreakIterator(segments.get());
+
+ // Reset internal state.
+ segments->setBreakIterator(nullptr);
+ segments->setIndex(0);
+ }
+
+ // Ensure the string characters can be passed to ICU4X.
+ if (!EnsureStringChars(cx, segments)) {
+ return false;
+ }
+
+#if defined(MOZ_ICU4X)
+ bool isLatin1 = segments->getString()->hasLatin1Chars();
+
+ // Create a new break iterator based on the granularity and character type.
+ void* brk;
+ switch (segments->getGranularity()) {
+ case SegmenterGranularity::Grapheme: {
+ if (isLatin1) {
+ brk = CreateBreakIterator<GraphemeClusterSegmenterBreakIteratorLatin1>(
+ segments);
+ } else {
+ brk = CreateBreakIterator<GraphemeClusterSegmenterBreakIteratorTwoByte>(
+ segments);
+ }
+ break;
+ }
+ case SegmenterGranularity::Word: {
+ if (isLatin1) {
+ brk = CreateBreakIterator<WordSegmenterBreakIteratorLatin1>(segments);
+ } else {
+ brk = CreateBreakIterator<WordSegmenterBreakIteratorTwoByte>(segments);
+ }
+ break;
+ }
+ case SegmenterGranularity::Sentence: {
+ if (isLatin1) {
+ brk =
+ CreateBreakIterator<SentenceSegmenterBreakIteratorLatin1>(segments);
+ } else {
+ brk = CreateBreakIterator<SentenceSegmenterBreakIteratorTwoByte>(
+ segments);
+ }
+ break;
+ }
+ }
+
+ MOZ_RELEASE_ASSERT(brk);
+ segments->setBreakIterator(brk);
+
+ MOZ_ASSERT(segments->getIndex() == 0, "index is initially zero");
+
+ return true;
+#else
+ MOZ_CRASH("ICU4X disabled");
+#endif
+}
+
+/**
+ * Create the boundaries result array for self-hosted code.
+ */
+static ArrayObject* CreateBoundaries(JSContext* cx, Boundaries boundaries,
+ SegmenterGranularity granularity) {
+ auto [startIndex, endIndex, isWordLike] = boundaries;
+
+ auto* result = NewDenseFullyAllocatedArray(cx, 3);
+ if (!result) {
+ return nullptr;
+ }
+ result->setDenseInitializedLength(3);
+ result->initDenseElement(0, Int32Value(startIndex));
+ result->initDenseElement(1, Int32Value(endIndex));
+ if (granularity == SegmenterGranularity::Word) {
+ result->initDenseElement(2, BooleanValue(isWordLike));
+ } else {
+ result->initDenseElement(2, UndefinedValue());
+ }
+ return result;
+}
+
+template <typename T>
+static ArrayObject* FindSegmentBoundaries(JSContext* cx, Handle<T*> segments,
+ int32_t index) {
+ // Ensure break iteration can start at |index|.
+ if (!EnsureBreakIterator(cx, segments, index)) {
+ return nullptr;
+ }
+
+ // Find the actual segment boundaries.
+ Boundaries boundaries{};
+ switch (segments->getGranularity()) {
+ case SegmenterGranularity::Grapheme: {
+ boundaries = GraphemeBoundaries(segments, index);
+ break;
+ }
+ case SegmenterGranularity::Word: {
+ boundaries = WordBoundaries(segments, index);
+ break;
+ }
+ case SegmenterGranularity::Sentence: {
+ boundaries = SentenceBoundaries(segments, index);
+ break;
+ }
+ }
+
+ // Remember the end index of the current boundary segment.
+ segments->setIndex(boundaries.endIndex);
+
+ return CreateBoundaries(cx, boundaries, segments->getGranularity());
+}
+
+bool js::intl_CreateSegmentsObject(JSContext* cx, unsigned argc, Value* vp) {
+ CallArgs args = CallArgsFromVp(argc, vp);
+ MOZ_ASSERT(args.length() == 2);
+
+ Rooted<SegmenterObject*> segmenter(cx,
+ &args[0].toObject().as<SegmenterObject>());
+ Rooted<JSString*> string(cx, args[1].toString());
+
+ // Ensure the internal properties are resolved.
+ if (!EnsureInternalsResolved(cx, segmenter)) {
+ return false;
+ }
+
+ Rooted<JSObject*> proto(
+ cx, GlobalObject::getOrCreateSegmentsPrototype(cx, cx->global()));
+ if (!proto) {
+ return false;
+ }
+
+ auto* segments = NewObjectWithGivenProto<SegmentsObject>(cx, proto);
+ if (!segments) {
+ return false;
+ }
+
+ segments->setSegmenter(segmenter);
+ segments->setGranularity(segmenter->getGranularity());
+ segments->setString(string);
+ segments->setIndex(0);
+
+ args.rval().setObject(*segments);
+ return true;
+}
+
+bool js::intl_CreateSegmentIterator(JSContext* cx, unsigned argc, Value* vp) {
+ CallArgs args = CallArgsFromVp(argc, vp);
+ MOZ_ASSERT(args.length() == 1);
+
+ Rooted<SegmentsObject*> segments(cx,
+ &args[0].toObject().as<SegmentsObject>());
+
+ Rooted<JSObject*> proto(
+ cx, GlobalObject::getOrCreateSegmentIteratorPrototype(cx, cx->global()));
+ if (!proto) {
+ return false;
+ }
+
+ auto* iterator = NewObjectWithGivenProto<SegmentIteratorObject>(cx, proto);
+ if (!iterator) {
+ return false;
+ }
+
+ iterator->setSegmenter(segments->getSegmenter());
+ iterator->setGranularity(segments->getGranularity());
+ iterator->setString(segments->getString());
+ iterator->setIndex(0);
+
+ args.rval().setObject(*iterator);
+ return true;
+}
+
+bool js::intl_FindSegmentBoundaries(JSContext* cx, unsigned argc, Value* vp) {
+ CallArgs args = CallArgsFromVp(argc, vp);
+ MOZ_ASSERT(args.length() == 2);
+
+ Rooted<SegmentsObject*> segments(cx,
+ &args[0].toObject().as<SegmentsObject>());
+
+ int32_t index = args[1].toInt32();
+ MOZ_ASSERT(index >= 0);
+ MOZ_ASSERT(uint32_t(index) < segments->getString()->length());
+
+ auto* result = FindSegmentBoundaries(
+ cx, static_cast<Handle<SegmentsObject*>>(segments), index);
+ if (!result) {
+ return false;
+ }
+
+ args.rval().setObject(*result);
+ return true;
+}
+
+bool js::intl_FindNextSegmentBoundaries(JSContext* cx, unsigned argc,
+ Value* vp) {
+ CallArgs args = CallArgsFromVp(argc, vp);
+ MOZ_ASSERT(args.length() == 1);
+
+ Rooted<SegmentIteratorObject*> iterator(
+ cx, &args[0].toObject().as<SegmentIteratorObject>());
+
+ int32_t index = iterator->getIndex();
+ MOZ_ASSERT(index >= 0);
+ MOZ_ASSERT(uint32_t(index) < iterator->getString()->length());
+
+ auto* result = FindSegmentBoundaries(
+ cx, static_cast<Handle<SegmentIteratorObject*>>(iterator), index);
+ if (!result) {
+ return false;
+ }
+
+ args.rval().setObject(*result);
+ return true;
+}