summary refs log tree commit diff stats
path: root/js/src/tests/non262/Intl/Segmenter
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
commit: 26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree: f435a8308119effd964b339f76abb83a57c29483 /js/src/tests/non262/Intl/Segmenter
parent: Initial commit. (diff)
download: firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz
          firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
Adding upstream version 124.0.1. (tag: upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'js/src/tests/non262/Intl/Segmenter')
-rw-r--r--js/src/tests/non262/Intl/Segmenter/browser.js0
-rw-r--r--js/src/tests/non262/Intl/Segmenter/cross-compartment.js35
-rw-r--r--js/src/tests/non262/Intl/Segmenter/grapheme-latin1.js37
-rw-r--r--js/src/tests/non262/Intl/Segmenter/grapheme.js110
-rw-r--r--js/src/tests/non262/Intl/Segmenter/refresh-text-asan.js15
-rw-r--r--js/src/tests/non262/Intl/Segmenter/sentence-latin.js96
-rw-r--r--js/src/tests/non262/Intl/Segmenter/sentence.js137
-rw-r--r--js/src/tests/non262/Intl/Segmenter/shell.js0
-rw-r--r--js/src/tests/non262/Intl/Segmenter/surrogate-pair-split.js57
-rw-r--r--js/src/tests/non262/Intl/Segmenter/word-latin1.js215
-rw-r--r--js/src/tests/non262/Intl/Segmenter/word.js152
11 files changed, 854 insertions, 0 deletions
diff --git a/js/src/tests/non262/Intl/Segmenter/browser.js b/js/src/tests/non262/Intl/Segmenter/browser.js
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/browser.js
diff --git a/js/src/tests/non262/Intl/Segmenter/cross-compartment.js b/js/src/tests/non262/Intl/Segmenter/cross-compartment.js
new file mode 100644
index 0000000000..58845e6b62
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/cross-compartment.js
@@ -0,0 +1,35 @@
+// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+var g = newGlobal({});
+
+var segmenter = new Intl.Segmenter();
+var ccwSegmenter = new g.Intl.Segmenter();
+
+const SegmentsPrototype = Object.getPrototypeOf(segmenter.segment(""));
+const SegmentIteratorPrototype = Object.getPrototypeOf(segmenter.segment("")[Symbol.iterator]());
+
+// Intl.Segmenter.prototype.resolvedOptions ()
+var resolved1 = Intl.Segmenter.prototype.resolvedOptions.call(segmenter);
+var resolved2 = Intl.Segmenter.prototype.resolvedOptions.call(ccwSegmenter);
+assertDeepEq(resolved1, resolved2);
+
+// Intl.Segmenter.prototype.segment
+var seg1 = Intl.Segmenter.prototype.segment.call(segmenter, "This is a test.");
+var seg2 = Intl.Segmenter.prototype.segment.call(ccwSegmenter, "This is a test.");
+
+// %Segments.prototype%.containing ( index )
+var data1 = SegmentsPrototype.containing.call(seg1, 10);
+var data2 = SegmentsPrototype.containing.call(seg2, 10);
+assertDeepEq(data1, data2);
+
+// %Segments.prototype% [ @@iterator ] ()
+var iter1 = SegmentsPrototype[Symbol.iterator].call(seg1);
+var iter2 = SegmentsPrototype[Symbol.iterator].call(seg2);
+
+// %SegmentIterator.prototype%.next ()
+var result1 = SegmentIteratorPrototype.next.call(iter1);
+var result2 = SegmentIteratorPrototype.next.call(iter2);
+assertDeepEq(result1, result2);
+
+if (typeof reportCompare === "function")
+ reportCompare(0, 0);
diff --git a/js/src/tests/non262/Intl/Segmenter/grapheme-latin1.js b/js/src/tests/non262/Intl/Segmenter/grapheme-latin1.js
new file mode 100644
index 0000000000..3b54c02236
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/grapheme-latin1.js
@@ -0,0 +1,37 @@
+// |reftest| slow skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+// CRLF should be the only compound grapheme for Latin-1 strings.
+
+let segmenter = new Intl.Segmenter("en", {granularity: "grapheme"});
+
+for (let i = 0; i <= 0xff; ++i) {
+ for (let j = 0; j <= 0xff; ++j) {
+ let string = String.fromCodePoint(i, j);
+ let segments = segmenter.segment(string);
+
+ let data1 = segments.containing(0);
+ let data2 = segments.containing(1);
+ let graphemes = [...segments];
+
+ if (i === "\r".charCodeAt(0) && j === "\n".charCodeAt(0)) {
+ assertEq(data1.index, 0);
+ assertEq(data1.segment, "\r\n");
+
+ assertEq(data2.index, 0);
+ assertEq(data2.segment, "\r\n");
+
+ assertEq(graphemes.length, 1);
+ } else {
+ assertEq(data1.index, 0);
+ assertEq(data1.segment, String.fromCodePoint(i));
+
+ assertEq(data2.index, 1);
+ assertEq(data2.segment, String.fromCodePoint(j));
+
+ assertEq(graphemes.length, 2);
+ }
+ }
+}
+
+if (typeof reportCompare === "function")
+ reportCompare(0, 0);
diff --git a/js/src/tests/non262/Intl/Segmenter/grapheme.js b/js/src/tests/non262/Intl/Segmenter/grapheme.js
new file mode 100644
index 0000000000..c51de7f8d0
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/grapheme.js
@@ -0,0 +1,110 @@
+// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+// Grapheme boundaries are locale independent. Test with various locales to
+// ensure we get the same results.
+const locales = [
+ "en", "de", "fr", "ar", "ja", "zh", "th",
+];
+
+let strings = {
+ // Empty string
+ "": [],
+
+ // Ascii
+ "test": "test".split(""),
+ "hello world": "hello world".split(""),
+ "hello\0world": "hello\0world".split(""),
+ "\r\n": ["\r\n"],
+
+ // Latin-1
+ "äöü éèê µß \xff": "äöü éèê µß \xff".split(""),
+
+ // Two-Byte
+ "中文字": "中文字".split(""),
+
+ // Grapheme Clusters: https://www.unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters
+ "e\u0300": ["e\u0300"],
+ "\u1100\u1161\u11A8": ["\u1100\u1161\u11A8"], // Hangul syllable "gag"
+ "\u0E01\u0E33": ["\u0E01\u0E33"], // Thai kam
+ "\u0937\u093F": ["\u0937\u093F"], // Devanagari ssi
+
+ // Emojis
+ "\u263A\uFE0F": ["\u263A\uFE0F"], // Variant selector
+ "\u{1F385}\u{1F3FB}": ["\u{1F385}\u{1F3FB}"], // Skin tone selector
+ "\u{1F469}\u{1F3FD}\u{200D}\u{1F52C}": ["\u{1F469}\u{1F3FD}\u{200D}\u{1F52C}"], // ZWJ
+ "\u{1F469}\u{1F3FD}\u{200D}\u{1F52C}\u{FE0F}": ["\u{1F469}\u{1F3FD}\u{200D}\u{1F52C}\u{FE0F}"], // ZWJ + VS
+ "\u{1F926}\u{1F3FC}\u{200D}\u{2642}\u{FE0F}": ["\u{1F926}\u{1F3FC}\u{200D}\u{2642}\u{FE0F}"], // ZWJ + VS with BMP modifier
+ "\u{1F1E9}\u{1F1EA}": ["\u{1F1E9}\u{1F1EA}"], // Flags
+ "\u{1F3F4}\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}": ["\u{1F3F4}\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}"], // Subdivision flags
+};
+
+function assertIsSegmentDataObject(obj) {
+ // The prototype is %Object.prototype%.
+ assertEq(Object.getPrototypeOf(obj), Object.prototype);
+
+ // The Segment Data object has exactly three own properties.
+ let keys = Reflect.ownKeys(obj);
+ assertEq(keys.length, 3);
+ assertEq(keys[0], "segment");
+ assertEq(keys[1], "index");
+ assertEq(keys[2], "input");
+
+ // Ensure each property has the correct value type.
+ assertEq(typeof obj.segment, "string");
+ assertEq(typeof obj.index, "number");
+ assertEq(typeof obj.input, "string");
+
+ // |index| is an integer index into |string|.
+ assertEq(Number.isInteger(obj.index), true);
+ assertEq(obj.index >= 0, true);
+ assertEq(obj.index < obj.input.length, true);
+
+ // Segments are non-empty.
+ assertEq(obj.segment.length > 0, true);
+
+ // Ensure the segment is present in the input at the correct position.
+ assertEq(obj.input.substr(obj.index, obj.segment.length), obj.segment);
+}
+
+function segmentsFromContaining(segmenter, string) {
+ let segments = segmenter.segment(string);
+
+ let result = [];
+ for (let index = 0, data; (data = segments.containing(index)); index += data.segment.length) {
+ result.push(data);
+ }
+ return result;
+}
+
+for (let locale of locales) {
+ let segmenter = new Intl.Segmenter(locale, {granularity: "grapheme"});
+
+ let resolved = segmenter.resolvedOptions();
+ assertEq(resolved.locale, locale);
+ assertEq(resolved.granularity, "grapheme");
+
+ for (let [string, graphemes] of Object.entries(strings)) {
+ let segments = [...segmenter.segment(string)];
+
+ // Assert each segment is a valid Segment Data object.
+ segments.forEach(assertIsSegmentDataObject);
+
+ // Concatenating all segments should return the input.
+ assertEq(segments.reduce((acc, {segment}) => acc + segment, ""), string);
+
+ // The "input" property matches the original input string.
+ assertEq(segments.every(({input}) => input === string), true);
+
+ // The indices are sorted in ascending order.
+ assertEq(isNaN(segments.reduce((acc, {index}) => index > acc ? index : NaN, -Infinity)), false);
+
+ // The computed segments match the expected value.
+ assertEqArray(segments.map(({segment}) => segment), graphemes);
+
+ // Segment iteration and %Segments.prototype%.containing return the same results.
+ assertDeepEq(segmentsFromContaining(segmenter, string), segments);
+ }
+}
+
+if (typeof reportCompare === "function")
+ reportCompare(0, 0);
diff --git a/js/src/tests/non262/Intl/Segmenter/refresh-text-asan.js b/js/src/tests/non262/Intl/Segmenter/refresh-text-asan.js
new file mode 100644
index 0000000000..98ad7c56f5
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/refresh-text-asan.js
@@ -0,0 +1,15 @@
+// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+// Test fails in ASan builds when ubrk_refreshUText isn't called.
+
+let string = "A. ";
+
+let segmenter = new Intl.Segmenter(undefined, {granularity: "sentence"});
+let segments = segmenter.segment(string.repeat(100));
+
+for (let {segment} of segments) {
+ assertEq(segment, string);
+}
+
+if (typeof reportCompare === "function")
+ reportCompare(0, 0);
diff --git a/js/src/tests/non262/Intl/Segmenter/sentence-latin.js b/js/src/tests/non262/Intl/Segmenter/sentence-latin.js
new file mode 100644
index 0000000000..cebb029a40
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/sentence-latin.js
@@ -0,0 +1,96 @@
+// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+// https://www.unicode.org/reports/tr29/#Sentence_Boundary_Rules
+
+const strings = {
+ // SB1, SB2
+ "": [],
+
+ // SB3
+ "\r\n": ["\r\n"],
+
+ // SB4
+ "First paragraph.\nSecond paragraph.": ["First paragraph.\n", "Second paragraph."],
+ "First paragraph.\rSecond paragraph.": ["First paragraph.\r", "Second paragraph."],
+ "First paragraph.\r\nSecond paragraph.": ["First paragraph.\r\n", "Second paragraph."],
+ "First paragraph.\x85Second paragraph.": ["First paragraph.\x85", "Second paragraph."],
+
+ // SB5
+ "\xADWo\xADrd\xAD.\xAD": ["\xADWo\xADrd\xAD.\xAD"],
+ "Word.\n\xAD": ["Word.\n", "\xAD"],
+ "Word.\r\xAD\n": ["Word.\r", "\xAD\n"],
+
+ // SB6
+ ".2": [".2"],
+ "1.2": ["1.2"],
+ "!2": ["!", "2"],
+ "1!2": ["1!", "2"],
+
+ // SB7
+ "A.B": ["A.B"],
+ "a.B": ["a.B"],
+ "A. B": ["A. ", "B"],
+ "a. B": ["a. ", "B"],
+
+ // SB8
+ "#.a": ["#.a"],
+ "#. a": ["#. a"],
+ "#. # a": ["#. # a"],
+ "#. 1 a": ["#. 1 a"],
+ "#. , a": ["#. , a"],
+ "#. Aa": ["#. ", "Aa"],
+
+ // SB8a
+ "Word..": ["Word.."],
+ "Word . , ": ["Word . , "],
+ "Word.'\t , ": ["Word.'\t , "],
+
+ // SB9, SB10, SB11
+ "Word.''": ["Word.''"],
+ "Word.'\t ": ["Word.'\t "],
+ "Word.'\t \n": ["Word.'\t \n"],
+};
+
+function assertSegments(string, sentences) {
+ let seg = segmenter.segment(string);
+ let segments = [...seg];
+
+ // The computed segments match the expected value.
+ assertEqArray(segments.map(({segment}) => segment), sentences);
+
+ // |containing()| should return the same result.
+ for (let expected of segments) {
+ let {segment, index} = expected;
+ for (let i = index; i < index + segment.length; ++i) {
+ let actual = seg.containing(i);
+ assertDeepEq(actual, expected);
+ }
+ }
+}
+
+let segmenter = new Intl.Segmenter("en", {granularity: "sentence"});
+
+for (let [string, words] of Object.entries(strings)) {
+ assertSegments(string, words);
+}
+
+// Locale-dependent sentence segmentation.
+{
+ // https://en.wikipedia.org/wiki/Greek_question_mark#Greek_question_mark
+ let string = "A sentence; semicolon separated.";
+
+ let english = new Intl.Segmenter("en", {granularity: "sentence"});
+ let greek = new Intl.Segmenter("el", {granularity: "sentence"});
+
+ // A single sentence in English.
+ assertEq([...english.segment(string)].length, 1);
+
+ // ICU4C: Two sentences in Greek.
+ // assertEq([...greek.segment(string)].length, 2);
+
+ // ICU4X: A single sentence in Greek.
+ assertEq([...greek.segment(string)].length, 1);
+}
+
+if (typeof reportCompare === "function")
+ reportCompare(0, 0);
diff --git a/js/src/tests/non262/Intl/Segmenter/sentence.js b/js/src/tests/non262/Intl/Segmenter/sentence.js
new file mode 100644
index 0000000000..326d6f1b86
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/sentence.js
@@ -0,0 +1,137 @@
+// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+// Sentence boundaries can be locale dependent. The following locales don't use
+// any custom tailoring, so they should give the same results.
+const locales = [
+ "en", "de", "fr", "ar", "ja", "zh", "th",
+];
+
+let strings = {
+ // Empty string
+ "": [],
+
+ // Ascii
+ "This is an English sentence. And this is another one.": [
+ "This is an English sentence. ",
+ "And this is another one."
+ ],
+ "The colon: it doesn't start a new sentence.": [
+ "The colon: it doesn't start a new sentence."
+ ],
+
+ // Latin-1
+ "Unnötig umständlich Wörter überlegen. Und dann lästigerweise zu längeren Sätzen überarbeiten!": [
+ "Unnötig umständlich Wörter überlegen. ",
+ "Und dann lästigerweise zu längeren Sätzen überarbeiten!"
+ ],
+
+ // Two-Byte
+ // Source: https://ja.wikipedia.org/wiki/Unicode
+ "Unicode(ユニコード)は、符号化文字集合や文字符号化方式などを定めた、文字コードの業界規格。文字集合(文字セット)が単一の大規模文字セットであること(「Uni」という名はそれに由来する)などが特徴である。": [
+ "Unicode(ユニコード)は、符号化文字集合や文字符号化方式などを定めた、文字コードの業界規格。",
+ "文字集合(文字セット)が単一の大規模文字セットであること(「Uni」という名はそれに由来する)などが特徴である。"
+ ],
+};
+
+function assertIsSegmentDataObject(obj) {
+ // The prototype is %Object.prototype%.
+ assertEq(Object.getPrototypeOf(obj), Object.prototype);
+
+ // The Segment Data object has exactly three own properties.
+ let keys = Reflect.ownKeys(obj);
+ assertEq(keys.length, 3);
+ assertEq(keys[0], "segment");
+ assertEq(keys[1], "index");
+ assertEq(keys[2], "input");
+
+ // Ensure each property has the correct value type.
+ assertEq(typeof obj.segment, "string");
+ assertEq(typeof obj.index, "number");
+ assertEq(typeof obj.input, "string");
+
+ // |index| is an integer index into |string|.
+ assertEq(Number.isInteger(obj.index), true);
+ assertEq(obj.index >= 0, true);
+ assertEq(obj.index < obj.input.length, true);
+
+ // Segments are non-empty.
+ assertEq(obj.segment.length > 0, true);
+
+ // Ensure the segment is present in the input at the correct position.
+ assertEq(obj.input.substr(obj.index, obj.segment.length), obj.segment);
+}
+
+function segmentsFromContaining(segmenter, string) {
+ let segments = segmenter.segment(string);
+
+ let result = [];
+ for (let index = 0, data; (data = segments.containing(index)); index += data.segment.length) {
+ result.push(data);
+ }
+ return result;
+}
+
+for (let locale of locales) {
+ let segmenter = new Intl.Segmenter(locale, {granularity: "sentence"});
+
+ let resolved = segmenter.resolvedOptions();
+ assertEq(resolved.locale, locale);
+ assertEq(resolved.granularity, "sentence");
+
+ for (let [string, sentences] of Object.entries(strings)) {
+ let segments = [...segmenter.segment(string)];
+
+ // Assert each segment is a valid Segment Data object.
+ segments.forEach(assertIsSegmentDataObject);
+
+ // Concatenating all segments should return the input.
+ assertEq(segments.reduce((acc, {segment}) => acc + segment, ""), string);
+
+ // The "input" property matches the original input string.
+ assertEq(segments.every(({input}) => input === string), true);
+
+ // The indices are sorted in ascending order.
+ assertEq(isNaN(segments.reduce((acc, {index}) => index > acc ? index : NaN, -Infinity)), false);
+
+ // The computed segments match the expected value.
+ assertEqArray(segments.map(({segment}) => segment), sentences);
+
+ // Segment iteration and %Segments.prototype%.containing return the same results.
+ assertDeepEq(segmentsFromContaining(segmenter, string), segments);
+ }
+}
+
+// Sentence break suppressions through the "ss" Unicode extension key aren't supported.
+{
+ let segmenter = new Intl.Segmenter("en-u-ss-standard", {granularity: "sentence"});
+ assertEq(segmenter.resolvedOptions().locale, "en");
+
+ let segments = [...segmenter.segment("Dr. Strange is a fictional character.")];
+ assertEqArray(segments.map(({segment}) => segment),
+ ["Dr. ", "Strange is a fictional character."]);
+}
+
+// Locale-dependent sentence segmentation.
+{
+ // https://en.wikipedia.org/wiki/Greek_question_mark#Greek_question_mark
+ let string1 = "Από πού είσαι; Τί κάνεις;";
+ let string2 = string1.replaceAll(";", "\u037E"); // U+037E GREEK QUESTION MARK
+ assertEq(string1 !== string2, true);
+
+ for (let string of [string1, string2]) {
+ let english = new Intl.Segmenter("en", {granularity: "sentence"});
+ let greek = new Intl.Segmenter("el", {granularity: "sentence"});
+
+ // A single sentence in English.
+ assertEq([...english.segment(string)].length, 1);
+
+ // But two sentences in Greek.
+ //
+ // ICU4X doesn't support locale-specific tailoring:
+ // https://github.com/unicode-org/icu4x/issues/3284
+ // assertEq([...greek.segment(string)].length, 2);
+ }
+}
+
+if (typeof reportCompare === "function")
+ reportCompare(0, 0);
diff --git a/js/src/tests/non262/Intl/Segmenter/shell.js b/js/src/tests/non262/Intl/Segmenter/shell.js
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/shell.js
diff --git a/js/src/tests/non262/Intl/Segmenter/surrogate-pair-split.js b/js/src/tests/non262/Intl/Segmenter/surrogate-pair-split.js
new file mode 100644
index 0000000000..e7c9fcf727
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/surrogate-pair-split.js
@@ -0,0 +1,57 @@
+// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+// Calling %Segments.prototype%.containing in the middle of a surrogate pair
+// doubles back to the lead surrogate.
+
+// Grapheme
+{
+ let segmenter = new Intl.Segmenter(undefined, {granularity: "grapheme"});
+
+ let string = "\u{1F925}";
+ let segments = segmenter.segment(string);
+
+ let data1 = segments.containing(0);
+ let data2 = segments.containing(1);
+ let data3 = segments.containing(2);
+
+ assertEq(data1.segment, string);
+ assertDeepEq(data1, data2);
+ assertEq(data3, undefined);
+}
+
+// Word
+{
+ let segmenter = new Intl.Segmenter(undefined, {granularity: "word"});
+
+ let prefix = "Nothing to see here! ";
+ let string = "\u{1F925}";
+ let segments = segmenter.segment(prefix + string);
+
+ let data1 = segments.containing(prefix.length + 0);
+ let data2 = segments.containing(prefix.length + 1);
+ let data3 = segments.containing(prefix.length + 2);
+
+ assertEq(data1.segment, string);
+ assertDeepEq(data1, data2);
+ assertEq(data3, undefined);
+}
+
+// Sentence
+{
+ let segmenter = new Intl.Segmenter(undefined, {granularity: "sentence"});
+
+ let prefix = "Nothing to see here! Please disperse. ";
+ let string = "\u{1F925}";
+ let segments = segmenter.segment(prefix + string);
+
+ let data1 = segments.containing(prefix.length + 0);
+ let data2 = segments.containing(prefix.length + 1);
+ let data3 = segments.containing(prefix.length + 2);
+
+ assertEq(data1.segment, string);
+ assertDeepEq(data1, data2);
+ assertEq(data3, undefined);
+}
+
+if (typeof reportCompare === "function")
+ reportCompare(0, 0);
diff --git a/js/src/tests/non262/Intl/Segmenter/word-latin1.js b/js/src/tests/non262/Intl/Segmenter/word-latin1.js
new file mode 100644
index 0000000000..396947cc1c
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/word-latin1.js
@@ -0,0 +1,215 @@
+// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+// https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
+
+const strings = {
+ // WB1, WB2
+ "": [],
+
+ // WB3
+ "\r\n": ["\r\n"],
+
+ // WB3a, WB3b
+ "\n": ["\n"],
+ "\r": ["\r"],
+ "\v": ["\v"],
+ "\f": ["\f"],
+ "\x85": ["\x85"],
+
+ // WB3d
+ " ": [" "],
+ " ": [" "],
+
+ // WB4
+ "\xAD": ["\xAD"],
+ "\xAD\xAD": ["\xAD\xAD"],
+
+ // WB5
+ "a": ["a"],
+ "ab": ["ab"],
+
+ // WB6, WB7
+ "a:b": ["a:b"],
+ "a·b": ["a·b"],
+ "a.b": ["a.b"],
+ "a'b": ["a'b"],
+
+ // WB8
+ "1": ["1"],
+ "12": ["12"],
+
+ // WB9
+ "a1": ["a1"],
+
+ // WB10
+ "1a": ["1a"],
+
+ // WB11, WB12
+ "1,2": ["1,2"],
+ "1;2": ["1;2"],
+ "1.2": ["1.2"],
+ "1'2": ["1'2"],
+
+ // WB13a
+ "a_": ["a_"],
+ "1_": ["1_"],
+ "__": ["__"],
+
+ // WB13b
+ "_a": ["_a"],
+ "_1": ["_1"],
+
+ // WB999
+ "\0": ["\0"],
+ "?": ["?"],
+ "??": ["?", "?"],
+};
+
+function assertSegments(string, words) {
+ let seg = segmenter.segment(string);
+ let segments = [...seg];
+
+ // The computed segments match the expected value.
+ assertEqArray(segments.map(({segment}) => segment), words);
+
+ // |containing()| should return the same result.
+ for (let expected of segments) {
+ let {segment, index} = expected;
+ for (let i = index; i < index + segment.length; ++i) {
+ let actual = seg.containing(i);
+ assertDeepEq(actual, expected);
+ }
+ }
+}
+
+let segmenter = new Intl.Segmenter("en", {granularity: "word"});
+
+for (let [string, words] of Object.entries(strings)) {
+ assertSegments(string, words);
+}
+
+// WB3, WB3a, WB3b and WB4
+for (let string of ["\r\n", "\n", "\r", "\v", "\f", "\x85"]) {
+ assertSegments(string + "\xAD", [string, "\xAD"]);
+ assertSegments("\xAD" + string, ["\xAD", string]);
+}
+
+// WB3d and WB4
+for (let string of [" ", " "]) {
+ assertSegments(string + "\xAD", [string + "\xAD"]);
+ assertSegments("\xAD" + string, ["\xAD", string]);
+}
+assertSegments(" \xAD ", [" \xAD", " "]);
+assertSegments(" \xAD\xAD ", [" \xAD\xAD", " "]);
+
+// WB5-WB13 and WB4
+for (let string of [
+ // WB5
+ "a", "ab",
+
+ // WB6, WB7
+ "a:b",
+ "a·b",
+ "a.b",
+ "a'b",
+
+ // WB8
+ "1",
+ "12",
+
+ // WB9
+ "a1",
+
+ // WB10
+ "1a",
+
+ // WB11, WB12
+ "1,2",
+ "1;2",
+ "1.2",
+ "1'2",
+
+ // WB13a
+ "a_",
+ "1_",
+ "__",
+
+ // WB13b
+ "_a",
+ "_1",
+
+ // WB999
+ "?",
+]) {
+ assertSegments(string + "\xAD", [string + "\xAD"]);
+ assertSegments("\xAD" + string, ["\xAD", string]);
+
+ if (string === "a.b") {
+ // ICU4X incorrectly splits the result into three words.
+ // https://github.com/unicode-org/icu4x/issues/4417
+ assertSegments(string.split("").join("\xAD"), ["a\xAD", ".\xAD", "b"]);
+ assertSegments(string.split("").join("\xAD\xAD"), ["a\xAD\xAD", ".\xAD\xAD", "b"]);
+ } else {
+ assertSegments(string.split("").join("\xAD"), [string.split("").join("\xAD")]);
+ assertSegments(string.split("").join("\xAD\xAD"), [string.split("").join("\xAD\xAD")]);
+ }
+}
+
+assertSegments("?\xAD?", ["?\xAD", "?"]);
+
+for (let string of [
+ // WB6, WB7
+ "a:b",
+ "a·b",
+ "a.b",
+ "a'b",
+
+ // WB11, WB12
+ "1,2",
+ "1;2",
+ "1.2",
+ "1'2",
+]) {
+ let prefix = string.slice(0, -1);
+ let suffix = string.slice(1);
+
+ assertSegments(prefix, prefix.split(""));
+ assertSegments(suffix, suffix.split(""));
+}
+
+// MidNum with ALetter
+assertSegments("a,b", ["a", ",", "b"]);
+assertSegments("a;b", ["a", ";", "b"]);
+
+// MidLetter with Numeric
+assertSegments("1:2", ["1", ":", "2"]);
+assertSegments("1·2", ["1", "·", "2"]);
+
+// MidNumLet with mixed ALetter and Numeric
+assertSegments("a.2", ["a", ".", "2"]);
+assertSegments("1.b", ["1", ".", "b"]);
+assertSegments("a'2", ["a", "'", "2"]);
+assertSegments("1'b", ["1", "'", "b"]);
+
+// MidNum with ExtendNumLet
+assertSegments("_,_", ["_", ",", "_"]);
+assertSegments("_;_", ["_", ";", "_"]);
+
+// MidLetter with ExtendNumLet
+assertSegments("_:_", ["_", ":", "_"]);
+assertSegments("_·_", ["_", "·", "_"]);
+
+// MidNumLet with ExtendNumLet
+assertSegments("_._", ["_", ".", "_"]);
+assertSegments("_'_", ["_", "'", "_"]);
+
+// CLDR has locale-dependent word segmentation for the "en-posix" locale. This
+// locale is currently not selectable, so the Latin-1 fast-paths don't need to
+// implement it. If one of the two below assertions ever fail, please update
+// the Latin-1 fast-paths for word segmentation to implement the "en-posix"
+// changes.
+assertEq(new Intl.Segmenter("en-posix").resolvedOptions().locale, "en");
+assertEq(new Intl.Segmenter("en-u-va-posix").resolvedOptions().locale, "en");
+
+if (typeof reportCompare === "function")
+ reportCompare(0, 0);
diff --git a/js/src/tests/non262/Intl/Segmenter/word.js b/js/src/tests/non262/Intl/Segmenter/word.js
new file mode 100644
index 0000000000..5b3e1747a3
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/word.js
@@ -0,0 +1,152 @@
+// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+// Word boundaries are locale independent. Test with various locales to ensure
+// we get the same results.
+const locales = [
+ "en", "de", "fr", "ar", "ja", "zh", "th",
+];
+
+let strings = {
+ // Empty string
+ "": [],
+
+ // Ascii
+ "This is an English sentence.": [
+ "This", " ", "is", " ", "an", " ", "English", " ", "sentence", "."
+ ],
+ "Moi? N'est-ce pas.": [
+ "Moi", "?", " ", "N'est", "-", "ce", " ", "pas", "."
+ ],
+
+ // Latin-1
+ "Unnötig umständlich Wörter überlegen.": [
+ "Unnötig", " ", "umständlich", " ", "Wörter", " ", "überlegen", "."
+ ],
+
+ // Two-Byte
+ // Source: https://en.wikipedia.org/wiki/Japanese_writing_system#Examples
+ "ラドクリフ、マラソン五輪代表に 1万メートル出場にも含み。": [
+ "ラドクリフ", "、", "マラソン", "五輪", "代表", "に", " ", "1", "万", "メートル", "出場", "に", "も", "含み", "。"
+ ],
+
+ // From: Language Sense and Ambiguity in Thai
+ // Source: https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.98.118
+ "ขนบนอก": [
+ // According to the paper this should instead be separated into ขน|บน|อก.
+ "ขนบ", "นอก"
+ ],
+ "พนักงานนําโคลงเรือสามตัว": [
+ // Expected segmentation is พนักงาน|นํา|โค|ลง|เรือ|สาม|ตัว.
+
+ // ICU4C segmentation:
+ // "พนัก", "งาน", "นํา", "โคลง", "เรือ", "สาม", "ตัว"
+
+ // ICU4X segmentation:
+ "พ", "นัก", "งานนํา", "โคลง", "เรือ", "สาม", "ตัว"
+ ],
+
+ "หมอหุงขาวสวยด": [
+ // Has three possible segmentations:
+    // หมอหุงขาว|สวย|ด
+    // หมอ|หุง|ขาวสวย|ด
+    // หมอ|หุง|ขาว|สวย|ด
+
+ // ICU4C segmentation:
+ // "หมอ", "หุง", "ขาว", "สวย", "ด"
+
+ // ICU4X segmentation:
+ "หมอ", "หุง", "ขาว", "สวยด"
+ ],
+
+ // From: Thoughts on Word and Sentence Segmentation in Thai
+ // Source: https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.63.7038
+ "หนังสือรวมบทความทางวิชาการในการประชุมสัมมนา": [
+ "หนังสือ", "รวม", "บทความ", "ทาง", "วิชาการ", "ใน", "การ", "ประชุม", "สัมมนา"
+ ],
+};
+
+function assertIsSegmentDataObject(obj) {
+ // The prototype is %Object.prototype%.
+ assertEq(Object.getPrototypeOf(obj), Object.prototype);
+
+ // The Segment Data object has exactly four own properties.
+ let keys = Reflect.ownKeys(obj);
+ assertEq(keys.length, 4);
+ assertEq(keys[0], "segment");
+ assertEq(keys[1], "index");
+ assertEq(keys[2], "input");
+ assertEq(keys[3], "isWordLike");
+
+ // Ensure each property has the correct value type.
+ assertEq(typeof obj.segment, "string");
+ assertEq(typeof obj.index, "number");
+ assertEq(typeof obj.input, "string");
+ assertEq(typeof obj.isWordLike, "boolean");
+
+ // |index| is an integer index into |string|.
+ assertEq(Number.isInteger(obj.index), true);
+ assertEq(obj.index >= 0, true);
+ assertEq(obj.index < obj.input.length, true);
+
+ // Segments are non-empty.
+ assertEq(obj.segment.length > 0, true);
+
+ // Ensure the segment is present in the input at the correct position.
+ assertEq(obj.input.substr(obj.index, obj.segment.length), obj.segment);
+
+ // The non-word parts in the samples are either punctuators or space separators.
+ let expectedWordLike = !/^(\p{gc=P}|\p{gc=Zs})+$/u.test(obj.segment);
+
+ // ICU4X incorrectly marks the last segment as non-word like for Thai.
+ // https://github.com/unicode-org/icu4x/issues/4446
+ let isThai = /^\p{sc=Thai}+$/u.test(obj.segment);
+ let isLastSegment = obj.index + obj.segment.length === obj.input.length;
+ if (isThai && isLastSegment) {
+ expectedWordLike = false;
+ }
+
+ assertEq(obj.isWordLike, expectedWordLike, obj.segment);
+}
+
+function segmentsFromContaining(segmenter, string) {
+ let segments = segmenter.segment(string);
+
+ let result = [];
+ for (let index = 0, data; (data = segments.containing(index)); index += data.segment.length) {
+ result.push(data);
+ }
+ return result;
+}
+
+for (let locale of locales) {
+ let segmenter = new Intl.Segmenter(locale, {granularity: "word"});
+
+ let resolved = segmenter.resolvedOptions();
+ assertEq(resolved.locale, locale);
+ assertEq(resolved.granularity, "word");
+
+ for (let [string, words] of Object.entries(strings)) {
+ let segments = [...segmenter.segment(string)];
+
+ // Assert each segment is a valid Segment Data object.
+ segments.forEach(assertIsSegmentDataObject);
+
+ // Concatenating all segments should return the input.
+ assertEq(segments.reduce((acc, {segment}) => acc + segment, ""), string);
+
+ // The "input" property matches the original input string.
+ assertEq(segments.every(({input}) => input === string), true);
+
+ // The indices are sorted in ascending order.
+ assertEq(isNaN(segments.reduce((acc, {index}) => index > acc ? index : NaN, -Infinity)), false);
+
+ // The computed segments match the expected value.
+ assertEqArray(segments.map(({segment}) => segment), words);
+
+ // Segment iteration and %Segments.prototype%.containing return the same results.
+ assertDeepEq(segmentsFromContaining(segmenter, string), segments);
+ }
+}
+
+if (typeof reportCompare === "function")
+ reportCompare(0, 0);