author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000
commit | 26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree | f435a8308119effd964b339f76abb83a57c29483 | /js/src/tests/non262/Intl/Segmenter
parent | Initial commit. (diff)
download | firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz, firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
Adding upstream version 124.0.1. (upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'js/src/tests/non262/Intl/Segmenter')
-rw-r--r-- | js/src/tests/non262/Intl/Segmenter/browser.js | 0
-rw-r--r-- | js/src/tests/non262/Intl/Segmenter/cross-compartment.js | 35
-rw-r--r-- | js/src/tests/non262/Intl/Segmenter/grapheme-latin1.js | 37
-rw-r--r-- | js/src/tests/non262/Intl/Segmenter/grapheme.js | 110
-rw-r--r-- | js/src/tests/non262/Intl/Segmenter/refresh-text-asan.js | 15
-rw-r--r-- | js/src/tests/non262/Intl/Segmenter/sentence-latin.js | 96
-rw-r--r-- | js/src/tests/non262/Intl/Segmenter/sentence.js | 137
-rw-r--r-- | js/src/tests/non262/Intl/Segmenter/shell.js | 0
-rw-r--r-- | js/src/tests/non262/Intl/Segmenter/surrogate-pair-split.js | 57
-rw-r--r-- | js/src/tests/non262/Intl/Segmenter/word-latin1.js | 215
-rw-r--r-- | js/src/tests/non262/Intl/Segmenter/word.js | 152
11 files changed, 854 insertions, 0 deletions
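For readers skimming the patch, here is a minimal sketch (not part of the patch) of the Intl.Segmenter surface these tests exercise: the constructor's granularity option, the iterable Segments object returned by segment(), and %Segments.prototype%.containing.

```js
// Illustrative only; not part of the patch. Assumes an engine with Intl.Segmenter support.
const seg = new Intl.Segmenter("en", {granularity: "word"});
const segments = seg.segment("Hello, world!");

// Iteration yields segment data objects: {segment, index, input, isWordLike}.
for (const {segment, index, isWordLike} of segments) {
  console.log(index, JSON.stringify(segment), isWordLike);
}

// containing(i) returns the segment data object covering index i,
// or undefined when i is out of range.
console.log(segments.containing(7));   // {segment: "world", index: 7, input: "Hello, world!", isWordLike: true}
console.log(segments.containing(99));  // undefined
```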
diff --git a/js/src/tests/non262/Intl/Segmenter/browser.js b/js/src/tests/non262/Intl/Segmenter/browser.js
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/browser.js
diff --git a/js/src/tests/non262/Intl/Segmenter/cross-compartment.js b/js/src/tests/non262/Intl/Segmenter/cross-compartment.js
new file mode 100644
index 0000000000..58845e6b62
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/cross-compartment.js
@@ -0,0 +1,35 @@
+// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+var g = newGlobal({});
+
+var segmenter = new Intl.Segmenter();
+var ccwSegmenter = new g.Intl.Segmenter();
+
+const SegmentsPrototype = Object.getPrototypeOf(segmenter.segment(""));
+const SegmentIteratorPrototype = Object.getPrototypeOf(segmenter.segment("")[Symbol.iterator]());
+
+// Intl.Segmenter.prototype.resolvedOptions ()
+var resolved1 = Intl.Segmenter.prototype.resolvedOptions.call(segmenter);
+var resolved2 = Intl.Segmenter.prototype.resolvedOptions.call(ccwSegmenter);
+assertDeepEq(resolved1, resolved2);
+
+// Intl.Segmenter.prototype.segment
+var seg1 = Intl.Segmenter.prototype.segment.call(segmenter, "This is a test.");
+var seg2 = Intl.Segmenter.prototype.segment.call(ccwSegmenter, "This is a test.");
+
+// %Segments.prototype%.containing ( index )
+var data1 = SegmentsPrototype.containing.call(seg1, 10);
+var data2 = SegmentsPrototype.containing.call(seg2, 10);
+assertDeepEq(data1, data2);
+
+// %Segments.prototype% [ @@iterator ] ()
+var iter1 = SegmentsPrototype[Symbol.iterator].call(seg1);
+var iter2 = SegmentsPrototype[Symbol.iterator].call(seg2);
+
+// %SegmentIterator.prototype%.next ()
+var result1 = SegmentIteratorPrototype.next.call(iter1);
+var result2 = SegmentIteratorPrototype.next.call(iter2);
+assertDeepEq(result1, result2);
+
+if (typeof reportCompare === "function")
+  reportCompare(0, 0);
diff --git a/js/src/tests/non262/Intl/Segmenter/grapheme-latin1.js b/js/src/tests/non262/Intl/Segmenter/grapheme-latin1.js
new file mode 100644
index 0000000000..3b54c02236
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/grapheme-latin1.js
@@ -0,0 +1,37 @@
+// |reftest| slow skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+// CRLF should be the only compound grapheme for Latin-1 strings.
+
+let segmenter = new Intl.Segmenter("en", {granularity: "grapheme"});
+
+for (let i = 0; i <= 0xff; ++i) {
+  for (let j = 0; j <= 0xff; ++j) {
+    let string = String.fromCodePoint(i, j);
+    let segments = segmenter.segment(string);
+
+    let data1 = segments.containing(0);
+    let data2 = segments.containing(1);
+    let graphemes = [...segments];
+
+    if (i === "\r".charCodeAt(0) && j === "\n".charCodeAt(0)) {
+      assertEq(data1.index, 0);
+      assertEq(data1.segment, "\r\n");
+
+      assertEq(data2.index, 0);
+      assertEq(data2.segment, "\r\n");
+
+      assertEq(graphemes.length, 1);
+    } else {
+      assertEq(data1.index, 0);
+      assertEq(data1.segment, String.fromCodePoint(i));
+
+      assertEq(data2.index, 1);
+      assertEq(data2.segment, String.fromCodePoint(j));
+
+      assertEq(graphemes.length, 2);
+    }
+  }
+}
+
+if (typeof reportCompare === "function")
+  reportCompare(0, 0);
diff --git a/js/src/tests/non262/Intl/Segmenter/grapheme.js b/js/src/tests/non262/Intl/Segmenter/grapheme.js
new file mode 100644
index 0000000000..c51de7f8d0
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/grapheme.js
@@ -0,0 +1,110 @@
+// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+// Grapheme boundaries are locale independent. Test with various locales to
+// ensure we get the same results.
+const locales = [
+  "en", "de", "fr", "ar", "ja", "zh", "th",
+];
+
+let strings = {
+  // Empty string
+  "": [],
+
+  // Ascii
+  "test": "test".split(""),
+  "hello world": "hello world".split(""),
+  "hello\0world": "hello\0world".split(""),
+  "\r\n": ["\r\n"],
+
+  // Latin-1
+  "äöü éèê µß \xff": "äöü éèê µß \xff".split(""),
+
+  // Two-Byte
+  "中文字": "中文字".split(""),
+
+  // Grapheme Clusters: https://www.unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters
+  "e\u0300": ["e\u0300"],
+  "\u1100\u1161\u11A8": ["\u1100\u1161\u11A8"], // Hangul syllable "gag"
+  "\u0E01\u0E33": ["\u0E01\u0E33"], // Thai kam
+  "\u0937\u093F": ["\u0937\u093F"], // Devanagari ssi
+
+  // Emojis
+  "\u263A\uFE0F": ["\u263A\uFE0F"], // Variant selector
+  "\u{1F385}\u{1F3FB}": ["\u{1F385}\u{1F3FB}"], // Skin tone selector
+  "\u{1F469}\u{1F3FD}\u{200D}\u{1F52C}": ["\u{1F469}\u{1F3FD}\u{200D}\u{1F52C}"], // ZWJ
+  "\u{1F469}\u{1F3FD}\u{200D}\u{1F52C}\u{FE0F}": ["\u{1F469}\u{1F3FD}\u{200D}\u{1F52C}\u{FE0F}"], // ZWJ + VS
+  "\u{1F926}\u{1F3FC}\u{200D}\u{2642}\u{FE0F}": ["\u{1F926}\u{1F3FC}\u{200D}\u{2642}\u{FE0F}"], // ZWJ + VS with BMP modifier
+  "\u{1F1E9}\u{1F1EA}": ["\u{1F1E9}\u{1F1EA}"], // Flags
+  "\u{1F3F4}\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}": ["\u{1F3F4}\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}"], // Subdivision flags
+};
+
+function assertIsSegmentDataObject(obj) {
+  // The prototype is %Object.prototype%.
+  assertEq(Object.getPrototypeOf(obj), Object.prototype);
+
+  // The Segment Data object has exactly three own properties.
+  let keys = Reflect.ownKeys(obj);
+  assertEq(keys.length, 3);
+  assertEq(keys[0], "segment");
+  assertEq(keys[1], "index");
+  assertEq(keys[2], "input");
+
+  // Ensure each property has the correct value type.
+  assertEq(typeof obj.segment, "string");
+  assertEq(typeof obj.index, "number");
+  assertEq(typeof obj.input, "string");
+
+  // |index| is an integer index into |string|.
+  assertEq(Number.isInteger(obj.index), true);
+  assertEq(obj.index >= 0, true);
+  assertEq(obj.index < obj.input.length, true);
+
+  // Segments are non-empty.
+  assertEq(obj.segment.length > 0, true);
+
+  // Ensure the segment is present in the input at the correct position.
+  assertEq(obj.input.substr(obj.index, obj.segment.length), obj.segment);
+}
+
+function segmentsFromContaining(segmenter, string) {
+  let segments = segmenter.segment(string);
+
+  let result = [];
+  for (let index = 0, data; (data = segments.containing(index)); index += data.segment.length) {
+    result.push(data);
+  }
+  return result;
+}
+
+for (let locale of locales) {
+  let segmenter = new Intl.Segmenter(locale, {granularity: "grapheme"});
+
+  let resolved = segmenter.resolvedOptions();
+  assertEq(resolved.locale, locale);
+  assertEq(resolved.granularity, "grapheme");
+
+  for (let [string, graphemes] of Object.entries(strings)) {
+    let segments = [...segmenter.segment(string)];
+
+    // Assert each segment is a valid Segment Data object.
+    segments.forEach(assertIsSegmentDataObject);
+
+    // Concatenating all segments should return the input.
+    assertEq(segments.reduce((acc, {segment}) => acc + segment, ""), string);
+
+    // The "input" property matches the original input string.
+    assertEq(segments.every(({input}) => input === string), true);
+
+    // The indices are sorted in ascending order.
+    assertEq(isNaN(segments.reduce((acc, {index}) => index > acc ? index : NaN, -Infinity)), false);
+
+    // The computed segments match the expected value.
+    assertEqArray(segments.map(({segment}) => segment), graphemes);
+
+    // Segment iteration and %Segments.prototype%.containing return the same results.
+    assertDeepEq(segmentsFromContaining(segmenter, string), segments);
+  }
+}
+
+if (typeof reportCompare === "function")
+  reportCompare(0, 0);
diff --git a/js/src/tests/non262/Intl/Segmenter/refresh-text-asan.js b/js/src/tests/non262/Intl/Segmenter/refresh-text-asan.js
new file mode 100644
index 0000000000..98ad7c56f5
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/refresh-text-asan.js
@@ -0,0 +1,15 @@
+// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+// Test fails in ASan builds when ubrk_refreshUText isn't called.
+
+let string = "A. ";
+
+let segmenter = new Intl.Segmenter(undefined, {granularity: "sentence"});
+let segments = segmenter.segment(string.repeat(100));
+
+for (let {segment} of segments) {
+  assertEq(segment, string);
+}
+
+if (typeof reportCompare === "function")
+  reportCompare(0, 0);
diff --git a/js/src/tests/non262/Intl/Segmenter/sentence-latin.js b/js/src/tests/non262/Intl/Segmenter/sentence-latin.js
new file mode 100644
index 0000000000..cebb029a40
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/sentence-latin.js
@@ -0,0 +1,96 @@
+// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+// https://www.unicode.org/reports/tr29/#Sentence_Boundary_Rules
+
+const strings = {
+  // SB1, SB2
+  "": [],
+
+  // SB3
+  "\r\n": ["\r\n"],
+
+  // SB4
+  "First paragraph.\nSecond paragraph.": ["First paragraph.\n", "Second paragraph."],
+  "First paragraph.\rSecond paragraph.": ["First paragraph.\r", "Second paragraph."],
+  "First paragraph.\r\nSecond paragraph.": ["First paragraph.\r\n", "Second paragraph."],
+  "First paragraph.\x85Second paragraph.": ["First paragraph.\x85", "Second paragraph."],
+
+  // SB5
+  "\xADWo\xADrd\xAD.\xAD": ["\xADWo\xADrd\xAD.\xAD"],
+  "Word.\n\xAD": ["Word.\n", "\xAD"],
+  "Word.\r\xAD\n": ["Word.\r", "\xAD\n"],
+
+  // SB6
+  ".2": [".2"],
+  "1.2": ["1.2"],
+  "!2": ["!", "2"],
+  "1!2": ["1!", "2"],
+
+  // SB7
+  "A.B": ["A.B"],
+  "a.B": ["a.B"],
+  "A. B": ["A. ", "B"],
+  "a. B": ["a. ", "B"],
+
+  // SB8
+  "#.a": ["#.a"],
+  "#. a": ["#. a"],
+  "#. # a": ["#. # a"],
+  "#. 1 a": ["#. 1 a"],
+  "#. , a": ["#. , a"],
+  "#. Aa": ["#. ", "Aa"],
+
+  // SB8a
+  "Word..": ["Word.."],
+  "Word . , ": ["Word . , "],
+  "Word.'\t , ": ["Word.'\t , "],
+
+  // SB9, SB10, SB11
+  "Word.''": ["Word.''"],
+  "Word.'\t ": ["Word.'\t "],
+  "Word.'\t \n": ["Word.'\t \n"],
+};
+
+function assertSegments(string, sentences) {
+  let seg = segmenter.segment(string);
+  let segments = [...seg];
+
+  // The computed segments match the expected value.
+  assertEqArray(segments.map(({segment}) => segment), sentences);
+
+  // |containing()| should return the same result.
+  for (let expected of segments) {
+    let {segment, index} = expected;
+    for (let i = index; i < index + segment.length; ++i) {
+      let actual = seg.containing(i);
+      assertDeepEq(actual, expected);
+    }
+  }
+}
+
+let segmenter = new Intl.Segmenter("en", {granularity: "sentence"});
+
+for (let [string, words] of Object.entries(strings)) {
+  assertSegments(string, words);
+}
+
+// Locale-dependent sentence segmentation.
+{
+  // https://en.wikipedia.org/wiki/Greek_question_mark#Greek_question_mark
+  let string = "A sentence; semicolon separated.";
+
+  let english = new Intl.Segmenter("en", {granularity: "sentence"});
+  let greek = new Intl.Segmenter("el", {granularity: "sentence"});
+
+  // A single sentence in English.
+  assertEq([...english.segment(string)].length, 1);
+
+  // ICU4C: Two sentences in Greek.
+  // assertEq([...greek.segment(string)].length, 2);
+
+  // ICU4X: A single sentence in Greek.
+  assertEq([...greek.segment(string)].length, 1);
+}
+
+if (typeof reportCompare === "function")
+  reportCompare(0, 0);
diff --git a/js/src/tests/non262/Intl/Segmenter/sentence.js b/js/src/tests/non262/Intl/Segmenter/sentence.js
new file mode 100644
index 0000000000..326d6f1b86
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/sentence.js
@@ -0,0 +1,137 @@
+// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+// Sentence boundaries can be locale dependent. The following locales don't use
+// any custom tailoring, so they should give the same results.
+const locales = [
+  "en", "de", "fr", "ar", "ja", "zh", "th",
+];
+
+let strings = {
+  // Empty string
+  "": [],
+
+  // Ascii
+  "This is an English sentence. And this is another one.": [
+    "This is an English sentence. ",
+    "And this is another one."
+  ],
+  "The colon: it doesn't start a new sentence.": [
+    "The colon: it doesn't start a new sentence."
+  ],
+
+  // Latin-1
+  "Unnötig umständlich Wörter überlegen. Und dann lästigerweise zu längeren Sätzen überarbeiten!": [
+    "Unnötig umständlich Wörter überlegen. ",
+    "Und dann lästigerweise zu längeren Sätzen überarbeiten!"
+  ],
+
+  // Two-Byte
+  // Source: https://ja.wikipedia.org/wiki/Unicode
+  "Unicode(ユニコード)は、符号化文字集合や文字符号化方式などを定めた、文字コードの業界規格。文字集合(文字セット)が単一の大規模文字セットであること(「Uni」という名はそれに由来する)などが特徴である。": [
+    "Unicode(ユニコード)は、符号化文字集合や文字符号化方式などを定めた、文字コードの業界規格。",
+    "文字集合(文字セット)が単一の大規模文字セットであること(「Uni」という名はそれに由来する)などが特徴である。"
+  ],
+};
+
+function assertIsSegmentDataObject(obj) {
+  // The prototype is %Object.prototype%.
+  assertEq(Object.getPrototypeOf(obj), Object.prototype);
+
+  // The Segment Data object has exactly three own properties.
+  let keys = Reflect.ownKeys(obj);
+  assertEq(keys.length, 3);
+  assertEq(keys[0], "segment");
+  assertEq(keys[1], "index");
+  assertEq(keys[2], "input");
+
+  // Ensure each property has the correct value type.
+  assertEq(typeof obj.segment, "string");
+  assertEq(typeof obj.index, "number");
+  assertEq(typeof obj.input, "string");
+
+  // |index| is an integer index into |string|.
+  assertEq(Number.isInteger(obj.index), true);
+  assertEq(obj.index >= 0, true);
+  assertEq(obj.index < obj.input.length, true);
+
+  // Segments are non-empty.
+  assertEq(obj.segment.length > 0, true);
+
+  // Ensure the segment is present in the input at the correct position.
+  assertEq(obj.input.substr(obj.index, obj.segment.length), obj.segment);
+}
+
+function segmentsFromContaining(segmenter, string) {
+  let segments = segmenter.segment(string);
+
+  let result = [];
+  for (let index = 0, data; (data = segments.containing(index)); index += data.segment.length) {
+    result.push(data);
+  }
+  return result;
+}
+
+for (let locale of locales) {
+  let segmenter = new Intl.Segmenter(locale, {granularity: "sentence"});
+
+  let resolved = segmenter.resolvedOptions();
+  assertEq(resolved.locale, locale);
+  assertEq(resolved.granularity, "sentence");
+
+  for (let [string, sentences] of Object.entries(strings)) {
+    let segments = [...segmenter.segment(string)];
+
+    // Assert each segment is a valid Segment Data object.
+    segments.forEach(assertIsSegmentDataObject);
+
+    // Concatenating all segments should return the input.
+    assertEq(segments.reduce((acc, {segment}) => acc + segment, ""), string);
+
+    // The "input" property matches the original input string.
+    assertEq(segments.every(({input}) => input === string), true);
+
+    // The indices are sorted in ascending order.
+    assertEq(isNaN(segments.reduce((acc, {index}) => index > acc ? index : NaN, -Infinity)), false);
+
+    // The computed segments match the expected value.
+    assertEqArray(segments.map(({segment}) => segment), sentences);
+
+    // Segment iteration and %Segments.prototype%.containing return the same results.
+    assertDeepEq(segmentsFromContaining(segmenter, string), segments);
+  }
+}
+
+// Sentence break suppressions through the "ss" Unicode extension key aren't supported.
+{
+  let segmenter = new Intl.Segmenter("en-u-ss-standard", {granularity: "sentence"});
+  assertEq(segmenter.resolvedOptions().locale, "en");
+
+  let segments = [...segmenter.segment("Dr. Strange is a fictional character.")];
+  assertEqArray(segments.map(({segment}) => segment),
+                ["Dr. ", "Strange is a fictional character."]);
+}
+
+// Locale-dependent sentence segmentation.
+{
+  // https://en.wikipedia.org/wiki/Greek_question_mark#Greek_question_mark
+  let string1 = "Από πού είσαι; Τί κάνεις;";
+  let string2 = string1.replaceAll(";", "\u037E"); // U+037E GREEK QUESTION MARK
+  assertEq(string1 !== string2, true);
+
+  for (let string of [string1, string2]) {
+    let english = new Intl.Segmenter("en", {granularity: "sentence"});
+    let greek = new Intl.Segmenter("el", {granularity: "sentence"});
+
+    // A single sentence in English.
+    assertEq([...english.segment(string)].length, 1);
+
+    // But two sentences in Greek.
+    //
+    // ICU4X doesn't support locale-specific tailoring:
+    // https://github.com/unicode-org/icu4x/issues/3284
+    // assertEq([...greek.segment(string)].length, 2);
+  }
+}
+
+if (typeof reportCompare === "function")
+  reportCompare(0, 0);
diff --git a/js/src/tests/non262/Intl/Segmenter/shell.js b/js/src/tests/non262/Intl/Segmenter/shell.js
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/shell.js
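The next file exercises %Segments.prototype%.containing with an index that points at the trail surrogate of a surrogate pair; the returned segment data starts back at the lead surrogate. A minimal illustration (not part of the patch) of that behavior:

```js
// Illustrative only; not part of the patch.
const seg = new Intl.Segmenter(undefined, {granularity: "grapheme"});
const segments = seg.segment("\u{1F925}"); // one non-BMP code point, two UTF-16 code units

// Both code-unit indices resolve to the same segment starting at index 0.
console.log(segments.containing(0).index); // 0
console.log(segments.containing(1).index); // 0
console.log(segments.containing(2));       // undefined (past the end)
```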
"; + let string = "\u{1F925}"; + let segments = segmenter.segment(prefix + string); + + let data1 = segments.containing(prefix.length + 0); + let data2 = segments.containing(prefix.length + 1); + let data3 = segments.containing(prefix.length + 2); + + assertEq(data1.segment, string); + assertDeepEq(data1, data2); + assertEq(data3, undefined); +} + +if (typeof reportCompare === "function") + reportCompare(0, 0); diff --git a/js/src/tests/non262/Intl/Segmenter/word-latin1.js b/js/src/tests/non262/Intl/Segmenter/word-latin1.js new file mode 100644 index 0000000000..396947cc1c --- /dev/null +++ b/js/src/tests/non262/Intl/Segmenter/word-latin1.js @@ -0,0 +1,215 @@ +// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter) + +// https://www.unicode.org/reports/tr29/#Word_Boundary_Rules + +const strings = { + // WB1, WB2 + "": [], + + // WB3 + "\r\n": ["\r\n"], + + // WB3a, WB3b + "\n": ["\n"], + "\r": ["\r"], + "\v": ["\v"], + "\f": ["\f"], + "\x85": ["\x85"], + + // WB3d + " ": [" "], + " ": [" "], + + // WB4 + "\xAD": ["\xAD"], + "\xAD\xAD": ["\xAD\xAD"], + + // WB5 + "a": ["a"], + "ab": ["ab"], + + // WB6, WB7 + "a:b": ["a:b"], + "a·b": ["a·b"], + "a.b": ["a.b"], + "a'b": ["a'b"], + + // WB8 + "1": ["1"], + "12": ["12"], + + // WB9 + "a1": ["a1"], + + // WB10 + "1a": ["1a"], + + // WB11, WB12 + "1,2": ["1,2"], + "1;2": ["1;2"], + "1.2": ["1.2"], + "1'2": ["1'2"], + + // WB13a + "a_": ["a_"], + "1_": ["1_"], + "__": ["__"], + + // WB13b + "_a": ["_a"], + "_1": ["_1"], + + // WB999 + "\0": ["\0"], + "?": ["?"], + "??": ["?", "?"], +}; + +function assertSegments(string, words) { + let seg = segmenter.segment(string); + let segments = [...seg]; + + // The computed segments match the expected value. + assertEqArray(segments.map(({segment}) => segment), words); + + // |containing()| should return the same result. + for (let expected of segments) { + let {segment, index} = expected; + for (let i = index; i < index + segment.length; ++i) { + let actual = seg.containing(i); + assertDeepEq(actual, expected); + } + } +} + +let segmenter = new Intl.Segmenter("en", {granularity: "word"}); + +for (let [string, words] of Object.entries(strings)) { + assertSegments(string, words); +} + +// WB3, WB3a, WB3b and WB4 +for (let string of ["\r\n", "\n", "\r", "\v", "\f", "\x85"]) { + assertSegments(string + "\xAD", [string, "\xAD"]); + assertSegments("\xAD" + string, ["\xAD", string]); +} + +// WB3d and WB4 +for (let string of [" ", " "]) { + assertSegments(string + "\xAD", [string + "\xAD"]); + assertSegments("\xAD" + string, ["\xAD", string]); +} +assertSegments(" \xAD ", [" \xAD", " "]); +assertSegments(" \xAD\xAD ", [" \xAD\xAD", " "]); + +// WB5-WB13 and WB4 +for (let string of [ + // WB5 + "a", "ab", + + // WB6, WB7 + "a:b", + "a·b", + "a.b", + "a'b", + + // WB8 + "1", + "12", + + // WB9 + "a1", + + // WB10 + "1a", + + // WB11, WB12 + "1,2", + "1;2", + "1.2", + "1'2", + + // WB13a + "a_", + "1_", + "__", + + // WB13b + "_a", + "_1", + + // WB999 + "?", +]) { + assertSegments(string + "\xAD", [string + "\xAD"]); + assertSegments("\xAD" + string, ["\xAD", string]); + + if (string === "a.b") { + // ICU4X incorrectly splits the result into three words. 
diff --git a/js/src/tests/non262/Intl/Segmenter/word-latin1.js b/js/src/tests/non262/Intl/Segmenter/word-latin1.js
new file mode 100644
index 0000000000..396947cc1c
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/word-latin1.js
@@ -0,0 +1,215 @@
+// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+// https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
+
+const strings = {
+  // WB1, WB2
+  "": [],
+
+  // WB3
+  "\r\n": ["\r\n"],
+
+  // WB3a, WB3b
+  "\n": ["\n"],
+  "\r": ["\r"],
+  "\v": ["\v"],
+  "\f": ["\f"],
+  "\x85": ["\x85"],
+
+  // WB3d
+  " ": [" "],
+  "  ": ["  "],
+
+  // WB4
+  "\xAD": ["\xAD"],
+  "\xAD\xAD": ["\xAD\xAD"],
+
+  // WB5
+  "a": ["a"],
+  "ab": ["ab"],
+
+  // WB6, WB7
+  "a:b": ["a:b"],
+  "a·b": ["a·b"],
+  "a.b": ["a.b"],
+  "a'b": ["a'b"],
+
+  // WB8
+  "1": ["1"],
+  "12": ["12"],
+
+  // WB9
+  "a1": ["a1"],
+
+  // WB10
+  "1a": ["1a"],
+
+  // WB11, WB12
+  "1,2": ["1,2"],
+  "1;2": ["1;2"],
+  "1.2": ["1.2"],
+  "1'2": ["1'2"],
+
+  // WB13a
+  "a_": ["a_"],
+  "1_": ["1_"],
+  "__": ["__"],
+
+  // WB13b
+  "_a": ["_a"],
+  "_1": ["_1"],
+
+  // WB999
+  "\0": ["\0"],
+  "?": ["?"],
+  "??": ["?", "?"],
+};
+
+function assertSegments(string, words) {
+  let seg = segmenter.segment(string);
+  let segments = [...seg];
+
+  // The computed segments match the expected value.
+  assertEqArray(segments.map(({segment}) => segment), words);
+
+  // |containing()| should return the same result.
+  for (let expected of segments) {
+    let {segment, index} = expected;
+    for (let i = index; i < index + segment.length; ++i) {
+      let actual = seg.containing(i);
+      assertDeepEq(actual, expected);
+    }
+  }
+}
+
+let segmenter = new Intl.Segmenter("en", {granularity: "word"});
+
+for (let [string, words] of Object.entries(strings)) {
+  assertSegments(string, words);
+}
+
+// WB3, WB3a, WB3b and WB4
+for (let string of ["\r\n", "\n", "\r", "\v", "\f", "\x85"]) {
+  assertSegments(string + "\xAD", [string, "\xAD"]);
+  assertSegments("\xAD" + string, ["\xAD", string]);
+}
+
+// WB3d and WB4
+for (let string of [" ", "  "]) {
+  assertSegments(string + "\xAD", [string + "\xAD"]);
+  assertSegments("\xAD" + string, ["\xAD", string]);
+}
+assertSegments(" \xAD ", [" \xAD", " "]);
+assertSegments(" \xAD\xAD ", [" \xAD\xAD", " "]);
+
+// WB5-WB13 and WB4
+for (let string of [
+  // WB5
+  "a", "ab",
+
+  // WB6, WB7
+  "a:b",
+  "a·b",
+  "a.b",
+  "a'b",
+
+  // WB8
+  "1",
+  "12",
+
+  // WB9
+  "a1",
+
+  // WB10
+  "1a",
+
+  // WB11, WB12
+  "1,2",
+  "1;2",
+  "1.2",
+  "1'2",
+
+  // WB13a
+  "a_",
+  "1_",
+  "__",
+
+  // WB13b
+  "_a",
+  "_1",
+
+  // WB999
+  "?",
+]) {
+  assertSegments(string + "\xAD", [string + "\xAD"]);
+  assertSegments("\xAD" + string, ["\xAD", string]);
+
+  if (string === "a.b") {
+    // ICU4X incorrectly splits the result into three words.
+    // https://github.com/unicode-org/icu4x/issues/4417
+    assertSegments(string.split("").join("\xAD"), ["a\xAD", ".\xAD", "b"]);
+    assertSegments(string.split("").join("\xAD\xAD"), ["a\xAD\xAD", ".\xAD\xAD", "b"]);
+  } else {
+    assertSegments(string.split("").join("\xAD"), [string.split("").join("\xAD")]);
+    assertSegments(string.split("").join("\xAD\xAD"), [string.split("").join("\xAD\xAD")]);
+  }
+}
+
+assertSegments("?\xAD?", ["?\xAD", "?"]);
+
+for (let string of [
+  // WB6, WB7
+  "a:b",
+  "a·b",
+  "a.b",
+  "a'b",
+
+  // WB11, WB12
+  "1,2",
+  "1;2",
+  "1.2",
+  "1'2",
+]) {
+  let prefix = string.slice(0, -1);
+  let suffix = string.slice(1);
+
+  assertSegments(prefix, prefix.split(""));
+  assertSegments(suffix, suffix.split(""));
+}
+
+// MidNum with ALetter
+assertSegments("a,b", ["a", ",", "b"]);
+assertSegments("a;b", ["a", ";", "b"]);
+
+// MidLetter with Numeric
+assertSegments("1:2", ["1", ":", "2"]);
+assertSegments("1·2", ["1", "·", "2"]);
+
+// MidNumLet with mixed ALetter and Numeric
+assertSegments("a.2", ["a", ".", "2"]);
+assertSegments("1.b", ["1", ".", "b"]);
+assertSegments("a'2", ["a", "'", "2"]);
+assertSegments("1'b", ["1", "'", "b"]);
+
+// MidNum with ExtendNumLet
+assertSegments("_,_", ["_", ",", "_"]);
+assertSegments("_;_", ["_", ";", "_"]);
+
+// MidLetter with ExtendNumLet
+assertSegments("_:_", ["_", ":", "_"]);
+assertSegments("_·_", ["_", "·", "_"]);
+
+// MidNumLet with ExtendNumLet
+assertSegments("_._", ["_", ".", "_"]);
+assertSegments("_'_", ["_", "'", "_"]);
+
+// CLDR has locale-dependent word segmentation for the "en-posix" locale. This
+// locale is currently not selectable, so the Latin-1 fast-paths don't need to
+// implement it. If one of the two below assertions ever fail, please update
+// the Latin-1 fast-paths for word segmentation to implement the "en-posix"
+// changes.
+assertEq(new Intl.Segmenter("en-posix").resolvedOptions().locale, "en");
+assertEq(new Intl.Segmenter("en-u-va-posix").resolvedOptions().locale, "en");
+
+if (typeof reportCompare === "function")
+  reportCompare(0, 0);
diff --git a/js/src/tests/non262/Intl/Segmenter/word.js b/js/src/tests/non262/Intl/Segmenter/word.js
new file mode 100644
index 0000000000..5b3e1747a3
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/word.js
@@ -0,0 +1,152 @@
+// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+// Word boundaries are locale independent. Test with various locales to ensure
+// we get the same results.
+const locales = [
+  "en", "de", "fr", "ar", "ja", "zh", "th",
+];
+
+let strings = {
+  // Empty string
+  "": [],
+
+  // Ascii
+  "This is an English sentence.": [
+    "This", " ", "is", " ", "an", " ", "English", " ", "sentence", "."
+  ],
+  "Moi? N'est-ce pas.": [
+    "Moi", "?", " ", "N'est", "-", "ce", " ", "pas", "."
+  ],
+
+  // Latin-1
+  "Unnötig umständlich Wörter überlegen.": [
+    "Unnötig", " ", "umständlich", " ", "Wörter", " ", "überlegen", "."
+  ],
+
+  // Two-Byte
+  // Source: https://en.wikipedia.org/wiki/Japanese_writing_system#Examples
+  "ラドクリフ、マラソン五輪代表に 1万メートル出場にも含み。": [
+    "ラドクリフ", "、", "マラソン", "五輪", "代表", "に", " ", "1", "万", "メートル", "出場", "に", "も", "含み", "。"
+  ],
+
+  // From: Language Sense and Ambiguity in Thai
+  // Source: https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.98.118
+  "ขนบนอก": [
+    // According to the paper this should instead be separated into ขน|บน|อก.
+    "ขนบ", "นอก"
+  ],
+  "พนักงานนําโคลงเรือสามตัว": [
+    // Expected segmentation is พนักงาน|นํา|โค|ลง|เรือ|สาม|ตัว.
+
+    // ICU4C segmentation:
+    // "พนัก", "งาน", "นํา", "โคลง", "เรือ", "สาม", "ตัว"
+
+    // ICU4X segmentation:
+    "พ", "นัก", "งานนํา", "โคลง", "เรือ", "สาม", "ตัว"
+  ],
+
+  "หมอหุงขาวสวยด": [
+    // Has three possible segmentations:
+    // หมอหงขาว|สวย|ด
+    // หมอ|หง|ขาวสวย|ด
+    // หมอ|หง|ขาว|สวย|ด
+
+    // ICU4C segmentation:
+    // "หมอ", "หุง", "ขาว", "สวย", "ด"
+
+    // ICU4X segmentation:
+    "หมอ", "หุง", "ขาว", "สวยด"
+  ],
+
+  // From: Thoughts on Word and Sentence Segmentation in Thai
+  // Source: https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.63.7038
+  "หนังสือรวมบทความทางวิชาการในการประชุมสัมมนา": [
+    "หนังสือ", "รวม", "บทความ", "ทาง", "วิชาการ", "ใน", "การ", "ประชุม", "สัมมนา"
+  ],
+};
+
+function assertIsSegmentDataObject(obj) {
+  // The prototype is %Object.prototype%.
+  assertEq(Object.getPrototypeOf(obj), Object.prototype);
+
+  // The Segment Data object has exactly four own properties.
+  let keys = Reflect.ownKeys(obj);
+  assertEq(keys.length, 4);
+  assertEq(keys[0], "segment");
+  assertEq(keys[1], "index");
+  assertEq(keys[2], "input");
+  assertEq(keys[3], "isWordLike");
+
+  // Ensure each property has the correct value type.
+  assertEq(typeof obj.segment, "string");
+  assertEq(typeof obj.index, "number");
+  assertEq(typeof obj.input, "string");
+  assertEq(typeof obj.isWordLike, "boolean");
+
+  // |index| is an integer index into |string|.
+  assertEq(Number.isInteger(obj.index), true);
+  assertEq(obj.index >= 0, true);
+  assertEq(obj.index < obj.input.length, true);
+
+  // Segments are non-empty.
+  assertEq(obj.segment.length > 0, true);
+
+  // Ensure the segment is present in the input at the correct position.
+  assertEq(obj.input.substr(obj.index, obj.segment.length), obj.segment);
+
+  // The non-word parts in the samples are either punctuators or space separators.
+  let expectedWordLike = !/^(\p{gc=P}|\p{gc=Zs})+$/u.test(obj.segment);
+
+  // ICU4X incorrectly marks the last segment as non-word like for Thai.
+  // https://github.com/unicode-org/icu4x/issues/4446
+  let isThai = /^\p{sc=Thai}+$/u.test(obj.segment);
+  let isLastSegment = obj.index + obj.segment.length === obj.input.length;
+  if (isThai && isLastSegment) {
+    expectedWordLike = false;
+  }
+
+  assertEq(obj.isWordLike, expectedWordLike, obj.segment);
+}
+
+function segmentsFromContaining(segmenter, string) {
+  let segments = segmenter.segment(string);
+
+  let result = [];
+  for (let index = 0, data; (data = segments.containing(index)); index += data.segment.length) {
+    result.push(data);
+  }
+  return result;
+}
+
+for (let locale of locales) {
+  let segmenter = new Intl.Segmenter(locale, {granularity: "word"});
+
+  let resolved = segmenter.resolvedOptions();
+  assertEq(resolved.locale, locale);
+  assertEq(resolved.granularity, "word");
+
+  for (let [string, words] of Object.entries(strings)) {
+    let segments = [...segmenter.segment(string)];
+
+    // Assert each segment is a valid Segment Data object.
+    segments.forEach(assertIsSegmentDataObject);
+
+    // Concatenating all segments should return the input.
+    assertEq(segments.reduce((acc, {segment}) => acc + segment, ""), string);
+
+    // The "input" property matches the original input string.
+    assertEq(segments.every(({input}) => input === string), true);
+
+    // The indices are sorted in ascending order.
+    assertEq(isNaN(segments.reduce((acc, {index}) => index > acc ? index : NaN, -Infinity)), false);
+
+    // The computed segments match the expected value.
+    assertEqArray(segments.map(({segment}) => segment), words);
+
+    // Segment iteration and %Segments.prototype%.containing return the same results.
+    assertDeepEq(segmentsFromContaining(segmenter, string), segments);
+  }
+}
+
+if (typeof reportCompare === "function")
+  reportCompare(0, 0);
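As a usage note (not part of the patch), the isWordLike flag that word.js checks for every segment is what callers typically use to count words while skipping punctuation and spaces. A minimal sketch:

```js
// Illustrative only; not part of the patch.
function countWords(text, locale = "en") {
  const seg = new Intl.Segmenter(locale, {granularity: "word"});
  let count = 0;
  for (const {isWordLike} of seg.segment(text)) {
    if (isWordLike) {
      count++;
    }
  }
  return count;
}

countWords("This is an English sentence."); // 5
```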