diff options
Diffstat (limited to 'js/src/tests/non262/Intl/Segmenter/word-latin1.js')
-rw-r--r-- | js/src/tests/non262/Intl/Segmenter/word-latin1.js | 215 |
1 files changed, 215 insertions, 0 deletions
// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)

// https://www.unicode.org/reports/tr29/#Word_Boundary_Rules

// Word segmenter shared by every assertion in this file.
const segmenter = new Intl.Segmenter("en", {granularity: "word"});

// Maps each Latin-1 input string to the word segments it is expected to
// produce. Entries are grouped by the UAX #29 word boundary rule they cover.
const strings = {
  // WB1, WB2
  "": [],

  // WB3
  "\r\n": ["\r\n"],

  // WB3a, WB3b
  "\n": ["\n"],
  "\r": ["\r"],
  "\v": ["\v"],
  "\f": ["\f"],
  "\x85": ["\x85"],

  // WB3d
  // Two adjacent spaces are required to exercise WB3d (WSegSpace x WSegSpace);
  // a single space alone never reaches that rule.
  " ": [" "],
  "  ": ["  "],

  // WB4
  "\xAD": ["\xAD"],
  "\xAD\xAD": ["\xAD\xAD"],

  // WB5
  "a": ["a"],
  "ab": ["ab"],

  // WB6, WB7
  "a:b": ["a:b"],
  "a·b": ["a·b"],
  "a.b": ["a.b"],
  "a'b": ["a'b"],

  // WB8
  "1": ["1"],
  "12": ["12"],

  // WB9
  "a1": ["a1"],

  // WB10
  "1a": ["1a"],

  // WB11, WB12
  "1,2": ["1,2"],
  "1;2": ["1;2"],
  "1.2": ["1.2"],
  "1'2": ["1'2"],

  // WB13a
  "a_": ["a_"],
  "1_": ["1_"],
  "__": ["__"],

  // WB13b
  "_a": ["_a"],
  "_1": ["_1"],

  // WB999
  "\0": ["\0"],
  "?": ["?"],
  "??": ["?", "?"],
};

/**
 * Segments |string| with the shared word segmenter and checks the result.
 *
 * Two properties are verified:
 *  - the sequence of segment strings equals |words|;
 *  - for every code unit index covered by a segment, |containing(index)|
 *    returns an object deeply equal to the one produced by iteration.
 *
 * The assertion helpers (assertEqArray, assertDeepEq) are not defined in this
 * file; they are expected to be supplied by the surrounding test harness.
 *
 * @param {string} string - The input to segment.
 * @param {string[]} words - The expected segments, in order.
 */
function assertSegments(string, words) {
  const seg = segmenter.segment(string);
  const segments = [...seg];

  // The computed segments match the expected value.
  assertEqArray(segments.map(({segment}) => segment), words);

  // |containing()| should return the same result.
  for (const expected of segments) {
    const {segment, index} = expected;
    for (let i = index; i < index + segment.length; ++i) {
      const actual = seg.containing(i);
      assertDeepEq(actual, expected);
    }
  }
}

for (const [string, words] of Object.entries(strings)) {
  assertSegments(string, words);
}

// WB3, WB3a, WB3b and WB4
for (const string of ["\r\n", "\n", "\r", "\v", "\f", "\x85"]) {
  assertSegments(string + "\xAD", [string, "\xAD"]);
  assertSegments("\xAD" + string, ["\xAD", string]);
}

// WB3d and WB4
for (const string of [" ", "  "]) {
  assertSegments(string + "\xAD", [string + "\xAD"]);
  assertSegments("\xAD" + string, ["\xAD", string]);
}
assertSegments(" \xAD ", [" \xAD", " "]);
assertSegments(" \xAD\xAD ", [" \xAD\xAD", " "]);

// WB5-WB13 and WB4
for (const string of [
  // WB5
  "a", "ab",

  // WB6, WB7
  "a:b",
  "a·b",
  "a.b",
  "a'b",

  // WB8
  "1",
  "12",

  // WB9
  "a1",

  // WB10
  "1a",

  // WB11, WB12
  "1,2",
  "1;2",
  "1.2",
  "1'2",

  // WB13a
  "a_",
  "1_",
  "__",

  // WB13b
  "_a",
  "_1",

  // WB999
  "?",
]) {
  assertSegments(string + "\xAD", [string + "\xAD"]);
  assertSegments("\xAD" + string, ["\xAD", string]);

  // Interleave U+00AD (one and two copies) between every code unit.
  const singleJoined = string.split("").join("\xAD");
  const doubleJoined = string.split("").join("\xAD\xAD");

  if (string === "a.b") {
    // ICU4X incorrectly splits the result into three words.
    // https://github.com/unicode-org/icu4x/issues/4417
    assertSegments(singleJoined, ["a\xAD", ".\xAD", "b"]);
    assertSegments(doubleJoined, ["a\xAD\xAD", ".\xAD\xAD", "b"]);
  } else {
    assertSegments(singleJoined, [singleJoined]);
    assertSegments(doubleJoined, [doubleJoined]);
  }
}

assertSegments("?\xAD?", ["?\xAD", "?"]);

// Dropping the first or last code unit of a three-unit "X mid Y" string leaves
// a two-unit string that is expected to split into its individual code units.
for (const string of [
  // WB6, WB7
  "a:b",
  "a·b",
  "a.b",
  "a'b",

  // WB11, WB12
  "1,2",
  "1;2",
  "1.2",
  "1'2",
]) {
  const prefix = string.slice(0, -1);
  const suffix = string.slice(1);

  assertSegments(prefix, prefix.split(""));
  assertSegments(suffix, suffix.split(""));
}

// MidNum with ALetter
assertSegments("a,b", ["a", ",", "b"]);
assertSegments("a;b", ["a", ";", "b"]);

// MidLetter with Numeric
assertSegments("1:2", ["1", ":", "2"]);
assertSegments("1·2", ["1", "·", "2"]);

// MidNumLet with mixed ALetter and Numeric
assertSegments("a.2", ["a", ".", "2"]);
assertSegments("1.b", ["1", ".", "b"]);
assertSegments("a'2", ["a", "'", "2"]);
assertSegments("1'b", ["1", "'", "b"]);

// MidNum with ExtendNumLet
assertSegments("_,_", ["_", ",", "_"]);
assertSegments("_;_", ["_", ";", "_"]);

// MidLetter with ExtendNumLet
assertSegments("_:_", ["_", ":", "_"]);
assertSegments("_·_", ["_", "·", "_"]);

// MidNumLet with ExtendNumLet
assertSegments("_._", ["_", ".", "_"]);
assertSegments("_'_", ["_", "'", "_"]);

// CLDR has locale-dependent word segmentation for the "en-posix" locale. This
// locale is currently not selectable, so the Latin-1 fast-paths don't need to
// implement it. If one of the two below assertions ever fail, please update
// the Latin-1 fast-paths for word segmentation to implement the "en-posix"
// changes.
assertEq(new Intl.Segmenter("en-posix").resolvedOptions().locale, "en");
assertEq(new Intl.Segmenter("en-u-va-posix").resolvedOptions().locale, "en");

if (typeof reportCompare === "function") {
  reportCompare(0, 0);
}