1 files changed, 215 insertions, 0 deletions
diff --git a/js/src/tests/non262/Intl/Segmenter/word-latin1.js b/js/src/tests/non262/Intl/Segmenter/word-latin1.js
new file mode 100644
index 0000000000..396947cc1c
--- /dev/null
+++ b/js/src/tests/non262/Intl/Segmenter/word-latin1.js
@@ -0,0 +1,215 @@
+// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
+
+// https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
+
+const strings = {
+  // WB1, WB2
+  "": [],
+
+  // WB3
+  "\r\n": ["\r\n"],
+
+  // WB3a, WB3b
+  "\n": ["\n"],
+  "\r": ["\r"],
+  "\v": ["\v"],
+  "\f": ["\f"],
+  "\x85": ["\x85"],
+
+  // WB3d
+  " ": [" "],
+  "  ": ["  "],
+
+  // WB4
+  "\xAD": ["\xAD"],
+  "\xAD\xAD": ["\xAD\xAD"],
+
+  // WB5
+  "a": ["a"],
+  "ab": ["ab"],
+
+  // WB6, WB7
+  "a:b": ["a:b"],
+  "a·b": ["a·b"],
+  "a.b": ["a.b"],
+  "a'b": ["a'b"],
+
+  // WB8
+  "1": ["1"],
+  "12": ["12"],
+
+  // WB9
+  "a1": ["a1"],
+
+  // WB10
+  "1a": ["1a"],
+
+  // WB11, WB12
+  "1,2": ["1,2"],
+  "1;2": ["1;2"],
+  "1.2": ["1.2"],
+  "1'2": ["1'2"],
+
+  // WB13a
+  "a_": ["a_"],
+  "1_": ["1_"],
+  "__": ["__"],
+
+  // WB13b
+  "_a": ["_a"],
+  "_1": ["_1"],
+
+  // WB999
+  "\0": ["\0"],
+  "?": ["?"],
+  "??": ["?", "?"],
+};
+
+function assertSegments(string, words) {
+  let seg = segmenter.segment(string);
+  let segments = [...seg];
+
+  // The computed segments match the expected value.
+  assertEqArray(segments.map(({segment}) => segment), words);
+
+  // |containing()| should return the same result.
+  for (let expected of segments) {
+    let {segment, index} = expected;
+    for (let i = index; i < index + segment.length; ++i) {
+      let actual = seg.containing(i);
+      assertDeepEq(actual, expected);
+    }
+  }
+}
+
+let segmenter = new Intl.Segmenter("en", {granularity: "word"});
+
+for (let [string, words] of Object.entries(strings)) {
+  assertSegments(string, words);
+}
+
+// WB3, WB3a, WB3b and WB4
+for (let string of ["\r\n", "\n", "\r", "\v", "\f", "\x85"]) {
+  assertSegments(string + "\xAD", [string, "\xAD"]);
+  assertSegments("\xAD" + string, ["\xAD", string]);
+}
+
+// WB3d and WB4
+for (let string of [" ", "  "]) {
+  assertSegments(string + "\xAD", [string + "\xAD"]);
+  assertSegments("\xAD" + string, ["\xAD", string]);
+}
+assertSegments(" \xAD ", [" \xAD", " "]);
+assertSegments(" \xAD\xAD ", [" \xAD\xAD", " "]);
+
+// WB5-WB13 and WB4
+for (let string of [
+  // WB5
+  "a", "ab",
+
+  // WB6, WB7
+  "a:b",
+  "a·b",
+  "a.b",
+  "a'b",
+
+  // WB8
+  "1",
+  "12",
+
+  // WB9
+  "a1",
+
+  // WB10
+  "1a",
+
+  // WB11, WB12
+  "1,2",
+  "1;2",
+  "1.2",
+  "1'2",
+
+  // WB13a
+  "a_",
+  "1_",
+  "__",
+
+  // WB13b
+  "_a",
+  "_1",
+
+  // WB999
+  "?",
+]) {
+  assertSegments(string + "\xAD", [string + "\xAD"]);
+  assertSegments("\xAD" + string, ["\xAD", string]);
+
+  if (string === "a.b") {
+    // ICU4X incorrectly splits the result into three words.
+    // https://github.com/unicode-org/icu4x/issues/4417
+    assertSegments(string.split("").join("\xAD"), ["a\xAD", ".\xAD", "b"]);
+    assertSegments(string.split("").join("\xAD\xAD"), ["a\xAD\xAD", ".\xAD\xAD", "b"]);
+  } else {
+    assertSegments(string.split("").join("\xAD"), [string.split("").join("\xAD")]);
+    assertSegments(string.split("").join("\xAD\xAD"), [string.split("").join("\xAD\xAD")]);
+  }
+}
+
+assertSegments("?\xAD?", ["?\xAD", "?"]);
+
+for (let string of [
+  // WB6, WB7
+  "a:b",
+  "a·b",
+  "a.b",
+  "a'b",
+
+  // WB11, WB12
+  "1,2",
+  "1;2",
+  "1.2",
+  "1'2",
+]) {
+  let prefix = string.slice(0, -1);
+  let suffix = string.slice(1);
+
+  assertSegments(prefix, prefix.split(""));
+  assertSegments(suffix, suffix.split(""));
+}
+
+// MidNum with ALetter
+assertSegments("a,b", ["a", ",", "b"]);
+assertSegments("a;b", ["a", ";", "b"]);
+
+// MidLetter with Numeric
+assertSegments("1:2", ["1", ":", "2"]);
+assertSegments("1·2", ["1", "·", "2"]);
+
+// MidNumLet with mixed ALetter and Numeric
+assertSegments("a.2", ["a", ".", "2"]);
+assertSegments("1.b", ["1", ".", "b"]);
+assertSegments("a'2", ["a", "'", "2"]);
+assertSegments("1'b", ["1", "'", "b"]);
+
+// MidNum with ExtendNumLet
+assertSegments("_,_", ["_", ",", "_"]);
+assertSegments("_;_", ["_", ";", "_"]);
+
+// MidLetter with ExtendNumLet
+assertSegments("_:_", ["_", ":", "_"]);
+assertSegments("_·_", ["_", "·", "_"]);
+
+// MidNumLet with ExtendNumLet
+assertSegments("_._", ["_", ".", "_"]);
+assertSegments("_'_", ["_", "'", "_"]);
+
+// CLDR has locale-dependent word segmentation for the "en-posix" locale. This
+// locale is currently not selectable, so the Latin-1 fast-paths don't need to
+// implement it. If one of the two below assertions ever fail, please update
+// the Latin-1 fast-paths for word segmentation to implement the "en-posix"
+// changes.
+assertEq(new Intl.Segmenter("en-posix").resolvedOptions().locale, "en");
+assertEq(new Intl.Segmenter("en-u-va-posix").resolvedOptions().locale, "en");
+
+if (typeof reportCompare === "function")
+  reportCompare(0, 0);