summaryrefslogtreecommitdiffstats
path: root/js/src/tests/non262/Intl/Segmenter/word.js
blob: 5b3e1747a35d2241b838424073a4275191bcc28f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)

// This test treats word segmentation as locale independent: every sample is
// segmented once per locale listed here and must yield identical results.
const locales = [
  "en",
  "de",
  "fr",
  "ar",
  "ja",
  "zh",
  "th",
];

// Test samples: each key is an input string and each value is the list of
// word segments, in input order, that the segmenter is expected to produce.
// Concatenating a value's entries yields its key.
let strings = {
  // Empty string
  "": [],

  // Ascii
  "This is an English sentence.": [
    "This", " ", "is", " ", "an", " ", "English", " ", "sentence", "."
  ],
  "Moi?  N'est-ce pas.": [
    "Moi", "?", "  ", "N'est", "-", "ce", " ", "pas", "."
  ],

  // Latin-1
  "Unnötig umständlich Wörter überlegen.": [
    "Unnötig", " ", "umständlich", " ", "Wörter", " ", "überlegen", "."
  ],

  // Two-Byte
  // Source: https://en.wikipedia.org/wiki/Japanese_writing_system#Examples
  "ラドクリフ、マラソン五輪代表に 1万メートル出場にも含み。": [
    "ラドクリフ", "、", "マラソン", "五輪", "代表", "に", " ", "1", "万", "メートル", "出場", "に", "も", "含み", "。"
  ],

  // From: Language Sense and Ambiguity in Thai
  // Source: https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.98.118
  "ขนบนอก": [
    // According to the paper this should instead be separated into ขน|บน|อก.
    "ขนบ", "นอก"
  ],
  "พนักงานนําโคลงเรือสามตัว": [
    // Expected segmentation is พนักงาน|นํา|โค|ลง|เรือ|สาม|ตัว.

    // ICU4C segmentation (kept for reference only, not asserted):
    // "พนัก", "งาน", "นํา", "โคลง", "เรือ", "สาม", "ตัว"

    // ICU4X segmentation (the value this test asserts):
    "พ", "นัก", "งานนํา", "โคลง", "เรือ", "สาม", "ตัว"
  ],

  "หมอหุงขาวสวยด": [
    // Has three possible segmentations:
    // หมอหุงขาว|สวย|ด
    // หมอ|หุง|ขาวสวย|ด
    // หมอ|หุง|ขาว|สวย|ด

    // ICU4C segmentation (kept for reference only, not asserted):
    // "หมอ", "หุง", "ขาว", "สวย", "ด"

    // ICU4X segmentation (the value this test asserts):
    "หมอ", "หุง", "ขาว", "สวยด"
  ],

  // From: Thoughts on Word and Sentence Segmentation in Thai
  // Source: https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.63.7038
  "หนังสือรวมบทความทางวิชาการในการประชุมสัมมนา": [
    "หนังสือ", "รวม", "บทความ", "ทาง", "วิชาการ", "ใน", "การ", "ประชุม", "สัมมนา"
  ],
};

// Asserts that |obj| is a well-formed Segment Data object for word
// granularity: a plain object with the own properties "segment", "index",
// "input" and "isWordLike" (in that order), whose values are consistent with
// each other and with the kind of text found in this file's samples.
function assertIsSegmentDataObject(obj) {
  // Segment Data objects inherit directly from %Object.prototype%.
  assertEq(Object.getPrototypeOf(obj), Object.prototype);

  // Own property names paired with the |typeof| result expected for each value.
  const expectedProperties = [
    ["segment", "string"],
    ["index", "number"],
    ["input", "string"],
    ["isWordLike", "boolean"],
  ];

  // Exactly these own properties must be present, in exactly this order.
  const ownKeys = Reflect.ownKeys(obj);
  assertEq(ownKeys.length, 4);
  expectedProperties.forEach(([name], position) => {
    assertEq(ownKeys[position], name);
  });

  // Every property holds a value of the expected primitive type.
  for (const [name, type] of expectedProperties) {
    assertEq(typeof obj[name], type);
  }

  const {segment, index, input, isWordLike} = obj;

  // |index| is an integer offset that lies inside |input|.
  assertEq(Number.isInteger(index), true);
  assertEq(index >= 0, true);
  assertEq(index < input.length, true);

  // A segment always covers at least one code unit.
  assertEq(segment.length > 0, true);

  // |input| must contain |segment| starting at |index|.
  assertEq(input.slice(index, index + segment.length), segment);

  // In the samples, only runs of punctuation or space separators are non-words.
  const isPunctuationOrSpace = /^(\p{gc=P}|\p{gc=Zs})+$/u.test(segment);

  // ICU4X incorrectly marks the last segment as non-word like for Thai.
  // https://github.com/unicode-org/icu4x/issues/4446
  const isThai = /^\p{sc=Thai}+$/u.test(segment);
  const reachesEndOfInput = index + segment.length === input.length;

  const expectedWordLike = !isPunctuationOrSpace && !(isThai && reachesEndOfInput);

  assertEq(isWordLike, expectedWordLike, segment);
}

// Collects the segments of |string| by repeatedly calling |containing| on the
// object returned from |segmenter.segment(string)|, starting at index 0 and
// advancing by the length of each returned segment. Stops as soon as
// |containing| returns a falsy value. Returns the collected data objects in
// order.
function segmentsFromContaining(segmenter, string) {
  const segments = segmenter.segment(string);
  const collected = [];

  let position = 0;
  let data = segments.containing(position);
  while (data) {
    collected.push(data);
    position += data.segment.length;
    data = segments.containing(position);
  }

  return collected;
}

// Segment every sample with a word segmenter for each locale and verify the
// result from several angles.
for (const locale of locales) {
  const segmenter = new Intl.Segmenter(locale, {granularity: "word"});

  // The resolved options must echo the requested locale and granularity.
  const options = segmenter.resolvedOptions();
  assertEq(options.locale, locale);
  assertEq(options.granularity, "word");

  for (const [string, words] of Object.entries(strings)) {
    const segments = [...segmenter.segment(string)];

    // Each iteration result has to be a valid Segment Data object.
    for (const data of segments) {
      assertIsSegmentDataObject(data);
    }

    // Gluing the segments back together has to reproduce the input.
    let joined = "";
    for (const {segment} of segments) {
      joined += segment;
    }
    assertEq(joined, string);

    // Every data object refers back to the original input string.
    const allInputsMatch = segments.every((data) => data.input === string);
    assertEq(allInputsMatch, true);

    // Indices must increase from one segment to the next; the running value
    // collapses to NaN (and stays NaN) as soon as one index does not.
    let lastIndex = -Infinity;
    for (const {index} of segments) {
      lastIndex = index > lastIndex ? index : NaN;
    }
    assertEq(isNaN(lastIndex), false);

    // The segment strings equal the expected words for this sample.
    const actualWords = segments.map((data) => data.segment);
    assertEqArray(actualWords, words);

    // Walking the string via %Segments.prototype%.containing yields the same
    // data objects as iterating the segments.
    assertDeepEq(segmentsFromContaining(segmenter, string), segments);
  }
}

if (typeof reportCompare === "function") {
  reportCompare(0, 0);
}