1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
// https://www.unicode.org/reports/tr29/#Sentence_Boundary_Rules
// Test table: each key is an input string and each value is the list of
// sentence segments the "en" sentence segmenter is expected to produce for it.
// Concatenating a value's entries always gives back the key. Cases are grouped
// by the sentence-boundary rule (SB1..SB11) they are meant to exercise.
const strings = {
// SB1, SB2: the empty string produces no segments at all.
"": [],
// SB3: CR LF is never split; it forms a single segment.
"\r\n": ["\r\n"],
// SB4: a break follows each paragraph separator (LF, CR, CR LF, U+0085).
"First paragraph.\nSecond paragraph.": ["First paragraph.\n", "Second paragraph."],
"First paragraph.\rSecond paragraph.": ["First paragraph.\r", "Second paragraph."],
"First paragraph.\r\nSecond paragraph.": ["First paragraph.\r\n", "Second paragraph."],
"First paragraph.\x85Second paragraph.": ["First paragraph.\x85", "Second paragraph."],
// SB5: U+00AD characters sprinkled through a sentence introduce no breaks.
// Directly after a paragraph separator, U+00AD starts the next segment.
"\xADWo\xADrd\xAD.\xAD": ["\xADWo\xADrd\xAD.\xAD"],
"Word.\n\xAD": ["Word.\n", "\xAD"],
"Word.\r\xAD\n": ["Word.\r", "\xAD\n"],
// SB6: "." immediately followed by a digit does not break, whereas "!" does.
".2": [".2"],
"1.2": ["1.2"],
"!2": ["!", "2"],
"1!2": ["1!", "2"],
// SB7: "." directly between a letter and an uppercase letter does not break;
// ". " (with a space) before an uppercase letter does.
"A.B": ["A.B"],
"a.B": ["a.B"],
"A. B": ["A. ", "B"],
"a. B": ["a. ", "B"],
// SB8: no break after "." when the next letter reached is lowercase, even with
// spaces, "#", a digit, or "," in between; an uppercase-initial word breaks.
"#.a": ["#.a"],
"#. a": ["#. a"],
"#. # a": ["#. # a"],
"#. 1 a": ["#. 1 a"],
"#. , a": ["#. , a"],
"#. Aa": ["#. ", "Aa"],
// SB8a: no break before a following "." or ",", with or without a quote and
// whitespace in between.
"Word..": ["Word.."],
"Word . , ": ["Word . , "],
"Word.'\t , ": ["Word.'\t , "],
// SB9, SB10, SB11: trailing quotes, whitespace, and a final line feed stay
// attached to the sentence they follow.
"Word.''": ["Word.''"],
"Word.'\t ": ["Word.'\t "],
"Word.'\t \n": ["Word.'\t \n"],
};
/**
 * Segments |string| with the file-level |segmenter| and checks the result.
 *
 * Two things are verified:
 *  1. The texts of the produced segments equal |sentences|, in order.
 *  2. For every code-unit index covered by a segment, |containing(index)|
 *     returns an object deeply equal to the one produced by iteration.
 *
 * @param {string} string - The input to segment.
 * @param {string[]} sentences - The expected segment texts.
 */
function assertSegments(string, sentences) {
  const segments = segmenter.segment(string);
  const iterated = Array.from(segments);

  // Iteration must yield exactly the expected segment texts.
  const texts = iterated.map(entry => entry.segment);
  assertEqArray(texts, sentences);

  // Looking up any index inside a segment must yield that same segment.
  iterated.forEach(entry => {
    const start = entry.index;
    const end = start + entry.segment.length;
    for (let position = start; position < end; position += 1) {
      assertDeepEq(segments.containing(position), entry);
    }
  });
}
// Shared English sentence segmenter; |assertSegments| reads this binding.
let segmenter = new Intl.Segmenter("en", {granularity: "sentence"});

// Run every table entry: the key is the input, the value the expected sentences.
Object.entries(strings).forEach(([input, expectedSentences]) => {
  assertSegments(input, expectedSentences);
});
// Locale-dependent sentence segmentation.
{
  // The input contains a ";", which relates to the Greek question mark:
  // https://en.wikipedia.org/wiki/Greek_question_mark#Greek_question_mark
  const text = "A sentence; semicolon separated.";

  const [english, greek] = ["en", "el"].map(
    locale => new Intl.Segmenter(locale, {granularity: "sentence"})
  );

  // Number of sentence segments |sentenceSegmenter| finds in |text|.
  const countSentences = sentenceSegmenter =>
    Array.from(sentenceSegmenter.segment(text)).length;

  // A single sentence in English.
  assertEq(countSentences(english), 1);

  // ICU4C: Two sentences in Greek.
  // assertEq([...greek.segment(string)].length, 2);

  // ICU4X: A single sentence in Greek.
  assertEq(countSentences(greek), 1);
}
// Report success when a |reportCompare| function is available.
if (typeof reportCompare === "function") {
  reportCompare(0, 0);
}
|