summaryrefslogtreecommitdiffstats
path: root/js/src/tests/non262/Intl/Segmenter/sentence-latin.js
blob: cebb029a40644451b461af847573173b4e63ff3b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)

// https://www.unicode.org/reports/tr29/#Sentence_Boundary_Rules

// Test data: each key is an input string and each value is the list of
// sentence segments expected for that input, in order. The groups are labelled
// with the sentence boundary rule(s) (SB1-SB11) from the specification linked
// above.
const strings = {
  // SB1, SB2: the empty string produces no segments at all.
  "": [],

  // SB3: CR LF is kept together as one segment.
  "\r\n": ["\r\n"],

  // SB4: the text is split directly after LF, CR, CR LF, and U+0085; the line
  // break itself stays with the preceding segment.
  "First paragraph.\nSecond paragraph.": ["First paragraph.\n", "Second paragraph."],
  "First paragraph.\rSecond paragraph.": ["First paragraph.\r", "Second paragraph."],
  "First paragraph.\r\nSecond paragraph.": ["First paragraph.\r\n", "Second paragraph."],
  "First paragraph.\x85Second paragraph.": ["First paragraph.\x85", "Second paragraph."],

  // SB5: U+00AD (soft hyphen) sprinkled through a sentence adds no boundaries;
  // directly after a line break it begins the next segment.
  "\xADWo\xADrd\xAD.\xAD": ["\xADWo\xADrd\xAD.\xAD"],
  "Word.\n\xAD": ["Word.\n", "\xAD"],
  "Word.\r\xAD\n": ["Word.\r", "\xAD\n"],

  // SB6: a full stop directly followed by a digit does not end the sentence,
  // whereas an exclamation mark followed by a digit does.
  ".2": [".2"],
  "1.2": ["1.2"],
  "!2": ["!", "2"],
  "1!2": ["1!", "2"],

  // SB7: no split at a full stop sitting directly between a letter and an
  // uppercase letter; with a space after the full stop the text is split.
  "A.B": ["A.B"],
  "a.B": ["a.B"],
  "A. B": ["A. ", "B"],
  "a. B": ["a. ", "B"],

  // SB8: after a full stop (and optional space) there is no split when the
  // following text reaches a lowercase letter, even past "#", a digit, or a
  // comma; an uppercase letter after the space starts a new segment.
  "#.a": ["#.a"],
  "#. a": ["#. a"],
  "#. # a": ["#. # a"],
  "#. 1 a": ["#. 1 a"],
  "#. , a": ["#. , a"],
  "#. Aa": ["#. ", "Aa"],

  // SB8a: no split before a following full stop or comma, also when a quote
  // and/or whitespace sits in between.
  "Word..": ["Word.."],
  "Word . , ": ["Word . , "],
  "Word.'\t , ": ["Word.'\t , "],

  // SB9, SB10, SB11: quotes, whitespace, and a final line feed that trail the
  // full stop stay attached to the same segment.
  "Word.''": ["Word.''"],
  "Word.'\t ": ["Word.'\t "],
  "Word.'\t \n": ["Word.'\t \n"],
};

// Segments |string| with the file-level |segmenter| and checks two things:
// the segment texts equal |sentences| (same order), and |containing()| returns
// the matching segment record for every index that the record covers.
function assertSegments(string, sentences) {
  const segmented = segmenter.segment(string);
  const records = Array.from(segmented);

  // Only the text of each record is compared against the expectation.
  const texts = records.map(record => record.segment);
  assertEqArray(texts, sentences);

  // Probe each index in [index, index + segment.length) of every record; the
  // lookup must yield a record deep-equal to the one produced by iteration.
  records.forEach(record => {
    const start = record.index;
    const end = start + record.segment.length;
    for (let position = start; position < end; position++) {
      assertDeepEq(segmented.containing(position), record);
    }
  });
}

// English sentence-granularity segmenter; |assertSegments| reads this binding.
const segmenter = new Intl.Segmenter("en", {granularity: "sentence"});

// Check every input in |strings| against its expected list of sentences.
Object.entries(strings).forEach(([string, sentences]) => {
  assertSegments(string, sentences);
});

// Sentence boundaries can depend on the locale.
{
  // The semicolon matters here because of the Greek question mark:
  // https://en.wikipedia.org/wiki/Greek_question_mark#Greek_question_mark
  const text = "A sentence; semicolon separated.";

  const [english, greek] = ["en", "el"].map(
    locale => new Intl.Segmenter(locale, {granularity: "sentence"})
  );

  // Number of sentence segments |text| is split into by |localized|.
  const countSentences = localized => Array.from(localized.segment(text)).length;

  // English: the whole string is a single sentence.
  assertEq(countSentences(english), 1);

  // Greek: ICU4C splits this string into two sentences, whereas ICU4X keeps it
  // as a single sentence. The ICU4X result is the one asserted.
  assertEq(countSentences(greek), 1);
}

// |reportCompare| is not defined in this file; call it only when the
// environment running the test provides it.
if (typeof reportCompare === "function") {
  reportCompare(0, 0);
}