399 lines
14 KiB
Rust
399 lines
14 KiB
Rust
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
|
||
|
||
use icu_segmenter::GraphemeClusterSegmenter;
|
||
use icu_segmenter::LineSegmenter;
|
||
use icu_segmenter::SentenceSegmenter;
|
||
use icu_segmenter::WordSegmenter;
|
||
use std::char;
|
||
|
||
/// Walks the text of a break-test data file one `'\n'`-separated line at a
/// time and yields a [`TestData`] for every data line it finds.
struct TestContentIterator(core::str::Split<'static, char>);

/// One parsed data line: the sample text in three encodings together with the
/// expected break offsets for each encoding.
struct TestData {
    /// The complete line the case was parsed from, trailing comment included.
    original_line: &'static str,
    /// Sample text as a sequence of `char`s.
    utf8_vec: Vec<char>,
    /// Sample text as UTF-16 code units.
    utf16_vec: Vec<u16>,
    /// One byte per code point below U+0100; larger code points are left out.
    latin1_vec: Vec<u8>,
    /// Expected break offsets, measured in UTF-8 bytes.
    break_result_utf8: Vec<usize>,
    /// Expected break offsets, measured in UTF-16 code units.
    break_result_utf16: Vec<usize>,
    /// Expected break offsets, measured in Latin-1 bytes; `None` as soon as
    /// the line contains a code point at or above U+0100.
    break_result_latin1: Option<Vec<usize>>,
}

impl TestContentIterator {
    /// Creates an iterator over the lines of `file`.
    pub fn new(file: &'static str) -> Self {
        let lines = file.split('\n');
        TestContentIterator(lines)
    }
}

impl Iterator for TestContentIterator {
    type Item = TestData;

    /// Returns the next parsed data line.
    ///
    /// Lines starting with `#` are skipped. An empty line, or running out of
    /// lines, yields `None`.
    ///
    /// # Panics
    ///
    /// Panics if a code point token is not valid hexadecimal, is not a Unicode
    /// scalar value, or if a break marker is neither `×` nor `÷`.
    fn next(&mut self) -> Option<Self::Item> {
        // Find the next line that carries data.
        let line = loop {
            match self.0.next() {
                // EOF
                None | Some("") => return None,
                // Comment
                Some(text) if text.starts_with('#') => continue,
                Some(text) => break text,
            }
        };

        // Only the part in front of the first '#' holds tokens.
        let payload = line.split('#').next().unwrap();

        let mut data = TestData {
            original_line: line,
            utf8_vec: Vec::new(),
            utf16_vec: Vec::new(),
            latin1_vec: Vec::new(),
            break_result_utf8: Vec::new(),
            break_result_utf16: Vec::new(),
            break_result_latin1: None,
        };
        let mut latin1_breaks: Vec<usize> = Vec::new();
        let mut fits_latin1 = true;
        // UTF-8 length of the text consumed so far. The Latin-1 and UTF-16
        // offsets are simply the current lengths of their vectors.
        let mut utf8_offset = 0;

        for (position, token) in payload.split_ascii_whitespace().enumerate() {
            if position % 2 == 0 {
                // Even positions hold a marker: × (no break) or ÷ (break).
                if token == "\u{00d7}" {
                    continue;
                }
                assert_eq!(token, "\u{00f7}");
                data.break_result_utf8.push(utf8_offset);
                latin1_breaks.push(data.latin1_vec.len());
                data.break_result_utf16.push(data.utf16_vec.len());
                continue;
            }

            // Odd positions hold a hexadecimal code point.
            let scalar = char::from_u32(u32::from_str_radix(token, 16).unwrap()).unwrap();
            data.utf8_vec.push(scalar);
            utf8_offset += scalar.len_utf8();

            if (scalar as u32) < 0x100 {
                data.latin1_vec.push(scalar as u8);
            } else {
                fits_latin1 = false;
            }

            let mut units = [0; 2];
            data.utf16_vec
                .extend_from_slice(scalar.encode_utf16(&mut units));
        }

        data.break_result_latin1 = fits_latin1.then_some(latin1_breaks);
        Some(data)
    }
}
|
||
|
||
fn line_break_test(file: &'static str) {
|
||
let test_iter = TestContentIterator::new(file);
|
||
let segmenter = LineSegmenter::new_dictionary();
|
||
for (i, mut test) in test_iter.enumerate() {
|
||
let s: String = test.utf8_vec.into_iter().collect();
|
||
let iter = segmenter.segment_str(&s);
|
||
let result: Vec<usize> = iter.collect();
|
||
// NOTE: For consistency with ICU4C and other Segmenters, we return a breakpoint at
|
||
// index 0, despite UAX #14 suggesting otherwise. See issue #3283.
|
||
if test.break_result_utf8.first() != Some(&0) {
|
||
test.break_result_utf8.insert(0, 0);
|
||
}
|
||
if result != test.break_result_utf8 {
|
||
let lb = icu::properties::maps::line_break();
|
||
let lb_name = icu::properties::LineBreak::enum_to_long_name_mapper();
|
||
let mut iter = segmenter.segment_str(&s);
|
||
// TODO(egg): It would be really nice to have Name here.
|
||
println!(" | A | E | Code pt. | Line_Break | Literal");
|
||
for (i, c) in s.char_indices() {
|
||
let expected_break = test.break_result_utf8.contains(&i);
|
||
let actual_break = result.contains(&i);
|
||
if actual_break {
|
||
iter.next();
|
||
}
|
||
println!(
|
||
"{}| {} | {} | {:>8} | {:>18} | {}",
|
||
if actual_break != expected_break {
|
||
"😭"
|
||
} else {
|
||
" "
|
||
},
|
||
if actual_break { "÷" } else { "×" },
|
||
if expected_break { "÷" } else { "×" },
|
||
format!("{:04X}", c as u32),
|
||
lb_name
|
||
.get(lb.get(c))
|
||
.unwrap_or(&format!("{:?}", lb.get(c))),
|
||
c
|
||
)
|
||
}
|
||
println!("Test case #{}", i);
|
||
panic!()
|
||
}
|
||
|
||
let iter = segmenter.segment_utf16(&test.utf16_vec);
|
||
let result: Vec<usize> = iter.collect();
|
||
if test.break_result_utf16.first() != Some(&0) {
|
||
test.break_result_utf16.insert(0, 0);
|
||
}
|
||
assert_eq!(
|
||
result, test.break_result_utf16,
|
||
"UTF16: {}",
|
||
test.original_line
|
||
);
|
||
|
||
// Test data is Latin-1 character only, it can run for Latin-1 segmenter test.
|
||
if let Some(mut break_result_latin1) = test.break_result_latin1 {
|
||
let iter = segmenter.segment_latin1(&test.latin1_vec);
|
||
if break_result_latin1.first() != Some(&0) {
|
||
break_result_latin1.insert(0, 0);
|
||
}
|
||
let result: Vec<usize> = iter.collect();
|
||
assert_eq!(
|
||
result, break_result_latin1,
|
||
"Latin1: {}",
|
||
test.original_line
|
||
);
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Runs the line-break check over the embedded `testdata/LineBreakTest.txt`.
#[test]
fn run_line_break_test() {
    let data = include_str!("testdata/LineBreakTest.txt");
    line_break_test(data);
}
|
||
|
||
/// Runs the line-break check over the embedded `testdata/LineBreakExtraTest.txt`.
#[test]
fn run_line_break_extra_test() {
    let data = include_str!("testdata/LineBreakExtraTest.txt");
    line_break_test(data);
}
|
||
|
||
fn word_break_test(file: &'static str) {
|
||
let test_iter = TestContentIterator::new(file);
|
||
let segmenter = WordSegmenter::new_dictionary();
|
||
for (i, test) in test_iter.enumerate() {
|
||
let s: String = test.utf8_vec.into_iter().collect();
|
||
let iter = segmenter.segment_str(&s);
|
||
let result: Vec<usize> = iter.collect();
|
||
if result != test.break_result_utf8 {
|
||
let wb = icu::properties::maps::word_break();
|
||
let wb_name = icu::properties::WordBreak::enum_to_long_name_mapper();
|
||
let mut iter = segmenter.segment_str(&s);
|
||
// TODO(egg): It would be really nice to have Name here.
|
||
println!(" | A | E | Code pt. | Word_Break | State | Literal");
|
||
for (i, c) in s.char_indices() {
|
||
let expected_break = test.break_result_utf8.contains(&i);
|
||
let actual_break = result.contains(&i);
|
||
if actual_break {
|
||
iter.next();
|
||
}
|
||
println!(
|
||
"{}| {} | {} | {:>8} | {:>14} | {} | {}",
|
||
if actual_break != expected_break {
|
||
"😭"
|
||
} else {
|
||
" "
|
||
},
|
||
if actual_break { "÷" } else { "×" },
|
||
if expected_break { "÷" } else { "×" },
|
||
format!("{:04X}", c as u32),
|
||
wb_name
|
||
.get(wb.get(c))
|
||
.unwrap_or(&format!("{:?}", wb.get(c))),
|
||
// Placeholder for logging the state if exposed.
|
||
// Not "?????" to hide from clippy.
|
||
"?".repeat(5),
|
||
c
|
||
)
|
||
}
|
||
println!("Test case #{}", i);
|
||
panic!()
|
||
}
|
||
|
||
let iter = segmenter.segment_utf16(&test.utf16_vec);
|
||
let result: Vec<usize> = iter.collect();
|
||
assert_eq!(
|
||
result, test.break_result_utf16,
|
||
"UTF16: {}",
|
||
test.original_line
|
||
);
|
||
|
||
// Test data is Latin-1 character only, it can run for Latin-1 segmenter test.
|
||
if let Some(break_result_latin1) = test.break_result_latin1 {
|
||
let iter = segmenter.segment_latin1(&test.latin1_vec);
|
||
let result: Vec<usize> = iter.collect();
|
||
assert_eq!(
|
||
result, break_result_latin1,
|
||
"Latin1: {}",
|
||
test.original_line
|
||
);
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Runs the word-break check over the embedded `testdata/WordBreakTest.txt`.
#[test]
fn run_word_break_test() {
    let data = include_str!("testdata/WordBreakTest.txt");
    word_break_test(data);
}
|
||
|
||
/// Runs the word-break check over the embedded `testdata/WordBreakExtraTest.txt`.
#[test]
fn run_word_break_extra_test() {
    let data = include_str!("testdata/WordBreakExtraTest.txt");
    word_break_test(data);
}
|
||
|
||
fn grapheme_break_test(file: &'static str) {
|
||
let test_iter = TestContentIterator::new(file);
|
||
let segmenter = GraphemeClusterSegmenter::new();
|
||
for (i, test) in test_iter.enumerate() {
|
||
let s: String = test.utf8_vec.into_iter().collect();
|
||
let iter = segmenter.segment_str(&s);
|
||
let result: Vec<usize> = iter.collect();
|
||
if result != test.break_result_utf8 {
|
||
let gcb = icu::properties::maps::grapheme_cluster_break();
|
||
let gcb_name = icu::properties::GraphemeClusterBreak::enum_to_long_name_mapper();
|
||
let mut iter = segmenter.segment_str(&s);
|
||
// TODO(egg): It would be really nice to have Name here.
|
||
println!(" | A | E | Code pt. | GCB | State | Literal");
|
||
for (i, c) in s.char_indices() {
|
||
let expected_break = test.break_result_utf8.contains(&i);
|
||
let actual_break = result.contains(&i);
|
||
if actual_break {
|
||
iter.next();
|
||
}
|
||
println!(
|
||
"{}| {} | {} | {:>8} | {:>14} | {} | {}",
|
||
if actual_break != expected_break {
|
||
"😭"
|
||
} else {
|
||
" "
|
||
},
|
||
if actual_break { "÷" } else { "×" },
|
||
if expected_break { "÷" } else { "×" },
|
||
format!("{:04X}", c as u32),
|
||
gcb_name
|
||
.get(gcb.get(c))
|
||
.unwrap_or(&format!("{:?}", gcb.get(c))),
|
||
// Placeholder for logging the state if exposed.
|
||
// Not "?????" to hide from clippy.
|
||
"?".repeat(5),
|
||
c
|
||
)
|
||
}
|
||
println!("Test case #{}", i);
|
||
panic!()
|
||
}
|
||
|
||
let iter = segmenter.segment_utf16(&test.utf16_vec);
|
||
let result: Vec<usize> = iter.collect();
|
||
assert_eq!(
|
||
result, test.break_result_utf16,
|
||
"UTF16: {}",
|
||
test.original_line
|
||
);
|
||
|
||
// Test data is Latin-1 character only, it can run for Latin-1 segmenter test.
|
||
if let Some(break_result_latin1) = test.break_result_latin1 {
|
||
let iter = segmenter.segment_latin1(&test.latin1_vec);
|
||
let result: Vec<usize> = iter.collect();
|
||
assert_eq!(
|
||
result, break_result_latin1,
|
||
"Latin1: {}",
|
||
test.original_line
|
||
);
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Runs the grapheme-break check over the embedded `testdata/GraphemeBreakTest.txt`.
#[test]
fn run_grapheme_break_test() {
    let data = include_str!("testdata/GraphemeBreakTest.txt");
    grapheme_break_test(data);
}
|
||
|
||
/// Runs the grapheme-break check over the embedded
/// `testdata/GraphemeBreakExtraTest.txt`.
#[test]
fn run_grapheme_break_extra_test() {
    let data = include_str!("testdata/GraphemeBreakExtraTest.txt");
    grapheme_break_test(data);
}
|
||
|
||
fn sentence_break_test(file: &'static str) {
|
||
let test_iter = TestContentIterator::new(file);
|
||
let segmenter = SentenceSegmenter::new();
|
||
for (i, test) in test_iter.enumerate() {
|
||
let s: String = test.utf8_vec.into_iter().collect();
|
||
let iter = segmenter.segment_str(&s);
|
||
let result: Vec<usize> = iter.collect();
|
||
if result != test.break_result_utf8 {
|
||
let sb = icu::properties::maps::sentence_break();
|
||
let sb_name = icu::properties::SentenceBreak::enum_to_long_name_mapper();
|
||
let mut iter = segmenter.segment_str(&s);
|
||
// TODO(egg): It would be really nice to have Name here.
|
||
println!(" | A | E | Code pt. | Sentence_Break | State | Literal");
|
||
for (i, c) in s.char_indices() {
|
||
let expected_break = test.break_result_utf8.contains(&i);
|
||
let actual_break = result.contains(&i);
|
||
if actual_break {
|
||
iter.next();
|
||
}
|
||
println!(
|
||
"{}| {} | {} | {:>8} | {:>14} | {} | {}",
|
||
if actual_break != expected_break {
|
||
"😭"
|
||
} else {
|
||
" "
|
||
},
|
||
if actual_break { "÷" } else { "×" },
|
||
if expected_break { "÷" } else { "×" },
|
||
format!("{:04X}", c as u32),
|
||
sb_name
|
||
.get(sb.get(c))
|
||
.unwrap_or(&format!("{:?}", sb.get(c))),
|
||
// Placeholder for logging the state if exposed.
|
||
// Not "?????" to hide from clippy.
|
||
"?".repeat(5),
|
||
c
|
||
)
|
||
}
|
||
println!("Test case #{}", i);
|
||
panic!()
|
||
}
|
||
|
||
let iter = segmenter.segment_utf16(&test.utf16_vec);
|
||
let result: Vec<usize> = iter.collect();
|
||
assert_eq!(
|
||
result, test.break_result_utf16,
|
||
"UTF16: {}",
|
||
test.original_line
|
||
);
|
||
|
||
// Test data is Latin-1 character only, it can run for Latin-1 segmenter test.
|
||
if let Some(break_result_latin1) = test.break_result_latin1 {
|
||
let iter = segmenter.segment_latin1(&test.latin1_vec);
|
||
let result: Vec<usize> = iter.collect();
|
||
assert_eq!(
|
||
result, break_result_latin1,
|
||
"Latin1: {}",
|
||
test.original_line
|
||
);
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Runs the sentence-break check over the embedded `testdata/SentenceBreakTest.txt`.
#[test]
fn run_sentence_break_test() {
    let data = include_str!("testdata/SentenceBreakTest.txt");
    sentence_break_test(data);
}
|
||
|
||
/// Runs the sentence-break check over the embedded
/// `testdata/SentenceBreakExtraTest.txt`.
#[test]
fn run_sentence_break_extra_test() {
    let data = include_str!("testdata/SentenceBreakExtraTest.txt");
    sentence_break_test(data);
}
|
||
|
||
/// Runs the sentence-break check over the embedded
/// `testdata/SentenceBreakRandomTest.txt`.
#[test]
fn run_sentence_break_random_test() {
    let data = include_str!("testdata/SentenceBreakRandomTest.txt");
    sentence_break_test(data);
}
|