1
0
Fork 0
firefox/third_party/rust/icu_segmenter/tests/spec_test.rs
Daniel Baumann 5e9a113729
Adding upstream version 140.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
2025-06-25 09:37:52 +02:00

399 lines
14 KiB
Rust
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use icu_segmenter::GraphemeClusterSegmenter;
use icu_segmenter::LineSegmenter;
use icu_segmenter::SentenceSegmenter;
use icu_segmenter::WordSegmenter;
use std::char;
/// Iterator over the test cases contained in a break-test data file.
///
/// Wraps the `'\n'`-separated lines of the file; each call to `next` parses one data line
/// into a [`TestData`].
struct TestContentIterator(core::str::Split<'static, char>);

/// A single parsed test case.
struct TestData {
    /// The complete source line, including any trailing `#` comment.
    original_line: &'static str,
    /// The code points of the test string, in order.
    utf8_vec: Vec<char>,
    /// The test string encoded as UTF-16 code units.
    utf16_vec: Vec<u16>,
    /// The code points below U+0100 as single bytes. Code points at or above U+0100 are
    /// left out, so this is only a faithful encoding when `break_result_latin1` is `Some`.
    latin1_vec: Vec<u8>,
    /// Expected break positions, measured in UTF-8 bytes.
    break_result_utf8: Vec<usize>,
    /// Expected break positions, measured in UTF-16 code units.
    break_result_utf16: Vec<usize>,
    /// Expected break positions, measured in Latin-1 bytes; `None` when the test string
    /// contains a code point at or above U+0100.
    break_result_latin1: Option<Vec<usize>>,
}

impl TestContentIterator {
    /// Creates an iterator over the lines of `file`, split on `'\n'`.
    pub fn new(file: &'static str) -> Self {
        let lines = file.split('\n');
        TestContentIterator(lines)
    }
}

impl Iterator for TestContentIterator {
    type Item = TestData;

    /// Parses the next data line.
    ///
    /// Lines starting with `#` are skipped. An empty line (such as the one produced by the
    /// newline at the end of the file) ends the iteration. A data line alternates between a
    /// break marker (`÷` for a break, `×` for no break) and a hexadecimal code point.
    ///
    /// # Panics
    ///
    /// Panics if a marker is neither `÷` nor `×`, or if a code point token is not a valid
    /// hexadecimal Unicode scalar value.
    fn next(&mut self) -> Option<Self::Item> {
        // Advance to the next line that is not a comment.
        let line = loop {
            let candidate = self.0.next()?;
            if candidate.is_empty() {
                // EOF
                return None;
            }
            if !candidate.starts_with('#') {
                break candidate;
            }
        };

        // Everything from the first '#' onwards is a trailing comment.
        let payload = match line.split_once('#') {
            Some((before_comment, _)) => before_comment,
            None => line,
        };

        let mut code_points: Vec<char> = Vec::new();
        let mut utf16_units: Vec<u16> = Vec::new();
        let mut latin1_units: Vec<u8> = Vec::new();
        let mut utf8_breaks: Vec<usize> = Vec::new();
        let mut utf16_breaks: Vec<usize> = Vec::new();
        let mut latin1_breaks: Vec<usize> = Vec::new();
        // Running length of the test string in each encoding.
        let (mut utf8_offset, mut utf16_offset, mut latin1_offset) = (0usize, 0usize, 0usize);
        let mut latin1_only = true;

        for (position, token) in payload.split_ascii_whitespace().enumerate() {
            if position % 2 == 0 {
                // Even positions hold a break marker.
                match token {
                    "\u{00d7}" => {}
                    other => {
                        assert_eq!(other, "\u{00f7}");
                        utf8_breaks.push(utf8_offset);
                        latin1_breaks.push(latin1_offset);
                        utf16_breaks.push(utf16_offset);
                    }
                }
                continue;
            }

            // Odd positions hold a hexadecimal code point.
            let code = u32::from_str_radix(token, 16).unwrap();
            let ch = char::from_u32(code).unwrap();
            code_points.push(ch);
            utf8_offset += ch.len_utf8();

            if code < 0x100 {
                latin1_units.push(code as u8);
                latin1_offset += 1;
            } else {
                latin1_only = false;
            }

            let mut buf = [0u16; 2];
            let units = ch.encode_utf16(&mut buf);
            utf16_offset += units.len();
            utf16_units.extend_from_slice(units);
        }

        Some(TestData {
            original_line: line,
            utf8_vec: code_points,
            utf16_vec: utf16_units,
            latin1_vec: latin1_units,
            break_result_utf8: utf8_breaks,
            break_result_utf16: utf16_breaks,
            break_result_latin1: latin1_only.then_some(latin1_breaks),
        })
    }
}
fn line_break_test(file: &'static str) {
let test_iter = TestContentIterator::new(file);
let segmenter = LineSegmenter::new_dictionary();
for (i, mut test) in test_iter.enumerate() {
let s: String = test.utf8_vec.into_iter().collect();
let iter = segmenter.segment_str(&s);
let result: Vec<usize> = iter.collect();
// NOTE: For consistency with ICU4C and other Segmenters, we return a breakpoint at
// index 0, despite UAX #14 suggesting otherwise. See issue #3283.
if test.break_result_utf8.first() != Some(&0) {
test.break_result_utf8.insert(0, 0);
}
if result != test.break_result_utf8 {
let lb = icu::properties::maps::line_break();
let lb_name = icu::properties::LineBreak::enum_to_long_name_mapper();
let mut iter = segmenter.segment_str(&s);
// TODO(egg): It would be really nice to have Name here.
println!(" | A | E | Code pt. | Line_Break | Literal");
for (i, c) in s.char_indices() {
let expected_break = test.break_result_utf8.contains(&i);
let actual_break = result.contains(&i);
if actual_break {
iter.next();
}
println!(
"{}| {} | {} | {:>8} | {:>18} | {}",
if actual_break != expected_break {
"😭"
} else {
" "
},
if actual_break { "÷" } else { "×" },
if expected_break { "÷" } else { "×" },
format!("{:04X}", c as u32),
lb_name
.get(lb.get(c))
.unwrap_or(&format!("{:?}", lb.get(c))),
c
)
}
println!("Test case #{}", i);
panic!()
}
let iter = segmenter.segment_utf16(&test.utf16_vec);
let result: Vec<usize> = iter.collect();
if test.break_result_utf16.first() != Some(&0) {
test.break_result_utf16.insert(0, 0);
}
assert_eq!(
result, test.break_result_utf16,
"UTF16: {}",
test.original_line
);
// Test data is Latin-1 character only, it can run for Latin-1 segmenter test.
if let Some(mut break_result_latin1) = test.break_result_latin1 {
let iter = segmenter.segment_latin1(&test.latin1_vec);
if break_result_latin1.first() != Some(&0) {
break_result_latin1.insert(0, 0);
}
let result: Vec<usize> = iter.collect();
assert_eq!(
result, break_result_latin1,
"Latin1: {}",
test.original_line
);
}
}
}
/// Checks the line segmenter against the cases in `testdata/LineBreakTest.txt`.
#[test]
fn run_line_break_test() {
    const TEST_DATA: &str = include_str!("testdata/LineBreakTest.txt");
    line_break_test(TEST_DATA);
}
/// Checks the line segmenter against the cases in `testdata/LineBreakExtraTest.txt`.
#[test]
fn run_line_break_extra_test() {
    const TEST_DATA: &str = include_str!("testdata/LineBreakExtraTest.txt");
    line_break_test(TEST_DATA);
}
fn word_break_test(file: &'static str) {
let test_iter = TestContentIterator::new(file);
let segmenter = WordSegmenter::new_dictionary();
for (i, test) in test_iter.enumerate() {
let s: String = test.utf8_vec.into_iter().collect();
let iter = segmenter.segment_str(&s);
let result: Vec<usize> = iter.collect();
if result != test.break_result_utf8 {
let wb = icu::properties::maps::word_break();
let wb_name = icu::properties::WordBreak::enum_to_long_name_mapper();
let mut iter = segmenter.segment_str(&s);
// TODO(egg): It would be really nice to have Name here.
println!(" | A | E | Code pt. | Word_Break | State | Literal");
for (i, c) in s.char_indices() {
let expected_break = test.break_result_utf8.contains(&i);
let actual_break = result.contains(&i);
if actual_break {
iter.next();
}
println!(
"{}| {} | {} | {:>8} | {:>14} | {} | {}",
if actual_break != expected_break {
"😭"
} else {
" "
},
if actual_break { "÷" } else { "×" },
if expected_break { "÷" } else { "×" },
format!("{:04X}", c as u32),
wb_name
.get(wb.get(c))
.unwrap_or(&format!("{:?}", wb.get(c))),
// Placeholder for logging the state if exposed.
// Not "?????" to hide from clippy.
"?".repeat(5),
c
)
}
println!("Test case #{}", i);
panic!()
}
let iter = segmenter.segment_utf16(&test.utf16_vec);
let result: Vec<usize> = iter.collect();
assert_eq!(
result, test.break_result_utf16,
"UTF16: {}",
test.original_line
);
// Test data is Latin-1 character only, it can run for Latin-1 segmenter test.
if let Some(break_result_latin1) = test.break_result_latin1 {
let iter = segmenter.segment_latin1(&test.latin1_vec);
let result: Vec<usize> = iter.collect();
assert_eq!(
result, break_result_latin1,
"Latin1: {}",
test.original_line
);
}
}
}
/// Checks the word segmenter against the cases in `testdata/WordBreakTest.txt`.
#[test]
fn run_word_break_test() {
    const TEST_DATA: &str = include_str!("testdata/WordBreakTest.txt");
    word_break_test(TEST_DATA);
}
/// Checks the word segmenter against the cases in `testdata/WordBreakExtraTest.txt`.
#[test]
fn run_word_break_extra_test() {
    const TEST_DATA: &str = include_str!("testdata/WordBreakExtraTest.txt");
    word_break_test(TEST_DATA);
}
fn grapheme_break_test(file: &'static str) {
let test_iter = TestContentIterator::new(file);
let segmenter = GraphemeClusterSegmenter::new();
for (i, test) in test_iter.enumerate() {
let s: String = test.utf8_vec.into_iter().collect();
let iter = segmenter.segment_str(&s);
let result: Vec<usize> = iter.collect();
if result != test.break_result_utf8 {
let gcb = icu::properties::maps::grapheme_cluster_break();
let gcb_name = icu::properties::GraphemeClusterBreak::enum_to_long_name_mapper();
let mut iter = segmenter.segment_str(&s);
// TODO(egg): It would be really nice to have Name here.
println!(" | A | E | Code pt. | GCB | State | Literal");
for (i, c) in s.char_indices() {
let expected_break = test.break_result_utf8.contains(&i);
let actual_break = result.contains(&i);
if actual_break {
iter.next();
}
println!(
"{}| {} | {} | {:>8} | {:>14} | {} | {}",
if actual_break != expected_break {
"😭"
} else {
" "
},
if actual_break { "÷" } else { "×" },
if expected_break { "÷" } else { "×" },
format!("{:04X}", c as u32),
gcb_name
.get(gcb.get(c))
.unwrap_or(&format!("{:?}", gcb.get(c))),
// Placeholder for logging the state if exposed.
// Not "?????" to hide from clippy.
"?".repeat(5),
c
)
}
println!("Test case #{}", i);
panic!()
}
let iter = segmenter.segment_utf16(&test.utf16_vec);
let result: Vec<usize> = iter.collect();
assert_eq!(
result, test.break_result_utf16,
"UTF16: {}",
test.original_line
);
// Test data is Latin-1 character only, it can run for Latin-1 segmenter test.
if let Some(break_result_latin1) = test.break_result_latin1 {
let iter = segmenter.segment_latin1(&test.latin1_vec);
let result: Vec<usize> = iter.collect();
assert_eq!(
result, break_result_latin1,
"Latin1: {}",
test.original_line
);
}
}
}
/// Checks the grapheme cluster segmenter against the cases in
/// `testdata/GraphemeBreakTest.txt`.
#[test]
fn run_grapheme_break_test() {
    const TEST_DATA: &str = include_str!("testdata/GraphemeBreakTest.txt");
    grapheme_break_test(TEST_DATA);
}
/// Checks the grapheme cluster segmenter against the cases in
/// `testdata/GraphemeBreakExtraTest.txt`.
#[test]
fn run_grapheme_break_extra_test() {
    const TEST_DATA: &str = include_str!("testdata/GraphemeBreakExtraTest.txt");
    grapheme_break_test(TEST_DATA);
}
fn sentence_break_test(file: &'static str) {
let test_iter = TestContentIterator::new(file);
let segmenter = SentenceSegmenter::new();
for (i, test) in test_iter.enumerate() {
let s: String = test.utf8_vec.into_iter().collect();
let iter = segmenter.segment_str(&s);
let result: Vec<usize> = iter.collect();
if result != test.break_result_utf8 {
let sb = icu::properties::maps::sentence_break();
let sb_name = icu::properties::SentenceBreak::enum_to_long_name_mapper();
let mut iter = segmenter.segment_str(&s);
// TODO(egg): It would be really nice to have Name here.
println!(" | A | E | Code pt. | Sentence_Break | State | Literal");
for (i, c) in s.char_indices() {
let expected_break = test.break_result_utf8.contains(&i);
let actual_break = result.contains(&i);
if actual_break {
iter.next();
}
println!(
"{}| {} | {} | {:>8} | {:>14} | {} | {}",
if actual_break != expected_break {
"😭"
} else {
" "
},
if actual_break { "÷" } else { "×" },
if expected_break { "÷" } else { "×" },
format!("{:04X}", c as u32),
sb_name
.get(sb.get(c))
.unwrap_or(&format!("{:?}", sb.get(c))),
// Placeholder for logging the state if exposed.
// Not "?????" to hide from clippy.
"?".repeat(5),
c
)
}
println!("Test case #{}", i);
panic!()
}
let iter = segmenter.segment_utf16(&test.utf16_vec);
let result: Vec<usize> = iter.collect();
assert_eq!(
result, test.break_result_utf16,
"UTF16: {}",
test.original_line
);
// Test data is Latin-1 character only, it can run for Latin-1 segmenter test.
if let Some(break_result_latin1) = test.break_result_latin1 {
let iter = segmenter.segment_latin1(&test.latin1_vec);
let result: Vec<usize> = iter.collect();
assert_eq!(
result, break_result_latin1,
"Latin1: {}",
test.original_line
);
}
}
}
/// Checks the sentence segmenter against the cases in `testdata/SentenceBreakTest.txt`.
#[test]
fn run_sentence_break_test() {
    const TEST_DATA: &str = include_str!("testdata/SentenceBreakTest.txt");
    sentence_break_test(TEST_DATA);
}
/// Checks the sentence segmenter against the cases in
/// `testdata/SentenceBreakExtraTest.txt`.
#[test]
fn run_sentence_break_extra_test() {
    const TEST_DATA: &str = include_str!("testdata/SentenceBreakExtraTest.txt");
    sentence_break_test(TEST_DATA);
}
/// Checks the sentence segmenter against the cases in
/// `testdata/SentenceBreakRandomTest.txt`.
#[test]
fn run_sentence_break_random_test() {
    const TEST_DATA: &str = include_str!("testdata/SentenceBreakRandomTest.txt");
    sentence_break_test(TEST_DATA);
}