// This file is part of ICU4X. For terms of use, please see the file // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). use icu_segmenter::GraphemeClusterSegmenter; use icu_segmenter::LineSegmenter; use icu_segmenter::SentenceSegmenter; use icu_segmenter::WordSegmenter; use std::char; struct TestContentIterator(core::str::Split<'static, char>); struct TestData { original_line: &'static str, utf8_vec: Vec, utf16_vec: Vec, latin1_vec: Vec, break_result_utf8: Vec, break_result_utf16: Vec, break_result_latin1: Option>, } impl TestContentIterator { pub fn new(file: &'static str) -> Self { Self(file.split('\n')) } } impl Iterator for TestContentIterator { type Item = TestData; fn next(&mut self) -> Option { loop { let line = self.0.next()?; if line.is_empty() { // EOF return None; } if line.starts_with('#') { // Comment continue; } let mut r = line.split('#'); let r = r.next(); let v = r.unwrap().split_ascii_whitespace(); let mut char_break: Vec<_> = Vec::new(); let mut u8_break: Vec<_> = Vec::new(); let mut u16_break: Vec<_> = Vec::new(); let mut char_vec: Vec<_> = Vec::new(); let mut u8_vec: Vec<_> = Vec::new(); let mut u16_vec: Vec<_> = Vec::new(); let mut char_len = 0; let mut u8_len = 0; let mut u16_len = 0; let mut ascii_only = true; for (count, item) in v.enumerate() { if count % 2 == 1 { let ch = char::from_u32(u32::from_str_radix(item, 16).unwrap()).unwrap(); char_vec.push(ch); char_len += ch.len_utf8(); if ch as u32 >= 0x100 { ascii_only = false; } else { u8_vec.push(ch as u8); u8_len += 1; } let mut u16_buf = [0; 2]; let ch_u16 = ch.encode_utf16(&mut u16_buf); u16_vec.extend_from_slice(ch_u16); u16_len += ch_u16.len(); } else if item != "\u{00d7}" { assert_eq!(item, "\u{00f7}"); char_break.push(char_len); u8_break.push(u8_len); u16_break.push(u16_len); } } return Some(Self::Item { original_line: line, utf8_vec: char_vec, utf16_vec: u16_vec, latin1_vec: u8_vec, break_result_utf8: char_break, break_result_utf16: u16_break, break_result_latin1: if ascii_only { Some(u8_break) } else { None }, }); } } } fn line_break_test(file: &'static str) { let test_iter = TestContentIterator::new(file); let segmenter = LineSegmenter::new_dictionary(); for (i, mut test) in test_iter.enumerate() { let s: String = test.utf8_vec.into_iter().collect(); let iter = segmenter.segment_str(&s); let result: Vec = iter.collect(); // NOTE: For consistency with ICU4C and other Segmenters, we return a breakpoint at // index 0, despite UAX #14 suggesting otherwise. See issue #3283. if test.break_result_utf8.first() != Some(&0) { test.break_result_utf8.insert(0, 0); } if result != test.break_result_utf8 { let lb = icu::properties::maps::line_break(); let lb_name = icu::properties::LineBreak::enum_to_long_name_mapper(); let mut iter = segmenter.segment_str(&s); // TODO(egg): It would be really nice to have Name here. println!(" | A | E | Code pt. | Line_Break | Literal"); for (i, c) in s.char_indices() { let expected_break = test.break_result_utf8.contains(&i); let actual_break = result.contains(&i); if actual_break { iter.next(); } println!( "{}| {} | {} | {:>8} | {:>18} | {}", if actual_break != expected_break { "😭" } else { " " }, if actual_break { "÷" } else { "×" }, if expected_break { "÷" } else { "×" }, format!("{:04X}", c as u32), lb_name .get(lb.get(c)) .unwrap_or(&format!("{:?}", lb.get(c))), c ) } println!("Test case #{}", i); panic!() } let iter = segmenter.segment_utf16(&test.utf16_vec); let result: Vec = iter.collect(); if test.break_result_utf16.first() != Some(&0) { test.break_result_utf16.insert(0, 0); } assert_eq!( result, test.break_result_utf16, "UTF16: {}", test.original_line ); // Test data is Latin-1 character only, it can run for Latin-1 segmenter test. if let Some(mut break_result_latin1) = test.break_result_latin1 { let iter = segmenter.segment_latin1(&test.latin1_vec); if break_result_latin1.first() != Some(&0) { break_result_latin1.insert(0, 0); } let result: Vec = iter.collect(); assert_eq!( result, break_result_latin1, "Latin1: {}", test.original_line ); } } } #[test] fn run_line_break_test() { line_break_test(include_str!("testdata/LineBreakTest.txt")); } #[test] fn run_line_break_extra_test() { line_break_test(include_str!("testdata/LineBreakExtraTest.txt")); } fn word_break_test(file: &'static str) { let test_iter = TestContentIterator::new(file); let segmenter = WordSegmenter::new_dictionary(); for (i, test) in test_iter.enumerate() { let s: String = test.utf8_vec.into_iter().collect(); let iter = segmenter.segment_str(&s); let result: Vec = iter.collect(); if result != test.break_result_utf8 { let wb = icu::properties::maps::word_break(); let wb_name = icu::properties::WordBreak::enum_to_long_name_mapper(); let mut iter = segmenter.segment_str(&s); // TODO(egg): It would be really nice to have Name here. println!(" | A | E | Code pt. | Word_Break | State | Literal"); for (i, c) in s.char_indices() { let expected_break = test.break_result_utf8.contains(&i); let actual_break = result.contains(&i); if actual_break { iter.next(); } println!( "{}| {} | {} | {:>8} | {:>14} | {} | {}", if actual_break != expected_break { "😭" } else { " " }, if actual_break { "÷" } else { "×" }, if expected_break { "÷" } else { "×" }, format!("{:04X}", c as u32), wb_name .get(wb.get(c)) .unwrap_or(&format!("{:?}", wb.get(c))), // Placeholder for logging the state if exposed. // Not "?????" to hide from clippy. "?".repeat(5), c ) } println!("Test case #{}", i); panic!() } let iter = segmenter.segment_utf16(&test.utf16_vec); let result: Vec = iter.collect(); assert_eq!( result, test.break_result_utf16, "UTF16: {}", test.original_line ); // Test data is Latin-1 character only, it can run for Latin-1 segmenter test. if let Some(break_result_latin1) = test.break_result_latin1 { let iter = segmenter.segment_latin1(&test.latin1_vec); let result: Vec = iter.collect(); assert_eq!( result, break_result_latin1, "Latin1: {}", test.original_line ); } } } #[test] fn run_word_break_test() { word_break_test(include_str!("testdata/WordBreakTest.txt")); } #[test] fn run_word_break_extra_test() { word_break_test(include_str!("testdata/WordBreakExtraTest.txt")); } fn grapheme_break_test(file: &'static str) { let test_iter = TestContentIterator::new(file); let segmenter = GraphemeClusterSegmenter::new(); for (i, test) in test_iter.enumerate() { let s: String = test.utf8_vec.into_iter().collect(); let iter = segmenter.segment_str(&s); let result: Vec = iter.collect(); if result != test.break_result_utf8 { let gcb = icu::properties::maps::grapheme_cluster_break(); let gcb_name = icu::properties::GraphemeClusterBreak::enum_to_long_name_mapper(); let mut iter = segmenter.segment_str(&s); // TODO(egg): It would be really nice to have Name here. println!(" | A | E | Code pt. | GCB | State | Literal"); for (i, c) in s.char_indices() { let expected_break = test.break_result_utf8.contains(&i); let actual_break = result.contains(&i); if actual_break { iter.next(); } println!( "{}| {} | {} | {:>8} | {:>14} | {} | {}", if actual_break != expected_break { "😭" } else { " " }, if actual_break { "÷" } else { "×" }, if expected_break { "÷" } else { "×" }, format!("{:04X}", c as u32), gcb_name .get(gcb.get(c)) .unwrap_or(&format!("{:?}", gcb.get(c))), // Placeholder for logging the state if exposed. // Not "?????" to hide from clippy. "?".repeat(5), c ) } println!("Test case #{}", i); panic!() } let iter = segmenter.segment_utf16(&test.utf16_vec); let result: Vec = iter.collect(); assert_eq!( result, test.break_result_utf16, "UTF16: {}", test.original_line ); // Test data is Latin-1 character only, it can run for Latin-1 segmenter test. if let Some(break_result_latin1) = test.break_result_latin1 { let iter = segmenter.segment_latin1(&test.latin1_vec); let result: Vec = iter.collect(); assert_eq!( result, break_result_latin1, "Latin1: {}", test.original_line ); } } } #[test] fn run_grapheme_break_test() { grapheme_break_test(include_str!("testdata/GraphemeBreakTest.txt")); } #[test] fn run_grapheme_break_extra_test() { grapheme_break_test(include_str!("testdata/GraphemeBreakExtraTest.txt")); } fn sentence_break_test(file: &'static str) { let test_iter = TestContentIterator::new(file); let segmenter = SentenceSegmenter::new(); for (i, test) in test_iter.enumerate() { let s: String = test.utf8_vec.into_iter().collect(); let iter = segmenter.segment_str(&s); let result: Vec = iter.collect(); if result != test.break_result_utf8 { let sb = icu::properties::maps::sentence_break(); let sb_name = icu::properties::SentenceBreak::enum_to_long_name_mapper(); let mut iter = segmenter.segment_str(&s); // TODO(egg): It would be really nice to have Name here. println!(" | A | E | Code pt. | Sentence_Break | State | Literal"); for (i, c) in s.char_indices() { let expected_break = test.break_result_utf8.contains(&i); let actual_break = result.contains(&i); if actual_break { iter.next(); } println!( "{}| {} | {} | {:>8} | {:>14} | {} | {}", if actual_break != expected_break { "😭" } else { " " }, if actual_break { "÷" } else { "×" }, if expected_break { "÷" } else { "×" }, format!("{:04X}", c as u32), sb_name .get(sb.get(c)) .unwrap_or(&format!("{:?}", sb.get(c))), // Placeholder for logging the state if exposed. // Not "?????" to hide from clippy. "?".repeat(5), c ) } println!("Test case #{}", i); panic!() } let iter = segmenter.segment_utf16(&test.utf16_vec); let result: Vec = iter.collect(); assert_eq!( result, test.break_result_utf16, "UTF16: {}", test.original_line ); // Test data is Latin-1 character only, it can run for Latin-1 segmenter test. if let Some(break_result_latin1) = test.break_result_latin1 { let iter = segmenter.segment_latin1(&test.latin1_vec); let result: Vec = iter.collect(); assert_eq!( result, break_result_latin1, "Latin1: {}", test.original_line ); } } } #[test] fn run_sentence_break_test() { sentence_break_test(include_str!("testdata/SentenceBreakTest.txt")); } #[test] fn run_sentence_break_extra_test() { sentence_break_test(include_str!("testdata/SentenceBreakExtraTest.txt")); } #[test] fn run_sentence_break_random_test() { sentence_break_test(include_str!("testdata/SentenceBreakRandomTest.txt")); }