diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
commit | 26a029d407be480d791972afb5975cf62c9360a6 (patch) | |
tree | f435a8308119effd964b339f76abb83a57c29483 /intl/icu_capi/cpp/examples/segmenter/test.cpp | |
parent | Initial commit. (diff) | |
download | firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz firefox-26a029d407be480d791972afb5975cf62c9360a6.zip |
Adding upstream version 124.0.1. upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'intl/icu_capi/cpp/examples/segmenter/test.cpp')
-rw-r--r-- | intl/icu_capi/cpp/examples/segmenter/test.cpp | 160 |
1 files changed, 160 insertions, 0 deletions
diff --git a/intl/icu_capi/cpp/examples/segmenter/test.cpp b/intl/icu_capi/cpp/examples/segmenter/test.cpp new file mode 100644 index 0000000000..cfe0899f76 --- /dev/null +++ b/intl/icu_capi/cpp/examples/segmenter/test.cpp @@ -0,0 +1,160 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +#include "../../include/ICU4XDataProvider.hpp" +#include "../../include/ICU4XGraphemeClusterSegmenter.hpp" +#include "../../include/ICU4XLineSegmenter.hpp" +#include "../../include/ICU4XSentenceSegmenter.hpp" +#include "../../include/ICU4XWordSegmenter.hpp" +#include "../../include/ICU4XLogger.hpp" + +#include <iostream> +#include <string_view> + +using std::cout; +using std::endl; + +void print_ruler(size_t str_len) { + for (size_t i = 0; i < str_len; i++) { + if (i % 10 == 0) { + cout << "0"; + } else if (i % 5 == 0) { + cout << "5"; + } else { + cout << "."; + } + } + cout << endl; +} + +template <typename Iterator> +void iterate_breakpoints(Iterator& iterator) { + while (true) { + int32_t breakpoint = iterator.next(); + if (breakpoint == -1) { + break; + } + cout << " " << breakpoint; + } + cout << endl; +} + +template <typename Iterator> +void iterate_word_breakpoints(Iterator& iterator) { + while (true) { + int32_t breakpoint = iterator.next(); + if (breakpoint == -1) { + break; + } + cout << " " << breakpoint; + switch (iterator.word_type()) { + case ICU4XSegmenterWordType::None: + cout << " (none"; + break; + case ICU4XSegmenterWordType::Number: + cout << " (number"; + break; + case ICU4XSegmenterWordType::Letter: + cout << " (letter"; + break; + default: + cout << " (unknown status"; + break; + } + if (iterator.is_word_like()) { + cout << ", word-like"; + } + cout << ")"; + } + cout << endl; +} + +void test_line(const std::string_view& str) { + const auto provider = ICU4XDataProvider::create_compiled(); + const auto 
segmenter_auto = + ICU4XLineSegmenter::create_auto(provider).ok().value(); + const auto segmenter_lstm = + ICU4XLineSegmenter::create_lstm(provider).ok().value(); + const auto segmenter_dictionary = + ICU4XLineSegmenter::create_dictionary(provider).ok().value(); + + const ICU4XLineSegmenter* segmenters[] = {&segmenter_auto, &segmenter_lstm, + &segmenter_dictionary}; + for (const auto* segmenter : segmenters) { + cout << "Finding line breakpoints in string:" << endl << str << endl; + print_ruler(str.size()); + + cout << "Line breakpoints:"; + auto iterator = segmenter->segment_utf8(str); + iterate_breakpoints(iterator); + } +} + +void test_grapheme(const std::string_view& str) { + const auto provider = ICU4XDataProvider::create_compiled(); + const auto segmenter = ICU4XGraphemeClusterSegmenter::create(provider).ok().value(); + cout << "Finding grapheme cluster breakpoints in string:" << endl + << str << endl; + print_ruler(str.size()); + + cout << "Grapheme cluster breakpoints:"; + auto iterator = segmenter.segment_utf8(str); + iterate_breakpoints(iterator); +} + +void test_word(const std::string_view& str) { + const auto provider = ICU4XDataProvider::create_compiled(); + const auto segmenter_auto = + ICU4XWordSegmenter::create_auto(provider).ok().value(); + const auto segmenter_lstm = + ICU4XWordSegmenter::create_lstm(provider).ok().value(); + const auto segmenter_dictionary = + ICU4XWordSegmenter::create_dictionary(provider).ok().value(); + + const ICU4XWordSegmenter* segmenters[] = {&segmenter_auto, &segmenter_lstm, + &segmenter_dictionary}; + for (const auto* segmenter : segmenters) { + cout << "Finding word breakpoints in string:" << endl << str << endl; + print_ruler(str.size()); + + cout << "Word breakpoints:"; + auto iterator = segmenter->segment_utf8(str); + iterate_word_breakpoints(iterator); + } +} + +void test_sentence(const std::string_view& str) { + const auto provider = ICU4XDataProvider::create_compiled(); + const auto segmenter = 
ICU4XSentenceSegmenter::create(provider).ok().value(); + cout << "Finding sentence breakpoints in string:" << endl + << str << endl; + print_ruler(str.size()); + + cout << "Sentence breakpoints:"; + auto iterator = segmenter.segment_utf8(str); + iterate_breakpoints(iterator); +} + +int main(int argc, char* argv[]) { + ICU4XLogger::init_simple_logger(); + std::string_view str; + if (argc >= 2) { + str = argv[1]; + } else { + str = "The 101 quick brown foxes jump over the lazy dog."; + } + + test_line(str); + cout << endl; + + test_grapheme(str); + cout << endl; + + test_word(str); + cout << endl; + + test_sentence(str); + cout << endl; + return 0; +} |