diff options
Diffstat (limited to 'intl/icu_capi/cpp/examples/segmenter')
-rw-r--r-- | intl/icu_capi/cpp/examples/segmenter/.gitignore | 2 | ||||
-rw-r--r-- | intl/icu_capi/cpp/examples/segmenter/Makefile | 28 | ||||
-rw-r--r-- | intl/icu_capi/cpp/examples/segmenter/test.cpp | 160 |
3 files changed, 190 insertions, 0 deletions
diff --git a/intl/icu_capi/cpp/examples/segmenter/.gitignore b/intl/icu_capi/cpp/examples/segmenter/.gitignore new file mode 100644 index 0000000000..cb34c546b3 --- /dev/null +++ b/intl/icu_capi/cpp/examples/segmenter/.gitignore @@ -0,0 +1,2 @@ +a.out +a.out.dSYM diff --git a/intl/icu_capi/cpp/examples/segmenter/Makefile b/intl/icu_capi/cpp/examples/segmenter/Makefile new file mode 100644 index 0000000000..57a9c06943 --- /dev/null +++ b/intl/icu_capi/cpp/examples/segmenter/Makefile @@ -0,0 +1,28 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +.DEFAULT_GOAL := test +.PHONY: build test +FORCE: + +ALL_HEADERS := $(wildcard ../../include/*.hpp) $(wildcard ../../../c/include/*.h) + +CXX?=g++ + +$(ALL_HEADERS): + +../../../../../target/debug/libicu_capi_staticlib.a: FORCE + cargo build -p icu_capi_staticlib + +a.out: ../../../../../target/debug/libicu_capi_staticlib.a $(ALL_HEADERS) test.cpp + $(CXX) -std=c++17 test.cpp ../../../../../target/debug/libicu_capi_staticlib.a -ldl -lpthread -lm -g + +build: a.out + +test: build + ./a.out + +clean: + git clean -xf * + rm -f ../../../../../target/debug/libicu_capi_staticlib.a diff --git a/intl/icu_capi/cpp/examples/segmenter/test.cpp b/intl/icu_capi/cpp/examples/segmenter/test.cpp new file mode 100644 index 0000000000..cfe0899f76 --- /dev/null +++ b/intl/icu_capi/cpp/examples/segmenter/test.cpp @@ -0,0 +1,160 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +#include "../../include/ICU4XDataProvider.hpp" +#include "../../include/ICU4XGraphemeClusterSegmenter.hpp" +#include "../../include/ICU4XLineSegmenter.hpp" +#include "../../include/ICU4XSentenceSegmenter.hpp" +#include "../../include/ICU4XWordSegmenter.hpp" +#include "../../include/ICU4XLogger.hpp" + +#include <iostream> +#include <string_view> + +using std::cout; +using std::endl; + +void print_ruler(size_t str_len) { + for (size_t i = 0; i < str_len; i++) { + if (i % 10 == 0) { + cout << "0"; + } else if (i % 5 == 0) { + cout << "5"; + } else { + cout << "."; + } + } + cout << endl; +} + +template <typename Iterator> +void iterate_breakpoints(Iterator& iterator) { + while (true) { + int32_t breakpoint = iterator.next(); + if (breakpoint == -1) { + break; + } + cout << " " << breakpoint; + } + cout << endl; +} + +template <typename Iterator> +void iterate_word_breakpoints(Iterator& iterator) { + while (true) { + int32_t breakpoint = iterator.next(); + if (breakpoint == -1) { + break; + } + cout << " " << breakpoint; + switch (iterator.word_type()) { + case ICU4XSegmenterWordType::None: + cout << " (none"; + break; + case ICU4XSegmenterWordType::Number: + cout << " (number"; + break; + case ICU4XSegmenterWordType::Letter: + cout << " (letter"; + break; + default: + cout << " (unknown status"; + break; + } + if (iterator.is_word_like()) { + cout << ", word-like"; + } + cout << ")"; + } + cout << endl; +} + +void test_line(const std::string_view& str) { + const auto provider = ICU4XDataProvider::create_compiled(); + const auto segmenter_auto = + ICU4XLineSegmenter::create_auto(provider).ok().value(); + const auto segmenter_lstm = + ICU4XLineSegmenter::create_lstm(provider).ok().value(); + const auto segmenter_dictionary = + ICU4XLineSegmenter::create_dictionary(provider).ok().value(); + + const ICU4XLineSegmenter* segmenters[] = {&segmenter_auto, &segmenter_lstm, + &segmenter_dictionary}; + for (const auto* segmenter : segmenters) { + cout << "Finding line breakpoints in string:" << endl << str << endl; + print_ruler(str.size()); + + cout << "Line breakpoints:"; + auto iterator = segmenter->segment_utf8(str); + iterate_breakpoints(iterator); + } +} + +void test_grapheme(const std::string_view& str) { + const auto provider = ICU4XDataProvider::create_compiled(); + const auto segmenter = ICU4XGraphemeClusterSegmenter::create(provider).ok().value(); + cout << "Finding grapheme cluster breakpoints in string:" << endl + << str << endl; + print_ruler(str.size()); + + cout << "Grapheme cluster breakpoints:"; + auto iterator = segmenter.segment_utf8(str); + iterate_breakpoints(iterator); +} + +void test_word(const std::string_view& str) { + const auto provider = ICU4XDataProvider::create_compiled(); + const auto segmenter_auto = + ICU4XWordSegmenter::create_auto(provider).ok().value(); + const auto segmenter_lstm = + ICU4XWordSegmenter::create_lstm(provider).ok().value(); + const auto segmenter_dictionary = + ICU4XWordSegmenter::create_dictionary(provider).ok().value(); + + const ICU4XWordSegmenter* segmenters[] = {&segmenter_auto, &segmenter_lstm, + &segmenter_dictionary}; + for (const auto* segmenter : segmenters) { + cout << "Finding word breakpoints in string:" << endl << str << endl; + print_ruler(str.size()); + + cout << "Word breakpoints:"; + auto iterator = segmenter->segment_utf8(str); + iterate_word_breakpoints(iterator); + } +} + +void test_sentence(const std::string_view& str) { + const auto provider = ICU4XDataProvider::create_compiled(); + const auto segmenter = ICU4XSentenceSegmenter::create(provider).ok().value(); + cout << "Finding sentence breakpoints in string:" << endl + << str << endl; + print_ruler(str.size()); + + cout << "Sentence breakpoints:"; + auto iterator = segmenter.segment_utf8(str); + iterate_breakpoints(iterator); +} + +int main(int argc, char* argv[]) { + ICU4XLogger::init_simple_logger(); + std::string_view str; + if (argc >= 2) { + str = argv[1]; + } else { + str = "The 101 quick brown foxes jump over the lazy dog."; + } + + test_line(str); + cout << endl; + + test_grapheme(str); + cout << endl; + + test_word(str); + cout << endl; + + test_sentence(str); + cout << endl; + return 0; +} |