summaryrefslogtreecommitdiffstats
path: root/intl/icu_capi/cpp/examples/segmenter/test.cpp
blob: cfe0899f761ed6bc806846bf781a59beeb001606 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

#include "../../include/ICU4XDataProvider.hpp"
#include "../../include/ICU4XGraphemeClusterSegmenter.hpp"
#include "../../include/ICU4XLineSegmenter.hpp"
#include "../../include/ICU4XSentenceSegmenter.hpp"
#include "../../include/ICU4XWordSegmenter.hpp"
#include "../../include/ICU4XLogger.hpp"

#include <iostream>
#include <string_view>

using std::cout;
using std::endl;

// Writes a one-line column ruler of `str_len` characters to stdout, followed
// by a newline: "0" at every index divisible by 10, "5" at the remaining
// indices divisible by 5, and "." everywhere else.
void print_ruler(size_t str_len) {
    for (size_t column = 0; column != str_len; ++column) {
        // Default to a plain tick; promote to a digit on multiples of 5/10.
        const char* mark = ".";
        if (column % 10 == 0) {
            mark = "0";
        } else if (column % 5 == 0) {
            mark = "5";
        }
        std::cout << mark;
    }
    std::cout << std::endl;
}

// Drains `iterator` by calling next() repeatedly and prints each returned
// value to stdout, preceded by a single space. A return value of -1 is
// treated as the end marker and is not printed. A newline terminates the list.
//
// `Iterator` only needs a `next()` member whose result converts to int32_t.
template <typename Iterator>
void iterate_breakpoints(Iterator& iterator) {
    for (int32_t offset = iterator.next(); offset != -1;
         offset = iterator.next()) {
        std::cout << " " << offset;
    }
    std::cout << std::endl;
}

// Like iterate_breakpoints, but annotates every breakpoint with information
// queried from the iterator right after each next() call:
//   " <offset> (<type>[, word-like])"
// where <type> comes from word_type() and the ", word-like" suffix is added
// when is_word_like() returns true. Iteration stops when next() returns -1;
// a newline terminates the list.
template <typename Iterator>
void iterate_word_breakpoints(Iterator& iterator) {
    for (int32_t offset = iterator.next(); offset != -1;
         offset = iterator.next()) {
        cout << " " << offset;

        // Map the word type of the segment just produced to its label.
        const char* type_label = nullptr;
        switch (iterator.word_type()) {
            case ICU4XSegmenterWordType::None:
                type_label = " (none";
                break;
            case ICU4XSegmenterWordType::Number:
                type_label = " (number";
                break;
            case ICU4XSegmenterWordType::Letter:
                type_label = " (letter";
                break;
            default:
                // Any enumerator this example does not know about.
                type_label = " (unknown status";
                break;
        }
        cout << type_label;

        if (iterator.is_word_like()) {
            cout << ", word-like";
        }
        cout << ")";
    }
    cout << endl;
}

void test_line(const std::string_view& str) {
    const auto provider = ICU4XDataProvider::create_compiled();
    const auto segmenter_auto =
        ICU4XLineSegmenter::create_auto(provider).ok().value();
    const auto segmenter_lstm =
        ICU4XLineSegmenter::create_lstm(provider).ok().value();
    const auto segmenter_dictionary =
        ICU4XLineSegmenter::create_dictionary(provider).ok().value();

    const ICU4XLineSegmenter* segmenters[] = {&segmenter_auto, &segmenter_lstm,
                                              &segmenter_dictionary};
    for (const auto* segmenter : segmenters) {
        cout << "Finding line breakpoints in string:" << endl << str << endl;
        print_ruler(str.size());

        cout << "Line breakpoints:";
        auto iterator = segmenter->segment_utf8(str);
        iterate_breakpoints(iterator);
    }
}

// Echoes `str`, prints a ruler sized to str.size(), and then lists the
// grapheme cluster breakpoints reported by an ICU4XGraphemeClusterSegmenter
// built from the compiled data provider.
//
// The segmenter is unwrapped with .ok().value(); this example does not
// handle construction failure separately.
void test_grapheme(const std::string_view& str) {
    const auto provider = ICU4XDataProvider::create_compiled();
    const auto grapheme_segmenter =
        ICU4XGraphemeClusterSegmenter::create(provider).ok().value();

    cout << "Finding grapheme cluster breakpoints in string:" << endl;
    cout << str << endl;
    print_ruler(str.size());

    cout << "Grapheme cluster breakpoints:";
    auto breaks = grapheme_segmenter.segment_utf8(str);
    iterate_breakpoints(breaks);
}

void test_word(const std::string_view& str) {
    const auto provider = ICU4XDataProvider::create_compiled();
    const auto segmenter_auto =
        ICU4XWordSegmenter::create_auto(provider).ok().value();
    const auto segmenter_lstm =
        ICU4XWordSegmenter::create_lstm(provider).ok().value();
    const auto segmenter_dictionary =
        ICU4XWordSegmenter::create_dictionary(provider).ok().value();

    const ICU4XWordSegmenter* segmenters[] = {&segmenter_auto, &segmenter_lstm,
                                              &segmenter_dictionary};
    for (const auto* segmenter : segmenters) {
        cout << "Finding word breakpoints in string:" << endl << str << endl;
        print_ruler(str.size());

        cout << "Word breakpoints:";
        auto iterator = segmenter->segment_utf8(str);
        iterate_word_breakpoints(iterator);
    }
}

// Echoes `str`, prints a ruler sized to str.size(), and then lists the
// sentence breakpoints reported by an ICU4XSentenceSegmenter built from the
// compiled data provider.
//
// The segmenter is unwrapped with .ok().value(); this example does not
// handle construction failure separately.
void test_sentence(const std::string_view& str) {
    const auto provider = ICU4XDataProvider::create_compiled();
    const auto sentence_segmenter =
        ICU4XSentenceSegmenter::create(provider).ok().value();

    cout << "Finding sentence breakpoints in string:" << endl;
    cout << str << endl;
    print_ruler(str.size());

    cout << "Sentence breakpoints:";
    auto breaks = sentence_segmenter.segment_utf8(str);
    iterate_breakpoints(breaks);
}

// Entry point: initializes the ICU4X simple logger, then runs the line,
// grapheme, word and sentence demos (in that order) on the first
// command-line argument, or on a built-in sample sentence when no argument
// is given. A blank line is printed after each demo.
int main(int argc, char* argv[]) {
    ICU4XLogger::init_simple_logger();

    const std::string_view str =
        (argc >= 2)
            ? std::string_view(argv[1])
            : std::string_view(
                  "The 101 quick brown foxes jump over the lazy dog.");

    // The demos share one signature, so they can be driven from a table.
    using Demo = void (*)(const std::string_view&);
    const Demo demos[] = {test_line, test_grapheme, test_word, test_sentence};
    for (const Demo demo : demos) {
        demo(str);
        cout << endl;
    }
    return 0;
}