summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/i18n/number_affixutils.h
blob: 5cfde61ffd0ca2c1591fb9e1252439705d95fc8b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

#include "unicode/utypes.h"

#if !UCONFIG_NO_FORMATTING
#ifndef __NUMBER_AFFIXUTILS_H__
#define __NUMBER_AFFIXUTILS_H__

#include <cstdint>
#include "number_types.h"
#include "unicode/stringpiece.h"
#include "unicode/unistr.h"
#include "formatted_string_builder.h"
#include "unicode/uniset.h"

U_NAMESPACE_BEGIN namespace number {
namespace impl {

enum AffixPatternState {
    STATE_BASE = 0,
    STATE_FIRST_QUOTE = 1,
    STATE_INSIDE_QUOTE = 2,
    STATE_AFTER_QUOTE = 3,
    STATE_FIRST_CURR = 4,
    STATE_SECOND_CURR = 5,
    STATE_THIRD_CURR = 6,
    STATE_FOURTH_CURR = 7,
    STATE_FIFTH_CURR = 8,
    STATE_OVERFLOW_CURR = 9
};

// enum AffixPatternType defined in internals.h

struct AffixTag {
    int32_t offset;
    UChar32 codePoint;
    AffixPatternState state;
    AffixPatternType type;

    AffixTag()
            : offset(0), state(STATE_BASE) {}

    AffixTag(int32_t offset)
            : offset(offset) {}

    AffixTag(int32_t offset, UChar32 codePoint, AffixPatternState state, AffixPatternType type)
            : offset(offset), codePoint(codePoint), state(state), type(type) {}
};

class TokenConsumer {
  public:
    virtual ~TokenConsumer();

    virtual void consumeToken(AffixPatternType type, UChar32 cp, UErrorCode& status) = 0;
};

// Exported as U_I18N_API because it is a base class for other exported types
class U_I18N_API SymbolProvider {
  public:
    virtual ~SymbolProvider();

    // TODO: Could this be more efficient if it returned by reference?
    virtual UnicodeString getSymbol(AffixPatternType type) const = 0;
};

/**
 * Performs manipulations on affix patterns: the prefix and suffix strings associated with a decimal
 * format pattern. For example:
 *
 * <table>
 * <tr><th>Affix Pattern</th><th>Example Unescaped (Formatted) String</th></tr>
 * <tr><td>abc</td><td>abc</td></tr>
 * <tr><td>ab-</td><td>ab−</td></tr>
 * <tr><td>ab'-'</td><td>ab-</td></tr>
 * <tr><td>ab''</td><td>ab'</td></tr>
 * </table>
 *
 * To manually iterate over tokens in a literal string, use the following pattern, which is designed
 * to be efficient.
 *
 * <pre>
 * long tag = 0L;
 * while (AffixPatternUtils.hasNext(tag, patternString)) {
 *   tag = AffixPatternUtils.nextToken(tag, patternString);
 *   int typeOrCp = AffixPatternUtils.getTypeOrCp(tag);
 *   switch (typeOrCp) {
 *     case AffixPatternUtils.TYPE_MINUS_SIGN:
 *       // Current token is a minus sign.
 *       break;
 *     case AffixPatternUtils.TYPE_PLUS_SIGN:
 *       // Current token is a plus sign.
 *       break;
 *     case AffixPatternUtils.TYPE_PERCENT:
 *       // Current token is a percent sign.
 *       break;
 *     // ... other types ...
 *     default:
 *       // Current token is an arbitrary code point.
 *       // The variable typeOrCp is the code point.
 *       break;
 *   }
 * }
 * </pre>
 */
class U_I18N_API AffixUtils {

  public:

    /**
     * Estimates the number of code points present in an unescaped version of the affix pattern string
     * (one that would be returned by {@link #unescape}), assuming that all interpolated symbols
     * consume one code point and that currencies consume as many code points as their symbol width.
     * Used for computing padding width.
     *
     * @param patternString The original string whose width will be estimated.
     * @return The length of the unescaped string.
     */
    static int32_t estimateLength(const UnicodeString& patternString, UErrorCode& status);

    /**
     * Takes a string and escapes (quotes) characters that have special meaning in the affix pattern
     * syntax. This function does not reverse-lookup symbols.
     *
     * <p>Example input: "-$x"; example output: "'-'$x"
     *
     * @param input The string to be escaped.
     * @return The resulting UnicodeString.
     */
    static UnicodeString escape(const UnicodeString& input);

    static Field getFieldForType(AffixPatternType type);

    /**
     * Executes the unescape state machine. Replaces the unquoted characters "-", "+", "%", "‰", and
     * "¤" with the corresponding symbols provided by the {@link SymbolProvider}, and inserts the
     * result into the FormattedStringBuilder at the requested location.
     *
     * <p>Example input: "'-'¤x"; example output: "-$x"
     *
     * @param affixPattern The original string to be unescaped.
     * @param output The FormattedStringBuilder to mutate with the result.
     * @param position The index into the FormattedStringBuilder to insert the string.
     * @param provider An object to generate locale symbols.
     */
    static int32_t unescape(const UnicodeString& affixPattern, FormattedStringBuilder& output,
                            int32_t position, const SymbolProvider& provider, Field field,
                            UErrorCode& status);

    /**
   * Sames as {@link #unescape}, but only calculates the code point count.  More efficient than {@link #unescape}
   * if you only need the length but not the string itself.
     *
     * @param affixPattern The original string to be unescaped.
     * @param provider An object to generate locale symbols.
     * @return The same return value as if you called {@link #unescape}.
     */
    static int32_t unescapedCodePointCount(const UnicodeString& affixPattern,
                                           const SymbolProvider& provider, UErrorCode& status);

    /**
     * Checks whether the given affix pattern contains at least one token of the given type, which is
     * one of the constants "TYPE_" in {@link AffixPatternUtils}.
     *
     * @param affixPattern The affix pattern to check.
     * @param type The token type.
     * @return true if the affix pattern contains the given token type; false otherwise.
     */
    static bool containsType(const UnicodeString& affixPattern, AffixPatternType type, UErrorCode& status);

    /**
     * Checks whether the specified affix pattern has any unquoted currency symbols ("¤").
     *
     * @param affixPattern The string to check for currency symbols.
     * @return true if the literal has at least one unquoted currency symbol; false otherwise.
     */
    static bool hasCurrencySymbols(const UnicodeString& affixPattern, UErrorCode& status);

    /**
     * Replaces all occurrences of tokens with the given type with the given replacement char.
     *
     * @param affixPattern The source affix pattern (does not get modified).
     * @param type The token type.
     * @param replacementChar The char to substitute in place of chars of the given token type.
     * @return A string containing the new affix pattern.
     */
    static UnicodeString replaceType(const UnicodeString& affixPattern, AffixPatternType type,
                                     char16_t replacementChar, UErrorCode& status);

    /**
     * Returns whether the given affix pattern contains only symbols and ignorables as defined by the
     * given ignorables set.
     */
    static bool containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
                                                 const UnicodeSet& ignorables, UErrorCode& status);

    /**
     * Iterates over the affix pattern, calling the TokenConsumer for each token.
     */
    static void iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
                                    UErrorCode& status);

    /**
     * Returns the next token from the affix pattern.
     *
     * @param tag A bitmask used for keeping track of state from token to token. The initial value
     *     should be 0L.
     * @param patternString The affix pattern.
     * @return The bitmask tag to pass to the next call of this method to retrieve the following token
     *     (never negative), or -1 if there were no more tokens in the affix pattern.
     * @see #hasNext
     */
    static AffixTag nextToken(AffixTag tag, const UnicodeString& patternString, UErrorCode& status);

    /**
     * Returns whether the affix pattern string has any more tokens to be retrieved from a call to
     * {@link #nextToken}.
     *
     * @param tag The bitmask tag of the previous token, as returned by {@link #nextToken}.
     * @param string The affix pattern.
     * @return true if there are more tokens to consume; false otherwise.
     */
    static bool hasNext(const AffixTag& tag, const UnicodeString& string);

  private:
    /**
     * Encodes the given values into a tag struct.
     * The order of the arguments is consistent with Java, but the order of the stored
     * fields is not necessarily the same.
     */
    static inline AffixTag makeTag(int32_t offset, AffixPatternType type, AffixPatternState state,
                                   UChar32 cp) {
        return {offset, cp, state, type};
    }
};

} // namespace impl
} // namespace number
U_NAMESPACE_END


#endif //__NUMBER_AFFIXUTILS_H__

#endif /* #if !UCONFIG_NO_FORMATTING */