summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/i18n/rbt_pars.h
blob: 61ce9727e05f9d2cad4faaa9515734830eca0dcb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 1999-2011, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/
#ifndef RBT_PARS_H
#define RBT_PARS_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION
#ifdef __cplusplus

#include "unicode/uobject.h"
#include "unicode/parseerr.h"
#include "unicode/unorm.h"
#include "rbt.h"
#include "hash.h"
#include "uvector.h"

U_NAMESPACE_BEGIN

class TransliterationRuleData;
class UnicodeFunctor;
class ParseData;
class RuleHalf;
class ParsePosition;
class StringMatcher;

class TransliteratorParser : public UMemory {

 public:

    /**
     * A Vector of TransliterationRuleData objects, one for each discrete group
     * of rules in the rule set
     */
    UVector dataVector;

    /**
     * PUBLIC data member.
     * A Vector of UnicodeStrings containing all of the ID blocks in the rule set
     */
    UVector idBlockVector;

    /**
     * PUBLIC data member containing the parsed compound filter, if any.
     */
    UnicodeSet* compoundFilter;

 private:

    /**
     * The current data object for which we are parsing rules
     */
    TransliterationRuleData* curData;

    UTransDirection direction;

    /**
     * Parse error information.
     */
    UParseError parseError;

    /**
     * Temporary symbol table used during parsing.
     */
    ParseData* parseData;

    /**
     * Temporary vector of matcher variables.  When parsing is complete, this
     * is copied into the array data.variables.  As with data.variables,
     * element 0 corresponds to character data.variablesBase.
     */
    UVector variablesVector;

    /**
     * Temporary table of variable names.  When parsing is complete, this is
     * copied into data.variableNames.
     */
    Hashtable variableNames;    
    
    /**
     * String of standins for segments.  Used during the parsing of a single
     * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
     * to StringMatcher object segmentObjects.elementAt(0), etc.
     */
    UnicodeString segmentStandins;

    /**
     * Vector of StringMatcher objects for segments.  Used during the
     * parsing of a single rule.  
     * segmentStandins.charAt(0) is the standin for "$1" and corresponds
     * to StringMatcher object segmentObjects.elementAt(0), etc.
     */
    UVector segmentObjects;

    /**
     * The next available stand-in for variables.  This starts at some point in
     * the private use area (discovered dynamically) and increments up toward
     * <code>variableLimit</code>.  At any point during parsing, available
     * variables are <code>variableNext..variableLimit-1</code>.
     */
    UChar variableNext;

    /**
     * The last available stand-in for variables.  This is discovered
     * dynamically.  At any point during parsing, available variables are
     * <code>variableNext..variableLimit-1</code>.
     */
    UChar variableLimit;

    /**
     * When we encounter an undefined variable, we do not immediately signal
     * an error, in case we are defining this variable, e.g., "$a = [a-z];".
     * Instead, we save the name of the undefined variable, and substitute
     * in the placeholder char variableLimit - 1, and decrement
     * variableLimit.
     */
    UnicodeString undefinedVariableName;

    /**
     * The stand-in character for the 'dot' set, represented by '.' in
     * patterns.  This is allocated the first time it is needed, and
     * reused thereafter.
     */
    UChar dotStandIn;

public:

    /**
     * Constructor.
     */
    TransliteratorParser(UErrorCode &statusReturn);

    /**
     * Destructor.
     */
    ~TransliteratorParser();

    /**
     * Parse the given string as a sequence of rules, separated by newline
     * characters ('\n'), and cause this object to implement those rules.  Any
     * previous rules are discarded.  Typically this method is called exactly
     * once after construction.
     *
     * Parse the given rules, in the given direction.  After this call
     * returns, query the public data members for results.  The caller
     * owns the 'data' and 'compoundFilter' data members after this
     * call returns.
     * @param rules      rules, separated by ';'
     * @param direction  either FORWARD or REVERSE.
     * @param pe         Struct to recieve information on position 
     *                   of error if an error is encountered
     * @param ec         Output param set to success/failure code.
     */
    void parse(const UnicodeString& rules,
               UTransDirection direction,
               UParseError& pe,
               UErrorCode& ec);

    /**
     * Return the compound filter parsed by parse().  Caller owns result.
     * @return the compound filter parsed by parse().
     */ 
    UnicodeSet* orphanCompoundFilter();

private:

    /**
     * Return a representation of this transliterator as source rules.
     * @param rules      Output param to receive the rules.
     * @param direction  either FORWARD or REVERSE.
     */
    void parseRules(const UnicodeString& rules,
                    UTransDirection direction,
                    UErrorCode& status);

    /**
     * MAIN PARSER.  Parse the next rule in the given rule string, starting
     * at pos.  Return the index after the last character parsed.  Do not
     * parse characters at or after limit.
     *
     * Important:  The character at pos must be a non-whitespace character
     * that is not the comment character.
     *
     * This method handles quoting, escaping, and whitespace removal.  It
     * parses the end-of-rule character.  It recognizes context and cursor
     * indicators.  Once it does a lexical breakdown of the rule at pos, it
     * creates a rule object and adds it to our rule list.
     * @param rules      Output param to receive the rules.
     * @param pos        the starting position.
     * @param limit      pointer past the last character of the rule.
     * @return           the index after the last character parsed.
     */
    int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

    /**
     * Set the variable range to [start, end] (inclusive).
     * @param start    the start value of the range.
     * @param end      the end value of the range.
     */
    void setVariableRange(int32_t start, int32_t end, UErrorCode& status);

    /**
     * Assert that the given character is NOT within the variable range.
     * If it is, return FALSE.  This is neccesary to ensure that the
     * variable range does not overlap characters used in a rule.
     * @param ch     the given character.
     * @return       True, if the given character is NOT within the variable range.
     */
    UBool checkVariableRange(UChar32 ch) const;

    /**
     * Set the maximum backup to 'backup', in response to a pragma
     * statement.
     * @param backup    the new value to be set.
     */
    void pragmaMaximumBackup(int32_t backup);

    /**
     * Begin normalizing all rules using the given mode, in response
     * to a pragma statement.
     * @param mode    the given mode.
     */
    void pragmaNormalizeRules(UNormalizationMode mode);

    /**
     * Return true if the given rule looks like a pragma.
     * @param pos offset to the first non-whitespace character
     * of the rule.
     * @param limit pointer past the last character of the rule.
     * @return true if the given rule looks like a pragma.
     */
    static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);

    /**
     * Parse a pragma.  This method assumes resemblesPragma() has
     * already returned true.
     * @param pos offset to the first non-whitespace character
     * of the rule.
     * @param limit pointer past the last character of the rule.
     * @return the position index after the final ';' of the pragma,
     * or -1 on failure.
     */
    int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

    /**
     * Called by main parser upon syntax error.  Search the rule string
     * for the probable end of the rule.  Of course, if the error is that
     * the end of rule marker is missing, then the rule end will not be found.
     * In any case the rule start will be correctly reported.
     * @param parseErrorCode error code.
     * @param msg error description.
     * @param start position of first character of current rule.
     * @return start position of first character of current rule.
     */
    int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
                        UErrorCode& status);

    /**
     * Parse a UnicodeSet out, store it, and return the stand-in character
     * used to represent it.
     *
     * @param rule    the rule for UnicodeSet.
     * @param pos     the position in pattern at which to start parsing.
     * @return        the stand-in character used to represent it.
     */
    UChar parseSet(const UnicodeString& rule,
                   ParsePosition& pos,
                   UErrorCode& status);

    /**
     * Generate and return a stand-in for a new UnicodeFunctor.  Store
     * the matcher (adopt it).
     * @param adopted the UnicodeFunctor to be adopted.
     * @return        a stand-in for a new UnicodeFunctor.
     */
    UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);

    /**
     * Return the standin for segment seg (1-based).
     * @param seg    the given segment.
     * @return       the standIn character for the given segment.
     */
    UChar getSegmentStandin(int32_t seg, UErrorCode& status);

    /**
     * Set the object for segment seg (1-based).
     * @param seg      the given segment.
     * @param adopted  the StringMatcher to be adopted.
     */
    void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);

    /**
     * Return the stand-in for the dot set.  It is allocated the first
     * time and reused thereafter.
     * @return    the stand-in for the dot set.
     */
    UChar getDotStandIn(UErrorCode& status);

    /**
     * Append the value of the given variable name to the given
     * UnicodeString.
     * @param name    the variable name to be appended.
     * @param buf     the given UnicodeString to append to.
     */
    void appendVariableDef(const UnicodeString& name,
                           UnicodeString& buf,
                           UErrorCode& status);

    /**
     * Glue method to get around access restrictions in C++.
     */
    /*static Transliterator* createBasicInstance(const UnicodeString& id,
                                               const UnicodeString* canonID);*/

    friend class RuleHalf;

    // Disallowed methods; no impl.
    /**
     * Copy constructor
     */
    TransliteratorParser(const TransliteratorParser&);
    
    /**
     * Assignment operator
     */
    TransliteratorParser& operator=(const TransliteratorParser&);
};

U_NAMESPACE_END

#endif /* #ifdef __cplusplus */

/**
 * Strip/convert the following from the transliterator rules:
 * comments
 * newlines
 * white space at the beginning and end of a line
 * unescape \u notation
 *
 * The target must be equal in size as the source.
 * @internal
 */
U_CAPI int32_t
utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif