/* Retrieved from path: root/intl/icu/source/tools/toolutil/ucm.h (blob 04e6b2030def41ed6e8bd9b53903d6fdf8fc208b) */
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 *   Copyright (C) 2003-2013, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *******************************************************************************
 *   file name:  ucm.h
 *   encoding:   UTF-8
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 2003jun20
 *   created by: Markus W. Scherer
 *
 *   Definitions for the .ucm file parser and handler module ucm.c.
 */

#ifndef __UCM_H__
#define __UCM_H__

#include "unicode/utypes.h"
#include "ucnvmbcs.h"
#include "ucnv_ext.h"
#include "filestrm.h"
#include <stdio.h>

#if !UCONFIG_NO_CONVERSION

U_CDECL_BEGIN

/*
 * Values stored in UCMapping.moveFlag.
 * ucm_moveMappings() removes flagged mappings from the base table.
 */
enum {
    /* the mapping is moved from the base table to the extension table */
    UCM_MOVE_TO_EXT = 1,

    /* the mapping is removed from the base table without being moved */
    UCM_REMOVE_MAPPING = 2
};

/*
 * Per-mapping data structure: one Unicode <-> bytes mapping of a .ucm table.
 *
 * u        if uLen==1: the Unicode code point itself
 *          else: index of the first of uLen code points in the owning
 *          table's codePoints array (see UCM_GET_CODE_POINTS)
 * b        if bLen<=4: up to 4 bytes stored inline in b.bytes
 *          else: b.idx is the index of the first of bLen bytes in the owning
 *          table's bytes array (see UCM_GET_BYTES)
 * uLen     number of code points
 * bLen     number of bytes
 * f        flag for roundtrip (0), fallback (1), sub mapping (2),
 *          reverse fallback (3) or "good one-way" mapping (4).
 *          Same values as in the source file after |
 * moveFlag 0 or one of UCM_MOVE_TO_EXT / UCM_REMOVE_MAPPING
 */
typedef struct UCMapping {
    UChar32 u;

    union {
        uint32_t idx;       /* used when bLen>4 */
        uint8_t bytes[4];   /* used when bLen<=4 */
    } b;

    int8_t uLen;
    int8_t bLen;
    int8_t f;
    int8_t moveFlag;
} UCMapping;

/*
 * Values stored in UCMTable.flagsType.
 * They record whether the mappings of a .ucm file carry | fallback indicators.
 */
enum {
    /* no mappings parsed yet */
    UCM_FLAGS_INITIAL = 0,

    /* .ucm file has mappings with | fallback indicators */
    UCM_FLAGS_EXPLICIT = 1,

    /* .ucm file has mappings without | fallback indicators, later wins */
    UCM_FLAGS_IMPLICIT = 2,

    /* both implicit and explicit */
    UCM_FLAGS_MIXED = 3
};

/*
 * A table of mappings together with the storage that long mappings refer to.
 */
typedef struct UCMTable {
    /* array of mappings, with its allocated capacity and used length */
    UCMapping *mappings;
    int32_t mappingsCapacity;
    int32_t mappingsLength;

    /* code point storage that UCMapping.u indexes when uLen!=1 */
    UChar32 *codePoints;
    int32_t codePointsCapacity;
    int32_t codePointsLength;

    /* byte storage that UCMapping.b.idx indexes when bLen>4 */
    uint8_t *bytes;
    int32_t bytesCapacity;
    int32_t bytesLength;

    /* index map for mapping by bytes first */
    int32_t *reverseMap;

    uint8_t unicodeMask;
    int8_t flagsType;   /* UCM_FLAGS_INITIAL etc. */
    UBool isSorted;
} UCMTable;

/*
 * State flag constants.
 * NOTE(review): presumably the values kept in UCMStates.stateFlags -
 * confirm against ucmstate.c.
 */
enum {
    MBCS_STATE_FLAG_DIRECT = 1,
    MBCS_STATE_FLAG_SURROGATES = 2,

    MBCS_STATE_FLAG_READY = 16
};

/*
 * State table data of a converter: up to MBCS_MAX_STATE_COUNT states
 * with 256 entries each, plus per-state and summary information.
 */
typedef struct UCMStates {
    /* one row of 256 entries per state */
    int32_t stateTable[MBCS_MAX_STATE_COUNT][256];

    /* per-state values */
    uint32_t stateFlags[MBCS_MAX_STATE_COUNT];
    uint32_t stateOffsetSum[MBCS_MAX_STATE_COUNT];

    int32_t countStates;
    int32_t minCharLength;
    int32_t maxCharLength;
    int32_t countToUCodeUnits;

    int8_t conversionType;
    int8_t outputType;
} UCMStates;

/*
 * Contents of one .ucm file: a base table, an extension table,
 * and the state table data.
 */
typedef struct UCMFile {
    UCMTable *base;
    UCMTable *ext;

    UCMStates states;

    /* NOTE(review): presumably the name of the base converter - confirm in ucm.c */
    char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH];
} UCMFile;

/* simple accesses ---------------------------------------------------------- */

/*
 * Get a pointer to the code points of mapping m, which belongs to table t.
 * A mapping with exactly one code point stores it directly in m->u;
 * for any other length m->u is an index into t->codePoints.
 * t and m are pointers; m is evaluated more than once.
 */
#define UCM_GET_CODE_POINTS(t, m) \
    (((m)->uLen!=1) ? (t)->codePoints+(m)->u : &(m)->u)

/*
 * Get a pointer to the bytes of mapping m, which belongs to table t.
 * Up to 4 bytes are stored inline in m->b.bytes;
 * for longer sequences m->b.idx is an index into t->bytes.
 * t and m are pointers; m is evaluated more than once.
 */
#define UCM_GET_BYTES(t, m) \
    (((m)->bLen>4) ? (t)->bytes+(m)->b.idx : (m)->b.bytes)

/* APIs --------------------------------------------------------------------- */

/*
 * Return a pointer to a UCMFile.
 * NOTE(review): presumably a newly allocated object that must be released
 * with ucm_close(); failure behavior is not visible here - confirm in ucm.c.
 */
U_CAPI UCMFile * U_EXPORT2
ucm_open(void);

/*
 * Counterpart of ucm_open().
 * NOTE(review): presumably releases ucm and the tables it owns;
 * whether ucm==NULL is accepted is not visible here - confirm in ucm.c.
 */
U_CAPI void U_EXPORT2
ucm_close(UCMFile *ucm);

/*
 * Process one header line of a .ucm file for ucm.
 * NOTE(review): line is not const, so it may be modified in place;
 * *pKey and *pValue presumably receive the key and value found in the line.
 * The meaning of the UBool result is not visible here - confirm in ucm.c.
 */
U_CAPI UBool U_EXPORT2
ucm_parseHeaderLine(UCMFile *ucm,
                    char *line, char **pKey, char **pValue);

/*
 * Determine which table a mapping belongs in, given the base table's states.
 *
 * @return -1 illegal bytes
 *          0 suitable for base table
 *          1 needs to go into extension table
 */
U_CAPI int32_t U_EXPORT2
ucm_mappingType(UCMStates *baseStates,
                UCMapping *m,
                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
                uint8_t bytes[UCNV_EXT_MAX_BYTES]);

/*
 * Add a mapping to the base or extension table as appropriate.
 * NOTE(review): the table is presumably chosen from forBase and the
 * classification by ucm_mappingType(); the meaning of the UBool result
 * is not visible here - confirm in ucm.c.
 */
U_CAPI UBool U_EXPORT2
ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
                   UCMapping *m,
                   UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
                   uint8_t bytes[UCNV_EXT_MAX_BYTES]);

/*
 * Add the mapping described by one text line to ucm.
 * NOTE(review): presumably combines ucm_parseMappingLine() and
 * ucm_addMappingAuto(); the meaning of the UBool result is not visible
 * here - confirm in ucm.c.
 */
U_CAPI UBool U_EXPORT2
ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates);


/*
 * Return a pointer to a UCMTable.
 * NOTE(review): presumably a newly allocated, empty table that must be
 * released with ucm_closeTable() - confirm in ucm.c.
 */
U_CAPI UCMTable * U_EXPORT2
ucm_openTable(void);

/*
 * Counterpart of ucm_openTable().
 * NOTE(review): presumably releases the table and its arrays - confirm in ucm.c.
 */
U_CAPI void U_EXPORT2
ucm_closeTable(UCMTable *table);

/*
 * Reset a table.
 * NOTE(review): presumably empties the table for reuse; whether allocated
 * storage is kept is not visible here - confirm in ucm.c.
 */
U_CAPI void U_EXPORT2
ucm_resetTable(UCMTable *table);

/*
 * Sort the table t.
 * NOTE(review): presumably maintains UCMTable.isSorted and
 * UCMTable.reverseMap; the sort order is not visible here - confirm in ucm.c.
 */
U_CAPI void U_EXPORT2
ucm_sortTable(UCMTable *t);

/*
 * Remove mappings with their move flag set from the base table
 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table.
 *
 * The move flag is UCMapping.moveFlag; its values are
 * UCM_MOVE_TO_EXT and UCM_REMOVE_MAPPING.
 */
U_CAPI void U_EXPORT2
ucm_moveMappings(UCMTable *base, UCMTable *ext);

/**
 * Read a table from a .ucm file, from after the CHARMAP line up to and
 * including the END CHARMAP line.
 *
 * NOTE(review): pErrorCode presumably follows the usual ICU UErrorCode
 * in/out convention - confirm in ucm.c.
 */
U_CAPI void U_EXPORT2
ucm_readTable(UCMFile *ucm, FileStream* convFile,
              UBool forBase, UCMStates *baseStates,
              UErrorCode *pErrorCode);

/**
 * Check the validity of mappings against a base table's states;
 * necessary for extension-only tables that were read before their base tables.
 *
 * NOTE(review): the UBool result presumably reports whether all mappings
 * are valid - confirm in ucm.c.
 */
U_CAPI UBool U_EXPORT2
ucm_checkValidity(UCMTable *ext, UCMStates *baseStates);

/**
 * Check a base table against an extension table.
 * Pass moveTarget!=NULL if it is possible to move mappings from the base.
 * This is the case where base and extension tables are parsed from a single file
 * (moveTarget==ext)
 * or when delta file mappings are subtracted from a base table.
 *
 * When a base table cannot be modified because a delta file is parsed in makeconv,
 * then set moveTarget=NULL.
 *
 * if(intersectBase) then mappings that exist in the base table but not in
 * the extension table are moved to moveTarget instead of showing an error.
 *
 * Special mode:
 * If intersectBase==2 for a DBCS extension table, then SBCS mappings are
 * not moved out of the base unless their Unicode input requires it.
 * This helps ucmkbase generate base tables for DBCS-only extension .cnv files.
 * NOTE(review): intersectBase is declared as UBool although the value 2 is
 * documented here; check that the UBool type can carry this value.
 *
 * For both tables in the same file, the extension table is automatically
 * built.
 * For separate files, the extension file can use a complete mapping table (.ucm file),
 * so that common mappings need not be stripped out manually.
 *
 *
 * Sort both tables, and then for each mapping direction:
 *
 * If intersectBase is TRUE and the base table contains a mapping
 * that does not exist in the extension table, then this mapping is moved
 * to moveTarget.
 *
 * - otherwise -
 *
 * If the base table contains a mapping for which the input sequence is
 * the same as the extension input, then
 * - if the output is the same: remove the extension mapping
 * - else: error
 *
 * If the base table contains a mapping for which the input sequence is
 * a prefix of the extension input, then
 * - if moveTarget!=NULL: move the base mapping to the moveTarget table
 * - else: error
 *
 * @return FALSE in case of an irreparable error
 */
U_CAPI UBool U_EXPORT2
ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
                 UCMTable *moveTarget, UBool intersectBase);

/*
 * Print a table to the stdio stream f.
 * NOTE(review): byUnicode presumably selects output in Unicode order rather
 * than byte order; the output format is not visible here - confirm in ucm.c.
 */
U_CAPI void U_EXPORT2
ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode);

/*
 * Print a single mapping m to the stdio stream f.
 * table is the table that m belongs to.
 * NOTE(review): table is presumably needed to resolve the code points and
 * bytes of long mappings (see UCM_GET_CODE_POINTS and UCM_GET_BYTES);
 * the output format is not visible here - confirm in ucm.c.
 */
U_CAPI void U_EXPORT2
ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f);


/*
 * Add a state, given as the string s, to states.
 * NOTE(review): s is presumably the text of a state line from a .ucm
 * header; error reporting is not visible in this void signature -
 * confirm in ucmstate.c.
 */
U_CAPI void U_EXPORT2
ucm_addState(UCMStates *states, const char *s);

/*
 * Process the states collected in states.
 * NOTE(review): presumably called after all states have been added;
 * ignoreSISOCheck presumably disables a check related to SI/SO
 * converters - confirm in ucmstate.c.
 */
U_CAPI void U_EXPORT2
ucm_processStates(UCMStates *states, UBool ignoreSISOCheck);

/*
 * Count characters in a byte sequence of the given length using states.
 * NOTE(review): presumably returns the number of characters that the
 * state table finds in the bytes; the result for illegal input is not
 * visible here - confirm in ucmstate.c.
 */
U_CAPI int32_t U_EXPORT2
ucm_countChars(UCMStates *states,
               const uint8_t *bytes, int32_t length);


/*
 * Parse bytes from text into bytes[], which holds up to UCNV_EXT_MAX_BYTES.
 * NOTE(review): *ps is presumably the parse position within line, both read
 * and advanced; the int8_t result is presumably the number of bytes parsed,
 * with an error value not visible here - confirm in ucm.c.
 */
U_CAPI int8_t U_EXPORT2
ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps);

/*
 * Parse one mapping line into m, codePoints[] and bytes[].
 * NOTE(review): the three outputs are presumably filled in the form that
 * ucm_addMapping() accepts; the meaning of the UBool result is not visible
 * here - confirm in ucm.c.
 */
U_CAPI UBool U_EXPORT2
ucm_parseMappingLine(UCMapping *m,
                     UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
                     uint8_t bytes[UCNV_EXT_MAX_BYTES],
                     const char *line);

/*
 * Add the mapping m, with its code points and bytes, to table.
 * NOTE(review): m is not const and may be modified; long code point and
 * byte sequences are presumably copied into the table's codePoints and
 * bytes arrays - confirm in ucm.c.
 */
U_CAPI void U_EXPORT2
ucm_addMapping(UCMTable *table,
               UCMapping *m,
               UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
               uint8_t bytes[UCNV_EXT_MAX_BYTES]);

/* very makeconv-specific functions ----------------------------------------- */

/*
 * Finalize and optimize states after the toUnicode mappings are processed.
 * NOTE(review): *pUnicodeCodeUnits and the toUFallbacks array (with
 * countToUFallbacks entries) are presumably adjusted to match the
 * optimized states; verbose presumably enables diagnostic output -
 * confirm in ucmstate.c.
 */
U_CAPI void U_EXPORT2
ucm_optimizeStates(UCMStates *states,
                   uint16_t **pUnicodeCodeUnits,
                   _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
                   UBool verbose);

/*
 * Look up offset among the countToUFallbacks entries of toUFallbacks.
 * Moved here because it is used inside ucmstate.c.
 * NOTE(review): presumably returns the index of the matching entry and a
 * negative value if there is none - confirm in the implementation.
 */
U_CAPI int32_t U_EXPORT2
ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
                 uint32_t offset);

/* very rptp2ucm-specific functions ----------------------------------------- */

/*
 * Merge separate from-Unicode and to-Unicode tables into one table.
 *
 * Input: Separate tables with mappings from/to Unicode,
 * subchar (subcharLength bytes) and subchar1 (0 if none).
 * All mappings must have flag 0.
 *
 * Output: fromUTable will contain the union of mappings with the correct
 * precision flags, and be sorted.
 */
U_CAPI void U_EXPORT2
ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
                const uint8_t *subchar, int32_t subcharLength,
                uint8_t subchar1);

/*
 * Separate the mappings of ucm.
 * NOTE(review): presumably distributes mappings between ucm->base and
 * ucm->ext; isSISO presumably indicates an SI/SO converter. The meaning of
 * the UBool result is not visible here - confirm in ucm.c.
 */
U_CAPI UBool U_EXPORT2
ucm_separateMappings(UCMFile *ucm, UBool isSISO);

U_CDECL_END

#endif

#endif