summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/i18n/anytrans.cpp
blob: 167b0185285371d93ebd8d23b875534720a37a83 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*****************************************************************
* Copyright (c) 2002-2014, International Business Machines Corporation
* and others.  All Rights Reserved.
*****************************************************************
* Date        Name        Description
* 06/06/2002  aliu        Creation.
*****************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uobject.h"
#include "unicode/uscript.h"

#include "anytrans.h"
#include "hash.h"
#include "mutex.h"
#include "nultrans.h"
#include "putilimp.h"
#include "tridpars.h"
#include "uinvchar.h"
#include "uvector.h"

//------------------------------------------------------------
// Constants

static const UChar TARGET_SEP = 45; // '-'
static const UChar VARIANT_SEP = 47; // '/'
static const UChar ANY[] = {0x41,0x6E,0x79,0}; // "Any"
static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
static const UChar LATIN_PIVOT[] = {0x2D,0x4C,0x61,0x74,0x6E,0x3B,0x4C,0x61,0x74,0x6E,0x2D,0}; // "-Latn;Latn-"

// initial size for an Any-XXXX transform's cache of script-XXXX transforms
// (will grow as necessary, but we don't expect to have source text with more than 7 scripts)
#define ANY_TRANS_CACHE_INIT_SIZE 7

//------------------------------------------------------------

U_CDECL_BEGIN
/**
 * Deleter function for Transliterator*.
 */
static void U_CALLCONV
_deleteTransliterator(void *obj) {
    delete (icu::Transliterator*) obj;
}
U_CDECL_END

//------------------------------------------------------------

U_NAMESPACE_BEGIN

//------------------------------------------------------------
// ScriptRunIterator

/**
 * Returns a series of ranges corresponding to scripts. They will be
 * of the form:
 *
 * ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
 * |            |          - first run (start, limit)
 *          |           |  - second run (start, limit)
 *
 * That is, the runs will overlap. The reason for this is so that a
 * transliterator can consider common characters both before and after
 * the scripts.
 */
class ScriptRunIterator : public UMemory {
private:
    const Replaceable& text;
    int32_t textStart;
    int32_t textLimit;

public:
    /**
     * The code of the current run, valid after next() returns.  May
     * be USCRIPT_INVALID_CODE if and only if the entire text is
     * COMMON/INHERITED.
     */
    UScriptCode scriptCode;

    /**
     * The start of the run, inclusive, valid after next() returns.
     */
    int32_t start;

    /**
     * The end of the run, exclusive, valid after next() returns.
     */
    int32_t limit;

    /**
     * Constructs a run iterator over the given text from start
     * (inclusive) to limit (exclusive).
     */
    ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);

    /**
     * Returns TRUE if there are any more runs.  TRUE is always
     * returned at least once.  Upon return, the caller should
     * examine scriptCode, start, and limit.
     */
    UBool next();

    /**
     * Adjusts internal indices for a change in the limit index of the
     * given delta.  A positive delta means the limit has increased.
     */
    void adjustLimit(int32_t delta);

private:
    ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
    ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
};

ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
                                     int32_t myStart, int32_t myLimit) :
    text(theText)
{
    textStart = myStart;
    textLimit = myLimit;
    limit = myStart;
}

UBool ScriptRunIterator::next() {
    UChar32 ch;
    UScriptCode s;
    UErrorCode ec = U_ZERO_ERROR;

    scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
    start = limit;

    // Are we done?
    if (start == textLimit) {
        return FALSE;
    }

    // Move start back to include adjacent COMMON or INHERITED
    // characters
    while (start > textStart) {
        ch = text.char32At(start - 1); // look back
        s = uscript_getScript(ch, &ec);
        if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
            --start;
        } else {
            break;
        }
    }

    // Move limit ahead to include COMMON, INHERITED, and characters
    // of the current script.
    while (limit < textLimit) {
        ch = text.char32At(limit); // look ahead
        s = uscript_getScript(ch, &ec);
        if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
            if (scriptCode == USCRIPT_INVALID_CODE) {
                scriptCode = s;
            } else if (s != scriptCode) {
                break;
            }
        }
        ++limit;
    }

    // Return TRUE even if the entire text is COMMON / INHERITED, in
    // which case scriptCode will be USCRIPT_INVALID_CODE.
    return TRUE;
}

void ScriptRunIterator::adjustLimit(int32_t delta) {
    limit += delta;
    textLimit += delta;
}

//------------------------------------------------------------
// AnyTransliterator

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)

AnyTransliterator::AnyTransliterator(const UnicodeString& id,
                                     const UnicodeString& theTarget,
                                     const UnicodeString& theVariant,
                                     UScriptCode theTargetScript,
                                     UErrorCode& ec) :
    Transliterator(id, NULL),
    targetScript(theTargetScript)
{
    cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec);
    if (U_FAILURE(ec)) {
        return;
    }
    uhash_setValueDeleter(cache, _deleteTransliterator);

    target = theTarget;
    if (theVariant.length() > 0) {
        target.append(VARIANT_SEP).append(theVariant);
    }
}

AnyTransliterator::~AnyTransliterator() {
    uhash_close(cache);
}

/**
 * Copy constructor.
 */
AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
    Transliterator(o),
    target(o.target),
    targetScript(o.targetScript)
{
    // Don't copy the cache contents
    UErrorCode ec = U_ZERO_ERROR;
    cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec);
    if (U_FAILURE(ec)) {
        return;
    }
    uhash_setValueDeleter(cache, _deleteTransliterator);
}

/**
 * Transliterator API.
 */
AnyTransliterator* AnyTransliterator::clone() const {
    return new AnyTransliterator(*this);
}

/**
 * Implements {@link Transliterator#handleTransliterate}.
 */
void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
                                            UBool isIncremental) const {
    int32_t allStart = pos.start;
    int32_t allLimit = pos.limit;

    ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);

    while (it.next()) {
        // Ignore runs in the ante context
        if (it.limit <= allStart) continue;

        // Try to instantiate transliterator from it.scriptCode to
        // our target or target/variant
        Transliterator* t = getTransliterator(it.scriptCode);

        if (t == NULL) {
            // We have no transliterator.  Do nothing, but keep
            // pos.start up to date.
            pos.start = it.limit;
            continue;
        }

        // If the run end is before the transliteration limit, do
        // a non-incremental transliteration.  Otherwise do an
        // incremental one.
        UBool incremental = isIncremental && (it.limit >= allLimit);

        pos.start = uprv_max(allStart, it.start);
        pos.limit = uprv_min(allLimit, it.limit);
        int32_t limit = pos.limit;
        t->filteredTransliterate(text, pos, incremental);
        int32_t delta = pos.limit - limit;
        allLimit += delta;
        it.adjustLimit(delta);

        // We're done if we enter the post context
        if (it.limit >= allLimit) break;
    }

    // Restore limit.  pos.start is fine where the last transliterator
    // left it, or at the end of the last run.
    pos.limit = allLimit;
}

Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {

    if (source == targetScript || source == USCRIPT_INVALID_CODE) {
        return NULL;
    }

    Transliterator* t = NULL;
    {
        Mutex m(NULL);
        t = (Transliterator*) uhash_iget(cache, (int32_t) source);
    }
    if (t == NULL) {
        UErrorCode ec = U_ZERO_ERROR;
        UnicodeString sourceName(uscript_getShortName(source), -1, US_INV);
        UnicodeString id(sourceName);
        id.append(TARGET_SEP).append(target);

        t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
        if (U_FAILURE(ec) || t == NULL) {
            delete t;

            // Try to pivot around Latin, our most common script
            id = sourceName;
            id.append(LATIN_PIVOT, -1).append(target);
            t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
            if (U_FAILURE(ec) || t == NULL) {
                delete t;
                t = NULL;
            }
        }

        if (t != NULL) {
            Transliterator *rt = NULL;
            {
                Mutex m(NULL);
                rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source));
                if (rt == NULL) {
                    // Common case, no race to cache this new transliterator.
                    uhash_iput(cache, (int32_t) source, t, &ec);
                } else {
                    // Race case, some other thread beat us to caching this transliterator.
                    Transliterator *temp = rt;
                    rt = t;    // Our newly created transliterator that lost the race & now needs deleting.
                    t  = temp; // The transliterator from the cache that we will return.
                }
            }
            delete rt;    // will be non-null only in case of races.
        }
    }
    return t;
}

/**
 * Return the script code for a given name, or -1 if not found.
 */
static UScriptCode scriptNameToCode(const UnicodeString& name) {
    char buf[128];
    UScriptCode code;
    UErrorCode ec = U_ZERO_ERROR;
    int32_t nameLen = name.length();
    UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);

    if (isInvariant) {
        name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
        buf[127] = 0;   // Make sure that we NULL terminate the string.
    }
    if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
    {
        code = USCRIPT_INVALID_CODE;
    }
    return code;
}

/**
 * Registers standard transliterators with the system.  Called by
 * Transliterator during initialization.  Scan all current targets and
 * register those that are scripts T as Any-T/V.
 */
void AnyTransliterator::registerIDs() {

    UErrorCode ec = U_ZERO_ERROR;
    Hashtable seen(TRUE, ec);

    int32_t sourceCount = Transliterator::_countAvailableSources();
    for (int32_t s=0; s<sourceCount; ++s) {
        UnicodeString source;
        Transliterator::_getAvailableSource(s, source);

        // Ignore the "Any" source
        if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;

        int32_t targetCount = Transliterator::_countAvailableTargets(source);
        for (int32_t t=0; t<targetCount; ++t) {
            UnicodeString target;
            Transliterator::_getAvailableTarget(t, source, target);

            // Only process each target once
            if (seen.geti(target) != 0) continue;
            ec = U_ZERO_ERROR;
            seen.puti(target, 1, ec);

            // Get the script code for the target.  If not a script, ignore.
            UScriptCode targetScript = scriptNameToCode(target);
            if (targetScript == USCRIPT_INVALID_CODE) continue;

            int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
            // assert(variantCount >= 1);
            for (int32_t v=0; v<variantCount; ++v) {
                UnicodeString variant;
                Transliterator::_getAvailableVariant(v, source, target, variant);

                UnicodeString id;
                TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id);
                ec = U_ZERO_ERROR;
                AnyTransliterator* tl = new AnyTransliterator(id, target, variant,
                                                             targetScript, ec);
                if (U_FAILURE(ec)) {
                    delete tl;
                } else {
                    Transliterator::_registerInstance(tl);
                    Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE);
                }
            }
        }
    }
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

//eof