diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 17:32:43 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 17:32:43 +0000 |
commit | 6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch) | |
tree | a68f146d7fa01f0134297619fbe7e33db084e0aa /intl/icu/source/tools/toolutil/ucm.cpp | |
parent | Initial commit. (diff) | |
download | thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.tar.xz thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.zip |
Adding upstream version 1:115.7.0.upstream/1%115.7.0upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'intl/icu/source/tools/toolutil/ucm.cpp')
-rw-r--r-- | intl/icu/source/tools/toolutil/ucm.cpp | 1195 |
1 files changed, 1195 insertions, 0 deletions
diff --git a/intl/icu/source/tools/toolutil/ucm.cpp b/intl/icu/source/tools/toolutil/ucm.cpp new file mode 100644 index 0000000000..272570e72f --- /dev/null +++ b/intl/icu/source/tools/toolutil/ucm.cpp @@ -0,0 +1,1195 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2003-2013, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: ucm.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003jun20 +* created by: Markus W. Scherer +* +* This file reads a .ucm file, stores its mappings and sorts them. +* It implements handling of Unicode conversion mappings from .ucm files +* for makeconv, canonucm, rptp2ucm, etc. +* +* Unicode code point sequences with a length of more than 1, +* as well as byte sequences with more than 4 bytes or more than one complete +* character sequence are handled to support m:n mappings. +*/ + +#include "unicode/utypes.h" +#include "unicode/ustring.h" +#include "cstring.h" +#include "cmemory.h" +#include "filestrm.h" +#include "uarrsort.h" +#include "ucnvmbcs.h" +#include "ucnv_bld.h" +#include "ucnv_ext.h" +#include "uparse.h" +#include "ucm.h" +#include <stdio.h> + +#if !UCONFIG_NO_CONVERSION + +/* -------------------------------------------------------------------------- */ + +static void +printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) { + int32_t j; + + for(j=0; j<m->uLen; ++j) { + fprintf(f, "<U%04lX>", (long)codePoints[j]); + } + + fputc(' ', f); + + for(j=0; j<m->bLen; ++j) { + fprintf(f, "\\x%02X", bytes[j]); + } + + if(m->f>=0) { + fprintf(f, " |%u\n", m->f); + } else { + fputs("\n", f); + } +} + +U_CAPI void U_EXPORT2 +ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) { + printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f); +} + +U_CAPI void U_EXPORT2 +ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) { + UCMapping *m; + int32_t i, length; + + m=table->mappings; + length=table->mappingsLength; + if(byUnicode) { + for(i=0; i<length; ++m, ++i) { + ucm_printMapping(table, m, f); + } + } else { + const int32_t *map=table->reverseMap; + for(i=0; i<length; ++i) { + ucm_printMapping(table, m+map[i], f); + } + } +} + +/* mapping comparisons ------------------------------------------------------ */ + +static int32_t +compareUnicode(UCMTable *lTable, const UCMapping *l, + UCMTable *rTable, const UCMapping *r) { + const UChar32 *lu, *ru; + int32_t result, i, length; + + if(l->uLen==1 && r->uLen==1) { + /* compare two single code points */ + return l->u-r->u; + } + + /* get pointers to the code point sequences */ + lu=UCM_GET_CODE_POINTS(lTable, l); + ru=UCM_GET_CODE_POINTS(rTable, r); + + /* get the minimum length */ + if(l->uLen<=r->uLen) { + length=l->uLen; + } else { + length=r->uLen; + } + + /* compare the code points */ + for(i=0; i<length; ++i) { + result=lu[i]-ru[i]; + if(result!=0) { + return result; + } + } + + /* compare the lengths */ + return l->uLen-r->uLen; +} + +static int32_t +compareBytes(UCMTable *lTable, const UCMapping *l, + UCMTable *rTable, const UCMapping *r, + UBool lexical) { + const uint8_t *lb, *rb; + int32_t result, i, length; + + /* + * A lexical comparison is used for sorting in the builder, to allow + * an efficient search for a byte sequence that could be a prefix + * of a previously entered byte sequence. + * + * Comparing by lengths first is for compatibility with old .ucm tools + * like canonucm and rptp2ucm. + */ + if(lexical) { + /* get the minimum length and continue */ + if(l->bLen<=r->bLen) { + length=l->bLen; + } else { + length=r->bLen; + } + } else { + /* compare lengths first */ + result=l->bLen-r->bLen; + if(result!=0) { + return result; + } else { + length=l->bLen; + } + } + + /* get pointers to the byte sequences */ + lb=UCM_GET_BYTES(lTable, l); + rb=UCM_GET_BYTES(rTable, r); + + /* compare the bytes */ + for(i=0; i<length; ++i) { + result=lb[i]-rb[i]; + if(result!=0) { + return result; + } + } + + /* compare the lengths */ + return l->bLen-r->bLen; +} + +/* compare UCMappings for sorting */ +static int32_t +compareMappings(UCMTable *lTable, const UCMapping *l, + UCMTable *rTable, const UCMapping *r, + UBool uFirst) { + int32_t result; + + /* choose which side to compare first */ + if(uFirst) { + /* Unicode then bytes */ + result=compareUnicode(lTable, l, rTable, r); + if(result==0) { + result=compareBytes(lTable, l, rTable, r, false); /* not lexically, like canonucm */ + } + } else { + /* bytes then Unicode */ + result=compareBytes(lTable, l, rTable, r, true); /* lexically, for builder */ + if(result==0) { + result=compareUnicode(lTable, l, rTable, r); + } + } + + if(result!=0) { + return result; + } + + /* compare the flags */ + return l->f-r->f; +} +U_CDECL_BEGIN +/* sorting by Unicode first sorts mappings directly */ +static int32_t U_CALLCONV +compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) { + return compareMappings( + (UCMTable *)context, (const UCMapping *)left, + (UCMTable *)context, (const UCMapping *)right, true); +} + +/* sorting by bytes first sorts the reverseMap; use indirection to mappings */ +static int32_t U_CALLCONV +compareMappingsBytesFirst(const void *context, const void *left, const void *right) { + UCMTable *table=(UCMTable *)context; + int32_t l=*(const int32_t *)left, r=*(const int32_t *)right; + return compareMappings( + table, table->mappings+l, + table, table->mappings+r, false); +} +U_CDECL_END + +U_CAPI void U_EXPORT2 +ucm_sortTable(UCMTable *t) { + UErrorCode errorCode; + int32_t i; + + if(t->isSorted) { + return; + } + + errorCode=U_ZERO_ERROR; + + /* 1. sort by Unicode first */ + uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping), + compareMappingsUnicodeFirst, t, + false, &errorCode); + + /* build the reverseMap */ + if(t->reverseMap==nullptr) { + /* + * allocate mappingsCapacity instead of mappingsLength so that + * if mappings are added, the reverseMap need not be + * reallocated each time + * (see ucm_moveMappings() and ucm_addMapping()) + */ + t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t)); + if(t->reverseMap==nullptr) { + fprintf(stderr, "ucm error: unable to allocate reverseMap\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + for(i=0; i<t->mappingsLength; ++i) { + t->reverseMap[i]=i; + } + + /* 2. sort reverseMap by mappings bytes first */ + uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t), + compareMappingsBytesFirst, t, + false, &errorCode); + + if(U_FAILURE(errorCode)) { + fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n", + u_errorName(errorCode)); + exit(errorCode); + } + + t->isSorted=true; +} + +/* + * remove mappings with their move flag set from the base table + * and move some of them (with UCM_MOVE_TO_EXT) to the extension table + */ +U_CAPI void U_EXPORT2 +ucm_moveMappings(UCMTable *base, UCMTable *ext) { + UCMapping *mb, *mbLimit; + int8_t flag; + + mb=base->mappings; + mbLimit=mb+base->mappingsLength; + + while(mb<mbLimit) { + flag=mb->moveFlag; + if(flag!=0) { + /* reset the move flag */ + mb->moveFlag=0; + + if(ext!=nullptr && (flag&UCM_MOVE_TO_EXT)) { + /* add the mapping to the extension table */ + ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb)); + } + + /* remove this mapping: move the last base mapping down and overwrite the current one */ + if(mb<(mbLimit-1)) { + uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping)); + } + --mbLimit; + --base->mappingsLength; + base->isSorted=false; + } else { + ++mb; + } + } +} + +enum { + NEEDS_MOVE=1, + HAS_ERRORS=2 +}; + +static uint8_t +checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext, + UBool moveToExt, UBool intersectBase) { + (void)baseStates; + + UCMapping *mb, *me, *mbLimit, *meLimit; + int32_t cmp; + uint8_t result; + + mb=base->mappings; + mbLimit=mb+base->mappingsLength; + + me=ext->mappings; + meLimit=me+ext->mappingsLength; + + result=0; + + for(;;) { + /* skip irrelevant mappings on both sides */ + for(;;) { + if(mb==mbLimit) { + return result; + } + + if((0<=mb->f && mb->f<=2) || mb->f==4) { + break; + } + + ++mb; + } + + for(;;) { + if(me==meLimit) { + return result; + } + + if((0<=me->f && me->f<=2) || me->f==4) { + break; + } + + ++me; + } + + /* compare the base and extension mappings */ + cmp=compareUnicode(base, mb, ext, me); + if(cmp<0) { + if(intersectBase && (intersectBase!=2 || mb->bLen>1)) { + /* + * mapping in base but not in ext, move it + * + * if ext is DBCS, move DBCS mappings here + * and check SBCS ones for Unicode prefix below + */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + + /* does mb map from an input sequence that is a prefix of me's? */ + } else if( mb->uLen<me->uLen && + 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) + ) { + if(moveToExt) { + /* mark this mapping to be moved to the extension table */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + } else { + fprintf(stderr, + "ucm error: the base table contains a mapping whose input sequence\n" + " is a prefix of the input sequence of an extension mapping\n"); + ucm_printMapping(base, mb, stderr); + ucm_printMapping(ext, me, stderr); + result|=HAS_ERRORS; + } + } + + ++mb; + } else if(cmp==0) { + /* + * same output: remove the extension mapping, + * otherwise treat as an error + */ + if( mb->f==me->f && mb->bLen==me->bLen && + 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) + ) { + me->moveFlag|=UCM_REMOVE_MAPPING; + result|=NEEDS_MOVE; + } else if(intersectBase) { + /* mapping in base but not in ext, move it */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + } else { + fprintf(stderr, + "ucm error: the base table contains a mapping whose input sequence\n" + " is the same as the input sequence of an extension mapping\n" + " but it maps differently\n"); + ucm_printMapping(base, mb, stderr); + ucm_printMapping(ext, me, stderr); + result|=HAS_ERRORS; + } + + ++mb; + } else /* cmp>0 */ { + ++me; + } + } +} + +static uint8_t +checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, + UBool moveToExt, UBool intersectBase) { + UCMapping *mb, *me; + int32_t *baseMap, *extMap; + int32_t b, e, bLimit, eLimit, cmp; + uint8_t result; + UBool isSISO; + + baseMap=base->reverseMap; + extMap=ext->reverseMap; + + b=e=0; + bLimit=base->mappingsLength; + eLimit=ext->mappingsLength; + + result=0; + + isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO); + + for(;;) { + /* skip irrelevant mappings on both sides */ + for(;; ++b) { + if(b==bLimit) { + return result; + } + mb=base->mappings+baseMap[b]; + + if(intersectBase==2 && mb->bLen==1) { + /* + * comparing a base against a DBCS extension: + * leave SBCS base mappings alone + */ + continue; + } + + if(mb->f==0 || mb->f==3) { + break; + } + } + + for(;;) { + if(e==eLimit) { + return result; + } + me=ext->mappings+extMap[e]; + + if(me->f==0 || me->f==3) { + break; + } + + ++e; + } + + /* compare the base and extension mappings */ + cmp=compareBytes(base, mb, ext, me, true); + if(cmp<0) { + if(intersectBase) { + /* mapping in base but not in ext, move it */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + + /* + * does mb map from an input sequence that is a prefix of me's? + * for SI/SO tables, a single byte is never a prefix because it + * occurs in a separate single-byte state + */ + } else if( mb->bLen<me->bLen && + (!isSISO || mb->bLen>1) && + 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) + ) { + if(moveToExt) { + /* mark this mapping to be moved to the extension table */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + } else { + fprintf(stderr, + "ucm error: the base table contains a mapping whose input sequence\n" + " is a prefix of the input sequence of an extension mapping\n"); + ucm_printMapping(base, mb, stderr); + ucm_printMapping(ext, me, stderr); + result|=HAS_ERRORS; + } + } + + ++b; + } else if(cmp==0) { + /* + * same output: remove the extension mapping, + * otherwise treat as an error + */ + if( mb->f==me->f && mb->uLen==me->uLen && + 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) + ) { + me->moveFlag|=UCM_REMOVE_MAPPING; + result|=NEEDS_MOVE; + } else if(intersectBase) { + /* mapping in base but not in ext, move it */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + } else { + fprintf(stderr, + "ucm error: the base table contains a mapping whose input sequence\n" + " is the same as the input sequence of an extension mapping\n" + " but it maps differently\n"); + ucm_printMapping(base, mb, stderr); + ucm_printMapping(ext, me, stderr); + result|=HAS_ERRORS; + } + + ++b; + } else /* cmp>0 */ { + ++e; + } + } +} + +U_CAPI UBool U_EXPORT2 +ucm_checkValidity(UCMTable *table, UCMStates *baseStates) { + UCMapping *m, *mLimit; + int32_t count; + UBool isOK; + + m=table->mappings; + mLimit=m+table->mappingsLength; + isOK=true; + + while(m<mLimit) { + count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen); + if(count<1) { + ucm_printMapping(table, m, stderr); + isOK=false; + } + ++m; + } + + return isOK; +} + +U_CAPI UBool U_EXPORT2 +ucm_checkBaseExt(UCMStates *baseStates, + UCMTable *base, UCMTable *ext, UCMTable *moveTarget, + UBool intersectBase) { + uint8_t result; + + /* if we have an extension table, we must always use precision flags */ + if(base->flagsType&UCM_FLAGS_IMPLICIT) { + fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n"); + return false; + } + if(ext->flagsType&UCM_FLAGS_IMPLICIT) { + fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n"); + return false; + } + + /* checking requires both tables to be sorted */ + ucm_sortTable(base); + ucm_sortTable(ext); + + /* check */ + result= + checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=nullptr), intersectBase)| + checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=nullptr), intersectBase); + + if(result&HAS_ERRORS) { + return false; + } + + if(result&NEEDS_MOVE) { + ucm_moveMappings(ext, nullptr); + ucm_moveMappings(base, moveTarget); + ucm_sortTable(base); + ucm_sortTable(ext); + if(moveTarget!=nullptr) { + ucm_sortTable(moveTarget); + } + } + + return true; +} + +/* merge tables for rptp2ucm ------------------------------------------------ */ + +U_CAPI void U_EXPORT2 +ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, + const uint8_t *subchar, int32_t subcharLength, + uint8_t subchar1) { + UCMapping *fromUMapping, *toUMapping; + int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp; + + ucm_sortTable(fromUTable); + ucm_sortTable(toUTable); + + fromUMapping=fromUTable->mappings; + toUMapping=toUTable->mappings; + + fromUTop=fromUTable->mappingsLength; + toUTop=toUTable->mappingsLength; + + fromUIndex=toUIndex=0; + + while(fromUIndex<fromUTop && toUIndex<toUTop) { + cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, true); + if(cmp==0) { + /* equal: roundtrip, nothing to do (flags are initially 0) */ + ++fromUMapping; + ++toUMapping; + + ++fromUIndex; + ++toUIndex; + } else if(cmp<0) { + /* + * the fromU mapping does not have a toU counterpart: + * fallback Unicode->codepage + */ + if( (fromUMapping->bLen==subcharLength && + 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || + (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) + ) { + fromUMapping->f=2; /* SUB mapping */ + } else { + fromUMapping->f=1; /* normal fallback */ + } + + ++fromUMapping; + ++fromUIndex; + } else { + /* + * the toU mapping does not have a fromU counterpart: + * (reverse) fallback codepage->Unicode, copy it to the fromU table + */ + + /* ignore reverse fallbacks to Unicode SUB */ + if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { + toUMapping->f=3; /* reverse fallback */ + ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); + + /* the table may have been reallocated */ + fromUMapping=fromUTable->mappings+fromUIndex; + } + + ++toUMapping; + ++toUIndex; + } + } + + /* either one or both tables are exhausted */ + while(fromUIndex<fromUTop) { + /* leftover fromU mappings are fallbacks */ + if( (fromUMapping->bLen==subcharLength && + 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || + (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) + ) { + fromUMapping->f=2; /* SUB mapping */ + } else { + fromUMapping->f=1; /* normal fallback */ + } + + ++fromUMapping; + ++fromUIndex; + } + + while(toUIndex<toUTop) { + /* leftover toU mappings are reverse fallbacks */ + + /* ignore reverse fallbacks to Unicode SUB */ + if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { + toUMapping->f=3; /* reverse fallback */ + ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); + } + + ++toUMapping; + ++toUIndex; + } + + fromUTable->isSorted=false; +} + +/* separate extension mappings out of base table for rptp2ucm --------------- */ + +U_CAPI UBool U_EXPORT2 +ucm_separateMappings(UCMFile *ucm, UBool isSISO) { + UCMTable *table; + UCMapping *m, *mLimit; + int32_t type; + UBool needsMove, isOK; + + table=ucm->base; + m=table->mappings; + mLimit=m+table->mappingsLength; + + needsMove=false; + isOK=true; + + for(; m<mLimit; ++m) { + if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) { + fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n"); + ucm_printMapping(table, m, stderr); + m->moveFlag|=UCM_REMOVE_MAPPING; + needsMove=true; + continue; + } + + type=ucm_mappingType( + &ucm->states, m, + UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m)); + if(type<0) { + /* illegal byte sequence */ + printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr); + isOK=false; + } else if(type>0) { + m->moveFlag|=UCM_MOVE_TO_EXT; + needsMove=true; + } + } + + if(!isOK) { + return false; + } + if(needsMove) { + ucm_moveMappings(ucm->base, ucm->ext); + return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, false); + } else { + ucm_sortTable(ucm->base); + return true; + } +} + +/* ucm parser --------------------------------------------------------------- */ + +U_CAPI int8_t U_EXPORT2 +ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) { + const char *s=*ps; + char *end; + uint8_t byte; + int8_t bLen; + + bLen=0; + for(;;) { + /* skip an optional plus sign */ + if(bLen>0 && *s=='+') { + ++s; + } + if(*s!='\\') { + break; + } + + if( s[1]!='x' || + (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4 + ) { + fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line); + return -1; + } + + if(bLen==UCNV_EXT_MAX_BYTES) { + fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line); + return -1; + } + bytes[bLen++]=byte; + s=end; + } + + *ps=s; + return bLen; +} + +/* parse a mapping line; must not be empty */ +U_CAPI UBool U_EXPORT2 +ucm_parseMappingLine(UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES], + const char *line) { + const char *s; + char *end; + UChar32 cp; + int32_t u16Length; + int8_t uLen, bLen, f; + + s=line; + uLen=bLen=0; + + /* parse code points */ + for(;;) { + /* skip an optional plus sign */ + if(uLen>0 && *s=='+') { + ++s; + } + if(*s!='<') { + break; + } + + if( s[1]!='U' || + (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 || + *end!='>' + ) { + fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line); + return false; + } + if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) { + fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line); + return false; + } + + if(uLen==UCNV_EXT_MAX_UCHARS) { + fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line); + return false; + } + codePoints[uLen++]=cp; + s=end+1; + } + + if(uLen==0) { + fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line); + return false; + } else if(uLen==1) { + m->u=codePoints[0]; + } else { + UErrorCode errorCode=U_ZERO_ERROR; + u_strFromUTF32(nullptr, 0, &u16Length, codePoints, uLen, &errorCode); + if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) || + u16Length>UCNV_EXT_MAX_UCHARS + ) { + fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line); + return false; + } + } + + s=u_skipWhitespace(s); + + /* parse bytes */ + bLen=ucm_parseBytes(bytes, line, &s); + + if(bLen<0) { + return false; + } else if(bLen==0) { + fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line); + return false; + } else if(bLen<=4) { + uprv_memcpy(m->b.bytes, bytes, bLen); + } + + /* skip everything until the fallback indicator, even the start of a comment */ + for(;;) { + if(*s==0) { + f=-1; /* no fallback indicator */ + break; + } else if(*s=='|') { + f=(int8_t)(s[1]-'0'); + if((uint8_t)f>4) { + fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line); + return false; + } + break; + } + ++s; + } + + m->uLen=uLen; + m->bLen=bLen; + m->f=f; + return true; +} + +/* general APIs ------------------------------------------------------------- */ + +U_CAPI UCMTable * U_EXPORT2 +ucm_openTable() { + UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable)); + if(table==nullptr) { + fprintf(stderr, "ucm error: unable to allocate a UCMTable\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + memset(table, 0, sizeof(UCMTable)); + return table; +} + +U_CAPI void U_EXPORT2 +ucm_closeTable(UCMTable *table) { + if(table!=nullptr) { + uprv_free(table->mappings); + uprv_free(table->codePoints); + uprv_free(table->bytes); + uprv_free(table->reverseMap); + uprv_free(table); + } +} + +U_CAPI void U_EXPORT2 +ucm_resetTable(UCMTable *table) { + if(table!=nullptr) { + table->mappingsLength=0; + table->flagsType=0; + table->unicodeMask=0; + table->bytesLength=table->codePointsLength=0; + table->isSorted=false; + } +} + +U_CAPI void U_EXPORT2 +ucm_addMapping(UCMTable *table, + UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES]) { + UCMapping *tm; + UChar32 c; + int32_t idx; + + if(table->mappingsLength>=table->mappingsCapacity) { + /* make the mappings array larger */ + if(table->mappingsCapacity==0) { + table->mappingsCapacity=1000; + } else { + table->mappingsCapacity*=10; + } + table->mappings=(UCMapping *)uprv_realloc(table->mappings, + table->mappingsCapacity*sizeof(UCMapping)); + if(table->mappings==nullptr) { + fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n", + (int)table->mappingsCapacity); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + if(table->reverseMap!=nullptr) { + /* the reverseMap must be reallocated in a new sort */ + uprv_free(table->reverseMap); + table->reverseMap=nullptr; + } + } + + if(m->uLen>1 && table->codePointsCapacity==0) { + table->codePointsCapacity=10000; + table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4); + if(table->codePoints==nullptr) { + fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n", + (int)table->codePointsCapacity); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + + if(m->bLen>4 && table->bytesCapacity==0) { + table->bytesCapacity=10000; + table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity); + if(table->bytes==nullptr) { + fprintf(stderr, "ucm error: unable to allocate %d bytes\n", + (int)table->bytesCapacity); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + + if(m->uLen>1) { + idx=table->codePointsLength; + table->codePointsLength+=m->uLen; + if(table->codePointsLength>table->codePointsCapacity) { + fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + uprv_memcpy(table->codePoints+idx, codePoints, (size_t)m->uLen*4); + m->u=idx; + } + + if(m->bLen>4) { + idx=table->bytesLength; + table->bytesLength+=m->bLen; + if(table->bytesLength>table->bytesCapacity) { + fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + uprv_memcpy(table->bytes+idx, bytes, m->bLen); + m->b.idx=idx; + } + + /* set unicodeMask */ + for(idx=0; idx<m->uLen; ++idx) { + c=codePoints[idx]; + if(c>=0x10000) { + table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */ + } else if(U_IS_SURROGATE(c)) { + table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */ + } + } + + /* set flagsType */ + if(m->f<0) { + table->flagsType|=UCM_FLAGS_IMPLICIT; + } else { + table->flagsType|=UCM_FLAGS_EXPLICIT; + } + + tm=table->mappings+table->mappingsLength++; + uprv_memcpy(tm, m, sizeof(UCMapping)); + + table->isSorted=false; +} + +U_CAPI UCMFile * U_EXPORT2 +ucm_open() { + UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile)); + if(ucm==nullptr) { + fprintf(stderr, "ucm error: unable to allocate a UCMFile\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + memset(ucm, 0, sizeof(UCMFile)); + + ucm->base=ucm_openTable(); + ucm->ext=ucm_openTable(); + + ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT; + ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER; + ucm->states.outputType=-1; + ucm->states.minCharLength=ucm->states.maxCharLength=1; + + return ucm; +} + +U_CAPI void U_EXPORT2 +ucm_close(UCMFile *ucm) { + if(ucm!=nullptr) { + ucm_closeTable(ucm->base); + ucm_closeTable(ucm->ext); + uprv_free(ucm); + } +} + +U_CAPI int32_t U_EXPORT2 +ucm_mappingType(UCMStates *baseStates, + UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES]) { + (void)codePoints; + /* check validity of the bytes and count the characters in them */ + int32_t count=ucm_countChars(baseStates, bytes, m->bLen); + if(count<1) { + /* illegal byte sequence */ + return -1; + } + + /* + * Suitable for an ICU conversion base table means: + * - a 1:1 mapping (1 Unicode code point : 1 byte sequence) + * - precision flag 0..3 + * - SBCS: any 1:1 mapping + * (the table stores additional bits to distinguish mapping types) + * - MBCS: not a |2 SUB mapping for <subchar1> + * - MBCS: not a |1 fallback to 0x00 + * - MBCS: not a multi-byte mapping with leading 0x00 bytes + * + * Further restrictions for fromUnicode tables + * are enforced in makeconv (MBCSOkForBaseFromUnicode()). + * + * All of the MBCS fromUnicode specific tests could be removed from here, + * but the ones above are for unusual mappings, and removing the tests + * from here would change canonucm output which seems gratuitous. + * (Markus Scherer 2006-nov-28) + * + * Exception: All implicit mappings (f<0) that need to be moved + * because of fromUnicode restrictions _must_ be moved here because + * makeconv uses a hack for moving mappings only for the fromUnicode table + * that only works with non-negative values of f. + */ + if( m->uLen==1 && count==1 && m->f<=3 && + (baseStates->maxCharLength==1 || + !((m->f==2 && m->bLen==1) || + (m->f==1 && bytes[0]==0) || + (m->f<=1 && m->bLen>1 && bytes[0]==0))) + ) { + return 0; /* suitable for a base table */ + } else { + return 1; /* needs to go into an extension table */ + } +} + +U_CAPI UBool U_EXPORT2 +ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, + UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES]) { + int32_t type; + + if(m->f==2 && m->uLen>1) { + fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n"); + printMapping(m, codePoints, bytes, stderr); + return false; + } + + if(baseStates!=nullptr) { + /* check validity of the bytes and count the characters in them */ + type=ucm_mappingType(baseStates, m, codePoints, bytes); + if(type<0) { + /* illegal byte sequence */ + printMapping(m, codePoints, bytes, stderr); + return false; + } + } else { + /* not used - adding a mapping for an extension-only table before its base table is read */ + type=1; + } + + /* + * Add the mapping to the base table if this is requested and suitable. + * Otherwise, add it to the extension table. + */ + if(forBase && type==0) { + ucm_addMapping(ucm->base, m, codePoints, bytes); + } else { + ucm_addMapping(ucm->ext, m, codePoints, bytes); + } + + return true; +} + +U_CAPI UBool U_EXPORT2 +ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) { + UCMapping m={ 0, {0}, 0, 0, 0, 0 }; + UChar32 codePoints[UCNV_EXT_MAX_UCHARS]; + uint8_t bytes[UCNV_EXT_MAX_BYTES]; + + const char *s; + + /* ignore empty and comment lines */ + if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') { + return true; + } + + return + ucm_parseMappingLine(&m, codePoints, bytes, line) && + ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes); +} + +U_CAPI void U_EXPORT2 +ucm_readTable(UCMFile *ucm, FileStream* convFile, + UBool forBase, UCMStates *baseStates, + UErrorCode *pErrorCode) { + char line[500]; + char *end; + UBool isOK; + + if(U_FAILURE(*pErrorCode)) { + return; + } + + isOK=true; + + for(;;) { + /* read the next line */ + if(!T_FileStream_readLine(convFile, line, sizeof(line))) { + fprintf(stderr, "incomplete charmap section\n"); + isOK=false; + break; + } + + /* remove CR LF */ + end=uprv_strchr(line, 0); + while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) { + --end; + } + *end=0; + + /* ignore empty and comment lines */ + if(line[0]==0 || line[0]=='#') { + continue; + } + + /* stop at the end of the mapping table */ + if(0==uprv_strcmp(line, "END CHARMAP")) { + break; + } + + isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates); + } + + if(!isOK) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + } +} +#endif |