diff options
Diffstat (limited to 'intl/icu/source/common/unames.cpp')
-rw-r--r-- | intl/icu/source/common/unames.cpp | 2108 |
1 files changed, 2108 insertions, 0 deletions
diff --git a/intl/icu/source/common/unames.cpp b/intl/icu/source/common/unames.cpp new file mode 100644 index 0000000000..1b3192bf25 --- /dev/null +++ b/intl/icu/source/common/unames.cpp @@ -0,0 +1,2108 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +****************************************************************************** +* +* Copyright (C) 1999-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* file name: unames.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999oct04 +* created by: Markus W. Scherer +*/ + +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "unicode/uchar.h" +#include "unicode/udata.h" +#include "unicode/utf.h" +#include "unicode/utf16.h" +#include "uassert.h" +#include "ustr_imp.h" +#include "umutex.h" +#include "cmemory.h" +#include "cstring.h" +#include "ucln_cmn.h" +#include "udataswp.h" +#include "uprops.h" + +U_NAMESPACE_BEGIN + +/* prototypes ------------------------------------------------------------- */ + +static const char DATA_NAME[] = "unames"; +static const char DATA_TYPE[] = "icu"; + +#define GROUP_SHIFT 5 +#define LINES_PER_GROUP (1L<<GROUP_SHIFT) +#define GROUP_MASK (LINES_PER_GROUP-1) + +/* + * This struct was replaced by explicitly accessing equivalent + * fields from triples of uint16_t. + * The Group struct was padded to 8 bytes on compilers for early ARM CPUs, + * which broke the assumption that sizeof(Group)==6 and that the ++ operator + * would advance by 6 bytes (3 uint16_t). + * + * We can't just change the data structure because it's loaded from a data file, + * and we don't want to make it less compact, so we changed the access code. + * + * For details see ICU tickets 6331 and 6008. +typedef struct { + uint16_t groupMSB, + offsetHigh, offsetLow; / * avoid padding * / +} Group; + */ +enum { + GROUP_MSB, + GROUP_OFFSET_HIGH, + GROUP_OFFSET_LOW, + GROUP_LENGTH +}; + +/* + * Get the 32-bit group offset. + * @param group (const uint16_t *) pointer to a Group triple of uint16_t + * @return group offset (int32_t) + */ +#define GET_GROUP_OFFSET(group) ((int32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW]) + +#define NEXT_GROUP(group) ((group)+GROUP_LENGTH) +#define PREV_GROUP(group) ((group)-GROUP_LENGTH) + +typedef struct { + uint32_t start, end; + uint8_t type, variant; + uint16_t size; +} AlgorithmicRange; + +typedef struct { + uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset; +} UCharNames; + +/* + * Get the groups table from a UCharNames struct. + * The groups table consists of one uint16_t groupCount followed by + * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH + * and the comment for the old struct Group above. + * + * @param names (const UCharNames *) pointer to the UCharNames indexes + * @return (const uint16_t *) pointer to the groups table + */ +#define GET_GROUPS(names) (const uint16_t *)((const char *)names+names->groupsOffset) + +typedef struct { + const char *otherName; + UChar32 code; +} FindName; + +#define DO_FIND_NAME nullptr + +static UDataMemory *uCharNamesData=nullptr; +static UCharNames *uCharNames=nullptr; +static icu::UInitOnce gCharNamesInitOnce {}; + +/* + * Maximum length of character names (regular & 1.0). + */ +static int32_t gMaxNameLength=0; + +/* + * Set of chars used in character names (regular & 1.0). + * Chars are platform-dependent (can be EBCDIC). + */ +static uint32_t gNameSet[8]={ 0 }; + +#define U_NONCHARACTER_CODE_POINT U_CHAR_CATEGORY_COUNT +#define U_LEAD_SURROGATE U_CHAR_CATEGORY_COUNT + 1 +#define U_TRAIL_SURROGATE U_CHAR_CATEGORY_COUNT + 2 + +#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3) + +static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = { + "unassigned", + "uppercase letter", + "lowercase letter", + "titlecase letter", + "modifier letter", + "other letter", + "non spacing mark", + "enclosing mark", + "combining spacing mark", + "decimal digit number", + "letter number", + "other number", + "space separator", + "line separator", + "paragraph separator", + "control", + "format", + "private use area", + "surrogate", + "dash punctuation", + "start punctuation", + "end punctuation", + "connector punctuation", + "other punctuation", + "math symbol", + "currency symbol", + "modifier symbol", + "other symbol", + "initial punctuation", + "final punctuation", + "noncharacter", + "lead surrogate", + "trail surrogate" +}; + +/* implementation ----------------------------------------------------------- */ + +static UBool U_CALLCONV unames_cleanup() +{ + if(uCharNamesData) { + udata_close(uCharNamesData); + uCharNamesData = nullptr; + } + if(uCharNames) { + uCharNames = nullptr; + } + gCharNamesInitOnce.reset(); + gMaxNameLength=0; + return true; +} + +static UBool U_CALLCONV +isAcceptable(void * /*context*/, + const char * /*type*/, const char * /*name*/, + const UDataInfo *pInfo) { + return (UBool)( + pInfo->size>=20 && + pInfo->isBigEndian==U_IS_BIG_ENDIAN && + pInfo->charsetFamily==U_CHARSET_FAMILY && + pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */ + pInfo->dataFormat[1]==0x6e && + pInfo->dataFormat[2]==0x61 && + pInfo->dataFormat[3]==0x6d && + pInfo->formatVersion[0]==1); +} + +static void U_CALLCONV +loadCharNames(UErrorCode &status) { + U_ASSERT(uCharNamesData == nullptr); + U_ASSERT(uCharNames == nullptr); + + uCharNamesData = udata_openChoice(nullptr, DATA_TYPE, DATA_NAME, isAcceptable, nullptr, &status); + if(U_FAILURE(status)) { + uCharNamesData = nullptr; + } else { + uCharNames = (UCharNames *)udata_getMemory(uCharNamesData); + } + ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup); +} + + +static UBool +isDataLoaded(UErrorCode *pErrorCode) { + umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode); + return U_SUCCESS(*pErrorCode); +} + +#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) UPRV_BLOCK_MACRO_BEGIN { \ + if((bufferLength)>0) { \ + *(buffer)++=c; \ + --(bufferLength); \ + } \ + ++(bufferPos); \ +} UPRV_BLOCK_MACRO_END + +#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT + +/* + * Important: expandName() and compareName() are almost the same - + * apply fixes to both. + * + * UnicodeData.txt uses ';' as a field separator, so no + * field can contain ';' as part of its contents. + * In unames.dat, it is marked as token[';']==-1 only if the + * semicolon is used in the data file - which is iff we + * have Unicode 1.0 names or ISO comments or aliases. + * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases + * although we know that it will never be part of a name. + */ +static uint16_t +expandName(UCharNames *names, + const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, + char *buffer, uint16_t bufferLength) { + uint16_t *tokens=(uint16_t *)names+8; + uint16_t token, tokenCount=*tokens++, bufferPos=0; + uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; + uint8_t c; + + if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { + /* + * skip the modern name if it is not requested _and_ + * if the semicolon byte value is a character, not a token number + */ + if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { + int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; + do { + while(nameLength>0) { + --nameLength; + if(*name++==';') { + break; + } + } + } while(--fieldIndex>0); + } else { + /* + * the semicolon byte value is a token number, therefore + * only modern names are stored in unames.dat and there is no + * such requested alternate name here + */ + nameLength=0; + } + } + + /* write each letter directly, and write a token word per token */ + while(nameLength>0) { + --nameLength; + c=*name++; + + if(c>=tokenCount) { + if(c!=';') { + /* implicit letter */ + WRITE_CHAR(buffer, bufferLength, bufferPos, c); + } else { + /* finished */ + break; + } + } else { + token=tokens[c]; + if(token==(uint16_t)(-2)) { + /* this is a lead byte for a double-byte token */ + token=tokens[c<<8|*name++]; + --nameLength; + } + if(token==(uint16_t)(-1)) { + if(c!=';') { + /* explicit letter */ + WRITE_CHAR(buffer, bufferLength, bufferPos, c); + } else { + /* stop, but skip the semicolon if we are seeking + extended names and there was no 2.0 name but there + is a 1.0 name. */ + if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) { + if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { + continue; + } + } + /* finished */ + break; + } + } else { + /* write token word */ + uint8_t *tokenString=tokenStrings+token; + while((c=*tokenString++)!=0) { + WRITE_CHAR(buffer, bufferLength, bufferPos, c); + } + } + } + } + + /* zero-terminate */ + if(bufferLength>0) { + *buffer=0; + } + + return bufferPos; +} + +/* + * compareName() is almost the same as expandName() except that it compares + * the currently expanded name to an input name. + * It returns the match/no match result as soon as possible. + */ +static UBool +compareName(UCharNames *names, + const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, + const char *otherName) { + uint16_t *tokens=(uint16_t *)names+8; + uint16_t token, tokenCount=*tokens++; + uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; + uint8_t c; + const char *origOtherName = otherName; + + if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { + /* + * skip the modern name if it is not requested _and_ + * if the semicolon byte value is a character, not a token number + */ + if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { + int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; + do { + while(nameLength>0) { + --nameLength; + if(*name++==';') { + break; + } + } + } while(--fieldIndex>0); + } else { + /* + * the semicolon byte value is a token number, therefore + * only modern names are stored in unames.dat and there is no + * such requested alternate name here + */ + nameLength=0; + } + } + + /* compare each letter directly, and compare a token word per token */ + while(nameLength>0) { + --nameLength; + c=*name++; + + if(c>=tokenCount) { + if(c!=';') { + /* implicit letter */ + if((char)c!=*otherName++) { + return false; + } + } else { + /* finished */ + break; + } + } else { + token=tokens[c]; + if(token==(uint16_t)(-2)) { + /* this is a lead byte for a double-byte token */ + token=tokens[c<<8|*name++]; + --nameLength; + } + if(token==(uint16_t)(-1)) { + if(c!=';') { + /* explicit letter */ + if((char)c!=*otherName++) { + return false; + } + } else { + /* stop, but skip the semicolon if we are seeking + extended names and there was no 2.0 name but there + is a 1.0 name. */ + if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) { + if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { + continue; + } + } + /* finished */ + break; + } + } else { + /* write token word */ + uint8_t *tokenString=tokenStrings+token; + while((c=*tokenString++)!=0) { + if((char)c!=*otherName++) { + return false; + } + } + } + } + } + + /* complete match? */ + return (UBool)(*otherName==0); +} + +static uint8_t getCharCat(UChar32 cp) { + uint8_t cat; + + if (U_IS_UNICODE_NONCHAR(cp)) { + return U_NONCHARACTER_CODE_POINT; + } + + if ((cat = u_charType(cp)) == U_SURROGATE) { + cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE; + } + + return cat; +} + +static const char *getCharCatName(UChar32 cp) { + uint8_t cat = getCharCat(cp); + + /* Return unknown if the table of names above is not up to + date. */ + + if (cat >= UPRV_LENGTHOF(charCatNames)) { + return "unknown"; + } else { + return charCatNames[cat]; + } +} + +static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) { + const char *catname = getCharCatName(code); + uint16_t length = 0; + + UChar32 cp; + int ndigits, i; + + WRITE_CHAR(buffer, bufferLength, length, '<'); + while (catname[length - 1]) { + WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]); + } + WRITE_CHAR(buffer, bufferLength, length, '-'); + for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4) + ; + if (ndigits < 4) + ndigits = 4; + for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) { + uint8_t v = (uint8_t)(cp & 0xf); + buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10); + } + buffer += ndigits; + length += static_cast<uint16_t>(ndigits); + WRITE_CHAR(buffer, bufferLength, length, '>'); + + return length; +} + +/* + * getGroup() does a binary search for the group that contains the + * Unicode code point "code". + * The return value is always a valid Group* that may contain "code" + * or else is the highest group before "code". + * If the lowest group is after "code", then that one is returned. + */ +static const uint16_t * +getGroup(UCharNames *names, uint32_t code) { + const uint16_t *groups=GET_GROUPS(names); + uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT), + start=0, + limit=*groups++, + number; + + /* binary search for the group of names that contains the one for code */ + while(start<limit-1) { + number=(uint16_t)((start+limit)/2); + if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) { + limit=number; + } else { + start=number; + } + } + + /* return this regardless of whether it is an exact match */ + return groups+start*GROUP_LENGTH; +} + +/* + * expandGroupLengths() reads a block of compressed lengths of 32 strings and + * expands them into offsets and lengths for each string. + * Lengths are stored with a variable-width encoding in consecutive nibbles: + * If a nibble<0xc, then it is the length itself (0=empty string). + * If a nibble>=0xc, then it forms a length value with the following nibble. + * Calculation see below. + * The offsets and lengths arrays must be at least 33 (one more) long because + * there is no check here at the end if the last nibble is still used. + */ +static const uint8_t * +expandGroupLengths(const uint8_t *s, + uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) { + /* read the lengths of the 32 strings in this group and get each string's offset */ + uint16_t i=0, offset=0, length=0; + uint8_t lengthByte; + + /* all 32 lengths must be read to get the offset of the first group string */ + while(i<LINES_PER_GROUP) { + lengthByte=*s++; + + /* read even nibble - MSBs of lengthByte */ + if(length>=12) { + /* double-nibble length spread across two bytes */ + length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12); + lengthByte&=0xf; + } else if((lengthByte /* &0xf0 */)>=0xc0) { + /* double-nibble length spread across this one byte */ + length=(uint16_t)((lengthByte&0x3f)+12); + } else { + /* single-nibble length in MSBs */ + length=(uint16_t)(lengthByte>>4); + lengthByte&=0xf; + } + + *offsets++=offset; + *lengths++=length; + + offset+=length; + ++i; + + /* read odd nibble - LSBs of lengthByte */ + if((lengthByte&0xf0)==0) { + /* this nibble was not consumed for a double-nibble length above */ + length=lengthByte; + if(length<12) { + /* single-nibble length in LSBs */ + *offsets++=offset; + *lengths++=length; + + offset+=length; + ++i; + } + } else { + length=0; /* prevent double-nibble detection in the next iteration */ + } + } + + /* now, s is at the first group string */ + return s; +} + +static uint16_t +expandGroupName(UCharNames *names, const uint16_t *group, + uint16_t lineNumber, UCharNameChoice nameChoice, + char *buffer, uint16_t bufferLength) { + uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; + const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group); + s=expandGroupLengths(s, offsets, lengths); + return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice, + buffer, bufferLength); +} + +static uint16_t +getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice, + char *buffer, uint16_t bufferLength) { + const uint16_t *group=getGroup(names, code); + if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) { + return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice, + buffer, bufferLength); + } else { + /* group not found */ + /* zero-terminate */ + if(bufferLength>0) { + *buffer=0; + } + return 0; + } +} + +/* + * enumGroupNames() enumerates all the names in a 32-group + * and either calls the enumerator function or finds a given input name. + */ +static UBool +enumGroupNames(UCharNames *names, const uint16_t *group, + UChar32 start, UChar32 end, + UEnumCharNamesFn *fn, void *context, + UCharNameChoice nameChoice) { + uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; + const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group); + + s=expandGroupLengths(s, offsets, lengths); + if(fn!=DO_FIND_NAME) { + char buffer[200]; + uint16_t length; + + while(start<=end) { + length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer)); + if (!length && nameChoice == U_EXTENDED_CHAR_NAME) { + buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; + } + /* here, we assume that the buffer is large enough */ + if(length>0) { + if(!fn(context, start, nameChoice, buffer, length)) { + return false; + } + } + ++start; + } + } else { + const char *otherName=((FindName *)context)->otherName; + while(start<=end) { + if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) { + ((FindName *)context)->code=start; + return false; + } + ++start; + } + } + return true; +} + +/* + * enumExtNames enumerate extended names. + * It only needs to do it if it is called with a real function and not + * with the dummy DO_FIND_NAME, because u_charFromName() does a check + * for extended names by itself. + */ +static UBool +enumExtNames(UChar32 start, UChar32 end, + UEnumCharNamesFn *fn, void *context) +{ + if(fn!=DO_FIND_NAME) { + char buffer[200]; + uint16_t length; + + while(start<=end) { + buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; + /* here, we assume that the buffer is large enough */ + if(length>0) { + if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) { + return false; + } + } + ++start; + } + } + + return true; +} + +static UBool +enumNames(UCharNames *names, + UChar32 start, UChar32 limit, + UEnumCharNamesFn *fn, void *context, + UCharNameChoice nameChoice) { + uint16_t startGroupMSB, endGroupMSB, groupCount; + const uint16_t *group, *groupLimit; + + startGroupMSB=(uint16_t)(start>>GROUP_SHIFT); + endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT); + + /* find the group that contains start, or the highest before it */ + group=getGroup(names, start); + + if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) { + /* enumerate synthetic names between start and the group start */ + UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT); + if(extLimit>limit) { + extLimit=limit; + } + if(!enumExtNames(start, extLimit-1, fn, context)) { + return false; + } + start=extLimit; + } + + if(startGroupMSB==endGroupMSB) { + if(startGroupMSB==group[GROUP_MSB]) { + /* if start and limit-1 are in the same group, then enumerate only in that one */ + return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice); + } + } else { + const uint16_t *groups=GET_GROUPS(names); + groupCount=*groups++; + groupLimit=groups+groupCount*GROUP_LENGTH; + + if(startGroupMSB==group[GROUP_MSB]) { + /* enumerate characters in the partial start group */ + if((start&GROUP_MASK)!=0) { + if(!enumGroupNames(names, group, + start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1, + fn, context, nameChoice)) { + return false; + } + group=NEXT_GROUP(group); /* continue with the next group */ + } + } else if(startGroupMSB>group[GROUP_MSB]) { + /* make sure that we start enumerating with the first group after start */ + const uint16_t *nextGroup=NEXT_GROUP(group); + if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) { + UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; + if (end > limit) { + end = limit; + } + if (!enumExtNames(start, end - 1, fn, context)) { + return false; + } + } + group=nextGroup; + } + + /* enumerate entire groups between the start- and end-groups */ + while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) { + const uint16_t *nextGroup; + start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT; + if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) { + return false; + } + nextGroup=NEXT_GROUP(group); + if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) { + UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; + if (end > limit) { + end = limit; + } + if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) { + return false; + } + } + group=nextGroup; + } + + /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */ + if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) { + return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice); + } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) { + UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT; + if (next > start) { + start = next; + } + } else { + return true; + } + } + + /* we have not found a group, which means everything is made of + extended names. */ + if (nameChoice == U_EXTENDED_CHAR_NAME) { + if (limit > UCHAR_MAX_VALUE + 1) { + limit = UCHAR_MAX_VALUE + 1; + } + return enumExtNames(start, limit - 1, fn, context); + } + + return true; +} + +static uint16_t +writeFactorSuffix(const uint16_t *factors, uint16_t count, + const char *s, /* suffix elements */ + uint32_t code, + uint16_t indexes[8], /* output fields from here */ + const char *elementBases[8], const char *elements[8], + char *buffer, uint16_t bufferLength) { + uint16_t i, factor, bufferPos=0; + char c; + + /* write elements according to the factors */ + + /* + * the factorized elements are determined by modulo arithmetic + * with the factors of this algorithm + * + * note that for fewer operations, count is decremented here + */ + --count; + for(i=count; i>0; --i) { + factor=factors[i]; + indexes[i]=(uint16_t)(code%factor); + code/=factor; + } + /* + * we don't need to calculate the last modulus because start<=code<=end + * guarantees here that code<=factors[0] + */ + indexes[0]=(uint16_t)code; + + /* write each element */ + for(;;) { + if(elementBases!=nullptr) { + *elementBases++=s; + } + + /* skip indexes[i] strings */ + factor=indexes[i]; + while(factor>0) { + while(*s++!=0) {} + --factor; + } + if(elements!=nullptr) { + *elements++=s; + } + + /* write element */ + while((c=*s++)!=0) { + WRITE_CHAR(buffer, bufferLength, bufferPos, c); + } + + /* we do not need to perform the rest of this loop for i==count - break here */ + if(i>=count) { + break; + } + + /* skip the rest of the strings for this factors[i] */ + factor=(uint16_t)(factors[i]-indexes[i]-1); + while(factor>0) { + while(*s++!=0) {} + --factor; + } + + ++i; + } + + /* zero-terminate */ + if(bufferLength>0) { + *buffer=0; + } + + return bufferPos; +} + +/* + * Important: + * Parts of findAlgName() are almost the same as some of getAlgName(). + * Fixes must be applied to both. + */ +static uint16_t +getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice, + char *buffer, uint16_t bufferLength) { + uint16_t bufferPos=0; + + /* Only the normative character name can be algorithmic. */ + if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { + /* zero-terminate */ + if(bufferLength>0) { + *buffer=0; + } + return 0; + } + + switch(range->type) { + case 0: { + /* name = prefix hex-digits */ + const char *s=(const char *)(range+1); + char c; + + uint16_t i, count; + + /* copy prefix */ + while((c=*s++)!=0) { + WRITE_CHAR(buffer, bufferLength, bufferPos, c); + } + + /* write hexadecimal code point value */ + count=range->variant; + + /* zero-terminate */ + if(count<bufferLength) { + buffer[count]=0; + } + + for(i=count; i>0;) { + if(--i<bufferLength) { + c=(char)(code&0xf); + if(c<10) { + c+='0'; + } else { + c+='A'-10; + } + buffer[i]=c; + } + code>>=4; + } + + bufferPos+=count; + break; + } + case 1: { + /* name = prefix factorized-elements */ + uint16_t indexes[8]; + const uint16_t *factors=(const uint16_t *)(range+1); + uint16_t count=range->variant; + const char *s=(const char *)(factors+count); + char c; + + /* copy prefix */ + while((c=*s++)!=0) { + WRITE_CHAR(buffer, bufferLength, bufferPos, c); + } + + bufferPos+=writeFactorSuffix(factors, count, + s, code-range->start, indexes, nullptr, nullptr, buffer, bufferLength); + break; + } + default: + /* undefined type */ + /* zero-terminate */ + if(bufferLength>0) { + *buffer=0; + } + break; + } + + return bufferPos; +} + +/* + * Important: enumAlgNames() and findAlgName() are almost the same. + * Any fix must be applied to both. + */ +static UBool +enumAlgNames(AlgorithmicRange *range, + UChar32 start, UChar32 limit, + UEnumCharNamesFn *fn, void *context, + UCharNameChoice nameChoice) { + char buffer[200]; + uint16_t length; + + if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { + return true; + } + + switch(range->type) { + case 0: { + char *s, *end; + char c; + + /* get the full name of the start character */ + length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer)); + if(length<=0) { + return true; + } + + /* call the enumerator function with this first character */ + if(!fn(context, start, nameChoice, buffer, length)) { + return false; + } + + /* go to the end of the name; all these names have the same length */ + end=buffer; + while(*end!=0) { + ++end; + } + + /* enumerate the rest of the names */ + while(++start<limit) { + /* increment the hexadecimal number on a character-basis */ + s=end; + for (;;) { + c=*--s; + if(('0'<=c && c<'9') || ('A'<=c && c<'F')) { + *s=(char)(c+1); + break; + } else if(c=='9') { + *s='A'; + break; + } else if(c=='F') { + *s='0'; + } + } + + if(!fn(context, start, nameChoice, buffer, length)) { + return false; + } + } + break; + } + case 1: { + uint16_t indexes[8]; + const char *elementBases[8], *elements[8]; + const uint16_t *factors=(const uint16_t *)(range+1); + uint16_t count=range->variant; + const char *s=(const char *)(factors+count); + char *suffix, *t; + uint16_t prefixLength, i, idx; + + char c; + + /* name = prefix factorized-elements */ + + /* copy prefix */ + suffix=buffer; + prefixLength=0; + while((c=*s++)!=0) { + *suffix++=c; + ++prefixLength; + } + + /* append the suffix of the start character */ + length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count, + s, (uint32_t)start-range->start, + indexes, elementBases, elements, + suffix, (uint16_t)(sizeof(buffer)-prefixLength))); + + /* call the enumerator function with this first character */ + if(!fn(context, start, nameChoice, buffer, length)) { + return false; + } + + /* enumerate the rest of the names */ + while(++start<limit) { + /* increment the indexes in lexical order bound by the factors */ + i=count; + for (;;) { + idx=(uint16_t)(indexes[--i]+1); + if(idx<factors[i]) { + /* skip one index and its element string */ + indexes[i]=idx; + s=elements[i]; + while(*s++!=0) { + } + elements[i]=s; + break; + } else { + /* reset this index to 0 and its element string to the first one */ + indexes[i]=0; + elements[i]=elementBases[i]; + } + } + + /* to make matters a little easier, just append all elements to the suffix */ + t=suffix; + length=prefixLength; + for(i=0; i<count; ++i) { + s=elements[i]; + while((c=*s++)!=0) { + *t++=c; + ++length; + } + } + /* zero-terminate */ + *t=0; + + if(!fn(context, start, nameChoice, buffer, length)) { + return false; + } + } + break; + } + default: + /* undefined type */ + break; + } + + return true; +} + +/* + * findAlgName() is almost the same as enumAlgNames() except that it + * returns the code point for a name if it fits into the range. + * It returns 0xffff otherwise. + */ +static UChar32 +findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) { + UChar32 code; + + if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { + return 0xffff; + } + + switch(range->type) { + case 0: { + /* name = prefix hex-digits */ + const char *s=(const char *)(range+1); + char c; + + uint16_t i, count; + + /* compare prefix */ + while((c=*s++)!=0) { + if((char)c!=*otherName++) { + return 0xffff; + } + } + + /* read hexadecimal code point value */ + count=range->variant; + code=0; + for(i=0; i<count; ++i) { + c=*otherName++; + if('0'<=c && c<='9') { + code=(code<<4)|(c-'0'); + } else if('A'<=c && c<='F') { + code=(code<<4)|(c-'A'+10); + } else { + return 0xffff; + } + } + + /* does it fit into the range? */ + if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) { + return code; + } + break; + } + case 1: { + char buffer[64]; + uint16_t indexes[8]; + const char *elementBases[8], *elements[8]; + const uint16_t *factors=(const uint16_t *)(range+1); + uint16_t count=range->variant; + const char *s=(const char *)(factors+count), *t; + UChar32 start, limit; + uint16_t i, idx; + + char c; + + /* name = prefix factorized-elements */ + + /* compare prefix */ + while((c=*s++)!=0) { + if((char)c!=*otherName++) { + return 0xffff; + } + } + + start=(UChar32)range->start; + limit=(UChar32)(range->end+1); + + /* initialize the suffix elements for enumeration; indexes should all be set to 0 */ + writeFactorSuffix(factors, count, s, 0, + indexes, elementBases, elements, buffer, sizeof(buffer)); + + /* compare the first suffix */ + if(0==uprv_strcmp(otherName, buffer)) { + return start; + } + + /* enumerate and compare the rest of the suffixes */ + while(++start<limit) { + /* increment the indexes in lexical order bound by the factors */ + i=count; + for (;;) { + idx=(uint16_t)(indexes[--i]+1); + if(idx<factors[i]) { + /* skip one index and its element string */ + indexes[i]=idx; + s=elements[i]; + while(*s++!=0) {} + elements[i]=s; + break; + } else { + /* reset this index to 0 and its element string to the first one */ + indexes[i]=0; + elements[i]=elementBases[i]; + } + } + + /* to make matters a little easier, just compare all elements of the suffix */ + t=otherName; + for(i=0; i<count; ++i) { + s=elements[i]; + while((c=*s++)!=0) { + if(c!=*t++) { + s=""; /* does not match */ + i=99; + } + } + } + if(i<99 && *t==0) { + return start; + } + } + break; + } + default: + /* undefined type */ + break; + } + + return 0xffff; +} + +/* sets of name characters, maximum name lengths ---------------------------- */ + +#define SET_ADD(set, c) ((set)[(uint8_t)c>>5]|=((uint32_t)1<<((uint8_t)c&0x1f))) +#define SET_CONTAINS(set, c) (((set)[(uint8_t)c>>5]&((uint32_t)1<<((uint8_t)c&0x1f)))!=0) + +static int32_t +calcStringSetLength(uint32_t set[8], const char *s) { + int32_t length=0; + char c; + + while((c=*s++)!=0) { + SET_ADD(set, c); + ++length; + } + return length; +} + +static int32_t +calcAlgNameSetsLengths(int32_t maxNameLength) { + AlgorithmicRange *range; + uint32_t *p; + uint32_t rangeCount; + int32_t length; + + /* enumerate algorithmic ranges */ + p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); + rangeCount=*p; + range=(AlgorithmicRange *)(p+1); + while(rangeCount>0) { + switch(range->type) { + case 0: + /* name = prefix + (range->variant times) hex-digits */ + /* prefix */ + length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant; + if(length>maxNameLength) { + maxNameLength=length; + } + break; + case 1: { + /* name = prefix factorized-elements */ + const uint16_t *factors=(const uint16_t *)(range+1); + const char *s; + int32_t i, count=range->variant, factor, factorLength, maxFactorLength; + + /* prefix length */ + s=(const char *)(factors+count); + length=calcStringSetLength(gNameSet, s); + s+=length+1; /* start of factor suffixes */ + + /* get the set and maximum factor suffix length for each factor */ + for(i=0; i<count; ++i) { + maxFactorLength=0; + for(factor=factors[i]; factor>0; --factor) { + factorLength=calcStringSetLength(gNameSet, s); + s+=factorLength+1; + if(factorLength>maxFactorLength) { + maxFactorLength=factorLength; + } + } + length+=maxFactorLength; + } + + if(length>maxNameLength) { + maxNameLength=length; + } + break; + } + default: + /* unknown type */ + break; + } + + range=(AlgorithmicRange *)((uint8_t *)range+range->size); + --rangeCount; + } + return maxNameLength; +} + +static int32_t +calcExtNameSetsLengths(int32_t maxNameLength) { + int32_t i, length; + + for(i=0; i<UPRV_LENGTHOF(charCatNames); ++i) { + /* + * for each category, count the length of the category name + * plus 9= + * 2 for <> + * 1 for - + * 6 for most hex digits per code point + */ + length=9+calcStringSetLength(gNameSet, charCatNames[i]); + if(length>maxNameLength) { + maxNameLength=length; + } + } + return maxNameLength; +} + +static int32_t +calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths, + uint32_t set[8], + const uint8_t **pLine, const uint8_t *lineLimit) { + const uint8_t *line=*pLine; + int32_t length=0, tokenLength; + uint16_t c, token; + + while(line!=lineLimit && (c=*line++)!=(uint8_t)';') { + if(c>=tokenCount) { + /* implicit letter */ + SET_ADD(set, c); + ++length; + } else { + token=tokens[c]; + if(token==(uint16_t)(-2)) { + /* this is a lead byte for a double-byte token */ + c=c<<8|*line++; + token=tokens[c]; + } + if(token==(uint16_t)(-1)) { + /* explicit letter */ + SET_ADD(set, c); + ++length; + } else { + /* count token word */ + if(tokenLengths!=nullptr) { + /* use cached token length */ + tokenLength=tokenLengths[c]; + if(tokenLength==0) { + tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token); + tokenLengths[c]=(int8_t)tokenLength; + } + } else { + tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token); + } + length+=tokenLength; + } + } + } + + *pLine=line; + return length; +} + +static void +calcGroupNameSetsLengths(int32_t maxNameLength) { + uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; + + uint16_t *tokens=(uint16_t *)uCharNames+8; + uint16_t tokenCount=*tokens++; + uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset; + + int8_t *tokenLengths; + + const uint16_t *group; + const uint8_t *s, *line, *lineLimit; + + int32_t groupCount, lineNumber, length; + + tokenLengths=(int8_t *)uprv_malloc(tokenCount); + if(tokenLengths!=nullptr) { + uprv_memset(tokenLengths, 0, tokenCount); + } + + group=GET_GROUPS(uCharNames); + groupCount=*group++; + + /* enumerate all groups */ + while(groupCount>0) { + s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group); + s=expandGroupLengths(s, offsets, lengths); + + /* enumerate all lines in each group */ + for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) { + line=s+offsets[lineNumber]; + length=lengths[lineNumber]; + if(length==0) { + continue; + } + + lineLimit=line+length; + + /* read regular name */ + length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit); + if(length>maxNameLength) { + maxNameLength=length; + } + if(line==lineLimit) { + continue; + } + + /* read Unicode 1.0 name */ + length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit); + if(length>maxNameLength) { + maxNameLength=length; + } + if(line==lineLimit) { + continue; + } + + /* read ISO comment */ + /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/ + } + + group=NEXT_GROUP(group); + --groupCount; + } + + if(tokenLengths!=nullptr) { + uprv_free(tokenLengths); + } + + /* set gMax... - name length last for threading */ + gMaxNameLength=maxNameLength; +} + +static UBool +calcNameSetsLengths(UErrorCode *pErrorCode) { + static const char extChars[]="0123456789ABCDEF<>-"; + int32_t i, maxNameLength; + + if(gMaxNameLength!=0) { + return true; + } + + if(!isDataLoaded(pErrorCode)) { + return false; + } + + /* set hex digits, used in various names, and <>-, used in extended names */ + for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) { + SET_ADD(gNameSet, extChars[i]); + } + + /* set sets and lengths from algorithmic names */ + maxNameLength=calcAlgNameSetsLengths(0); + + /* set sets and lengths from extended names */ + maxNameLength=calcExtNameSetsLengths(maxNameLength); + + /* set sets and lengths from group names, set global maximum values */ + calcGroupNameSetsLengths(maxNameLength); + + return true; +} + +U_NAMESPACE_END + +/* public API --------------------------------------------------------------- */ + +U_NAMESPACE_USE + +U_CAPI int32_t U_EXPORT2 +u_charName(UChar32 code, UCharNameChoice nameChoice, + char *buffer, int32_t bufferLength, + UErrorCode *pErrorCode) { + AlgorithmicRange *algRange; + uint32_t *p; + uint32_t i; + int32_t length; + + /* check the argument values */ + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || + bufferLength<0 || (bufferLength>0 && buffer==nullptr) + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) { + return u_terminateChars(buffer, bufferLength, 0, pErrorCode); + } + + length=0; + + /* try algorithmic names first */ + p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); + i=*p; + algRange=(AlgorithmicRange *)(p+1); + while(i>0) { + if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) { + length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); + break; + } + algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); + --i; + } + + if(i==0) { + if (nameChoice == U_EXTENDED_CHAR_NAME) { + length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength); + if (!length) { + /* extended character name */ + length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength); + } + } else { + /* normal character name */ + length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); + } + } + + return u_terminateChars(buffer, bufferLength, length, pErrorCode); +} + +U_CAPI int32_t U_EXPORT2 +u_getISOComment(UChar32 /*c*/, + char *dest, int32_t destCapacity, + UErrorCode *pErrorCode) { + /* check the argument values */ + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } else if(destCapacity<0 || (destCapacity>0 && dest==nullptr)) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + return u_terminateChars(dest, destCapacity, 0, pErrorCode); +} + +U_CAPI UChar32 U_EXPORT2 +u_charFromName(UCharNameChoice nameChoice, + const char *name, + UErrorCode *pErrorCode) { + char upper[120] = {0}; + char lower[120] = {0}; + FindName findName; + AlgorithmicRange *algRange; + uint32_t *p; + uint32_t i; + UChar32 cp = 0; + char c0; + static constexpr UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */ + + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return error; + } + + if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==nullptr || *name==0) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return error; + } + + if(!isDataLoaded(pErrorCode)) { + return error; + } + + /* construct the uppercase and lowercase of the name first */ + for(i=0; i<sizeof(upper); ++i) { + if((c0=*name++)!=0) { + upper[i]=uprv_toupper(c0); + lower[i]=uprv_tolower(c0); + } else { + upper[i]=lower[i]=0; + break; + } + } + if(i==sizeof(upper)) { + /* name too long, there is no such character */ + *pErrorCode = U_ILLEGAL_CHAR_FOUND; + return error; + } + // i==strlen(name)==strlen(lower)==strlen(upper) + + /* try extended names first */ + if (lower[0] == '<') { + if (nameChoice == U_EXTENDED_CHAR_NAME && lower[--i] == '>') { + // Parse a string like "<category-HHHH>" where HHHH is a hex code point. + uint32_t limit = i; + while (i >= 3 && lower[--i] != '-') {} + + // There should be 1 to 8 hex digits. + int32_t hexLength = limit - (i + 1); + if (i >= 2 && lower[i] == '-' && 1 <= hexLength && hexLength <= 8) { + uint32_t cIdx; + + lower[i] = 0; + + for (++i; i < limit; ++i) { + if (lower[i] >= '0' && lower[i] <= '9') { + cp = (cp << 4) + lower[i] - '0'; + } else if (lower[i] >= 'a' && lower[i] <= 'f') { + cp = (cp << 4) + lower[i] - 'a' + 10; + } else { + *pErrorCode = U_ILLEGAL_CHAR_FOUND; + return error; + } + // Prevent signed-integer overflow and out-of-range code points. + if (cp > UCHAR_MAX_VALUE) { + *pErrorCode = U_ILLEGAL_CHAR_FOUND; + return error; + } + } + + /* Now validate the category name. + We could use a binary search, or a trie, if + we really wanted to. */ + uint8_t cat = getCharCat(cp); + for (lower[i] = 0, cIdx = 0; cIdx < UPRV_LENGTHOF(charCatNames); ++cIdx) { + + if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) { + if (cat == cIdx) { + return cp; + } + break; + } + } + } + } + + *pErrorCode = U_ILLEGAL_CHAR_FOUND; + return error; + } + + /* try algorithmic names now */ + p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); + i=*p; + algRange=(AlgorithmicRange *)(p+1); + while(i>0) { + if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) { + return cp; + } + algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); + --i; + } + + /* normal character name */ + findName.otherName=upper; + findName.code=error; + enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice); + if (findName.code == error) { + *pErrorCode = U_ILLEGAL_CHAR_FOUND; + } + return findName.code; +} + +U_CAPI void U_EXPORT2 +u_enumCharNames(UChar32 start, UChar32 limit, + UEnumCharNamesFn *fn, + void *context, + UCharNameChoice nameChoice, + UErrorCode *pErrorCode) { + AlgorithmicRange *algRange; + uint32_t *p; + uint32_t i; + + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return; + } + + if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==nullptr) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + if((uint32_t) limit > UCHAR_MAX_VALUE + 1) { + limit = UCHAR_MAX_VALUE + 1; + } + if((uint32_t)start>=(uint32_t)limit) { + return; + } + + if(!isDataLoaded(pErrorCode)) { + return; + } + + /* interleave the data-driven ones with the algorithmic ones */ + /* iterate over all algorithmic ranges; assume that they are in ascending order */ + p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); + i=*p; + algRange=(AlgorithmicRange *)(p+1); + while(i>0) { + /* enumerate the character names before the current algorithmic range */ + /* here: start<limit */ + if((uint32_t)start<algRange->start) { + if((uint32_t)limit<=algRange->start) { + enumNames(uCharNames, start, limit, fn, context, nameChoice); + return; + } + if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) { + return; + } + start=(UChar32)algRange->start; + } + /* enumerate the character names in the current algorithmic range */ + /* here: algRange->start<=start<limit */ + if((uint32_t)start<=algRange->end) { + if((uint32_t)limit<=(algRange->end+1)) { + enumAlgNames(algRange, start, limit, fn, context, nameChoice); + return; + } + if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) { + return; + } + start=(UChar32)algRange->end+1; + } + /* continue to the next algorithmic range (here: start<limit) */ + algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); + --i; + } + /* enumerate the character names after the last algorithmic range */ + enumNames(uCharNames, start, limit, fn, context, nameChoice); +} + +U_CAPI int32_t U_EXPORT2 +uprv_getMaxCharNameLength() { + UErrorCode errorCode=U_ZERO_ERROR; + if(calcNameSetsLengths(&errorCode)) { + return gMaxNameLength; + } else { + return 0; + } +} + +/** + * Converts the char set cset into a Unicode set uset. + * @param cset Set of 256 bit flags corresponding to a set of chars. + * @param uset USet to receive characters. Existing contents are deleted. + */ +static void +charSetToUSet(uint32_t cset[8], const USetAdder *sa) { + char16_t us[256]; + char cs[256]; + + int32_t i, length; + UErrorCode errorCode; + + errorCode=U_ZERO_ERROR; + + if(!calcNameSetsLengths(&errorCode)) { + return; + } + + /* build a char string with all chars that are used in character names */ + length=0; + for(i=0; i<256; ++i) { + if(SET_CONTAINS(cset, i)) { + cs[length++]=(char)i; + } + } + + /* convert the char string to a char16_t string */ + u_charsToUChars(cs, us, length); + + /* add each char16_t to the USet */ + for(i=0; i<length; ++i) { + if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (char16_t)0 */ + sa->add(sa->set, us[i]); + } + } +} + +/** + * Fills set with characters that are used in Unicode character names. + * @param set USet to receive characters. + */ +U_CAPI void U_EXPORT2 +uprv_getCharNameCharacters(const USetAdder *sa) { + charSetToUSet(gNameSet, sa); +} + +/* data swapping ------------------------------------------------------------ */ + +/* + * The token table contains non-negative entries for token bytes, + * and -1 for bytes that represent themselves in the data file's charset. + * -2 entries are used for lead bytes. + * + * Direct bytes (-1 entries) must be translated from the input charset family + * to the output charset family. + * makeTokenMap() writes a permutation mapping for this. + * Use it once for single-/lead-byte tokens and once more for all trail byte + * tokens. (';' is an unused trail byte marked with -1.) + */ +static void +makeTokenMap(const UDataSwapper *ds, + int16_t tokens[], uint16_t tokenCount, + uint8_t map[256], + UErrorCode *pErrorCode) { + UBool usedOutChar[256]; + uint16_t i, j; + uint8_t c1, c2; + + if(U_FAILURE(*pErrorCode)) { + return; + } + + if(ds->inCharset==ds->outCharset) { + /* Same charset family: identity permutation */ + for(i=0; i<256; ++i) { + map[i]=(uint8_t)i; + } + } else { + uprv_memset(map, 0, 256); + uprv_memset(usedOutChar, 0, 256); + + if(tokenCount>256) { + tokenCount=256; + } + + /* set the direct bytes (byte 0 always maps to itself) */ + for(i=1; i<tokenCount; ++i) { + if(tokens[i]==-1) { + /* convert the direct byte character */ + c1=(uint8_t)i; + ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n", + i, ds->inCharset); + return; + } + + /* enter the converted character into the map and mark it used */ + map[c1]=c2; + usedOutChar[c2]=true; + } + } + + /* set the mappings for the rest of the permutation */ + for(i=j=1; i<tokenCount; ++i) { + /* set mappings that were not set for direct bytes */ + if(map[i]==0) { + /* set an output byte value that was not used as an output byte above */ + while(usedOutChar[j]) { + ++j; + } + map[i]=(uint8_t)j++; + } + } + + /* + * leave mappings at tokenCount and above unset if tokenCount<256 + * because they won't be used + */ + } +} + +U_CAPI int32_t U_EXPORT2 +uchar_swapNames(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const UDataInfo *pInfo; + int32_t headerSize; + + const uint8_t *inBytes; + uint8_t *outBytes; + + uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset, + offset, i, count, stringsCount; + + const AlgorithmicRange *inRange; + AlgorithmicRange *outRange; + + /* udata_swapDataHeader checks the arguments */ + headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + /* check data format and format version */ + pInfo=(const UDataInfo *)((const char *)inData+4); + if(!( + pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */ + pInfo->dataFormat[1]==0x6e && + pInfo->dataFormat[2]==0x61 && + pInfo->dataFormat[3]==0x6d && + pInfo->formatVersion[0]==1 + )) { + udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + + inBytes=(const uint8_t *)inData+headerSize; + outBytes=(outData == nullptr) ? nullptr : (uint8_t *)outData+headerSize; + if(length<0) { + algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]); + } else { + length-=headerSize; + if( length<20 || + (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3])) + ) { + udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + } + + if(length<0) { + /* preflighting: iterate through algorithmic ranges */ + offset=algNamesOffset; + count=ds->readUInt32(*((const uint32_t *)(inBytes+offset))); + offset+=4; + + for(i=0; i<count; ++i) { + inRange=(const AlgorithmicRange *)(inBytes+offset); + offset+=ds->readUInt16(inRange->size); + } + } else { + /* swap data */ + const uint16_t *p; + uint16_t *q, *temp; + + int16_t tokens[512]; + uint16_t tokenCount; + + uint8_t map[256], trailMap[256]; + + /* copy the data for inaccessible bytes */ + if(inBytes!=outBytes) { + uprv_memcpy(outBytes, inBytes, length); + } + + /* the initial 4 offsets first */ + tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]); + groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]); + groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]); + ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode); + + /* + * now the tokens table + * it needs to be permutated along with the compressed name strings + */ + p=(const uint16_t *)(inBytes+16); + q=(uint16_t *)(outBytes+16); + + /* read and swap the tokenCount */ + tokenCount=ds->readUInt16(*p); + ds->swapArray16(ds, p, 2, q, pErrorCode); + ++p; + ++q; + + /* read the first 512 tokens and make the token maps */ + if(tokenCount<=512) { + count=tokenCount; + } else { + count=512; + } + for(i=0; i<count; ++i) { + tokens[i]=udata_readInt16(ds, p[i]); + } + for(; i<512; ++i) { + tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */ + } + makeTokenMap(ds, tokens, tokenCount, map, pErrorCode); + makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return 0; + } + + /* + * swap and permutate the tokens + * go through a temporary array to support in-place swapping + */ + temp=(uint16_t *)uprv_malloc(tokenCount*2); + if(temp==nullptr) { + udata_printError(ds, "out of memory swapping %u unames.icu tokens\n", + tokenCount); + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + return 0; + } + + /* swap and permutate single-/lead-byte tokens */ + for(i=0; i<tokenCount && i<256; ++i) { + ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode); + } + + /* swap and permutate trail-byte tokens */ + for(; i<tokenCount; ++i) { + ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode); + } + + /* copy the result into the output and free the temporary array */ + uprv_memcpy(q, temp, tokenCount*2); + uprv_free(temp); + + /* + * swap the token strings but not a possible padding byte after + * the terminating NUL of the last string + */ + udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset), + outBytes+tokenStringOffset, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + udata_printError(ds, "uchar_swapNames(token strings) failed\n"); + return 0; + } + + /* swap the group table */ + count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset))); + ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2), + outBytes+groupsOffset, pErrorCode); + + /* + * swap the group strings + * swap the string bytes but not the nibble-encoded string lengths + */ + if(ds->inCharset!=ds->outCharset) { + uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1]; + + const uint8_t *inStrings, *nextInStrings; + uint8_t *outStrings; + + uint8_t c; + + inStrings=inBytes+groupStringOffset; + outStrings=outBytes+groupStringOffset; + + stringsCount=algNamesOffset-groupStringOffset; + + /* iterate through string groups until only a few padding bytes are left */ + while(stringsCount>32) { + nextInStrings=expandGroupLengths(inStrings, offsets, lengths); + + /* move past the length bytes */ + stringsCount-=(uint32_t)(nextInStrings-inStrings); + outStrings+=nextInStrings-inStrings; + inStrings=nextInStrings; + + count=offsets[31]+lengths[31]; /* total number of string bytes in this group */ + stringsCount-=count; + + /* swap the string bytes using map[] and trailMap[] */ + while(count>0) { + c=*inStrings++; + *outStrings++=map[c]; + if(tokens[c]!=-2) { + --count; + } else { + /* token lead byte: swap the trail byte, too */ + *outStrings++=trailMap[*inStrings++]; + count-=2; + } + } + } + } + + /* swap the algorithmic ranges */ + offset=algNamesOffset; + count=ds->readUInt32(*((const uint32_t *)(inBytes+offset))); + ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode); + offset+=4; + + for(i=0; i<count; ++i) { + if(offset>(uint32_t)length) { + udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n", + length, i); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + inRange=(const AlgorithmicRange *)(inBytes+offset); + outRange=(AlgorithmicRange *)(outBytes+offset); + offset+=ds->readUInt16(inRange->size); + + ds->swapArray32(ds, inRange, 8, outRange, pErrorCode); + ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode); + switch(inRange->type) { + case 0: + /* swap prefix string */ + ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)), + outRange+1, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n", + i); + return 0; + } + break; + case 1: + { + /* swap factors and the prefix and factor strings */ + uint32_t factorsCount; + + factorsCount=inRange->variant; + p=(const uint16_t *)(inRange+1); + q=(uint16_t *)(outRange+1); + ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode); + + /* swap the strings, up to the last terminating NUL */ + p+=factorsCount; + q+=factorsCount; + stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p); + while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) { + --stringsCount; + } + ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode); + } + break; + default: + udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n", + inRange->type, i); + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + } + } + + return headerSize+(int32_t)offset; +} + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ |