diff options
Diffstat (limited to 'intl/icu/source/tools/gennorm2/extradata.cpp')
-rw-r--r-- | intl/icu/source/tools/gennorm2/extradata.cpp | 253 |
1 files changed, 253 insertions, 0 deletions
diff --git a/intl/icu/source/tools/gennorm2/extradata.cpp b/intl/icu/source/tools/gennorm2/extradata.cpp new file mode 100644 index 0000000000..b6c15adc7a --- /dev/null +++ b/intl/icu/source/tools/gennorm2/extradata.cpp @@ -0,0 +1,253 @@ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// extradata.cpp +// created: 2017jun04 Markus W. Scherer +// (pulled out of n2builder.cpp) + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_NORMALIZATION + +#include <stdio.h> +#include <stdlib.h> +#include "unicode/errorcode.h" +#include "unicode/unistr.h" +#include "unicode/utf16.h" +#include "extradata.h" +#include "normalizer2impl.h" +#include "norms.h" +#include "toolutil.h" +#include "utrie2.h" +#include "uvectr32.h" + +U_NAMESPACE_BEGIN + +ExtraData::ExtraData(Norms &n, UBool fast) : + Norms::Enumerator(n), + yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions + yesNoMappingsAndCompositions(1000, (UChar32)0, 1), // 0=Hangul LV, 1=start of normal data + yesNoMappingsOnly(1000, (UChar32)0, 1), // 0=Hangul LVT, 1=start of normal data + optimizeFast(fast) { + // Hangul LV algorithmically decomposes to two Jamo. + // Some code may harmlessly read this firstUnit. + yesNoMappingsAndCompositions.setCharAt(0, 2); + // Hangul LVT algorithmically decomposes to three Jamo. + // Some code may harmlessly read this firstUnit. + yesNoMappingsOnly.setCharAt(0, 3); +} + +int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) { + UnicodeString &m=*norm.mapping; + int32_t length=m.length(); + // Write the mapping & raw mapping extraData. + int32_t firstUnit=length|(norm.trailCC<<8); + int32_t preMappingLength=0; + if(norm.rawMapping!=NULL) { + UnicodeString &rm=*norm.rawMapping; + int32_t rmLength=rm.length(); + if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) { + fprintf(stderr, + "gennorm2 error: " + "raw mapping for U+%04lX longer than maximum of %d\n", + (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); + exit(U_INVALID_FORMAT_ERROR); + } + UChar rm0=rm.charAt(0); + if( rmLength==length-1 && + // 99: overlong substring lengths get pinned to remainder lengths anyway + 0==rm.compare(1, 99, m, 2, 99) && + rm0>Normalizer2Impl::MAPPING_LENGTH_MASK + ) { + // Compression: + // rawMapping=rm0+mapping.substring(2) -> store only rm0 + // + // The raw mapping is the same as the final mapping after replacing + // the final mapping's first two code units with the raw mapping's first one. + // In this case, we store only that first unit, rm0. + // This helps with a few hundred mappings. + dataString.append(rm0); + preMappingLength=1; + } else { + // Store the raw mapping with its length. + dataString.append(rm); + dataString.append((UChar)rmLength); + preMappingLength=rmLength+1; + } + firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING; + } + int32_t cccLccc=norm.cc|(norm.leadCC<<8); + if(cccLccc!=0) { + dataString.append((UChar)cccLccc); + ++preMappingLength; + firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD; + } + dataString.append((UChar)firstUnit); + dataString.append(m); + return preMappingLength; +} + +int32_t ExtraData::writeNoNoMapping(UChar32 c, const Norm &norm, + UnicodeString &dataString, + Hashtable &previousMappings) { + UnicodeString newMapping; + int32_t offset=writeMapping(c, norm, newMapping); + int32_t previousOffset=previousMappings.geti(newMapping); + if(previousOffset!=0) { + // Duplicate, point to the identical mapping that has already been stored. + offset=previousOffset-1; + } else { + // Append this new mapping and + // enter it into the hashtable, avoiding value 0 which is "not found". + offset=dataString.length()+offset; + dataString.append(newMapping); + IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()"); + previousMappings.puti(newMapping, offset+1, errorCode); + } + return offset; +} + +UBool ExtraData::setNoNoDelta(UChar32 c, Norm &norm) const { + // Try a compact, algorithmic encoding to a single compYesAndZeroCC code point. + // Do not map from ASCII to non-ASCII. + if(norm.mappingCP>=0 && + !(c<=0x7f && norm.mappingCP>0x7f) && + norms.getNormRef(norm.mappingCP).type<Norm::NO_NO_COMP_YES) { + int32_t delta=norm.mappingCP-c; + if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) { + norm.type=Norm::NO_NO_DELTA; + norm.offset=delta; + return TRUE; + } + } + return FALSE; +} + +void ExtraData::writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString) { + if(norm.cc!=0) { + fprintf(stderr, + "gennorm2 error: " + "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n", + (long)c); + exit(U_INVALID_FORMAT_ERROR); + } + int32_t length; + const CompositionPair *pairs=norm.getCompositionPairs(length); + for(int32_t i=0; i<length; ++i) { + const CompositionPair &pair=pairs[i]; + // 22 bits for the composite character and whether it combines forward. + UChar32 compositeAndFwd=pair.composite<<1; + if(norms.getNormRef(pair.composite).compositions!=NULL) { + compositeAndFwd|=1; // The composite character also combines-forward. + } + // Encode most pairs in two units and some in three. + int32_t firstUnit, secondUnit, thirdUnit; + if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) { + if(compositeAndFwd<=0xffff) { + firstUnit=pair.trail<<1; + secondUnit=compositeAndFwd; + thirdUnit=-1; + } else { + firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE; + secondUnit=compositeAndFwd>>16; + thirdUnit=compositeAndFwd; + } + } else { + firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+ + (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))| + Normalizer2Impl::COMP_1_TRIPLE; + secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)| + (compositeAndFwd>>16); + thirdUnit=compositeAndFwd; + } + // Set the high bit of the first unit if this is the last composition pair. + if(i==(length-1)) { + firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE; + } + dataString.append((UChar)firstUnit).append((UChar)secondUnit); + if(thirdUnit>=0) { + dataString.append((UChar)thirdUnit); + } + } +} + +void ExtraData::rangeHandler(UChar32 start, UChar32 end, Norm &norm) { + if(start!=end) { + fprintf(stderr, + "gennorm2 error: unexpected shared data for " + "multiple code points U+%04lX..U+%04lX\n", + (long)start, (long)end); + exit(U_INTERNAL_PROGRAM_ERROR); + } + if(norm.error!=nullptr) { + fprintf(stderr, "gennorm2 error: U+%04lX %s\n", (long)start, norm.error); + exit(U_INVALID_FORMAT_ERROR); + } + writeExtraData(start, norm); +} + +// Ticket #13342 - Disable optimizations on MSVC for this function as a workaround. +#if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210)) +#pragma optimize( "", off ) +#endif + +void ExtraData::writeExtraData(UChar32 c, Norm &norm) { + switch(norm.type) { + case Norm::INERT: + break; // no extra data + case Norm::YES_YES_COMBINES_FWD: + norm.offset=yesYesCompositions.length(); + writeCompositions(c, norm, yesYesCompositions); + break; + case Norm::YES_NO_COMBINES_FWD: + norm.offset=yesNoMappingsAndCompositions.length()+ + writeMapping(c, norm, yesNoMappingsAndCompositions); + writeCompositions(c, norm, yesNoMappingsAndCompositions); + break; + case Norm::YES_NO_MAPPING_ONLY: + norm.offset=yesNoMappingsOnly.length()+ + writeMapping(c, norm, yesNoMappingsOnly); + break; + case Norm::NO_NO_COMP_YES: + if(!optimizeFast && setNoNoDelta(c, norm)) { + break; + } + norm.offset=writeNoNoMapping(c, norm, noNoMappingsCompYes, previousNoNoMappingsCompYes); + break; + case Norm::NO_NO_COMP_BOUNDARY_BEFORE: + if(!optimizeFast && setNoNoDelta(c, norm)) { + break; + } + norm.offset=writeNoNoMapping( + c, norm, noNoMappingsCompBoundaryBefore, previousNoNoMappingsCompBoundaryBefore); + break; + case Norm::NO_NO_COMP_NO_MAYBE_CC: + norm.offset=writeNoNoMapping( + c, norm, noNoMappingsCompNoMaybeCC, previousNoNoMappingsCompNoMaybeCC); + break; + case Norm::NO_NO_EMPTY: + // There can be multiple extra data entries for mappings to the empty string + // if they have different raw mappings. + norm.offset=writeNoNoMapping(c, norm, noNoMappingsEmpty, previousNoNoMappingsEmpty); + break; + case Norm::MAYBE_YES_COMBINES_FWD: + norm.offset=maybeYesCompositions.length(); + writeCompositions(c, norm, maybeYesCompositions); + break; + case Norm::MAYBE_YES_SIMPLE: + break; // no extra data + case Norm::YES_YES_WITH_CC: + break; // no extra data + default: // Should not occur. + exit(U_INTERNAL_PROGRAM_ERROR); + } +} + +// Ticket #13342 - Turn optimization back on. +#if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210)) +#pragma optimize( "", on ) +#endif + +U_NAMESPACE_END + +#endif // #if !UCONFIG_NO_NORMALIZATION |