summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/tools/gennorm2/extradata.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/tools/gennorm2/extradata.cpp')
-rw-r--r--intl/icu/source/tools/gennorm2/extradata.cpp254
1 files changed, 254 insertions, 0 deletions
diff --git a/intl/icu/source/tools/gennorm2/extradata.cpp b/intl/icu/source/tools/gennorm2/extradata.cpp
new file mode 100644
index 0000000000..f31bc418ea
--- /dev/null
+++ b/intl/icu/source/tools/gennorm2/extradata.cpp
@@ -0,0 +1,254 @@
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// extradata.cpp
+// created: 2017jun04 Markus W. Scherer
+// (pulled out of n2builder.cpp)
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_NORMALIZATION
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "unicode/errorcode.h"
+#include "unicode/unistr.h"
+#include "unicode/utf16.h"
+#include "extradata.h"
+#include "normalizer2impl.h"
+#include "norms.h"
+#include "toolutil.h"
+#include "utrie2.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+ExtraData::ExtraData(Norms &n, UBool fast) :
+ Norms::Enumerator(n),
+ yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions
+ yesNoMappingsAndCompositions(1000, (UChar32)0, 1), // 0=Hangul LV, 1=start of normal data
+ yesNoMappingsOnly(1000, (UChar32)0, 1), // 0=Hangul LVT, 1=start of normal data
+ optimizeFast(fast) {
+ // Hangul LV algorithmically decomposes to two Jamo.
+ // Some code may harmlessly read this firstUnit.
+ yesNoMappingsAndCompositions.setCharAt(0, 2);
+ // Hangul LVT algorithmically decomposes to three Jamo.
+ // Some code may harmlessly read this firstUnit.
+ yesNoMappingsOnly.setCharAt(0, 3);
+}
+
+int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) {
+ UnicodeString &m=*norm.mapping;
+ int32_t length=m.length();
+ // Write the mapping & raw mapping extraData.
+ int32_t firstUnit=length|(norm.trailCC<<8);
+ int32_t preMappingLength=0;
+ if(norm.rawMapping!=nullptr) {
+ UnicodeString &rm=*norm.rawMapping;
+ int32_t rmLength=rm.length();
+ if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
+ fprintf(stderr,
+ "gennorm2 error: "
+ "raw mapping for U+%04lX longer than maximum of %d\n",
+ (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
+ exit(U_INVALID_FORMAT_ERROR);
+ }
+ char16_t rm0=rm.charAt(0);
+ if( rmLength==length-1 &&
+ // 99: overlong substring lengths get pinned to remainder lengths anyway
+ 0==rm.compare(1, 99, m, 2, 99) &&
+ rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
+ ) {
+ // Compression:
+ // rawMapping=rm0+mapping.substring(2) -> store only rm0
+ //
+ // The raw mapping is the same as the final mapping after replacing
+ // the final mapping's first two code units with the raw mapping's first one.
+ // In this case, we store only that first unit, rm0.
+ // This helps with a few hundred mappings.
+ dataString.append(rm0);
+ preMappingLength=1;
+ } else {
+ // Store the raw mapping with its length.
+ dataString.append(rm);
+ dataString.append((char16_t)rmLength);
+ preMappingLength=rmLength+1;
+ }
+ firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
+ }
+ int32_t cccLccc=norm.cc|(norm.leadCC<<8);
+ if(cccLccc!=0) {
+ dataString.append((char16_t)cccLccc);
+ ++preMappingLength;
+ firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
+ }
+ dataString.append((char16_t)firstUnit);
+ dataString.append(m);
+ return preMappingLength;
+}
+
+int32_t ExtraData::writeNoNoMapping(UChar32 c, const Norm &norm,
+ UnicodeString &dataString,
+ Hashtable &previousMappings) {
+ UnicodeString newMapping;
+ int32_t offset=writeMapping(c, norm, newMapping);
+ UBool found=false;
+ int32_t previousOffset=previousMappings.getiAndFound(newMapping, found);
+ if(found) {
+ // Duplicate, point to the identical mapping that has already been stored.
+ offset=previousOffset;
+ } else {
+ // Append this new mapping and
+ // enter it into the hashtable, avoiding value 0 which is "not found".
+ offset=dataString.length()+offset;
+ dataString.append(newMapping);
+ IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.putiAllowZero()");
+ previousMappings.putiAllowZero(newMapping, offset, errorCode);
+ }
+ return offset;
+}
+
+UBool ExtraData::setNoNoDelta(UChar32 c, Norm &norm) const {
+ // Try a compact, algorithmic encoding to a single compYesAndZeroCC code point.
+ // Do not map from ASCII to non-ASCII.
+ if(norm.mappingCP>=0 &&
+ !(c<=0x7f && norm.mappingCP>0x7f) &&
+ norms.getNormRef(norm.mappingCP).type<Norm::NO_NO_COMP_YES) {
+ int32_t delta=norm.mappingCP-c;
+ if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
+ norm.type=Norm::NO_NO_DELTA;
+ norm.offset=delta;
+ return true;
+ }
+ }
+ return false;
+}
+
+void ExtraData::writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString) {
+ if(norm.cc!=0) {
+ fprintf(stderr,
+ "gennorm2 error: "
+ "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
+ (long)c);
+ exit(U_INVALID_FORMAT_ERROR);
+ }
+ int32_t length;
+ const CompositionPair *pairs=norm.getCompositionPairs(length);
+ for(int32_t i=0; i<length; ++i) {
+ const CompositionPair &pair=pairs[i];
+ // 22 bits for the composite character and whether it combines forward.
+ UChar32 compositeAndFwd=pair.composite<<1;
+ if(norms.getNormRef(pair.composite).compositions!=nullptr) {
+ compositeAndFwd|=1; // The composite character also combines-forward.
+ }
+ // Encode most pairs in two units and some in three.
+ int32_t firstUnit, secondUnit, thirdUnit;
+ if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
+ if(compositeAndFwd<=0xffff) {
+ firstUnit=pair.trail<<1;
+ secondUnit=compositeAndFwd;
+ thirdUnit=-1;
+ } else {
+ firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
+ secondUnit=compositeAndFwd>>16;
+ thirdUnit=compositeAndFwd;
+ }
+ } else {
+ firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
+ (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
+ Normalizer2Impl::COMP_1_TRIPLE;
+ secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
+ (compositeAndFwd>>16);
+ thirdUnit=compositeAndFwd;
+ }
+ // Set the high bit of the first unit if this is the last composition pair.
+ if(i==(length-1)) {
+ firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
+ }
+ dataString.append((char16_t)firstUnit).append((char16_t)secondUnit);
+ if(thirdUnit>=0) {
+ dataString.append((char16_t)thirdUnit);
+ }
+ }
+}
+
+void ExtraData::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
+ if(start!=end) {
+ fprintf(stderr,
+ "gennorm2 error: unexpected shared data for "
+ "multiple code points U+%04lX..U+%04lX\n",
+ (long)start, (long)end);
+ exit(U_INTERNAL_PROGRAM_ERROR);
+ }
+ if(norm.error!=nullptr) {
+ fprintf(stderr, "gennorm2 error: U+%04lX %s\n", (long)start, norm.error);
+ exit(U_INVALID_FORMAT_ERROR);
+ }
+ writeExtraData(start, norm);
+}
+
+// Ticket #13342 - Disable optimizations on MSVC for this function as a workaround.
+#if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
+#pragma optimize( "", off )
+#endif
+
+void ExtraData::writeExtraData(UChar32 c, Norm &norm) {
+ switch(norm.type) {
+ case Norm::INERT:
+ break; // no extra data
+ case Norm::YES_YES_COMBINES_FWD:
+ norm.offset=yesYesCompositions.length();
+ writeCompositions(c, norm, yesYesCompositions);
+ break;
+ case Norm::YES_NO_COMBINES_FWD:
+ norm.offset=yesNoMappingsAndCompositions.length()+
+ writeMapping(c, norm, yesNoMappingsAndCompositions);
+ writeCompositions(c, norm, yesNoMappingsAndCompositions);
+ break;
+ case Norm::YES_NO_MAPPING_ONLY:
+ norm.offset=yesNoMappingsOnly.length()+
+ writeMapping(c, norm, yesNoMappingsOnly);
+ break;
+ case Norm::NO_NO_COMP_YES:
+ if(!optimizeFast && setNoNoDelta(c, norm)) {
+ break;
+ }
+ norm.offset=writeNoNoMapping(c, norm, noNoMappingsCompYes, previousNoNoMappingsCompYes);
+ break;
+ case Norm::NO_NO_COMP_BOUNDARY_BEFORE:
+ if(!optimizeFast && setNoNoDelta(c, norm)) {
+ break;
+ }
+ norm.offset=writeNoNoMapping(
+ c, norm, noNoMappingsCompBoundaryBefore, previousNoNoMappingsCompBoundaryBefore);
+ break;
+ case Norm::NO_NO_COMP_NO_MAYBE_CC:
+ norm.offset=writeNoNoMapping(
+ c, norm, noNoMappingsCompNoMaybeCC, previousNoNoMappingsCompNoMaybeCC);
+ break;
+ case Norm::NO_NO_EMPTY:
+ // There can be multiple extra data entries for mappings to the empty string
+ // if they have different raw mappings.
+ norm.offset=writeNoNoMapping(c, norm, noNoMappingsEmpty, previousNoNoMappingsEmpty);
+ break;
+ case Norm::MAYBE_YES_COMBINES_FWD:
+ norm.offset=maybeYesCompositions.length();
+ writeCompositions(c, norm, maybeYesCompositions);
+ break;
+ case Norm::MAYBE_YES_SIMPLE:
+ break; // no extra data
+ case Norm::YES_YES_WITH_CC:
+ break; // no extra data
+ default: // Should not occur.
+ exit(U_INTERNAL_PROGRAM_ERROR);
+ }
+}
+
+// Ticket #13342 - Turn optimization back on.
+#if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
+#pragma optimize( "", on )
+#endif
+
+U_NAMESPACE_END
+
+#endif // #if !UCONFIG_NO_NORMALIZATION