From 2aa4a82499d4becd2284cdb482213d541b8804dd Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 28 Apr 2024 16:29:10 +0200 Subject: Adding upstream version 86.0.1. Signed-off-by: Daniel Baumann --- intl/icu/source/tools/gennorm2/norms.cpp | 324 +++++++++++++++++++++++++++++++ 1 file changed, 324 insertions(+) create mode 100644 intl/icu/source/tools/gennorm2/norms.cpp (limited to 'intl/icu/source/tools/gennorm2/norms.cpp') diff --git a/intl/icu/source/tools/gennorm2/norms.cpp b/intl/icu/source/tools/gennorm2/norms.cpp new file mode 100644 index 0000000000..96692f233c --- /dev/null +++ b/intl/icu/source/tools/gennorm2/norms.cpp @@ -0,0 +1,324 @@ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// norms.cpp +// created: 2017jun04 Markus W. Scherer +// (pulled out of n2builder.cpp) + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_NORMALIZATION + +#include +#include +#include "unicode/errorcode.h" +#include "unicode/umutablecptrie.h" +#include "unicode/unistr.h" +#include "unicode/utf16.h" +#include "normalizer2impl.h" +#include "norms.h" +#include "toolutil.h" +#include "uvectr32.h" + +U_NAMESPACE_BEGIN + +void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) { + if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) { + if(cc==0) { + fLastStarterIndex=fLength; + } + fArray[fLength++]=(c<<8)|cc; + return; + } + // Let this character bubble back to its canonical order. + int32_t i=fLength-1; + while(i>fLastStarterIndex && ccAt(i)>cc) { + --i; + } + ++i; // after the last starter or prevCC<=cc + // Move this and the following characters forward one to make space. + for(int32_t j=fLength; itype=Norm::INERT; +} + +Norms::~Norms() { + umutablecptrie_close(normTrie); + int32_t normsLength=utm_countItems(normMem); + for(int32_t i=1; i=2) { + int32_t length; + const CompositionPair *pairs=norm.getCompositionPairs(length); + for(int32_t i=0; i= 0) { + if (i > 0) { + e.rangeHandler(start, end, norms[i]); + } + start = end + 1; + } +} + +Norms::Enumerator::~Enumerator() {} + +void CompositionBuilder::rangeHandler(UChar32 start, UChar32 end, Norm &norm) { + if(norm.mappingType!=Norm::ROUND_TRIP) { return; } + if(start!=end) { + fprintf(stderr, + "gennorm2 error: same round-trip mapping for " + "more than 1 code point U+%04lX..U+%04lX\n", + (long)start, (long)end); + exit(U_INVALID_FORMAT_ERROR); + } + if(norm.cc!=0) { + fprintf(stderr, + "gennorm2 error: " + "U+%04lX has a round-trip mapping and ccc!=0, " + "not possible in Unicode normalization\n", + (long)start); + exit(U_INVALID_FORMAT_ERROR); + } + // setRoundTripMapping() ensured that there are exactly two code points. + const UnicodeString &m=*norm.mapping; + UChar32 lead=m.char32At(0); + UChar32 trail=m.char32At(m.length()-1); + if(norms.getCC(lead)!=0) { + fprintf(stderr, + "gennorm2 error: " + "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, " + "not possible in Unicode normalization\n", + (long)start, (long)lead); + exit(U_INVALID_FORMAT_ERROR); + } + // Flag for trailing character. + norms.createNorm(trail)->combinesBack=TRUE; + // Insert (trail, composite) pair into compositions list for the lead character. + IcuToolErrorCode errorCode("gennorm2/addComposition()"); + Norm *leadNorm=norms.createNorm(lead); + UVector32 *compositions=leadNorm->compositions; + int32_t i; + if(compositions==nullptr) { + compositions=leadNorm->compositions=new UVector32(errorCode); + i=0; // "insert" the first pair at index 0 + } else { + // Insertion sort, and check for duplicate trail characters. + int32_t length; + const CompositionPair *pairs=leadNorm->getCompositionPairs(length); + for(i=0; iinsertElementAt(trail, 2*i, errorCode); + compositions->insertElementAt(start, 2*i+1, errorCode); +} + +void Decomposer::rangeHandler(UChar32 start, UChar32 end, Norm &norm) { + if(!norm.hasMapping()) { return; } + const UnicodeString &m=*norm.mapping; + UnicodeString *decomposed=nullptr; + const UChar *s=toUCharPtr(m.getBuffer()); + int32_t length=m.length(); + int32_t prev, i=0; + UChar32 c; + while(ichar32At(cNorm.mapping->length()-1); + uint8_t cTrailCC=norms.getCC(cTrailChar); + if(cTrailCC>myTrailCC) { + fprintf(stderr, + "gennorm2 error: " + "U+%04lX's round-trip mapping's starter " + "U+%04lX decomposes and the " + "inner/earlier tccc=%hu > outer/following tccc=%hu, " + "not possible in Unicode normalization\n", + (long)start, (long)c, + (short)cTrailCC, (short)myTrailCC); + exit(U_INVALID_FORMAT_ERROR); + } + } else { + fprintf(stderr, + "gennorm2 error: " + "U+%04lX's round-trip mapping's non-starter " + "U+%04lX decomposes, " + "not possible in Unicode normalization\n", + (long)start, (long)c); + exit(U_INVALID_FORMAT_ERROR); + } + } + if(decomposed==nullptr) { + decomposed=new UnicodeString(m, 0, prev); + } + decomposed->append(*cNorm.mapping); + } else if(Hangul::isHangul(c)) { + UChar buffer[3]; + int32_t hangulLength=Hangul::decompose(c, buffer); + if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) { + fprintf(stderr, + "gennorm2 error: " + "U+%04lX's round-trip mapping's non-starter " + "U+%04lX decomposes, " + "not possible in Unicode normalization\n", + (long)start, (long)c); + exit(U_INVALID_FORMAT_ERROR); + } + if(decomposed==nullptr) { + decomposed=new UnicodeString(m, 0, prev); + } + decomposed->append(buffer, hangulLength); + } else if(decomposed!=nullptr) { + decomposed->append(m, prev, i-prev); + } + } + if(decomposed!=nullptr) { + if(norm.rawMapping==nullptr) { + // Remember the original mapping when decomposing recursively. + norm.rawMapping=norm.mapping; + } else { + delete norm.mapping; + } + norm.mapping=decomposed; + // Not norm.setMappingCP(); because the original mapping + // is most likely to be encodable as a delta. + didDecompose|=TRUE; + } +} + +U_NAMESPACE_END + +#endif // #if !UCONFIG_NO_NORMALIZATION -- cgit v1.2.3