summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/tools/gennorm2/norms.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/tools/gennorm2/norms.cpp')
-rw-r--r--intl/icu/source/tools/gennorm2/norms.cpp324
1 files changed, 324 insertions, 0 deletions
diff --git a/intl/icu/source/tools/gennorm2/norms.cpp b/intl/icu/source/tools/gennorm2/norms.cpp
new file mode 100644
index 0000000000..9dd8dca977
--- /dev/null
+++ b/intl/icu/source/tools/gennorm2/norms.cpp
@@ -0,0 +1,324 @@
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// norms.cpp
+// created: 2017jun04 Markus W. Scherer
+// (pulled out of n2builder.cpp)
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_NORMALIZATION
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "unicode/errorcode.h"
+#include "unicode/umutablecptrie.h"
+#include "unicode/unistr.h"
+#include "unicode/utf16.h"
+#include "normalizer2impl.h"
+#include "norms.h"
+#include "toolutil.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) {
+ if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
+ if(cc==0) {
+ fLastStarterIndex=fLength;
+ }
+ fArray[fLength++]=(c<<8)|cc;
+ return;
+ }
+ // Let this character bubble back to its canonical order.
+ int32_t i=fLength-1;
+ while(i>fLastStarterIndex && ccAt(i)>cc) {
+ --i;
+ }
+ ++i; // after the last starter or prevCC<=cc
+ // Move this and the following characters forward one to make space.
+ for(int32_t j=fLength; i<j; --j) {
+ fArray[j]=fArray[j-1];
+ }
+ fArray[i]=(c<<8)|cc;
+ ++fLength;
+ fDidReorder=true;
+}
+
+void BuilderReorderingBuffer::toString(UnicodeString &dest) const {
+ dest.remove();
+ for(int32_t i=0; i<fLength; ++i) {
+ dest.append(charAt(i));
+ }
+}
+
+UChar32 Norm::combine(UChar32 trail) const {
+ int32_t length;
+ const CompositionPair *pairs=getCompositionPairs(length);
+ for(int32_t i=0; i<length; ++i) {
+ if(trail==pairs[i].trail) {
+ return pairs[i].composite;
+ }
+ if(trail<pairs[i].trail) {
+ break;
+ }
+ }
+ return U_SENTINEL;
+}
+
+Norms::Norms(UErrorCode &errorCode) {
+ normTrie = umutablecptrie_open(0, 0, &errorCode);
+ normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
+ // Default "inert" Norm struct at index 0. Practically immutable.
+ norms=allocNorm();
+ norms->type=Norm::INERT;
+}
+
+Norms::~Norms() {
+ umutablecptrie_close(normTrie);
+ int32_t normsLength=utm_countItems(normMem);
+ for(int32_t i=1; i<normsLength; ++i) {
+ delete norms[i].mapping;
+ delete norms[i].rawMapping;
+ delete norms[i].compositions;
+ }
+ utm_close(normMem);
+}
+
+Norm *Norms::allocNorm() {
+ Norm *p=(Norm *)utm_alloc(normMem);
+ norms=(Norm *)utm_getStart(normMem); // in case it got reallocated
+ return p;
+}
+
+Norm *Norms::getNorm(UChar32 c) {
+ uint32_t i = umutablecptrie_get(normTrie, c);
+ if(i==0) {
+ return nullptr;
+ }
+ return norms+i;
+}
+
+const Norm *Norms::getNorm(UChar32 c) const {
+ uint32_t i = umutablecptrie_get(normTrie, c);
+ if(i==0) {
+ return nullptr;
+ }
+ return norms+i;
+}
+
+const Norm &Norms::getNormRef(UChar32 c) const {
+ return norms[umutablecptrie_get(normTrie, c)];
+}
+
+Norm *Norms::createNorm(UChar32 c) {
+ uint32_t i=umutablecptrie_get(normTrie, c);
+ if(i!=0) {
+ return norms+i;
+ } else {
+ /* allocate Norm */
+ Norm *p=allocNorm();
+ IcuToolErrorCode errorCode("gennorm2/createNorm()");
+ umutablecptrie_set(normTrie, c, (uint32_t)(p - norms), errorCode);
+ return p;
+ }
+}
+
+void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const {
+ int32_t length=mapping.length();
+ U_ASSERT(length<=Normalizer2Impl::MAPPING_LENGTH_MASK);
+ const char16_t *s=mapping.getBuffer();
+ int32_t i=0;
+ UChar32 c;
+ while(i<length) {
+ U16_NEXT(s, i, length, c);
+ buffer.append(c, getCC(c));
+ }
+ if(buffer.didReorder()) {
+ buffer.toString(mapping);
+ }
+}
+
+UBool Norms::combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const {
+ if((highCC-lowCC)>=2) {
+ int32_t length;
+ const CompositionPair *pairs=norm.getCompositionPairs(length);
+ for(int32_t i=0; i<length; ++i) {
+ uint8_t trailCC=getCC(pairs[i].trail);
+ if(lowCC<trailCC && trailCC<highCC) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+void Norms::enumRanges(Enumerator &e) {
+ UChar32 start = 0, end;
+ uint32_t i;
+ while ((end = umutablecptrie_getRange(normTrie, start, UCPMAP_RANGE_NORMAL, 0,
+ nullptr, nullptr, &i)) >= 0) {
+ if (i > 0) {
+ e.rangeHandler(start, end, norms[i]);
+ }
+ start = end + 1;
+ }
+}
+
+Norms::Enumerator::~Enumerator() {}
+
+void CompositionBuilder::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
+ if(norm.mappingType!=Norm::ROUND_TRIP) { return; }
+ if(start!=end) {
+ fprintf(stderr,
+ "gennorm2 error: same round-trip mapping for "
+ "more than 1 code point U+%04lX..U+%04lX\n",
+ (long)start, (long)end);
+ exit(U_INVALID_FORMAT_ERROR);
+ }
+ if(norm.cc!=0) {
+ fprintf(stderr,
+ "gennorm2 error: "
+ "U+%04lX has a round-trip mapping and ccc!=0, "
+ "not possible in Unicode normalization\n",
+ (long)start);
+ exit(U_INVALID_FORMAT_ERROR);
+ }
+ // setRoundTripMapping() ensured that there are exactly two code points.
+ const UnicodeString &m=*norm.mapping;
+ UChar32 lead=m.char32At(0);
+ UChar32 trail=m.char32At(m.length()-1);
+ if(norms.getCC(lead)!=0) {
+ fprintf(stderr,
+ "gennorm2 error: "
+ "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
+ "not possible in Unicode normalization\n",
+ (long)start, (long)lead);
+ exit(U_INVALID_FORMAT_ERROR);
+ }
+ // Flag for trailing character.
+ norms.createNorm(trail)->combinesBack=true;
+ // Insert (trail, composite) pair into compositions list for the lead character.
+ IcuToolErrorCode errorCode("gennorm2/addComposition()");
+ Norm *leadNorm=norms.createNorm(lead);
+ UVector32 *compositions=leadNorm->compositions;
+ int32_t i;
+ if(compositions==nullptr) {
+ compositions=leadNorm->compositions=new UVector32(errorCode);
+ i=0; // "insert" the first pair at index 0
+ } else {
+ // Insertion sort, and check for duplicate trail characters.
+ int32_t length;
+ const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
+ for(i=0; i<length; ++i) {
+ if(trail==pairs[i].trail) {
+ fprintf(stderr,
+ "gennorm2 error: same round-trip mapping for "
+ "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
+ (long)start, (long)lead, (long)trail);
+ exit(U_INVALID_FORMAT_ERROR);
+ }
+ if(trail<pairs[i].trail) {
+ break;
+ }
+ }
+ }
+ compositions->insertElementAt(trail, 2*i, errorCode);
+ compositions->insertElementAt(start, 2*i+1, errorCode);
+}
+
+void Decomposer::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
+ if(!norm.hasMapping()) { return; }
+ const UnicodeString &m=*norm.mapping;
+ UnicodeString *decomposed=nullptr;
+ const char16_t *s=toUCharPtr(m.getBuffer());
+ int32_t length=m.length();
+ int32_t prev, i=0;
+ UChar32 c;
+ while(i<length) {
+ prev=i;
+ U16_NEXT(s, i, length, c);
+ if(start<=c && c<=end) {
+ fprintf(stderr,
+ "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
+ (long)c);
+ exit(U_INVALID_FORMAT_ERROR);
+ }
+ const Norm &cNorm=norms.getNormRef(c);
+ if(cNorm.hasMapping()) {
+ if(norm.mappingType==Norm::ROUND_TRIP) {
+ if(prev==0) {
+ if(cNorm.mappingType!=Norm::ROUND_TRIP) {
+ fprintf(stderr,
+ "gennorm2 error: "
+ "U+%04lX's round-trip mapping's starter "
+ "U+%04lX one-way-decomposes, "
+ "not possible in Unicode normalization\n",
+ (long)start, (long)c);
+ exit(U_INVALID_FORMAT_ERROR);
+ }
+ uint8_t myTrailCC=norms.getCC(m.char32At(i));
+ UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
+ uint8_t cTrailCC=norms.getCC(cTrailChar);
+ if(cTrailCC>myTrailCC) {
+ fprintf(stderr,
+ "gennorm2 error: "
+ "U+%04lX's round-trip mapping's starter "
+ "U+%04lX decomposes and the "
+ "inner/earlier tccc=%hu > outer/following tccc=%hu, "
+ "not possible in Unicode normalization\n",
+ (long)start, (long)c,
+ (short)cTrailCC, (short)myTrailCC);
+ exit(U_INVALID_FORMAT_ERROR);
+ }
+ } else {
+ fprintf(stderr,
+ "gennorm2 error: "
+ "U+%04lX's round-trip mapping's non-starter "
+ "U+%04lX decomposes, "
+ "not possible in Unicode normalization\n",
+ (long)start, (long)c);
+ exit(U_INVALID_FORMAT_ERROR);
+ }
+ }
+ if(decomposed==nullptr) {
+ decomposed=new UnicodeString(m, 0, prev);
+ }
+ decomposed->append(*cNorm.mapping);
+ } else if(Hangul::isHangul(c)) {
+ char16_t buffer[3];
+ int32_t hangulLength=Hangul::decompose(c, buffer);
+ if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
+ fprintf(stderr,
+ "gennorm2 error: "
+ "U+%04lX's round-trip mapping's non-starter "
+ "U+%04lX decomposes, "
+ "not possible in Unicode normalization\n",
+ (long)start, (long)c);
+ exit(U_INVALID_FORMAT_ERROR);
+ }
+ if(decomposed==nullptr) {
+ decomposed=new UnicodeString(m, 0, prev);
+ }
+ decomposed->append(buffer, hangulLength);
+ } else if(decomposed!=nullptr) {
+ decomposed->append(m, prev, i-prev);
+ }
+ }
+ if(decomposed!=nullptr) {
+ if(norm.rawMapping==nullptr) {
+ // Remember the original mapping when decomposing recursively.
+ norm.rawMapping=norm.mapping;
+ } else {
+ delete norm.mapping;
+ }
+ norm.mapping=decomposed;
+ // Not norm.setMappingCP(); because the original mapping
+ // is most likely to be encodable as a delta.
+ didDecompose|=true;
+ }
+}
+
+U_NAMESPACE_END
+
+#endif // #if !UCONFIG_NO_NORMALIZATION