diff options
Diffstat (limited to 'intl/icu/source/tools/gencolusb')
-rw-r--r-- | intl/icu/source/tools/gencolusb/Makefile | 45 | ||||
-rw-r--r-- | intl/icu/source/tools/gencolusb/README.md | 10 | ||||
-rw-r--r-- | intl/icu/source/tools/gencolusb/extract_unsafe_backwards.cpp | 168 | ||||
-rw-r--r-- | intl/icu/source/tools/gencolusb/verify_uset.cpp | 71 |
4 files changed, 294 insertions, 0 deletions
diff --git a/intl/icu/source/tools/gencolusb/Makefile b/intl/icu/source/tools/gencolusb/Makefile
new file mode 100644
index 0000000000..be13b5b106
--- /dev/null
+++ b/intl/icu/source/tools/gencolusb/Makefile
@@ -0,0 +1,45 @@
+## Makefile for rebuilding 'unsafe backward' data
+## Copyright (C) 2016 and later: Unicode, Inc. and others.
+## License & terms of use: http://www.unicode.org/copyright.html
+## Copyright (c) 2015, International Business Machines Corporation and
+## others. All Rights Reserved.
+
+##
+## CONFIGURATION:
+## 1. create Makefile.local containing overrides if necessary:
+## BUILD_ROOT=/home/user/icu-build (location of 'config.status' etc.)
+## PATH_VAR=DYLD_LIBRARY_PATH (if on OSX etc)
+##
+## UPDATING
+## 1. make 'reset-icu' will reset ICU to 'bootstrap' state, zeroing out source/i18n/collunsafe.h
+## 2. make 'gen-file' will generate and test source/i18n/collunsafe.h
+
+# Defaults. These use recursive '=' assignment, so values that reference
+# BUILD_ROOT or PATH_VAR pick up any override made later in Makefile.local.
+subdir=tools/gencolusb
+srcdir=$(shell pwd)
+SOURCE_ROOT=$(shell cd ../.. ; pwd)
+BUILD_ROOT=$(SOURCE_ROOT)
+BUILD_HERE=$(BUILD_ROOT)/$(subdir)
+TOOL=extract_unsafe_backwards
+TEST=verify_uset
+PATH_VAR=LD_LIBRARY_PATH
+
+# Optional local overrides; the leading '-' makes a missing file a non-error.
+-include Makefile.local
+
+# GEN_FILE is the header that is truncated by 'reset-icu' and rewritten by 'gen-file'.
+# BUILD_OPTS compiles against the ICU source headers and links the libraries in $(BUILD_ROOT)/lib.
+# RUN_OPTS runs a built tool with $(PATH_VAR) pointing at $(BUILD_ROOT)/lib.
+GEN_FILE=$(SOURCE_ROOT)/i18n/collunsafe.h
+BUILD_OPTS=-I$(SOURCE_ROOT)/common -I$(SOURCE_ROOT)/i18n -L$(BUILD_ROOT)/lib -licuuc -licui18n -licudata
+RUN_OPTS=env $(PATH_VAR)=$(BUILD_ROOT)/lib
+
+# Truncate the generated header to an empty file, then rebuild i18n against it.
+reset-icu:
+	>$(GEN_FILE)
+	$(MAKE) -C $(BUILD_ROOT)/i18n
+
+# Build and run the generator (its stdout becomes the header), build and run
+# the verifier, rebuild i18n with the new header, then run the verifier again.
+gen-file: reset-icu
+	mkdir -p $(BUILD_HERE)
+	$(CXX) -o $(BUILD_HERE)/$(TOOL) $(srcdir)/$(TOOL).cpp $(BUILD_OPTS)
+	$(RUN_OPTS) $(BUILD_HERE)/$(TOOL) > $(GEN_FILE) || exit 1
+	$(CXX) -o $(BUILD_HERE)/$(TEST) $(srcdir)/$(TEST).cpp $(BUILD_OPTS)
+	$(RUN_OPTS) $(BUILD_HERE)/$(TEST) || exit 1
+	$(MAKE) -C $(BUILD_ROOT)/i18n
+	$(RUN_OPTS) $(BUILD_HERE)/$(TEST) || exit 1
+
+.PHONY: reset-icu gen-file
diff --git a/intl/icu/source/tools/gencolusb/README.md b/intl/icu/source/tools/gencolusb/README.md
new file mode 100644
index 0000000000..b0d9bae091
--- /dev/null
+++ 
b/intl/icu/source/tools/gencolusb/README.md @@ -0,0 +1,10 @@ +Unsafe-Backward Collator Data +=== + +This directory contains tools to build the `source/i18n/collunsafe.h` +precomputed data. + +See [Makefile](./Makefile) for more details. + +* Copyright (C) 2016 and later: Unicode, Inc. and others. License & terms of use: http://www.unicode.org/copyright.html +* Copyright (c) 2015, International Business Machines Corporation and others. All Rights Reserved. diff --git a/intl/icu/source/tools/gencolusb/extract_unsafe_backwards.cpp b/intl/icu/source/tools/gencolusb/extract_unsafe_backwards.cpp new file mode 100644 index 0000000000..ee12e69f9b --- /dev/null +++ b/intl/icu/source/tools/gencolusb/extract_unsafe_backwards.cpp @@ -0,0 +1,168 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/** + * Copyright (c) 1999-2016, International Business Machines Corporation and + * others. All Rights Reserved. + * + * Generator for source/i18n/collunsafe.h + * see Makefile + */ + +#include <stdio.h> +#include "unicode/uversion.h" +#include "unicode/uniset.h" +#include "collationroot.h" +#include "collationtailoring.h" + +/** + * Define the type of generator to use. Choose one. + */ +#define SERIALIZE 1 //< Default: use UnicodeSet.serialize() and a new internal c'tor +#define RANGES 0 //< Enumerate ranges (works, not as fast. No support in collationdatareader.cpp) +#define PATTERN 0 //< Generate a UnicodeSet pattern (depends on #11891 AND probably slower. 
No support in collationdatareader.cpp)
+
+/**
+ * Generator entry point: writes the text of collunsafe.h to stdout and
+ * progress/diagnostic messages to stderr.
+ *
+ * The root collation's unsafe-backward UnicodeSet is fetched and emitted in
+ * whichever form(s) the SERIALIZE / RANGES / PATTERN macros enable.
+ *
+ * @param argc unused
+ * @param argv unused
+ * @return 0 on success, 1 if the root cache entry could not be loaded or the
+ *         set could not be serialized.
+ */
+int main(int argc, const char *argv[]) {
+  UErrorCode errorCode = U_ZERO_ERROR;
+
+  // Get the unsafeBackwardsSet
+  const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
+  if(U_FAILURE(errorCode)) {
+    fprintf(stderr, "Err: %s getting root cache entry\n", u_errorName(errorCode));
+    return 1;
+  }
+  const UVersionInfo &version = rootEntry->tailoring->version;
+  const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
+  char verString[20];
+  u_versionToString(version, verString);
+  fprintf(stderr, "Generating data for ICU %s, Collation %s\n", U_ICU_VERSION, verString);
+#if RANGES
+  // Only the range generator needs this; keeping it inside the guard avoids
+  // an unused variable in the other configurations.
+  int32_t rangeCount = unsafeBackwardSet->getRangeCount();
+#endif
+
+#if SERIALIZE
+  fprintf(stderr, ".. serializing\n");
+  // UnicodeSet serialization
+
+  UErrorCode preflightCode = U_ZERO_ERROR;
+  // preflight: a null/zero-length destination yields the required length;
+  // U_BUFFER_OVERFLOW_ERROR is the expected outcome and is not a failure here.
+  int32_t serializedCount = unsafeBackwardSet->serialize(nullptr,0,preflightCode);
+  if(U_FAILURE(preflightCode) && preflightCode != U_BUFFER_OVERFLOW_ERROR) {
+    fprintf(stderr, "Err: %s preflighting unicode set\n", u_errorName(preflightCode));
+    return 1;
+  }
+  uint16_t *serializedData = new uint16_t[serializedCount];
+  // serialize
+  unsafeBackwardSet->serialize(serializedData, serializedCount, errorCode);
+  if(U_FAILURE(errorCode)) {
+    delete [] serializedData;
+    fprintf(stderr, "Err: %s serializing unicodeset\n", u_errorName(errorCode));
+    return 1;
+  }
+#endif
+
+#if PATTERN
+  fprintf(stderr,".. pattern. (Note: collationdatareader.cpp does not support this form also see #11891)\n");
+  // attempt to use pattern
+
+  UnicodeString pattern;
+  UnicodeSet set(*unsafeBackwardSet);
+  set.compact();
+  set.toPattern(pattern, false);
+
+  if(U_SUCCESS(errorCode)) {
+    // This fails (bug# ?) - which is why this method was abandoned.
+
+    // UnicodeSet usA(pattern, errorCode);
+    // fprintf(stderr, "\n%s:%d: err creating set A %s\n", __FILE__, __LINE__, u_errorName(errorCode));
+    // return 1;
+  }
+
+
+  const char16_t *buf = pattern.getBuffer();
+  int32_t needed = pattern.length();
+
+  // print
+  {
+    char buf2[2048];
+    // Use the bounded extract() overload and clamp the returned length: the
+    // pattern may need more than sizeof(buf2) bytes in UTF-8, and the
+    // unbounded overload would then write past the end of the stack buffer.
+    const int32_t capacity = static_cast<int32_t>(sizeof(buf2)) - 1;
+    int32_t len2 = pattern.extract(0, pattern.length(), buf2, static_cast<uint32_t>(capacity), "utf-8");
+    if(len2 > capacity) {
+      len2 = capacity;
+    }
+    buf2[len2]=0;
+    fprintf(stderr,"===\n%s\n===\n", buf2);
+  }
+
+  const UnicodeString unsafeBackwardPattern(false, buf, needed);
+  if(U_SUCCESS(errorCode)) {
+    //UnicodeSet us(unsafeBackwardPattern, errorCode);
+    // fprintf(stderr, "\n%s:%d: err creating set %s\n", __FILE__, __LINE__, u_errorName(errorCode));
+  } else {
+    fprintf(stderr, "Uset OK - \n");
+  }
+#endif
+
+
+  // Generate the output file.
+
+  printf("// collunsafe.h\n");
+  printf("// %s\n", U_COPYRIGHT_STRING);
+  printf("\n");
+  printf("// To be included by collationdatareader.cpp, and generated by gencolusb.\n");
+  printf("// Machine generated, do not edit.\n");
+  printf("\n");
+  printf("#ifndef COLLUNSAFE_H\n"
+         "#define COLLUNSAFE_H\n"
+         "\n"
+         "#include \"unicode/utypes.h\"\n"
+         "\n"
+         "#define COLLUNSAFE_ICU_VERSION \"" U_ICU_VERSION "\"\n");
+  printf("#define COLLUNSAFE_COLL_VERSION \"%s\"\n", verString);
+
+
+
+#if PATTERN
+  printf("#define COLLUNSAFE_PATTERN 1\n");
+  printf("static const int32_t collunsafe_len = %d;\n", needed);
+  printf("static const char16_t collunsafe_pattern[collunsafe_len] = {\n");
+  for(int i=0;i<needed;i++) {
+    if( (i>0) && (i%8 == 0) ) {
+      printf(" // %d\n", i);
+    }
+    printf("0x%04X", buf[i]); // TODO check
+    if(i != (needed-1)) {
+      printf(", ");
+    }
+  }
+  printf(" //%d\n};\n", (needed-1));
+#endif
+
+  // The selector macro defined at the top of this file is RANGES; the guard
+  // previously tested the undefined name RANGE, so this branch could never
+  // be switched on.
+#if RANGES
+  fprintf(stderr, "COLLUNSAFE_RANGE - no code support in collationdatareader.cpp for this\n");
+  printf("#define COLLUNSAFE_RANGE 1\n");
+  printf("static const int32_t unsafe_rangeCount = %d;\n", rangeCount);
+  printf("static const UChar32 unsafe_ranges[%d] = { \n", rangeCount*2);
+  for(int32_t i=0;i<rangeCount;i++) {
+    printf(" 0x%04X, 0x%04X, // %d\n",
+           unsafeBackwardSet->getRangeStart(i),
+           unsafeBackwardSet->getRangeEnd(i),
+           i);
+  }
+  printf("};\n");
+#endif
+
+#if SERIALIZE
+  printf("#define COLLUNSAFE_SERIALIZE 1\n");
+  printf("static const int32_t unsafe_serializedCount = %d;\n", serializedCount);
+  printf("static const uint16_t unsafe_serializedData[%d] = { \n", serializedCount);
+  for(int32_t i=0;i<serializedCount;i++) {
+    if( (i>0) && (i%8 == 0) ) {
+      printf(" // %d\n", i);
+    }
+    printf("0x%04X", serializedData[i]); // TODO check
+    if(i != (serializedCount-1)) {
+      printf(", ");
+    }
+  }
+  printf("};\n");
+  // Release the buffer on the success path too, not only on the error path.
+  delete [] serializedData;
+#endif
+
+  printf("#endif\n");
+  fflush(stderr);
+  fflush(stdout);
+  return(U_SUCCESS(errorCode)?0:1);
+}
diff --git a/intl/icu/source/tools/gencolusb/verify_uset.cpp b/intl/icu/source/tools/gencolusb/verify_uset.cpp
new file mode 100644
index 0000000000..03a4930489
--- /dev/null
+++ b/intl/icu/source/tools/gencolusb/verify_uset.cpp
@@ -0,0 +1,71 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/**
+ * Copyright (c) 1999-2012, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ *
+ * Test for source/i18n/collunsafe.h
+ */
+
+#include <stdio.h>
+#include "unicode/ucol.h"
+#include "unicode/uniset.h"
+#include "unicode/coll.h"
+#include "collation.h"
+
+#include "collunsafe.h"
+
+
+/**
+ * Verifier entry point: rebuilds a UnicodeSet from whichever COLLUNSAFE_*
+ * form collunsafe.h defines, then creates an English collator.
+ *
+ * Progress goes to stdout; failures are reported on stderr.
+ *
+ * @param argc unused
+ * @param argv unused
+ * @return 0 if every step left errorCode in a success state, otherwise 1.
+ */
+int main(int argc, const char *argv[]) {
+  puts("verify");
+  UErrorCode errorCode = U_ZERO_ERROR;
+#if defined (COLLUNSAFE_PATTERN)
+  puts("verify pattern");
+  const UnicodeString unsafeBackwardPattern(false, collunsafe_pattern, collunsafe_len);
+  // NOTE(review): prints the first five pattern units; assumes
+  // collunsafe_len >= 5 - confirm against the generator output.
+  fprintf(stderr, "\n -- pat '%c%c%c%c%c'\n",
+          collunsafe_pattern[0],
+          collunsafe_pattern[1],
+          collunsafe_pattern[2],
+          collunsafe_pattern[3],
+          collunsafe_pattern[4]);
+  if(U_SUCCESS(errorCode)) {
+    UnicodeSet us(unsafeBackwardPattern, errorCode);
+    // Report only real failures; this used to print "err ... U_ZERO_ERROR"
+    // on success as well.
+    if(U_FAILURE(errorCode)) {
+      fprintf(stderr, "\n%s:%d: err creating set %s\n", __FILE__, __LINE__, u_errorName(errorCode));
+    }
+  }
+#endif
+
+#if defined (COLLUNSAFE_RANGE)
+  {
+    puts("verify range");
+    UnicodeSet u;
+    // unsafe_ranges holds (start, end) pairs, hence the stride of two.
+    for(int32_t i=0;i<unsafe_rangeCount*2;i+=2) {
+      u.add(unsafe_ranges[i+0],unsafe_ranges[i+1]);
+    }
+    printf("Finished with %d ranges\n", u.getRangeCount());
+  }
+#endif
+
+#if defined (COLLUNSAFE_SERIALIZE)
+  {
+    puts("verify serialize");
+    UnicodeSet u(unsafe_serializedData, unsafe_serializedCount, UnicodeSet::kSerialized, errorCode);
+    if(U_FAILURE(errorCode)) {
+      fprintf(stderr, "\n%s:%d: err creating set %s\n", __FILE__, __LINE__, u_errorName(errorCode));
+    }
+    printf("Finished deserialize with %d ranges\n", u.getRangeCount());
+  }
+#endif
+// if(tailoring.unsafeBackwardSet == nullptr) {
+  // errorCode = U_MEMORY_ALLOCATION_ERROR;
+  // fprintf(stderr, "\n%s:%d: err %s\n", __FILE__, __LINE__, u_errorName(errorCode));
+  // }
+  puts("verify col UCA");
+  if(U_SUCCESS(errorCode)) {
+    Collator *col = Collator::createInstance(Locale::getEnglish(), errorCode);
+    if(U_FAILURE(errorCode)) {
+      fprintf(stderr, "\n%s:%d: err %s creating collator\n", __FILE__, __LINE__, u_errorName(errorCode));
+    }
+    // The caller owns the returned collator; it was previously leaked.
+    // Deleting a null pointer (the failure case) is a no-op.
+    delete col;
+  }
+
+  return U_FAILURE(errorCode) ? 1 : 0;
+}
 |